ncoop57 committed
Commit 021b099 · 1 Parent(s): f400687

add initial code

- app.py +87 -0
- clip.py +80 -0
- requirements.txt +6 -0
    	
app.py ADDED
@@ -0,0 +1,87 @@
import ffmpeg
import youtube_dl

import numpy as np

from PIL import Image
import requests

import torch
from sentence_transformers import SentenceTransformer, util, models
from clip import CLIPModel
# from sentence_transformers.models import CLIPModel

clip = CLIPModel()
model = SentenceTransformer(modules=[clip]).to(dtype=torch.float32, device=torch.device('cpu'))


def get_embedding(query, video):
    # Encode the text query
    text_emb = model.encode(query, device='cpu')

    # Encode each video frame as an image
    images = []
    for img in video:
        images.append(Image.fromarray(img))
    img_embs = model.encode(images, device='cpu')

    return text_emb, img_embs


# # Encode an image:
# url = "http://images.cocodataset.org/val2017/000000039769.jpg"
# img = Image.fromarray(np.array(Image.open(requests.get(url, stream=True).raw))).convert('RGB')
# img_emb = model.encode([img, img], device='cpu')

# # Encode text descriptions
# text_emb = model.encode(['Two dogs in the snow', 'Two cats laying on a sofa',
#                          'A picture of London at night'], device='cpu')

# # Compute cosine similarities
# cos_scores = util.cos_sim(img_emb, text_emb)
# print(cos_scores)


def my_hook(d):
    if d['status'] == 'finished':
        print(d)
        print('Done downloading, now extracting frames ...')
        # Probe the downloaded video for its frame dimensions
        probe = ffmpeg.probe(d["filename"])
        video_stream = next((stream for stream in probe['streams'] if stream['codec_type'] == 'video'), None)
        width = int(video_stream['width'])
        height = int(video_stream['height'])
        # Decode the whole video to raw RGB bytes in memory
        out, _ = (
            ffmpeg
            .input(d["filename"])
            .output('pipe:', format='rawvideo', pix_fmt='rgb24')
            .run(capture_stdout=True)
        )
        # Reshape into (frames, height, width, 3) and keep every 10th frame
        video = (
            np
            .frombuffer(out, np.uint8)
            .reshape([-1, height, width, 3])
        )[::10]

        print(video.shape)
        txt_embd, img_embds = get_embedding("two white puppies", video)
        cos_scores = util.cos_sim(txt_embd, img_embds)
        print(cos_scores)


ydl_opts = {"format": "mp4", "progress_hooks": [my_hook], }
with youtube_dl.YoutubeDL(ydl_opts) as ydl:
    ydl.download(['https://youtu.be/I3AaW9ZevIU'])


# # out, _ = (
# #     ffmpeg
# #     .input('in.mp4')
# #     .output('pipe:', format='rawvideo', pix_fmt='rgb24')
# #     .run(capture_stdout=True)
# # )
# # video = (
# #     np
# #     .frombuffer(out, np.uint8)
# #     .reshape([-1, height, width, 3])
# # )
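The hook above prints the raw cosine-similarity matrix but stops short of ranking frames against the query. A minimal sketch of that next step, assuming txt_embd and img_embds come from get_embedding as defined in app.py (the best_frames helper is illustrative, not part of this commit):

import torch
from sentence_transformers import util

def best_frames(txt_embd, img_embds, k=5):
    # One query row against every sampled frame: shape (1, num_frames)
    cos_scores = util.cos_sim(txt_embd, img_embds)
    # Indices and scores of the k frames most similar to the text query
    scores, indices = torch.topk(cos_scores[0], k=min(k, cos_scores.shape[1]))
    return list(zip(indices.tolist(), scores.tolist()))

Multiplying a returned frame index by 10 recovers its position in the original video, since the script keeps only every 10th decoded frame.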
    	
clip.py ADDED
@@ -0,0 +1,80 @@
from torch import nn
import transformers
import torch
from PIL import Image


class CLIPModel(nn.Module):
    def __init__(self, model_name: str = "openai/clip-vit-base-patch32", processor_name=None):
        super(CLIPModel, self).__init__()

        if processor_name is None:
            processor_name = model_name

        self.model = transformers.CLIPModel.from_pretrained(model_name)
        self.processor = transformers.CLIPProcessor.from_pretrained(processor_name)

    def __repr__(self):
        return "CLIPModel()"

    def forward(self, features):
        image_embeds = []
        text_embeds = []

        if 'pixel_values' in features:
            vision_outputs = self.model.vision_model(pixel_values=features['pixel_values'])
            # Project the pooled vision output into the shared embedding space
            image_embeds = self.model.visual_projection(vision_outputs[1])

        if 'input_ids' in features:
            text_outputs = self.model.text_model(
                input_ids=features.get('input_ids'),
                attention_mask=features.get('attention_mask', None),
                position_ids=features.get('position_ids', None),
                output_attentions=features.get('output_attentions', None),
                output_hidden_states=features.get('output_hidden_states', None),
            )
            # Project the pooled text output into the shared embedding space
            text_embeds = self.model.text_projection(text_outputs[1])

        # Re-interleave image and text embeddings in the order the inputs arrived
        sentence_embedding = []
        image_features = iter(image_embeds)
        text_features = iter(text_embeds)

        for idx, input_type in enumerate(features['image_text_info']):
            if input_type == 0:
                sentence_embedding.append(next(image_features))
            else:
                sentence_embedding.append(next(text_features))

        features['sentence_embedding'] = torch.stack(sentence_embedding).float()

        return features

    def tokenize(self, texts):
        images = []
        texts_values = []
        image_text_info = []

        # Split the batch into images and texts, recording each item's original slot
        for idx, data in enumerate(texts):
            if isinstance(data, Image.Image):  # An image
                images.append(data)
                image_text_info.append(0)
            else:  # A text
                texts_values.append(data)
                image_text_info.append(1)

        if len(texts_values) == 0:
            texts_values = None
        if len(images) == 0:
            images = None

        inputs = self.processor(text=texts_values, images=images, return_tensors="pt", padding=True)
        inputs['image_text_info'] = image_text_info
        return inputs

    def save(self, output_path: str):
        self.model.save_pretrained(output_path)
        self.processor.save_pretrained(output_path)

    @staticmethod
    def load(input_path: str):
        return CLIPModel(model_name=input_path)
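Because tokenize accepts PIL images and strings in the same batch and forward re-interleaves the two embedding streams via image_text_info, a single model.encode call can embed both modalities into the shared CLIP space. A minimal usage sketch, assuming the model built in app.py above ("frame.jpg" is a hypothetical local file, not part of this commit):

from PIL import Image
from sentence_transformers import util

# One batch mixing a text query and an image; output order matches input order
embeddings = model.encode(["two white puppies", Image.open("frame.jpg")], device='cpu')
print(util.cos_sim(embeddings[0], embeddings[1]))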
    	
requirements.txt ADDED
@@ -0,0 +1,6 @@
ffmpeg-python
numpy
pillow
torch
git+https://github.com/ncoop57/sentence-transformers@clip-image-check
youtube_dl
