add_grounding_example (#3)
- add grounding example (1bf75f641b77968fc823f0bc5a70647d4f8b0b08)
- update readme example (d1f32dbe929884905eafd0a46fe45c48a3380b93)
Co-authored-by: Haiping Wu <[email protected]>
    	
README.md CHANGED
````diff
@@ -85,18 +85,21 @@ processor = AutoProcessor.from_pretrained("microsoft/Florence-2-large-ft", trust
 url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg?download=true"
 image = Image.open(requests.get(url, stream=True).raw)
 
-def run_example(prompt):
-
+def run_example(task_prompt, text_input=None):
+    if text_input is None:
+        prompt = task_prompt
+    else:
+        prompt = task_prompt + text_input
     inputs = processor(text=prompt, images=image, return_tensors="pt")
     generated_ids = model.generate(
       input_ids=inputs["input_ids"],
       pixel_values=inputs["pixel_values"],
       max_new_tokens=1024,
-      num_beams=3
+      num_beams=3
     )
     generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
 
-    parsed_answer = processor.post_process_generation(generated_text, task=prompt, image_size=(image.width, image.height))
+    parsed_answer = processor.post_process_generation(generated_text, task=task_prompt, image_size=(image.width, image.height))
 
     print(parsed_answer)
 ```
@@ -110,7 +113,7 @@ Here are the tasks `Florence-2` could perform:
 ### OCR 
 
 ```python
-prompt = <OCR>
+prompt = "<OCR>"
 run_example(prompt)
 ```
 
@@ -118,25 +121,25 @@ run_example(prompt)
 OCR with region output format:
 {'\<OCR_WITH_REGION>': {'quad_boxes': [[x1, y1, x2, y2, x3, y3, x4, y4], ...], 'labels': ['text1', ...]}}
 ```python
-prompt = <OCR_WITH_REGION>
+prompt = "<OCR_WITH_REGION>"
 run_example(prompt)
 ```
 
 ### Caption
 ```python
-prompt = <CAPTION>
+prompt = "<CAPTION>"
 run_example(prompt)
 ```
 
 ### Detailed Caption
 ```python
-prompt = <DETAILED_CAPTION>
+prompt = "<DETAILED_CAPTION>"
 run_example(prompt)
 ```
 
 ### More Detailed Caption
 ```python
-prompt = <MORE_DETAILED_CAPTION>
+prompt = "<MORE_DETAILED_CAPTION>"
 run_example(prompt)
 ```
 
@@ -147,7 +150,7 @@ OD results format:
 'labels': ['label1', 'label2', ...]} }
 
 ```python
-prompt = <OD>
+prompt = "<OD>"
 run_example(prompt)
 ```
 
@@ -156,7 +159,7 @@ Dense region caption results format:
 {'\<DENSE_REGION_CAPTION>' : {'bboxes': [[x1, y1, x2, y2], ...], 
 'labels': ['label1', 'label2', ...]} }
 ```python
-prompt = <DENSE_REGION_CAPTION>
+prompt = "<DENSE_REGION_CAPTION>"
 run_example(prompt)
 ```
 
@@ -165,7 +168,7 @@ Dense region caption results format:
 {'\<REGION_PROPOSAL>': {'bboxes': [[x1, y1, x2, y2], ...], 
 'labels': ['', '', ...]}}
 ```python
-prompt = <REGION_PROPOSAL>
+prompt = "<REGION_PROPOSAL>"
 run_example(prompt)
 ```
 
@@ -175,7 +178,7 @@ caption to phrase grounding task requires additional text input, i.e. caption.
 Caption to phrase grounding results format: 
 {'\<CAPTION_TO_PHRASE_GROUNDING>': {'bboxes': [[x1, y1, x2, y2], ...], 'labels': ['', '', ...]}}
 ```python
-task_prompt = <CAPTION_TO_PHRASE_GROUNDING>
+task_prompt = "<CAPTION_TO_PHRASE_GROUNDING>"
 results = run_example(task_prompt, text_input="A green car parked in front of a yellow building.")
 ```
````
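For reference, the changed hunks assemble into the runnable script below. This is a sketch, not part of the diff itself: the imports and the `AutoModelForCausalLM` loading line are assumptions inferred from the truncated hunk context (only the `AutoProcessor.from_pretrained(..., trust_remote_code=True)` line is visible above), while the body of `run_example` and the grounding call are taken verbatim from the new version.

```python
# Sketch: end-to-end assembly of the updated README example.
# The two from_pretrained lines are inferred from the hunk context
# (only the processor line appears above, truncated), not from this diff.
import requests
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor

model = AutoModelForCausalLM.from_pretrained("microsoft/Florence-2-large-ft", trust_remote_code=True)
processor = AutoProcessor.from_pretrained("microsoft/Florence-2-large-ft", trust_remote_code=True)

url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg?download=true"
image = Image.open(requests.get(url, stream=True).raw)

def run_example(task_prompt, text_input=None):
    # New in this commit: tasks such as <CAPTION_TO_PHRASE_GROUNDING> take a
    # free-text caption that is appended to the task token.
    if text_input is None:
        prompt = task_prompt
    else:
        prompt = task_prompt + text_input
    inputs = processor(text=prompt, images=image, return_tensors="pt")
    generated_ids = model.generate(
        input_ids=inputs["input_ids"],
        pixel_values=inputs["pixel_values"],
        max_new_tokens=1024,
        num_beams=3,
    )
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
    # Post-processing receives the bare task token (not the concatenated prompt)
    # and the image size, to map location tokens back to pixel coordinates.
    parsed_answer = processor.post_process_generation(
        generated_text, task=task_prompt, image_size=(image.width, image.height)
    )
    print(parsed_answer)

# The grounding example added by this commit:
run_example("<CAPTION_TO_PHRASE_GROUNDING>", text_input="A green car parked in front of a yellow building.")
```

Note that `post_process_generation` is passed `task=task_prompt` rather than the concatenated `prompt`: the parser selects the output format (e.g. `bboxes`/`labels` for grounding) from the task token alone, which is why the commit threads `task_prompt` through separately from `text_input`.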
