Qika committed · Commit 9273a2e · verified · 1 Parent(s): 613fb84

Update README.md

Files changed (1): README.md (+69 -3)
---
license: apache-2.0
---

Model Usage:

```python
import torch
from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info

# Local path or Hugging Face Hub ID of this checkpoint.
model_path = "path/to/model"

model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",  # requires flash-attn; omit this argument if it is not installed
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(model_path, max_pixels=262144)

# Prompt that asks the model to reason over local image regions (given as bounding
# boxes) inside <think> </think> tags and to put the final answer in \boxed{}.
reason_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. During this reasoning process, prioritize analyzing the local regions of the image by leveraging the bounding box coordinates in the format [x_min, y_min, x_max, y_max]. The final answer MUST BE put in \boxed{}. An example is like: <think> reasoning process 1 with [x_min1, y_min1, x_max1, y_max1]; reasoning process 2 with [x_min2, y_min2, x_max2, y_max2] </think>. The answer is: \boxed{answer}."

# "think" appends reason_prompt so the model emits an explicit reasoning trace;
# any other value sends the question as-is.
mode = "think"


def get_label(images, content1):
    # Build the multimodal message: all images first, then the text question.
    content_list = []
    for image_url in images:
        content_list.append({
            "type": "image",
            "image": image_url,
        })
    if mode == 'think':
        content_list.append({"type": "text",
                             "text": content1 + '\n' + reason_prompt + '\n'})
    else:
        content_list.append({"type": "text",
                             "text": content1})
    messages = [
        {
            "role": "user",
            "content": content_list
        }
    ]

    # Preparation for inference
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    # Inference: generate, then strip the prompt tokens from the output
    generated_ids = model.generate(**inputs, max_new_tokens=4096, do_sample=True, temperature=0.6)
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    return output_text[0]
```
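
A minimal sketch of calling `get_label`: the image URL and question below are placeholders, and the regex simply pulls the final answer out of the `\boxed{}` wrapper that `reason_prompt` asks the model to use.

```python
import re

# Hypothetical inputs: any local path or URL accepted by qwen_vl_utils works here.
images = ["https://example.com/sample_image.jpg"]
question = "What color is the car parked next to the building?"

raw_output = get_label(images, question)
print(raw_output)  # full response, including the <think> ... </think> trace in "think" mode

# Extract the final answer from \boxed{...}; fall back to the raw text if it is absent.
match = re.search(r"\\boxed\{(.*?)\}", raw_output, re.DOTALL)
answer = match.group(1) if match else raw_output
print(answer)
```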