littlebird13 committed on
Commit
b22da49
·
verified ·
1 Parent(s): a579a21

Automatically add EOS via Tokenizer, integrate Sentence Transformers (#2)

Browse files

- Automatically add EOS via Tokenizer, integrate Sentence Transformers (4ee1aa535260ccb3eb883af36bac4314e0c3bbf7)
- Add "device_map": "auto" to automatically move the model to CUDA if possible (2f6ecfd654caa7ef022fa415b53ede194322d068)
- Remove eod_id line from README (2653833944d8c3c22a0139b6174051f7ce72879f)

1_Pooling/config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "word_embedding_dimension": 1024,
3
+ "pooling_mode_cls_token": false,
4
+ "pooling_mode_mean_tokens": false,
5
+ "pooling_mode_max_tokens": false,
6
+ "pooling_mode_mean_sqrt_len_tokens": false,
7
+ "pooling_mode_weightedmean_tokens": false,
8
+ "pooling_mode_lasttoken": true,
9
+ "include_prompt": true
10
+ }
README.md CHANGED
@@ -2,7 +2,11 @@
2
  license: apache-2.0
3
  base_model:
4
  - Qwen/Qwen3-0.6B-Base
5
- library_name: transformers
 
 
 
 
6
  ---
7
  # Qwen3-Embedding-0.6B
8
 
@@ -54,6 +58,47 @@ With Transformers versions earlier than 4.51.0, you may encounter the following
54
  KeyError: 'qwen3'
55
  ```
56
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  ### Transformers Usage
58
 
59
  ```python
@@ -80,14 +125,6 @@ def last_token_pool(last_hidden_states: Tensor,
80
  def get_detailed_instruct(task_description: str, query: str) -> str:
81
  return f'Instruct: {task_description}\nQuery:{query}'
82
 
83
- def tokenize(tokenizer, input_texts, eod_id, max_length):
84
- batch_dict = tokenizer(input_texts, padding=False, truncation=True, max_length=max_length-2)
85
- for seq, att in zip(batch_dict["input_ids"], batch_dict["attention_mask"]):
86
- seq.append(eod_id)
87
- att.append(1)
88
- batch_dict = tokenizer.pad(batch_dict, padding=True, return_tensors="pt")
89
- return batch_dict
90
-
91
  # Each query must come with a one-sentence instruction that describes the task
92
  task = 'Given a web search query, retrieve relevant passages that answer the query'
93
 
@@ -108,11 +145,16 @@ model = AutoModel.from_pretrained('Qwen/Qwen3-Embedding-0.6B')
108
  # We recommend enabling flash_attention_2 for better acceleration and memory saving.
109
  # model = AutoModel.from_pretrained('Qwen/Qwen3-Embedding-0.6B', attn_implementation="flash_attention_2", torch_dtype=torch.float16).cuda()
110
 
111
- eod_id = tokenizer.convert_tokens_to_ids("<|endoftext|>")
112
  max_length = 8192
113
 
114
  # Tokenize the input texts
115
- batch_dict = tokenize(tokenizer, input_texts, eod_id, max_length)
 
 
 
 
 
 
116
  batch_dict.to(model.device)
117
  outputs = model(**batch_dict)
118
  embeddings = last_token_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
@@ -121,6 +163,7 @@ embeddings = last_token_pool(outputs.last_hidden_state, batch_dict['attention_ma
121
  embeddings = F.normalize(embeddings, p=2, dim=1)
122
  scores = (embeddings[:2] @ embeddings[2:].T)
123
  print(scores.tolist())
 
124
  ```
125
  📌 **Tip**: We recommend that developers customize the `instruct` according to their specific scenarios, tasks, and languages. Our tests have shown that in most retrieval scenarios, not using an `instruct` on the query side can lead to a drop in retrieval performance of approximately 1% to 5%.
126
 
 
2
  license: apache-2.0
3
  base_model:
4
  - Qwen/Qwen3-0.6B-Base
5
+ tags:
6
+ - transformers
7
+ - sentence-transformers
8
+ - sentence-similarity
9
+ - feature-extraction
10
  ---
11
  # Qwen3-Embedding-0.6B
12
 
 
58
  KeyError: 'qwen3'
59
  ```
60
 
61
+ ### Sentence Transformers Usage
62
+
63
+ ```python
64
+ # Requires transformers>=4.51.0
65
+
66
+ from sentence_transformers import SentenceTransformer
67
+
68
+ # Load the model
69
+ model = SentenceTransformer("Qwen/Qwen3-Embedding-0.6B")
70
+
71
+ # We recommend enabling flash_attention_2 for better acceleration and memory saving,
72
+ # together with setting `padding_side` to "left":
73
+ # model = SentenceTransformer(
74
+ # "Qwen/Qwen3-Embedding-0.6B",
75
+ # model_kwargs={"attn_implementation": "flash_attention_2", "device_map": "auto"},
76
+ # tokenizer_kwargs={"padding_side": "left"},
77
+ # )
78
+
79
+ # The queries and documents to embed
80
+ queries = [
81
+ "What is the capital of China?",
82
+ "Explain gravity",
83
+ ]
84
+ documents = [
85
+ "The capital of China is Beijing.",
86
+ "Gravity is a force that attracts two bodies towards each other. It gives weight to physical objects and is responsible for the movement of planets around the sun.",
87
+ ]
88
+
89
+ # Encode the queries and documents. Note that queries benefit from using a prompt.
90
+ # Here we use the prompt called "query" stored under `model.prompts`, but you can
91
+ # also pass your own prompt via the `prompt` argument
92
+ query_embeddings = model.encode(queries, prompt_name="query")
93
+ document_embeddings = model.encode(documents)
94
+
95
+ # Compute the (cosine) similarity between the query and document embeddings
96
+ similarity = model.similarity(query_embeddings, document_embeddings)
97
+ print(similarity)
98
+ # tensor([[0.7646, 0.1414],
99
+ # [0.1355, 0.6000]])
100
+ ```
101
+
102
  ### Transformers Usage
103
 
104
  ```python
 
125
  def get_detailed_instruct(task_description: str, query: str) -> str:
126
  return f'Instruct: {task_description}\nQuery:{query}'
127
 
 
 
 
 
 
 
 
 
128
  # Each query must come with a one-sentence instruction that describes the task
129
  task = 'Given a web search query, retrieve relevant passages that answer the query'
130
 
 
145
  # We recommend enabling flash_attention_2 for better acceleration and memory saving.
146
  # model = AutoModel.from_pretrained('Qwen/Qwen3-Embedding-0.6B', attn_implementation="flash_attention_2", torch_dtype=torch.float16).cuda()
147
 
 
148
  max_length = 8192
149
 
150
  # Tokenize the input texts
151
+ batch_dict = tokenizer(
152
+ input_texts,
153
+ padding=True,
154
+ truncation=True,
155
+ max_length=max_length,
156
+ return_tensors="pt",
157
+ )
158
  batch_dict.to(model.device)
159
  outputs = model(**batch_dict)
160
  embeddings = last_token_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
 
163
  embeddings = F.normalize(embeddings, p=2, dim=1)
164
  scores = (embeddings[:2] @ embeddings[2:].T)
165
  print(scores.tolist())
166
+ # [[0.7645568251609802, 0.14142508804798126], [0.13549736142158508, 0.5999549627304077]]
167
  ```
168
  📌 **Tip**: We recommend that developers customize the `instruct` according to their specific scenarios, tasks, and languages. Our tests have shown that in most retrieval scenarios, not using an `instruct` on the query side can lead to a drop in retrieval performance of approximately 1% to 5%.
169
 
config_sentence_transformers.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "prompts": {
3
+ "query": "Instruct: Given a web search query, retrieve relevant passages that answer the query\nQuery:",
4
+ "document": ""
5
+ },
6
+ "default_prompt_name": null,
7
+ "similarity_fn_name": "cosine"
8
+ }
modules.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "idx": 0,
4
+ "name": "0",
5
+ "path": "",
6
+ "type": "sentence_transformers.models.Transformer"
7
+ },
8
+ {
9
+ "idx": 1,
10
+ "name": "1",
11
+ "path": "1_Pooling",
12
+ "type": "sentence_transformers.models.Pooling"
13
+ },
14
+ {
15
+ "idx": 2,
16
+ "name": "2",
17
+ "path": "2_Normalize",
18
+ "type": "sentence_transformers.models.Normalize"
19
+ }
20
+ ]
tokenizer.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:aeb13307a71acd8fe81861d94ad54ab689df773318809eed3cbe794b4492dae4
3
- size 11422654
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:def76fb086971c7867b829c23a26261e38d9d74e02139253b38aeb9df8b4b50a
3
+ size 11423705