Safetensors · qwen3
ehartford committed · Commit 22a246a · verified · 1 Parent(s): b20d7fe

Update README.md

Files changed (1):
  1. README.md (+108 -29)
README.md CHANGED
@@ -85,55 +85,134 @@ Output: "deoxyribonucleic acid, and it is the hereditary material in all living

## Usage

- ### Basic Usage
```python
from transformers import AutoModelForCausalLM, AutoTokenizer

- # Load model
model = AutoModelForCausalLM.from_pretrained(
-     "Qwen3-72B-Embiggened",
-     torch_dtype=torch.bfloat16,
-     device_map="auto",
-     trust_remote_code=True
)
- tokenizer = AutoTokenizer.from_pretrained("Qwen3-72B-Embiggened")

- # Generate text
- inputs = tokenizer("The meaning of life is", return_tensors="pt")
- outputs = model.generate(**inputs, max_new_tokens=50, temperature=0.7)
- print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```

- ### Advanced Usage with Quantization
```python
- from transformers import BitsAndBytesConfig

- # 4-bit quantization for reduced memory usage
- bnb_config = BitsAndBytesConfig(
-     load_in_4bit=True,
-     bnb_4bit_compute_dtype=torch.bfloat16,
-     bnb_4bit_use_double_quant=True,
)

- model = AutoModelForCausalLM.from_pretrained(
-     "Qwen3-72B-Embiggened",
-     quantization_config=bnb_config,
-     device_map="auto",
-     trust_remote_code=True
)
```

- ### vLLM Deployment
```python
- from vllm import LLM, SamplingParams

- llm = LLM(model="Qwen3-72B-Embiggened", tensor_parallel_size=4)
- sampling_params = SamplingParams(temperature=0.7, top_p=0.9, max_tokens=100)

- prompts = ["Tell me about quantum computing", "Write a poem about AI"]
- outputs = llm.generate(prompts, sampling_params)
```

## Hardware Requirements

### Minimum Requirements
 
## Usage

+ ### Basic Usage with Thinking Mode
```python
from transformers import AutoModelForCausalLM, AutoTokenizer

+ model_name = "cognitivecomputations/Qwen3-72B-Embiggened"
+
+ # Load the tokenizer and the model
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
+     model_name,
+     torch_dtype="auto",
+     device_map="auto"
+ )
+
+ # Prepare the model input
+ prompt = "How many r's are in strawberry?"
+ messages = [
+     {"role": "user", "content": prompt}
+ ]
+
+ # Apply chat template with thinking mode enabled
+ text = tokenizer.apply_chat_template(
+     messages,
+     tokenize=False,
+     add_generation_prompt=True,
+     enable_thinking=True  # Enable thinking mode (default)
+ )
+ model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
+
+ # Generate response
+ generated_ids = model.generate(
+     **model_inputs,
+     max_new_tokens=32768,
+     temperature=0.6,  # Recommended for thinking mode
+     top_p=0.95,
+     top_k=20,
+     min_p=0
)

+ # Parse thinking content and final response
+ output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()
+
+ try:
+     # Find </think> token (151668)
+     index = len(output_ids) - output_ids[::-1].index(151668)
+ except ValueError:
+     index = 0
+
+ thinking_content = tokenizer.decode(output_ids[:index], skip_special_tokens=True).strip("\n")
+ content = tokenizer.decode(output_ids[index:], skip_special_tokens=True).strip("\n")
+
+ print("Thinking content:", thinking_content)
+ print("Final answer:", content)
```
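A note on the magic number above: 151668 is the id of the `</think>` token in the Qwen3 tokenizer. A minimal sketch that looks the id up instead of hardcoding it, reusing `tokenizer` and `output_ids` from the snippet above:

```python
# Resolve the </think> token id from the tokenizer rather than hardcoding 151668
think_end_id = tokenizer.convert_tokens_to_ids("</think>")

try:
    # Index just past the last </think> token in the generated ids
    index = len(output_ids) - output_ids[::-1].index(think_end_id)
except ValueError:
    index = 0  # no thinking block was emitted
```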

+ ### Non-Thinking Mode (Efficient General Dialogue)
```python
+ # Same setup as above...

+ # Apply chat template with thinking mode disabled
+ text = tokenizer.apply_chat_template(
+     messages,
+     tokenize=False,
+     add_generation_prompt=True,
+     enable_thinking=False  # Disable thinking for efficiency
)
+ model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

+ # Generate with non-thinking parameters
+ outputs = model.generate(
+     **model_inputs,
+     max_new_tokens=2048,
+     temperature=0.7,  # Recommended for non-thinking mode
+     top_p=0.8,
+     top_k=20,
+     min_p=0
)
```
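The snippet stops at generation; with `enable_thinking=False` the model replies directly, so decoding is a one-liner. A minimal sketch reusing `outputs`, `model_inputs`, and `tokenizer` from above:

```python
# Drop the prompt tokens and decode only the newly generated text
new_tokens = outputs[0][model_inputs.input_ids.shape[1]:]
print(tokenizer.decode(new_tokens, skip_special_tokens=True))
```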

+ ### Advanced: Dynamic Mode Switching
```python
+ # Use /think and /no_think tags to control behavior
+ messages = [
+     {"role": "user", "content": "Explain quantum computing /no_think"},  # Quick response
+     {"role": "assistant", "content": "Quantum computing uses quantum bits..."},
+     {"role": "user", "content": "How does superposition work mathematically? /think"}  # Detailed reasoning
+ ]
+ ```
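These tags are a soft switch read from the latest user turn; generation itself is unchanged. A minimal sketch of running the conversation above, reusing `tokenizer` and `model` from earlier (per Qwen's documentation the tags take effect while `enable_thinking` is left at its default of True):

```python
# Build the prompt from the multi-turn history; the /think tag in the
# last user message switches detailed reasoning back on
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
generated_ids = model.generate(**model_inputs, max_new_tokens=32768, temperature=0.6, top_p=0.95)
```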

+ ### vLLM Deployment with Reasoning Support
+ ```python
+ # Start the server with a reasoning parser:
+ #   vllm serve cognitivecomputations/Qwen3-72B-Embiggened --enable-reasoning --reasoning-parser deepseek_r1
+
+ from openai import OpenAI
+ client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy")

+ # Use with thinking mode
+ response = client.chat.completions.create(
+     model="cognitivecomputations/Qwen3-72B-Embiggened",
+     messages=[{"role": "user", "content": "Solve: What is 15% of 250?"}],
+     extra_body={"enable_thinking": True}
+ )
```
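When the server runs with `--enable-reasoning`, the parser splits the reasoning trace from the final answer on the server side; in recent vLLM builds it comes back as a `reasoning_content` field next to the usual `content` (field name assumed here; check your vLLM version). A minimal sketch continuing from `response` above:

```python
message = response.choices[0].message
# reasoning_content is populated by vLLM's reasoning parser (assumed field name)
print("Reasoning:", getattr(message, "reasoning_content", None))
print("Answer:", message.content)
```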

+ ### Example Outputs with Thinking
+
+ ```
+ Prompt: "How many r's are in strawberry?"
+ Thinking: Let me count the r's in "strawberry". S-t-r-a-w-b-e-r-r-y.
+ Going through each letter: s(no), t(no), r(yes, 1), a(no), w(no),
+ b(no), e(no), r(yes, 2), r(yes, 3), y(no).
+ Final answer: There are 3 r's in the word "strawberry".
+
+ Prompt: "What is the capital of France, and what is it famous for?"
+ Final answer (no thinking): Paris is the capital of France. It's famous for
+ the Eiffel Tower, the Louvre Museum, Notre-Dame Cathedral, and its rich
+ cultural heritage, fashion, and cuisine.
+ ```
+
+ This updated version:
+ 1. Shows both thinking and non-thinking modes clearly
+ 2. Includes the proper thinking token parsing (151668)
+ 3. Uses recommended temperature settings for each mode
+ 4. Demonstrates the `/think` and `/no_think` switches
+ 5. Shows example outputs that highlight the thinking capability
+ 6. Matches the structure and style of the Qwen3-32B examples
+

## Hardware Requirements

### Minimum Requirements