Commit 2c40ce7 · 0 parents
Tyler Williams committed

Initial release: Apollo-Astralis V1 4B with Apache 2.0
.gitattributes ADDED
@@ -0,0 +1,10 @@
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.gguf filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
LICENSE ADDED
@@ -0,0 +1,67 @@
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work.
+
+ "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner.
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work.
+
+ 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions:
+
+ (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions.
+
+ 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE.
+
+ 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License.
+
+ END OF TERMS AND CONDITIONS
+
+ Copyright 2025 VANTA Research
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
QUICKSTART.md ADDED
@@ -0,0 +1,81 @@
+ # Apollo-Astralis V1 4B - Quick Start Guide
+
+ ## Installation
+
+ ### Option 1: Using Transformers (Recommended)
+
+ ```bash
+ pip install transformers torch accelerate peft
+ ```
+
+ ### Option 2: Using with LoRA Adapters
+
+ If you want to load adapters separately:
+
+ ```bash
+ pip install transformers torch peft bitsandbytes accelerate
+ ```
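+
+ `bitsandbytes` also enables quantized loading when VRAM is limited. The following is a minimal sketch; the 4-bit settings shown are illustrative assumptions, not settings shipped with the model:
+
+ ```python
+ from transformers import AutoModelForCausalLM, BitsAndBytesConfig
+ import torch
+
+ # Hypothetical 4-bit configuration; tune to your hardware.
+ bnb_config = BitsAndBytesConfig(
+     load_in_4bit=True,
+     bnb_4bit_quant_type="nf4",
+     bnb_4bit_compute_dtype=torch.bfloat16
+ )
+
+ model = AutoModelForCausalLM.from_pretrained(
+     "VANTA-Research/apollo-astralis-v1-4b",
+     quantization_config=bnb_config,
+     device_map="auto"
+ )
+ ```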
+
+ ## Quick Usage
+
+ ### Basic Example
+
+ ```python
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+ import torch
+
+ # Load model
+ model_name = "VANTA-Research/apollo-astralis-v1-4b"
+ tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+ model = AutoModelForCausalLM.from_pretrained(
+     model_name,
+     torch_dtype=torch.bfloat16,
+     device_map="auto",
+     trust_remote_code=True
+ )
+
+ # Generate response
+ messages = [
+     {"role": "system", "content": "You are Apollo-Astralis V1, a warm reasoning assistant."},
+     {"role": "user", "content": "Explain quantum computing in simple terms"}
+ ]
+
+ text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+ inputs = tokenizer([text], return_tensors="pt").to(model.device)
+
+ outputs = model.generate(
+     **inputs,
+     max_new_tokens=512,
+     temperature=0.7,
+     top_p=0.9,
+     do_sample=True
+ )
+
+ response = tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)
+ print(response)
+ ```
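+
+ Apollo emits its reasoning inside `<think>...</think>` before the final answer. Since `</think>` is a special token (ID 151668 in added_tokens.json), a sketch like the following, continuing the example above, separates the reasoning from the reply:
+
+ ```python
+ # Split the generated tokens at the closing </think> tag (token ID 151668).
+ output_ids = outputs[0][inputs['input_ids'].shape[1]:].tolist()
+ try:
+     # Everything before the last </think> is reasoning; the rest is the answer.
+     split_at = len(output_ids) - output_ids[::-1].index(151668)
+ except ValueError:
+     split_at = 0  # No closing tag found; treat the whole output as the answer.
+
+ thinking = tokenizer.decode(output_ids[:split_at], skip_special_tokens=True).strip()
+ answer = tokenizer.decode(output_ids[split_at:], skip_special_tokens=True).strip()
+ print(answer)
+ ```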
+
+ ## Run the Example
+
+ ```bash
+ python example_usage.py
+ ```
+
+ ## System Requirements
+
+ - **Python**: 3.8+
+ - **CUDA**: 11.8+ (for GPU acceleration)
+ - **RAM**: 16GB minimum, 32GB recommended
+ - **GPU VRAM**: 8GB minimum (RTX 3060 or better)
+ - **Disk Space**: 10GB
+
+ ## Next Steps
+
+ 1. Read the full [README.md](./README.md) for detailed documentation
+ 2. Check [example_usage.py](./example_usage.py) for more examples
+ 3. Visit the [HuggingFace Model Card](https://huggingface.co/VANTA-Research/apollo-astralis-v1-4b)
+
+ ## Support
+
+ - **Issues**: [GitHub Issues](https://github.com/vanta-research/apollo-astralis/issues)
+ - **Email**: [email protected]
README.md ADDED
@@ -0,0 +1,307 @@
+ ---
+ language:
+ - en
+ license: apache-2.0
+ base_model: Qwen/Qwen3-4B-Thinking-2507
+ tags:
+ - reasoning
+ - thinking
+ - conversational
+ - warm
+ - empathetic
+ - collaborative
+ - qwen3
+ pipeline_tag: text-generation
+ model-index:
+ - name: Apollo-Astralis V1 4B
+   results:
+   - task:
+       type: text-generation
+     metrics:
+     - name: Enthusiasm Detection
+       type: accuracy
+       value: 100
+     - name: Empathy Recognition
+       type: accuracy
+       value: 90
+     - name: Identity Consistency
+       type: accuracy
+       value: 75
+     - name: Collaborative Tone
+       type: accuracy
+       value: 60
+ ---
+
+ # Apollo-Astralis V1 4B
+
+ **Apollo-Astralis V1 4B** is an advanced conversational reasoning model that combines rigorous logical thinking with warm, enthusiastic, and empathetic communication. Built on Qwen3-4B-Thinking and fine-tuned by VANTA Research, Apollo excels at collaborative problem-solving while maintaining context-appropriate emotional intelligence.
+
+ ## Model Overview
+
+ - **Base Model**: [Qwen/Qwen3-4B-Thinking-2507](https://huggingface.co/Qwen/Qwen3-4B-Thinking-2507)
+ - **Model Type**: Causal Language Model (autoregressive transformer)
+ - **Parameters**: 4.0B total, 33M trainable (~0.8% via LoRA)
+ - **Architecture**: Qwen3 with thinking tag integration
+ - **Training Method**: LoRA fine-tuning (rank=16, alpha=32)
+ - **License**: Apache 2.0
+ - **Developer**: VANTA Research
+ - **Release Date**: October 2025
+
+ ## Key Features
+
+ ### 🧠 Advanced Reasoning
+ - **Explicit Thinking Process**: Uses `<think>` tags to show step-by-step reasoning
+ - **Logical Rigor**: Trained to avoid common fallacies (syllogistic errors, conditional logic mistakes)
+ - **Mathematical Precision**: Shows complete work with verified arithmetic
+ - **Critical Analysis**: Questions assumptions and considers alternative explanations
+
+ ### 💬 Warm Communication
+ - **Enthusiastic Celebrations**: Responds to achievements with explosive energy (CAPS, exclamations)
+ - **Empathetic Support**: Validates feelings and provides gentle, supportive guidance
+ - **Collaborative Style**: Uses "we" language and asks clarifying questions
+ - **Context-Appropriate**: Matches tone to situation (excited for wins, calm for anxiety, neutral for facts)
+
+ ### 🎯 Production-Ready
+ - **Consistent Identity**: Maintains stable self-representation across conversations
+ - **Natural Language**: Uses contractions and conversational phrasing
+ - **Balanced Responses**: Combines analytical thinking with emotional intelligence
+
+ ## Training Details
+
+ ### Training Data
+ Apollo V1 was trained on a curated dataset emphasizing:
+ - **Warmth & Enthusiasm**: High-energy responses to achievements and milestones
+ - **Empathy**: Validating and supportive responses to struggles and anxiety
+ - **Collaboration**: Multi-option problem-solving with clarifying questions
+ - **Identity**: Consistent self-representation as Apollo from VANTA Research
+ - **Reasoning**: Logical problem-solving with explicit thinking steps
+
+ ### Training Configuration
+ ```yaml
+ Base Model: Qwen3-4B-Thinking-2507 (4-bit quantized)
+ Training Epochs: 3
+ Training Steps: 150
+ Batch Size: 4 (per device)
+ Gradient Accumulation: 4 steps
+ Learning Rate: 2e-4
+ LR Scheduler: Cosine with warmup
+ Warmup Steps: 15
+ LoRA Config:
+   Rank: 16
+   Alpha: 32
+   Dropout: 0.05
+   Target Modules: [q_proj, k_proj, v_proj, o_proj, gate_proj, up_proj, down_proj]
+ Optimizer: AdamW (paged_adamw_8bit)
+ Mixed Precision: bf16
+ Gradient Checkpointing: Enabled
+ Max Sequence Length: 2048
+ ```
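+
+ For reference, the LoRA settings above correspond to a PEFT configuration roughly like the following. This is a sketch reconstructed from the listed hyperparameters, not the exact training script:
+
+ ```python
+ from peft import LoraConfig
+
+ # Mirrors the LoRA hyperparameters listed in the configuration above.
+ lora_config = LoraConfig(
+     r=16,
+     lora_alpha=32,
+     lora_dropout=0.05,
+     target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
+                     "gate_proj", "up_proj", "down_proj"],
+     task_type="CAUSAL_LM"
+ )
+ ```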
+
+ ### Training Performance
+ - **Final Loss**: 0.91 (down from 1.82)
+ - **Token Accuracy**: 75.0% (up from 56.7%)
+ - **Gradient Norm**: 0.48-0.69 (stable throughout training)
+ - **Training Time**: ~35 minutes on a single GPU
+
+ ## Benchmark Results
+
+ | Metric | Score | Description |
+ |--------|-------|-------------|
+ | **Enthusiasm Detection** | 100% | Celebrates achievements with high energy |
+ | **Empathy Recognition** | 90% | Validates emotions and provides support |
+ | **Identity Consistency** | 75% | Maintains stable self-representation |
+ | **Collaborative Tone** | 60% | Uses "we" language and asks questions |
+ | **CAPS Word Usage** | 8-12 | Per celebration response (context-appropriate) |
+ | **Exclamation Marks** | 8-15 | Per celebration response (explosive energy) |
+ | **Contraction Usage** | 90% | Natural conversational language |
+
+ ## Usage
+
+ ### Using Transformers (Merged Model)
+
+ ```python
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+ import torch
+
+ # Load model and tokenizer
+ model_name = "VANTA-Research/apollo-astralis-v1-4b"
+ tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+ model = AutoModelForCausalLM.from_pretrained(
+     model_name,
+     torch_dtype=torch.bfloat16,
+     device_map="auto",
+     trust_remote_code=True
+ )
+
+ # Prepare conversation
+ messages = [
+     {"role": "system", "content": "You are Apollo V1, a warm and enthusiastic reasoning assistant."},
+     {"role": "user", "content": "I just got promoted at work!"}
+ ]
+
+ # Generate response
+ text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+ inputs = tokenizer([text], return_tensors="pt").to(model.device)
+
+ outputs = model.generate(
+     **inputs,
+     max_new_tokens=512,
+     temperature=0.7,
+     top_p=0.9,
+     do_sample=True,
+     repetition_penalty=1.05
+ )
+
+ response = tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)
+ print(response)
+ ```
+
+ ### Using with LoRA Adapters
+
+ ```python
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+ from peft import PeftModel
+ import torch
+
+ # Load base model
+ base_model = "Qwen/Qwen3-4B-Thinking-2507"
+ adapter_path = "VANTA-Research/apollo-astralis-v1-4b"
+
+ tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
+ model = AutoModelForCausalLM.from_pretrained(
+     base_model,
+     torch_dtype=torch.bfloat16,
+     device_map="auto",
+     trust_remote_code=True
+ )
+
+ # Load LoRA adapters
+ model = PeftModel.from_pretrained(model, adapter_path)
+
+ # Use as shown above
+ ```
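+
+ If you need a single standalone checkpoint rather than base-plus-adapters, PEFT can fold the adapters into the base weights. A sketch (the output directory name is arbitrary):
+
+ ```python
+ # Merge the LoRA weights into the base model and drop the adapter wrappers.
+ merged_model = model.merge_and_unload()
+ merged_model.save_pretrained("apollo-astralis-v1-4b-merged")
+ tokenizer.save_pretrained("apollo-astralis-v1-4b-merged")
+ ```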
+
+ ### Using with Ollama
+
+ See the [Ollama distribution](https://ollama.com/vanta-research/apollo-astralis-v1-4b) for pre-quantized GGUF models.
+
+ ```bash
+ # Pull from Ollama registry
+ ollama pull vanta-research/apollo-astralis-v1-4b
+
+ # Run interactively
+ ollama run vanta-research/apollo-astralis-v1-4b
+ ```
+
+ ## Example Outputs
+
+ ### Celebration Response
+ **User**: "I just got accepted into my dream university!"
+
+ **Apollo**:
+ ```
+ <think>
+ This is a massive life achievement! The user has overcome significant academic
+ and personal challenges to earn this recognition. I need to match their excitement
+ with genuine enthusiasm, validation of their hard work, and celebration of this
+ major milestone.
+ </think>
+
+ YESSSSSS! CONGRATULATIONS! THIS IS HUGE!
+
+ Do you know how many people work SO hard for this? You studied, you improved,
+ you proved yourself - and NOW you get to build YOUR future!
+
+ This isn't just about a school - it's about YOUR potential being recognized!
+ You've earned this through your dedication!
+
+ **I'm so proud of you!** I want to know EVERYTHING:
+ - Which university is it? (I'm curious!)
+ - How did you feel when you got the email?
+ - What made you work so hard for this?
+
+ This is a HUGE step in your life! You're going to be successful!
+ I have total confidence in you!
+ ```
+
+ ### Problem-Solving Response
+ **User**: "What's the best approach to learning photography?"
+
+ **Apollo**:
+ ```
+ <think>
+ Photography learning can be technical or creative-focused. I should offer
+ different pathways that suit different goals and learning styles.
+ Enthusiasm for the art form should shine through while keeping it accessible.
+ </think>
+
+ Ooh, photography is such a wonderful skill to develop! It literally changes
+ how you see the world! 📸✨
+
+ **Option A: Technical Fundamentals First**
+ Master the exposure triangle (aperture, shutter speed, ISO) and composition rules.
+ Perfect if you like building solid foundations!
+
+ **Option B: Project-Based Learning**
+ Give yourself weekly themes ("shadows," "portraits," "architecture") and learn
+ what you need for each project. Learn through doing!
+
+ **Option C: Genre-Focused Path**
+ Pick one genre (landscape, street, portrait) and dive deep. Great if you have
+ a clear photographic interest!
+
+ What excites you most about photography? Is it capturing memories, artistic
+ expression, or technical mastery? 🌟
+ ```
+
+ ## Limitations
+
+ - **Enthusiasm Calibration**: May use energetic language even for empathetic responses (trained behavior)
+ - **Context Window**: Fine-tuning used 2048-token sequences; the base model's much longer context (262,144 positions per config.json) is untested for this release
+ - **Language**: Primarily English (the base model supports multiple languages, but fine-tuning was English-only)
+ - **Reasoning Depth**: Best for conversational reasoning; not optimized for competition-level mathematics
+ - **Model Size**: At 4B parameters, the model may struggle with extremely specialized technical domains
+
+ ## Ethical Considerations
+
+ - **Warmth vs. Professionalism**: Apollo's enthusiastic style may not be appropriate for all contexts
+ - **Emotional Support**: Not a replacement for professional mental health services
+ - **Bias**: Inherits biases from the Qwen3-4B-Thinking base model; use with caution in sensitive applications
+ - **Factuality**: May generate plausible-sounding but incorrect information; verify critical facts
+
+ ## Citation
+
+ If you use Apollo-Astralis V1 4B in your research or applications, please cite:
+
+ ```bibtex
+ @misc{apollo-astralis-v1-4b,
+   title={Apollo-Astralis V1 4B: A Warm Reasoning Model},
+   author={VANTA Research},
+   year={2025},
+   month={October},
+   publisher={HuggingFace},
+   howpublished={\url{https://huggingface.co/VANTA-Research/apollo-astralis-v1-4b}},
+ }
+ ```
+
+ ## License
+
+ This model is released under the Apache License 2.0. See [LICENSE](./LICENSE) for details.
+
+ ## Acknowledgments
+
+ - **Base Model**: [Qwen3-4B-Thinking](https://huggingface.co/Qwen/Qwen3-4B-Thinking-2507) by Alibaba Cloud
+ - **Training Framework**: Hugging Face Transformers + PEFT
+ - **Quantization**: llama.cpp for GGUF conversion
+
+ ## Contact
+
+ - **Developer**: VANTA Research
+ - **Issues**: [GitHub Issues](https://github.com/vanta-research/apollo-astralis/issues)
+ - **Email**: [email protected]
+
+ ---
+
+ **Model Version**: 1.0 (Apollo-Astralis V1 4B)
+ **Release Date**: October 3, 2025
+ **Last Updated**: October 3, 2025
added_tokens.json ADDED
@@ -0,0 +1,28 @@
+ {
+   "</think>": 151668,
+   "</tool_call>": 151658,
+   "</tool_response>": 151666,
+   "<think>": 151667,
+   "<tool_call>": 151657,
+   "<tool_response>": 151665,
+   "<|box_end|>": 151649,
+   "<|box_start|>": 151648,
+   "<|endoftext|>": 151643,
+   "<|file_sep|>": 151664,
+   "<|fim_middle|>": 151660,
+   "<|fim_pad|>": 151662,
+   "<|fim_prefix|>": 151659,
+   "<|fim_suffix|>": 151661,
+   "<|im_end|>": 151645,
+   "<|im_start|>": 151644,
+   "<|image_pad|>": 151655,
+   "<|object_ref_end|>": 151647,
+   "<|object_ref_start|>": 151646,
+   "<|quad_end|>": 151651,
+   "<|quad_start|>": 151650,
+   "<|repo_name|>": 151663,
+   "<|video_pad|>": 151656,
+   "<|vision_end|>": 151653,
+   "<|vision_pad|>": 151654,
+   "<|vision_start|>": 151652
+ }
chat_template.jinja ADDED
@@ -0,0 +1,86 @@
+ {%- if tools %}
+     {{- '<|im_start|>system\n' }}
+     {%- if messages[0].role == 'system' %}
+         {{- messages[0].content + '\n\n' }}
+     {%- endif %}
+     {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+     {%- for tool in tools %}
+         {{- "\n" }}
+         {{- tool | tojson }}
+     {%- endfor %}
+     {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+ {%- else %}
+     {%- if messages[0].role == 'system' %}
+         {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+     {%- endif %}
+ {%- endif %}
+ {%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+ {%- for message in messages[::-1] %}
+     {%- set index = (messages|length - 1) - loop.index0 %}
+     {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+         {%- set ns.multi_step_tool = false %}
+         {%- set ns.last_query_index = index %}
+     {%- endif %}
+ {%- endfor %}
+ {%- for message in messages %}
+     {%- if message.content is string %}
+         {%- set content = message.content %}
+     {%- else %}
+         {%- set content = '' %}
+     {%- endif %}
+     {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+         {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
+     {%- elif message.role == "assistant" %}
+         {%- set reasoning_content = '' %}
+         {%- if message.reasoning_content is string %}
+             {%- set reasoning_content = message.reasoning_content %}
+         {%- else %}
+             {%- if '</think>' in content %}
+                 {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+                 {%- set content = content.split('</think>')[-1].lstrip('\n') %}
+             {%- endif %}
+         {%- endif %}
+         {%- if loop.index0 > ns.last_query_index %}
+             {%- if loop.last or (not loop.last and reasoning_content) %}
+                 {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+             {%- else %}
+                 {{- '<|im_start|>' + message.role + '\n' + content }}
+             {%- endif %}
+         {%- else %}
+             {{- '<|im_start|>' + message.role + '\n' + content }}
+         {%- endif %}
+         {%- if message.tool_calls %}
+             {%- for tool_call in message.tool_calls %}
+                 {%- if (loop.first and content) or (not loop.first) %}
+                     {{- '\n' }}
+                 {%- endif %}
+                 {%- if tool_call.function %}
+                     {%- set tool_call = tool_call.function %}
+                 {%- endif %}
+                 {{- '<tool_call>\n{"name": "' }}
+                 {{- tool_call.name }}
+                 {{- '", "arguments": ' }}
+                 {%- if tool_call.arguments is string %}
+                     {{- tool_call.arguments }}
+                 {%- else %}
+                     {{- tool_call.arguments | tojson }}
+                 {%- endif %}
+                 {{- '}\n</tool_call>' }}
+             {%- endfor %}
+         {%- endif %}
+         {{- '<|im_end|>\n' }}
+     {%- elif message.role == "tool" %}
+         {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+             {{- '<|im_start|>user' }}
+         {%- endif %}
+         {{- '\n<tool_response>\n' }}
+         {{- content }}
+         {{- '\n</tool_response>' }}
+         {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+             {{- '<|im_end|>\n' }}
+         {%- endif %}
+     {%- endif %}
+ {%- endfor %}
+ {%- if add_generation_prompt %}
+     {{- '<|im_start|>assistant\n<think>\n' }}
+ {%- endif %}
config.json ADDED
@@ -0,0 +1,68 @@
+ {
+   "architectures": [
+     "Qwen3ForCausalLM"
+   ],
+   "attention_bias": false,
+   "attention_dropout": 0.0,
+   "bos_token_id": 151643,
+   "dtype": "bfloat16",
+   "eos_token_id": 151645,
+   "head_dim": 128,
+   "hidden_act": "silu",
+   "hidden_size": 2560,
+   "initializer_range": 0.02,
+   "intermediate_size": 9728,
+   "layer_types": [
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention"
+   ],
+   "max_position_embeddings": 262144,
+   "max_window_layers": 36,
+   "model_type": "qwen3",
+   "num_attention_heads": 32,
+   "num_hidden_layers": 36,
+   "num_key_value_heads": 8,
+   "rms_norm_eps": 1e-06,
+   "rope_scaling": null,
+   "rope_theta": 5000000,
+   "sliding_window": null,
+   "tie_word_embeddings": true,
+   "transformers_version": "4.56.2",
+   "use_cache": true,
+   "use_sliding_window": false,
+   "vocab_size": 151936
+ }
example_usage.py ADDED
@@ -0,0 +1,99 @@
+ """
+ Apollo-Astralis V1 4B - Example Usage
+
+ This script demonstrates how to use Apollo-Astralis V1 4B with Transformers.
+ """
+
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+ import torch
+
+ def load_model(model_name="VANTA-Research/apollo-astralis-v1-4b"):
+     """Load Apollo-Astralis model and tokenizer."""
+     print(f"Loading {model_name}...")
+
+     tokenizer = AutoTokenizer.from_pretrained(
+         model_name,
+         trust_remote_code=True
+     )
+
+     model = AutoModelForCausalLM.from_pretrained(
+         model_name,
+         torch_dtype=torch.bfloat16,
+         device_map="auto",
+         trust_remote_code=True
+     )
+
+     print("Model loaded successfully!")
+     return model, tokenizer
+
+ def generate_response(model, tokenizer, user_message, system_prompt=None):
+     """Generate a response from Apollo."""
+     if system_prompt is None:
+         system_prompt = "You are Apollo-Astralis V1, a warm and enthusiastic reasoning assistant."
+
+     messages = [
+         {"role": "system", "content": system_prompt},
+         {"role": "user", "content": user_message}
+     ]
+
+     # Apply chat template
+     text = tokenizer.apply_chat_template(
+         messages,
+         tokenize=False,
+         add_generation_prompt=True
+     )
+
+     # Tokenize
+     inputs = tokenizer([text], return_tensors="pt").to(model.device)
+
+     # Generate
+     outputs = model.generate(
+         **inputs,
+         max_new_tokens=512,
+         temperature=0.7,
+         top_p=0.9,
+         do_sample=True,
+         repetition_penalty=1.05
+     )
+
+     # Decode
+     response = tokenizer.decode(
+         outputs[0][inputs['input_ids'].shape[1]:],
+         skip_special_tokens=True
+     )
+
+     return response
+
+ def main():
+     # Load model
+     model, tokenizer = load_model()
+
+     # Example 1: Celebration
+     print("\n" + "="*60)
+     print("Example 1: Celebration Response")
+     print("="*60)
+     user_msg = "I just got my first job as a software engineer!"
+     print(f"\nUser: {user_msg}")
+     response = generate_response(model, tokenizer, user_msg)
+     print(f"\nApollo: {response}")
+
+     # Example 2: Problem-solving
+     print("\n" + "="*60)
+     print("Example 2: Problem-Solving")
+     print("="*60)
+     user_msg = "What's the best way to learn machine learning?"
+     print(f"\nUser: {user_msg}")
+     response = generate_response(model, tokenizer, user_msg)
+     print(f"\nApollo: {response}")
+
+     # Example 3: Mathematical reasoning
+     print("\n" + "="*60)
+     print("Example 3: Mathematical Reasoning")
+     print("="*60)
+     user_msg = "If a train travels 120 km in 1.5 hours, what's its average speed?"
+     print(f"\nUser: {user_msg}")
+     response = generate_response(model, tokenizer, user_msg)
+     print(f"\nApollo: {response}")
+
+ if __name__ == "__main__":
+     main()
generation_config.json ADDED
@@ -0,0 +1,13 @@
+ {
+   "bos_token_id": 151643,
+   "do_sample": true,
+   "eos_token_id": [
+     151645,
+     151643
+   ],
+   "pad_token_id": 151643,
+   "temperature": 0.6,
+   "top_k": 20,
+   "top_p": 0.95,
+   "transformers_version": "4.56.2"
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6d8d5c20fd2011320900b0b6e7ab8726c0b32903a4e1bca53db61d53a06fdc15
+ size 4967215360
model-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:201b6d83ab95cf33c5c3fae3b04689a849003496260413f3a5c1aea6f0d7fcdd
+ size 3077766632
model.safetensors.index.json ADDED
@@ -0,0 +1,406 @@
+ {
+   "metadata": {
+     "total_parameters": 4022468096,
+     "total_size": 8044936192
+   },
+   "weight_map": {
+     "model.embed_tokens.weight": "model-00001-of-00002.safetensors",
+     "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.0.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.0.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.1.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.1.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.10.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.10.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.11.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.11.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.12.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.12.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.13.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.13.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.14.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.14.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.15.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.15.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.16.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.16.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.17.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.17.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.18.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.18.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.19.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.19.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.2.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.2.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.20.input_layernorm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.20.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.20.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.20.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.21.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.21.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.21.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.22.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.22.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.23.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.23.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.24.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.24.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.25.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.25.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.26.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.26.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.27.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.27.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.28.input_layernorm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.28.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.28.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.28.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.28.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.28.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.28.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.28.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.28.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.29.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.29.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.29.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.29.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.29.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.29.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.29.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.29.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.3.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.3.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.30.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.30.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.30.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.30.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.30.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.30.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.30.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.31.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.31.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.31.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.31.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.31.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.31.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.31.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.32.input_layernorm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.32.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.32.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.32.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.32.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.32.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.32.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.32.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.32.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.32.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.32.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.33.input_layernorm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.33.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.33.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.33.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.33.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.33.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.33.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.33.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.33.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.33.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.33.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.34.input_layernorm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.34.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.34.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.34.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.34.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.34.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.34.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.34.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
324
+ "model.layers.34.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
325
+ "model.layers.34.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
326
+ "model.layers.34.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
327
+ "model.layers.35.input_layernorm.weight": "model-00002-of-00002.safetensors",
328
+ "model.layers.35.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
329
+ "model.layers.35.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
330
+ "model.layers.35.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
331
+ "model.layers.35.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
332
+ "model.layers.35.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
333
+ "model.layers.35.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
334
+ "model.layers.35.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
335
+ "model.layers.35.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
336
+ "model.layers.35.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
337
+ "model.layers.35.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
338
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors",
339
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
340
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
341
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
342
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
343
+ "model.layers.4.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
344
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
345
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
346
+ "model.layers.4.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
347
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
348
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
349
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors",
350
+ "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
351
+ "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
352
+ "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
353
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
354
+ "model.layers.5.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
355
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
356
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
357
+ "model.layers.5.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
358
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
359
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
360
+ "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors",
361
+ "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
362
+ "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
363
+ "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
364
+ "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
365
+ "model.layers.6.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
366
+ "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
367
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
368
+ "model.layers.6.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
369
+ "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
370
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
371
+ "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors",
372
+ "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
373
+ "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
374
+ "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
375
+ "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
376
+ "model.layers.7.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
377
+ "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
378
+ "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
379
+ "model.layers.7.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
380
+ "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
381
+ "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
382
+ "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors",
383
+ "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
384
+ "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
385
+ "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
386
+ "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
387
+ "model.layers.8.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
388
+ "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
389
+ "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
390
+ "model.layers.8.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
391
+ "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
392
+ "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
393
+ "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors",
394
+ "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
395
+ "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
396
+ "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
397
+ "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
398
+ "model.layers.9.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
399
+ "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
400
+ "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
401
+ "model.layers.9.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
402
+ "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
403
+ "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
404
+ "model.norm.weight": "model-00002-of-00002.safetensors"
405
+ }
406
+ }
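
The closing braces above complete the `weight_map` of `model.safetensors.index.json`, which assigns every tensor name to one of the two shards; `transformers` consults this index at load time to pull each weight from the right file. A minimal sketch for inspecting the index, assuming a local clone of the repo (the relative path is an assumption):

```python
import json
from collections import Counter

# Read the shard index committed above (path assumes a local clone).
with open("model.safetensors.index.json") as f:
    index = json.load(f)

weight_map = index["weight_map"]

# Count how many tensors live in each shard file.
for shard, n_tensors in sorted(Counter(weight_map.values()).items()):
    print(f"{shard}: {n_tensors} tensors")

# Spot-check a single entry, e.g. the final norm shown above.
print(weight_map["model.norm.weight"])  # -> model-00002-of-00002.safetensors
```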
special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
+ {
+   "additional_special_tokens": [
+     "<|im_start|>",
+     "<|im_end|>",
+     "<|object_ref_start|>",
+     "<|object_ref_end|>",
+     "<|box_start|>",
+     "<|box_end|>",
+     "<|quad_start|>",
+     "<|quad_end|>",
+     "<|vision_start|>",
+     "<|vision_end|>",
+     "<|vision_pad|>",
+     "<|image_pad|>",
+     "<|video_pad|>"
+   ],
+   "eos_token": {
+     "content": "<|im_end|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
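
This map registers the ChatML-style control tokens, sets `<|im_end|>` as the end-of-sequence token, and reuses `<|endoftext|>` for padding. A minimal sketch, again assuming a local clone of this repo, that loads the tokenizer and confirms the mapping:

```python
from transformers import AutoTokenizer

# Load from the current directory (path is an assumption; substitute
# the hub id once the model is published).
tok = AutoTokenizer.from_pretrained("./")

print(tok.eos_token)  # <|im_end|>
print(tok.pad_token)  # <|endoftext|>

# Additional special tokens are atomic: they never get split.
ids = tok.encode("<|im_start|>user", add_special_tokens=False)
print(ids[0])  # 151644, per the added_tokens_decoder in tokenizer_config.json
```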
tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:aeb13307a71acd8fe81861d94ad54ab689df773318809eed3cbe794b4492dae4
+ size 11422654
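
`tokenizer.json` is tracked through Git LFS (per the `.gitattributes` rules), so the committed file is only a pointer; the real ~11 MB artifact is fetched on checkout. The pointer's `oid` and `size` can be used to verify a download, as in this sketch:

```python
import hashlib
import os

# Expected values taken verbatim from the LFS pointer above.
EXPECTED_OID = "aeb13307a71acd8fe81861d94ad54ab689df773318809eed3cbe794b4492dae4"
EXPECTED_SIZE = 11422654

path = "tokenizer.json"  # assumes the LFS object has been pulled locally
assert os.path.getsize(path) == EXPECTED_SIZE, "size mismatch"

# Stream the file in 1 MiB chunks to avoid holding it all in memory.
h = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        h.update(chunk)
assert h.hexdigest() == EXPECTED_OID, "hash mismatch"
print("tokenizer.json matches the LFS pointer")
```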
tokenizer_config.json ADDED
@@ -0,0 +1,239 @@
+ {
+   "add_bos_token": false,
+   "add_prefix_space": false,
+   "added_tokens_decoder": {
+     "151643": {
+       "content": "<|endoftext|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151644": {
+       "content": "<|im_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151645": {
+       "content": "<|im_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151646": {
+       "content": "<|object_ref_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151647": {
+       "content": "<|object_ref_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151648": {
+       "content": "<|box_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151649": {
+       "content": "<|box_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151650": {
+       "content": "<|quad_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151651": {
+       "content": "<|quad_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151652": {
+       "content": "<|vision_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151653": {
+       "content": "<|vision_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151654": {
+       "content": "<|vision_pad|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151655": {
+       "content": "<|image_pad|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151656": {
+       "content": "<|video_pad|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151657": {
+       "content": "<tool_call>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151658": {
+       "content": "</tool_call>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151659": {
+       "content": "<|fim_prefix|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151660": {
+       "content": "<|fim_middle|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151661": {
+       "content": "<|fim_suffix|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151662": {
+       "content": "<|fim_pad|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151663": {
+       "content": "<|repo_name|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151664": {
+       "content": "<|file_sep|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151665": {
+       "content": "<tool_response>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151666": {
+       "content": "</tool_response>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151667": {
+       "content": "<think>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151668": {
+       "content": "</think>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     }
+   },
+   "additional_special_tokens": [
+     "<|im_start|>",
+     "<|im_end|>",
+     "<|object_ref_start|>",
+     "<|object_ref_end|>",
+     "<|box_start|>",
+     "<|box_end|>",
+     "<|quad_start|>",
+     "<|quad_end|>",
+     "<|vision_start|>",
+     "<|vision_end|>",
+     "<|vision_pad|>",
+     "<|image_pad|>",
+     "<|video_pad|>"
+   ],
+   "bos_token": null,
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "<|im_end|>",
+   "errors": "replace",
+   "extra_special_tokens": {},
+   "model_max_length": 262144,
+   "pad_token": "<|endoftext|>",
+   "split_special_tokens": false,
+   "tokenizer_class": "Qwen2Tokenizer",
+   "unk_token": null
+ }
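
The config registers the full Qwen-style added-token table (including the `<think>`/`</think>` reasoning markers), declares `Qwen2Tokenizer`, and sets `model_max_length` to 262,144 tokens. No `chat_template` field appears in this diff, so the sketch below builds a ChatML prompt by hand from the declared `<|im_start|>`/`<|im_end|>` tokens; this assumes the model follows the ChatML convention implied by its `eos_token`, and any chat template shipped elsewhere in the repo should take precedence.

```python
# Hand-rolled ChatML formatting from the tokens declared in
# tokenizer_config.json above (an assumption, not the repo's own template).
def chatml_prompt(system: str, user: str) -> str:
    return (
        f"<|im_start|>system\n{system}<|im_end|>\n"
        f"<|im_start|>user\n{user}<|im_end|>\n"
        f"<|im_start|>assistant\n"  # generation continues from here
    )

print(chatml_prompt("You are a helpful assistant.", "Hello!"))
```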
vocab.json ADDED
The diff for this file is too large to render. See raw diff