Upload 8 files
- .gitattributes +7 -0
- README.md +110 -3
- qwen3-0.6B-Q3_K.gguf +3 -0
- qwen3-0.6B-Q4_0.gguf +3 -0
- qwen3-0.6B-Q4_K_M.gguf +3 -0
- qwen3-0.6B-Q5_K_M.gguf +3 -0
- qwen3-0.6B-Q8_0.gguf +3 -0
- qwen3-0.6B-f16.gguf +3 -0
- qwen3-0.6B-f32.gguf +3 -0
.gitattributes
CHANGED
@@ -33,3 +33,10 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+qwen3-0.6B-f16.gguf filter=lfs diff=lfs merge=lfs -text
+qwen3-0.6B-f32.gguf filter=lfs diff=lfs merge=lfs -text
+qwen3-0.6B-Q3_K.gguf filter=lfs diff=lfs merge=lfs -text
+qwen3-0.6B-Q4_0.gguf filter=lfs diff=lfs merge=lfs -text
+qwen3-0.6B-Q4_K_M.gguf filter=lfs diff=lfs merge=lfs -text
+qwen3-0.6B-Q5_K_M.gguf filter=lfs diff=lfs merge=lfs -text
+qwen3-0.6B-Q8_0.gguf filter=lfs diff=lfs merge=lfs -text
README.md
CHANGED
@@ -1,3 +1,110 @@
# Qwen3-0.6B-T5-xxl-GGUF

## Model Description

This repository provides GGUF-quantized versions of the `Qwen3-0.6B-T5-xxl` model body, designed for fast, low-resource inference on CPUs.

The goal of this project is to replicate the embedding outputs of `google/t5-v1_1-xxl` using a much smaller, highly optimized pipeline.

To make this repository fully functional out of the box, the fine-tuned **projection head is also included**. This allows you to combine the GGUF model body with the PyTorch-based head to obtain the final 4096-dimensional embeddings.
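
To make the data flow concrete, here is a shape-level sketch of the two stages. The 1024-wide hidden state is an assumption for Qwen3-0.6B; the full script below reads the actual width from the loaded model via `n_embd()` rather than hardcoding it.

```python
import torch
from torch import nn

# Shape-level sketch of the pipeline. body_dim = 1024 is an assumed value
# for Qwen3-0.6B's hidden size; the real script queries it at runtime.
body_dim, hidden_dim, out_dim = 1024, 2048, 4096

# Same head architecture the included projection_head.pth weights expect
head = nn.Sequential(
    nn.Linear(body_dim, hidden_dim),
    nn.GELU(),
    nn.Dropout(0.1),
    nn.Linear(hidden_dim, out_dim),
)

pooled = torch.randn(1, body_dim)  # stand-in for a mean-pooled GGUF embedding
print(head(pooled).shape)          # torch.Size([1, 4096])
```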

## Repository Contents

- `qwen3-0.6B-*.gguf`: The model body in several quantizations (Q3_K, Q4_0, Q4_K_M, Q5_K_M, Q8_0), plus unquantized f16 and f32 conversions.
- `projection_head/projection_head.pth`: The PyTorch state dictionary for the final projection layer.
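
If you prefer to fetch individual files programmatically instead of cloning the whole repository, here is a minimal sketch using `huggingface_hub`; the `repo_id` below is a placeholder, so substitute this repository's actual path on the Hub.

```python
# Minimal download sketch (pip install huggingface_hub).
# NOTE: the repo_id below is a placeholder -- replace it with this
# repository's actual path on the Hugging Face Hub.
from huggingface_hub import hf_hub_download

gguf_path = hf_hub_download(
    repo_id="your-username/Qwen3-0.6B-T5-xxl-GGUF",  # placeholder
    filename="qwen3-0.6B-Q4_K_M.gguf",
)
head_path = hf_hub_download(
    repo_id="your-username/Qwen3-0.6B-T5-xxl-GGUF",  # placeholder
    filename="projection_head/projection_head.pth",
)
```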

## How to Use: Hybrid GGUF + PyTorch Pipeline

This tutorial shows how to use the GGUF model for fast base-embedding generation and the PyTorch head for the final projection.

### Step 1: Prerequisites

First, install the necessary libraries. `llama-cpp-python` is required to run GGUF models.

```bash
pip install llama-cpp-python torch numpy
```

### Step 2: Inference Script

The following script encapsulates the entire hybrid pipeline in a convenient class. You can save it as a `.py` file and import it into your projects.

```python
import numpy as np
import torch
from torch import nn
from llama_cpp import Llama


class HybridEmbedder:
    """
    Encapsulates the hybrid embedding pipeline.
    Loads both models once at initialization for optimal performance.
    """
    def __init__(self, gguf_path: str, head_path: str, n_ctx: int = 512):
        print("Initializing HybridEmbedder...")

        # 1. Load the GGUF body in embedding mode
        print(f"Loading GGUF body from: {gguf_path}")
        self.body_model = Llama(
            model_path=gguf_path,
            embedding=True,
            n_ctx=n_ctx,
            verbose=False,
        )
        print(" -> GGUF body loaded.")

        # 2. Rebuild the projection head and load its weights. The hidden
        #    and output sizes must match the architecture used in training.
        print(f"Loading projection head from: {head_path}")
        input_dim = self.body_model.n_embd()
        hidden_dim = 2048
        output_dim = 4096

        self.head_model = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.GELU(),
            nn.Dropout(0.1),
            nn.Linear(hidden_dim, output_dim),
        )
        self.head_model.load_state_dict(torch.load(head_path, map_location="cpu"))
        self.head_model.eval()
        print(" -> Projection head loaded.")
        print("\n✅ Embedder is ready to use.")

    def get_embedding(self, text: str) -> torch.Tensor:
        # a) Get the sequence of per-token embeddings from the GGUF model
        token_embeddings = self.body_model.embed(text)

        # b) Mean-pool the token embeddings into a single sentence vector
        sentence_embedding = np.mean(token_embeddings, axis=0)

        # c) Convert to a PyTorch tensor and add a batch dimension
        sentence_tensor = torch.tensor(sentence_embedding).unsqueeze(0)

        # d) Project up to the final 4096-dimensional space
        with torch.no_grad():
            final_embedding = self.head_model(sentence_tensor.float())

        return final_embedding


# --- Example Usage ---
if __name__ == "__main__":
    # Define the paths to your local model files
    GGUF_FILE = "qwen3-0.6B-Q4_K_M.gguf"
    HEAD_FILE = "./projection_head/projection_head.pth"

    # Create an instance of the embedder
    embedder = HybridEmbedder(gguf_path=GGUF_FILE, head_path=HEAD_FILE)

    # Use the embedder to get vectors
    prompt = "A sprawling fantasy city built into a giant tree."
    embedding = embedder.get_embedding(prompt)

    print("\n--- Inference Test ---")
    print(f"Prompt: '{prompt}'")
    print(f"Output dimension: {embedding.shape}")
    print(f"Vector excerpt: {embedding[0, :5]}...")
```
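
Since the goal is to approximate `google/t5-v1_1-xxl`, you may want to sanity-check the hybrid output against the reference encoder. Below is a minimal verification sketch; it assumes `transformers` and `sentencepiece` are installed, that mean pooling matches how the head was trained, and that the script above was saved as `hybrid_embedder.py` (a placeholder name). Note that loading the xxl encoder itself requires a very large amount of memory.

```python
# Verification sketch: compare the hybrid pipeline against the reference
# T5 encoder. Assumes transformers + sentencepiece are installed.
import torch
from transformers import AutoTokenizer, T5EncoderModel

# The script above, saved locally as hybrid_embedder.py (placeholder name)
from hybrid_embedder import HybridEmbedder

embedder = HybridEmbedder("qwen3-0.6B-Q4_K_M.gguf", "./projection_head/projection_head.pth")

tokenizer = AutoTokenizer.from_pretrained("google/t5-v1_1-xxl")
reference = T5EncoderModel.from_pretrained("google/t5-v1_1-xxl")  # needs lots of RAM
reference.eval()

text = "A sprawling fantasy city built into a giant tree."
inputs = tokenizer(text, return_tensors="pt")
with torch.no_grad():
    hidden = reference(**inputs).last_hidden_state  # (1, seq_len, 4096)
ref_embedding = hidden.mean(dim=1)                  # mean pooling -> (1, 4096)

hybrid_embedding = embedder.get_embedding(text)
similarity = torch.nn.functional.cosine_similarity(ref_embedding, hybrid_embedding)
print(f"Cosine similarity vs. t5-v1_1-xxl: {similarity.item():.4f}")
```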

## License

This repository is licensed under the **MIT License**.
qwen3-0.6B-Q3_K.gguf
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:148dcefa5526e89a80f694dbafaa5b250bb00b1b3f6e68bc009722dd7a629796
+size 346896032
qwen3-0.6B-Q4_0.gguf
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2ffaad38b7334551e0ffba91f3cc08017a1ee418b8e40f8b61048ee20e41da23
+size 381335200
qwen3-0.6B-Q4_K_M.gguf
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1d058f00eeba52a245283d1a3ff8c13e91c6e6ced1f0363f37c620c3e34316ee
+size 396474016
qwen3-0.6B-Q5_K_M.gguf
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:110961bdcc0c3106d29c5c976df1712bf1999c93ac828d3ed457b7d326112e7d
+size 444184224
qwen3-0.6B-Q8_0.gguf
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:739d45f900f3fe7af64d174de5646a6fb78aa8e4bb1fde5e1a15f82cbce0496b
+size 639150048
qwen3-0.6B-f16.gguf
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3dd44d5dc85e73b91bc423092e3b6121c4b97195ad129c9359c575c3929c32ad
+size 1197629088
qwen3-0.6B-f32.gguf
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:155984a1ddc3e652a50cff9504e1e2dba1529e9fe5e062519139e4cf5ce80dc4
+size 2389051040