Different embedding?
How can I load this model using AutoModel.from_pretrained like other embedding models?
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

class HFEmbeddingModel:
    def __init__(self, model_name_or_path, device=None, normalize=True, pooling="mean"):
        self.model_name_or_path = model_name_or_path
        self.device = device if device is not None else ("cuda" if torch.cuda.is_available() else "cpu")
        self.normalize = normalize
        self.pooling = pooling  # "mean", "cls", or others you want to support
        self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
        self.model = AutoModel.from_pretrained(model_name_or_path)
        self.model.to(self.device)
        self.model.eval()

    @torch.no_grad()
    def embed(self, texts, max_length=128):
        """
        texts: str or list of str
        returns: tensor of shape (len(texts), hidden_size)
        """
        if isinstance(texts, str):
            texts = [texts]
        enc = self.tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors="pt"
        ).to(self.device)
        model_out = self.model(**enc)
        # pooling
        if self.pooling == "mean":
            token_embeddings = model_out.last_hidden_state  # (batch_size, seq_len, hidden_dim)
            attention_mask = enc.attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
            sum_embeddings = torch.sum(token_embeddings * attention_mask, dim=1)
            sum_mask = torch.clamp(attention_mask.sum(dim=1), min=1e-9)
            pooled = sum_embeddings / sum_mask
        elif self.pooling == "cls":
            pooled = model_out.last_hidden_state[:, 0]  # [CLS] token
        else:
            raise ValueError(f"Pooling mode {self.pooling} not supported")
        if self.normalize:
            pooled = F.normalize(pooled, p=2, dim=1)
        return pooled.cpu()

    def similarity(self, query_emb, doc_embs):
        """
        Compute similarity between one query embedding and multiple document embeddings.
        Dot product on L2-normalized embeddings is equivalent to cosine similarity.
        """
        # embeddings are already normalized in embed() when normalize=True
        return torch.matmul(query_emb, doc_embs.T)
# Usage:
model_name = "google/embeddinggemma-300m"  # or your HF checkpoint
embedder = HFEmbeddingModel(model_name_or_path=model_name, normalize=True, pooling="cls")

query = "Which planet is known as the Red Planet?"
documents = [
    "Venus is often called Earth's twin because of its similar size and proximity.",
    "Mars, known for its reddish appearance, is often referred to as the Red Planet.",
    "Jupiter, the largest planet in our solar system, has a prominent red spot.",
    "Saturn, famous for its rings, is sometimes mistaken for the Red Planet."
]

query_emb = embedder.embed(query)
doc_embs = embedder.embed(documents)
print(query_emb.shape, doc_embs.shape)  # (1, hidden_dim), (4, hidden_dim)

sims = embedder.similarity(query_emb, doc_embs)
print(sims)
Hi @Ratar37003,
Welcome to Gemma models, and thanks for reaching out to us. The google/embeddinggemma-300m model can be loaded with AutoModel; however, the additional functionality of Sentence Transformers can't be used that way, so we have to design custom encoding and similarity functions. I have gone through the code you provided in the comment above, but the embed and similarity functions need to be corrected, as I'm seeing 100% similarity scores for every doc sample. It's not an issue with the model: I have run the same similarity check using Sentence Transformers. Please find the attached gist file for your reference.
Thanks.
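For anyone who wants to reproduce that sanity check without the gist, here is a minimal sketch (not the attached gist itself) using sentence-transformers, assuming a version recent enough to provide model.similarity():

from sentence_transformers import SentenceTransformer

model = SentenceTransformer("google/embeddinggemma-300m")

query = "Which planet is known as the Red Planet?"
documents = [
    "Venus is often called Earth's twin because of its similar size and proximity.",
    "Mars, known for its reddish appearance, is often referred to as the Red Planet.",
    "Jupiter, the largest planet in our solar system, has a prominent red spot.",
    "Saturn, famous for its rings, is sometimes mistaken for the Red Planet."
]

# encode() runs the full ST pipeline (pooling, Dense projections, Normalize)
query_emb = model.encode(query)
doc_embs = model.encode(documents)

# similarity() uses the similarity function configured in the checkpoint
print(model.similarity(query_emb, doc_embs))

The scores should now differ across documents, with the Mars sentence ranking highest.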
Hi @Ratar37003 ,
I had similar issues with "pure" Transformers embeddings.
The main difference between ST (SentenceTransformers) and the Transformers base model in this repo is the projection layers (modules) that you find in the x_Dense folders.
See here for an overview of ST model structures:
https://sbert.net/docs/sentence_transformer/usage/custom_models.html#structure-of-sentence-transformer-models
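A quick way to see that structure for this checkpoint is to print the repo's modules.json (a minimal sketch; the exact paths and types depend on how the model is packaged):

import json
from huggingface_hub import hf_hub_download

modules_path = hf_hub_download("google/embeddinggemma-300m", filename="modules.json")
for m in json.load(open(modules_path)):
    print(m["path"], m["type"])
# Expect a chain like: Transformer -> Pooling -> Dense -> Dense -> Normalize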
Below is a minimal example that I got ChatGPT to generate, then debugged and improved a bit.
FYI: at the end you'll find a comparison with ST, and I am using cosine-sim from ST itself.
import json, torch
from transformers import AutoTokenizer, AutoModel
from huggingface_hub import hf_hub_download
from safetensors.torch import load_file as load_safetensors
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
repo_id = "google/embeddinggemma-300m" # you must accept the license on HF first
# 1) Discover module paths (robust; avoids guessing folder names)
modules_path = hf_hub_download(repo_id, filename="modules.json")
modules = json.load(open(modules_path))
# Find subfolders
xf_sub = next(m["path"] for m in modules if "Transformer" in m["type"]) # e.g. "0_Transformer" or "0_Gemma3Text"
pool_sub = next(m["path"] for m in modules if "Pooling" in m["type"])
dense_subs = [m["path"] for m in modules if "Dense" in m["type"]]
norm_exists = any("Normalize" in m["type"] for m in modules)
# 2) Load backbone (Gemma text) + tokenizer from that subfolder
tok = AutoTokenizer.from_pretrained(repo_id, subfolder=xf_sub, trust_remote_code=True)
backbone = AutoModel.from_pretrained(repo_id, subfolder=xf_sub, trust_remote_code=True)
# 3) Prepare inputs (adjust prompts to your task if needed)
texts = ["task: sentence similarity | query: best hiking shoes",
         "task: sentence similarity | query: I love waterproof boots for rocky terrain"]
enc = tok(texts, return_tensors="pt", padding=True, truncation=True)
with torch.no_grad():
    out = backbone(**enc, return_dict=True)  # last_hidden_state: [B,T,H]
# 4) Mean pooling (mask-aware)
token_emb = out.last_hidden_state # [B,T,H]
mask = enc["attention_mask"].unsqueeze(-1) # [B,T,1]
sent = (token_emb * mask).sum(1) / mask.sum(1).clamp(min=1e-9) # [B,H]
# 5) Apply Dense module(s) exactly as saved in the repo
def apply_dense(sent_vec, subfolder):
    # read config to build the layer
    cfg_p = hf_hub_download(repo_id, filename=f"{subfolder}/config.json")
    cfg = json.load(open(cfg_p))
    lin = torch.nn.Linear(cfg["in_features"], cfg["out_features"], bias=cfg.get("bias", True))
    # load weights (safetensors preferred; fall back to bin)
    try:
        st = load_safetensors(hf_hub_download(repo_id, filename=f"{subfolder}/model.safetensors"))
    except Exception:
        st = torch.load(hf_hub_download(repo_id, filename=f"{subfolder}/pytorch_model.bin"), map_location="cpu")
    # ST stores the weights under "linear.*"; rename the keys to match nn.Linear
    if "linear.weight" in st:
        st["weight"] = st.pop("linear.weight")
    if "linear.bias" in st:
        st["bias"] = st.pop("linear.bias")
    lin.load_state_dict(st, strict=True)
    # apply (activation is usually Identity; handle Tanh/ReLU if present)
    sent_vec = lin(sent_vec)
    act = cfg.get("activation_function", "torch.nn.modules.linear.Identity")
    if "Tanh" in act:
        sent_vec = torch.tanh(sent_vec)
    elif "ReLU" in act:
        sent_vec = torch.relu(sent_vec)
    return sent_vec

for ds in sorted(dense_subs):  # preserve order: 2_Dense, 3_Dense, ...
    sent = apply_dense(sent, ds)
# 6) Optional Normalize() (L2) – many ST models (incl. EmbeddingGemma packaging) do this
embd = torch.nn.functional.normalize(sent, p=2, dim=1)
print(embd.shape)  # should be [batch, 768] for the 300M model
# compare this with sentence-transformers results
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim
device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")
dtype = torch.bfloat16 if device.type == "mps" else torch.float32
st_model = SentenceTransformer(repo_id, model_kwargs={"dtype": dtype}).to(device)
st_model.eval()
st_embd = st_model.encode(texts, convert_to_tensor=True, precision="float32")
print(torch.allclose(embd.to(device), st_embd, atol=1e-6)) # should be True
sim_matrix = cos_sim(embd.to(device), st_embd.to(device))
print(sim_matrix) # should be all 1.0 on the diagonal