bradynapier's picture
Update README.md
dc8a6bf verified
---
license: apache-2.0
library_name: transformers.js
language:
- en
pipeline_tag: sentence-similarity
base_model:
- Qdrant/all_miniLM_L6_v2_with_attentions
- sentence-transformers/all-MiniLM-L6-v2
---
ONNX port of [sentence-transformers/all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) adjusted to return attention weights.
This model is intended to be used for [BM42 searches](https://qdrant.tech/articles/bm42/).
> Fixes an issue with the [Qdrant version](https://huggingface.co/Qdrant/all_miniLM_L6_v2_with_attentions) not having the onnx folder, so transformers.js can't use it.
### Usage
> Note:
> This model is supposed to be used with Qdrant. Vectors have to be configured with [Modifier.IDF](https://qdrant.tech/documentation/concepts/indexing/?q=modifier#idf-modifier).
```typescript
// `Tensor` is needed by the attention-extraction code further below.
import {
  AutoTokenizer,
  AutoModel,
  Tensor,
  TokenizerModel,
} from '@xenova/transformers';

// Sample documents to encode.
const documents = [
  "You should stay, study and sprint.",
  "History can only prepare us to be surprised yet again.",
];

const MODEL_ID = "bradynapier/all_miniLM_L6_v2_with_attentions_onnx";

const tokenizer = await AutoTokenizer.from_pretrained(MODEL_ID, {
  revision: 'main',
});

// TokenizerModel exposes some useful utils that the Python transformers
// tokenizer has built in.
const tokenizerModel = TokenizerModel.fromConfig(tokenizer.model.config);

const model = await AutoModel.from_pretrained(MODEL_ID, {
  // Full-precision weights — attention extraction needs the float model.
  quantized: false,
  revision: 'main',
});
// the types are wildly incorrect... but this should get you what you need!
```
#### Rough Outline of Getting Attentions
> This may not be the best way but the documentation is truly lacking and this does the job :-P
```typescript
/**
 * Minimal structural view of an attention tensor.
 * Only `dims` and `data` are used (dims = [B=1, H, T, T]).
 */
type XtTensor = { dims: number[]; data: ArrayLike<number | bigint> };

/**
 * Collect per-layer attention tensors from a `model.forward(...)` output.
 *
 * ⚠️ Transformers.js variation:
 * - Some builds return `{ attentions: Tensor[] }`.
 * - Others return one output per layer named `attention_1`, `attention_2`, ...
 *
 * Typed against the structural `XtTensor` so the example is self-contained
 * (the library's `Tensor` class satisfies it structurally).
 *
 * @internal
 * @param out Raw dictionary from `model.forward(...)`.
 * @returns Attention tensors, one per layer in layer order, dims `[1, H, T, T]`.
 */
function collectAttentions(out: Record<string, XtTensor>): XtTensor[] {
  // Prefer the array form when the runtime provides it
  // (runtime feature; the published TS types don't guarantee it).
  const maybe = out as unknown as { attentions?: XtTensor[] };
  if (Array.isArray(maybe.attentions)) return maybe.attentions;

  // Otherwise gather attention_<n> keys and sort numerically by layer index
  // (lexicographic order would put attention_10 before attention_2).
  return Object.keys(out)
    .filter((key) => /^attention_\d+$/i.test(key))
    .sort((a, b) => Number(a.split('_')[1]) - Number(b.split('_')[1]))
    .map((key) => out[key]);
}
/**
 * Build an all-ones attention mask of shape `[1, n]`.
 * int64 is the dtype the ONNX runtime expects for attention masks,
 * hence the BigInt64Array backing buffer.
 */
function onesMask(n: number): Tensor {
  const ones = new BigInt64Array(n).fill(1n);
  return new Tensor('int64', ones, [1, n]);
}
/**
 * Tokenization:
 * Prefer the public callable form `tokenizer(text, {...})` which returns tensors.
 * In case your wrapper only exposes a `_call` (private-ish) we fall back to it here.
 * The return includes `input_ids` and `attention_mask` tensors.
 *
 * NOTE(review): `text` is assumed to be defined by the caller — a string to
 * encode (e.g. one of `documents` above); it is not declared in this snippet.
 */
const enc =
  // NOTE(review): the `as` cast is erased at compile time, so at runtime this
  // is just `typeof tokenizer === 'function'` — i.e. "is the tokenizer
  // directly callable?"; the cast only quiets the type checker.
  typeof (tokenizer as typeof tokenizer._call) === 'function' ?
    // eslint-disable-next-line @typescript-eslint/await-thenable
    await (tokenizer as typeof tokenizer._call)(text, {
      add_special_tokens: true,
    })
  : tokenizer._call(text, { add_special_tokens: true }); // <-- documented hack
// Convert tensor buffers (may be BigInt) → number[] for downstream processing.
const input_ids = Array.from(
  (enc.input_ids as Tensor).data as ArrayLike<number | bigint>,
).map(Number);
/**
 * Forward pass with attentions.
 *
 * Another "crazy" bit: different Transformers.js builds expose attentions differently. We:
 * - accept `{ attentions: Tensor[] }`, or
 * - collect `attention_1, attention_2, ...` and sort them.
 * Also, `Tensor` has no `.get(...)` so we do **flat buffer indexing** with `dims`.
 */
const out = (await model.forward({
  input_ids,
  // All-ones mask: a single unpadded sequence attends to every position.
  attention_mask: onesMask(input_ids.length),
  output_attentions: true,
})) as unknown as Record<string, Tensor>;
// One tensor per layer, dims [1, H, T, T] — index the flat `data` buffer
// using `dims` to read individual attention weights.
const attentions = collectAttentions(out)
```