---
license: apache-2.0
library_name: transformers.js
language:
- en
pipeline_tag: sentence-similarity
base_model:
- Qdrant/all_miniLM_L6_v2_with_attentions
- sentence-transformers/all-MiniLM-L6-v2
---

ONNX port of [sentence-transformers/all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) adjusted to return attention weights. This model is intended to be used for [BM42 searches](https://qdrant.tech/articles/bm42/).

> Fixes an issue with the [Qdrant version](https://huggingface.co/Qdrant/all_miniLM_L6_v2_with_attentions) not shipping an `onnx` folder, which prevents Transformers.js from loading it.

### Usage

> Note:
> This model is meant to be used with Qdrant. Vectors have to be configured with [Modifier.IDF](https://qdrant.tech/documentation/concepts/indexing/?q=modifier#idf-modifier).

```typescript
import {
  AutoTokenizer,
  AutoModel,
  Tensor,
  TokenizerModel,
} from '@xenova/transformers';

const documents = [
  'You should stay, study and sprint.',
  'History can only prepare us to be surprised yet again.',
];

const MODEL_ID = 'bradynapier/all_miniLM_L6_v2_with_attentions_onnx';

const tokenizer = await AutoTokenizer.from_pretrained(MODEL_ID, {
  revision: 'main',
});

// This exposes some useful utils that the Python tokenizer provides ...
const tokenizerModel = TokenizerModel.fromConfig(tokenizer.model.config);

const model = await AutoModel.from_pretrained(MODEL_ID, {
  quantized: false,
  revision: 'main',
});
// The published types are wildly incorrect, but this should get you what you need!
```

#### Rough Outline of Getting Attentions

> This may not be the best way, but the documentation is truly lacking and this does the job :-P

```typescript
/**
 * Minimal attention tensor shape we rely on.
 * Only `dims` and `data` are used (dims = [B=1, H, T, T]).
 */
type XtTensor = { dims: number[]; data: ArrayLike<number> };

/**
 * Collect attentions across layers from a model.forward(...) output.
 *
 * ⚠️ Transformers.js variation:
 * - Some builds return `{ attentions: Tensor[] }`.
 * - Others return a dict with `attention_1`, `attention_2`, ... per layer.
 *
 * @internal
 * @param out Raw dictionary from `model.forward(...)`.
 * @returns Array of attention tensors (one per layer) with dims `[1, H, T, T]`.
 */
function collectAttentions(out: Record<string, unknown>): XtTensor[] {
  // Prefer array form if present (runtime feature; TS types don't guarantee it).
  const anyOut = out as unknown as { attentions?: XtTensor[] };
  if (Array.isArray(anyOut.attentions)) return anyOut.attentions;

  // Otherwise gather attention_1..attention_N and sort numerically by suffix.
  const keys = Object.keys(out)
    .filter((k) => /^attention_\d+$/i.test(k))
    .sort(
      (a, b) => parseInt(a.split('_')[1], 10) - parseInt(b.split('_')[1], 10),
    );

  return keys.map((k) => out[k] as unknown as XtTensor);
}

/** Build an all-ones int64 attention mask of shape [1, n]. */
function onesMask(n: number): Tensor {
  const data = BigInt64Array.from({ length: n }, () => 1n);
  return new Tensor('int64', data, [1, n]);
}

/**
 * Tokenization:
 * Prefer the public callable form `tokenizer(text, {...})`, which returns tensors.
 * In case your wrapper only exposes `_call` (private-ish), we fall back to it here.
 * The return includes `input_ids` and `attention_mask` tensors.
 */
const text = documents[0]; // process one document at a time

const enc =
  typeof tokenizer === 'function'
    ? // eslint-disable-next-line @typescript-eslint/await-thenable
      await (tokenizer as unknown as (
        input: string,
        options: { add_special_tokens: boolean },
      ) => { input_ids: Tensor; attention_mask: Tensor })(text, {
        add_special_tokens: true,
      })
    : tokenizer._call(text, { add_special_tokens: true }); // <-- documented hack
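
// --- Illustrative helper (an assumption, not part of the original outline) --
// The forward-pass notes below point out that `Tensor` has no `.get(...)`, so
// attention weights must be read via flat buffer indexing with `dims`. A
// minimal sketch of that arithmetic for a row-major buffer with
// dims = [1, H, T, T]:
function attentionAt(t: XtTensor, head: number, query: number, key: number): number {
  const T = t.dims[3]; // sequence length (last axis)
  // Flat offset of element [0, head, query, key] = ((0*H + head)*T + query)*T + key
  return Number(t.data[(head * T + query) * T + key]);
}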

// Convert tensor buffers (may be BigInt) → number[] for downstream processing.
const input_ids = Array.from(
  (enc.input_ids as Tensor).data as ArrayLike<number | bigint>,
).map(Number);

/**
 * Forward pass with attentions.
 *
 * Another "crazy" bit: different Transformers.js builds expose attentions
 * differently. We:
 * - accept `{ attentions: Tensor[] }`, or
 * - collect `attention_1, attention_2, ...` and sort them.
 * Also, `Tensor` has no `.get(...)`, so we do **flat buffer indexing** with `dims`.
 */
// `forward` expects tensors, not plain arrays, so rebuild an int64 tensor.
const out = (await model.forward({
  input_ids: new Tensor(
    'int64',
    BigInt64Array.from(input_ids, BigInt),
    [1, input_ids.length],
  ),
  attention_mask: onesMask(input_ids.length),
  output_attentions: true,
})) as unknown as Record<string, unknown>;

const attentions = collectAttentions(out);
```
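
#### Aggregating Attentions into BM42 Weights

The [BM42 article](https://qdrant.tech/articles/bm42/) derives sparse weights from the attention row of the `[CLS]` token. The sketch below follows that idea under a few assumptions: it uses only the last layer, averages across heads, keeps special tokens, and sums weights for repeated token ids. `clsAttentionWeights` is an illustrative helper (not an API) and reuses `XtTensor`, `attentions`, and `input_ids` from the snippets above.

```typescript
/**
 * Illustrative sketch (assumption): average the `[CLS]` attention row of the
 * last layer across heads, pairing each weight with its token id.
 */
function clsAttentionWeights(
  attentions: XtTensor[],
  input_ids: number[],
): Map<number, number> {
  const last = attentions[attentions.length - 1]; // dims = [1, H, T, T]
  const [, H, T] = last.dims;
  const weights = new Map<number, number>();

  for (let k = 0; k < T; k++) {
    // Attention from [CLS] (query index 0) to token k, averaged over heads:
    // flat offset of element [0, h, 0, k] is h*T*T + k.
    let sum = 0;
    for (let h = 0; h < H; h++) {
      sum += Number(last.data[h * T * T + k]);
    }
    // Accumulate so repeated token ids add up.
    weights.set(input_ids[k], (weights.get(input_ids[k]) ?? 0) + sum / H);
  }

  return weights;
}

const weights = clsAttentionWeights(attentions, input_ids);
```

In practice the article describes further steps, such as dropping `[CLS]`/`[SEP]` and reassembling subword pieces into whole words, before the weights are indexed.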
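
#### Configuring Qdrant with Modifier.IDF

To act on the note at the top of this card, here is a hedged sketch of creating a collection whose sparse vectors use the IDF modifier, via the official `@qdrant/js-client-rest` client. The URL, collection name, and sparse vector name (`bm42`) are placeholders; `weights` is the map produced above.

```typescript
import { QdrantClient } from '@qdrant/js-client-rest';

const client = new QdrantClient({ url: 'http://localhost:6333' });

// Sparse vectors configured with Modifier.IDF so Qdrant rescales weights by IDF.
await client.createCollection('my-collection', {
  sparse_vectors: {
    bm42: {
      modifier: 'idf',
    },
  },
});

// Upsert one document using the token-id → weight map computed above.
await client.upsert('my-collection', {
  points: [
    {
      id: 1,
      payload: { text: documents[0] },
      vector: {
        bm42: {
          indices: [...weights.keys()],
          values: [...weights.values()],
        },
      },
    },
  ],
});
```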