Upload folder using huggingface_hub
- README.md +41 -0
- config.json +27 -0
- onnx/model.onnx +3 -0
- special_tokens_map.json +37 -0
- stopwords.txt +179 -0
- tokenizer.json +0 -0
- tokenizer_config.json +64 -0
- vocab.txt +0 -0
README.md
ADDED
@@ -0,0 +1,41 @@
---
license: apache-2.0
language:
- en
pipeline_tag: sentence-similarity
---
ONNX port of [sentence-transformers/all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) adjusted to return attention weights.

This model is intended to be used for [BM42 searches](https://qdrant.tech/articles/bm42/).

### Usage

Here's an example of performing inference with the model using [FastEmbed](https://github.com/qdrant/fastembed).

> Note: This model is meant to be used with Qdrant. Sparse vectors must be configured with [Modifier.IDF](https://qdrant.tech/documentation/concepts/indexing/?q=modifier#idf-modifier).

```py
from fastembed import SparseTextEmbedding

documents = [
    "You should stay, study and sprint.",
    "History can only prepare us to be surprised yet again.",
]

model = SparseTextEmbedding(model_name="Qdrant/bm42-all-minilm-l6-v2-attentions")
embeddings = list(model.embed(documents))

# [
#     SparseEmbedding(values=array([0.26399775, 0.24662513, 0.47077307]),
#                     indices=array([1881538586, 150760872, 1932363795])),
#     SparseEmbedding(values=array(
#         [0.38320042, 0.25453135, 0.18017513, 0.30432631, 0.1373556]),
#         indices=array([
#             733618285, 1849833631, 1008800696, 2090661150,
#             1117393019
#         ]))
# ]
```
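The note above means the Qdrant collection receiving these embeddings needs an IDF-modified sparse vector field. Below is a minimal sketch of that setup, assuming a local Qdrant instance and the `qdrant-client` package; the collection name `bm42_demo` and sparse vector name `bm42` are illustrative, not part of this model.

```py
from fastembed import SparseTextEmbedding
from qdrant_client import QdrantClient, models

# Hypothetical local instance and names; only Modifier.IDF is required by the note above.
client = QdrantClient(url="http://localhost:6333")
client.create_collection(
    collection_name="bm42_demo",
    vectors_config={},  # no dense vectors in this sketch
    sparse_vectors_config={
        "bm42": models.SparseVectorParams(modifier=models.Modifier.IDF)
    },
)

model = SparseTextEmbedding(model_name="Qdrant/bm42-all-minilm-l6-v2-attentions")
documents = ["You should stay, study and sprint."]
embedding = next(model.embed(documents))

# Store the sparse embedding under the IDF-modified vector name.
client.upsert(
    collection_name="bm42_demo",
    points=[
        models.PointStruct(
            id=0,
            vector={
                "bm42": models.SparseVector(
                    indices=embedding.indices.tolist(),
                    values=embedding.values.tolist(),
                )
            },
            payload={"text": documents[0]},
        )
    ],
)
```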
config.json
ADDED
@@ -0,0 +1,27 @@
{
  "_name_or_path": "sentence-transformers/all-MiniLM-L6-v2",
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 1536,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "output_attentions": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.40.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}
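The notable setting here is `"output_attentions": true`, which is what makes the exported graph return attention weights in addition to hidden states. A quick check, assuming the `transformers` library and that the model is published as `Qdrant/bm42-all-minilm-l6-v2-attentions`:

```py
from transformers import AutoConfig

config = AutoConfig.from_pretrained("Qdrant/bm42-all-minilm-l6-v2-attentions")

# True for this port, unlike the transformers default of False.
print(config.output_attentions)
print(config.hidden_size, config.num_hidden_layers)  # 384, 6
```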
onnx/model.onnx
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:368eb595d71c2c5087a200a719955a915ab788c65d487632a14de7a32613ee89
size 90985592
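The ONNX file is stored via Git LFS, so it has to be downloaded before it can be inspected. A sketch of loading the graph with `onnxruntime` and `huggingface_hub`, listing its declared inputs and outputs (the names are read from the graph rather than assumed):

```py
import onnxruntime as ort
from huggingface_hub import hf_hub_download

model_path = hf_hub_download(
    repo_id="Qdrant/bm42-all-minilm-l6-v2-attentions",
    filename="onnx/model.onnx",
)
session = ort.InferenceSession(model_path)

# An attention-weights output should appear here in addition to the usual hidden-state output.
print([i.name for i in session.get_inputs()])
print([o.name for o in session.get_outputs()])
```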
special_tokens_map.json
ADDED
@@ -0,0 +1,37 @@
{
  "cls_token": {
    "content": "[CLS]",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "mask_token": {
    "content": "[MASK]",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": {
    "content": "[PAD]",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "sep_token": {
    "content": "[SEP]",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "unk_token": {
    "content": "[UNK]",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}
stopwords.txt
ADDED
@@ -0,0 +1,179 @@
i
me
my
myself
we
our
ours
ourselves
you
you're
you've
you'll
you'd
your
yours
yourself
yourselves
he
him
his
himself
she
she's
her
hers
herself
it
it's
its
itself
they
them
their
theirs
themselves
what
which
who
whom
this
that
that'll
these
those
am
is
are
was
were
be
been
being
have
has
had
having
do
does
did
doing
a
an
the
and
but
if
or
because
as
until
while
of
at
by
for
with
about
against
between
into
through
during
before
after
above
below
to
from
up
down
in
out
on
off
over
under
again
further
then
once
here
there
when
where
why
how
all
any
both
each
few
more
most
other
some
such
no
nor
not
only
own
same
so
than
too
very
s
t
can
will
just
don
don't
should
should've
now
d
ll
m
o
re
ve
y
ain
aren
aren't
couldn
couldn't
didn
didn't
doesn
doesn't
hadn
hadn't
hasn
hasn't
haven
haven't
isn
isn't
ma
mightn
mightn't
mustn
mustn't
needn
needn't
shan
shan't
shouldn
shouldn't
wasn
wasn't
weren
weren't
won
won't
wouldn
wouldn't
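This matches the common NLTK-style English stopword list. A hedged sketch of how such a list can be applied to drop stopwords before token weighting; the filtering shown is illustrative and not necessarily how FastEmbed implements BM42 internally:

```py
from huggingface_hub import hf_hub_download

stopwords_path = hf_hub_download(
    repo_id="Qdrant/bm42-all-minilm-l6-v2-attentions",
    filename="stopwords.txt",
)
with open(stopwords_path, encoding="utf-8") as f:
    stopwords = {line.strip() for line in f if line.strip()}

tokens = "you should stay , study and sprint .".split()
# Drop stopwords (and bare punctuation) before any token weighting.
kept = [t for t in tokens if t not in stopwords and t.isalnum()]
print(kept)  # ['stay', 'study', 'sprint']
```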
tokenizer.json
ADDED
The diff for this file is too large to render. See raw diff.
tokenizer_config.json
ADDED
@@ -0,0 +1,64 @@
{
  "added_tokens_decoder": {
    "0": {
      "content": "[PAD]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "100": {
      "content": "[UNK]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "101": {
      "content": "[CLS]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "102": {
      "content": "[SEP]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "103": {
      "content": "[MASK]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "clean_up_tokenization_spaces": true,
  "cls_token": "[CLS]",
  "do_basic_tokenize": true,
  "do_lower_case": true,
  "mask_token": "[MASK]",
  "max_length": 128,
  "model_max_length": 512,
  "never_split": null,
  "pad_to_multiple_of": null,
  "pad_token": "[PAD]",
  "pad_token_type_id": 0,
  "padding_side": "right",
  "sep_token": "[SEP]",
  "stride": 0,
  "strip_accents": null,
  "tokenize_chinese_chars": true,
  "tokenizer_class": "BertTokenizer",
  "truncation_side": "right",
  "truncation_strategy": "longest_first",
  "unk_token": "[UNK]"
}
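The tokenizer is a standard uncased `BertTokenizer` (`do_lower_case: true`, `model_max_length: 512`). FastEmbed handles tokenization internally, but the tokenizer can also be loaded directly for inspection; a minimal sketch assuming the `transformers` library:

```py
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qdrant/bm42-all-minilm-l6-v2-attentions")
encoded = tokenizer(
    "History can only prepare us to be surprised yet again.",
    return_tensors="np",
)

# Lowercased WordPiece tokens wrapped in [CLS] ... [SEP].
print(tokenizer.convert_ids_to_tokens(encoded["input_ids"][0]))
```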
vocab.txt
ADDED
The diff for this file is too large to render. See raw diff.