tolga.ayan committed
Commit 760f182 · 0 Parent(s)

Initial commit
.gitattributes ADDED
@@ -0,0 +1,36 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
1_Pooling/config.json ADDED
@@ -0,0 +1,10 @@
+ {
+   "word_embedding_dimension": 768,
+   "pooling_mode_cls_token": true,
+   "pooling_mode_mean_tokens": false,
+   "pooling_mode_max_tokens": false,
+   "pooling_mode_mean_sqrt_len_tokens": false,
+   "pooling_mode_weightedmean_tokens": false,
+   "pooling_mode_lasttoken": false,
+   "include_prompt": true
+ }
README.md ADDED
@@ -0,0 +1,359 @@
+ ---
+ license: apache-2.0
+ language:
+ - tr
+ - ar
+ - en
+ - de
+ - bg
+ - hu
+ - ro
+ - sk
+ - pl
+ - cs
+ - el
+ library_name: sentence-transformers
+ base_model:
+ - Alibaba-NLP/gte-multilingual-base
+ ---
+
+
+
+ # Trendyol/TY-ecomm-embed-multilingual-base-v1.2.0
+
+ Trendyol/TY-ecomm-embed-multilingual-base-v1.2.0 is a multilingual [sentence-transformers](https://www.SBERT.net) embedding model fine-tuned on e-commerce datasets and optimized for semantic similarity, search, classification, and retrieval tasks. It follows the Sentence Transformers architecture and integrates domain-specific signals from millions of real-world queries, product descriptions, and user interactions. The model is fine-tuned from a customized version of gte-multilingual-base.
+
+ Highlights:
+ * Optimized for e-commerce semantic search
+ * Enhanced Turkish and multilingual query understanding
+ * Supports query rephrasing and paraphrase mining (see the sketch below)
+ * Robust for product tagging and attribute extraction
+ * Suitable for clustering and product categorization
+ * High performance on semantic textual similarity
+ * 384-token input support
+ * 768-dimensional dense vector outputs
+ * Built-in cosine similarity for inference
+
+
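+ As a rough illustration of the paraphrase-mining use case, the sketch below uses the generic `paraphrase_mining` helper from `sentence_transformers.util`; the product titles are invented examples, not data from the training set.
+
+ ```python
+ from sentence_transformers import SentenceTransformer
+ from sentence_transformers.util import paraphrase_mining
+
+ # The GTE-based backbone ships custom modeling code, so trust_remote_code=True is needed.
+ model = SentenceTransformer("Trendyol/TY-ecomm-embed-multilingual-base-v1.2.0", trust_remote_code=True)
+
+ # Hypothetical product titles; near-duplicates should surface with high scores.
+ titles = [
+     "kablosuz bluetooth kulaklık",   # wireless bluetooth headphones
+     "bluetooth kulaklık kablosuz",   # same product, reordered words
+     "paslanmaz çelik tencere seti",  # stainless steel cookware set
+ ]
+
+ # paraphrase_mining returns [score, index_a, index_b] triples sorted by score.
+ for score, i, j in paraphrase_mining(model, titles):
+     print(f"{score:.3f}  {titles[i]}  <->  {titles[j]}")
+ ```
+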
+ ## Model Details
+
+ ### Model Description
+ - **Model Type:** Sentence Transformer
+ <!-- - **Base model:** [Unknown](https://huggingface.co/unknown) -->
+ - **Maximum Sequence Length:** 384 tokens
+ - **Output Dimensionality:** 768 dimensions
+ - **Similarity Function:** Cosine Similarity
+ - **Training Datasets:**
+   - Multilingual and Turkish search terms
+   - Turkish Instruction datasets
+   - Turkish Summarization datasets
+   - Turkish E-commerce rephrase datasets
+   - Turkish Question-answer pairs
+   - and more!
+ <!-- - **Language:** Unknown -->
+ <!-- - **License:** Unknown -->
+
+ ### Model Sources
+
+ - **Documentation:** [Sentence Transformers Documentation](https://sbert.net)
+ - **Repository:** [Sentence Transformers on GitHub](https://github.com/UKPLab/sentence-transformers)
+ - **Hugging Face:** [Sentence Transformers on Hugging Face](https://huggingface.co/models?library=sentence-transformers)
+
+ ### Full Model Architecture
+
+ ```
+ SentenceTransformer(
+   (0): Transformer({'max_seq_length': 384, 'do_lower_case': False}) with Transformer model: NewModel
+   (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
+   (2): Normalize()
+ )
+ ```
+
+ ## Usage
+
+ ### Direct Usage (Sentence Transformers)
+
+ First install the Sentence Transformers library:
+
+ ```bash
+ pip install -U sentence-transformers
+ ```
+
+ Then you can load this model and run inference.
+ ```python
+ from sentence_transformers import SentenceTransformer
+
+ # Download from the 🤗 Hub (the GTE-based backbone ships custom modeling code,
+ # so trust_remote_code=True is required)
+ model = SentenceTransformer(
+     "Trendyol/TY-ecomm-embed-multilingual-base-v1.2.0", trust_remote_code=True
+ )
+ # Run inference on buyer-seller messages (Turkish)
+ sentences = [
+     '120x190 yapıyor musunuz',             # "do you make 120x190?"
+     'merhaba 120 x 180 mevcüttür',         # "hello, 120 x 180 is available"
+     'Ürün stoklarımızda bulunmamaktadır',  # "the product is out of stock"
+ ]
+ embeddings = model.encode(sentences)
+ print(embeddings.shape)
+ # [3, 768]
+
+ # Get the similarity scores for the embeddings
+ similarities = model.similarity(embeddings, embeddings)
+ print(similarities.shape)
+ # [3, 3]
+ ```
+
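+ For retrieval-style usage, a minimal semantic-search sketch is shown below. It relies only on the generic `semantic_search` utility from Sentence Transformers; the query and product titles are invented for illustration.
+
+ ```python
+ from sentence_transformers import SentenceTransformer
+ from sentence_transformers.util import semantic_search
+
+ model = SentenceTransformer("Trendyol/TY-ecomm-embed-multilingual-base-v1.2.0", trust_remote_code=True)
+
+ # Hypothetical product catalog and a Turkish shopping query.
+ products = [
+     "Çift kişilik pamuklu nevresim takımı 200x220",
+     "Tek kişilik yatak örtüsü",
+     "Paslanmaz çelik çaydanlık seti",
+ ]
+ query = "200x220 nevresim"
+
+ query_emb = model.encode([query])
+ product_emb = model.encode(products)
+
+ # Embeddings are L2-normalized by the pipeline, so cosine similarity ranks the catalog.
+ hits = semantic_search(query_emb, product_emb, top_k=3)[0]
+ for hit in hits:
+     print(f"{hit['score']:.3f}  {products[hit['corpus_id']]}")
+ ```
+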
+ <!--
+ ### Direct Usage (Transformers)
+
+ <details><summary>Click to see the direct usage in Transformers</summary>
+
+ </details>
+ -->
+
+ <!--
+ ### Downstream Usage (Sentence Transformers)
+
+ You can finetune this model on your own dataset.
+
+ <details><summary>Click to expand</summary>
+
+ </details>
+ -->
+
+ <!--
+ ### Out-of-Scope Use
+
+ *List how the model may foreseeably be misused and address what users ought not to do with the model.*
+ -->
+
+
+ ## Bias, Risks and Limitations
+
+ While this model is trained on e-commerce-related datasets, including multilingual and Turkish data, users should be aware of several limitations:
+
+ * **Domain bias:** Performance may degrade for content outside the e-commerce or product-related domains, such as legal, medical, or highly technical texts.
+
+ * **Language coverage:** Although multilingual data was included, the majority of the dataset is in Turkish.
+
+ * **Input length limitations:** Inputs exceeding the maximum sequence length (384 tokens) will be truncated, potentially losing critical context in long texts.
+
+ * **Spurious similarity:** Semantic similarity may incorrectly assign high similarity scores to unrelated but lexically similar or frequently co-occurring phrases in training data.
+
+
+ ### Recommendations
+
+ * **Human Oversight:** We recommend incorporating a human curation layer or using filters to manage and improve the quality of outputs, especially in public-facing applications. This approach can help mitigate the risk of generating objectionable content unexpectedly.
+ * **Application-Specific Testing:** Developers intending to use Trendyol embedding models should conduct thorough safety testing and optimization tailored to their specific applications. This is crucial, as the model's outputs may occasionally be biased or inaccurate.
+ * **Responsible Development and Deployment:** It is the responsibility of developers and users of Trendyol embedding models to ensure their ethical and safe application. We urge users to be mindful of the model's limitations and to employ appropriate safeguards to prevent misuse or harmful consequences.
+
+
+
+ ## Training Details
+
+ * Loss: [<code>MatryoshkaLoss</code>](https://sbert.net/docs/package_reference/sentence_transformer/losses.html#matryoshkaloss) with these parameters:
+   ```json
+   {
+       "loss": "CachedMultipleNegativesSymmetricRankingLoss",
+       "matryoshka_dims": [
+           768,
+           512,
+           128
+       ],
+       "matryoshka_weights": [
+           1,
+           1,
+           1
+       ],
+       "n_dims_per_step": -1
+   }
+   ```
+
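+ Because the loss uses Matryoshka dimensions of 768, 512, and 128, the embeddings should remain usable when truncated to those sizes. A minimal sketch, assuming the standard `truncate_dim` option of recent Sentence Transformers releases (the example strings are made up):
+
+ ```python
+ from sentence_transformers import SentenceTransformer
+
+ # Keep only the first 128 dimensions of each embedding to cut storage and search cost.
+ model = SentenceTransformer(
+     "Trendyol/TY-ecomm-embed-multilingual-base-v1.2.0",
+     truncate_dim=128,
+     trust_remote_code=True,
+ )
+
+ embeddings = model.encode(["çocuk bisikleti 20 jant", "20 jant çocuk bisikleti"])
+ print(embeddings.shape)  # (2, 128)
+
+ # Cosine similarity still works on the truncated vectors.
+ print(model.similarity(embeddings, embeddings))
+ ```
+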
+ ### Training Hyperparameters
+ #### Non-Default Hyperparameters
+
+ - `overwrite_output_dir`: True
+ - `eval_strategy`: steps
+ - `per_device_train_batch_size`: 2048
+ - `per_device_eval_batch_size`: 128
+ - `learning_rate`: 0.0005
+ - `num_train_epochs`: 1
+ - `warmup_ratio`: 0.01
+ - `fp16`: True
+ - `ddp_timeout`: 300000
+ - `batch_sampler`: no_duplicates
+
+ #### All Hyperparameters
+ <details><summary>Click to expand</summary>
+
+ - `overwrite_output_dir`: True
+ - `do_predict`: False
+ - `eval_strategy`: steps
+ - `prediction_loss_only`: True
+ - `per_device_train_batch_size`: 2048
+ - `per_device_eval_batch_size`: 128
+ - `per_gpu_train_batch_size`: None
+ - `per_gpu_eval_batch_size`: None
+ - `gradient_accumulation_steps`: 1
+ - `eval_accumulation_steps`: None
+ - `torch_empty_cache_steps`: None
+ - `learning_rate`: 0.0005
+ - `weight_decay`: 0.0
+ - `adam_beta1`: 0.9
+ - `adam_beta2`: 0.999
+ - `adam_epsilon`: 1e-08
+ - `max_grad_norm`: 1.0
+ - `num_train_epochs`: 1
+ - `max_steps`: -1
+ - `lr_scheduler_type`: linear
+ - `lr_scheduler_kwargs`: {}
+ - `warmup_ratio`: 0.01
+ - `warmup_steps`: 0
+ - `log_level`: passive
+ - `log_level_replica`: warning
+ - `log_on_each_node`: True
+ - `logging_nan_inf_filter`: True
+ - `save_safetensors`: True
+ - `save_on_each_node`: False
+ - `save_only_model`: False
+ - `restore_callback_states_from_checkpoint`: False
+ - `no_cuda`: False
+ - `use_cpu`: False
+ - `use_mps_device`: False
+ - `seed`: 42
+ - `data_seed`: None
+ - `jit_mode_eval`: False
+ - `use_ipex`: False
+ - `bf16`: False
+ - `fp16`: True
+ - `fp16_opt_level`: O1
+ - `half_precision_backend`: auto
+ - `bf16_full_eval`: False
+ - `fp16_full_eval`: False
+ - `tf32`: None
+ - `local_rank`: 0
+ - `ddp_backend`: None
+ - `tpu_num_cores`: None
+ - `tpu_metrics_debug`: False
+ - `debug`: []
+ - `dataloader_drop_last`: True
+ - `dataloader_num_workers`: 0
+ - `dataloader_prefetch_factor`: None
+ - `past_index`: -1
+ - `disable_tqdm`: False
+ - `remove_unused_columns`: True
+ - `label_names`: None
+ - `load_best_model_at_end`: False
+ - `ignore_data_skip`: False
+ - `fsdp`: []
+ - `fsdp_min_num_params`: 0
+ - `fsdp_config`: {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}
+ - `fsdp_transformer_layer_cls_to_wrap`: None
+ - `accelerator_config`: {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}
+ - `deepspeed`: None
+ - `label_smoothing_factor`: 0.0
+ - `optim`: adamw_torch
+ - `optim_args`: None
+ - `adafactor`: False
+ - `group_by_length`: False
+ - `length_column_name`: length
+ - `ddp_find_unused_parameters`: None
+ - `ddp_bucket_cap_mb`: None
+ - `ddp_broadcast_buffers`: False
+ - `dataloader_pin_memory`: True
+ - `dataloader_persistent_workers`: False
+ - `skip_memory_metrics`: True
+ - `use_legacy_prediction_loop`: False
+ - `push_to_hub`: False
+ - `resume_from_checkpoint`: None
+ - `hub_model_id`: None
+ - `hub_strategy`: every_save
+ - `hub_private_repo`: None
+ - `hub_always_push`: False
+ - `gradient_checkpointing`: False
+ - `gradient_checkpointing_kwargs`: None
+ - `include_inputs_for_metrics`: False
+ - `include_for_metrics`: []
+ - `eval_do_concat_batches`: True
+ - `fp16_backend`: auto
+ - `push_to_hub_model_id`: None
+ - `push_to_hub_organization`: None
+ - `mp_parameters`:
+ - `auto_find_batch_size`: False
+ - `full_determinism`: False
+ - `torchdynamo`: None
+ - `ray_scope`: last
+ - `ddp_timeout`: 300000
+ - `torch_compile`: False
+ - `torch_compile_backend`: None
+ - `torch_compile_mode`: None
+ - `dispatch_batches`: None
+ - `split_batches`: None
+ - `include_tokens_per_second`: False
+ - `include_num_input_tokens_seen`: False
+ - `neftune_noise_alpha`: None
+ - `optim_target_modules`: None
+ - `batch_eval_metrics`: False
+ - `eval_on_start`: False
+ - `use_liger_kernel`: False
+ - `eval_use_gather_object`: False
+ - `average_tokens_across_devices`: False
+ - `prompts`: None
+ - `batch_sampler`: no_duplicates
+ - `multi_dataset_batch_sampler`: proportional
+
+ </details>
+
+ ### Framework Versions
+ - Python: 3.11.11
+ - Sentence Transformers: 3.4.1
+ - Transformers: 4.48.1
+ - PyTorch: 2.5.1+cu124
+ - Accelerate: 1.5.1
+ - Datasets: 2.21.0
+ - Tokenizers: 0.21.1
+
+ ## Citation
+
+ ### BibTeX
+
+ #### Sentence Transformers
+ ```bibtex
+ @inproceedings{reimers-2019-sentence-bert,
+     title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
+     author = "Reimers, Nils and Gurevych, Iryna",
+     booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
+     month = "11",
+     year = "2019",
+     publisher = "Association for Computational Linguistics",
+     url = "https://arxiv.org/abs/1908.10084",
+ }
+ ```
+
+ #### MatryoshkaLoss
+ ```bibtex
+ @misc{kusupati2024matryoshka,
+     title={Matryoshka Representation Learning},
+     author={Aditya Kusupati and Gantavya Bhatt and Aniket Rege and Matthew Wallingford and Aditya Sinha and Vivek Ramanujan and William Howard-Snyder and Kaifeng Chen and Sham Kakade and Prateek Jain and Ali Farhadi},
+     year={2024},
+     eprint={2205.13147},
+     archivePrefix={arXiv},
+     primaryClass={cs.LG}
+ }
+ ```
+
+ <!--
+ ## Glossary
+
+ *Clearly define terms in order to be accessible across audiences.*
+ -->
+
+ <!--
+ ## Model Card Authors
+
+ *Lists the people who create the model card, providing recognition and accountability for the detailed work that goes into its construction.*
+ -->
+
+ <!--
+ ## Model Card Contact
+
+ *Provides a way for people who have updates to the Model Card, suggestions, or questions, to contact the Model Card authors.*
+ -->
config.json ADDED
@@ -0,0 +1,50 @@
+ {
+   "_name_or_path": "TY-ecomm-embed-multilingual-base-v1.2.0",
+   "architectures": [
+     "NewModel"
+   ],
+   "attention_probs_dropout_prob": 0.0,
+   "auto_map": {
+     "AutoConfig": "Alibaba-NLP/new-impl--configuration.NewConfig",
+     "AutoModel": "Alibaba-NLP/new-impl--modeling.NewModel",
+     "AutoModelForMaskedLM": "Alibaba-NLP/new-impl--modeling.NewForMaskedLM",
+     "AutoModelForMultipleChoice": "Alibaba-NLP/new-impl--modeling.NewForMultipleChoice",
+     "AutoModelForQuestionAnswering": "Alibaba-NLP/new-impl--modeling.NewForQuestionAnswering",
+     "AutoModelForSequenceClassification": "Alibaba-NLP/new-impl--modeling.NewForSequenceClassification",
+     "AutoModelForTokenClassification": "Alibaba-NLP/new-impl--modeling.NewForTokenClassification"
+   },
+   "classifier_dropout": 0.0,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "id2label": {
+     "0": "LABEL_0"
+   },
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "label2id": {
+     "LABEL_0": 0
+   },
+   "layer_norm_eps": 1e-12,
+   "layer_norm_type": "layer_norm",
+   "logn_attention_clip1": false,
+   "logn_attention_scale": false,
+   "max_position_embeddings": 8192,
+   "model_type": "new",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "pack_qkv": true,
+   "pad_token_id": 1,
+   "position_embedding_type": "rope",
+   "rope_scaling": {
+     "factor": 8.0,
+     "type": "ntk"
+   },
+   "rope_theta": 20000,
+   "torch_dtype": "float32",
+   "transformers_version": "4.48.1",
+   "type_vocab_size": 1,
+   "unpad_inputs": false,
+   "use_memory_efficient_attention": false,
+   "vocab_size": 250048
+ }
config_sentence_transformers.json ADDED
@@ -0,0 +1,10 @@
+ {
+   "__version__": {
+     "sentence_transformers": "3.4.1",
+     "transformers": "4.48.1",
+     "pytorch": "2.5.1+cu124"
+   },
+   "prompts": {},
+   "default_prompt_name": null,
+   "similarity_fn_name": "cosine"
+ }
configuration.py ADDED
@@ -0,0 +1,145 @@
+ # coding=utf-8
+ # Copyright 2024 The GTE Team Authors and Alibaba Group.
+ # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """ NEW model configuration"""
+ from transformers.configuration_utils import PretrainedConfig
+ from transformers.utils import logging
+
+ logger = logging.get_logger(__name__)
+
+
+ class NewConfig(PretrainedConfig):
+     r"""
+     This is the configuration class to store the configuration of a [`NewModel`] or a [`TFNewModel`]. It is used to
+     instantiate a NEW model according to the specified arguments, defining the model architecture. Instantiating a
+     configuration with the defaults will yield a similar configuration to that of the NEW
+     [izhx/new-base-en](https://huggingface.co/izhx/new-base-en) architecture.
+
+     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+     documentation from [`PretrainedConfig`] for more information.
+
+
+     Args:
+         vocab_size (`int`, *optional*, defaults to 30522):
+             Vocabulary size of the NEW model. Defines the number of different tokens that can be represented by the
+             `inputs_ids` passed when calling [`NewModel`] or [`TFNewModel`].
+         hidden_size (`int`, *optional*, defaults to 768):
+             Dimensionality of the encoder layers and the pooler layer.
+         num_hidden_layers (`int`, *optional*, defaults to 12):
+             Number of hidden layers in the Transformer encoder.
+         num_attention_heads (`int`, *optional*, defaults to 12):
+             Number of attention heads for each attention layer in the Transformer encoder.
+         intermediate_size (`int`, *optional*, defaults to 3072):
+             Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
+         hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
+             The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+             `"relu"`, `"silu"` and `"gelu_new"` are supported.
+         hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
+             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+         attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
+             The dropout ratio for the attention probabilities.
+         max_position_embeddings (`int`, *optional*, defaults to 512):
+             The maximum sequence length that this model might ever be used with. Typically set this to something large
+             just in case (e.g., 512 or 1024 or 2048).
+         type_vocab_size (`int`, *optional*, defaults to 2):
+             The vocabulary size of the `token_type_ids` passed when calling [`NewModel`] or [`TFNewModel`].
+         initializer_range (`float`, *optional*, defaults to 0.02):
+             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+         layer_norm_eps (`float`, *optional*, defaults to 1e-12):
+             The epsilon used by the layer normalization layers.
+         position_embedding_type (`str`, *optional*, defaults to `"rope"`):
+             Type of position embedding. Choose one of `"absolute"`, `"rope"`.
+         rope_theta (`float`, *optional*, defaults to 10000.0):
+             The base period of the RoPE embeddings.
+         rope_scaling (`Dict`, *optional*):
+             Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
+             strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
+             `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
+             `max_position_embeddings` to the expected new maximum. See the following thread for more information on how
+             these scaling strategies behave:
+             https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
+             experimental feature, subject to breaking API changes in future versions.
+         classifier_dropout (`float`, *optional*):
+             The dropout ratio for the classification head.
+
+     Examples:
+
+     ```python
+     >>> from transformers import NewConfig, NewModel
+
+     >>> # Initializing a NEW izhx/new-base-en style configuration
+     >>> configuration = NewConfig()
+
+     >>> # Initializing a model (with random weights) from the izhx/new-base-en style configuration
+     >>> model = NewModel(configuration)
+
+     >>> # Accessing the model configuration
+     >>> configuration = model.config
+     ```"""
+
+     model_type = "new"
+
+     def __init__(
+         self,
+         vocab_size=30528,
+         hidden_size=768,
+         num_hidden_layers=12,
+         num_attention_heads=12,
+         intermediate_size=3072,
+         hidden_act="gelu",
+         hidden_dropout_prob=0.1,
+         attention_probs_dropout_prob=0.0,
+         max_position_embeddings=2048,
+         type_vocab_size=1,
+         initializer_range=0.02,
+         layer_norm_type='layer_norm',
+         layer_norm_eps=1e-12,
+         # pad_token_id=0,
+         position_embedding_type="rope",
+         rope_theta=10000.0,
+         rope_scaling=None,
+         classifier_dropout=None,
+         pack_qkv=True,
+         unpad_inputs=False,
+         use_memory_efficient_attention=False,
+         logn_attention_scale=False,
+         logn_attention_clip1=False,
+         **kwargs,
+     ):
+         super().__init__(**kwargs)
+
+         self.vocab_size = vocab_size
+         self.hidden_size = hidden_size
+         self.num_hidden_layers = num_hidden_layers
+         self.num_attention_heads = num_attention_heads
+         self.hidden_act = hidden_act
+         self.intermediate_size = intermediate_size
+         self.hidden_dropout_prob = hidden_dropout_prob
+         self.attention_probs_dropout_prob = attention_probs_dropout_prob
+         self.max_position_embeddings = max_position_embeddings
+         self.type_vocab_size = type_vocab_size
+         self.initializer_range = initializer_range
+         self.layer_norm_type = layer_norm_type
+         self.layer_norm_eps = layer_norm_eps
+         self.position_embedding_type = position_embedding_type
+         self.rope_theta = rope_theta
+         self.rope_scaling = rope_scaling
+         self.classifier_dropout = classifier_dropout
+
+         self.pack_qkv = pack_qkv
+         self.unpad_inputs = unpad_inputs
+         self.use_memory_efficient_attention = use_memory_efficient_attention
+         self.logn_attention_scale = logn_attention_scale
+         self.logn_attention_clip1 = logn_attention_clip1
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:baa64461b53a5a1014cfa8af5ab58ee4075216a71332f5df70163969d3b55332
+ size 1221487872
modules.json ADDED
@@ -0,0 +1,20 @@
+ [
+   {
+     "idx": 0,
+     "name": "0",
+     "path": "",
+     "type": "sentence_transformers.models.Transformer"
+   },
+   {
+     "idx": 1,
+     "name": "1",
+     "path": "1_Pooling",
+     "type": "sentence_transformers.models.Pooling"
+   },
+   {
+     "idx": 2,
+     "name": "2",
+     "path": "2_Normalize",
+     "type": "sentence_transformers.models.Normalize"
+   }
+ ]
+ 
sentence_bert_config.json ADDED
@@ -0,0 +1,4 @@
+ {
+   "max_seq_length": 384,
+   "do_lower_case": false
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
+ {
+   "bos_token": {
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "cls_token": {
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "mask_token": {
+     "content": "<mask>",
+     "lstrip": true,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<pad>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "sep_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:344e3feb078b84a4c30158f5b85f3dbaeee7a1d2689e0a0a9ebb8a0d63a8faf7
+ size 17082987
tokenizer_config.json ADDED
@@ -0,0 +1,62 @@
+ {
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "<pad>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "</s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "3": {
+       "content": "<unk>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "250001": {
+       "content": "<mask>",
+       "lstrip": true,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "bos_token": "<s>",
+   "clean_up_tokenization_spaces": true,
+   "cls_token": "<s>",
+   "eos_token": "</s>",
+   "extra_special_tokens": {},
+   "mask_token": "<mask>",
+   "max_length": 512,
+   "model_max_length": 512,
+   "pad_to_multiple_of": null,
+   "pad_token": "<pad>",
+   "pad_token_type_id": 0,
+   "padding_side": "right",
+   "sep_token": "</s>",
+   "stride": 0,
+   "tokenizer_class": "XLMRobertaTokenizer",
+   "truncation_side": "right",
+   "truncation_strategy": "longest_first",
+   "unk_token": "<unk>"
+ }