Upload folder using huggingface_hub

- 1_Pooling/config.json +3 -3
- README.md +65 -74
- config.json +18 -11
- model.safetensors +2 -2
- sentence_bert_config.json +2 -2
- special_tokens_map.json +5 -19
- tokenizer.json +0 -0
- tokenizer_config.json +19 -34
- vocab.txt +0 -5
1_Pooling/config.json
CHANGED
@@ -1,7 +1,7 @@
 {
-    "word_embedding_dimension":
-    "pooling_mode_cls_token":
-    "pooling_mode_mean_tokens":
+    "word_embedding_dimension": 384,
+    "pooling_mode_cls_token": true,
+    "pooling_mode_mean_tokens": false,
     "pooling_mode_max_tokens": false,
     "pooling_mode_mean_sqrt_len_tokens": false,
     "pooling_mode_weightedmean_tokens": false,
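The updated pooling config selects CLS-token pooling over 384-dimensional token embeddings. For reference, a minimal sketch (assuming the standard `sentence-transformers` modules API; not part of this commit) of the module stack these files describe:

```python
from sentence_transformers import SentenceTransformer, models

# BERT encoder -> CLS-token pooling over 384-dim embeddings -> L2 normalization,
# mirroring 1_Pooling/config.json and sentence_bert_config.json in this commit.
transformer = models.Transformer("BAAI/bge-small-en-v1.5", max_seq_length=512)
pooling = models.Pooling(
    transformer.get_word_embedding_dimension(),  # 384
    pooling_mode_cls_token=True,
    pooling_mode_mean_tokens=False,
)
model = SentenceTransformer(modules=[transformer, pooling, models.Normalize()])
```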
README.md
CHANGED
@@ -4,72 +4,63 @@ tags:
 - sentence-similarity
 - feature-extraction
 - generated_from_trainer
-- dataset_size:
-- loss:
-base_model:
+- dataset_size:1798
+- loss:TripletLoss
+base_model: BAAI/bge-small-en-v1.5
 widget:
-- source_sentence: How
+- source_sentence: How will the NIKKEI 225 affect my portfolio
   sentences:
-  - '[{"get_portfolio(None,True,None)": "portfolio"}, {"
-    "
-  - '[{"get_portfolio(None,True,None)": "portfolio"}, {"
+  - '[{"get_portfolio(None,True,None)": "portfolio"}, {"stress_test(''portfolio'',''nikkei_225'',None,''up'')":
+    "stress_test"}]'
+  - '[{"get_portfolio(None,True,None)": "portfolio"}, {"get_attribute(''portfolio'',[''dividend
+    yield''],''<DATES>'')": "portfolio"}, {"calculate(''portfolio'',[''dividend yield'',
+    ''marketValue''],''multiply'',''div_income'')": "portfolio"}, {"sort(''portfolio'',''div_income'',''desc'')":
     "portfolio"}]'
-  - '[{"
-
+  - '[{"get_portfolio(None,True,None)": "portfolio"}, {"stress_test(''portfolio'',''nikkei_225'',None,None)":
+    "stress_test"}]'
+- source_sentence: What’s the [DATES] trend of the [A_SECTOR] sector
   sentences:
-  - '[{"get_portfolio(None,True,None)": "portfolio"}, {"
-    "portfolio"}
-
-
-
-
-  - '[{"get_dividend_history([''<TICKER>''],None)": "<TICKER>_dividend_history"}]'
-- source_sentence: How will the Cotation Assistée en Continu affect my portfolio?
+  - '[{"get_portfolio(None,True,None)": "portfolio"}, {"get_attribute(''portfolio'',[''<A_THEME>'',
+    ''risk''],''<DATES>'')": "portfolio"}, {"filter(''portfolio'',''<A_THEME>'',''>'',''0.01'')":
+    "portfolio"}, {"sort(''portfolio'',''risk'',''asc'')": "portfolio"}]'
+  - '[{"get_attribute([''<A_SECTOR>''],[''returns''],''<DATES>'')":"sector_returns"}]'
+  - '[{"get_news_articles(None,None,[''<A_SECTOR>''],''<DATES>'')": "news_data"}]'
+- source_sentence: How will rising gold commodities affect my portfolio
   sentences:
-  - '[{"
-
-  - '[{"get_portfolio(None,True,None)": "portfolio"}, {"stress_test(''portfolio'',''cac_40'',None,''up'')":
+  - '[{"get_attribute([''<TICKER>''],[''returns''],''<DATES>'')":"<TICKER>_returns"}]'
+  - '[{"get_portfolio(None,True,None)": "portfolio"}, {"stress_test(''portfolio'',''gold'',None,None)":
     "stress_test"}]'
-  - '[{"get_portfolio(None,True,None)": "portfolio"}, {"stress_test(''portfolio'',''
+  - '[{"get_portfolio(None,True,None)": "portfolio"}, {"stress_test(''portfolio'',''gold'',None,''up'')":
     "stress_test"}]'
-- source_sentence: what
+- source_sentence: what percent of my account is in [AN_ASSET_TYPE]
   sentences:
-  - '[{"
-
-  - '[{"
-
-- source_sentence:
+  - '[{"get_portfolio(None,True,None)": "portfolio"}, {"factor_contribution(''portfolio'',''<DATES>'',''asset_class'',''<AN_ASSET_TYPE>'',''portfolio'')":
+    "portfolio"}]'
+  - '[{"get_news_articles(None,None,[''<A_SECTOR>''],''<DATES>'')": "news_data"}]'
+  - '[{"get_attribute([''<TICKER>''],[''<AN_ASSET_TYPE>''],''<DATES>'')":"<TICKER>_data"}]'
+- source_sentence: Can I get a performance check-in
   sentences:
-  - '[{"
-    "
-  - '[{"
-
-
-    "portfolio"}, {"
-    {"aggregate(''portfolio'',''ticker'',''expo_percentage_<TICKER>'',''sum'',None)":
-    "total_expo_percentage_<TICKER>"},{"analyze_impact(''portfolio'',''<TICKER>'',''sell'')":
-    "<TICKER>_sell_impact"}]'
-  - '[{"get_portfolio([''marketValue''],True,None)": "portfolio"}, {"get_attribute(''portfolio'',[''<TICKER1>'',''<TICKER2>''],''<DATES>'')":
-    "portfolio"}, {"calculate(''portfolio'',[''marketValue'', ''<TICKER1>''],''multiply'',''expo_<TICKER1>'')":
-    "portfolio"}, {"calculate(''portfolio'',[''marketValue'', ''<TICKER2>''],''multiply'',''expo_<TICKER2>'')":
-    "portfolio"}, {"sort(''portfolio'',''expo_<TICKER1>'',''desc'')": "portfolio"},
-    {"aggregate(''portfolio'',''ticker'',''expo_<TICKER1>'',''sum'',None)": "port_expo_<TICKER1>"},
-    {"aggregate(''portfolio'',''ticker'',''expo_<TICKER2>'',''sum'',None)": "port_expo_<TICKER2>"}]'
+  - '[{"search(''query'', ''match_type'', ''<TICKER>'')": "search_results"},{"compare([[''<TICKER>''],''search_results''],
+    [''yield''], None)": "comparison_data"}]'
+  - '[{"get_portfolio(None, True, None)": "portfolio"}, {"get_attribute(''portfolio'',[''gains''],''<DATES>'')":
+    "portfolio"}, {"sort(''portfolio'',''gains'',''desc'')": "portfolio"}]'
+  - '[{"get_portfolio(None,True,None)": "portfolio"},{"factor_contribution(''portfolio'',''<DATES>'',''security'',''<TICKER>'',''returns'')}":
+    "portfolio"}, {"get_attribute([''<TICKER>''],[''returns''],''<DATES>'')": "returns_<TICKER>"}]'
 pipeline_tag: sentence-similarity
 library_name: sentence-transformers
 ---

-# SentenceTransformer based on
+# SentenceTransformer based on BAAI/bge-small-en-v1.5

-This is a [sentence-transformers](https://www.SBERT.net) model finetuned from [
+This is a [sentence-transformers](https://www.SBERT.net) model finetuned from [BAAI/bge-small-en-v1.5](https://huggingface.co/BAAI/bge-small-en-v1.5). It maps sentences & paragraphs to a 384-dimensional dense vector space and can be used for semantic textual similarity, semantic search, paraphrase mining, text classification, clustering, and more.

 ## Model Details

 ### Model Description
 - **Model Type:** Sentence Transformer
-- **Base model:** [
-- **Maximum Sequence Length:**
-- **Output Dimensionality:**
+- **Base model:** [BAAI/bge-small-en-v1.5](https://huggingface.co/BAAI/bge-small-en-v1.5) <!-- at revision 5c38ec7c405ec4b44b94cc5a9bb96e735b38267a -->
+- **Maximum Sequence Length:** 512 tokens
+- **Output Dimensionality:** 384 dimensions
 - **Similarity Function:** Cosine Similarity
 <!-- - **Training Dataset:** Unknown -->
 <!-- - **Language:** Unknown -->
@@ -85,8 +76,8 @@ This is a [sentence-transformers](https://www.SBERT.net) model finetuned from [s

 ```
 SentenceTransformer(
-  (0): Transformer({'max_seq_length':
-  (1): Pooling({'word_embedding_dimension':
+  (0): Transformer({'max_seq_length': 512, 'do_lower_case': True}) with Transformer model: BertModel
+  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
   (2): Normalize()
 )
 ```
@@ -109,13 +100,13 @@ from sentence_transformers import SentenceTransformer
 model = SentenceTransformer("sentence_transformers_model_id")
 # Run inference
 sentences = [
-
-    '[{"
-    '[{"get_portfolio(
+    'Can I get a performance check-in',
+    '[{"get_portfolio(None, True, None)": "portfolio"}, {"get_attribute(\'portfolio\',[\'gains\'],\'<DATES>\')": "portfolio"}, {"sort(\'portfolio\',\'gains\',\'desc\')": "portfolio"}]',
+    '[{"get_portfolio(None,True,None)": "portfolio"},{"factor_contribution(\'portfolio\',\'<DATES>\',\'security\',\'<TICKER>\',\'returns\')}": "portfolio"}, {"get_attribute([\'<TICKER>\'],[\'returns\'],\'<DATES>\')": "returns_<TICKER>"}]',
 ]
 embeddings = model.encode(sentences)
 print(embeddings.shape)
-# [3,
+# [3, 384]

 # Get the similarity scores for the embeddings
 similarities = model.similarity(embeddings, embeddings)
@@ -165,24 +156,24 @@ You can finetune this model on your own dataset.

 #### Unnamed Dataset

-* Size: 1,
-* Columns: <code>sentence_0</code> and <code>
+* Size: 1,798 training samples
+* Columns: <code>sentence_0</code>, <code>sentence_1</code>, and <code>sentence_2</code>
 * Approximate statistics based on the first 1000 samples:
-  |         | sentence_0 | sentence_1 |
-
-  | type    | string | string |
-  | details | <ul><li>min: 4 tokens</li><li>mean: 12.
+  |         | sentence_0 | sentence_1 | sentence_2 |
+  |:--------|:-----------|:-----------|:-----------|
+  | type    | string | string | string |
+  | details | <ul><li>min: 4 tokens</li><li>mean: 12.37 tokens</li><li>max: 32 tokens</li></ul> | <ul><li>min: 20 tokens</li><li>mean: 71.59 tokens</li><li>max: 206 tokens</li></ul> | <ul><li>min: 20 tokens</li><li>mean: 73.42 tokens</li><li>max: 229 tokens</li></ul> |
 * Samples:
-  | sentence_0
-
-  | <code>
-  | <code>What
-  | <code>
-* Loss: [<code>
+  | sentence_0 | sentence_1 | sentence_2 |
+  |:-----------|:-----------|:-----------|
+  | <code>How could changes in the emerging markets index (IEMG) affect my investment portfolio</code> | <code>[{"get_portfolio(None,True,None)": "portfolio"}, {"stress_test('portfolio','iemg',None,None)": "stress_test"}]</code> | <code>[{"get_portfolio(None,True,None)": "portfolio"}, {"stress_test('portfolio','iemg',None,'up')": "stress_test"}]</code> |
+  | <code>What role has the volatility factor played in my overall returns</code> | <code>[{"get_portfolio(None,True,None)": "portfolio"}, {"factor_contribution('portfolio','<DATES>','factor','volatility','returns')": "portfolio"}]</code> | <code>[{"get_portfolio(None,True,None)": "portfolio"}, {"factor_contribution('portfolio','<DATES>','factor','volatility','portfolio')": "portfolio"}]</code> |
+  | <code>Is my portfolio overexposed to [A_REGION] country exposure</code> | <code>[{"get_portfolio(None,True,None)": "portfolio"}, {"factor_contribution('portfolio','<DATES>','region','<A_REGION>','portfolio')": "portfolio"}]</code> | <code>[{"get_portfolio(None,True,None)": "portfolio"}, {"factor_contribution('portfolio','<DATES>','theme','<A_THEME>','portfolio')": "portfolio"}]</code> |
+* Loss: [<code>TripletLoss</code>](https://sbert.net/docs/package_reference/sentence_transformer/losses.html#tripletloss) with these parameters:
   ```json
   {
-      "
-      "
+      "distance_metric": "TripletDistanceMetric.EUCLIDEAN",
+      "triplet_margin": 0.3
   }
   ```

@@ -319,7 +310,7 @@ You can finetune this model on your own dataset.
 ### Training Logs
 | Epoch | Step | Training Loss |
 |:------:|:----:|:-------------:|
-| 8.
+| 8.7719 | 500 | 0.0657 |


 ### Framework Versions
@@ -348,15 +339,15 @@ You can finetune this model on your own dataset.
 }
 ```

-####
+#### TripletLoss
 ```bibtex
-@misc{
-    title={
-    author={
+@misc{hermans2017defense,
+    title={In Defense of the Triplet Loss for Person Re-Identification},
+    author={Alexander Hermans and Lucas Beyer and Bastian Leibe},
     year={2017},
-    eprint={
+    eprint={1703.07737},
     archivePrefix={arXiv},
-    primaryClass={cs.
+    primaryClass={cs.CV}
 }
 ```

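The updated card reports training on 1,798 (sentence_0, sentence_1, sentence_2) triplets with TripletLoss, Euclidean distance, and a margin of 0.3. A rough sketch of that setup with the sentence-transformers trainer; the dataset rows below are placeholders, not the actual training data, and the real query/plan strings look like those in the widget examples above:

```python
from datasets import Dataset
from sentence_transformers import SentenceTransformer, SentenceTransformerTrainer
from sentence_transformers.losses import TripletLoss, TripletDistanceMetric

# Placeholder triplets in the card's column layout: anchor query, positive tool-call
# plan, and a near-miss negative plan (the real 1,798 rows are not in this commit).
train_dataset = Dataset.from_dict({
    "sentence_0": ["Can I get a performance check-in"],
    "sentence_1": ['[{"get_portfolio(None, True, None)": "portfolio"}]'],
    "sentence_2": ['[{"get_news_articles(None,None,[\'<A_SECTOR>\'],\'<DATES>\')": "news_data"}]'],
})

model = SentenceTransformer("BAAI/bge-small-en-v1.5")
loss = TripletLoss(
    model,
    distance_metric=TripletDistanceMetric.EUCLIDEAN,  # matches the card's loss parameters
    triplet_margin=0.3,
)
trainer = SentenceTransformerTrainer(model=model, train_dataset=train_dataset, loss=loss)
trainer.train()
```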
config.json
CHANGED
@@ -1,23 +1,30 @@
 {
   "architectures": [
-    "
+    "BertModel"
   ],
   "attention_probs_dropout_prob": 0.1,
-  "
-  "eos_token_id": 2,
+  "classifier_dropout": null,
   "hidden_act": "gelu",
   "hidden_dropout_prob": 0.1,
-  "hidden_size":
+  "hidden_size": 384,
+  "id2label": {
+    "0": "LABEL_0"
+  },
   "initializer_range": 0.02,
-  "intermediate_size":
-  "
-
-
+  "intermediate_size": 1536,
+  "label2id": {
+    "LABEL_0": 0
+  },
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 512,
+  "model_type": "bert",
   "num_attention_heads": 12,
   "num_hidden_layers": 12,
-  "pad_token_id":
-  "
+  "pad_token_id": 0,
+  "position_embedding_type": "absolute",
   "torch_dtype": "float32",
   "transformers_version": "4.50.0",
-  "
+  "type_vocab_size": 2,
+  "use_cache": true,
+  "vocab_size": 30522
 }
model.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:18ac7948b64665eb9e4345c35b7735ca6456c30854498ad54f5bc6fc387b7b1c
+size 133462128
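Together, the new config.json and weights describe a 12-layer BERT encoder with 384-dimensional hidden states and a 30,522-token vocabulary; at float32, the 133,462,128-byte safetensors file corresponds to roughly 33.4M parameters. A small sanity-check sketch, assuming it is run from a local checkout of this repo after the commit:

```python
from transformers import AutoConfig, AutoModel

# Load the committed config and weights from the current directory and confirm
# the encoder shape implied by config.json / model.safetensors.
config = AutoConfig.from_pretrained(".")
assert config.model_type == "bert"
assert (config.hidden_size, config.num_hidden_layers, config.vocab_size) == (384, 12, 30522)

model = AutoModel.from_pretrained(".")
n_params = sum(p.numel() for p in model.parameters())
print(f"{n_params:,} parameters")  # roughly 133,462,128 bytes / 4 bytes per float32 weight
```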
sentence_bert_config.json
CHANGED
@@ -1,4 +1,4 @@
 {
-    "max_seq_length":
-    "do_lower_case":
+    "max_seq_length": 512,
+    "do_lower_case": true
 }
special_tokens_map.json
CHANGED
@@ -1,41 +1,27 @@
 {
-  "bos_token": {
-    "content": "<s>",
-    "lstrip": false,
-    "normalized": false,
-    "rstrip": false,
-    "single_word": false
-  },
   "cls_token": {
-    "content": "
-    "lstrip": false,
-    "normalized": false,
-    "rstrip": false,
-    "single_word": false
-  },
-  "eos_token": {
-    "content": "</s>",
+    "content": "[CLS]",
     "lstrip": false,
     "normalized": false,
     "rstrip": false,
     "single_word": false
   },
   "mask_token": {
-    "content": "
-    "lstrip":
+    "content": "[MASK]",
+    "lstrip": false,
     "normalized": false,
     "rstrip": false,
     "single_word": false
   },
   "pad_token": {
-    "content": "
+    "content": "[PAD]",
     "lstrip": false,
     "normalized": false,
     "rstrip": false,
     "single_word": false
   },
   "sep_token": {
-    "content": "
+    "content": "[SEP]",
     "lstrip": false,
     "normalized": false,
     "rstrip": false,
tokenizer.json
CHANGED
The diff for this file is too large to render. See raw diff.
tokenizer_config.json
CHANGED
@@ -1,73 +1,58 @@
 {
   "added_tokens_decoder": {
     "0": {
-      "content": "
+      "content": "[PAD]",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
       "special": true
     },
-    "
-      "content": "
+    "100": {
+      "content": "[UNK]",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
       "special": true
     },
-    "
-      "content": "
+    "101": {
+      "content": "[CLS]",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
       "special": true
     },
-    "
-      "content": "
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "104": {
-      "content": "[UNK]",
+    "102": {
+      "content": "[SEP]",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
       "special": true
     },
-    "
-      "content": "
-      "lstrip":
+    "103": {
+      "content": "[MASK]",
+      "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
       "special": true
     }
   },
-  "
-  "
-  "
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "[CLS]",
+  "do_basic_tokenize": true,
   "do_lower_case": true,
-  "eos_token": "</s>",
   "extra_special_tokens": {},
-  "mask_token": "
-  "
-  "
-  "
-  "
-  "pad_token_type_id": 0,
-  "padding_side": "right",
-  "sep_token": "</s>",
-  "stride": 0,
+  "mask_token": "[MASK]",
+  "model_max_length": 512,
+  "never_split": null,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
   "strip_accents": null,
   "tokenize_chinese_chars": true,
-  "tokenizer_class": "
-  "truncation_side": "right",
-  "truncation_strategy": "longest_first",
+  "tokenizer_class": "BertTokenizer",
   "unk_token": "[UNK]"
 }
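The tokenizer files now describe a lowercasing WordPiece `BertTokenizer` with `[CLS]`/`[SEP]`/`[PAD]`/`[MASK]`/`[UNK]` specials and a 512-token `model_max_length`. A quick sketch of what that means in practice, loading the base model's tokenizer as a stand-in since this repo's id is not shown on this page:

```python
from transformers import AutoTokenizer

# Stand-in checkpoint: the base model whose tokenizer files match this commit.
tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-small-en-v1.5")

enc = tokenizer("How will rising gold commodities affect my portfolio",
                truncation=True, max_length=512)
print(tokenizer.convert_ids_to_tokens(enc["input_ids"])[:4])
# e.g. ['[CLS]', 'how', 'will', 'rising'] -- lowercased WordPiece with [CLS]/[SEP] added
```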
vocab.txt
CHANGED
@@ -1,7 +1,3 @@
-<s>
-<pad>
-</s>
-<unk>
 [PAD]
 [unused0]
 [unused1]
@@ -30524,4 +30520,3 @@ necessitated
 ##:
 ##?
 ##~
-<mask>