nahiar committed
Commit 099e31f · verified · 1 Parent(s): 3db365f

Upload folder using huggingface_hub

.gitattributes CHANGED
@@ -1,35 +1,10 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin.* filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
  *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
  *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
  *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.tar.gz filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ model.safetensors filter=lfs diff=lfs merge=lfs -text
+ sentencepiece.bpe.model filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,3 +1,141 @@
- ---
- license: mit
- ---
+ ---
+ language:
+ - multilingual
+ - en
+ - fr
+ - es
+ - de
+ - el
+ - bg
+ - ru
+ - tr
+ - ar
+ - vi
+ - th
+ - zh
+ - hi
+ - sw
+ - ur
+ tags:
+ - text-classification
+ - pytorch
+ - tensorflow
+ datasets:
+ - multi_nli
+ - xnli
+ license: mit
+ pipeline_tag: zero-shot-classification
+ widget:
+ - text: "За кого вы голосуете в 2020 году?"
+   candidate_labels: "politique étrangère, Europe, élections, affaires, politique"
+   multi_class: true
+ - text: "لمن تصوت في 2020؟"
+   candidate_labels: "السياسة الخارجية, أوروبا, الانتخابات, الأعمال, السياسة"
+   multi_class: true
+ - text: "2020'de kime oy vereceksiniz?"
+   candidate_labels: "dış politika, Avrupa, seçimler, ticaret, siyaset"
+   multi_class: true
+ ---
+ 
+ # xlm-roberta-large-xnli
+ 
+ ## Model Description
+ 
+ This model takes [xlm-roberta-large](https://huggingface.co/xlm-roberta-large) and fine-tunes it on a combination of NLI data in 15 languages. It is intended to be used for zero-shot text classification, such as with the Hugging Face [ZeroShotClassificationPipeline](https://huggingface.co/transformers/master/main_classes/pipelines.html#transformers.ZeroShotClassificationPipeline).
+ 
+ ## Intended Usage
+ 
+ This model is intended to be used for zero-shot text classification, especially in languages other than English. It is fine-tuned on XNLI, which is a multilingual NLI dataset. The model can therefore be used with any of the languages in the XNLI corpus:
+ 
+ - English
+ - French
+ - Spanish
+ - German
+ - Greek
+ - Bulgarian
+ - Russian
+ - Turkish
+ - Arabic
+ - Vietnamese
+ - Thai
+ - Chinese
+ - Hindi
+ - Swahili
+ - Urdu
+ 
+ Since the base model was pre-trained on 100 different languages, the model has shown some effectiveness in languages beyond those listed above as well. See the full list of pre-trained languages in appendix A of the [XLM-RoBERTa paper](https://arxiv.org/abs/1911.02116).
+ 
+ For English-only classification, it is recommended to use [bart-large-mnli](https://huggingface.co/facebook/bart-large-mnli) or [a distilled bart MNLI model](https://huggingface.co/models?filter=pipeline_tag%3Azero-shot-classification&search=valhalla).
+ 
+ #### With the zero-shot classification pipeline
+ 
+ The model can be loaded with the `zero-shot-classification` pipeline like so:
+ 
+ ```python
+ from transformers import pipeline
+ classifier = pipeline("zero-shot-classification",
+                       model="joeddav/xlm-roberta-large-xnli")
+ ```
+ 
+ You can then classify in any of the above languages. You can even pass the labels in one language and the sequence to classify in another:
+ 
+ ```python
+ # we will classify the Russian translation of "Who are you voting for in 2020?"
+ sequence_to_classify = "За кого вы голосуете в 2020 году?"
+ # we can specify candidate labels in Russian or any other language above:
+ candidate_labels = ["Europe", "public health", "politics"]
+ classifier(sequence_to_classify, candidate_labels)
+ # {'labels': ['politics', 'Europe', 'public health'],
+ #  'scores': [0.9048484563827515, 0.05722189322113991, 0.03792969882488251],
+ #  'sequence': 'За кого вы голосуете в 2020 году?'}
+ ```
+ 
+ The default hypothesis template is the English `This example is {}.` If you are working strictly within one language, it may be worthwhile to translate it into the language you are working with:
+ 
+ ```python
+ sequence_to_classify = "¿A quién vas a votar en 2020?"
+ candidate_labels = ["Europa", "salud pública", "política"]
+ hypothesis_template = "Este ejemplo es {}."
+ classifier(sequence_to_classify, candidate_labels, hypothesis_template=hypothesis_template)
+ # {'labels': ['política', 'Europa', 'salud pública'],
+ #  'scores': [0.9109585881233215, 0.05954807624220848, 0.029493311420083046],
+ #  'sequence': '¿A quién vas a votar en 2020?'}
+ ```
+ 
+ #### With manual PyTorch
+ 
+ ```python
+ # pose the sequence as an NLI premise and the label as a hypothesis
+ import torch
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer
+ 
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ nli_model = AutoModelForSequenceClassification.from_pretrained('joeddav/xlm-roberta-large-xnli').to(device)
+ tokenizer = AutoTokenizer.from_pretrained('joeddav/xlm-roberta-large-xnli')
+ 
+ sequence = "За кого вы голосуете в 2020 году?"
+ label = "politics"
+ premise = sequence
+ hypothesis = f'This example is {label}.'
+ 
+ # run through model pre-trained on MNLI
+ x = tokenizer.encode(premise, hypothesis, return_tensors='pt',
+                      truncation='only_first')
+ logits = nli_model(x.to(device))[0]
+ 
+ # we throw away "neutral" (dim 1) and take the probability of
+ # "entailment" (2) as the probability of the label being true
+ entail_contradiction_logits = logits[:, [0, 2]]
+ probs = entail_contradiction_logits.softmax(dim=1)
+ prob_label_is_true = probs[:, 1]
+ ```
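+ 
+ To rank several candidate labels manually, the same computation can simply be repeated per label, mirroring the pipeline's multi-label behavior. A minimal sketch reusing the variables from the block above (the helper `score_label` is illustrative, not a library function):
+ 
+ ```python
+ def score_label(premise, label):
+     # score one (premise, label) pair as P(entailment | entailment or contradiction)
+     hypothesis = f'This example is {label}.'
+     x = tokenizer.encode(premise, hypothesis, return_tensors='pt',
+                          truncation='only_first')
+     logits = nli_model(x.to(device))[0]
+     entail_contradiction_logits = logits[:, [0, 2]]
+     return entail_contradiction_logits.softmax(dim=1)[0, 1].item()
+ 
+ candidate_labels = ["Europe", "public health", "politics"]
+ # dict mapping each label to its independent (multi-label) probability
+ scores = {lbl: score_label(sequence, lbl) for lbl in candidate_labels}
+ ```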
+ 
+ ## Training
+ 
+ This model was pre-trained on a set of 100 languages, as described in [the original paper](https://arxiv.org/abs/1911.02116). It was then fine-tuned on the NLI task using the concatenation of the MNLI train set and the XNLI validation and test sets. Finally, it was trained for one additional epoch on XNLI data alone, with the translations shuffled so that the premise and hypothesis of each example come from the same original English example but are in different languages.
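+ 
+ As a rough sketch of how such shuffled pairs could be constructed (illustrative only; the `translations` field and the `make_cross_lingual_pair` helper are hypothetical, not the actual training code):
+ 
+ ```python
+ import random
+ 
+ def make_cross_lingual_pair(example):
+     # example['translations'] maps a language code to that language's
+     # (premise, hypothesis) pair, all derived from one English XNLI example
+     premise_lang, hypothesis_lang = random.sample(list(example['translations']), 2)
+     premise = example['translations'][premise_lang][0]
+     hypothesis = example['translations'][hypothesis_lang][1]
+     return premise, hypothesis, example['label']
+ ```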
config.json ADDED
@@ -0,0 +1,33 @@
+ {
+   "architectures": [
+     "XLMRobertaForSequenceClassification"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "bos_token_id": 0,
+   "eos_token_id": 2,
+   "gradient_checkpointing": false,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 1024,
+   "id2label": {
+     "0": "contradiction",
+     "1": "neutral",
+     "2": "entailment"
+   },
+   "initializer_range": 0.02,
+   "intermediate_size": 4096,
+   "label2id": {
+     "contradiction": 0,
+     "entailment": 2,
+     "neutral": 1
+   },
+   "layer_norm_eps": 1e-05,
+   "max_position_embeddings": 514,
+   "model_type": "xlm-roberta",
+   "num_attention_heads": 16,
+   "num_hidden_layers": 24,
+   "output_past": true,
+   "pad_token_id": 1,
+   "type_vocab_size": 1,
+   "vocab_size": 250002
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8869b0c99ad35ec8a8c92434b54383d2dfd7db8cd460e28b9944a407e3a423e4
+ size 2243825580
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:947e0cebe09808e3279f38ceeca58af60c1fcebefbdfa54642285f35a0f8ec57
+ size 2243942751
sentencepiece.bpe.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865
+ size 5069051
special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "sep_token": "</s>", "pad_token": "<pad>", "cls_token": "<s>", "mask_token": "<mask>"}
tf_model.h5 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:913bd557db7f4e2f85c4d94c4ba30b3342457f6d2d783a8be141ae4018f73325
+ size 2244296816
tokenizer_config.json ADDED
@@ -0,0 +1 @@
+ {"model_max_length": 512}