rntc committed
Commit a791a0b · verified · 1 parent: 3a27f88

Upload CamemBERT-v2 multitask classifier checkpoint-49500

config.json ADDED
@@ -0,0 +1,526 @@
{
  "architectures": [
    "MultiTaskClsModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 1,
  "classifier_dropout": null,
  "embedding_size": 768,
  "eos_token_id": 2,
  "finetuning_task": "text-classification",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label_dict": {
    "age_group": { "0": "adult", "1": "elderly", "2": "not_specified", "3": "pediatric" },
    "assertion_type": { "0": "factual", "1": "hypothetical", "2": "mixed", "3": "opinion", "4": "recommendation" },
    "certainty_level": { "0": "definitive", "1": "possible", "2": "probable", "3": "uncertain" },
    "contains_abbreviations": { "0": "0", "1": "1" },
    "contains_bias": { "0": "0", "1": "1" },
    "contains_numbers": { "0": "0", "1": "1" },
    "content_novelty": { "0": "established", "1": "outdated", "2": "recent_developments" },
    "content_richness": { "0": "1", "1": "2", "2": "3", "3": "4", "4": "5" },
    "content_type": { "0": "background_review", "1": "clinical_guidance", "2": "drug_information", "3": "medical_knowledge", "4": "other", "5": "patient_case", "6": "policy_administrative", "7": "research_findings", "8": "research_methodology" },
    "educational_score": { "0": "1", "1": "2", "2": "3", "3": "4", "4": "5" },
    "interactive_elements": { "0": "instructions", "1": "none", "2": "questions", "3": "tasks" },
    "list_format": { "0": "0", "1": "1" },
    "medical_subfield": {
      "0": "anatomical_pathology", "1": "anesthesiology", "2": "biology_medicine", "3": "cardiology", "4": "dentistry",
      "5": "dermatology", "6": "digestive_surgery", "7": "endocrinology", "8": "gastroenterology", "9": "general_medicine",
      "10": "general_surgery", "11": "genetics", "12": "geriatrics", "13": "gynecology_medical", "14": "gynecology_obstetrics",
      "15": "hematology", "16": "intensive_care", "17": "internal_medicine", "18": "maxillofacial_surgery", "19": "midwifery",
      "20": "nephrology", "21": "neurology", "22": "neurosurgery", "23": "nuclear_medicine", "24": "occupational_medicine",
      "25": "oncology", "26": "ophthalmology", "27": "oral_surgery", "28": "orthodontics", "29": "orthopedic_surgery",
      "30": "other", "31": "otolaryngology", "32": "pediatric_surgery", "33": "pediatrics", "34": "pharmacy",
      "35": "plastic_surgery", "36": "pneumology", "37": "psychiatry", "38": "public_health", "39": "radiology",
      "40": "rehabilitation", "41": "rheumatology", "42": "thoracic_surgery", "43": "urologic_surgery", "44": "vascular_surgery"
    },
    "pretraining_suitable": { "0": "0", "1": "1" },
    "rewriting_needed": { "0": "0", "1": "1" },
    "sex": { "0": "female", "1": "male", "2": "not_specified" },
    "terminology_precision": { "0": "1", "1": "2", "2": "3", "3": "4", "4": "5" },
    "text_type": { "0": "incomplete", "1": "meaningful" },
    "writing_quality": { "0": "1", "1": "2", "2": "3", "3": "4", "4": "5" },
    "writing_style": { "0": "academic", "1": "clinical", "2": "other", "3": "pedagogical", "4": "regulatory" }
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id_dict": {
    "age_group": { "adult": 0, "elderly": 1, "not_specified": 2, "pediatric": 3 },
    "assertion_type": { "factual": 0, "hypothetical": 1, "mixed": 2, "opinion": 3, "recommendation": 4 },
    "certainty_level": { "definitive": 0, "possible": 1, "probable": 2, "uncertain": 3 },
    "contains_abbreviations": { "0": 0, "1": 1 },
    "contains_bias": { "0": 0, "1": 1 },
    "contains_numbers": { "0": 0, "1": 1 },
    "content_novelty": { "established": 0, "outdated": 1, "recent_developments": 2 },
    "content_richness": { "1": 0, "2": 1, "3": 2, "4": 3, "5": 4 },
    "content_type": { "background_review": 0, "clinical_guidance": 1, "drug_information": 2, "medical_knowledge": 3, "other": 4, "patient_case": 5, "policy_administrative": 6, "research_findings": 7, "research_methodology": 8 },
    "educational_score": { "1": 0, "2": 1, "3": 2, "4": 3, "5": 4 },
    "interactive_elements": { "instructions": 0, "none": 1, "questions": 2, "tasks": 3 },
    "list_format": { "0": 0, "1": 1 },
    "medical_subfield": {
      "anatomical_pathology": 0, "anesthesiology": 1, "biology_medicine": 2, "cardiology": 3, "dentistry": 4,
      "dermatology": 5, "digestive_surgery": 6, "endocrinology": 7, "gastroenterology": 8, "general_medicine": 9,
      "general_surgery": 10, "genetics": 11, "geriatrics": 12, "gynecology_medical": 13, "gynecology_obstetrics": 14,
      "hematology": 15, "intensive_care": 16, "internal_medicine": 17, "maxillofacial_surgery": 18, "midwifery": 19,
      "nephrology": 20, "neurology": 21, "neurosurgery": 22, "nuclear_medicine": 23, "occupational_medicine": 24,
      "oncology": 25, "ophthalmology": 26, "oral_surgery": 27, "orthodontics": 28, "orthopedic_surgery": 29,
      "other": 30, "otolaryngology": 31, "pediatric_surgery": 32, "pediatrics": 33, "pharmacy": 34,
      "plastic_surgery": 35, "pneumology": 36, "psychiatry": 37, "public_health": 38, "radiology": 39,
      "rehabilitation": 40, "rheumatology": 41, "thoracic_surgery": 42, "urologic_surgery": 43, "vascular_surgery": 44
    },
    "pretraining_suitable": { "0": 0, "1": 1 },
    "rewriting_needed": { "0": 0, "1": 1 },
    "sex": { "female": 0, "male": 1, "not_specified": 2 },
    "terminology_precision": { "1": 0, "2": 1, "3": 2, "4": 3, "5": 4 },
    "text_type": { "incomplete": 0, "meaningful": 1 },
    "writing_quality": { "1": 0, "2": 1, "3": 2, "4": 3, "5": 4 },
    "writing_style": { "academic": 0, "clinical": 1, "other": 2, "pedagogical": 3, "regulatory": 4 }
  },
  "labels_list": [
    ["1", "2", "3", "4", "5"],
    ["1", "2", "3", "4", "5"],
    ["1", "2", "3", "4", "5"],
    ["1", "2", "3", "4", "5"],
    ["0", "1"],
    ["0", "1"],
    ["0", "1"],
    ["academic", "clinical", "other", "pedagogical", "regulatory"],
    ["background_review", "clinical_guidance", "drug_information", "medical_knowledge", "other", "patient_case", "policy_administrative", "research_findings", "research_methodology"],
    ["anatomical_pathology", "anesthesiology", "biology_medicine", "cardiology", "dentistry", "dermatology", "digestive_surgery", "endocrinology", "gastroenterology", "general_medicine", "general_surgery", "genetics", "geriatrics", "gynecology_medical", "gynecology_obstetrics", "hematology", "intensive_care", "internal_medicine", "maxillofacial_surgery", "midwifery", "nephrology", "neurology", "neurosurgery", "nuclear_medicine", "occupational_medicine", "oncology", "ophthalmology", "oral_surgery", "orthodontics", "orthopedic_surgery", "other", "otolaryngology", "pediatric_surgery", "pediatrics", "pharmacy", "plastic_surgery", "pneumology", "psychiatry", "public_health", "radiology", "rehabilitation", "rheumatology", "thoracic_surgery", "urologic_surgery", "vascular_surgery"],
    ["adult", "elderly", "not_specified", "pediatric"],
    ["female", "male", "not_specified"],
    ["factual", "hypothetical", "mixed", "opinion", "recommendation"],
    ["definitive", "possible", "probable", "uncertain"],
    ["0", "1"],
    ["0", "1"],
    ["0", "1"],
    ["instructions", "none", "questions", "tasks"],
    ["established", "outdated", "recent_developments"],
    ["incomplete", "meaningful"]
  ],
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 1025,
  "model_name": "camembertv2-base",
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_biased_input": true,
  "position_embedding_type": "absolute",
  "problem_types": [
    "single_label_classification", "single_label_classification", "single_label_classification", "single_label_classification",
    "single_label_classification", "single_label_classification", "single_label_classification", "single_label_classification",
    "single_label_classification", "single_label_classification", "single_label_classification", "single_label_classification",
    "single_label_classification", "single_label_classification", "single_label_classification", "single_label_classification",
    "single_label_classification", "single_label_classification", "single_label_classification", "single_label_classification"
  ],
  "torch_dtype": "float32",
  "transformers_version": "4.55.0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 32768
}
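For orientation: this configuration declares 20 single-label classification heads, with `labels_list` holding one label set per head, `problem_types` the per-head problem type, and `id2label_dict` / `label2id_dict` the per-task index-to-name mappings. A minimal editorial inspection sketch (not part of the commit; the local "config.json" path is illustrative):

import json

# Inspect the multitask label spaces declared in the config above.
with open("config.json", encoding="utf-8") as f:
    cfg = json.load(f)

print(len(cfg["labels_list"]))     # 20 classification heads
print(set(cfg["problem_types"]))   # {"single_label_classification"}

# Per-task index -> label maps, keyed by task name.
for task, id2label in cfg["id2label_dict"].items():
    print(f"{task}: {len(id2label)} classes")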
model.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:34b2a11e468c40c80b7d6cc7c451b6e2f95f80a544ed8597fa2e7452d976b8d5
size 449148280
multitask_transformer/__pycache__/configuration_multitask.cpython-312.pyc ADDED
Binary file (1.26 kB).
 
multitask_transformer/__pycache__/modeling_multitask.cpython-312.pyc ADDED
Binary file (10.1 kB).
 
multitask_transformer/configuration_multitask.py ADDED
@@ -0,0 +1,26 @@
from transformers import AutoConfig, PretrainedConfig
from transformers.utils import logging

logger = logging.get_logger(__name__)


class MultiTaskClsConfig(PretrainedConfig):
    model_type = "multitaskcls"

    def __init__(
        self,
        problem_types=None,
        labels_list=None,
        label2id_dict=None,
        id2label_dict=None,
        **kwargs
    ):
        super().__init__(**kwargs)
        # create attributes from the keys in kwargs
        for key, value in kwargs.items():
            setattr(self, key, value)
        self.num_tasks = len(labels_list) if labels_list is not None else 0
        self.labels_list = labels_list
        self.problem_types = problem_types
        self.label2id_dict = label2id_dict
        self.id2label_dict = id2label_dict
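A hedged usage sketch (editorial, not part of the commit) showing how the config derives `num_tasks` from `labels_list`; the two tasks below are illustrative, while the real checkpoint defines 20 (see config.json above):

from multitask_transformer.configuration_multitask import MultiTaskClsConfig

# Illustrative values only.
config = MultiTaskClsConfig(
    problem_types=["single_label_classification", "single_label_classification"],
    labels_list=[["0", "1"], ["low", "medium", "high"]],
    hidden_size=768,
)
print(config.num_tasks)  # 2, computed as len(labels_list)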
multitask_transformer/modeling_multitask.py ADDED
@@ -0,0 +1,198 @@
import importlib
from dataclasses import dataclass
from typing import List, Optional, Tuple, Union

import torch
import torch.nn as nn
from transformers import AutoModel, PreTrainedModel
from transformers.modeling_outputs import SequenceClassifierOutput
from transformers.models.auto.modeling_auto import MODEL_MAPPING_NAMES
from transformers.utils import ModelOutput, logging

from .configuration_multitask import MultiTaskClsConfig

logger = logging.get_logger(__name__)


@dataclass
class MultiTaskSequenceClassifierOutput(ModelOutput):
    """
    Base class for outputs of sentence classification models.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Classification (or regression if config.num_labels==1) loss.
        logits_list (list of `torch.FloatTensor`, one per task, each of shape `(batch_size, num_labels)`):
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: Optional[torch.FloatTensor] = None
    logits_list: List[torch.FloatTensor] = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None


class MultiTaskClsModel(PreTrainedModel):
    config_class = MultiTaskClsConfig

    def __init__(self, config: MultiTaskClsConfig):
        super().__init__(config)
        model_cls_str = MODEL_MAPPING_NAMES[config.model_type]
        model_cls = getattr(importlib.import_module("transformers"), model_cls_str)
        transformer_encoder = model_cls._from_config(config)
        self.model_prefix = transformer_encoder.base_model_prefix
        # create a variable with the same name as the prefix
        setattr(self, self.model_prefix, transformer_encoder)

        classifier_dropout = (
            config.classifier_dropout
            if config.classifier_dropout is not None
            else config.hidden_dropout_prob
        )

        self.dense = nn.Linear(config.hidden_size, config.hidden_size)

        self.dropout = nn.Dropout(classifier_dropout)

        self.num_tasks = len(config.problem_types)
        self.labels_list = config.labels_list
        self.num_labels = [
            len(labels) if labels is not None else 1 for labels in self.labels_list
        ]
        self.problem_types = (
            [None] * self.num_tasks
            if config.problem_types is None
            else config.problem_types
        )
        self.cls_task_heads = nn.ModuleList(
            [
                nn.Linear(self.config.hidden_size, _num_labels)
                for _num_labels in self.num_labels
            ]
        )

        # Initialize weights and apply final processing
        self.post_init()

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, nn.Linear):
            # Slightly different from the TF version which uses truncated_normal for initialization
            # cf https://github.com/pytorch/pytorch/pull/5617
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[List[torch.Tensor]] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], List[MultiTaskSequenceClassifierOutput]]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        # get attributes from the self.model_prefix
        transformer_encoder = getattr(self, self.model_prefix)

        outputs = transformer_encoder(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)

        # List of logits for each task
        logits_list = [task_head(pooled_output) for task_head in self.cls_task_heads]
        losses = []
        loss = None
        if labels is not None:
            for logits, task_labels, task_type, num_labels in zip(
                logits_list, labels, self.problem_types, self.num_labels
            ):
                if task_type is None:
                    if num_labels == 1:
                        task_type = "regression"
                    elif num_labels > 1 and (
                        task_labels.dtype == torch.long
                        or task_labels.dtype == torch.int
                    ):
                        task_type = "single_label_classification"
                    else:
                        task_type = "multi_label_classification"

                if task_type == "regression":
                    loss_fct = nn.MSELoss()
                    if num_labels == 1:
                        loss = loss_fct(logits.squeeze(), task_labels.squeeze())
                    else:
                        loss = loss_fct(logits, task_labels)
                elif task_type == "single_label_classification":
                    loss_fct = nn.CrossEntropyLoss()
                    if task_labels.shape == logits.view(-1, num_labels).shape:
                        loss = loss_fct(logits.view(-1, num_labels), task_labels)
                    else:
                        loss = loss_fct(
                            logits.view(-1, num_labels), task_labels.view(-1)
                        )
                elif task_type == "multi_label_classification":
                    loss_fct = nn.BCEWithLogitsLoss()
                    loss = loss_fct(logits, task_labels)
                else:
                    raise ValueError(f"Task type '{task_type}' not supported")

                losses.append(loss)

            loss = torch.stack(losses).sum()

        if not return_dict:
            output = (logits_list,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return MultiTaskSequenceClassifierOutput(
            loss=loss,
            logits_list=logits_list,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
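A hedged inference sketch (editorial, not part of the committed files). It assumes the checkpoint directory has been downloaded locally and that the `multitask_transformer` package is importable from the working directory; the path and example sentence are illustrative:

import torch
from transformers import AutoTokenizer

from multitask_transformer.modeling_multitask import MultiTaskClsModel

ckpt = "./checkpoint-49500"  # local path, illustrative
tokenizer = AutoTokenizer.from_pretrained(ckpt)
model = MultiTaskClsModel.from_pretrained(ckpt)
model.eval()

inputs = tokenizer("Le patient présente une douleur thoracique.", return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# One logits tensor per task head; argmax gives the predicted class index per head.
# Mapping indices back to names goes through config.id2label_dict once the
# head-to-task ordering is known (it is not recorded explicitly in config.json).
pred_ids = [logits.argmax(dim=-1).item() for logits in outputs.logits_list]
print(pred_ids)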
special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
{
  "bos_token": { "content": "[CLS]", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false },
  "cls_token": { "content": "[CLS]", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false },
  "eos_token": { "content": "[SEP]", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false },
  "mask_token": { "content": "[MASK]", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false },
  "pad_token": { "content": "[PAD]", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false },
  "sep_token": { "content": "[SEP]", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false },
  "unk_token": { "content": "[UNK]", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false }
}
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,58 @@
{
  "add_prefix_space": true,
  "added_tokens_decoder": {
    "0": { "content": "[PAD]", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true },
    "1": { "content": "[CLS]", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true },
    "2": { "content": "[SEP]", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true },
    "3": { "content": "[UNK]", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true },
    "4": { "content": "[MASK]", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true }
  },
  "bos_token": "[CLS]",
  "clean_up_tokenization_spaces": true,
  "cls_token": "[CLS]",
  "eos_token": "[SEP]",
  "errors": "replace",
  "extra_special_tokens": {},
  "mask_token": "[MASK]",
  "model_max_length": 1024,
  "pad_token": "[PAD]",
  "sep_token": "[SEP]",
  "tokenizer_class": "RobertaTokenizer",
  "trim_offsets": true,
  "unk_token": "[UNK]"
}
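One practical note (editorial): `model_max_length` is 1024, one below `max_position_embeddings` (1025) in config.json, so long inputs should be truncated at tokenization time. A small illustrative sketch, assuming a local copy of the checkpoint:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("./checkpoint-49500")  # local path, illustrative
enc = tokenizer(
    "Texte clinique potentiellement très long...",
    truncation=True,
    max_length=tokenizer.model_max_length,  # 1024
    return_tensors="pt",
)
print(enc["input_ids"].shape)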
vocab.txt ADDED
The diff for this file is too large to render. See raw diff