abdullahalmunem's picture
model added
f81cfe2
from transformers import *
# Special-token ids, keyed first by token style and then by token role.
# Each row below reads: (style, START_SEQ, PAD, END_SEQ, UNK).
TOKEN_IDX = {
    style: {
        'START_SEQ': start_id,
        'PAD': pad_id,
        'END_SEQ': end_id,
        'UNK': unk_id,
    }
    for style, start_id, pad_id, end_id, unk_id in (
        ('bert', 101, 0, 102, 100),
        ('xlm', 0, 2, 1, 3),
        ('roberta', 0, 1, 2, 3),
        ('albert', 2, 0, 3, 1),
    )
}
# Punctuation label name -> class index; a label's index is its position in
# the tuple below. Index 0 is the "no punctuation" class and its key is the
# string '0' (digit zero).
# NOTE(review): the earlier comment spelled this label with the letter 'O';
# confirm which spelling the label data actually uses.
punctuation_dict = {
    label: index
    for index, label in enumerate((
        '0',
        "DARI",
        "COMMA",
        "SEMICOLON",
        "QUESTION",
        "EXCLAMATION",
        "COLON",
        "HYPHEN",
    ))
}
# Class index -> the character written to the output for that class.
# A symbol's index is its position in the tuple; index 0 emits nothing.
punctuation_map = dict(enumerate((
    "",   # 0: no punctuation
    '।',  # 1: 'DARI'
    ',',  # 2: 'COMMA'
    ';',  # 3: 'SEMICOLON'
    '?',  # 4: 'QUESTION'
    '!',  # 5: 'EXCLAMATION'
    ':',  # 6: 'COLON'
    '-',  # 7: 'HYPHEN'
)))
# Registry of supported pretrained models. Each key is a model name (or a
# local path); each value is the tuple
#     (model class, tokenizer class, output dimension, token style)
# where the token style is one of the keys of TOKEN_IDX.
MODELS = {
    checkpoint: (model_cls, tokenizer_cls, out_dim, token_style)
    for checkpoint, model_cls, tokenizer_cls, out_dim, token_style in (
        # BERT
        ('bert-base-uncased', BertModel, BertTokenizer, 768, 'bert'),
        ('bert-large-uncased', BertModel, BertTokenizer, 1024, 'bert'),
        ('bert-base-multilingual-cased', BertModel, BertTokenizer, 768, 'bert'),
        ('bert-base-multilingual-uncased', BertModel, BertTokenizer, 768, 'bert'),
        ('sagorsarker/bangla-bert-base', BertModel, BertTokenizer, 768, 'bert'),
        # Disabled alternative entry, kept for reference:
        # ('distilbert-base-multilingual-cased', AutoModelForMaskedLM, AutoTokenizer, 768, 'bert'),
        # XLM
        ('xlm-mlm-en-2048', XLMModel, XLMTokenizer, 2048, 'xlm'),
        ('xlm-mlm-100-1280', XLMModel, XLMTokenizer, 1280, 'xlm'),
        # RoBERTa
        ('roberta-base', RobertaModel, RobertaTokenizer, 768, 'roberta'),
        ('roberta-large', RobertaModel, RobertaTokenizer, 1024, 'roberta'),
        ('neuralspace-reverie/indic-transformers-bn-roberta', RobertaModel, RobertaTokenizer, 768, 'roberta'),
        # DistilBERT (uses the 'bert' token style)
        ('distilbert-base-uncased', DistilBertModel, DistilBertTokenizer, 768, 'bert'),
        ('distilbert-base-multilingual-cased', DistilBertModel, DistilBertTokenizer, 768, 'bert'),
        ('./distilbert-base-multilingual-cased', DistilBertModel, DistilBertTokenizer, 768, 'bert'),
        # XLM-RoBERTa (uses the 'roberta' token style)
        ('xlm-roberta-base', XLMRobertaModel, XLMRobertaTokenizer, 768, 'roberta'),
        ('xlm-roberta-large', XLMRobertaModel, XLMRobertaTokenizer, 1024, 'roberta'),
        # ALBERT
        ('albert-base-v1', AlbertModel, AlbertTokenizer, 768, 'albert'),
        ('albert-base-v2', AlbertModel, AlbertTokenizer, 768, 'albert'),
        ('albert-large-v2', AlbertModel, AlbertTokenizer, 1024, 'albert'),
    )
}