from transformers import *


# Integer ids for the START_SEQ / PAD / END_SEQ / UNK entries, grouped by
# tokenizer family. The family names used as keys here ('bert', 'xlm',
# 'roberta', 'albert') are the same strings that appear as the last element
# of every MODELS entry.
# NOTE(review): the values look like the special-token vocabulary ids of the
# corresponding pretrained tokenizers -- confirm against the tokenizer vocab
# before adding a new family.
TOKEN_IDX = {
    'bert': {
        'START_SEQ': 101,
        'PAD': 0,
        'END_SEQ': 102,
        'UNK': 100,
    },
    'xlm': {
        'START_SEQ': 0,
        'PAD': 2,
        'END_SEQ': 1,
        'UNK': 3,
    },
    'roberta': {
        'START_SEQ': 0,
        'PAD': 1,
        'END_SEQ': 2,
        'UNK': 3,
    },
    'albert': {
        'START_SEQ': 2,
        'PAD': 0,
        'END_SEQ': 3,
        'UNK': 1,
    },
}


# Punctuation label name -> integer class index (0..7).
# The indices are the same integers used as keys of punctuation_map, so a
# label can be turned into its printable symbol via
# punctuation_map[punctuation_dict[label]].
# The label for index 0 is the *string* '0'; punctuation_map pairs index 0
# with the empty string.
punctuation_dict = {
    '0': 0,
    'DARI': 1,
    'COMMA': 2,
    'SEMICOLON': 3,
    'QUESTION': 4,
    'EXCLAMATION': 5,
    'COLON': 6,
    'HYPHEN': 7,
}


# Integer class index -> text emitted for that class.
# The keys are the same integers used as values of punctuation_dict.
# Index 0 maps to the empty string; index 1 is the danda character
# (U+0964), labelled "DARI" in punctuation_dict.
punctuation_map = {
    0: '',
    1: '।',
    2: ',',
    3: ';',
    4: '?',
    5: '!',
    6: ':',
    7: '-',
}


# Registry of supported pretrained checkpoints.
# Key: checkpoint identifier string.
# Value: 4-tuple of
#   (model class, tokenizer class, integer size, token-family key)
# where the token-family key is one of the keys of TOKEN_IDX
# ('bert', 'xlm', 'roberta', 'albert').
# NOTE(review): the integer is presumably the encoder's hidden dimension
# (768 / 1024 / 1280 / 2048) -- confirm against the code that consumes it.
MODELS = {
    'bert-base-uncased': (BertModel, BertTokenizer, 768, 'bert'),
    'bert-large-uncased': (BertModel, BertTokenizer, 1024, 'bert'),
    'bert-base-multilingual-cased': (BertModel, BertTokenizer, 768, 'bert'),
    'bert-base-multilingual-uncased': (BertModel, BertTokenizer, 768, 'bert'),
    'sagorsarker/bangla-bert-base': (BertModel, BertTokenizer, 768, 'bert'),

    'xlm-mlm-en-2048': (XLMModel, XLMTokenizer, 2048, 'xlm'),
    'xlm-mlm-100-1280': (XLMModel, XLMTokenizer, 1280, 'xlm'),
    'roberta-base': (RobertaModel, RobertaTokenizer, 768, 'roberta'),
    'roberta-large': (RobertaModel, RobertaTokenizer, 1024, 'roberta'),
    'neuralspace-reverie/indic-transformers-bn-roberta': (RobertaModel, RobertaTokenizer, 768, 'roberta'),
    # DistilBERT checkpoints are tagged with the 'bert' token family.
    'distilbert-base-uncased': (DistilBertModel, DistilBertTokenizer, 768, 'bert'),
    'distilbert-base-multilingual-cased': (DistilBertModel, DistilBertTokenizer, 768, 'bert'),
    # NOTE(review): the './'-prefixed key looks like a local checkpoint
    # directory for the same model -- verify it is still needed.
    './distilbert-base-multilingual-cased': (DistilBertModel, DistilBertTokenizer, 768, 'bert'),
    # XLM-RoBERTa checkpoints are tagged with the 'roberta' token family.
    'xlm-roberta-base': (XLMRobertaModel, XLMRobertaTokenizer, 768, 'roberta'),
    'xlm-roberta-large': (XLMRobertaModel, XLMRobertaTokenizer, 1024, 'roberta'),
    'albert-base-v1': (AlbertModel, AlbertTokenizer, 768, 'albert'),
    'albert-base-v2': (AlbertModel, AlbertTokenizer, 768, 'albert'),
    'albert-large-v2': (AlbertModel, AlbertTokenizer, 1024, 'albert'),
}