Upload tokenizer
isoformer_tokenizer.py  +1 -6
CHANGED
@@ -36,7 +36,6 @@ class IsoformerTokenizer(PreTrainedTokenizer):
 
     def __init__(
         self,
-        config,
         **kwargs
     ):
 
@@ -55,9 +54,6 @@ class IsoformerTokenizer(PreTrainedTokenizer):
         # protein_hf_tokenizer.eos_token = None # Stops the tokenizer adding an EOS/SEP token at the end
         # protein_hf_tokenizer.init_kwargs["eos_token"] = None # Ensures it doesn't come back when reloading
 
-        self.num_tokens_per_seq_nuctf = config.num_tokens_per_seq_nuctf
-        self.num_tokens_per_seq_nuctf_rna = config.num_tokens_per_seq_nuctf_rna
-        self.num_protein_tokens_per_seq = config.num_protein_tokens_per_seq
         self.dna_tokenizer = dna_hf_tokenizer
         self.rna_tokenizer = rna_hf_tokenizer
         self.protein_tokenizer = protein_hf_tokenizer
@@ -65,12 +61,11 @@ class IsoformerTokenizer(PreTrainedTokenizer):
         self.dna_tokens = open("dna_vocab_list.txt", "r").read().split("\n")
         self.rna_tokens = open("rna_vocab_list.txt", "r").read().split("\n")
         self.protein_tokens = open("protein_vocab_list.txt", "r").read().split("\n")
-        self.config = config
 
         super().__init__(**kwargs)
 
     def __call__(self, dna_input, rna_input, protein_input):
-        dna_output = self.dna_tokenizer(dna_input)
+        dna_output = self.dna_tokenizer(dna_input)
         rna_output = self.rna_tokenizer(rna_input, max_length=1024, padding="max_length")
         protein_output = self.protein_tokenizer(protein_input, max_length=1024, padding="max_length")
         return dna_output, rna_output, protein_output
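This commit removes the `config` argument from `IsoformerTokenizer.__init__` (along with the attributes that were read from it), so the tokenizer no longer needs a model config to be instantiated or reloaded. Below is a minimal usage sketch; the repository id, the `trust_remote_code=True` loading path, and the example sequences are illustrative assumptions, not part of this commit. The `__call__` signature and the `max_length=1024` padding for the RNA and protein sub-tokenizers come from the diff above.

# Minimal usage sketch. Assumptions: the repo id and the example sequences are
# illustrative only; loading custom tokenizer code from the Hub generally
# requires trust_remote_code=True.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "InstaDeepAI/isoformer",  # hypothetical repository id
    trust_remote_code=True,   # isoformer_tokenizer.py is custom code on the Hub
)

# After this change, no config object is involved; __call__ still takes one input
# per modality and returns one encoding per modality.
dna_output, rna_output, protein_output = tokenizer(
    dna_input="ATGCATGCATGC",     # illustrative DNA sequence
    rna_input="AUGGCUACGUUAG",    # illustrative RNA sequence
    protein_input="MKTAYIAKQRQ",  # illustrative protein sequence
)

Per the diff, the RNA and protein inputs are padded to a fixed length of 1024 tokens by their sub-tokenizers, while the DNA input is tokenized with its sub-tokenizer's defaults.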