"""
NeuralQuantum Ollama Tokenizer for Hugging Face Transformers
"""

import json
import os

from transformers import PreTrainedTokenizer


class NeuralQuantumOllamaTokenizer(PreTrainedTokenizer):
    """Tokenizer for NeuralQuantum Ollama model"""
    
    def __init__(
        self,
        vocab_file=None,
        merges_file=None,
        tokenizer_file=None,
        unk_token="<|endoftext|>",
        bos_token="<|endoftext|>",
        eos_token="<|endoftext|>",
        pad_token="<|endoftext|>",
        quantum_token="<|quantum|>",
        classical_token="<|classical|>",
        system_token="<|system|>",
        user_token="<|user|>",
        assistant_token="<|assistant|>",
        add_prefix_space=False,
        **kwargs
    ):
        # Simple vocabulary for demonstration
        vocab = {
            "<|endoftext|>": 0,
            "<|quantum|>": 1,
            "<|classical|>": 2,
            "<|system|>": 3,
            "<|user|>": 4,
            "<|assistant|>": 5,
        }
        
        # Add basic vocabulary
        for i, char in enumerate("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 .,!?;:'\"-()[]{}"):
            vocab[char] = i + 6
        
        # Set vocab before calling super().__init__
        self._vocab = vocab
        self._ids_to_tokens = {v: k for k, v in vocab.items()}
            
        # Register the chat/mode markers as additional special tokens so they are
        # kept intact during tokenization (callers may still override this via kwargs).
        kwargs.setdefault(
            "additional_special_tokens",
            [quantum_token, classical_token, system_token, user_token, assistant_token],
        )

        super().__init__(
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
            add_prefix_space=add_prefix_space,
            **kwargs
        )
        
        self.quantum_token = quantum_token
        self.classical_token = classical_token
        self.system_token = system_token
        self.user_token = user_token
        self.assistant_token = assistant_token
        
    @property
    def vocab_size(self):
        return len(self._vocab)
        
    def get_vocab(self):
        return dict(self._vocab)
        
    def _tokenize(self, text):
        """Character-level tokenization: split the text into individual characters
        (including spaces), matching the character-based vocabulary built in __init__."""
        return list(text)
        
    def _convert_token_to_id(self, token):
        """Convert token to ID"""
        return self._vocab.get(token, self._vocab[self.unk_token])
        
    def _convert_id_to_token(self, index):
        """Convert ID to token"""
        return self._ids_to_tokens.get(index, self.unk_token)
        
    def convert_tokens_to_string(self, tokens):
        """Convert character tokens back to a string by concatenation"""
        return "".join(tokens)
        
    def save_vocabulary(self, save_directory, filename_prefix=None):
        """Save the vocabulary to a JSON file and return its path"""
        vocab_file = f"{filename_prefix}-vocab.json" if filename_prefix else "vocab.json"
        vocab_path = os.path.join(save_directory, vocab_file)
        
        with open(vocab_path, "w", encoding="utf-8") as f:
            json.dump(self._vocab, f, indent=2, ensure_ascii=False)
            
        return (vocab_path,)
        
    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """Build input with special tokens for Ollama format"""
        if token_ids_1 is None:
            return token_ids_0 + [self.eos_token_id]
        return token_ids_0 + token_ids_1 + [self.eos_token_id]
        
    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
        """Get special tokens mask (mirrors build_inputs_with_special_tokens: a single EOS is appended)"""
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )
            
        if token_ids_1 is not None:
            return ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + [1]
        return ([0] * len(token_ids_0)) + [1]
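

# Minimal usage sketch (not part of the tokenizer API above): instantiates the
# class defined in this module and round-trips a made-up prompt through
# encode/decode to show the character-level ids and the appended EOS token.
# The example text is purely illustrative.
if __name__ == "__main__":
    tokenizer = NeuralQuantumOllamaTokenizer()

    text = "<|user|>Hello quantum world!<|assistant|>"
    ids = tokenizer.encode(text)                    # character ids plus trailing EOS
    tokens = tokenizer.convert_ids_to_tokens(ids)   # special tokens stay intact
    decoded = tokenizer.decode(ids, skip_special_tokens=True)

    print("vocab size:", tokenizer.vocab_size)
    print("ids:", ids)
    print("tokens:", tokens)
    print("decoded:", decoded)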