|  | """ | 
					
						
						|  | NeuralQuantum Ollama Tokenizer for Hugging Face Transformers | 
					
						
						|  | """ | 
					
						
						|  |  | 
					
						
import json
import os
from typing import List, Optional, Union

from transformers import PreTrainedTokenizer


class NeuralQuantumOllamaTokenizer(PreTrainedTokenizer):
    """Tokenizer for the NeuralQuantum Ollama model."""

    def __init__(
        self,
        vocab_file=None,
        merges_file=None,
        tokenizer_file=None,
        unk_token="<|endoftext|>",
        bos_token="<|endoftext|>",
        eos_token="<|endoftext|>",
        pad_token="<|endoftext|>",
        quantum_token="<|quantum|>",
        classical_token="<|classical|>",
        system_token="<|system|>",
        user_token="<|user|>",
        assistant_token="<|assistant|>",
        add_prefix_space=False,
        **kwargs
    ):
        # The vocabulary is built in code: special tokens occupy IDs 0-5 and
        # single characters follow from ID 6. The vocab_file/merges_file
        # arguments are accepted for API compatibility but not used.
        vocab = {
            "<|endoftext|>": 0,
            "<|quantum|>": 1,
            "<|classical|>": 2,
            "<|system|>": 3,
            "<|user|>": 4,
            "<|assistant|>": 5,
        }

        for i, char in enumerate("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 .,!?;:'\"-()[]{}"):
            vocab[char] = i + 6

        # The vocab must exist before super().__init__() runs, because the
        # base class may call get_vocab() while registering special tokens.
        self._vocab = vocab
        self._ids_to_tokens = {v: k for k, v in vocab.items()}

        # Expose the role/mode tokens as additional special tokens so that
        # tokenize() keeps them intact instead of splitting them character
        # by character.
        kwargs.setdefault(
            "additional_special_tokens",
            [quantum_token, classical_token, system_token, user_token, assistant_token],
        )

        super().__init__(
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
            add_prefix_space=add_prefix_space,
            **kwargs
        )

        self.quantum_token = quantum_token
        self.classical_token = classical_token
        self.system_token = system_token
        self.user_token = user_token
        self.assistant_token = assistant_token

    @property
    def vocab_size(self):
        return len(self._vocab)

    def get_vocab(self):
        return dict(self._vocab)

    def _tokenize(self, text):
        """Character-level tokenization: every character is its own token.

        The vocabulary only contains single characters (plus the special
        tokens), so splitting on whitespace would turn every multi-character
        word into the unknown token.
        """
        return list(text)

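    # Example (illustrative): _tokenize("Hi!") yields ['H', 'i', '!']. Spaces
    # are ordinary vocab entries, so decoding can reconstruct the exact input.
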
    def _convert_token_to_id(self, token):
        """Convert a token (str) to an ID, falling back to the unknown token."""
        return self._vocab.get(token, self._vocab[self.unk_token])

    def _convert_id_to_token(self, index):
        """Convert an ID back to its token (str)."""
        return self._ids_to_tokens.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens):
        """Join character tokens back into a string."""
        return "".join(tokens)

    def save_vocabulary(self, save_directory, filename_prefix=None):
        """Save the vocabulary to a JSON file and return its path."""
        vocab_file = f"{filename_prefix}-vocab.json" if filename_prefix else "vocab.json"
        vocab_path = os.path.join(save_directory, vocab_file)

        with open(vocab_path, "w", encoding="utf-8") as f:
            json.dump(self._vocab, f, indent=2, ensure_ascii=False)

        return (vocab_path,)

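    # Note: save_pretrained() calls save_vocabulary() under the hood, so
    # tokenizer.save_pretrained("some_dir") also writes vocab.json next to
    # the tokenizer's config files.
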
    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """Append EOS to the sequence(s), following the Ollama-style format."""
        if token_ids_1 is None:
            return token_ids_0 + [self.eos_token_id]
        return token_ids_0 + token_ids_1 + [self.eos_token_id]

    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
        """Return a mask with 1 at special-token positions.

        The mask mirrors build_inputs_with_special_tokens, which only appends
        a trailing EOS, so only the final position is marked as special.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        if token_ids_1 is not None:
            return ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + [1]
        return ([0] * len(token_ids_0)) + [1]
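

if __name__ == "__main__":
    # Minimal usage sketch. Everything below relies only on the class above;
    # the chat-prompt layout (role token followed by text) is an illustrative
    # convention, not a format the model itself mandates.
    tokenizer = NeuralQuantumOllamaTokenizer()

    # Character-level round trip: every character maps to its own ID, and the
    # space character is in the vocab, so the input is fully reconstructible.
    text = "Hello, quantum world!"
    tokens = tokenizer.tokenize(text)
    ids = tokenizer.convert_tokens_to_ids(tokens)
    assert tokenizer.convert_ids_to_tokens(ids) == tokens
    assert tokenizer.convert_tokens_to_string(tokens) == text

    # encode() routes through build_inputs_with_special_tokens, which appends
    # the EOS ID as the final position.
    encoded = tokenizer.encode(text)
    assert encoded[-1] == tokenizer.eos_token_id

    # Role tokens are registered as additional special tokens, so tokenize()
    # keeps them intact rather than splitting them into characters.
    prompt = f"{tokenizer.user_token}What is superposition?{tokenizer.assistant_token}"
    print(tokenizer.tokenize(prompt))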