""" NeuralQuantum Ollama Tokenizer for Hugging Face Transformers """ import json from typing import List, Optional, Union from transformers import PreTrainedTokenizer class NeuralQuantumOllamaTokenizer(PreTrainedTokenizer): """Tokenizer for NeuralQuantum Ollama model""" def __init__( self, vocab_file=None, merges_file=None, tokenizer_file=None, unk_token="<|endoftext|>", bos_token="<|endoftext|>", eos_token="<|endoftext|>", pad_token="<|endoftext|>", quantum_token="<|quantum|>", classical_token="<|classical|>", system_token="<|system|>", user_token="<|user|>", assistant_token="<|assistant|>", add_prefix_space=False, **kwargs ): # Simple vocabulary for demonstration vocab = { "<|endoftext|>": 0, "<|quantum|>": 1, "<|classical|>": 2, "<|system|>": 3, "<|user|>": 4, "<|assistant|>": 5, } # Add basic vocabulary for i, char in enumerate("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 .,!?;:'\"-()[]{}"): vocab[char] = i + 6 # Set vocab before calling super().__init__ self._vocab = vocab self._ids_to_tokens = {v: k for k, v in vocab.items()} super().__init__( unk_token=unk_token, bos_token=bos_token, eos_token=eos_token, pad_token=pad_token, add_prefix_space=add_prefix_space, **kwargs ) self.quantum_token = quantum_token self.classical_token = classical_token self.system_token = system_token self.user_token = user_token self.assistant_token = assistant_token @property def vocab_size(self): return len(self._vocab) def get_vocab(self): return dict(self._vocab) def _tokenize(self, text): """Basic tokenization - split by whitespace and characters""" tokens = [] current_token = "" for char in text: if char.isspace(): if current_token: tokens.append(current_token) current_token = "" else: current_token += char if current_token: tokens.append(current_token) return tokens def _convert_token_to_id(self, token): """Convert token to ID""" return self._vocab.get(token, self._vocab[self.unk_token]) def _convert_id_to_token(self, index): """Convert ID to token""" return self._ids_to_tokens.get(index, self.unk_token) def convert_tokens_to_string(self, tokens): """Convert tokens back to string""" return " ".join(tokens) def save_vocabulary(self, save_directory, filename_prefix=None): """Save vocabulary to files""" vocab_file = f"{filename_prefix}-vocab.json" if filename_prefix else "vocab.json" vocab_path = f"{save_directory}/{vocab_file}" with open(vocab_path, 'w') as f: json.dump(self._vocab, f, indent=2) return (vocab_path,) def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): """Build input with special tokens for Ollama format""" if token_ids_1 is None: return token_ids_0 + [self.eos_token_id] return token_ids_0 + token_ids_1 + [self.eos_token_id] def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): """Get special tokens mask""" if already_has_special_tokens: return super().get_special_tokens_mask( token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True ) if token_ids_1 is not None: return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] return [1] + ([0] * len(token_ids_0)) + [1]