"""
NeuralQuantum Ollama Tokenizer for Hugging Face Transformers
"""
import json
import os

from transformers import PreTrainedTokenizer


class NeuralQuantumOllamaTokenizer(PreTrainedTokenizer):
"""Tokenizer for NeuralQuantum Ollama model"""
def __init__(
self,
vocab_file=None,
merges_file=None,
tokenizer_file=None,
unk_token="<|endoftext|>",
bos_token="<|endoftext|>",
eos_token="<|endoftext|>",
pad_token="<|endoftext|>",
quantum_token="<|quantum|>",
classical_token="<|classical|>",
system_token="<|system|>",
user_token="<|user|>",
assistant_token="<|assistant|>",
add_prefix_space=False,
**kwargs
):
# Simple vocabulary for demonstration
vocab = {
"<|endoftext|>": 0,
"<|quantum|>": 1,
"<|classical|>": 2,
"<|system|>": 3,
"<|user|>": 4,
"<|assistant|>": 5,
}
        # Add a basic character-level vocabulary (letters, digits, and common punctuation)
for i, char in enumerate("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 .,!?;:'\"-()[]{}"):
vocab[char] = i + 6
        # Set the vocab before calling super().__init__, since the base class may
        # convert special tokens to ids while it registers them
self._vocab = vocab
self._ids_to_tokens = {v: k for k, v in vocab.items()}
        super().__init__(
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
            add_prefix_space=add_prefix_space,
            # Register the custom mode/role tokens as special tokens so they are
            # never split during tokenization.
            additional_special_tokens=kwargs.pop(
                "additional_special_tokens",
                [quantum_token, classical_token, system_token, user_token, assistant_token],
            ),
            **kwargs
        )
self.quantum_token = quantum_token
self.classical_token = classical_token
self.system_token = system_token
self.user_token = user_token
self.assistant_token = assistant_token
@property
def vocab_size(self):
return len(self._vocab)
def get_vocab(self):
return dict(self._vocab)
    def _tokenize(self, text):
        """Character-level tokenization to match the character-level vocabulary"""
        # Every character (including spaces) becomes its own token; characters
        # outside the vocabulary are mapped to unk_token at id-conversion time.
        return list(text)
def _convert_token_to_id(self, token):
"""Convert token to ID"""
return self._vocab.get(token, self._vocab[self.unk_token])
def _convert_id_to_token(self, index):
"""Convert ID to token"""
return self._ids_to_tokens.get(index, self.unk_token)
    def convert_tokens_to_string(self, tokens):
        """Convert character tokens back to a string"""
        return "".join(tokens)
    def save_vocabulary(self, save_directory, filename_prefix=None):
        """Save the vocabulary to a JSON file"""
        vocab_file = f"{filename_prefix}-vocab.json" if filename_prefix else "vocab.json"
        vocab_path = os.path.join(save_directory, vocab_file)
        with open(vocab_path, "w", encoding="utf-8") as f:
            json.dump(self._vocab, f, indent=2, ensure_ascii=False)
        return (vocab_path,)
    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """Build model inputs by appending the end-of-text token"""
        if token_ids_1 is None:
            return token_ids_0 + [self.eos_token_id]
        return token_ids_0 + token_ids_1 + [self.eos_token_id]
    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
        """Get special tokens mask (1 marks tokens added by build_inputs_with_special_tokens)"""
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )
        # build_inputs_with_special_tokens only appends a single eos token, so
        # only the final position is special.
        if token_ids_1 is not None:
            return ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + [1]
        return ([0] * len(token_ids_0)) + [1]
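

# --- Usage sketch (illustrative only) ---------------------------------------
# A minimal example of how the tokenizer above might be exercised, assuming
# only that the `transformers` package is installed; the sample text and
# variable names here are made up for demonstration and are not part of the
# tokenizer itself.
if __name__ == "__main__":
    tokenizer = NeuralQuantumOllamaTokenizer()

    sample = "Hello quantum world"
    tokens = tokenizer.tokenize(sample)                  # character-level tokens
    input_ids = tokenizer.encode(sample)                 # ids with <|endoftext|> appended
    restored = tokenizer.decode(input_ids, skip_special_tokens=True)

    print("tokens:   ", tokens)
    print("input_ids:", input_ids)
    print("restored: ", restored)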