Upload build_shift_char_tokenizer.py
src/build_shift_char_tokenizer.py
ADDED
# build_shift_char_tokenizer.py
from pathlib import Path
from typing import List

from tokenizers import Tokenizer, Regex, decoders
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import Split
from tokenizers.normalizers import Sequence, Replace, NFKC
from tokenizers.processors import TemplateProcessing
from transformers import PreTrainedTokenizerFast

def build_shift_char_tokenizer(
    out_dir: str,
    base_tokens: List[str],
    *,
    shift_token: str = "↨",
    special_tokens: List[str] = ("<pad>", "<unk>", "<bos>", "<eos>"),
    include_specials_in_128: bool = True,
):
    """
    Create a HF-compatible char tokenizer with SHIFT+lowercase behavior.
    - base_tokens: your full 128-token alphabet if include_specials_in_128=True,
      otherwise your 128 data tokens and we’ll append specials (vocab will be >128).
    - shift_token must be present in base_tokens.
    """
    out = Path(out_dir)
    out.mkdir(parents=True, exist_ok=True)

    # Validate vocab sizing
    deduped = list(dict.fromkeys(base_tokens))  # keep order, dedupe
    if deduped != base_tokens:
        raise ValueError(
            f"base_tokens has duplicates; order must define ids "
            f"(got {len(base_tokens)} tokens, {len(deduped)} unique)."
        )

    if shift_token not in base_tokens:
        raise ValueError(f"'{shift_token}' must be in base_tokens.")

    if include_specials_in_128:
        # specials must already be present in base_tokens
        missing = [t for t in special_tokens if t not in base_tokens]
        if missing:
            raise ValueError(f"special tokens missing from base_tokens: {missing}")
        if len(base_tokens) != 128:
            raise ValueError(f"base_tokens must be exactly 128 when include_specials_in_128=True (got {len(base_tokens)}).")
        vocab_tokens = base_tokens
    else:
        # append specials; vocab_size will exceed 128
        vocab_tokens = base_tokens + [t for t in special_tokens if t not in base_tokens]

    # Build vocab mapping
    token_to_id = {tok: i for i, tok in enumerate(vocab_tokens)}
    unk_token = "<unk>" if "<unk>" in token_to_id else None

    # Model: fixed WordLevel
    model = WordLevel(vocab=token_to_id, unk_token=unk_token)

    # Explicit uppercase mapping avoids backref issues.
    # Only ASCII A-Z is folded here; other uppercase letters pass through unchanged.
    uppercase = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    normalizer_steps = [NFKC()]
    for u in uppercase:
        normalizer_steps.append(Replace(Regex(u), shift_token + u.lower()))
    normalizer = Sequence(normalizer_steps)

    # Pre-tokenizer: isolate every character (grapheme cluster), including newlines
    # pre_tok = Split(Regex(r"(?s)."), behavior="isolated")  # codepoint-level alternative
    pre_tok = Split(Regex(r"\X"), behavior="isolated")
    tok = Tokenizer(model)
    tok.normalizer = normalizer
    tok.pre_tokenizer = pre_tok

    tok.decoder = decoders.Sequence([])  # concatenate tokens verbatim
    # Optional: tidy BOS/EOS on encode if you want them
    # (kept minimal; models often add these themselves)
    if "<bos>" in token_to_id and "<eos>" in token_to_id:
        tok.post_processor = TemplateProcessing(
            single="$0",
            pair="$A $B",
            special_tokens=[
                # add e.g. ("<bos>", id), ("<eos>", id) here if you want automatic wrapping
            ],
        )

    # Wrap in HF fast tokenizer and save
    hf_tok = PreTrainedTokenizerFast(
        tokenizer_object=tok,
        bos_token="<bos>" if "<bos>" in token_to_id else None,
        eos_token="<eos>" if "<eos>" in token_to_id else None,
        unk_token=unk_token,
        pad_token="<pad>" if "<pad>" in token_to_id else None,
        model_max_length=1024,  # adjust for your use case
    )

    # save_pretrained writes tokenizer.json plus tokenizer_config.json
    # (including the model_max_length set above)
    hf_tok.save_pretrained(out_dir)
    print(f"Saved tokenizer to: {out_dir}")
    print(f"Vocab size: {len(vocab_tokens)} (include_specials_in_128={include_specials_in_128})")

if __name__ == "__main__":
    # Example: define your exact 128 tokens including specials and SHIFT.
    # Keep ordering stable; ids are index positions.
    # Below is a sane template to edit. Make sure length == 128.
    SHIFT = "↨"
    specials = ["<pad>", "<unk>", "<bos>", "<eos>"]

    # Base character set (edit this list to be exactly 124 non-specials + 4 specials = 128)
    chars = list("\n\t ")  # newline, tab, space
    chars += list("0123456789")
    chars += list("abcdefghijklmnopqrstuvwxyz")
    # Include punctuation/symbols you need. Keep only what you’ll actually see.
    chars += list("\"!$&'#,/+=-<>*@.:;[]{}()^_?")  # from your sample
    chars += list("èé")  # sample diacritics you mentioned
    # Add SHIFT token
    # Ensure NO uppercase letters are in the vocab (they’re represented via SHIFT+lowercase)
    base_tokens_wo_specials = [SHIFT] + chars

    # If you want exactly 128 including specials, adjust to 124 data tokens + 4 specials
    # Add or remove symbols to hit 124 before specials:
    # Pad with rarely-used placeholders if needed:
    while len(base_tokens_wo_specials) < 124:
        base_tokens_wo_specials.append(f"¤{len(base_tokens_wo_specials)}")  # harmless placeholders
    if len(base_tokens_wo_specials) != 124:
        raise SystemExit(f"Currently have {len(base_tokens_wo_specials)} data tokens; adjust to 124 before specials.")

    base_tokens_including_specials = specials + base_tokens_wo_specials  # specials first is fine

    build_shift_char_tokenizer(
        out_dir="char128_shift_tokenizer",
        base_tokens=base_tokens_including_specials,
        shift_token=SHIFT,
        special_tokens=specials,
        include_specials_in_128=True,
    )
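    # Optional sanity-check sketch: reload the saved tokenizer and eyeball the
    # SHIFT+lowercase mapping plus the decode round-trip. The expected outputs in the
    # comments below are assumptions based on the example vocab defined above and the
    # verbatim-concatenation decoder, not guaranteed behavior for other vocabularies.
    reloaded = PreTrainedTokenizerFast.from_pretrained("char128_shift_tokenizer")
    ids = reloaded.encode("Hello World", add_special_tokens=False)
    print(reloaded.convert_ids_to_tokens(ids))
    # expected: ['↨', 'h', 'e', 'l', 'l', 'o', ' ', '↨', 'w', 'o', 'r', 'l', 'd']
    print(reloaded.decode(ids, skip_special_tokens=True))
    # expected: '↨hello ↨world' (decode returns the shift-folded form, not the original casing)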