Corianas committed on
Commit 83f012f · verified · 1 Parent(s): 33e001e

Upload build_shift_char_tokenizer.py

Files changed (1)
  1. src/build_shift_char_tokenizer.py +139 -0
src/build_shift_char_tokenizer.py ADDED
@@ -0,0 +1,139 @@
# build_shift_char_tokenizer.py
import json
from pathlib import Path
from typing import List

from tokenizers import Tokenizer, Regex, decoders
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import Split
from tokenizers.normalizers import Sequence, Replace, NFKC
from tokenizers.processors import TemplateProcessing
from transformers import PreTrainedTokenizerFast


def build_shift_char_tokenizer(
    out_dir: str,
    base_tokens: List[str],
    *,
    shift_token: str = "↨",
    special_tokens: List[str] = ("<pad>", "<unk>", "<bos>", "<eos>"),
    include_specials_in_128: bool = True,
):
    """
    Create an HF-compatible char tokenizer with SHIFT+lowercase behavior.
    - base_tokens: your full 128-token alphabet if include_specials_in_128=True,
      otherwise your 128 data tokens; the specials are appended (vocab will be >128).
    - shift_token must be present in base_tokens.
    """
    out = Path(out_dir)
    out.mkdir(parents=True, exist_ok=True)

    # Validate vocab sizing
    base_set = list(dict.fromkeys(base_tokens))  # keep order, dedupe
    if base_set != base_tokens:
        raise ValueError(f"base_tokens has duplicates; order must define ids. Got {base_tokens}, deduped to {base_set}.")

    if shift_token not in base_tokens:
        raise ValueError(f"'{shift_token}' must be in base_tokens.")

    if include_specials_in_128:
        # specials must already be present in base_tokens
        missing = [t for t in special_tokens if t not in base_tokens]
        if missing:
            raise ValueError(f"special tokens missing from base_tokens: {missing}")
        if len(base_tokens) != 128:
            raise ValueError(f"base_tokens must be exactly 128 when include_specials_in_128=True (got {len(base_tokens)}).")
        vocab_tokens = base_tokens
    else:
        # append specials; vocab_size will exceed 128
        vocab_tokens = base_tokens + [t for t in special_tokens if t not in base_tokens]

    # Build vocab mapping
    token_to_id = {tok: i for i, tok in enumerate(vocab_tokens)}
    unk_token = "<unk>" if "<unk>" in token_to_id else None

    # Model: fixed WordLevel
    model = WordLevel(vocab=token_to_id, unk_token=unk_token)

    # Explicit uppercase mapping avoids backref issues
    uppercase = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    normalizer_steps = [NFKC()]
    for u in uppercase:
        normalizer_steps.append(Replace(Regex(u), shift_token + u.lower()))
    normalizer = Sequence(normalizer_steps)

    # Pre-tokenizer: isolate every character, including newlines
    # (\X matches a full grapheme cluster; the DOTALL variant below also works)
    # pre_tok = Split(Regex(r"(?s)."), behavior="isolated")
    pre_tok = Split(Regex(r"\X"), behavior="isolated")
    tok = Tokenizer(model)
    tok.normalizer = normalizer
    tok.pre_tokenizer = pre_tok

    tok.decoder = decoders.Sequence([])  # concatenate tokens verbatim
    # Optional: tidy BOS/EOS on encode if you want them
    # (kept minimal; models often add these themselves)
    if "<bos>" in token_to_id and "<eos>" in token_to_id:
        tok.post_processor = TemplateProcessing(
            single="$0",
            pair="$A $B",
            special_tokens=[
                # add e.g. ("<bos>", id), ("<eos>", id) here if you want automatic wrapping
            ],
        )

    # Wrap in HF fast tokenizer and save
    hf_tok = PreTrainedTokenizerFast(
        tokenizer_object=tok,
        bos_token="<bos>" if "<bos>" in token_to_id else None,
        eos_token="<eos>" if "<eos>" in token_to_id else None,
        unk_token=unk_token,
        pad_token="<pad>" if "<pad>" in token_to_id else None,
    )

    hf_tok.save_pretrained(out_dir)

    # metadata for HF: update tokenizer_config.json *after* save_pretrained,
    # which writes its own copy and would overwrite anything written earlier
    cfg_path = out / "tokenizer_config.json"
    config = json.loads(cfg_path.read_text(encoding="utf-8"))
    config["model_max_length"] = 1024  # adjust for your use case
    cfg_path.write_text(json.dumps(config, indent=2), encoding="utf-8")

    print(f"Saved tokenizer to: {out_dir}")
    print(f"Vocab size: {len(vocab_tokens)} (include_specials_in_128={include_specials_in_128})")


if __name__ == "__main__":
    # Example: define your exact 128 tokens including specials and SHIFT.
    # Keep ordering stable; ids are index positions.
    # Below is a sane template to edit. Make sure length == 128.
    SHIFT = "↨"
    specials = ["<pad>", "<unk>", "<bos>", "<eos>"]

    # Base character set (edit this list to be exactly 124 non-specials + 4 specials = 128)
    chars = list("\n\t ")  # newline, tab, space
    chars += list("0123456789")
    chars += list("abcdefghijklmnopqrstuvwxyz")
    # Include punctuation/symbols you need. Keep only what you'll actually see.
    chars += list("\"!$&'#,/+=-<>*@.:;[]{}()^_?")  # from your sample
    chars += list("èé")  # sample diacritics you mentioned
    # Add the SHIFT token.
    # Ensure NO uppercase letters are in the vocab (they're represented via SHIFT+lowercase).
    base_tokens_wo_specials = [SHIFT] + chars

    # If you want exactly 128 including specials, adjust to 124 data tokens + 4 specials.
    # Add or remove symbols to hit 124 before specials;
    # pad with rarely-used placeholders if needed:
    while len(base_tokens_wo_specials) < 124:
        base_tokens_wo_specials.append(f"¤{len(base_tokens_wo_specials)}")  # harmless placeholders
    if len(base_tokens_wo_specials) != 124:
        raise SystemExit(f"Currently have {len(base_tokens_wo_specials)} data tokens; adjust to 124 before specials.")

    base_tokens_including_specials = specials + base_tokens_wo_specials  # specials first is fine

    build_shift_char_tokenizer(
        out_dir="char128_shift_tokenizer",
        base_tokens=base_tokens_including_specials,
        shift_token=SHIFT,
        special_tokens=specials,
        include_specials_in_128=True,
    )
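
As a quick sanity check after running the script, a sketch along these lines loads the saved tokenizer and exercises the SHIFT+lowercase behavior. The file name and the unshift helper are illustrative rather than part of this commit; the directory name matches the out_dir used above, and the round-trip print assumes the empty decoder sequence concatenates tokens verbatim, as the comment in the script states.

# verify_shift_tokenizer.py: illustrative check, not part of this commit
from transformers import PreTrainedTokenizerFast

SHIFT = "↨"  # must match the shift_token used when building the tokenizer

tok = PreTrainedTokenizerFast.from_pretrained("char128_shift_tokenizer")

text = "Hello World"
ids = tok(text, add_special_tokens=False)["input_ids"]
print(tok.convert_ids_to_tokens(ids))
# expected: ['↨', 'h', 'e', 'l', 'l', 'o', ' ', '↨', 'w', 'o', 'r', 'l', 'd']


def unshift(s: str, shift: str = SHIFT) -> str:
    """Reverse the SHIFT+lowercase convention: '↨h' -> 'H' (illustrative helper)."""
    out, upper_next = [], False
    for ch in s:
        if ch == shift:
            upper_next = True
            continue
        out.append(ch.upper() if upper_next else ch)
        upper_next = False
    return "".join(out)


decoded = tok.decode(ids, skip_special_tokens=True)
print(unshift(decoded))  # should round-trip back to "Hello World"

If any <unk> pieces show up in the printed tokens, the input contains a character that is missing from the 128-token alphabet defined in the __main__ block.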