tokenizer

Browse files

Files changed (10) hide show

scripts/{core_base_datasets.py → base_datasets.py} +1 -1
scripts/{core_instruct_datasets.py → base_instruct_datasets.py} +1 -1
scripts/{prepare_core_datasets.py → prepare_base_datasets.py} +14 -13
scripts/{pretrain_core_model_0.yaml → pretrain_base_model_0.yaml} +0 -0
scripts/train_tokenizer.py +3 -3
tokenizer/merges.txt +0 -0
tokenizer/special_tokens_map.json +6 -0
tokenizer/tokenizer.json +3 -0
tokenizer/tokenizer_config.json +525 -0
tokenizer/vocab.json +0 -0

scripts/{core_base_datasets.py → base_datasets.py} RENAMED Viewed

@@ -1,4 +1,4 @@
-core_base_datasets = [
     #
     # general
     #

+base_datasets = [
     #
     # general
     #

scripts/{core_instruct_datasets.py → base_instruct_datasets.py} RENAMED Viewed

@@ -26,7 +26,7 @@ Response Guidelines:
 - Concise yet Complete: Ensure responses are informative, yet to the point without unnecessary elaboration.
 - Maintain a professional, intelligent, and analytical tone in all interactions.'''
-core_instruct_datasets = [
     # 65.7 MB, 11,578
     # 1.89k
     {'kind': 'instruct', 'path': 'NousResearch/hermes-function-calling-v1', 'data_files': 'func-calling-singleturn.json', 'split': 'train', 'field': 'conversations', 'transform': lambda msgs: [

 - Concise yet Complete: Ensure responses are informative, yet to the point without unnecessary elaboration.
 - Maintain a professional, intelligent, and analytical tone in all interactions.'''
+base_instruct_datasets = [
     # 65.7 MB, 11,578
     # 1.89k
     {'kind': 'instruct', 'path': 'NousResearch/hermes-function-calling-v1', 'data_files': 'func-calling-singleturn.json', 'split': 'train', 'field': 'conversations', 'transform': lambda msgs: [

scripts/{prepare_core_datasets.py → prepare_base_datasets.py} RENAMED Viewed

@@ -5,21 +5,22 @@ from litgpt.tokenizer import Tokenizer
 from litdata import optimize, TokensLoader, StreamingDataset
 from utils import tokenize_fn
-from core_base_datasets import core_base_datasets
-from core_instruct_datasets import core_instruct_datasets
 tokenizer_path = '../tokenizer'
 seqs = [
-    (0, 1073741824, 1025, 16000),
-    (1025, 2049, 2049, 8000),
-    (2049, 4097, 4097, 4000),
-    (4097, 8193, 8193, 2000),
-    (8193, 16385, 16385, 1000),
-    (16385, 32769, 32769, 500),
-    (32769, 65537, 65537, 250),
-    (65537, 131073, 131073, 125),
 ]
 #
@@ -27,7 +28,7 @@ seqs = [
 #
 for i, (min_len, max_len, block_size, subchunk_size) in enumerate(seqs):
     chunk_size = block_size * subchunk_size
-    output_dir = f'../core-data-{i}-{min_len}-{max_len}-{block_size}-{subchunk_size}'
     outputs = optimize(
         fn=partial(
@@ -37,7 +38,7 @@ for i, (min_len, max_len, block_size, subchunk_size) in enumerate(seqs):
             hf_tokenizer=AutoTokenizer.from_pretrained(tokenizer_path, trust_remote_code=True, use_fast=True),
             tokenizer=Tokenizer(tokenizer_path),
         ),
-        inputs=core_base_datasets + core_instruct_datasets,
         output_dir=output_dir,
         chunk_size=chunk_size, # Number of tokens stored per chunk; this is roughly 64MB of tokens per chunk.
         num_workers=32,
@@ -52,7 +53,7 @@ for i, (min_len, max_len, block_size, subchunk_size) in enumerate(seqs):
 #
 for i, (min_len, max_len, block_size, subchunk_size) in enumerate(seqs):
     chunk_size = block_size * subchunk_size
-    input_dir = f'../core-data-{i}-{min_len}-{max_len}-{block_size}-{subchunk_size}'
     dataset = StreamingDataset(
         input_dir=input_dir,

 from litdata import optimize, TokensLoader, StreamingDataset
 from utils import tokenize_fn
+from base_datasets import base_datasets
+from base_instruct_datasets import base_instruct_datasets
 tokenizer_path = '../tokenizer'
 seqs = [
+    # (0, 1073741824, 1025, 16000),
+    # (1025, 2049, 2049, 8000),
+    # (2049, 4097, 4097, 4000),
+    # (4097, 8193, 8193, 2000),
+    # (8193, 16385, 16385, 1000),
+    # (16385, 32769, 32769, 500),
+    # (32769, 65537, 65537, 250),
+    # (65537, 131073, 131073, 125),
+    (0, 1073741824, 8193, 2000),
 ]
 #
 #
 for i, (min_len, max_len, block_size, subchunk_size) in enumerate(seqs):
     chunk_size = block_size * subchunk_size
+    output_dir = f'../base-data-{i}-{min_len}-{max_len}-{block_size}-{subchunk_size}'
     outputs = optimize(
         fn=partial(
             hf_tokenizer=AutoTokenizer.from_pretrained(tokenizer_path, trust_remote_code=True, use_fast=True),
             tokenizer=Tokenizer(tokenizer_path),
         ),
+        inputs=base_datasets + base_instruct_datasets,
         output_dir=output_dir,
         chunk_size=chunk_size, # Number of tokens stored per chunk; this is roughly 64MB of tokens per chunk.
         num_workers=32,
 #
 for i, (min_len, max_len, block_size, subchunk_size) in enumerate(seqs):
     chunk_size = block_size * subchunk_size
+    input_dir = f'../base-data-{i}-{min_len}-{max_len}-{block_size}-{subchunk_size}'
     dataset = StreamingDataset(
         input_dir=input_dir,

scripts/{pretrain_core_model_0.yaml → pretrain_base_model_0.yaml} RENAMED Viewed

File without changes

scripts/train_tokenizer.py CHANGED Viewed

@@ -7,8 +7,8 @@ from tokenizers.models import BPE
 from tokenizers.trainers import BpeTrainer
 from utils import batch_dataset_iterator
-from core_base_datasets import core_base_datasets
-from core_instruct_datasets import core_instruct_datasets
 tokenizer_path = '../tokenizer'
@@ -83,7 +83,7 @@ trainer = BpeTrainer(
     max_token_length=16,
 )
-tokenizer_datasets = core_base_datasets + core_instruct_datasets
 tokenizer.train_from_iterator(
     (batch_dataset_iterator(n) for n in tokenizer_datasets),

 from tokenizers.trainers import BpeTrainer
 from utils import batch_dataset_iterator
+from base_datasets import base_datasets
+from base_instruct_datasets import base_instruct_datasets
 tokenizer_path = '../tokenizer'
     max_token_length=16,
 )
+tokenizer_datasets = base_datasets + base_instruct_datasets
 tokenizer.train_from_iterator(
     (batch_dataset_iterator(n) for n in tokenizer_datasets),

tokenizer/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "bos_token": "<|endoftext|>",
+  "eos_token": "<|im_end|>",
+  "pad_token": "<|pad|>",
+  "unk_token": "<|unk|>"
+}

tokenizer/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:397bea157cd79c7c5b24406e9dabf987c80ef2f148dea76ef8123a64f621933d
+size 4718469

tokenizer/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,525 @@

+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "<|pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<|unk|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "4": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "5": {
+      "content": "<|im_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "6": {
+      "content": "system",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "7": {
+      "content": "user",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "8": {
+      "content": "assistant",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "9": {
+      "content": "<tools>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "10": {
+      "content": "</tools>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "11": {
+      "content": "<tool>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "12": {
+      "content": "</tool>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "13": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "14": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "15": {
+      "content": "<tool_response>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "16": {
+      "content": "</tool_response>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "17": {
+      "content": "<question>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "18": {
+      "content": "</question>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "19": {
+      "content": "<think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "20": {
+      "content": "</think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "21": {
+      "content": "<answer>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "22": {
+      "content": "</answer>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "23": {
+      "content": "<|reserved_0|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "24": {
+      "content": "<|reserved_1|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "25": {
+      "content": "<|reserved_2|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "26": {
+      "content": "<|reserved_3|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "27": {
+      "content": "<|reserved_4|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "28": {
+      "content": "<|reserved_5|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "29": {
+      "content": "<|reserved_6|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "30": {
+      "content": "<|reserved_7|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "31": {
+      "content": "<|reserved_8|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32": {
+      "content": "<|reserved_9|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "33": {
+      "content": "<|reserved_10|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "34": {
+      "content": "<|reserved_11|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "35": {
+      "content": "<|reserved_12|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "36": {
+      "content": "<|reserved_13|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "37": {
+      "content": "<|reserved_14|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "38": {
+      "content": "<|reserved_15|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "39": {
+      "content": "<|reserved_16|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "40": {
+      "content": "<|reserved_17|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "41": {
+      "content": "<|reserved_18|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "42": {
+      "content": "<|reserved_19|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "43": {
+      "content": "<|reserved_20|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "44": {
+      "content": "<|reserved_21|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "45": {
+      "content": "<|reserved_22|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "46": {
+      "content": "<|reserved_23|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "47": {
+      "content": "<|reserved_24|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "48": {
+      "content": "<|reserved_25|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49": {
+      "content": "<|reserved_26|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "50": {
+      "content": "<|reserved_27|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "51": {
+      "content": "<|reserved_28|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "52": {
+      "content": "<|reserved_29|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "53": {
+      "content": "<|reserved_30|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "54": {
+      "content": "<|reserved_31|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "55": {
+      "content": "<|reserved_32|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "56": {
+      "content": "<|reserved_33|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "57": {
+      "content": "<|reserved_34|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "58": {
+      "content": "<|reserved_35|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "59": {
+      "content": "<|reserved_36|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "60": {
+      "content": "<|reserved_37|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "61": {
+      "content": "<|reserved_38|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "62": {
+      "content": "<|reserved_39|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "63": {
+      "content": "<|reserved_40|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<|endoftext|>",
+  "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '<|im_sep|>' + message['content'] + '<|im_end|>'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant<|im_sep|>' }}{% endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "extra_special_tokens": {},
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "<|pad|>",
+  "tokenizer_class": "PreTrainedTokenizer",
+  "unk_token": "<|unk|>"
+}

tokenizer/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff