pretrain

Browse files

Files changed (2) hide show

scripts/base_datasets.py +54 -5
scripts/pretrain_base_model_0.yaml +3 -3

scripts/base_datasets.py CHANGED Viewed

@@ -59,14 +59,38 @@ base_datasets = [
     {'kind': 'base', 'path': 'badrex/llm-emoji-dataset', 'format': '{short description}. {LLM description}. {character}'},
     #
-    # stem
     #
     # 12.2 MB, 500,000
     {'kind': 'base', 'path': 'fblgit/simple-math', 'revision': 'refs/convert/parquet', 'split': 'train', 'format': '{instruction} = {output}'},
     {'kind': 'base', 'path': 'fblgit/simple-math', 'revision': 'refs/convert/parquet', 'split': 'test', 'format': '{instruction} = {output}'},
     # 125 MB, 1,000,000
     {'kind': 'base', 'path': 'Gusarich/math-expressions-1m', 'revision': 'refs/convert/parquet', 'split': 'train', 'format': '{expression} = {result}'},
     # 1.44 GB, 63,357
     *[
         {'kind': 'base', 'path': 'neuralwork/arxiver', 'split': f'train[{i}%:{i + 10}%]', 'format': lambda n: n['abstract']}
@@ -80,11 +104,9 @@ base_datasets = [
     #
     # code
     #
-    # 36.8 MB, 79,013
-    # Rosetta Code currently has 1,203 tasks, 389 draft tasks, and is aware of 883 languages
     {'kind': 'base', 'path': 'christopher/rosetta-code', 'format': lambda n: n['code']},
-    # 1.62 GB, 1,632,309
-    # Python, TypeScript, JavaScript, Ruby, Julia, Rust, C++, Bash, Java, C#, and Go; SQL, Cypher
     *[
         {'kind': 'base', 'path': 'nampdn-ai/tiny-codes', 'split': f'train[{i}%:{i + 10}%]', 'format': '{prompt} {response}'}
         for i in range(0, 100, 10)
@@ -102,6 +124,11 @@ base_datasets = [
     #
     # general knowledge
     #
     # 3.18 GB, 1,010,500 - uncompressed 6GB
     *[
         {'kind': 'base', 'path': 'JeanKaddour/minipile', 'split': f'train[{i}%:{i + 5}%]', 'format': lambda n: n['text']}
@@ -109,4 +136,26 @@ base_datasets = [
     ],
     {'kind': 'base', 'path': 'JeanKaddour/minipile', 'split': 'validation', 'format': lambda n: n['text']},
     {'kind': 'base', 'path': 'JeanKaddour/minipile', 'split': 'test', 'format': lambda n: n['text']},
 ]

     {'kind': 'base', 'path': 'badrex/llm-emoji-dataset', 'format': '{short description}. {LLM description}. {character}'},
     #
+    # math
     #
+    # 7.1 MB, 400,000
+    *[
+        {'kind': 'base', 'path': 'garrethlee/simple-arithmetic-problems', 'name': name, 'split': split, 'format': lambda n: n['question'].strip() + ' ' + n['answer'].strip()}
+        for name in [
+            'very_easy', 'very_easy_use_commas',
+            'easy', 'easy_use_commas',
+            'medium', 'medium_use_commas',
+            'hard', 'hard_use_commas',
+            'very_hard', 'very_hard_use_commas',
+        ]
+        for split in [
+            'int_add_train', 'int_add_test',
+            'float_add_train', 'float_add_test',
+            'int_subtract_train', 'int_subtract_test',
+            'float_subtract_train', 'float_subtract_test',
+            'int_multiply_train', 'int_multiply_test',
+            'float_multiply_train', 'float_multiply_test',
+            'int_divide_train', 'int_divide_test',
+            'float_divide_train', 'float_divide_test',
+        ]
+    ],
     # 12.2 MB, 500,000
     {'kind': 'base', 'path': 'fblgit/simple-math', 'revision': 'refs/convert/parquet', 'split': 'train', 'format': '{instruction} = {output}'},
     {'kind': 'base', 'path': 'fblgit/simple-math', 'revision': 'refs/convert/parquet', 'split': 'test', 'format': '{instruction} = {output}'},
     # 125 MB, 1,000,000
     {'kind': 'base', 'path': 'Gusarich/math-expressions-1m', 'revision': 'refs/convert/parquet', 'split': 'train', 'format': '{expression} = {result}'},
+    #
+    # stem
+    #
     # 1.44 GB, 63,357
     *[
         {'kind': 'base', 'path': 'neuralwork/arxiver', 'split': f'train[{i}%:{i + 10}%]', 'format': lambda n: n['abstract']}
     #
     # code
     #
+    # 36.8 MB, 79,013 - Rosetta Code currently has 1,203 tasks, 389 draft tasks, and is aware of 883 languages
     {'kind': 'base', 'path': 'christopher/rosetta-code', 'format': lambda n: n['code']},
+    # 1.62 GB, 1,632,309 - Python, TypeScript, JavaScript, Ruby, Julia, Rust, C++, Bash, Java, C#, and Go; SQL, Cypher
     *[
         {'kind': 'base', 'path': 'nampdn-ai/tiny-codes', 'split': f'train[{i}%:{i + 10}%]', 'format': '{prompt} {response}'}
         for i in range(0, 100, 10)
     #
     # general knowledge
     #
+    # 4.03 GB, 6,035,374
+    *[
+        {'kind': 'base', 'path': 'TAWGCreatology/en-wiki-paraphrased-cleaned', 'split': f'train[{i}%:{i + 5}%]', 'format': lambda n: n['paraphrase']}
+        for i in range(0, 100, 5)
+    ],
     # 3.18 GB, 1,010,500 - uncompressed 6GB
     *[
         {'kind': 'base', 'path': 'JeanKaddour/minipile', 'split': f'train[{i}%:{i + 5}%]', 'format': lambda n: n['text']}
     ],
     {'kind': 'base', 'path': 'JeanKaddour/minipile', 'split': 'validation', 'format': lambda n: n['text']},
     {'kind': 'base', 'path': 'JeanKaddour/minipile', 'split': 'test', 'format': lambda n: n['text']},
+    #
+    # light instructions
+    #
+    # 44.3 MB, 51,760
+    {'kind': 'base', 'path': 'yahma/alpaca-cleaned', 'split': 'train', 'format': '{instruction}\n{input}\n{output}'},
+    # 11 MB, 12,564
+    {'kind': 'base', 'path': 'Cleanlab/databricks-dolly-15k-cleanset', 'split': 'train', 'format': '{instruction}\n{context}\n{response}'},
+    # 15.6 MB, 24,926
+    {'kind': 'base', 'path': 'garage-bAInd/Open-Platypus', 'split': 'train', 'format': '{instruction}\n{output}'},
+]
+base_datasets = [
+    #
+    # light instructions
+    #
+    # 44.3 MB, 51,760
+    {'kind': 'base', 'path': 'yahma/alpaca-cleaned', 'split': 'train', 'format': '{instruction}\n{input}\n{output}'},
+    # 11 MB, 12,564
+    {'kind': 'base', 'path': 'Cleanlab/databricks-dolly-15k-cleanset', 'split': 'train', 'format': '{instruction}\n{context}\n{response}'},
+    # 15.6 MB, 24,926
+    {'kind': 'base', 'path': 'garage-bAInd/Open-Platypus', 'split': 'train', 'format': '{instruction}\n{output}'},
 ]

scripts/pretrain_base_model_0.yaml CHANGED Viewed

@@ -79,13 +79,13 @@ train:
   max_seq_length: 8193
   # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: False)
-  tie_embeddings: false
   #   (type: Optional[float], default: 1.0)
   max_norm: 1.0
   #   (type: float, default: 4e-05)
-  min_lr: 5e-5
 # Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details
 eval:
@@ -130,7 +130,7 @@ eval:
 optimizer:
   class_path: sophia_opt.SophiaG
   init_args:
-    lr: 5e-4
     betas:
       - 0.965
       - 0.99

   max_seq_length: 8193
   # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: False)
+  tie_embeddings: False
   #   (type: Optional[float], default: 1.0)
   max_norm: 1.0
   #   (type: float, default: 4e-05)
+  min_lr: 1e-5
 # Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details
 eval:
 optimizer:
   class_path: sophia_opt.SophiaG
   init_args:
+    lr: 1e-3
     betas:
       - 0.965
       - 0.99