pretrain
Browse files
scripts/base_datasets.py
CHANGED
@@ -59,14 +59,38 @@ base_datasets = [
|
|
59 |
{'kind': 'base', 'path': 'badrex/llm-emoji-dataset', 'format': '{short description}. {LLM description}. {character}'},
|
60 |
|
61 |
#
|
62 |
-
#
|
63 |
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
64 |
# 12.2 MB, 500,000
|
65 |
{'kind': 'base', 'path': 'fblgit/simple-math', 'revision': 'refs/convert/parquet', 'split': 'train', 'format': '{instruction} = {output}'},
|
66 |
{'kind': 'base', 'path': 'fblgit/simple-math', 'revision': 'refs/convert/parquet', 'split': 'test', 'format': '{instruction} = {output}'},
|
67 |
# 125 MB, 1,000,000
|
68 |
{'kind': 'base', 'path': 'Gusarich/math-expressions-1m', 'revision': 'refs/convert/parquet', 'split': 'train', 'format': '{expression} = {result}'},
|
69 |
|
|
|
|
|
|
|
70 |
# 1.44 GB, 63,357
|
71 |
*[
|
72 |
{'kind': 'base', 'path': 'neuralwork/arxiver', 'split': f'train[{i}%:{i + 10}%]', 'format': lambda n: n['abstract']}
|
@@ -80,11 +104,9 @@ base_datasets = [
|
|
80 |
#
|
81 |
# code
|
82 |
#
|
83 |
-
# 36.8 MB, 79,013
|
84 |
-
# Rosetta Code currently has 1,203 tasks, 389 draft tasks, and is aware of 883 languages
|
85 |
{'kind': 'base', 'path': 'christopher/rosetta-code', 'format': lambda n: n['code']},
|
86 |
-
# 1.62 GB, 1,632,309
|
87 |
-
# Python, TypeScript, JavaScript, Ruby, Julia, Rust, C++, Bash, Java, C#, and Go; SQL, Cypher
|
88 |
*[
|
89 |
{'kind': 'base', 'path': 'nampdn-ai/tiny-codes', 'split': f'train[{i}%:{i + 10}%]', 'format': '{prompt} {response}'}
|
90 |
for i in range(0, 100, 10)
|
@@ -102,6 +124,11 @@ base_datasets = [
|
|
102 |
#
|
103 |
# general knowledge
|
104 |
#
|
|
|
|
|
|
|
|
|
|
|
105 |
# 3.18 GB, 1,010,500 - uncompressed 6GB
|
106 |
*[
|
107 |
{'kind': 'base', 'path': 'JeanKaddour/minipile', 'split': f'train[{i}%:{i + 5}%]', 'format': lambda n: n['text']}
|
@@ -109,4 +136,26 @@ base_datasets = [
|
|
109 |
],
|
110 |
{'kind': 'base', 'path': 'JeanKaddour/minipile', 'split': 'validation', 'format': lambda n: n['text']},
|
111 |
{'kind': 'base', 'path': 'JeanKaddour/minipile', 'split': 'test', 'format': lambda n: n['text']},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
112 |
]
|
|
|
59 |
{'kind': 'base', 'path': 'badrex/llm-emoji-dataset', 'format': '{short description}. {LLM description}. {character}'},
|
60 |
|
61 |
#
|
62 |
+
# math
|
63 |
#
|
64 |
+
# 7.1 MB, 400,000
|
65 |
+
*[
|
66 |
+
{'kind': 'base', 'path': 'garrethlee/simple-arithmetic-problems', 'name': name, 'split': split, 'format': lambda n: n['question'].strip() + ' ' + n['answer'].strip()}
|
67 |
+
for name in [
|
68 |
+
'very_easy', 'very_easy_use_commas',
|
69 |
+
'easy', 'easy_use_commas',
|
70 |
+
'medium', 'medium_use_commas',
|
71 |
+
'hard', 'hard_use_commas',
|
72 |
+
'very_hard', 'very_hard_use_commas',
|
73 |
+
]
|
74 |
+
for split in [
|
75 |
+
'int_add_train', 'int_add_test',
|
76 |
+
'float_add_train', 'float_add_test',
|
77 |
+
'int_subtract_train', 'int_subtract_test',
|
78 |
+
'float_subtract_train', 'float_subtract_test',
|
79 |
+
'int_multiply_train', 'int_multiply_test',
|
80 |
+
'float_multiply_train', 'float_multiply_test',
|
81 |
+
'int_divide_train', 'int_divide_test',
|
82 |
+
'float_divide_train', 'float_divide_test',
|
83 |
+
]
|
84 |
+
],
|
85 |
# 12.2 MB, 500,000
|
86 |
{'kind': 'base', 'path': 'fblgit/simple-math', 'revision': 'refs/convert/parquet', 'split': 'train', 'format': '{instruction} = {output}'},
|
87 |
{'kind': 'base', 'path': 'fblgit/simple-math', 'revision': 'refs/convert/parquet', 'split': 'test', 'format': '{instruction} = {output}'},
|
88 |
# 125 MB, 1,000,000
|
89 |
{'kind': 'base', 'path': 'Gusarich/math-expressions-1m', 'revision': 'refs/convert/parquet', 'split': 'train', 'format': '{expression} = {result}'},
|
90 |
|
91 |
+
#
|
92 |
+
# stem
|
93 |
+
#
|
94 |
# 1.44 GB, 63,357
|
95 |
*[
|
96 |
{'kind': 'base', 'path': 'neuralwork/arxiver', 'split': f'train[{i}%:{i + 10}%]', 'format': lambda n: n['abstract']}
|
|
|
104 |
#
|
105 |
# code
|
106 |
#
|
107 |
+
# 36.8 MB, 79,013 - Rosetta Code currently has 1,203 tasks, 389 draft tasks, and is aware of 883 languages
|
|
|
108 |
{'kind': 'base', 'path': 'christopher/rosetta-code', 'format': lambda n: n['code']},
|
109 |
+
# 1.62 GB, 1,632,309 - Python, TypeScript, JavaScript, Ruby, Julia, Rust, C++, Bash, Java, C#, and Go; SQL, Cypher
|
|
|
110 |
*[
|
111 |
{'kind': 'base', 'path': 'nampdn-ai/tiny-codes', 'split': f'train[{i}%:{i + 10}%]', 'format': '{prompt} {response}'}
|
112 |
for i in range(0, 100, 10)
|
|
|
124 |
#
|
125 |
# general knowledge
|
126 |
#
|
127 |
+
# 4.03 GB, 6,035,374
|
128 |
+
*[
|
129 |
+
{'kind': 'base', 'path': 'TAWGCreatology/en-wiki-paraphrased-cleaned', 'split': f'train[{i}%:{i + 5}%]', 'format': lambda n: n['paraphrase']}
|
130 |
+
for i in range(0, 100, 5)
|
131 |
+
],
|
132 |
# 3.18 GB, 1,010,500 - uncompressed 6GB
|
133 |
*[
|
134 |
{'kind': 'base', 'path': 'JeanKaddour/minipile', 'split': f'train[{i}%:{i + 5}%]', 'format': lambda n: n['text']}
|
|
|
136 |
],
|
137 |
{'kind': 'base', 'path': 'JeanKaddour/minipile', 'split': 'validation', 'format': lambda n: n['text']},
|
138 |
{'kind': 'base', 'path': 'JeanKaddour/minipile', 'split': 'test', 'format': lambda n: n['text']},
|
139 |
+
|
140 |
+
#
|
141 |
+
# light instructions
|
142 |
+
#
|
143 |
+
# 44.3 MB, 51,760
|
144 |
+
{'kind': 'base', 'path': 'yahma/alpaca-cleaned', 'split': 'train', 'format': '{instruction}\n{input}\n{output}'},
|
145 |
+
# 11 MB, 12,564
|
146 |
+
{'kind': 'base', 'path': 'Cleanlab/databricks-dolly-15k-cleanset', 'split': 'train', 'format': '{instruction}\n{context}\n{response}'},
|
147 |
+
# 15.6 MB, 24,926
|
148 |
+
{'kind': 'base', 'path': 'garage-bAInd/Open-Platypus', 'split': 'train', 'format': '{instruction}\n{output}'},
|
149 |
+
]
|
150 |
+
|
151 |
+
base_datasets = [
|
152 |
+
#
|
153 |
+
# light instructions
|
154 |
+
#
|
155 |
+
# 44.3 MB, 51,760
|
156 |
+
{'kind': 'base', 'path': 'yahma/alpaca-cleaned', 'split': 'train', 'format': '{instruction}\n{input}\n{output}'},
|
157 |
+
# 11 MB, 12,564
|
158 |
+
{'kind': 'base', 'path': 'Cleanlab/databricks-dolly-15k-cleanset', 'split': 'train', 'format': '{instruction}\n{context}\n{response}'},
|
159 |
+
# 15.6 MB, 24,926
|
160 |
+
{'kind': 'base', 'path': 'garage-bAInd/Open-Platypus', 'split': 'train', 'format': '{instruction}\n{output}'},
|
161 |
]
|
scripts/pretrain_base_model_0.yaml
CHANGED
@@ -79,13 +79,13 @@ train:
|
|
79 |
max_seq_length: 8193
|
80 |
|
81 |
# Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: False)
|
82 |
-
tie_embeddings:
|
83 |
|
84 |
# (type: Optional[float], default: 1.0)
|
85 |
max_norm: 1.0
|
86 |
|
87 |
# (type: float, default: 4e-05)
|
88 |
-
min_lr:
|
89 |
|
90 |
# Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details
|
91 |
eval:
|
@@ -130,7 +130,7 @@ eval:
|
|
130 |
optimizer:
|
131 |
class_path: sophia_opt.SophiaG
|
132 |
init_args:
|
133 |
-
lr:
|
134 |
betas:
|
135 |
- 0.965
|
136 |
- 0.99
|
|
|
79 |
max_seq_length: 8193
|
80 |
|
81 |
# Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: False)
|
82 |
+
tie_embeddings: False
|
83 |
|
84 |
# (type: Optional[float], default: 1.0)
|
85 |
max_norm: 1.0
|
86 |
|
87 |
# (type: float, default: 4e-05)
|
88 |
+
min_lr: 1e-5
|
89 |
|
90 |
# Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details
|
91 |
eval:
|
|
|
130 |
optimizer:
|
131 |
class_path: sophia_opt.SophiaG
|
132 |
init_args:
|
133 |
+
lr: 1e-3
|
134 |
betas:
|
135 |
- 0.965
|
136 |
- 0.99
|