mtasic85 committed on
Commit 98fd0ab · 1 Parent(s): cebb861
scripts/base_datasets.py CHANGED
@@ -59,14 +59,38 @@ base_datasets = [
     {'kind': 'base', 'path': 'badrex/llm-emoji-dataset', 'format': '{short description}. {LLM description}. {character}'},
 
     #
-    # stem
+    # math
     #
+    # 7.1 MB, 400,000
+    *[
+        {'kind': 'base', 'path': 'garrethlee/simple-arithmetic-problems', 'name': name, 'split': split, 'format': lambda n: n['question'].strip() + ' ' + n['answer'].strip()}
+        for name in [
+            'very_easy', 'very_easy_use_commas',
+            'easy', 'easy_use_commas',
+            'medium', 'medium_use_commas',
+            'hard', 'hard_use_commas',
+            'very_hard', 'very_hard_use_commas',
+        ]
+        for split in [
+            'int_add_train', 'int_add_test',
+            'float_add_train', 'float_add_test',
+            'int_subtract_train', 'int_subtract_test',
+            'float_subtract_train', 'float_subtract_test',
+            'int_multiply_train', 'int_multiply_test',
+            'float_multiply_train', 'float_multiply_test',
+            'int_divide_train', 'int_divide_test',
+            'float_divide_train', 'float_divide_test',
+        ]
+    ],
     # 12.2 MB, 500,000
     {'kind': 'base', 'path': 'fblgit/simple-math', 'revision': 'refs/convert/parquet', 'split': 'train', 'format': '{instruction} = {output}'},
     {'kind': 'base', 'path': 'fblgit/simple-math', 'revision': 'refs/convert/parquet', 'split': 'test', 'format': '{instruction} = {output}'},
     # 125 MB, 1,000,000
     {'kind': 'base', 'path': 'Gusarich/math-expressions-1m', 'revision': 'refs/convert/parquet', 'split': 'train', 'format': '{expression} = {result}'},
 
+    #
+    # stem
+    #
     # 1.44 GB, 63,357
     *[
         {'kind': 'base', 'path': 'neuralwork/arxiver', 'split': f'train[{i}%:{i + 10}%]', 'format': lambda n: n['abstract']}
@@ -80,11 +104,9 @@ base_datasets = [
     #
     # code
     #
-    # 36.8 MB, 79,013
-    # Rosetta Code currently has 1,203 tasks, 389 draft tasks, and is aware of 883 languages
+    # 36.8 MB, 79,013 - Rosetta Code currently has 1,203 tasks, 389 draft tasks, and is aware of 883 languages
     {'kind': 'base', 'path': 'christopher/rosetta-code', 'format': lambda n: n['code']},
-    # 1.62 GB, 1,632,309
-    # Python, TypeScript, JavaScript, Ruby, Julia, Rust, C++, Bash, Java, C#, and Go; SQL, Cypher
+    # 1.62 GB, 1,632,309 - Python, TypeScript, JavaScript, Ruby, Julia, Rust, C++, Bash, Java, C#, and Go; SQL, Cypher
     *[
         {'kind': 'base', 'path': 'nampdn-ai/tiny-codes', 'split': f'train[{i}%:{i + 10}%]', 'format': '{prompt} {response}'}
         for i in range(0, 100, 10)
@@ -102,6 +124,11 @@ base_datasets = [
     #
     # general knowledge
     #
+    # 4.03 GB, 6,035,374
+    *[
+        {'kind': 'base', 'path': 'TAWGCreatology/en-wiki-paraphrased-cleaned', 'split': f'train[{i}%:{i + 5}%]', 'format': lambda n: n['paraphrase']}
+        for i in range(0, 100, 5)
+    ],
     # 3.18 GB, 1,010,500 - uncompressed 6GB
     *[
         {'kind': 'base', 'path': 'JeanKaddour/minipile', 'split': f'train[{i}%:{i + 5}%]', 'format': lambda n: n['text']}
@@ -109,4 +136,26 @@ base_datasets = [
     ],
     {'kind': 'base', 'path': 'JeanKaddour/minipile', 'split': 'validation', 'format': lambda n: n['text']},
     {'kind': 'base', 'path': 'JeanKaddour/minipile', 'split': 'test', 'format': lambda n: n['text']},
+
+    #
+    # light instructions
+    #
+    # 44.3 MB, 51,760
+    {'kind': 'base', 'path': 'yahma/alpaca-cleaned', 'split': 'train', 'format': '{instruction}\n{input}\n{output}'},
+    # 11 MB, 12,564
+    {'kind': 'base', 'path': 'Cleanlab/databricks-dolly-15k-cleanset', 'split': 'train', 'format': '{instruction}\n{context}\n{response}'},
+    # 15.6 MB, 24,926
+    {'kind': 'base', 'path': 'garage-bAInd/Open-Platypus', 'split': 'train', 'format': '{instruction}\n{output}'},
+]
+
+base_datasets = [
+    #
+    # light instructions
+    #
+    # 44.3 MB, 51,760
+    {'kind': 'base', 'path': 'yahma/alpaca-cleaned', 'split': 'train', 'format': '{instruction}\n{input}\n{output}'},
+    # 11 MB, 12,564
+    {'kind': 'base', 'path': 'Cleanlab/databricks-dolly-15k-cleanset', 'split': 'train', 'format': '{instruction}\n{context}\n{response}'},
+    # 15.6 MB, 24,926
+    {'kind': 'base', 'path': 'garage-bAInd/Open-Platypus', 'split': 'train', 'format': '{instruction}\n{output}'},
 ]
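
Note on the new math block: it is a double comprehension, 10 difficulty configs times 16 splits, so it expands to 160 dataset entries. Each entry's 'format' is either a str.format-style template or a callable applied per row. Below is a minimal sketch of how one of these entries might be loaded and formatted; iter_formatted_rows is an assumed helper for illustration, not code from this repo, while datasets.load_dataset is the actual Hugging Face API:

# Sketch: consuming a base_datasets entry. `iter_formatted_rows` is
# illustrative, not part of this repository.
from datasets import load_dataset


def iter_formatted_rows(entry: dict):
    """Yield one training-text string per row described by `entry`."""
    dataset = load_dataset(
        entry['path'],
        name=entry.get('name'),             # config name, e.g. 'very_easy'
        split=entry.get('split', 'train'),  # supports 'train[0%:10%]' slices;
                                            # defaulting to 'train' is an assumption
        revision=entry.get('revision'),     # e.g. 'refs/convert/parquet'
    )
    fmt = entry['format']
    for row in dataset:
        if callable(fmt):
            yield fmt(row)             # e.g. lambda n: n['abstract']
        else:
            # format_map tolerates field names with spaces,
            # e.g. '{short description}' in the emoji dataset
            yield fmt.format_map(row)


entry = {'kind': 'base', 'path': 'fblgit/simple-math',
         'revision': 'refs/convert/parquet', 'split': 'test',
         'format': '{instruction} = {output}'}
print(next(iter_formatted_rows(entry)))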
scripts/pretrain_base_model_0.yaml CHANGED
@@ -79,13 +79,13 @@ train:
   max_seq_length: 8193
 
   # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: False)
-  tie_embeddings: false
+  tie_embeddings: False
 
   # (type: Optional[float], default: 1.0)
   max_norm: 1.0
 
   # (type: float, default: 4e-05)
-  min_lr: 5e-5
+  min_lr: 1e-5
 
 # Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details
 eval:
@@ -130,7 +130,7 @@ eval:
 optimizer:
   class_path: sophia_opt.SophiaG
   init_args:
-    lr: 5e-4
+    lr: 1e-3
     betas:
     - 0.965
     - 0.99
  - 0.99