Yinuo Zhang committed on
Commit d79b4f8 · 1 Parent(s): 724b6d6

Add large file using Git LFS

Files changed (4)
  1. configs/config.yaml +171 -0
  2. configs/path.yaml +7 -0
  3. main.py +27 -26
  4. peptune-pretrained.ckpt +3 -0
configs/config.yaml ADDED
@@ -0,0 +1,171 @@
+ defaults:
+   - path
+
+ noise:
+   type: loglinear
+   sigma_min: 1e-4
+   sigma_max: 20
+   state_dependent: True
+
+ mode: ppl_eval # train / ppl_eval / sample_eval
+ diffusion: absorbing_state
+ vocab: old_smiles # old_smiles / new_smiles / selfies / helm
+ backbone: roformer # peptideclm / helmgpt / dit / roformer / finetune_roformer
+ parameterization: subs # subs
+ time_conditioning: False
+ T: 0 # 0 (continuous time) / 1000
+ subs_masking: False
+
+ seed: 42
+
+ mcts:
+   num_children: 50
+   num_objectives: 5
+   topk: 100
+   mask_token: 4
+   num_iter: 128
+   sampling: 0 # 0 is gumbel sampling / > 0 samples children from top k probs
+   invalid_penalty: 0.5
+   sample_prob: 1.0
+   perm: True
+   dual: False
+   single: False
+   time_dependent: True
+
+ lr_scheduler:
+   _target_: transformers.get_constant_schedule_with_warmup
+   num_warmup_steps: 2500
+
+ data:
+   train: ${paths.data}/finetune2/30K-train.csv
+   valid: ${paths.data}/finetune2/30K-val.csv
+   batching: wrapping # padding / wrapping
+
+ loader:
+   global_batch_size: 64
+   eval_global_batch_size: ${.global_batch_size}
+   # Note: batch_size and eval_batch_size are **per machine**
+   batch_size: ${div_up:${.global_batch_size}, ${eval:${trainer.devices} * ${trainer.num_nodes}}}
+   eval_batch_size: ${div_up:${.eval_global_batch_size}, ${eval:${trainer.devices} * ${trainer.num_nodes}}}
+   num_workers: ${eval:"len(__import__('os').sched_getaffinity(0))"}
+   pin_memory: True
+
+ sampling:
+   predictor: ddpm_cache # analytic, ddpm, ddpm_cache
+   num_sequences: 100
+   sampling_eps: 1e-3
+   steps: 128
+   seq_length: 100
+   noise_removal: True
+   num_sample_batches: 2 # Total samples: `num_gpus` * `loader.eval_batch_size` * num_sample_batches
+   num_sample_log: 2
+   stride_length: 1
+   num_strides: 1
+
+ training:
+   antithetic_sampling: True
+   sampling_eps: 1e-3
+   focus_mask: False
+   #dynamic_batching: True
+   accumulator: False
+
+ eval:
+   checkpoint_path: ${paths.checkpoints}/11M-old-tokenizer/epoch=10-step=156276.ckpt
+   disable_ema: False
+   compute_generative_perplexity: False
+   perplexity_batch_size: 8
+   compute_perplexity_on_sanity: False
+   gen_ppl_eval_model_name_or_path: gpt2-large # gpt2-large, meta-llama/Llama-2-7b-hf
+   generate_samples: True
+   generation_model: ${paths.checkpoints}/11M-old-tokenizer/
+
+ optim:
+   weight_decay: 0.075
+   lr: 3e-4
+   beta1: 0.9
+   beta2: 0.999
+   eps: 1e-8
+
+ pepclm:
+   hidden_size: 768
+   cond_dim: 256
+   n_heads: 20
+   n_blocks: 4
+   dropout: 0.5
+   length: 512
+   #scale_by_sigma: True
+
+ model:
+   type: ddit
+   hidden_size: 768
+   cond_dim: 128
+   length: 512
+   n_blocks: 12
+   n_heads: 12
+   scale_by_sigma: True
+   dropout: 0.1
+
+ roformer:
+   hidden_size: 768
+   n_layers: 8
+   n_heads: 8
+   max_position_embeddings: 1035
+
+ helmgpt:
+   hidden_size: 256
+   embd_pdrop: 0.1
+   resid_pdrop: 0.1
+   attn_pdrop: 0.1
+   ff_dropout: 0.
+   block_size: 140
+   n_layer: 8
+   n_heads: 8
+
+
+ trainer:
+   _target_: lightning.Trainer
+   accelerator: cuda
+   num_nodes: 1
+   devices: ${device_count:}
+   accumulate_grad_batches: ${div_up:${loader.global_batch_size}, ${eval:${trainer.devices} * ${loader.batch_size} * ${trainer.num_nodes}}}
+   gradient_clip_val: 1.0
+   precision: 64-true
+   num_sanity_val_steps: 2
+   max_epochs: 100
+   max_steps: 1_000_000
+   log_every_n_steps: 10
+   limit_train_batches: 1.0 # train on full dataset, can be used to toggle quick run
+   limit_val_batches: 1.0 # validate on full dataset, can be used to toggle quick run
+   #val_check_interval: 40 #954
+   check_val_every_n_epoch: 1
+
+
+ wandb:
+   project: ${env_or:WANDB_PROJECT,peptune}
+   notes: null
+   group: null
+   job_type: null
+   name: ${env_or:WANDB_RUN_NAME,local}
+   id: ${.name}
+
+ hydra:
+   run:
+     dir: ./${now:%Y.%m.%d}/
+   job:
+     chdir: True
+
+ checkpointing:
+   # Use custom `save_dir` if, e.g., saving to S3 bucket, otherwise leave this parameter as is
+   save_dir: ${paths.outputs}
+   # Note: `checkpoints` path should correspond to `checkpoint_every_n_steps.dirpath`
+   resume_from_ckpt: True
+   resume_ckpt_path: ${paths.checkpoints}/last.ckpt
+
+ callbacks:
+   model_checkpoint:
+     _target_: pytorch_lightning.callbacks.ModelCheckpoint
+     every_n_epochs: 1
+     monitor: "val/nll"
+     save_top_k: 10
+     mode: "min"
+     dirpath: ${paths.checkpoints}/11M-old-tokenizer
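
For reference, the interpolated `loader`/`trainer` fields above reduce to simple integer arithmetic once the `div_up` and `eval` resolvers (registered in `main.py` below) are applied. A minimal sketch, assuming 4 GPUs on a single node purely for illustration:

```python
# Sketch of the batch-size arithmetic in config.yaml; the device count is an
# assumption for illustration, not part of the config.
def div_up(x, y):          # mirrors the resolver registered in main.py
    return (x + y - 1) // y

global_batch_size = 64     # loader.global_batch_size
devices, num_nodes = 4, 1  # trainer.devices (assumed), trainer.num_nodes

batch_size = div_up(global_batch_size, devices * num_nodes)      # 16 per device
accumulate_grad_batches = div_up(
    global_batch_size, devices * batch_size * num_nodes)         # 1

print(batch_size, accumulate_grad_batches)  # 16 1
```

With 2 GPUs the same formulas give a per-device batch size of 32 and still a single accumulation step, so the effective global batch size stays at 64 either way.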
configs/path.yaml ADDED
@@ -0,0 +1,7 @@
+ paths:
+   base: ${cwd:} # repo root
+   data: ${paths.base}/data
+   checkpoints: ${paths.base}/checkpoints
+   tokenizers: ${paths.base}/tokenizers
+   outputs: ${paths.base}/outputs
+
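
As a quick sanity check of how these interpolations resolve, here is a standalone OmegaConf sketch using the `cwd` resolver that `main.py` registers; the inline dict mirrors only a subset of `path.yaml`:

```python
import os
from omegaconf import OmegaConf

# Same resolver name as in main.py; `${cwd:}` expands to the process working
# directory at resolution time (the repo root when launched from it).
OmegaConf.register_new_resolver("cwd", os.getcwd)

paths = OmegaConf.create({
    "paths": {
        "base": "${cwd:}",
        "data": "${paths.base}/data",
        "checkpoints": "${paths.base}/checkpoints",
    }
})
print(paths.paths.checkpoints)  # e.g. /path/to/repo/checkpoints
```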
main.py CHANGED
@@ -16,7 +16,6 @@ import torch
  import sys
  import torch.distributed as dist
  from torch.nn.parallel import DistributedDataParallel as DDP
- sys.path.append("/home/st512/peptune/scripts/peptide-mdlm-mcts")
  
  import dataset as dataloader
  import dataloading_for_dynamic_batching as dynamic_dataloader
@@ -30,24 +29,25 @@ from tokenizer.my_tokenizers import SMILES_SPE_Tokenizer
  from helm_tokenizer.helm_tokenizer import HelmTokenizer
  
  
- #wandb.login(key="<redacted>")
-
  omegaconf.OmegaConf.register_new_resolver('cwd', os.getcwd)
  omegaconf.OmegaConf.register_new_resolver('device_count', torch.cuda.device_count)
  omegaconf.OmegaConf.register_new_resolver('eval', eval)
  omegaconf.OmegaConf.register_new_resolver('div_up', lambda x, y: (x + y - 1) // y)
+ omegaconf.OmegaConf.register_new_resolver("env_or", lambda k, d: os.getenv(k, d))
  
  def _load_from_checkpoint(config, tokenizer):
-     if 'hf' in config.backbone:
-         return Diffusion(
-             config, tokenizer=tokenizer).to('cuda')
-     else:
-         model = Diffusion.load_from_checkpoint(
-             config.eval.checkpoint_path,
-             tokenizer=tokenizer,
-             config=config)
-
-         return model
+     """Create Diffusion model; load weights if checkpoint_path is set."""
+     if "hf" in str(config.get("backbone", "")):
+         return Diffusion(config, tokenizer=tokenizer).to("cuda")
+
+     ckpt_path = config.eval.checkpoint_path
+     model = Diffusion.load_from_checkpoint(
+         ckpt_path,
+         tokenizer=tokenizer,
+         config=config,
+         map_location="cuda" if torch.cuda.is_available() else "cpu",
+     )
+     return model
  
  @L.pytorch.utilities.rank_zero_only
  def print_config(
@@ -197,36 +197,37 @@ def _train(config, logger, tokenizer, data_module):
  
      model = Diffusion(config, tokenizer=tokenizer)
  
-     if config.backbone == 'finetune_roformer':
-         checkpoint = torch.load('/home/st512/peptune/scripts/peptide-mdlm-mcts/checkpoints/11M-old-tokenizer/epoch=1-step=24080.ckpt')
-         model.load_state_dict(checkpoint['state_dict'])
+     if config.backbone == 'finetune_roformer' and config.eval.checkpoint_path:
+         checkpoint = torch.load(config.eval.checkpoint_path, map_location="cpu")
+         state = checkpoint.get("state_dict", checkpoint)
+         model.load_state_dict(state, strict=False)
  
      trainer.fit(model, datamodule=data_module, ckpt_path=ckpt_path)
  
  
- @hydra.main(version_base=None, config_path='/home/st512/peptune/scripts/peptide-mdlm-mcts', config_name='config')
+ @hydra.main(version_base=None, config_path='configs', config_name='config')
  def main(config):
      """
      Main entry point for training
      """
-     wandb.init(project="peptune")
      L.seed_everything(config.seed)
  
      # print_config(config, resolve=True, save_cfg=True)
  
      logger = utils.get_logger(__name__)
      # load PeptideCLM tokenizer
-     if config.vocab == 'new_smiles':
+     tok_dir = config.paths.tokenizers
+     if config.vocab == 'new_smiles':
          tokenizer = APETokenizer()
-         tokenizer.load_vocabulary('/home/st512/peptune/scripts/peptide-mdlm-mcts/new_tokenizer/peptide_smiles_600_vocab.json')
+         tokenizer.load_vocabulary(f'{tok_dir}/peptide_smiles_600_vocab.json')
      elif config.vocab == 'old_smiles':
-         tokenizer = SMILES_SPE_Tokenizer('/home/st512/peptune/scripts/peptide-mdlm-mcts/tokenizer/new_vocab.txt',
-                                          '/home/st512/peptune/scripts/peptide-mdlm-mcts/tokenizer/new_splits.txt')
+         tokenizer = SMILES_SPE_Tokenizer(f'{tok_dir}/new_vocab.txt',
+                                          f'{tok_dir}/new_splits.txt')
      elif config.vocab == 'selfies':
          tokenizer = APETokenizer()
-         tokenizer.load_vocabulary('/home/st512/peptune/scripts/peptide-mdlm-mcts/new_tokenizer/peptide_selfies_600_vocab.json')
+         tokenizer.load_vocabulary(f'{tok_dir}/peptide_selfies_600_vocab.json')
      elif config.vocab == 'helm':
-         tokenizer = HelmTokenizer('/home/st512/peptune/scripts/peptide-mdlm-mcts/helm_tokenizer/monomer_vocab.txt')
+         tokenizer = HelmTokenizer(f'{tok_dir}/monomer_vocab.txt')
  
      if config.backbone == 'finetune_roformer':
          train_dataset = load_dataset('csv', data_files=config.data.train)
@@ -236,7 +237,7 @@ def main(config):
          val_dataset = val_dataset['train']#.select(lst)
          data_module = dataloader.CustomDataModule(train_dataset, val_dataset, None, tokenizer, batch_size=config.loader.global_batch_size)
      else:
-         data_module = dynamic_dataloader.CustomDataModule('/home/st512/peptune/scripts/peptide-mdlm-mcts/data/smiles/11M_smiles_old_tokenizer_no_limit', tokenizer)
+         data_module = dynamic_dataloader.CustomDataModule(f'{config.paths.data}/smiles/11M_smiles_old_tokenizer_no_limit', tokenizer)
  
      if config.mode == 'sample_eval':
          generate_samples(config, logger, tokenizer)
@@ -247,4 +248,4 @@ def main(config):
  
  
  if __name__ == '__main__':
-     main()
+     main()
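
The new `env_or` resolver is what lets `wandb.project` and `wandb.name` in `config.yaml` fall back to defaults when the environment variables are unset. A minimal standalone sketch of that behavior (outside Hydra):

```python
import os
from omegaconf import OmegaConf

# Same lambda as registered in main.py: env var if set, otherwise the default.
OmegaConf.register_new_resolver("env_or", lambda k, d: os.getenv(k, d))

cfg = OmegaConf.create({
    "wandb": {
        "project": "${env_or:WANDB_PROJECT,peptune}",
        "name": "${env_or:WANDB_RUN_NAME,local}",
    }
})
print(cfg.wandb.project, cfg.wandb.name)  # "peptune local" unless the env vars are set
```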
peptune-pretrained.ckpt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b259f022c21121f5c755fed61230d6fdf2626ee4ab8a23df479b3cf553fd4aef
+ size 1386966244
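
The `.ckpt` entry above is only a Git LFS pointer; the ~1.4 GB checkpoint itself is fetched with `git lfs pull` (or during clone if Git LFS is installed). A hedged sketch for inspecting the file once downloaded; the key names are what Lightning checkpoints typically contain and are not verified against this particular file:

```python
import torch

# Load on CPU so no GPU is needed just to inspect the checkpoint.
ckpt = torch.load("peptune-pretrained.ckpt", map_location="cpu")

# Lightning-style checkpoints usually expose 'state_dict' plus metadata keys.
print(sorted(ckpt.keys()))
print(len(ckpt.get("state_dict", {})), "tensors in state_dict")
```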