sanchit-gandhi committed on
Commit
6f099a2
·
verified ·
1 Parent(s): 11169aa

Model save

Browse files
README.md CHANGED
@@ -1,16 +1,12 @@
1
  ---
2
  base_model: sanchit-gandhi/distil-zephyr-1.5b-ssft-ultrachat
3
  tags:
4
- - alignment-handbook
5
- - trl
6
- - sft
7
- - generated_from_trainer
8
  - trl
9
  - sft
10
  - alignment-handbook
11
  - generated_from_trainer
12
  datasets:
13
- - HuggingFaceH4/ultrachat_200k
14
  model-index:
15
  - name: sanchit-gandhi/distil-zephyr-1.5b-ssft-ultrachat
16
  results: []
@@ -21,7 +17,7 @@ should probably proofread and complete it, then remove this comment. -->
21
 
22
  # sanchit-gandhi/distil-zephyr-1.5b-ssft-ultrachat
23
 
24
- This model is a fine-tuned version of [sanchit-gandhi/distil-zephyr-1.5b-ssft-ultrachat](https://huggingface.co/sanchit-gandhi/distil-zephyr-1.5b-ssft-ultrachat) on the HuggingFaceH4/ultrachat_200k dataset.
25
  It achieves the following results on the evaluation set:
26
  - Loss: 1.1555
27
 
 
1
  ---
2
  base_model: sanchit-gandhi/distil-zephyr-1.5b-ssft-ultrachat
3
  tags:
 
 
 
 
4
  - trl
5
  - sft
6
  - alignment-handbook
7
  - generated_from_trainer
8
  datasets:
9
+ - generator
10
  model-index:
11
  - name: sanchit-gandhi/distil-zephyr-1.5b-ssft-ultrachat
12
  results: []
 
17
 
18
  # sanchit-gandhi/distil-zephyr-1.5b-ssft-ultrachat
19
 
20
+ This model is a fine-tuned version of [sanchit-gandhi/distil-zephyr-1.5b-ssft-ultrachat](https://huggingface.co/sanchit-gandhi/distil-zephyr-1.5b-ssft-ultrachat) on the generator dataset.
21
  It achieves the following results on the evaluation set:
22
  - Loss: 1.1555
23
 
all_results.json CHANGED
@@ -1,14 +1,14 @@
1
  {
2
  "epoch": 1.0,
3
  "eval_loss": 1.1554738283157349,
4
- "eval_runtime": 30.632,
5
  "eval_samples": 23110,
6
- "eval_samples_per_second": 503.754,
7
- "eval_steps_per_second": 1.991,
8
  "total_flos": 2.468335550600315e+18,
9
  "train_loss": 0.0,
10
- "train_runtime": 4.2964,
11
  "train_samples": 207865,
12
- "train_samples_per_second": 32452.086,
13
- "train_steps_per_second": 126.851
14
  }
 
1
  {
2
  "epoch": 1.0,
3
  "eval_loss": 1.1554738283157349,
4
+ "eval_runtime": 30.6475,
5
  "eval_samples": 23110,
6
+ "eval_samples_per_second": 503.5,
7
+ "eval_steps_per_second": 1.99,
8
  "total_flos": 2.468335550600315e+18,
9
  "train_loss": 0.0,
10
+ "train_runtime": 4.301,
11
  "train_samples": 207865,
12
+ "train_samples_per_second": 32416.867,
13
+ "train_steps_per_second": 126.714
14
  }
config.json CHANGED
@@ -21,6 +21,6 @@
21
  "tie_word_embeddings": false,
22
  "torch_dtype": "bfloat16",
23
  "transformers_version": "4.40.1",
24
- "use_cache": true,
25
  "vocab_size": 32000
26
  }
 
21
  "tie_word_embeddings": false,
22
  "torch_dtype": "bfloat16",
23
  "transformers_version": "4.40.1",
24
+ "use_cache": false,
25
  "vocab_size": 32000
26
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 1.0,
3
  "eval_loss": 1.1554738283157349,
4
- "eval_runtime": 30.632,
5
  "eval_samples": 23110,
6
- "eval_samples_per_second": 503.754,
7
- "eval_steps_per_second": 1.991
8
  }
 
1
  {
2
  "epoch": 1.0,
3
  "eval_loss": 1.1554738283157349,
4
+ "eval_runtime": 30.6475,
5
  "eval_samples": 23110,
6
+ "eval_samples_per_second": 503.5,
7
+ "eval_steps_per_second": 1.99
8
  }
runs/Apr26_14-34-38_ip-26-0-165-24/events.out.tfevents.1714142100.ip-26-0-165-24.907122.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa81ccdfa0d148968e32fb896c259f16c44dbbaf199ecb3a6ca608724245859e
3
+ size 5063
runs/Apr26_14-34-38_ip-26-0-165-24/events.out.tfevents.1714142135.ip-26-0-165-24.907122.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3ba4096e646b9751765e15799f1b5e63d441035c6209d4b900ffb01656169d43
3
+ size 359
train_results.json CHANGED
@@ -2,8 +2,8 @@
2
  "epoch": 1.0,
3
  "total_flos": 2.468335550600315e+18,
4
  "train_loss": 0.0,
5
- "train_runtime": 4.2964,
6
  "train_samples": 207865,
7
- "train_samples_per_second": 32452.086,
8
- "train_steps_per_second": 126.851
9
  }
 
2
  "epoch": 1.0,
3
  "total_flos": 2.468335550600315e+18,
4
  "train_loss": 0.0,
5
+ "train_runtime": 4.301,
6
  "train_samples": 207865,
7
+ "train_samples_per_second": 32416.867,
8
+ "train_steps_per_second": 126.714
9
  }
trainer_state.json CHANGED
@@ -175,9 +175,9 @@
175
  "step": 545,
176
  "total_flos": 2.468335550600315e+18,
177
  "train_loss": 0.0,
178
- "train_runtime": 4.2964,
179
- "train_samples_per_second": 32452.086,
180
- "train_steps_per_second": 126.851
181
  }
182
  ],
183
  "logging_steps": 25,
 
175
  "step": 545,
176
  "total_flos": 2.468335550600315e+18,
177
  "train_loss": 0.0,
178
+ "train_runtime": 4.301,
179
+ "train_samples_per_second": 32416.867,
180
+ "train_steps_per_second": 126.714
181
  }
182
  ],
183
  "logging_steps": 25,
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b6ddc8fdbd053a452b6ce8c232c1d141e2bd88a2b069ed1b456f8b7444974db8
3
  size 4984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:859d4ecc2ba3e0413696377adfda9140f5dbc7e83b200a76008ae260f5806e07
3
  size 4984
wandb/debug-internal.log CHANGED
@@ -1,120 +1,109 @@
1
- 2024-04-26 14:33:12,514 INFO StreamThr :906599 [internal.py:wandb_internal():86] W&B internal server running at pid: 906599, started at: 2024-04-26 14:33:12.513611
2
- 2024-04-26 14:33:12,516 DEBUG HandlerThread:906599 [handler.py:handle_request():146] handle_request: status
3
- 2024-04-26 14:33:12,517 INFO WriterThread:906599 [datastore.py:open_for_write():87] open: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143312-xista79n/run-xista79n.wandb
4
- 2024-04-26 14:33:12,519 DEBUG SenderThread:906599 [sender.py:send():379] send: header
5
- 2024-04-26 14:33:12,534 DEBUG SenderThread:906599 [sender.py:send():379] send: run
6
- 2024-04-26 14:33:12,759 INFO SenderThread:906599 [dir_watcher.py:__init__():211] watching files in: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143312-xista79n/files
7
- 2024-04-26 14:33:12,759 INFO SenderThread:906599 [sender.py:_start_run_threads():1124] run started: xista79n with start time 1714141992.51397
8
- 2024-04-26 14:33:12,770 DEBUG HandlerThread:906599 [handler.py:handle_request():146] handle_request: check_version
9
- 2024-04-26 14:33:12,770 DEBUG SenderThread:906599 [sender.py:send_request():406] send_request: check_version
10
- 2024-04-26 14:33:12,829 DEBUG HandlerThread:906599 [handler.py:handle_request():146] handle_request: run_start
11
- 2024-04-26 14:33:12,889 DEBUG HandlerThread:906599 [system_info.py:__init__():26] System info init
12
- 2024-04-26 14:33:12,889 DEBUG HandlerThread:906599 [system_info.py:__init__():41] System info init done
13
- 2024-04-26 14:33:12,889 INFO HandlerThread:906599 [system_monitor.py:start():194] Starting system monitor
14
- 2024-04-26 14:33:12,890 INFO SystemMonitor:906599 [system_monitor.py:_start():158] Starting system asset monitoring threads
15
- 2024-04-26 14:33:12,890 INFO HandlerThread:906599 [system_monitor.py:probe():214] Collecting system info
16
- 2024-04-26 14:33:12,890 INFO SystemMonitor:906599 [interfaces.py:start():190] Started cpu monitoring
17
- 2024-04-26 14:33:12,891 INFO SystemMonitor:906599 [interfaces.py:start():190] Started disk monitoring
18
- 2024-04-26 14:33:12,891 INFO SystemMonitor:906599 [interfaces.py:start():190] Started gpu monitoring
19
- 2024-04-26 14:33:12,892 INFO SystemMonitor:906599 [interfaces.py:start():190] Started memory monitoring
20
- 2024-04-26 14:33:12,892 INFO SystemMonitor:906599 [interfaces.py:start():190] Started network monitoring
21
- 2024-04-26 14:33:12,940 DEBUG HandlerThread:906599 [system_info.py:probe():150] Probing system
22
- 2024-04-26 14:33:12,942 DEBUG HandlerThread:906599 [system_info.py:_probe_git():135] Probing git
23
- 2024-04-26 14:33:12,962 DEBUG HandlerThread:906599 [system_info.py:_probe_git():143] Probing git done
24
- 2024-04-26 14:33:12,962 DEBUG HandlerThread:906599 [system_info.py:probe():198] Probing system done
25
- 2024-04-26 14:33:12,963 DEBUG HandlerThread:906599 [system_monitor.py:probe():223] {'os': 'Linux-5.15.0-1048-aws-x86_64-with-glibc2.31', 'python': '3.11.9', 'heartbeatAt': '2024-04-26T14:33:12.940108', 'startedAt': '2024-04-26T14:33:12.499642', 'docker': None, 'cuda': None, 'args': ('./config_200k.yaml',), 'state': 'running', 'program': '/fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/./run_sft.py', 'codePathLocal': 'run_sft.py', 'codePath': 'run_sft.py', 'git': {'remote': 'https://huggingface.co/sanchit-gandhi/distil-zephyr-1.5b-ssft-ultrachat-200k', 'commit': '37a86cac6473b5416859c9fc849f41892a2da4b3'}, 'email': None, 'root': '/fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k', 'host': 'ip-26-0-165-24', 'username': 'sanchit', 'executable': '/fsx/sanchit/miniconda3/envs/alignment/bin/python', 'cpu_count': 96, 'cpu_count_logical': 96, 'cpu_freq': {'current': 2717.2920208333335, 'min': 0.0, 'max': 0.0}, 'cpu_freq_per_core': [{'current': 3597.589, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 
2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2478.429, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 3598.016, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 3596.82, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 
0.0, 'max': 0.0}, {'current': 3598.389, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 3598.218, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 3596.825, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 3595.748, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}], 'disk': {'/': {'total': 290.7472343444824, 'used': 58.32090759277344}}, 'gpu': 'NVIDIA H100 80GB HBM3', 'gpu_count': 8, 'gpu_devices': [{'name': 'NVIDIA H100 80GB HBM3', 'memory_total': 85520809984}, {'name': 'NVIDIA H100 80GB HBM3', 'memory_total': 85520809984}, {'name': 'NVIDIA H100 80GB HBM3', 'memory_total': 85520809984}, {'name': 'NVIDIA H100 80GB HBM3', 'memory_total': 85520809984}, {'name': 'NVIDIA H100 80GB HBM3', 'memory_total': 85520809984}, {'name': 'NVIDIA H100 80GB HBM3', 'memory_total': 85520809984}, {'name': 'NVIDIA H100 80GB HBM3', 'memory_total': 85520809984}, {'name': 'NVIDIA H100 80GB HBM3', 'memory_total': 85520809984}], 'memory': {'total': 1999.9855346679688}}
26
- 2024-04-26 14:33:12,963 INFO HandlerThread:906599 [system_monitor.py:probe():224] Finished collecting system info
27
- 2024-04-26 14:33:12,963 INFO HandlerThread:906599 [system_monitor.py:probe():227] Publishing system info
28
- 2024-04-26 14:33:12,963 DEBUG HandlerThread:906599 [system_info.py:_save_conda():207] Saving list of conda packages installed into the current environment
29
- 2024-04-26 14:33:13,762 INFO Thread-12 :906599 [dir_watcher.py:_on_file_created():271] file/dir created: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143312-xista79n/files/conda-environment.yaml
30
- 2024-04-26 14:33:15,382 DEBUG HandlerThread:906599 [system_info.py:_save_conda():222] Saving conda packages done
31
- 2024-04-26 14:33:15,384 INFO HandlerThread:906599 [system_monitor.py:probe():229] Finished publishing system info
32
- 2024-04-26 14:33:15,403 DEBUG SenderThread:906599 [sender.py:send():379] send: files
33
- 2024-04-26 14:33:15,403 INFO SenderThread:906599 [sender.py:_save_file():1390] saving file wandb-metadata.json with policy now
34
- 2024-04-26 14:33:15,549 DEBUG HandlerThread:906599 [handler.py:handle_request():146] handle_request: python_packages
35
- 2024-04-26 14:33:15,549 DEBUG HandlerThread:906599 [handler.py:handle_request():146] handle_request: stop_status
36
- 2024-04-26 14:33:15,549 DEBUG SenderThread:906599 [sender.py:send_request():406] send_request: python_packages
37
- 2024-04-26 14:33:15,550 DEBUG HandlerThread:906599 [handler.py:handle_request():146] handle_request: internal_messages
38
- 2024-04-26 14:33:15,552 DEBUG SenderThread:906599 [sender.py:send_request():406] send_request: stop_status
39
- 2024-04-26 14:33:15,598 DEBUG HandlerThread:906599 [handler.py:handle_request():146] handle_request: partial_history
40
- 2024-04-26 14:33:15,764 INFO Thread-12 :906599 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143312-xista79n/files/conda-environment.yaml
41
- 2024-04-26 14:33:15,764 INFO Thread-12 :906599 [dir_watcher.py:_on_file_created():271] file/dir created: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143312-xista79n/files/requirements.txt
42
- 2024-04-26 14:33:15,764 INFO Thread-12 :906599 [dir_watcher.py:_on_file_created():271] file/dir created: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143312-xista79n/files/wandb-metadata.json
43
- 2024-04-26 14:33:15,944 DEBUG SenderThread:906599 [sender.py:send():379] send: telemetry
44
- 2024-04-26 14:33:15,944 DEBUG SenderThread:906599 [sender.py:send():379] send: config
45
- 2024-04-26 14:33:15,946 DEBUG SenderThread:906599 [sender.py:send():379] send: metric
46
- 2024-04-26 14:33:15,946 DEBUG SenderThread:906599 [sender.py:send():379] send: telemetry
47
- 2024-04-26 14:33:15,946 DEBUG SenderThread:906599 [sender.py:send():379] send: metric
48
- 2024-04-26 14:33:15,946 WARNING SenderThread:906599 [sender.py:send_metric():1341] Seen metric with glob (shouldn't happen)
49
- 2024-04-26 14:33:15,946 DEBUG SenderThread:906599 [sender.py:send():379] send: telemetry
50
- 2024-04-26 14:33:15,948 DEBUG SenderThread:906599 [sender.py:send():379] send: telemetry
51
- 2024-04-26 14:33:15,948 DEBUG SenderThread:906599 [sender.py:send_request():406] send_request: summary_record
52
- 2024-04-26 14:33:15,950 INFO SenderThread:906599 [sender.py:_save_file():1390] saving file wandb-summary.json with policy end
53
- 2024-04-26 14:33:15,950 DEBUG SenderThread:906599 [sender.py:send_request():406] send_request: summary_record
54
- 2024-04-26 14:33:15,952 INFO SenderThread:906599 [sender.py:_save_file():1390] saving file wandb-summary.json with policy end
55
- 2024-04-26 14:33:15,952 DEBUG SenderThread:906599 [sender.py:send_request():406] send_request: summary_record
56
- 2024-04-26 14:33:15,953 INFO SenderThread:906599 [sender.py:_save_file():1390] saving file wandb-summary.json with policy end
57
- 2024-04-26 14:33:15,953 DEBUG SenderThread:906599 [sender.py:send_request():406] send_request: summary_record
58
- 2024-04-26 14:33:15,954 INFO SenderThread:906599 [sender.py:_save_file():1390] saving file wandb-summary.json with policy end
59
- 2024-04-26 14:33:15,954 DEBUG SenderThread:906599 [sender.py:send_request():406] send_request: summary_record
60
- 2024-04-26 14:33:15,956 INFO SenderThread:906599 [sender.py:_save_file():1390] saving file wandb-summary.json with policy end
61
- 2024-04-26 14:33:15,956 DEBUG SenderThread:906599 [sender.py:send():379] send: metric
62
- 2024-04-26 14:33:15,956 DEBUG SenderThread:906599 [sender.py:send():379] send: history
63
- 2024-04-26 14:33:15,956 DEBUG SenderThread:906599 [sender.py:send_request():406] send_request: summary_record
64
- 2024-04-26 14:33:15,957 INFO SenderThread:906599 [sender.py:_save_file():1390] saving file wandb-summary.json with policy end
65
- 2024-04-26 14:33:16,765 INFO Thread-12 :906599 [dir_watcher.py:_on_file_created():271] file/dir created: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143312-xista79n/files/wandb-summary.json
66
- 2024-04-26 14:33:16,766 INFO Thread-12 :906599 [dir_watcher.py:_on_file_created():271] file/dir created: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143312-xista79n/files/output.log
67
- 2024-04-26 14:33:16,842 INFO wandb-upload_0:906599 [upload_job.py:push():131] Uploaded file /tmp/tmp75aqsphnwandb/nq9ihh9s-wandb-metadata.json
68
- 2024-04-26 14:33:17,595 DEBUG HandlerThread:906599 [handler.py:handle_request():146] handle_request: status_report
69
- 2024-04-26 14:33:18,767 INFO Thread-12 :906599 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143312-xista79n/files/output.log
70
- 2024-04-26 14:33:20,769 INFO Thread-12 :906599 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143312-xista79n/files/output.log
71
- 2024-04-26 14:33:22,771 INFO Thread-12 :906599 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143312-xista79n/files/output.log
72
- 2024-04-26 14:33:23,025 DEBUG HandlerThread:906599 [handler.py:handle_request():146] handle_request: status_report
73
- 2024-04-26 14:33:24,773 INFO Thread-12 :906599 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143312-xista79n/files/output.log
74
- 2024-04-26 14:33:26,775 INFO Thread-12 :906599 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143312-xista79n/files/output.log
75
- 2024-04-26 14:33:28,477 DEBUG HandlerThread:906599 [handler.py:handle_request():146] handle_request: status_report
76
- 2024-04-26 14:33:28,777 INFO Thread-12 :906599 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143312-xista79n/files/output.log
77
- 2024-04-26 14:33:30,549 DEBUG HandlerThread:906599 [handler.py:handle_request():146] handle_request: stop_status
78
- 2024-04-26 14:33:30,550 DEBUG SenderThread:906599 [sender.py:send_request():406] send_request: stop_status
79
- 2024-04-26 14:33:30,551 DEBUG HandlerThread:906599 [handler.py:handle_request():146] handle_request: internal_messages
80
- 2024-04-26 14:33:30,779 INFO Thread-12 :906599 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143312-xista79n/files/output.log
81
- 2024-04-26 14:33:32,781 INFO Thread-12 :906599 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143312-xista79n/files/output.log
82
- 2024-04-26 14:33:33,940 DEBUG HandlerThread:906599 [handler.py:handle_request():146] handle_request: status_report
83
- 2024-04-26 14:33:34,783 INFO Thread-12 :906599 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143312-xista79n/files/output.log
84
- 2024-04-26 14:33:36,785 INFO Thread-12 :906599 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143312-xista79n/files/output.log
85
- 2024-04-26 14:33:38,787 INFO Thread-12 :906599 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143312-xista79n/files/output.log
86
- 2024-04-26 14:33:39,400 DEBUG HandlerThread:906599 [handler.py:handle_request():146] handle_request: status_report
87
- 2024-04-26 14:33:40,789 INFO Thread-12 :906599 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143312-xista79n/files/output.log
88
- 2024-04-26 14:33:42,790 INFO Thread-12 :906599 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143312-xista79n/files/output.log
89
- 2024-04-26 14:33:44,793 INFO Thread-12 :906599 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143312-xista79n/files/output.log
90
- 2024-04-26 14:33:44,863 DEBUG HandlerThread:906599 [handler.py:handle_request():146] handle_request: status_report
91
- 2024-04-26 14:33:45,549 DEBUG HandlerThread:906599 [handler.py:handle_request():146] handle_request: stop_status
92
- 2024-04-26 14:33:45,550 DEBUG SenderThread:906599 [sender.py:send_request():406] send_request: stop_status
93
- 2024-04-26 14:33:45,551 DEBUG HandlerThread:906599 [handler.py:handle_request():146] handle_request: internal_messages
94
- 2024-04-26 14:33:45,794 INFO Thread-12 :906599 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143312-xista79n/files/config.yaml
95
- 2024-04-26 14:33:46,263 DEBUG HandlerThread:906599 [handler.py:handle_request():146] handle_request: partial_history
96
- 2024-04-26 14:33:46,264 DEBUG SenderThread:906599 [sender.py:send():379] send: metric
97
- 2024-04-26 14:33:46,265 DEBUG SenderThread:906599 [sender.py:send():379] send: metric
98
- 2024-04-26 14:33:46,266 DEBUG SenderThread:906599 [sender.py:send():379] send: metric
99
- 2024-04-26 14:33:46,266 DEBUG SenderThread:906599 [sender.py:send():379] send: metric
100
- 2024-04-26 14:33:46,267 DEBUG SenderThread:906599 [sender.py:send():379] send: history
101
- 2024-04-26 14:33:46,267 DEBUG SenderThread:906599 [sender.py:send_request():406] send_request: summary_record
102
- 2024-04-26 14:33:46,268 INFO SenderThread:906599 [sender.py:_save_file():1390] saving file wandb-summary.json with policy end
103
- 2024-04-26 14:33:46,796 INFO Thread-12 :906599 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143312-xista79n/files/wandb-summary.json
104
- 2024-04-26 14:33:46,797 INFO Thread-12 :906599 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143312-xista79n/files/output.log
105
- 2024-04-26 14:33:48,798 INFO Thread-12 :906599 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143312-xista79n/files/output.log
106
- 2024-04-26 14:33:50,273 DEBUG HandlerThread:906599 [handler.py:handle_request():146] handle_request: status_report
107
- 2024-04-26 14:33:54,805 INFO Thread-12 :906599 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143312-xista79n/files/output.log
108
- 2024-04-26 14:33:55,947 DEBUG HandlerThread:906599 [handler.py:handle_request():146] handle_request: status_report
109
- 2024-04-26 14:33:58,809 INFO Thread-12 :906599 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143312-xista79n/files/output.log
110
- 2024-04-26 14:34:00,550 DEBUG HandlerThread:906599 [handler.py:handle_request():146] handle_request: stop_status
111
- 2024-04-26 14:34:00,550 DEBUG SenderThread:906599 [sender.py:send_request():406] send_request: stop_status
112
- 2024-04-26 14:34:00,552 DEBUG HandlerThread:906599 [handler.py:handle_request():146] handle_request: internal_messages
113
- 2024-04-26 14:34:00,811 INFO Thread-12 :906599 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143312-xista79n/files/output.log
114
- 2024-04-26 14:34:01,641 DEBUG HandlerThread:906599 [handler.py:handle_request():146] handle_request: status_report
115
- 2024-04-26 14:34:02,813 INFO Thread-12 :906599 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143312-xista79n/files/output.log
116
- 2024-04-26 14:34:06,644 DEBUG HandlerThread:906599 [handler.py:handle_request():146] handle_request: status_report
117
- 2024-04-26 14:34:08,821 INFO Thread-12 :906599 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143312-xista79n/files/output.log
118
- 2024-04-26 14:34:11,761 DEBUG HandlerThread:906599 [handler.py:handle_request():146] handle_request: status_report
119
- 2024-04-26 14:34:12,893 DEBUG SystemMonitor:906599 [system_monitor.py:_start():172] Starting system metrics aggregation loop
120
- 2024-04-26 14:34:12,907 DEBUG SenderThread:906599 [sender.py:send():379] send: stats
 
1
+ 2024-04-26 14:35:01,974 INFO StreamThr :907589 [internal.py:wandb_internal():86] W&B internal server running at pid: 907589, started at: 2024-04-26 14:35:01.973422
2
+ 2024-04-26 14:35:01,976 DEBUG HandlerThread:907589 [handler.py:handle_request():146] handle_request: status
3
+ 2024-04-26 14:35:01,977 INFO WriterThread:907589 [datastore.py:open_for_write():87] open: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143501-slqyh8h3/run-slqyh8h3.wandb
4
+ 2024-04-26 14:35:01,980 DEBUG SenderThread:907589 [sender.py:send():379] send: header
5
+ 2024-04-26 14:35:01,994 DEBUG SenderThread:907589 [sender.py:send():379] send: run
6
+ 2024-04-26 14:35:02,153 INFO SenderThread:907589 [dir_watcher.py:__init__():211] watching files in: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143501-slqyh8h3/files
7
+ 2024-04-26 14:35:02,153 INFO SenderThread:907589 [sender.py:_start_run_threads():1124] run started: slqyh8h3 with start time 1714142101.973245
8
+ 2024-04-26 14:35:02,163 DEBUG HandlerThread:907589 [handler.py:handle_request():146] handle_request: check_version
9
+ 2024-04-26 14:35:02,164 DEBUG SenderThread:907589 [sender.py:send_request():406] send_request: check_version
10
+ 2024-04-26 14:35:02,222 DEBUG HandlerThread:907589 [handler.py:handle_request():146] handle_request: run_start
11
+ 2024-04-26 14:35:02,282 DEBUG HandlerThread:907589 [system_info.py:__init__():26] System info init
12
+ 2024-04-26 14:35:02,282 DEBUG HandlerThread:907589 [system_info.py:__init__():41] System info init done
13
+ 2024-04-26 14:35:02,282 INFO HandlerThread:907589 [system_monitor.py:start():194] Starting system monitor
14
+ 2024-04-26 14:35:02,282 INFO SystemMonitor:907589 [system_monitor.py:_start():158] Starting system asset monitoring threads
15
+ 2024-04-26 14:35:02,282 INFO HandlerThread:907589 [system_monitor.py:probe():214] Collecting system info
16
+ 2024-04-26 14:35:02,283 INFO SystemMonitor:907589 [interfaces.py:start():190] Started cpu monitoring
17
+ 2024-04-26 14:35:02,283 INFO SystemMonitor:907589 [interfaces.py:start():190] Started disk monitoring
18
+ 2024-04-26 14:35:02,284 INFO SystemMonitor:907589 [interfaces.py:start():190] Started gpu monitoring
19
+ 2024-04-26 14:35:02,284 INFO SystemMonitor:907589 [interfaces.py:start():190] Started memory monitoring
20
+ 2024-04-26 14:35:02,285 INFO SystemMonitor:907589 [interfaces.py:start():190] Started network monitoring
21
+ 2024-04-26 14:35:02,331 DEBUG HandlerThread:907589 [system_info.py:probe():150] Probing system
22
+ 2024-04-26 14:35:02,334 DEBUG HandlerThread:907589 [system_info.py:_probe_git():135] Probing git
23
+ 2024-04-26 14:35:02,354 DEBUG HandlerThread:907589 [system_info.py:_probe_git():143] Probing git done
24
+ 2024-04-26 14:35:02,354 DEBUG HandlerThread:907589 [system_info.py:probe():198] Probing system done
25
+ 2024-04-26 14:35:02,354 DEBUG HandlerThread:907589 [system_monitor.py:probe():223] {'os': 'Linux-5.15.0-1048-aws-x86_64-with-glibc2.31', 'python': '3.11.9', 'heartbeatAt': '2024-04-26T14:35:02.331727', 'startedAt': '2024-04-26T14:35:01.958802', 'docker': None, 'cuda': None, 'args': ('./config_200k.yaml',), 'state': 'running', 'program': '/fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/./run_sft.py', 'codePathLocal': 'run_sft.py', 'codePath': 'run_sft.py', 'git': {'remote': 'https://huggingface.co/sanchit-gandhi/distil-zephyr-1.5b-ssft-ultrachat-200k', 'commit': '37a86cac6473b5416859c9fc849f41892a2da4b3'}, 'email': None, 'root': '/fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k', 'host': 'ip-26-0-165-24', 'username': 'sanchit', 'executable': '/fsx/sanchit/miniconda3/envs/alignment/bin/python', 'cpu_count': 96, 'cpu_count_logical': 96, 'cpu_freq': {'current': 2726.2313124999996, 'min': 0.0, 'max': 0.0}, 'cpu_freq_per_core': [{'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 3369.271, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 
2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 3567.605, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 3598.267, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 3597.129, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 
'min': 0.0, 'max': 0.0}, {'current': 3597.777, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 3593.395, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 3597.941, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 3596.821, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}], 'disk': {'/': {'total': 290.7472343444824, 'used': 58.3209114074707}}, 'gpu': 'NVIDIA H100 80GB HBM3', 'gpu_count': 8, 'gpu_devices': [{'name': 'NVIDIA H100 80GB HBM3', 'memory_total': 85520809984}, {'name': 'NVIDIA H100 80GB HBM3', 'memory_total': 85520809984}, {'name': 'NVIDIA H100 80GB HBM3', 'memory_total': 85520809984}, {'name': 'NVIDIA H100 80GB HBM3', 'memory_total': 85520809984}, {'name': 'NVIDIA H100 80GB HBM3', 'memory_total': 85520809984}, {'name': 'NVIDIA H100 80GB HBM3', 'memory_total': 85520809984}, {'name': 'NVIDIA H100 80GB HBM3', 'memory_total': 85520809984}, {'name': 'NVIDIA H100 80GB HBM3', 'memory_total': 85520809984}], 'memory': {'total': 1999.9855346679688}}
26
+ 2024-04-26 14:35:02,354 INFO HandlerThread:907589 [system_monitor.py:probe():224] Finished collecting system info
27
+ 2024-04-26 14:35:02,354 INFO HandlerThread:907589 [system_monitor.py:probe():227] Publishing system info
28
+ 2024-04-26 14:35:02,354 DEBUG HandlerThread:907589 [system_info.py:_save_conda():207] Saving list of conda packages installed into the current environment
29
+ 2024-04-26 14:35:03,156 INFO Thread-12 :907589 [dir_watcher.py:_on_file_created():271] file/dir created: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143501-slqyh8h3/files/conda-environment.yaml
30
+ 2024-04-26 14:35:04,823 DEBUG HandlerThread:907589 [system_info.py:_save_conda():222] Saving conda packages done
31
+ 2024-04-26 14:35:04,825 INFO HandlerThread:907589 [system_monitor.py:probe():229] Finished publishing system info
32
+ 2024-04-26 14:35:04,843 DEBUG SenderThread:907589 [sender.py:send():379] send: files
33
+ 2024-04-26 14:35:04,843 INFO SenderThread:907589 [sender.py:_save_file():1390] saving file wandb-metadata.json with policy now
34
+ 2024-04-26 14:35:04,988 DEBUG HandlerThread:907589 [handler.py:handle_request():146] handle_request: python_packages
35
+ 2024-04-26 14:35:04,988 DEBUG HandlerThread:907589 [handler.py:handle_request():146] handle_request: stop_status
36
+ 2024-04-26 14:35:04,988 DEBUG SenderThread:907589 [sender.py:send_request():406] send_request: python_packages
37
+ 2024-04-26 14:35:04,989 DEBUG HandlerThread:907589 [handler.py:handle_request():146] handle_request: internal_messages
38
+ 2024-04-26 14:35:04,991 DEBUG SenderThread:907589 [sender.py:send_request():406] send_request: stop_status
39
+ 2024-04-26 14:35:05,036 DEBUG HandlerThread:907589 [handler.py:handle_request():146] handle_request: partial_history
40
+ 2024-04-26 14:35:05,104 DEBUG SenderThread:907589 [sender.py:send():379] send: telemetry
41
+ 2024-04-26 14:35:05,105 DEBUG SenderThread:907589 [sender.py:send():379] send: config
42
+ 2024-04-26 14:35:05,106 DEBUG SenderThread:907589 [sender.py:send():379] send: metric
43
+ 2024-04-26 14:35:05,106 DEBUG SenderThread:907589 [sender.py:send():379] send: telemetry
44
+ 2024-04-26 14:35:05,106 DEBUG SenderThread:907589 [sender.py:send():379] send: metric
45
+ 2024-04-26 14:35:05,106 WARNING SenderThread:907589 [sender.py:send_metric():1341] Seen metric with glob (shouldn't happen)
46
+ 2024-04-26 14:35:05,106 DEBUG SenderThread:907589 [sender.py:send():379] send: telemetry
47
+ 2024-04-26 14:35:05,107 DEBUG SenderThread:907589 [sender.py:send():379] send: telemetry
48
+ 2024-04-26 14:35:05,107 DEBUG SenderThread:907589 [sender.py:send_request():406] send_request: summary_record
49
+ 2024-04-26 14:35:05,110 INFO SenderThread:907589 [sender.py:_save_file():1390] saving file wandb-summary.json with policy end
50
+ 2024-04-26 14:35:05,110 DEBUG SenderThread:907589 [sender.py:send_request():406] send_request: summary_record
51
+ 2024-04-26 14:35:05,111 INFO SenderThread:907589 [sender.py:_save_file():1390] saving file wandb-summary.json with policy end
52
+ 2024-04-26 14:35:05,111 DEBUG SenderThread:907589 [sender.py:send_request():406] send_request: summary_record
53
+ 2024-04-26 14:35:05,112 INFO SenderThread:907589 [sender.py:_save_file():1390] saving file wandb-summary.json with policy end
54
+ 2024-04-26 14:35:05,112 DEBUG SenderThread:907589 [sender.py:send_request():406] send_request: summary_record
55
+ 2024-04-26 14:35:05,113 INFO SenderThread:907589 [sender.py:_save_file():1390] saving file wandb-summary.json with policy end
56
+ 2024-04-26 14:35:05,114 DEBUG SenderThread:907589 [sender.py:send_request():406] send_request: summary_record
57
+ 2024-04-26 14:35:05,115 INFO SenderThread:907589 [sender.py:_save_file():1390] saving file wandb-summary.json with policy end
58
+ 2024-04-26 14:35:05,115 DEBUG SenderThread:907589 [sender.py:send():379] send: metric
59
+ 2024-04-26 14:35:05,115 DEBUG SenderThread:907589 [sender.py:send():379] send: history
60
+ 2024-04-26 14:35:05,115 DEBUG SenderThread:907589 [sender.py:send_request():406] send_request: summary_record
61
+ 2024-04-26 14:35:05,116 INFO SenderThread:907589 [sender.py:_save_file():1390] saving file wandb-summary.json with policy end
62
+ 2024-04-26 14:35:05,148 INFO wandb-upload_0:907589 [upload_job.py:push():131] Uploaded file /tmp/tmpjmlr9sepwandb/cfal2aup-wandb-metadata.json
63
+ 2024-04-26 14:35:05,158 INFO Thread-12 :907589 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143501-slqyh8h3/files/conda-environment.yaml
64
+ 2024-04-26 14:35:05,158 INFO Thread-12 :907589 [dir_watcher.py:_on_file_created():271] file/dir created: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143501-slqyh8h3/files/wandb-metadata.json
65
+ 2024-04-26 14:35:05,158 INFO Thread-12 :907589 [dir_watcher.py:_on_file_created():271] file/dir created: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143501-slqyh8h3/files/requirements.txt
66
+ 2024-04-26 14:35:05,158 INFO Thread-12 :907589 [dir_watcher.py:_on_file_created():271] file/dir created: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143501-slqyh8h3/files/wandb-summary.json
67
+ 2024-04-26 14:35:05,158 INFO Thread-12 :907589 [dir_watcher.py:_on_file_created():271] file/dir created: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143501-slqyh8h3/files/output.log
68
+ 2024-04-26 14:35:07,030 DEBUG HandlerThread:907589 [handler.py:handle_request():146] handle_request: status_report
69
+ 2024-04-26 14:35:07,160 INFO Thread-12 :907589 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143501-slqyh8h3/files/output.log
70
+ 2024-04-26 14:35:09,162 INFO Thread-12 :907589 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143501-slqyh8h3/files/output.log
71
+ 2024-04-26 14:35:11,164 INFO Thread-12 :907589 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143501-slqyh8h3/files/output.log
72
+ 2024-04-26 14:35:12,475 DEBUG HandlerThread:907589 [handler.py:handle_request():146] handle_request: status_report
73
+ 2024-04-26 14:35:13,166 INFO Thread-12 :907589 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143501-slqyh8h3/files/output.log
74
+ 2024-04-26 14:35:15,168 INFO Thread-12 :907589 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143501-slqyh8h3/files/output.log
75
+ 2024-04-26 14:35:17,170 INFO Thread-12 :907589 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143501-slqyh8h3/files/output.log
76
+ 2024-04-26 14:35:17,923 DEBUG HandlerThread:907589 [handler.py:handle_request():146] handle_request: status_report
77
+ 2024-04-26 14:35:19,172 INFO Thread-12 :907589 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143501-slqyh8h3/files/output.log
78
+ 2024-04-26 14:35:19,988 DEBUG HandlerThread:907589 [handler.py:handle_request():146] handle_request: stop_status
79
+ 2024-04-26 14:35:19,989 DEBUG SenderThread:907589 [sender.py:send_request():406] send_request: stop_status
80
+ 2024-04-26 14:35:19,990 DEBUG HandlerThread:907589 [handler.py:handle_request():146] handle_request: internal_messages
81
+ 2024-04-26 14:35:21,174 INFO Thread-12 :907589 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143501-slqyh8h3/files/output.log
82
+ 2024-04-26 14:35:23,176 INFO Thread-12 :907589 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143501-slqyh8h3/files/output.log
83
+ 2024-04-26 14:35:23,382 DEBUG HandlerThread:907589 [handler.py:handle_request():146] handle_request: status_report
84
+ 2024-04-26 14:35:25,177 INFO Thread-12 :907589 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143501-slqyh8h3/files/output.log
85
+ 2024-04-26 14:35:27,179 INFO Thread-12 :907589 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143501-slqyh8h3/files/output.log
86
+ 2024-04-26 14:35:28,842 DEBUG HandlerThread:907589 [handler.py:handle_request():146] handle_request: status_report
87
+ 2024-04-26 14:35:29,181 INFO Thread-12 :907589 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143501-slqyh8h3/files/output.log
88
+ 2024-04-26 14:35:31,183 INFO Thread-12 :907589 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143501-slqyh8h3/files/output.log
89
+ 2024-04-26 14:35:33,185 INFO Thread-12 :907589 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143501-slqyh8h3/files/output.log
90
+ 2024-04-26 14:35:34,309 DEBUG HandlerThread:907589 [handler.py:handle_request():146] handle_request: status_report
91
+ 2024-04-26 14:35:34,988 DEBUG HandlerThread:907589 [handler.py:handle_request():146] handle_request: stop_status
92
+ 2024-04-26 14:35:34,989 DEBUG SenderThread:907589 [sender.py:send_request():406] send_request: stop_status
93
+ 2024-04-26 14:35:34,990 DEBUG HandlerThread:907589 [handler.py:handle_request():146] handle_request: internal_messages
94
+ 2024-04-26 14:35:35,188 INFO Thread-12 :907589 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143501-slqyh8h3/files/config.yaml
95
+ 2024-04-26 14:35:35,188 INFO Thread-12 :907589 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143501-slqyh8h3/files/output.log
96
+ 2024-04-26 14:35:35,714 DEBUG HandlerThread:907589 [handler.py:handle_request():146] handle_request: partial_history
97
+ 2024-04-26 14:35:35,715 DEBUG SenderThread:907589 [sender.py:send():379] send: metric
98
+ 2024-04-26 14:35:35,716 DEBUG SenderThread:907589 [sender.py:send():379] send: metric
99
+ 2024-04-26 14:35:35,717 DEBUG SenderThread:907589 [sender.py:send():379] send: metric
100
+ 2024-04-26 14:35:35,717 DEBUG SenderThread:907589 [sender.py:send():379] send: metric
101
+ 2024-04-26 14:35:35,718 DEBUG SenderThread:907589 [sender.py:send():379] send: history
102
+ 2024-04-26 14:35:35,718 DEBUG SenderThread:907589 [sender.py:send_request():406] send_request: summary_record
103
+ 2024-04-26 14:35:35,720 INFO SenderThread:907589 [sender.py:_save_file():1390] saving file wandb-summary.json with policy end
104
+ 2024-04-26 14:35:36,191 INFO Thread-12 :907589 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143501-slqyh8h3/files/wandb-summary.json
105
+ 2024-04-26 14:35:37,192 INFO Thread-12 :907589 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143501-slqyh8h3/files/output.log
106
+ 2024-04-26 14:35:39,724 DEBUG HandlerThread:907589 [handler.py:handle_request():146] handle_request: status_report
107
+ 2024-04-26 14:35:43,200 INFO Thread-12 :907589 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143501-slqyh8h3/files/output.log
108
+ 2024-04-26 14:35:45,005 DEBUG HandlerThread:907589 [handler.py:handle_request():146] handle_request: status_report
109
+ 2024-04-26 14:35:47,205 INFO Thread-12 :907589 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143501-slqyh8h3/files/output.log
 
 
 
 
 
 
 
 
 
 
 
wandb/debug.log CHANGED
@@ -1,28 +1,28 @@
1
- 2024-04-26 14:33:12,508 INFO MainThread:906132 [wandb_setup.py:_flush():76] Current SDK version is 0.16.6
2
- 2024-04-26 14:33:12,508 INFO MainThread:906132 [wandb_setup.py:_flush():76] Configure stats pid to 906132
3
- 2024-04-26 14:33:12,508 INFO MainThread:906132 [wandb_setup.py:_flush():76] Loading settings from /admin/home/sanchit/.config/wandb/settings
4
- 2024-04-26 14:33:12,508 INFO MainThread:906132 [wandb_setup.py:_flush():76] Loading settings from /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/settings
5
- 2024-04-26 14:33:12,508 INFO MainThread:906132 [wandb_setup.py:_flush():76] Loading settings from environment variables: {}
6
- 2024-04-26 14:33:12,508 INFO MainThread:906132 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
7
- 2024-04-26 14:33:12,508 INFO MainThread:906132 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'run_sft.py', 'program_abspath': '/fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/run_sft.py', 'program': '/fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/./run_sft.py'}
8
- 2024-04-26 14:33:12,508 INFO MainThread:906132 [wandb_setup.py:_flush():76] Applying login settings: {}
9
- 2024-04-26 14:33:12,508 INFO MainThread:906132 [wandb_init.py:_log_setup():521] Logging user logs to /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143312-xista79n/logs/debug.log
10
- 2024-04-26 14:33:12,508 INFO MainThread:906132 [wandb_init.py:_log_setup():522] Logging internal logs to /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143312-xista79n/logs/debug-internal.log
11
- 2024-04-26 14:33:12,509 INFO MainThread:906132 [wandb_init.py:init():561] calling init triggers
12
- 2024-04-26 14:33:12,509 INFO MainThread:906132 [wandb_init.py:init():568] wandb.init called with sweep_config: {}
13
  config: {}
14
- 2024-04-26 14:33:12,509 INFO MainThread:906132 [wandb_init.py:init():611] starting backend
15
- 2024-04-26 14:33:12,509 INFO MainThread:906132 [wandb_init.py:init():615] setting up manager
16
- 2024-04-26 14:33:12,511 INFO MainThread:906132 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
17
- 2024-04-26 14:33:12,513 INFO MainThread:906132 [wandb_init.py:init():623] backend started and connected
18
- 2024-04-26 14:33:12,516 INFO MainThread:906132 [wandb_init.py:init():715] updated telemetry
19
- 2024-04-26 14:33:12,533 INFO MainThread:906132 [wandb_init.py:init():748] communicating run to backend with 90.0 second timeout
20
- 2024-04-26 14:33:12,769 INFO MainThread:906132 [wandb_run.py:_on_init():2357] communicating current version
21
- 2024-04-26 14:33:12,820 INFO MainThread:906132 [wandb_run.py:_on_init():2366] got version response
22
- 2024-04-26 14:33:12,821 INFO MainThread:906132 [wandb_init.py:init():799] starting run threads in backend
23
- 2024-04-26 14:33:15,550 INFO MainThread:906132 [wandb_run.py:_console_start():2335] atexit reg
24
- 2024-04-26 14:33:15,550 INFO MainThread:906132 [wandb_run.py:_redirect():2190] redirect: wrap_raw
25
- 2024-04-26 14:33:15,550 INFO MainThread:906132 [wandb_run.py:_redirect():2255] Wrapping output streams.
26
- 2024-04-26 14:33:15,550 INFO MainThread:906132 [wandb_run.py:_redirect():2280] Redirects installed.
27
- 2024-04-26 14:33:15,551 INFO MainThread:906132 [wandb_init.py:init():842] run started, returning control to user process
28
- 2024-04-26 14:33:15,552 INFO MainThread:906132 [wandb_run.py:_config_callback():1347] config_cb None None {'vocab_size': 32000, 'max_position_embeddings': 32768, 'hidden_size': 4096, 'intermediate_size': 14336, 'num_hidden_layers': 6, 'num_attention_heads': 32, 'sliding_window': 4096, 'num_key_value_heads': 8, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-05, 'use_cache': False, 'rope_theta': 10000.0, 'attention_dropout': 0.0, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'bfloat16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': False, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['MistralForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 1, 'pad_token_id': None, 'eos_token_id': 2, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': 'sanchit-gandhi/distil-zephyr-1.5b-ssft-ultrachat', 'transformers_version': '4.40.1', 'model_type': 'mistral', 'output_dir': './', 
'overwrite_output_dir': True, 'do_train': False, 'do_eval': True, 'do_predict': False, 'evaluation_strategy': 'epoch', 'prediction_loss_only': False, 'per_device_train_batch_size': 32, 'per_device_eval_batch_size': 32, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'eval_delay': 0, 'learning_rate': 0.0001, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 1, 'max_steps': -1, 'lr_scheduler_type': 'linear', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.0, 'warmup_steps': 500, 'log_level': 'info', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': './runs/Apr26_14-32-47_ip-26-0-165-24', 'logging_strategy': 'steps', 'logging_first_step': True, 'logging_steps': 25, 'logging_nan_inf_filter': True, 'save_strategy': 'epoch', 'save_steps': 500, 'save_total_limit': 1, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': True, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': './', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 
'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'gradient_accumulation_kwargs': None}, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['tensorboard', 'wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': True, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': True, 'gradient_checkpointing_kwargs': {'use_reentrant': False}, 'include_inputs_for_metrics': False, 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '<PUSH_TO_HUB_TOKEN>', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 7200, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'max_seq_length': 2048}
 
1
+ 2024-04-26 14:35:01,967 INFO MainThread:907122 [wandb_setup.py:_flush():76] Current SDK version is 0.16.6
2
+ 2024-04-26 14:35:01,967 INFO MainThread:907122 [wandb_setup.py:_flush():76] Configure stats pid to 907122
3
+ 2024-04-26 14:35:01,968 INFO MainThread:907122 [wandb_setup.py:_flush():76] Loading settings from /admin/home/sanchit/.config/wandb/settings
4
+ 2024-04-26 14:35:01,968 INFO MainThread:907122 [wandb_setup.py:_flush():76] Loading settings from /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/settings
5
+ 2024-04-26 14:35:01,968 INFO MainThread:907122 [wandb_setup.py:_flush():76] Loading settings from environment variables: {}
6
+ 2024-04-26 14:35:01,968 INFO MainThread:907122 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
7
+ 2024-04-26 14:35:01,968 INFO MainThread:907122 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'run_sft.py', 'program_abspath': '/fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/run_sft.py', 'program': '/fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/./run_sft.py'}
8
+ 2024-04-26 14:35:01,968 INFO MainThread:907122 [wandb_setup.py:_flush():76] Applying login settings: {}
9
+ 2024-04-26 14:35:01,968 INFO MainThread:907122 [wandb_init.py:_log_setup():521] Logging user logs to /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143501-slqyh8h3/logs/debug.log
10
+ 2024-04-26 14:35:01,968 INFO MainThread:907122 [wandb_init.py:_log_setup():522] Logging internal logs to /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143501-slqyh8h3/logs/debug-internal.log
11
+ 2024-04-26 14:35:01,968 INFO MainThread:907122 [wandb_init.py:init():561] calling init triggers
12
+ 2024-04-26 14:35:01,968 INFO MainThread:907122 [wandb_init.py:init():568] wandb.init called with sweep_config: {}
13
  config: {}
14
+ 2024-04-26 14:35:01,968 INFO MainThread:907122 [wandb_init.py:init():611] starting backend
15
+ 2024-04-26 14:35:01,968 INFO MainThread:907122 [wandb_init.py:init():615] setting up manager
16
+ 2024-04-26 14:35:01,971 INFO MainThread:907122 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
17
+ 2024-04-26 14:35:01,972 INFO MainThread:907122 [wandb_init.py:init():623] backend started and connected
18
+ 2024-04-26 14:35:01,975 INFO MainThread:907122 [wandb_init.py:init():715] updated telemetry
19
+ 2024-04-26 14:35:01,992 INFO MainThread:907122 [wandb_init.py:init():748] communicating run to backend with 90.0 second timeout
20
+ 2024-04-26 14:35:02,163 INFO MainThread:907122 [wandb_run.py:_on_init():2357] communicating current version
21
+ 2024-04-26 14:35:02,214 INFO MainThread:907122 [wandb_run.py:_on_init():2366] got version response
22
+ 2024-04-26 14:35:02,214 INFO MainThread:907122 [wandb_init.py:init():799] starting run threads in backend
23
+ 2024-04-26 14:35:04,988 INFO MainThread:907122 [wandb_run.py:_console_start():2335] atexit reg
24
+ 2024-04-26 14:35:04,988 INFO MainThread:907122 [wandb_run.py:_redirect():2190] redirect: wrap_raw
25
+ 2024-04-26 14:35:04,988 INFO MainThread:907122 [wandb_run.py:_redirect():2255] Wrapping output streams.
26
+ 2024-04-26 14:35:04,989 INFO MainThread:907122 [wandb_run.py:_redirect():2280] Redirects installed.
27
+ 2024-04-26 14:35:04,990 INFO MainThread:907122 [wandb_init.py:init():842] run started, returning control to user process
28
+ 2024-04-26 14:35:04,991 INFO MainThread:907122 [wandb_run.py:_config_callback():1347] config_cb None None {'vocab_size': 32000, 'max_position_embeddings': 32768, 'hidden_size': 4096, 'intermediate_size': 14336, 'num_hidden_layers': 6, 'num_attention_heads': 32, 'sliding_window': 4096, 'num_key_value_heads': 8, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-05, 'use_cache': False, 'rope_theta': 10000.0, 'attention_dropout': 0.0, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'bfloat16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': False, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['MistralForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 1, 'pad_token_id': None, 'eos_token_id': 2, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': 'sanchit-gandhi/distil-zephyr-1.5b-ssft-ultrachat', 'transformers_version': '4.40.1', 'model_type': 'mistral', 'output_dir': './', 
'overwrite_output_dir': True, 'do_train': False, 'do_eval': True, 'do_predict': False, 'evaluation_strategy': 'epoch', 'prediction_loss_only': False, 'per_device_train_batch_size': 32, 'per_device_eval_batch_size': 32, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'eval_delay': 0, 'learning_rate': 0.0001, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 1, 'max_steps': -1, 'lr_scheduler_type': 'linear', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.0, 'warmup_steps': 500, 'log_level': 'info', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': './runs/Apr26_14-34-38_ip-26-0-165-24', 'logging_strategy': 'steps', 'logging_first_step': True, 'logging_steps': 25, 'logging_nan_inf_filter': True, 'save_strategy': 'epoch', 'save_steps': 500, 'save_total_limit': 1, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': True, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': './', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 
'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'gradient_accumulation_kwargs': None}, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['tensorboard', 'wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': True, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': True, 'gradient_checkpointing_kwargs': {'use_reentrant': False}, 'include_inputs_for_metrics': False, 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '<PUSH_TO_HUB_TOKEN>', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 7200, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'max_seq_length': 2048}
wandb/run-20240426_143312-xista79n/files/config.yaml CHANGED
@@ -49,6 +49,22 @@ _wandb:
49
  5: 1
50
  6:
51
  - 1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  vocab_size:
53
  desc: null
54
  value: 32000
 
49
  5: 1
50
  6:
51
  - 1
52
+ - 1: eval/loss
53
+ 5: 1
54
+ 6:
55
+ - 1
56
+ - 1: eval/runtime
57
+ 5: 1
58
+ 6:
59
+ - 1
60
+ - 1: eval/samples_per_second
61
+ 5: 1
62
+ 6:
63
+ - 1
64
+ - 1: eval/steps_per_second
65
+ 5: 1
66
+ 6:
67
+ - 1
68
  vocab_size:
69
  desc: null
70
  value: 32000
wandb/run-20240426_143312-xista79n/files/output.log CHANGED
@@ -67,3 +67,5 @@ Upload 3 LFS files: 100%|██████████| 3/3 [00:00<00:00, 6.66
67
  [INFO|tokenization_utils_base.py:2488] 2024-04-26 14:34:12,140 >> tokenizer config file saved in ./tokenizer_config.json
68
  [INFO|tokenization_utils_base.py:2497] 2024-04-26 14:34:12,142 >> Special tokens file saved in ./special_tokens_map.json
69
  [INFO|modelcard.py:450] 2024-04-26 14:34:12,190 >> Dropping the following result as it does not have all the necessary fields:
 
 
 
67
  [INFO|tokenization_utils_base.py:2488] 2024-04-26 14:34:12,140 >> tokenizer config file saved in ./tokenizer_config.json
68
  [INFO|tokenization_utils_base.py:2497] 2024-04-26 14:34:12,142 >> Special tokens file saved in ./special_tokens_map.json
69
  [INFO|modelcard.py:450] 2024-04-26 14:34:12,190 >> Dropping the following result as it does not have all the necessary fields:
70
+ {'task': {'name': 'Causal Language Modeling', 'type': 'text-generation'}, 'dataset': {'name': 'HuggingFaceH4/ultrachat_200k', 'type': 'HuggingFaceH4/ultrachat_200k', 'config': 'default', 'split': 'train', 'args': 'default'}}
71
+ 2024-04-26 14:34:15 - INFO - __main__ - *** Training complete ***
wandb/run-20240426_143312-xista79n/files/wandb-summary.json CHANGED
@@ -1 +1 @@
1
- {"train_runtime": 4.2964, "train_samples_per_second": 32452.086, "train_steps_per_second": 126.851, "total_flos": 2.468335550600315e+18, "train_loss": 0.0, "train/epoch": 1.0, "train/global_step": 545, "_timestamp": 1714142026.2625933, "_runtime": 33.74862337112427, "_step": 1, "eval/loss": 1.1554738283157349, "eval/runtime": 30.632, "eval/samples_per_second": 503.754, "eval/steps_per_second": 1.991}
 
1
+ {"train_runtime": 4.2964, "train_samples_per_second": 32452.086, "train_steps_per_second": 126.851, "total_flos": 2.468335550600315e+18, "train_loss": 0.0, "train/epoch": 1.0, "train/global_step": 545, "_timestamp": 1714142026.2625933, "_runtime": 33.74862337112427, "_step": 1, "eval/loss": 1.1554738283157349, "eval/runtime": 30.632, "eval/samples_per_second": 503.754, "eval/steps_per_second": 1.991, "_wandb": {"runtime": 62}}
wandb/run-20240426_143312-xista79n/logs/debug-internal.log CHANGED
@@ -118,3 +118,149 @@
118
  2024-04-26 14:34:11,761 DEBUG HandlerThread:906599 [handler.py:handle_request():146] handle_request: status_report
119
  2024-04-26 14:34:12,893 DEBUG SystemMonitor:906599 [system_monitor.py:_start():172] Starting system metrics aggregation loop
120
  2024-04-26 14:34:12,907 DEBUG SenderThread:906599 [sender.py:send():379] send: stats
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
118
  2024-04-26 14:34:11,761 DEBUG HandlerThread:906599 [handler.py:handle_request():146] handle_request: status_report
119
  2024-04-26 14:34:12,893 DEBUG SystemMonitor:906599 [system_monitor.py:_start():172] Starting system metrics aggregation loop
120
  2024-04-26 14:34:12,907 DEBUG SenderThread:906599 [sender.py:send():379] send: stats
121
+ 2024-04-26 14:34:14,827 INFO Thread-12 :906599 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143312-xista79n/files/output.log
122
+ 2024-04-26 14:34:15,550 DEBUG HandlerThread:906599 [handler.py:handle_request():146] handle_request: stop_status
123
+ 2024-04-26 14:34:15,551 DEBUG SenderThread:906599 [sender.py:send_request():406] send_request: stop_status
124
+ 2024-04-26 14:34:15,553 DEBUG HandlerThread:906599 [handler.py:handle_request():146] handle_request: internal_messages
125
+ 2024-04-26 14:34:15,649 DEBUG SenderThread:906599 [sender.py:send():379] send: exit
126
+ 2024-04-26 14:34:15,649 INFO SenderThread:906599 [sender.py:send_exit():586] handling exit code: 0
127
+ 2024-04-26 14:34:15,649 INFO SenderThread:906599 [sender.py:send_exit():588] handling runtime: 62
128
+ 2024-04-26 14:34:15,651 INFO SenderThread:906599 [sender.py:_save_file():1390] saving file wandb-summary.json with policy end
129
+ 2024-04-26 14:34:15,651 INFO SenderThread:906599 [sender.py:send_exit():594] send defer
130
+ 2024-04-26 14:34:15,651 DEBUG HandlerThread:906599 [handler.py:handle_request():146] handle_request: defer
131
+ 2024-04-26 14:34:15,651 INFO HandlerThread:906599 [handler.py:handle_request_defer():172] handle defer: 0
132
+ 2024-04-26 14:34:15,651 DEBUG SenderThread:906599 [sender.py:send_request():406] send_request: defer
133
+ 2024-04-26 14:34:15,651 INFO SenderThread:906599 [sender.py:send_request_defer():610] handle sender defer: 0
134
+ 2024-04-26 14:34:15,651 INFO SenderThread:906599 [sender.py:transition_state():614] send defer: 1
135
+ 2024-04-26 14:34:15,651 DEBUG HandlerThread:906599 [handler.py:handle_request():146] handle_request: defer
136
+ 2024-04-26 14:34:15,652 INFO HandlerThread:906599 [handler.py:handle_request_defer():172] handle defer: 1
137
+ 2024-04-26 14:34:15,652 DEBUG SenderThread:906599 [sender.py:send_request():406] send_request: defer
138
+ 2024-04-26 14:34:15,652 INFO SenderThread:906599 [sender.py:send_request_defer():610] handle sender defer: 1
139
+ 2024-04-26 14:34:15,652 INFO SenderThread:906599 [sender.py:transition_state():614] send defer: 2
140
+ 2024-04-26 14:34:15,652 DEBUG HandlerThread:906599 [handler.py:handle_request():146] handle_request: defer
141
+ 2024-04-26 14:34:15,652 INFO HandlerThread:906599 [handler.py:handle_request_defer():172] handle defer: 2
142
+ 2024-04-26 14:34:15,652 INFO HandlerThread:906599 [system_monitor.py:finish():203] Stopping system monitor
143
+ 2024-04-26 14:34:15,652 DEBUG SystemMonitor:906599 [system_monitor.py:_start():179] Finished system metrics aggregation loop
144
+ 2024-04-26 14:34:15,653 DEBUG SystemMonitor:906599 [system_monitor.py:_start():183] Publishing last batch of metrics
145
+ 2024-04-26 14:34:15,653 INFO HandlerThread:906599 [interfaces.py:finish():202] Joined cpu monitor
146
+ 2024-04-26 14:34:15,654 INFO HandlerThread:906599 [interfaces.py:finish():202] Joined disk monitor
147
+ 2024-04-26 14:34:15,692 INFO HandlerThread:906599 [interfaces.py:finish():202] Joined gpu monitor
148
+ 2024-04-26 14:34:15,692 INFO HandlerThread:906599 [interfaces.py:finish():202] Joined memory monitor
149
+ 2024-04-26 14:34:15,692 INFO HandlerThread:906599 [interfaces.py:finish():202] Joined network monitor
150
+ 2024-04-26 14:34:15,693 DEBUG SenderThread:906599 [sender.py:send_request():406] send_request: defer
151
+ 2024-04-26 14:34:15,693 INFO SenderThread:906599 [sender.py:send_request_defer():610] handle sender defer: 2
152
+ 2024-04-26 14:34:15,693 INFO SenderThread:906599 [sender.py:transition_state():614] send defer: 3
153
+ 2024-04-26 14:34:15,693 DEBUG SenderThread:906599 [sender.py:send():379] send: stats
154
+ 2024-04-26 14:34:15,693 DEBUG HandlerThread:906599 [handler.py:handle_request():146] handle_request: defer
155
+ 2024-04-26 14:34:15,694 INFO HandlerThread:906599 [handler.py:handle_request_defer():172] handle defer: 3
156
+ 2024-04-26 14:34:15,694 DEBUG SenderThread:906599 [sender.py:send_request():406] send_request: defer
157
+ 2024-04-26 14:34:15,694 INFO SenderThread:906599 [sender.py:send_request_defer():610] handle sender defer: 3
158
+ 2024-04-26 14:34:15,694 INFO SenderThread:906599 [sender.py:transition_state():614] send defer: 4
159
+ 2024-04-26 14:34:15,694 DEBUG HandlerThread:906599 [handler.py:handle_request():146] handle_request: defer
160
+ 2024-04-26 14:34:15,694 INFO HandlerThread:906599 [handler.py:handle_request_defer():172] handle defer: 4
161
+ 2024-04-26 14:34:15,694 DEBUG SenderThread:906599 [sender.py:send_request():406] send_request: defer
162
+ 2024-04-26 14:34:15,694 INFO SenderThread:906599 [sender.py:send_request_defer():610] handle sender defer: 4
163
+ 2024-04-26 14:34:15,694 INFO SenderThread:906599 [sender.py:transition_state():614] send defer: 5
164
+ 2024-04-26 14:34:15,694 DEBUG HandlerThread:906599 [handler.py:handle_request():146] handle_request: defer
165
+ 2024-04-26 14:34:15,694 INFO HandlerThread:906599 [handler.py:handle_request_defer():172] handle defer: 5
166
+ 2024-04-26 14:34:15,695 DEBUG SenderThread:906599 [sender.py:send():379] send: summary
167
+ 2024-04-26 14:34:15,696 INFO SenderThread:906599 [sender.py:_save_file():1390] saving file wandb-summary.json with policy end
168
+ 2024-04-26 14:34:15,696 DEBUG SenderThread:906599 [sender.py:send_request():406] send_request: defer
169
+ 2024-04-26 14:34:15,696 INFO SenderThread:906599 [sender.py:send_request_defer():610] handle sender defer: 5
170
+ 2024-04-26 14:34:15,696 INFO SenderThread:906599 [sender.py:transition_state():614] send defer: 6
171
+ 2024-04-26 14:34:15,696 DEBUG HandlerThread:906599 [handler.py:handle_request():146] handle_request: defer
172
+ 2024-04-26 14:34:15,696 INFO HandlerThread:906599 [handler.py:handle_request_defer():172] handle defer: 6
173
+ 2024-04-26 14:34:15,696 DEBUG SenderThread:906599 [sender.py:send_request():406] send_request: defer
174
+ 2024-04-26 14:34:15,697 INFO SenderThread:906599 [sender.py:send_request_defer():610] handle sender defer: 6
175
+ 2024-04-26 14:34:15,699 DEBUG HandlerThread:906599 [handler.py:handle_request():146] handle_request: status_report
176
+ 2024-04-26 14:34:15,829 INFO Thread-12 :906599 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143312-xista79n/files/wandb-summary.json
177
+ 2024-04-26 14:34:15,879 INFO SenderThread:906599 [sender.py:transition_state():614] send defer: 7
178
+ 2024-04-26 14:34:15,879 DEBUG HandlerThread:906599 [handler.py:handle_request():146] handle_request: defer
179
+ 2024-04-26 14:34:15,879 INFO HandlerThread:906599 [handler.py:handle_request_defer():172] handle defer: 7
180
+ 2024-04-26 14:34:15,879 DEBUG SenderThread:906599 [sender.py:send_request():406] send_request: defer
181
+ 2024-04-26 14:34:15,879 INFO SenderThread:906599 [sender.py:send_request_defer():610] handle sender defer: 7
182
+ 2024-04-26 14:34:16,650 DEBUG HandlerThread:906599 [handler.py:handle_request():146] handle_request: poll_exit
183
+ 2024-04-26 14:34:16,831 INFO Thread-12 :906599 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143312-xista79n/files/config.yaml
184
+ 2024-04-26 14:34:17,971 INFO SenderThread:906599 [sender.py:transition_state():614] send defer: 8
185
+ 2024-04-26 14:34:17,972 DEBUG SenderThread:906599 [sender.py:send_request():406] send_request: poll_exit
186
+ 2024-04-26 14:34:17,972 DEBUG HandlerThread:906599 [handler.py:handle_request():146] handle_request: defer
187
+ 2024-04-26 14:34:17,972 INFO HandlerThread:906599 [handler.py:handle_request_defer():172] handle defer: 8
188
+ 2024-04-26 14:34:17,972 DEBUG SenderThread:906599 [sender.py:send_request():406] send_request: defer
189
+ 2024-04-26 14:34:17,972 INFO SenderThread:906599 [sender.py:send_request_defer():610] handle sender defer: 8
190
+ 2024-04-26 14:34:17,972 INFO SenderThread:906599 [job_builder.py:build():318] Attempting to build job artifact
191
+ 2024-04-26 14:34:17,973 INFO SenderThread:906599 [job_builder.py:_get_source_type():455] is repo sourced job
192
+ 2024-04-26 14:34:18,007 INFO SenderThread:906599 [job_builder.py:build():431] adding wandb-job metadata file
193
+ 2024-04-26 14:34:18,017 INFO SenderThread:906599 [sender.py:transition_state():614] send defer: 9
194
+ 2024-04-26 14:34:18,017 DEBUG SenderThread:906599 [sender.py:send():379] send: artifact
195
+ 2024-04-26 14:34:18,017 DEBUG HandlerThread:906599 [handler.py:handle_request():146] handle_request: defer
196
+ 2024-04-26 14:34:18,018 INFO HandlerThread:906599 [handler.py:handle_request_defer():172] handle defer: 9
197
+ 2024-04-26 14:34:18,295 INFO SenderThread:906599 [sender.py:send_artifact():1468] sent artifact job-https___huggingface.co_sanchit-gandhi_distil-zephyr-1.5b-ssft-ultrachat-200k_run_sft.py - {'id': 'QXJ0aWZhY3Q6ODExNzUzODM5', 'state': 'COMMITTED', 'artifactSequence': {'id': 'QXJ0aWZhY3RDb2xsZWN0aW9uOjE2NzA4NDIwMg==', 'latestArtifact': {'id': 'QXJ0aWZhY3Q6ODExNzUzODM5', 'versionIndex': 1}}}
198
+ 2024-04-26 14:34:18,295 DEBUG SenderThread:906599 [sender.py:send_request():406] send_request: defer
199
+ 2024-04-26 14:34:18,296 INFO SenderThread:906599 [sender.py:send_request_defer():610] handle sender defer: 9
200
+ 2024-04-26 14:34:18,296 INFO SenderThread:906599 [dir_watcher.py:finish():358] shutting down directory watcher
201
+ 2024-04-26 14:34:18,651 DEBUG HandlerThread:906599 [handler.py:handle_request():146] handle_request: poll_exit
202
+ 2024-04-26 14:34:18,833 INFO SenderThread:906599 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143312-xista79n/files/output.log
203
+ 2024-04-26 14:34:18,834 INFO SenderThread:906599 [dir_watcher.py:finish():388] scan: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143312-xista79n/files
204
+ 2024-04-26 14:34:18,835 INFO SenderThread:906599 [dir_watcher.py:finish():402] scan save: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143312-xista79n/files/conda-environment.yaml conda-environment.yaml
205
+ 2024-04-26 14:34:18,835 INFO SenderThread:906599 [dir_watcher.py:finish():402] scan save: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143312-xista79n/files/output.log output.log
206
+ 2024-04-26 14:34:18,835 INFO SenderThread:906599 [dir_watcher.py:finish():402] scan save: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143312-xista79n/files/wandb-metadata.json wandb-metadata.json
207
+ 2024-04-26 14:34:18,835 INFO SenderThread:906599 [dir_watcher.py:finish():402] scan save: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143312-xista79n/files/config.yaml config.yaml
208
+ 2024-04-26 14:34:18,840 INFO SenderThread:906599 [dir_watcher.py:finish():402] scan save: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143312-xista79n/files/wandb-summary.json wandb-summary.json
209
+ 2024-04-26 14:34:18,842 INFO SenderThread:906599 [dir_watcher.py:finish():402] scan save: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143312-xista79n/files/requirements.txt requirements.txt
210
+ 2024-04-26 14:34:18,842 INFO SenderThread:906599 [sender.py:transition_state():614] send defer: 10
211
+ 2024-04-26 14:34:18,842 DEBUG SenderThread:906599 [sender.py:send_request():406] send_request: poll_exit
212
+ 2024-04-26 14:34:18,843 DEBUG HandlerThread:906599 [handler.py:handle_request():146] handle_request: defer
213
+ 2024-04-26 14:34:18,844 INFO HandlerThread:906599 [handler.py:handle_request_defer():172] handle defer: 10
214
+ 2024-04-26 14:34:18,846 DEBUG SenderThread:906599 [sender.py:send_request():406] send_request: defer
215
+ 2024-04-26 14:34:18,846 INFO SenderThread:906599 [sender.py:send_request_defer():610] handle sender defer: 10
216
+ 2024-04-26 14:34:18,846 INFO SenderThread:906599 [file_pusher.py:finish():172] shutting down file pusher
217
+ 2024-04-26 14:34:19,015 INFO wandb-upload_0:906599 [upload_job.py:push():131] Uploaded file /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143312-xista79n/files/conda-environment.yaml
218
+ 2024-04-26 14:34:19,096 INFO wandb-upload_1:906599 [upload_job.py:push():131] Uploaded file /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143312-xista79n/files/output.log
219
+ 2024-04-26 14:34:19,149 INFO wandb-upload_4:906599 [upload_job.py:push():131] Uploaded file /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143312-xista79n/files/requirements.txt
220
+ 2024-04-26 14:34:19,179 INFO wandb-upload_2:906599 [upload_job.py:push():131] Uploaded file /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143312-xista79n/files/config.yaml
221
+ 2024-04-26 14:34:19,183 INFO wandb-upload_3:906599 [upload_job.py:push():131] Uploaded file /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143312-xista79n/files/wandb-summary.json
222
+ 2024-04-26 14:34:19,383 INFO Thread-11 (_thread_body):906599 [sender.py:transition_state():614] send defer: 11
223
+ 2024-04-26 14:34:19,383 DEBUG HandlerThread:906599 [handler.py:handle_request():146] handle_request: defer
224
+ 2024-04-26 14:34:19,383 INFO HandlerThread:906599 [handler.py:handle_request_defer():172] handle defer: 11
225
+ 2024-04-26 14:34:19,384 DEBUG SenderThread:906599 [sender.py:send_request():406] send_request: defer
226
+ 2024-04-26 14:34:19,384 INFO SenderThread:906599 [sender.py:send_request_defer():610] handle sender defer: 11
227
+ 2024-04-26 14:34:19,384 INFO SenderThread:906599 [file_pusher.py:join():178] waiting for file pusher
228
+ 2024-04-26 14:34:19,384 INFO SenderThread:906599 [sender.py:transition_state():614] send defer: 12
229
+ 2024-04-26 14:34:19,384 DEBUG HandlerThread:906599 [handler.py:handle_request():146] handle_request: defer
230
+ 2024-04-26 14:34:19,384 INFO HandlerThread:906599 [handler.py:handle_request_defer():172] handle defer: 12
231
+ 2024-04-26 14:34:19,384 DEBUG SenderThread:906599 [sender.py:send_request():406] send_request: defer
232
+ 2024-04-26 14:34:19,385 INFO SenderThread:906599 [sender.py:send_request_defer():610] handle sender defer: 12
233
+ 2024-04-26 14:34:19,385 INFO SenderThread:906599 [file_stream.py:finish():614] file stream finish called
234
+ 2024-04-26 14:34:19,652 DEBUG HandlerThread:906599 [handler.py:handle_request():146] handle_request: poll_exit
235
+ 2024-04-26 14:34:19,660 INFO SenderThread:906599 [file_stream.py:finish():618] file stream finish is done
236
+ 2024-04-26 14:34:19,660 INFO SenderThread:906599 [sender.py:transition_state():614] send defer: 13
237
+ 2024-04-26 14:34:19,660 DEBUG SenderThread:906599 [sender.py:send_request():406] send_request: poll_exit
238
+ 2024-04-26 14:34:19,660 DEBUG HandlerThread:906599 [handler.py:handle_request():146] handle_request: defer
239
+ 2024-04-26 14:34:19,660 INFO HandlerThread:906599 [handler.py:handle_request_defer():172] handle defer: 13
240
+ 2024-04-26 14:34:19,661 DEBUG SenderThread:906599 [sender.py:send_request():406] send_request: defer
241
+ 2024-04-26 14:34:19,661 INFO SenderThread:906599 [sender.py:send_request_defer():610] handle sender defer: 13
242
+ 2024-04-26 14:34:19,661 INFO SenderThread:906599 [sender.py:transition_state():614] send defer: 14
243
+ 2024-04-26 14:34:19,661 DEBUG HandlerThread:906599 [handler.py:handle_request():146] handle_request: defer
244
+ 2024-04-26 14:34:19,661 DEBUG SenderThread:906599 [sender.py:send():379] send: final
245
+ 2024-04-26 14:34:19,661 INFO HandlerThread:906599 [handler.py:handle_request_defer():172] handle defer: 14
246
+ 2024-04-26 14:34:19,661 DEBUG SenderThread:906599 [sender.py:send():379] send: footer
247
+ 2024-04-26 14:34:19,662 DEBUG SenderThread:906599 [sender.py:send_request():406] send_request: defer
248
+ 2024-04-26 14:34:19,662 INFO SenderThread:906599 [sender.py:send_request_defer():610] handle sender defer: 14
249
+ 2024-04-26 14:34:19,662 DEBUG HandlerThread:906599 [handler.py:handle_request():146] handle_request: poll_exit
250
+ 2024-04-26 14:34:19,662 DEBUG HandlerThread:906599 [handler.py:handle_request():146] handle_request: poll_exit
251
+ 2024-04-26 14:34:19,662 DEBUG SenderThread:906599 [sender.py:send_request():406] send_request: poll_exit
252
+ 2024-04-26 14:34:19,663 DEBUG HandlerThread:906599 [handler.py:handle_request():146] handle_request: server_info
253
+ 2024-04-26 14:34:19,663 DEBUG SenderThread:906599 [sender.py:send_request():406] send_request: poll_exit
254
+ 2024-04-26 14:34:19,663 DEBUG HandlerThread:906599 [handler.py:handle_request():146] handle_request: get_summary
255
+ 2024-04-26 14:34:19,663 DEBUG SenderThread:906599 [sender.py:send_request():406] send_request: server_info
256
+ 2024-04-26 14:34:19,664 DEBUG HandlerThread:906599 [handler.py:handle_request():146] handle_request: sampled_history
257
+ 2024-04-26 14:34:19,665 DEBUG HandlerThread:906599 [handler.py:handle_request():146] handle_request: internal_messages
258
+ 2024-04-26 14:34:19,722 INFO MainThread:906599 [wandb_run.py:_footer_history_summary_info():3936] rendering history
259
+ 2024-04-26 14:34:19,723 INFO MainThread:906599 [wandb_run.py:_footer_history_summary_info():3968] rendering summary
260
+ 2024-04-26 14:34:19,723 INFO MainThread:906599 [wandb_run.py:_footer_sync_info():3895] logging synced files
261
+ 2024-04-26 14:34:19,723 DEBUG HandlerThread:906599 [handler.py:handle_request():146] handle_request: shutdown
262
+ 2024-04-26 14:34:19,723 INFO HandlerThread:906599 [handler.py:finish():866] shutting down handler
263
+ 2024-04-26 14:34:20,663 INFO WriterThread:906599 [datastore.py:close():296] close: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143312-xista79n/run-xista79n.wandb
264
+ 2024-04-26 14:34:20,722 INFO SenderThread:906599 [sender.py:finish():1546] shutting down sender
265
+ 2024-04-26 14:34:20,722 INFO SenderThread:906599 [file_pusher.py:finish():172] shutting down file pusher
266
+ 2024-04-26 14:34:20,722 INFO SenderThread:906599 [file_pusher.py:join():178] waiting for file pusher
wandb/run-20240426_143312-xista79n/logs/debug.log CHANGED
@@ -26,3 +26,4 @@ config: {}
26
  2024-04-26 14:33:15,550 INFO MainThread:906132 [wandb_run.py:_redirect():2280] Redirects installed.
27
  2024-04-26 14:33:15,551 INFO MainThread:906132 [wandb_init.py:init():842] run started, returning control to user process
28
  2024-04-26 14:33:15,552 INFO MainThread:906132 [wandb_run.py:_config_callback():1347] config_cb None None {'vocab_size': 32000, 'max_position_embeddings': 32768, 'hidden_size': 4096, 'intermediate_size': 14336, 'num_hidden_layers': 6, 'num_attention_heads': 32, 'sliding_window': 4096, 'num_key_value_heads': 8, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-05, 'use_cache': False, 'rope_theta': 10000.0, 'attention_dropout': 0.0, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'bfloat16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': False, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['MistralForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 1, 'pad_token_id': None, 'eos_token_id': 2, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': 'sanchit-gandhi/distil-zephyr-1.5b-ssft-ultrachat', 'transformers_version': '4.40.1', 'model_type': 'mistral', 'output_dir': './', 
'overwrite_output_dir': True, 'do_train': False, 'do_eval': True, 'do_predict': False, 'evaluation_strategy': 'epoch', 'prediction_loss_only': False, 'per_device_train_batch_size': 32, 'per_device_eval_batch_size': 32, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'eval_delay': 0, 'learning_rate': 0.0001, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 1, 'max_steps': -1, 'lr_scheduler_type': 'linear', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.0, 'warmup_steps': 500, 'log_level': 'info', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': './runs/Apr26_14-32-47_ip-26-0-165-24', 'logging_strategy': 'steps', 'logging_first_step': True, 'logging_steps': 25, 'logging_nan_inf_filter': True, 'save_strategy': 'epoch', 'save_steps': 500, 'save_total_limit': 1, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': True, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': './', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 
'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'gradient_accumulation_kwargs': None}, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['tensorboard', 'wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': True, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': True, 'gradient_checkpointing_kwargs': {'use_reentrant': False}, 'include_inputs_for_metrics': False, 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '<PUSH_TO_HUB_TOKEN>', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 7200, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'max_seq_length': 2048}
 
 
26
  2024-04-26 14:33:15,550 INFO MainThread:906132 [wandb_run.py:_redirect():2280] Redirects installed.
27
  2024-04-26 14:33:15,551 INFO MainThread:906132 [wandb_init.py:init():842] run started, returning control to user process
28
  2024-04-26 14:33:15,552 INFO MainThread:906132 [wandb_run.py:_config_callback():1347] config_cb None None {'vocab_size': 32000, 'max_position_embeddings': 32768, 'hidden_size': 4096, 'intermediate_size': 14336, 'num_hidden_layers': 6, 'num_attention_heads': 32, 'sliding_window': 4096, 'num_key_value_heads': 8, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-05, 'use_cache': False, 'rope_theta': 10000.0, 'attention_dropout': 0.0, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'bfloat16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': False, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['MistralForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 1, 'pad_token_id': None, 'eos_token_id': 2, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': 'sanchit-gandhi/distil-zephyr-1.5b-ssft-ultrachat', 'transformers_version': '4.40.1', 'model_type': 'mistral', 'output_dir': './', 
'overwrite_output_dir': True, 'do_train': False, 'do_eval': True, 'do_predict': False, 'evaluation_strategy': 'epoch', 'prediction_loss_only': False, 'per_device_train_batch_size': 32, 'per_device_eval_batch_size': 32, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'eval_delay': 0, 'learning_rate': 0.0001, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 1, 'max_steps': -1, 'lr_scheduler_type': 'linear', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.0, 'warmup_steps': 500, 'log_level': 'info', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': './runs/Apr26_14-32-47_ip-26-0-165-24', 'logging_strategy': 'steps', 'logging_first_step': True, 'logging_steps': 25, 'logging_nan_inf_filter': True, 'save_strategy': 'epoch', 'save_steps': 500, 'save_total_limit': 1, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': True, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': './', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 
'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'gradient_accumulation_kwargs': None}, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['tensorboard', 'wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': True, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': True, 'gradient_checkpointing_kwargs': {'use_reentrant': False}, 'include_inputs_for_metrics': False, 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '<PUSH_TO_HUB_TOKEN>', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 7200, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'max_seq_length': 2048}
29
+ 2024-04-26 14:34:20,724 WARNING MsgRouterThr:906132 [router.py:message_loop():77] message_loop has been closed
wandb/run-20240426_143312-xista79n/run-xista79n.wandb CHANGED
Binary files a/wandb/run-20240426_143312-xista79n/run-xista79n.wandb and b/wandb/run-20240426_143312-xista79n/run-xista79n.wandb differ
 
wandb/run-20240426_143501-slqyh8h3/files/conda-environment.yaml ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: alignment
2
+ channels:
3
+ - defaults
4
+ dependencies:
5
+ - _libgcc_mutex=0.1=main
6
+ - _openmp_mutex=5.1=1_gnu
7
+ - bzip2=1.0.8=h5eee18b_5
8
+ - ca-certificates=2024.3.11=h06a4308_0
9
+ - ld_impl_linux-64=2.38=h1181459_1
10
+ - libffi=3.4.4=h6a678d5_0
11
+ - libgcc-ng=11.2.0=h1234567_1
12
+ - libgomp=11.2.0=h1234567_1
13
+ - libstdcxx-ng=11.2.0=h1234567_1
14
+ - libuuid=1.41.5=h5eee18b_0
15
+ - ncurses=6.4=h6a678d5_0
16
+ - openssl=3.0.13=h7f8727e_0
17
+ - pip=23.3.1=py311h06a4308_0
18
+ - python=3.11.9=h955ad1f_0
19
+ - readline=8.2=h5eee18b_0
20
+ - setuptools=68.2.2=py311h06a4308_0
21
+ - sqlite=3.41.2=h5eee18b_0
22
+ - tk=8.6.12=h1ccaba5_0
23
+ - wheel=0.41.2=py311h06a4308_0
24
+ - xz=5.4.6=h5eee18b_0
25
+ - zlib=1.2.13=h5eee18b_0
26
+ - pip:
27
+ - absl-py==2.1.0
28
+ - accelerate==0.29.3
29
+ - aiohttp==3.9.5
30
+ - aiosignal==1.3.1
31
+ - annotated-types==0.6.0
32
+ - appdirs==1.4.4
33
+ - attrs==23.2.0
34
+ - bitsandbytes==0.43.1
35
+ - certifi==2024.2.2
36
+ - charset-normalizer==3.3.2
37
+ - click==8.1.7
38
+ - datasets==2.19.0
39
+ - deepspeed==0.14.2
40
+ - dill==0.3.8
41
+ - docker-pycreds==0.4.0
42
+ - docstring-parser==0.16
43
+ - einops==0.7.0
44
+ - evaluate==0.4.1
45
+ - filelock==3.13.4
46
+ - frozenlist==1.4.1
47
+ - fsspec==2024.3.1
48
+ - gitdb==4.0.11
49
+ - gitpython==3.1.43
50
+ - grpcio==1.62.2
51
+ - hf-transfer==0.1.6
52
+ - hjson==3.1.0
53
+ - huggingface-hub==0.22.2
54
+ - idna==3.7
55
+ - jinja2==3.1.3
56
+ - markdown==3.6
57
+ - markdown-it-py==3.0.0
58
+ - markupsafe==2.1.5
59
+ - mdurl==0.1.2
60
+ - mpmath==1.3.0
61
+ - multidict==6.0.5
62
+ - multiprocess==0.70.16
63
+ - networkx==3.3
64
+ - ninja==1.11.1.1
65
+ - numpy==1.26.4
66
+ - nvidia-cublas-cu12==12.1.3.1
67
+ - nvidia-cuda-cupti-cu12==12.1.105
68
+ - nvidia-cuda-nvrtc-cu12==12.1.105
69
+ - nvidia-cuda-runtime-cu12==12.1.105
70
+ - nvidia-cudnn-cu12==8.9.2.26
71
+ - nvidia-cufft-cu12==11.0.2.54
72
+ - nvidia-curand-cu12==10.3.2.106
73
+ - nvidia-cusolver-cu12==11.4.5.107
74
+ - nvidia-cusparse-cu12==12.1.0.106
75
+ - nvidia-nccl-cu12==2.19.3
76
+ - nvidia-nvjitlink-cu12==12.4.127
77
+ - nvidia-nvtx-cu12==12.1.105
78
+ - packaging==24.0
79
+ - pandas==2.2.2
80
+ - peft==0.10.0
81
+ - pillow==10.3.0
82
+ - protobuf==3.20.2
83
+ - psutil==5.9.8
84
+ - py-cpuinfo==9.0.0
85
+ - pyarrow==16.0.0
86
+ - pyarrow-hotfix==0.6
87
+ - pydantic==2.7.1
88
+ - pydantic-core==2.18.2
89
+ - pygments==2.17.2
90
+ - pynvml==11.5.0
91
+ - python-dateutil==2.9.0.post0
92
+ - pytz==2024.1
93
+ - pyyaml==6.0.1
94
+ - regex==2024.4.16
95
+ - requests==2.31.0
96
+ - responses==0.18.0
97
+ - rich==13.7.1
98
+ - safetensors==0.4.3
99
+ - scipy==1.13.0
100
+ - sentencepiece==0.2.0
101
+ - sentry-sdk==2.0.0
102
+ - setproctitle==1.3.3
103
+ - shtab==1.7.1
104
+ - six==1.16.0
105
+ - smmap==5.0.1
106
+ - sympy==1.12
107
+ - tensorboard==2.16.2
108
+ - tensorboard-data-server==0.7.2
109
+ - tokenizers==0.19.1
110
+ - torch==2.2.2
111
+ - torchaudio==2.2.2
112
+ - torchvision==0.17.2
113
+ - tqdm==4.66.2
114
+ - transformers==4.40.1
115
+ - triton==2.2.0
116
+ - trl==0.8.6
117
+ - typing-extensions==4.11.0
118
+ - tyro==0.8.3
119
+ - tzdata==2024.1
120
+ - urllib3==2.2.1
121
+ - wandb==0.16.6
122
+ - werkzeug==3.0.2
123
+ - xxhash==3.4.1
124
+ - yarl==1.9.4
125
+ prefix: /fsx/sanchit/miniconda3/envs/alignment
wandb/run-20240426_143501-slqyh8h3/files/config.yaml ADDED
@@ -0,0 +1,647 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ wandb_version: 1
2
+
3
+ _wandb:
4
+ desc: null
5
+ value:
6
+ python_version: 3.11.9
7
+ cli_version: 0.16.6
8
+ framework: huggingface
9
+ huggingface_version: 4.40.1
10
+ is_jupyter_run: false
11
+ is_kaggle_kernel: false
12
+ start_time: 1714142101.0
13
+ t:
14
+ 1:
15
+ - 1
16
+ - 11
17
+ - 49
18
+ - 51
19
+ - 55
20
+ - 71
21
+ - 84
22
+ - 98
23
+ 2:
24
+ - 1
25
+ - 11
26
+ - 49
27
+ - 51
28
+ - 55
29
+ - 71
30
+ - 84
31
+ - 98
32
+ 3:
33
+ - 7
34
+ - 23
35
+ - 62
36
+ 4: 3.11.9
37
+ 5: 0.16.6
38
+ 6: 4.40.1
39
+ 8:
40
+ - 5
41
+ 9:
42
+ 1: transformers_trainer
43
+ 13: linux-x86_64
44
+ m:
45
+ - 1: train/global_step
46
+ 6:
47
+ - 3
48
+ - 1: train/epoch
49
+ 5: 1
50
+ 6:
51
+ - 1
52
+ vocab_size:
53
+ desc: null
54
+ value: 32000
55
+ max_position_embeddings:
56
+ desc: null
57
+ value: 32768
58
+ hidden_size:
59
+ desc: null
60
+ value: 4096
61
+ intermediate_size:
62
+ desc: null
63
+ value: 14336
64
+ num_hidden_layers:
65
+ desc: null
66
+ value: 6
67
+ num_attention_heads:
68
+ desc: null
69
+ value: 32
70
+ sliding_window:
71
+ desc: null
72
+ value: 4096
73
+ num_key_value_heads:
74
+ desc: null
75
+ value: 8
76
+ hidden_act:
77
+ desc: null
78
+ value: silu
79
+ initializer_range:
80
+ desc: null
81
+ value: 0.02
82
+ rms_norm_eps:
83
+ desc: null
84
+ value: 1.0e-05
85
+ use_cache:
86
+ desc: null
87
+ value: false
88
+ rope_theta:
89
+ desc: null
90
+ value: 10000.0
91
+ attention_dropout:
92
+ desc: null
93
+ value: 0.0
94
+ return_dict:
95
+ desc: null
96
+ value: true
97
+ output_hidden_states:
98
+ desc: null
99
+ value: false
100
+ output_attentions:
101
+ desc: null
102
+ value: false
103
+ torchscript:
104
+ desc: null
105
+ value: false
106
+ torch_dtype:
107
+ desc: null
108
+ value: bfloat16
109
+ use_bfloat16:
110
+ desc: null
111
+ value: false
112
+ tf_legacy_loss:
113
+ desc: null
114
+ value: false
115
+ pruned_heads:
116
+ desc: null
117
+ value: {}
118
+ tie_word_embeddings:
119
+ desc: null
120
+ value: false
121
+ chunk_size_feed_forward:
122
+ desc: null
123
+ value: 0
124
+ is_encoder_decoder:
125
+ desc: null
126
+ value: false
127
+ is_decoder:
128
+ desc: null
129
+ value: false
130
+ cross_attention_hidden_size:
131
+ desc: null
132
+ value: null
133
+ add_cross_attention:
134
+ desc: null
135
+ value: false
136
+ tie_encoder_decoder:
137
+ desc: null
138
+ value: false
139
+ max_length:
140
+ desc: null
141
+ value: 20
142
+ min_length:
143
+ desc: null
144
+ value: 0
145
+ do_sample:
146
+ desc: null
147
+ value: false
148
+ early_stopping:
149
+ desc: null
150
+ value: false
151
+ num_beams:
152
+ desc: null
153
+ value: 1
154
+ num_beam_groups:
155
+ desc: null
156
+ value: 1
157
+ diversity_penalty:
158
+ desc: null
159
+ value: 0.0
160
+ temperature:
161
+ desc: null
162
+ value: 1.0
163
+ top_k:
164
+ desc: null
165
+ value: 50
166
+ top_p:
167
+ desc: null
168
+ value: 1.0
169
+ typical_p:
170
+ desc: null
171
+ value: 1.0
172
+ repetition_penalty:
173
+ desc: null
174
+ value: 1.0
175
+ length_penalty:
176
+ desc: null
177
+ value: 1.0
178
+ no_repeat_ngram_size:
179
+ desc: null
180
+ value: 0
181
+ encoder_no_repeat_ngram_size:
182
+ desc: null
183
+ value: 0
184
+ bad_words_ids:
185
+ desc: null
186
+ value: null
187
+ num_return_sequences:
188
+ desc: null
189
+ value: 1
190
+ output_scores:
191
+ desc: null
192
+ value: false
193
+ return_dict_in_generate:
194
+ desc: null
195
+ value: false
196
+ forced_bos_token_id:
197
+ desc: null
198
+ value: null
199
+ forced_eos_token_id:
200
+ desc: null
201
+ value: null
202
+ remove_invalid_values:
203
+ desc: null
204
+ value: false
205
+ exponential_decay_length_penalty:
206
+ desc: null
207
+ value: null
208
+ suppress_tokens:
209
+ desc: null
210
+ value: null
211
+ begin_suppress_tokens:
212
+ desc: null
213
+ value: null
214
+ architectures:
215
+ desc: null
216
+ value:
217
+ - MistralForCausalLM
218
+ finetuning_task:
219
+ desc: null
220
+ value: null
221
+ id2label:
222
+ desc: null
223
+ value:
224
+ '0': LABEL_0
225
+ '1': LABEL_1
226
+ label2id:
227
+ desc: null
228
+ value:
229
+ LABEL_0: 0
230
+ LABEL_1: 1
231
+ tokenizer_class:
232
+ desc: null
233
+ value: null
234
+ prefix:
235
+ desc: null
236
+ value: null
237
+ bos_token_id:
238
+ desc: null
239
+ value: 1
240
+ pad_token_id:
241
+ desc: null
242
+ value: null
243
+ eos_token_id:
244
+ desc: null
245
+ value: 2
246
+ sep_token_id:
247
+ desc: null
248
+ value: null
249
+ decoder_start_token_id:
250
+ desc: null
251
+ value: null
252
+ task_specific_params:
253
+ desc: null
254
+ value: null
255
+ problem_type:
256
+ desc: null
257
+ value: null
258
+ _name_or_path:
259
+ desc: null
260
+ value: sanchit-gandhi/distil-zephyr-1.5b-ssft-ultrachat
261
+ transformers_version:
262
+ desc: null
263
+ value: 4.40.1
264
+ model_type:
265
+ desc: null
266
+ value: mistral
267
+ output_dir:
268
+ desc: null
269
+ value: ./
270
+ overwrite_output_dir:
271
+ desc: null
272
+ value: true
273
+ do_train:
274
+ desc: null
275
+ value: false
276
+ do_eval:
277
+ desc: null
278
+ value: true
279
+ do_predict:
280
+ desc: null
281
+ value: false
282
+ evaluation_strategy:
283
+ desc: null
284
+ value: epoch
285
+ prediction_loss_only:
286
+ desc: null
287
+ value: false
288
+ per_device_train_batch_size:
289
+ desc: null
290
+ value: 32
291
+ per_device_eval_batch_size:
292
+ desc: null
293
+ value: 32
294
+ per_gpu_train_batch_size:
295
+ desc: null
296
+ value: null
297
+ per_gpu_eval_batch_size:
298
+ desc: null
299
+ value: null
300
+ gradient_accumulation_steps:
301
+ desc: null
302
+ value: 1
303
+ eval_accumulation_steps:
304
+ desc: null
305
+ value: null
306
+ eval_delay:
307
+ desc: null
308
+ value: 0
309
+ learning_rate:
310
+ desc: null
311
+ value: 0.0001
312
+ weight_decay:
313
+ desc: null
314
+ value: 0.0
315
+ adam_beta1:
316
+ desc: null
317
+ value: 0.9
318
+ adam_beta2:
319
+ desc: null
320
+ value: 0.999
321
+ adam_epsilon:
322
+ desc: null
323
+ value: 1.0e-08
324
+ max_grad_norm:
325
+ desc: null
326
+ value: 1.0
327
+ num_train_epochs:
328
+ desc: null
329
+ value: 1
330
+ max_steps:
331
+ desc: null
332
+ value: -1
333
+ lr_scheduler_type:
334
+ desc: null
335
+ value: linear
336
+ lr_scheduler_kwargs:
337
+ desc: null
338
+ value: {}
339
+ warmup_ratio:
340
+ desc: null
341
+ value: 0.0
342
+ warmup_steps:
343
+ desc: null
344
+ value: 500
345
+ log_level:
346
+ desc: null
347
+ value: info
348
+ log_level_replica:
349
+ desc: null
350
+ value: warning
351
+ log_on_each_node:
352
+ desc: null
353
+ value: true
354
+ logging_dir:
355
+ desc: null
356
+ value: ./runs/Apr26_14-34-38_ip-26-0-165-24
357
+ logging_strategy:
358
+ desc: null
359
+ value: steps
360
+ logging_first_step:
361
+ desc: null
362
+ value: true
363
+ logging_steps:
364
+ desc: null
365
+ value: 25
366
+ logging_nan_inf_filter:
367
+ desc: null
368
+ value: true
369
+ save_strategy:
370
+ desc: null
371
+ value: epoch
372
+ save_steps:
373
+ desc: null
374
+ value: 500
375
+ save_total_limit:
376
+ desc: null
377
+ value: 1
378
+ save_safetensors:
379
+ desc: null
380
+ value: true
381
+ save_on_each_node:
382
+ desc: null
383
+ value: false
384
+ save_only_model:
385
+ desc: null
386
+ value: false
387
+ no_cuda:
388
+ desc: null
389
+ value: false
390
+ use_cpu:
391
+ desc: null
392
+ value: false
393
+ use_mps_device:
394
+ desc: null
395
+ value: false
396
+ seed:
397
+ desc: null
398
+ value: 42
399
+ data_seed:
400
+ desc: null
401
+ value: null
402
+ jit_mode_eval:
403
+ desc: null
404
+ value: false
405
+ use_ipex:
406
+ desc: null
407
+ value: false
408
+ bf16:
409
+ desc: null
410
+ value: true
411
+ fp16:
412
+ desc: null
413
+ value: false
414
+ fp16_opt_level:
415
+ desc: null
416
+ value: O1
417
+ half_precision_backend:
418
+ desc: null
419
+ value: auto
420
+ bf16_full_eval:
421
+ desc: null
422
+ value: false
423
+ fp16_full_eval:
424
+ desc: null
425
+ value: false
426
+ tf32:
427
+ desc: null
428
+ value: null
429
+ local_rank:
430
+ desc: null
431
+ value: 0
432
+ ddp_backend:
433
+ desc: null
434
+ value: null
435
+ tpu_num_cores:
436
+ desc: null
437
+ value: null
438
+ tpu_metrics_debug:
439
+ desc: null
440
+ value: false
441
+ debug:
442
+ desc: null
443
+ value: []
444
+ dataloader_drop_last:
445
+ desc: null
446
+ value: false
447
+ eval_steps:
448
+ desc: null
449
+ value: null
450
+ dataloader_num_workers:
451
+ desc: null
452
+ value: 0
453
+ dataloader_prefetch_factor:
454
+ desc: null
455
+ value: null
456
+ past_index:
457
+ desc: null
458
+ value: -1
459
+ run_name:
460
+ desc: null
461
+ value: ./
462
+ disable_tqdm:
463
+ desc: null
464
+ value: false
465
+ remove_unused_columns:
466
+ desc: null
467
+ value: true
468
+ label_names:
469
+ desc: null
470
+ value: null
471
+ load_best_model_at_end:
472
+ desc: null
473
+ value: false
474
+ metric_for_best_model:
475
+ desc: null
476
+ value: null
477
+ greater_is_better:
478
+ desc: null
479
+ value: null
480
+ ignore_data_skip:
481
+ desc: null
482
+ value: false
483
+ fsdp:
484
+ desc: null
485
+ value: []
486
+ fsdp_min_num_params:
487
+ desc: null
488
+ value: 0
489
+ fsdp_config:
490
+ desc: null
491
+ value:
492
+ min_num_params: 0
493
+ xla: false
494
+ xla_fsdp_v2: false
495
+ xla_fsdp_grad_ckpt: false
496
+ fsdp_transformer_layer_cls_to_wrap:
497
+ desc: null
498
+ value: null
499
+ accelerator_config:
500
+ desc: null
501
+ value:
502
+ split_batches: false
503
+ dispatch_batches: null
504
+ even_batches: true
505
+ use_seedable_sampler: true
506
+ gradient_accumulation_kwargs: null
507
+ deepspeed:
508
+ desc: null
509
+ value: null
510
+ label_smoothing_factor:
511
+ desc: null
512
+ value: 0.0
513
+ optim:
514
+ desc: null
515
+ value: adamw_torch
516
+ optim_args:
517
+ desc: null
518
+ value: null
519
+ adafactor:
520
+ desc: null
521
+ value: false
522
+ group_by_length:
523
+ desc: null
524
+ value: false
525
+ length_column_name:
526
+ desc: null
527
+ value: length
528
+ report_to:
529
+ desc: null
530
+ value:
531
+ - tensorboard
532
+ - wandb
533
+ ddp_find_unused_parameters:
534
+ desc: null
535
+ value: null
536
+ ddp_bucket_cap_mb:
537
+ desc: null
538
+ value: null
539
+ ddp_broadcast_buffers:
540
+ desc: null
541
+ value: null
542
+ dataloader_pin_memory:
543
+ desc: null
544
+ value: true
545
+ dataloader_persistent_workers:
546
+ desc: null
547
+ value: false
548
+ skip_memory_metrics:
549
+ desc: null
550
+ value: true
551
+ use_legacy_prediction_loop:
552
+ desc: null
553
+ value: false
554
+ push_to_hub:
555
+ desc: null
556
+ value: true
557
+ resume_from_checkpoint:
558
+ desc: null
559
+ value: null
560
+ hub_model_id:
561
+ desc: null
562
+ value: null
563
+ hub_strategy:
564
+ desc: null
565
+ value: every_save
566
+ hub_token:
567
+ desc: null
568
+ value: <HUB_TOKEN>
569
+ hub_private_repo:
570
+ desc: null
571
+ value: false
572
+ hub_always_push:
573
+ desc: null
574
+ value: false
575
+ gradient_checkpointing:
576
+ desc: null
577
+ value: true
578
+ gradient_checkpointing_kwargs:
579
+ desc: null
580
+ value:
581
+ use_reentrant: false
582
+ include_inputs_for_metrics:
583
+ desc: null
584
+ value: false
585
+ eval_do_concat_batches:
586
+ desc: null
587
+ value: true
588
+ fp16_backend:
589
+ desc: null
590
+ value: auto
591
+ push_to_hub_model_id:
592
+ desc: null
593
+ value: null
594
+ push_to_hub_organization:
595
+ desc: null
596
+ value: null
597
+ push_to_hub_token:
598
+ desc: null
599
+ value: <PUSH_TO_HUB_TOKEN>
600
+ mp_parameters:
601
+ desc: null
602
+ value: ''
603
+ auto_find_batch_size:
604
+ desc: null
605
+ value: false
606
+ full_determinism:
607
+ desc: null
608
+ value: false
609
+ torchdynamo:
610
+ desc: null
611
+ value: null
612
+ ray_scope:
613
+ desc: null
614
+ value: last
615
+ ddp_timeout:
616
+ desc: null
617
+ value: 7200
618
+ torch_compile:
619
+ desc: null
620
+ value: false
621
+ torch_compile_backend:
622
+ desc: null
623
+ value: null
624
+ torch_compile_mode:
625
+ desc: null
626
+ value: null
627
+ dispatch_batches:
628
+ desc: null
629
+ value: null
630
+ split_batches:
631
+ desc: null
632
+ value: null
633
+ include_tokens_per_second:
634
+ desc: null
635
+ value: false
636
+ include_num_input_tokens_seen:
637
+ desc: null
638
+ value: false
639
+ neftune_noise_alpha:
640
+ desc: null
641
+ value: null
642
+ optim_target_modules:
643
+ desc: null
644
+ value: null
645
+ max_seq_length:
646
+ desc: null
647
+ value: 2048
wandb/run-20240426_143501-slqyh8h3/files/output.log ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 0%| | 0/545 [00:00<?, ?it/s][INFO|trainer.py:2316] 2024-04-26 14:35:05,009 >>
2
+ Training completed. Do not forget to share your model on huggingface.co/models =)
3
+ 0%| | 0/545 [00:00<?, ?it/s]
4
+ [INFO|trainer.py:3614] 2024-04-26 14:35:05,045 >> ***** Running Evaluation *****
5
+ [INFO|trainer.py:3616] 2024-04-26 14:35:05,045 >> Num examples = 15431
6
+ [INFO|trainer.py:3619] 2024-04-26 14:35:05,045 >> Batch size = 32
7
+ 3%|▎ | 2/61 [00:00<00:04, 13.10it/s]
8
+ {'train_runtime': 4.301, 'train_samples_per_second': 32416.867, 'train_steps_per_second': 126.714, 'train_loss': 0.0, 'epoch': 1.0}
9
+ ***** train metrics *****
10
+ epoch = 1.0
11
+ total_flos = 2298816620GF
12
+ train_loss = 0.0
13
+ train_runtime = 0:00:04.30
14
+ train_samples = 207865
15
+ train_samples_per_second = 32416.867
16
+ train_steps_per_second = 126.714
17
+
18
+
19
+
20
+
21
+
22
+
23
+
24
+
25
+
26
+
27
+
28
+
29
+
30
+
31
+ 97%|█████████▋| 59/61 [00:28<00:00, 2.01it/s]
32
+ ***** eval metrics *****
33
+ epoch = 1.0
34
+ eval_loss = 1.1555
35
+ eval_runtime = 0:00:30.64
36
+ eval_samples = 23110
37
+ eval_samples_per_second = 503.5
38
+ eval_steps_per_second = 1.99
39
+ 100%|██████████| 61/61 [00:29<00:00, 2.05it/s]
40
+ [INFO|trainer.py:3305] 2024-04-26 14:35:35,719 >> Saving model checkpoint to ./
41
+ [INFO|configuration_utils.py:471] 2024-04-26 14:35:35,721 >> Configuration saved in ./config.json
42
+ [INFO|configuration_utils.py:697] 2024-04-26 14:35:35,723 >> Configuration saved in ./generation_config.json
43
+ [INFO|modeling_utils.py:2590] 2024-04-26 14:35:40,968 >> Model weights saved in ./model.safetensors
44
+ [INFO|tokenization_utils_base.py:2488] 2024-04-26 14:35:40,972 >> tokenizer config file saved in ./tokenizer_config.json
45
+ [INFO|tokenization_utils_base.py:2497] 2024-04-26 14:35:40,974 >> Special tokens file saved in ./special_tokens_map.json
46
+ [INFO|trainer.py:3305] 2024-04-26 14:35:41,000 >> Saving model checkpoint to ./
47
+ [INFO|configuration_utils.py:471] 2024-04-26 14:35:41,002 >> Configuration saved in ./config.json
48
+ [INFO|configuration_utils.py:697] 2024-04-26 14:35:41,004 >> Configuration saved in ./generation_config.json
49
+ [INFO|modeling_utils.py:2590] 2024-04-26 14:35:46,351 >> Model weights saved in ./model.safetensors
50
+ [INFO|tokenization_utils_base.py:2488] 2024-04-26 14:35:46,354 >> tokenizer config file saved in ./tokenizer_config.json
51
+ [INFO|tokenization_utils_base.py:2497] 2024-04-26 14:35:46,355 >> Special tokens file saved in ./special_tokens_map.json
52
+ [INFO|modelcard.py:450] 2024-04-26 14:35:46,403 >> Dropping the following result as it does not have all the necessary fields:
wandb/run-20240426_143501-slqyh8h3/files/requirements.txt ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ GitPython==3.1.43
2
+ Jinja2==3.1.3
3
+ Markdown==3.6
4
+ MarkupSafe==2.1.5
5
+ PyYAML==6.0.1
6
+ Pygments==2.17.2
7
+ Werkzeug==3.0.2
8
+ absl-py==2.1.0
9
+ accelerate==0.29.3
10
+ aiohttp==3.9.5
11
+ aiosignal==1.3.1
12
+ alignment-handbook==0.4.0.dev0
13
+ annotated-types==0.6.0
14
+ appdirs==1.4.4
15
+ attrs==23.2.0
16
+ bitsandbytes==0.43.1
17
+ certifi==2024.2.2
18
+ charset-normalizer==3.3.2
19
+ click==8.1.7
20
+ datasets==2.19.0
21
+ deepspeed==0.14.2
22
+ dill==0.3.8
23
+ docker-pycreds==0.4.0
24
+ docstring_parser==0.16
25
+ einops==0.7.0
26
+ evaluate==0.4.1
27
+ filelock==3.13.4
28
+ frozenlist==1.4.1
29
+ fsspec==2024.3.1
30
+ gitdb==4.0.11
31
+ grpcio==1.62.2
32
+ hf_transfer==0.1.6
33
+ hjson==3.1.0
34
+ huggingface-hub==0.22.2
35
+ idna==3.7
36
+ markdown-it-py==3.0.0
37
+ mdurl==0.1.2
38
+ mpmath==1.3.0
39
+ multidict==6.0.5
40
+ multiprocess==0.70.16
41
+ networkx==3.3
42
+ ninja==1.11.1.1
43
+ numpy==1.26.4
44
+ nvidia-cublas-cu12==12.1.3.1
45
+ nvidia-cuda-cupti-cu12==12.1.105
46
+ nvidia-cuda-nvrtc-cu12==12.1.105
47
+ nvidia-cuda-runtime-cu12==12.1.105
48
+ nvidia-cudnn-cu12==8.9.2.26
49
+ nvidia-cufft-cu12==11.0.2.54
50
+ nvidia-curand-cu12==10.3.2.106
51
+ nvidia-cusolver-cu12==11.4.5.107
52
+ nvidia-cusparse-cu12==12.1.0.106
53
+ nvidia-nccl-cu12==2.19.3
54
+ nvidia-nvjitlink-cu12==12.4.127
55
+ nvidia-nvtx-cu12==12.1.105
56
+ packaging==24.0
57
+ pandas==2.2.2
58
+ peft==0.10.0
59
+ pillow==10.3.0
60
+ pip==23.3.1
61
+ protobuf==3.20.2
62
+ psutil==5.9.8
63
+ py-cpuinfo==9.0.0
64
+ pyarrow-hotfix==0.6
65
+ pyarrow==16.0.0
66
+ pydantic==2.7.1
67
+ pydantic_core==2.18.2
68
+ pynvml==11.5.0
69
+ python-dateutil==2.9.0.post0
70
+ pytz==2024.1
71
+ regex==2024.4.16
72
+ requests==2.31.0
73
+ responses==0.18.0
74
+ rich==13.7.1
75
+ safetensors==0.4.3
76
+ scipy==1.13.0
77
+ sentencepiece==0.2.0
78
+ sentry-sdk==2.0.0
79
+ setproctitle==1.3.3
80
+ setuptools==68.2.2
81
+ shtab==1.7.1
82
+ six==1.16.0
83
+ smmap==5.0.1
84
+ sympy==1.12
85
+ tensorboard-data-server==0.7.2
86
+ tensorboard==2.16.2
87
+ tokenizers==0.19.1
88
+ torch==2.2.2
89
+ torchaudio==2.2.2
90
+ torchvision==0.17.2
91
+ tqdm==4.66.2
92
+ transformers==4.40.1
93
+ triton==2.2.0
94
+ trl==0.8.6
95
+ typing_extensions==4.11.0
96
+ tyro==0.8.3
97
+ tzdata==2024.1
98
+ urllib3==2.2.1
99
+ wandb==0.16.6
100
+ wheel==0.41.2
101
+ xxhash==3.4.1
102
+ yarl==1.9.4
wandb/run-20240426_143501-slqyh8h3/files/wandb-metadata.json ADDED
@@ -0,0 +1,558 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.15.0-1048-aws-x86_64-with-glibc2.31",
3
+ "python": "3.11.9",
4
+ "heartbeatAt": "2024-04-26T14:35:02.331727",
5
+ "startedAt": "2024-04-26T14:35:01.958802",
6
+ "docker": null,
7
+ "cuda": null,
8
+ "args": [
9
+ "./config_200k.yaml"
10
+ ],
11
+ "state": "running",
12
+ "program": "/fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/./run_sft.py",
13
+ "codePathLocal": "run_sft.py",
14
+ "codePath": "run_sft.py",
15
+ "git": {
16
+ "remote": "https://huggingface.co/sanchit-gandhi/distil-zephyr-1.5b-ssft-ultrachat-200k",
17
+ "commit": "37a86cac6473b5416859c9fc849f41892a2da4b3"
18
+ },
19
+ "email": null,
20
+ "root": "/fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k",
21
+ "host": "ip-26-0-165-24",
22
+ "username": "sanchit",
23
+ "executable": "/fsx/sanchit/miniconda3/envs/alignment/bin/python",
24
+ "cpu_count": 96,
25
+ "cpu_count_logical": 96,
26
+ "cpu_freq": {
27
+ "current": 2726.2313124999996,
28
+ "min": 0.0,
29
+ "max": 0.0
30
+ },
31
+ "cpu_freq_per_core": [
32
+ {
33
+ "current": 2650.0,
34
+ "min": 0.0,
35
+ "max": 0.0
36
+ },
37
+ {
38
+ "current": 2650.0,
39
+ "min": 0.0,
40
+ "max": 0.0
41
+ },
42
+ {
43
+ "current": 2650.0,
44
+ "min": 0.0,
45
+ "max": 0.0
46
+ },
47
+ {
48
+ "current": 2650.0,
49
+ "min": 0.0,
50
+ "max": 0.0
51
+ },
52
+ {
53
+ "current": 2650.0,
54
+ "min": 0.0,
55
+ "max": 0.0
56
+ },
57
+ {
58
+ "current": 2650.0,
59
+ "min": 0.0,
60
+ "max": 0.0
61
+ },
62
+ {
63
+ "current": 2650.0,
64
+ "min": 0.0,
65
+ "max": 0.0
66
+ },
67
+ {
68
+ "current": 2650.0,
69
+ "min": 0.0,
70
+ "max": 0.0
71
+ },
72
+ {
73
+ "current": 2650.0,
74
+ "min": 0.0,
75
+ "max": 0.0
76
+ },
77
+ {
78
+ "current": 2650.0,
79
+ "min": 0.0,
80
+ "max": 0.0
81
+ },
82
+ {
83
+ "current": 2650.0,
84
+ "min": 0.0,
85
+ "max": 0.0
86
+ },
87
+ {
88
+ "current": 2650.0,
89
+ "min": 0.0,
90
+ "max": 0.0
91
+ },
92
+ {
93
+ "current": 2650.0,
94
+ "min": 0.0,
95
+ "max": 0.0
96
+ },
97
+ {
98
+ "current": 2650.0,
99
+ "min": 0.0,
100
+ "max": 0.0
101
+ },
102
+ {
103
+ "current": 3369.271,
104
+ "min": 0.0,
105
+ "max": 0.0
106
+ },
107
+ {
108
+ "current": 2650.0,
109
+ "min": 0.0,
110
+ "max": 0.0
111
+ },
112
+ {
113
+ "current": 2650.0,
114
+ "min": 0.0,
115
+ "max": 0.0
116
+ },
117
+ {
118
+ "current": 2650.0,
119
+ "min": 0.0,
120
+ "max": 0.0
121
+ },
122
+ {
123
+ "current": 2650.0,
124
+ "min": 0.0,
125
+ "max": 0.0
126
+ },
127
+ {
128
+ "current": 2650.0,
129
+ "min": 0.0,
130
+ "max": 0.0
131
+ },
132
+ {
133
+ "current": 2650.0,
134
+ "min": 0.0,
135
+ "max": 0.0
136
+ },
137
+ {
138
+ "current": 2650.0,
139
+ "min": 0.0,
140
+ "max": 0.0
141
+ },
142
+ {
143
+ "current": 2650.0,
144
+ "min": 0.0,
145
+ "max": 0.0
146
+ },
147
+ {
148
+ "current": 2650.0,
149
+ "min": 0.0,
150
+ "max": 0.0
151
+ },
152
+ {
153
+ "current": 2650.0,
154
+ "min": 0.0,
155
+ "max": 0.0
156
+ },
157
+ {
158
+ "current": 2650.0,
159
+ "min": 0.0,
160
+ "max": 0.0
161
+ },
162
+ {
163
+ "current": 2650.0,
164
+ "min": 0.0,
165
+ "max": 0.0
166
+ },
167
+ {
168
+ "current": 2650.0,
169
+ "min": 0.0,
170
+ "max": 0.0
171
+ },
172
+ {
173
+ "current": 2650.0,
174
+ "min": 0.0,
175
+ "max": 0.0
176
+ },
177
+ {
178
+ "current": 2650.0,
179
+ "min": 0.0,
180
+ "max": 0.0
181
+ },
182
+ {
183
+ "current": 2650.0,
184
+ "min": 0.0,
185
+ "max": 0.0
186
+ },
187
+ {
188
+ "current": 2650.0,
189
+ "min": 0.0,
190
+ "max": 0.0
191
+ },
192
+ {
193
+ "current": 3567.605,
194
+ "min": 0.0,
195
+ "max": 0.0
196
+ },
197
+ {
198
+ "current": 2650.0,
199
+ "min": 0.0,
200
+ "max": 0.0
201
+ },
202
+ {
203
+ "current": 2650.0,
204
+ "min": 0.0,
205
+ "max": 0.0
206
+ },
207
+ {
208
+ "current": 2650.0,
209
+ "min": 0.0,
210
+ "max": 0.0
211
+ },
212
+ {
213
+ "current": 2650.0,
214
+ "min": 0.0,
215
+ "max": 0.0
216
+ },
217
+ {
218
+ "current": 2650.0,
219
+ "min": 0.0,
220
+ "max": 0.0
221
+ },
222
+ {
223
+ "current": 2650.0,
224
+ "min": 0.0,
225
+ "max": 0.0
226
+ },
227
+ {
228
+ "current": 2650.0,
229
+ "min": 0.0,
230
+ "max": 0.0
231
+ },
232
+ {
233
+ "current": 2650.0,
234
+ "min": 0.0,
235
+ "max": 0.0
236
+ },
237
+ {
238
+ "current": 2650.0,
239
+ "min": 0.0,
240
+ "max": 0.0
241
+ },
242
+ {
243
+ "current": 2650.0,
244
+ "min": 0.0,
245
+ "max": 0.0
246
+ },
247
+ {
248
+ "current": 2650.0,
249
+ "min": 0.0,
250
+ "max": 0.0
251
+ },
252
+ {
253
+ "current": 2650.0,
254
+ "min": 0.0,
255
+ "max": 0.0
256
+ },
257
+ {
258
+ "current": 2650.0,
259
+ "min": 0.0,
260
+ "max": 0.0
261
+ },
262
+ {
263
+ "current": 2650.0,
264
+ "min": 0.0,
265
+ "max": 0.0
266
+ },
267
+ {
268
+ "current": 2650.0,
269
+ "min": 0.0,
270
+ "max": 0.0
271
+ },
272
+ {
273
+ "current": 2650.0,
274
+ "min": 0.0,
275
+ "max": 0.0
276
+ },
277
+ {
278
+ "current": 2650.0,
279
+ "min": 0.0,
280
+ "max": 0.0
281
+ },
282
+ {
283
+ "current": 2650.0,
284
+ "min": 0.0,
285
+ "max": 0.0
286
+ },
287
+ {
288
+ "current": 2650.0,
289
+ "min": 0.0,
290
+ "max": 0.0
291
+ },
292
+ {
293
+ "current": 2650.0,
294
+ "min": 0.0,
295
+ "max": 0.0
296
+ },
297
+ {
298
+ "current": 3598.267,
299
+ "min": 0.0,
300
+ "max": 0.0
301
+ },
302
+ {
303
+ "current": 2650.0,
304
+ "min": 0.0,
305
+ "max": 0.0
306
+ },
307
+ {
308
+ "current": 2650.0,
309
+ "min": 0.0,
310
+ "max": 0.0
311
+ },
312
+ {
313
+ "current": 3597.129,
314
+ "min": 0.0,
315
+ "max": 0.0
316
+ },
317
+ {
318
+ "current": 2650.0,
319
+ "min": 0.0,
320
+ "max": 0.0
321
+ },
322
+ {
323
+ "current": 2650.0,
324
+ "min": 0.0,
325
+ "max": 0.0
326
+ },
327
+ {
328
+ "current": 2650.0,
329
+ "min": 0.0,
330
+ "max": 0.0
331
+ },
332
+ {
333
+ "current": 2650.0,
334
+ "min": 0.0,
335
+ "max": 0.0
336
+ },
337
+ {
338
+ "current": 2650.0,
339
+ "min": 0.0,
340
+ "max": 0.0
341
+ },
342
+ {
343
+ "current": 2650.0,
344
+ "min": 0.0,
345
+ "max": 0.0
346
+ },
347
+ {
348
+ "current": 2650.0,
349
+ "min": 0.0,
350
+ "max": 0.0
351
+ },
352
+ {
353
+ "current": 2650.0,
354
+ "min": 0.0,
355
+ "max": 0.0
356
+ },
357
+ {
358
+ "current": 2650.0,
359
+ "min": 0.0,
360
+ "max": 0.0
361
+ },
362
+ {
363
+ "current": 2650.0,
364
+ "min": 0.0,
365
+ "max": 0.0
366
+ },
367
+ {
368
+ "current": 2650.0,
369
+ "min": 0.0,
370
+ "max": 0.0
371
+ },
372
+ {
373
+ "current": 3597.777,
374
+ "min": 0.0,
375
+ "max": 0.0
376
+ },
377
+ {
378
+ "current": 2650.0,
379
+ "min": 0.0,
380
+ "max": 0.0
381
+ },
382
+ {
383
+ "current": 2650.0,
384
+ "min": 0.0,
385
+ "max": 0.0
386
+ },
387
+ {
388
+ "current": 2650.0,
389
+ "min": 0.0,
390
+ "max": 0.0
391
+ },
392
+ {
393
+ "current": 2650.0,
394
+ "min": 0.0,
395
+ "max": 0.0
396
+ },
397
+ {
398
+ "current": 2650.0,
399
+ "min": 0.0,
400
+ "max": 0.0
401
+ },
402
+ {
403
+ "current": 2650.0,
404
+ "min": 0.0,
405
+ "max": 0.0
406
+ },
407
+ {
408
+ "current": 2650.0,
409
+ "min": 0.0,
410
+ "max": 0.0
411
+ },
412
+ {
413
+ "current": 2650.0,
414
+ "min": 0.0,
415
+ "max": 0.0
416
+ },
417
+ {
418
+ "current": 2650.0,
419
+ "min": 0.0,
420
+ "max": 0.0
421
+ },
422
+ {
423
+ "current": 3593.395,
424
+ "min": 0.0,
425
+ "max": 0.0
426
+ },
427
+ {
428
+ "current": 2650.0,
429
+ "min": 0.0,
430
+ "max": 0.0
431
+ },
432
+ {
433
+ "current": 2650.0,
434
+ "min": 0.0,
435
+ "max": 0.0
436
+ },
437
+ {
438
+ "current": 3597.941,
439
+ "min": 0.0,
440
+ "max": 0.0
441
+ },
442
+ {
443
+ "current": 2650.0,
444
+ "min": 0.0,
445
+ "max": 0.0
446
+ },
447
+ {
448
+ "current": 2650.0,
449
+ "min": 0.0,
450
+ "max": 0.0
451
+ },
452
+ {
453
+ "current": 2650.0,
454
+ "min": 0.0,
455
+ "max": 0.0
456
+ },
457
+ {
458
+ "current": 2650.0,
459
+ "min": 0.0,
460
+ "max": 0.0
461
+ },
462
+ {
463
+ "current": 2650.0,
464
+ "min": 0.0,
465
+ "max": 0.0
466
+ },
467
+ {
468
+ "current": 2650.0,
469
+ "min": 0.0,
470
+ "max": 0.0
471
+ },
472
+ {
473
+ "current": 2650.0,
474
+ "min": 0.0,
475
+ "max": 0.0
476
+ },
477
+ {
478
+ "current": 3596.821,
479
+ "min": 0.0,
480
+ "max": 0.0
481
+ },
482
+ {
483
+ "current": 2650.0,
484
+ "min": 0.0,
485
+ "max": 0.0
486
+ },
487
+ {
488
+ "current": 2650.0,
489
+ "min": 0.0,
490
+ "max": 0.0
491
+ },
492
+ {
493
+ "current": 2650.0,
494
+ "min": 0.0,
495
+ "max": 0.0
496
+ },
497
+ {
498
+ "current": 2650.0,
499
+ "min": 0.0,
500
+ "max": 0.0
501
+ },
502
+ {
503
+ "current": 2650.0,
504
+ "min": 0.0,
505
+ "max": 0.0
506
+ },
507
+ {
508
+ "current": 2650.0,
509
+ "min": 0.0,
510
+ "max": 0.0
511
+ }
512
+ ],
513
+ "disk": {
514
+ "/": {
515
+ "total": 290.7472343444824,
516
+ "used": 58.3209114074707
517
+ }
518
+ },
519
+ "gpu": "NVIDIA H100 80GB HBM3",
520
+ "gpu_count": 8,
521
+ "gpu_devices": [
522
+ {
523
+ "name": "NVIDIA H100 80GB HBM3",
524
+ "memory_total": 85520809984
525
+ },
526
+ {
527
+ "name": "NVIDIA H100 80GB HBM3",
528
+ "memory_total": 85520809984
529
+ },
530
+ {
531
+ "name": "NVIDIA H100 80GB HBM3",
532
+ "memory_total": 85520809984
533
+ },
534
+ {
535
+ "name": "NVIDIA H100 80GB HBM3",
536
+ "memory_total": 85520809984
537
+ },
538
+ {
539
+ "name": "NVIDIA H100 80GB HBM3",
540
+ "memory_total": 85520809984
541
+ },
542
+ {
543
+ "name": "NVIDIA H100 80GB HBM3",
544
+ "memory_total": 85520809984
545
+ },
546
+ {
547
+ "name": "NVIDIA H100 80GB HBM3",
548
+ "memory_total": 85520809984
549
+ },
550
+ {
551
+ "name": "NVIDIA H100 80GB HBM3",
552
+ "memory_total": 85520809984
553
+ }
554
+ ],
555
+ "memory": {
556
+ "total": 1999.9855346679688
557
+ }
558
+ }
wandb/run-20240426_143501-slqyh8h3/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"train_runtime": 4.301, "train_samples_per_second": 32416.867, "train_steps_per_second": 126.714, "total_flos": 2.468335550600315e+18, "train_loss": 0.0, "train/epoch": 1.0, "train/global_step": 545, "_timestamp": 1714142135.713323, "_runtime": 33.74007821083069, "_step": 1, "eval/loss": 1.1554738283157349, "eval/runtime": 30.6475, "eval/samples_per_second": 503.5, "eval/steps_per_second": 1.99}
wandb/run-20240426_143501-slqyh8h3/logs/debug-internal.log ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2024-04-26 14:35:01,974 INFO StreamThr :907589 [internal.py:wandb_internal():86] W&B internal server running at pid: 907589, started at: 2024-04-26 14:35:01.973422
2
+ 2024-04-26 14:35:01,976 DEBUG HandlerThread:907589 [handler.py:handle_request():146] handle_request: status
3
+ 2024-04-26 14:35:01,977 INFO WriterThread:907589 [datastore.py:open_for_write():87] open: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143501-slqyh8h3/run-slqyh8h3.wandb
4
+ 2024-04-26 14:35:01,980 DEBUG SenderThread:907589 [sender.py:send():379] send: header
5
+ 2024-04-26 14:35:01,994 DEBUG SenderThread:907589 [sender.py:send():379] send: run
6
+ 2024-04-26 14:35:02,153 INFO SenderThread:907589 [dir_watcher.py:__init__():211] watching files in: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143501-slqyh8h3/files
7
+ 2024-04-26 14:35:02,153 INFO SenderThread:907589 [sender.py:_start_run_threads():1124] run started: slqyh8h3 with start time 1714142101.973245
8
+ 2024-04-26 14:35:02,163 DEBUG HandlerThread:907589 [handler.py:handle_request():146] handle_request: check_version
9
+ 2024-04-26 14:35:02,164 DEBUG SenderThread:907589 [sender.py:send_request():406] send_request: check_version
10
+ 2024-04-26 14:35:02,222 DEBUG HandlerThread:907589 [handler.py:handle_request():146] handle_request: run_start
11
+ 2024-04-26 14:35:02,282 DEBUG HandlerThread:907589 [system_info.py:__init__():26] System info init
12
+ 2024-04-26 14:35:02,282 DEBUG HandlerThread:907589 [system_info.py:__init__():41] System info init done
13
+ 2024-04-26 14:35:02,282 INFO HandlerThread:907589 [system_monitor.py:start():194] Starting system monitor
14
+ 2024-04-26 14:35:02,282 INFO SystemMonitor:907589 [system_monitor.py:_start():158] Starting system asset monitoring threads
15
+ 2024-04-26 14:35:02,282 INFO HandlerThread:907589 [system_monitor.py:probe():214] Collecting system info
16
+ 2024-04-26 14:35:02,283 INFO SystemMonitor:907589 [interfaces.py:start():190] Started cpu monitoring
17
+ 2024-04-26 14:35:02,283 INFO SystemMonitor:907589 [interfaces.py:start():190] Started disk monitoring
18
+ 2024-04-26 14:35:02,284 INFO SystemMonitor:907589 [interfaces.py:start():190] Started gpu monitoring
19
+ 2024-04-26 14:35:02,284 INFO SystemMonitor:907589 [interfaces.py:start():190] Started memory monitoring
20
+ 2024-04-26 14:35:02,285 INFO SystemMonitor:907589 [interfaces.py:start():190] Started network monitoring
21
+ 2024-04-26 14:35:02,331 DEBUG HandlerThread:907589 [system_info.py:probe():150] Probing system
22
+ 2024-04-26 14:35:02,334 DEBUG HandlerThread:907589 [system_info.py:_probe_git():135] Probing git
23
+ 2024-04-26 14:35:02,354 DEBUG HandlerThread:907589 [system_info.py:_probe_git():143] Probing git done
24
+ 2024-04-26 14:35:02,354 DEBUG HandlerThread:907589 [system_info.py:probe():198] Probing system done
25
+ 2024-04-26 14:35:02,354 DEBUG HandlerThread:907589 [system_monitor.py:probe():223] {'os': 'Linux-5.15.0-1048-aws-x86_64-with-glibc2.31', 'python': '3.11.9', 'heartbeatAt': '2024-04-26T14:35:02.331727', 'startedAt': '2024-04-26T14:35:01.958802', 'docker': None, 'cuda': None, 'args': ('./config_200k.yaml',), 'state': 'running', 'program': '/fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/./run_sft.py', 'codePathLocal': 'run_sft.py', 'codePath': 'run_sft.py', 'git': {'remote': 'https://huggingface.co/sanchit-gandhi/distil-zephyr-1.5b-ssft-ultrachat-200k', 'commit': '37a86cac6473b5416859c9fc849f41892a2da4b3'}, 'email': None, 'root': '/fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k', 'host': 'ip-26-0-165-24', 'username': 'sanchit', 'executable': '/fsx/sanchit/miniconda3/envs/alignment/bin/python', 'cpu_count': 96, 'cpu_count_logical': 96, 'cpu_freq': {'current': 2726.2313124999996, 'min': 0.0, 'max': 0.0}, 'cpu_freq_per_core': [{'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 3369.271, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 
2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 3567.605, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 3598.267, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 3597.129, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 
'min': 0.0, 'max': 0.0}, {'current': 3597.777, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 3593.395, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 3597.941, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 3596.821, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}], 'disk': {'/': {'total': 290.7472343444824, 'used': 58.3209114074707}}, 'gpu': 'NVIDIA H100 80GB HBM3', 'gpu_count': 8, 'gpu_devices': [{'name': 'NVIDIA H100 80GB HBM3', 'memory_total': 85520809984}, {'name': 'NVIDIA H100 80GB HBM3', 'memory_total': 85520809984}, {'name': 'NVIDIA H100 80GB HBM3', 'memory_total': 85520809984}, {'name': 'NVIDIA H100 80GB HBM3', 'memory_total': 85520809984}, {'name': 'NVIDIA H100 80GB HBM3', 'memory_total': 85520809984}, {'name': 'NVIDIA H100 80GB HBM3', 'memory_total': 85520809984}, {'name': 'NVIDIA H100 80GB HBM3', 'memory_total': 85520809984}, {'name': 'NVIDIA H100 80GB HBM3', 'memory_total': 85520809984}], 'memory': {'total': 1999.9855346679688}}
26
+ 2024-04-26 14:35:02,354 INFO HandlerThread:907589 [system_monitor.py:probe():224] Finished collecting system info
27
+ 2024-04-26 14:35:02,354 INFO HandlerThread:907589 [system_monitor.py:probe():227] Publishing system info
28
+ 2024-04-26 14:35:02,354 DEBUG HandlerThread:907589 [system_info.py:_save_conda():207] Saving list of conda packages installed into the current environment
29
+ 2024-04-26 14:35:03,156 INFO Thread-12 :907589 [dir_watcher.py:_on_file_created():271] file/dir created: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143501-slqyh8h3/files/conda-environment.yaml
30
+ 2024-04-26 14:35:04,823 DEBUG HandlerThread:907589 [system_info.py:_save_conda():222] Saving conda packages done
31
+ 2024-04-26 14:35:04,825 INFO HandlerThread:907589 [system_monitor.py:probe():229] Finished publishing system info
32
+ 2024-04-26 14:35:04,843 DEBUG SenderThread:907589 [sender.py:send():379] send: files
33
+ 2024-04-26 14:35:04,843 INFO SenderThread:907589 [sender.py:_save_file():1390] saving file wandb-metadata.json with policy now
34
+ 2024-04-26 14:35:04,988 DEBUG HandlerThread:907589 [handler.py:handle_request():146] handle_request: python_packages
35
+ 2024-04-26 14:35:04,988 DEBUG HandlerThread:907589 [handler.py:handle_request():146] handle_request: stop_status
36
+ 2024-04-26 14:35:04,988 DEBUG SenderThread:907589 [sender.py:send_request():406] send_request: python_packages
37
+ 2024-04-26 14:35:04,989 DEBUG HandlerThread:907589 [handler.py:handle_request():146] handle_request: internal_messages
38
+ 2024-04-26 14:35:04,991 DEBUG SenderThread:907589 [sender.py:send_request():406] send_request: stop_status
39
+ 2024-04-26 14:35:05,036 DEBUG HandlerThread:907589 [handler.py:handle_request():146] handle_request: partial_history
40
+ 2024-04-26 14:35:05,104 DEBUG SenderThread:907589 [sender.py:send():379] send: telemetry
41
+ 2024-04-26 14:35:05,105 DEBUG SenderThread:907589 [sender.py:send():379] send: config
42
+ 2024-04-26 14:35:05,106 DEBUG SenderThread:907589 [sender.py:send():379] send: metric
43
+ 2024-04-26 14:35:05,106 DEBUG SenderThread:907589 [sender.py:send():379] send: telemetry
44
+ 2024-04-26 14:35:05,106 DEBUG SenderThread:907589 [sender.py:send():379] send: metric
45
+ 2024-04-26 14:35:05,106 WARNING SenderThread:907589 [sender.py:send_metric():1341] Seen metric with glob (shouldn't happen)
46
+ 2024-04-26 14:35:05,106 DEBUG SenderThread:907589 [sender.py:send():379] send: telemetry
47
+ 2024-04-26 14:35:05,107 DEBUG SenderThread:907589 [sender.py:send():379] send: telemetry
48
+ 2024-04-26 14:35:05,107 DEBUG SenderThread:907589 [sender.py:send_request():406] send_request: summary_record
49
+ 2024-04-26 14:35:05,110 INFO SenderThread:907589 [sender.py:_save_file():1390] saving file wandb-summary.json with policy end
50
+ 2024-04-26 14:35:05,110 DEBUG SenderThread:907589 [sender.py:send_request():406] send_request: summary_record
51
+ 2024-04-26 14:35:05,111 INFO SenderThread:907589 [sender.py:_save_file():1390] saving file wandb-summary.json with policy end
52
+ 2024-04-26 14:35:05,111 DEBUG SenderThread:907589 [sender.py:send_request():406] send_request: summary_record
53
+ 2024-04-26 14:35:05,112 INFO SenderThread:907589 [sender.py:_save_file():1390] saving file wandb-summary.json with policy end
54
+ 2024-04-26 14:35:05,112 DEBUG SenderThread:907589 [sender.py:send_request():406] send_request: summary_record
55
+ 2024-04-26 14:35:05,113 INFO SenderThread:907589 [sender.py:_save_file():1390] saving file wandb-summary.json with policy end
56
+ 2024-04-26 14:35:05,114 DEBUG SenderThread:907589 [sender.py:send_request():406] send_request: summary_record
57
+ 2024-04-26 14:35:05,115 INFO SenderThread:907589 [sender.py:_save_file():1390] saving file wandb-summary.json with policy end
58
+ 2024-04-26 14:35:05,115 DEBUG SenderThread:907589 [sender.py:send():379] send: metric
59
+ 2024-04-26 14:35:05,115 DEBUG SenderThread:907589 [sender.py:send():379] send: history
60
+ 2024-04-26 14:35:05,115 DEBUG SenderThread:907589 [sender.py:send_request():406] send_request: summary_record
61
+ 2024-04-26 14:35:05,116 INFO SenderThread:907589 [sender.py:_save_file():1390] saving file wandb-summary.json with policy end
62
+ 2024-04-26 14:35:05,148 INFO wandb-upload_0:907589 [upload_job.py:push():131] Uploaded file /tmp/tmpjmlr9sepwandb/cfal2aup-wandb-metadata.json
63
+ 2024-04-26 14:35:05,158 INFO Thread-12 :907589 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143501-slqyh8h3/files/conda-environment.yaml
64
+ 2024-04-26 14:35:05,158 INFO Thread-12 :907589 [dir_watcher.py:_on_file_created():271] file/dir created: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143501-slqyh8h3/files/wandb-metadata.json
65
+ 2024-04-26 14:35:05,158 INFO Thread-12 :907589 [dir_watcher.py:_on_file_created():271] file/dir created: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143501-slqyh8h3/files/requirements.txt
66
+ 2024-04-26 14:35:05,158 INFO Thread-12 :907589 [dir_watcher.py:_on_file_created():271] file/dir created: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143501-slqyh8h3/files/wandb-summary.json
67
+ 2024-04-26 14:35:05,158 INFO Thread-12 :907589 [dir_watcher.py:_on_file_created():271] file/dir created: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143501-slqyh8h3/files/output.log
68
+ 2024-04-26 14:35:07,030 DEBUG HandlerThread:907589 [handler.py:handle_request():146] handle_request: status_report
69
+ 2024-04-26 14:35:07,160 INFO Thread-12 :907589 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143501-slqyh8h3/files/output.log
70
+ 2024-04-26 14:35:09,162 INFO Thread-12 :907589 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143501-slqyh8h3/files/output.log
71
+ 2024-04-26 14:35:11,164 INFO Thread-12 :907589 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143501-slqyh8h3/files/output.log
72
+ 2024-04-26 14:35:12,475 DEBUG HandlerThread:907589 [handler.py:handle_request():146] handle_request: status_report
73
+ 2024-04-26 14:35:13,166 INFO Thread-12 :907589 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143501-slqyh8h3/files/output.log
74
+ 2024-04-26 14:35:15,168 INFO Thread-12 :907589 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143501-slqyh8h3/files/output.log
75
+ 2024-04-26 14:35:17,170 INFO Thread-12 :907589 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143501-slqyh8h3/files/output.log
76
+ 2024-04-26 14:35:17,923 DEBUG HandlerThread:907589 [handler.py:handle_request():146] handle_request: status_report
77
+ 2024-04-26 14:35:19,172 INFO Thread-12 :907589 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143501-slqyh8h3/files/output.log
78
+ 2024-04-26 14:35:19,988 DEBUG HandlerThread:907589 [handler.py:handle_request():146] handle_request: stop_status
79
+ 2024-04-26 14:35:19,989 DEBUG SenderThread:907589 [sender.py:send_request():406] send_request: stop_status
80
+ 2024-04-26 14:35:19,990 DEBUG HandlerThread:907589 [handler.py:handle_request():146] handle_request: internal_messages
81
+ 2024-04-26 14:35:21,174 INFO Thread-12 :907589 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143501-slqyh8h3/files/output.log
82
+ 2024-04-26 14:35:23,176 INFO Thread-12 :907589 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143501-slqyh8h3/files/output.log
83
+ 2024-04-26 14:35:23,382 DEBUG HandlerThread:907589 [handler.py:handle_request():146] handle_request: status_report
84
+ 2024-04-26 14:35:25,177 INFO Thread-12 :907589 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143501-slqyh8h3/files/output.log
85
+ 2024-04-26 14:35:27,179 INFO Thread-12 :907589 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143501-slqyh8h3/files/output.log
86
+ 2024-04-26 14:35:28,842 DEBUG HandlerThread:907589 [handler.py:handle_request():146] handle_request: status_report
87
+ 2024-04-26 14:35:29,181 INFO Thread-12 :907589 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143501-slqyh8h3/files/output.log
88
+ 2024-04-26 14:35:31,183 INFO Thread-12 :907589 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143501-slqyh8h3/files/output.log
89
+ 2024-04-26 14:35:33,185 INFO Thread-12 :907589 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143501-slqyh8h3/files/output.log
90
+ 2024-04-26 14:35:34,309 DEBUG HandlerThread:907589 [handler.py:handle_request():146] handle_request: status_report
91
+ 2024-04-26 14:35:34,988 DEBUG HandlerThread:907589 [handler.py:handle_request():146] handle_request: stop_status
92
+ 2024-04-26 14:35:34,989 DEBUG SenderThread:907589 [sender.py:send_request():406] send_request: stop_status
93
+ 2024-04-26 14:35:34,990 DEBUG HandlerThread:907589 [handler.py:handle_request():146] handle_request: internal_messages
94
+ 2024-04-26 14:35:35,188 INFO Thread-12 :907589 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143501-slqyh8h3/files/config.yaml
95
+ 2024-04-26 14:35:35,188 INFO Thread-12 :907589 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143501-slqyh8h3/files/output.log
96
+ 2024-04-26 14:35:35,714 DEBUG HandlerThread:907589 [handler.py:handle_request():146] handle_request: partial_history
97
+ 2024-04-26 14:35:35,715 DEBUG SenderThread:907589 [sender.py:send():379] send: metric
98
+ 2024-04-26 14:35:35,716 DEBUG SenderThread:907589 [sender.py:send():379] send: metric
99
+ 2024-04-26 14:35:35,717 DEBUG SenderThread:907589 [sender.py:send():379] send: metric
100
+ 2024-04-26 14:35:35,717 DEBUG SenderThread:907589 [sender.py:send():379] send: metric
101
+ 2024-04-26 14:35:35,718 DEBUG SenderThread:907589 [sender.py:send():379] send: history
102
+ 2024-04-26 14:35:35,718 DEBUG SenderThread:907589 [sender.py:send_request():406] send_request: summary_record
103
+ 2024-04-26 14:35:35,720 INFO SenderThread:907589 [sender.py:_save_file():1390] saving file wandb-summary.json with policy end
104
+ 2024-04-26 14:35:36,191 INFO Thread-12 :907589 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143501-slqyh8h3/files/wandb-summary.json
105
+ 2024-04-26 14:35:37,192 INFO Thread-12 :907589 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143501-slqyh8h3/files/output.log
106
+ 2024-04-26 14:35:39,724 DEBUG HandlerThread:907589 [handler.py:handle_request():146] handle_request: status_report
107
+ 2024-04-26 14:35:43,200 INFO Thread-12 :907589 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143501-slqyh8h3/files/output.log
108
+ 2024-04-26 14:35:45,005 DEBUG HandlerThread:907589 [handler.py:handle_request():146] handle_request: status_report
109
+ 2024-04-26 14:35:47,205 INFO Thread-12 :907589 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143501-slqyh8h3/files/output.log
wandb/run-20240426_143501-slqyh8h3/logs/debug.log ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2024-04-26 14:35:01,967 INFO MainThread:907122 [wandb_setup.py:_flush():76] Current SDK version is 0.16.6
2
+ 2024-04-26 14:35:01,967 INFO MainThread:907122 [wandb_setup.py:_flush():76] Configure stats pid to 907122
3
+ 2024-04-26 14:35:01,968 INFO MainThread:907122 [wandb_setup.py:_flush():76] Loading settings from /admin/home/sanchit/.config/wandb/settings
4
+ 2024-04-26 14:35:01,968 INFO MainThread:907122 [wandb_setup.py:_flush():76] Loading settings from /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/settings
5
+ 2024-04-26 14:35:01,968 INFO MainThread:907122 [wandb_setup.py:_flush():76] Loading settings from environment variables: {}
6
+ 2024-04-26 14:35:01,968 INFO MainThread:907122 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
7
+ 2024-04-26 14:35:01,968 INFO MainThread:907122 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'run_sft.py', 'program_abspath': '/fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/run_sft.py', 'program': '/fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/./run_sft.py'}
8
+ 2024-04-26 14:35:01,968 INFO MainThread:907122 [wandb_setup.py:_flush():76] Applying login settings: {}
9
+ 2024-04-26 14:35:01,968 INFO MainThread:907122 [wandb_init.py:_log_setup():521] Logging user logs to /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143501-slqyh8h3/logs/debug.log
10
+ 2024-04-26 14:35:01,968 INFO MainThread:907122 [wandb_init.py:_log_setup():522] Logging internal logs to /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat-200k/wandb/run-20240426_143501-slqyh8h3/logs/debug-internal.log
11
+ 2024-04-26 14:35:01,968 INFO MainThread:907122 [wandb_init.py:init():561] calling init triggers
12
+ 2024-04-26 14:35:01,968 INFO MainThread:907122 [wandb_init.py:init():568] wandb.init called with sweep_config: {}
13
+ config: {}
14
+ 2024-04-26 14:35:01,968 INFO MainThread:907122 [wandb_init.py:init():611] starting backend
15
+ 2024-04-26 14:35:01,968 INFO MainThread:907122 [wandb_init.py:init():615] setting up manager
16
+ 2024-04-26 14:35:01,971 INFO MainThread:907122 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
17
+ 2024-04-26 14:35:01,972 INFO MainThread:907122 [wandb_init.py:init():623] backend started and connected
18
+ 2024-04-26 14:35:01,975 INFO MainThread:907122 [wandb_init.py:init():715] updated telemetry
19
+ 2024-04-26 14:35:01,992 INFO MainThread:907122 [wandb_init.py:init():748] communicating run to backend with 90.0 second timeout
20
+ 2024-04-26 14:35:02,163 INFO MainThread:907122 [wandb_run.py:_on_init():2357] communicating current version
21
+ 2024-04-26 14:35:02,214 INFO MainThread:907122 [wandb_run.py:_on_init():2366] got version response
22
+ 2024-04-26 14:35:02,214 INFO MainThread:907122 [wandb_init.py:init():799] starting run threads in backend
23
+ 2024-04-26 14:35:04,988 INFO MainThread:907122 [wandb_run.py:_console_start():2335] atexit reg
24
+ 2024-04-26 14:35:04,988 INFO MainThread:907122 [wandb_run.py:_redirect():2190] redirect: wrap_raw
25
+ 2024-04-26 14:35:04,988 INFO MainThread:907122 [wandb_run.py:_redirect():2255] Wrapping output streams.
26
+ 2024-04-26 14:35:04,989 INFO MainThread:907122 [wandb_run.py:_redirect():2280] Redirects installed.
27
+ 2024-04-26 14:35:04,990 INFO MainThread:907122 [wandb_init.py:init():842] run started, returning control to user process
28
+ 2024-04-26 14:35:04,991 INFO MainThread:907122 [wandb_run.py:_config_callback():1347] config_cb None None {'vocab_size': 32000, 'max_position_embeddings': 32768, 'hidden_size': 4096, 'intermediate_size': 14336, 'num_hidden_layers': 6, 'num_attention_heads': 32, 'sliding_window': 4096, 'num_key_value_heads': 8, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-05, 'use_cache': False, 'rope_theta': 10000.0, 'attention_dropout': 0.0, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'bfloat16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': False, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['MistralForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 1, 'pad_token_id': None, 'eos_token_id': 2, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': 'sanchit-gandhi/distil-zephyr-1.5b-ssft-ultrachat', 'transformers_version': '4.40.1', 'model_type': 'mistral', 'output_dir': './', 
'overwrite_output_dir': True, 'do_train': False, 'do_eval': True, 'do_predict': False, 'evaluation_strategy': 'epoch', 'prediction_loss_only': False, 'per_device_train_batch_size': 32, 'per_device_eval_batch_size': 32, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'eval_delay': 0, 'learning_rate': 0.0001, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 1, 'max_steps': -1, 'lr_scheduler_type': 'linear', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.0, 'warmup_steps': 500, 'log_level': 'info', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': './runs/Apr26_14-34-38_ip-26-0-165-24', 'logging_strategy': 'steps', 'logging_first_step': True, 'logging_steps': 25, 'logging_nan_inf_filter': True, 'save_strategy': 'epoch', 'save_steps': 500, 'save_total_limit': 1, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': True, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': './', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 
'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'gradient_accumulation_kwargs': None}, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['tensorboard', 'wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': True, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': True, 'gradient_checkpointing_kwargs': {'use_reentrant': False}, 'include_inputs_for_metrics': False, 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '<PUSH_TO_HUB_TOKEN>', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 7200, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'max_seq_length': 2048}
wandb/run-20240426_143501-slqyh8h3/run-slqyh8h3.wandb ADDED
File without changes