pvbhanuteja committed on
Commit
a4c0495
·
verified ·
1 Parent(s): d0be80d

Training in progress, step 300

Browse files
archive_checkpoints.sh ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/bin/bash
#
# Archive training checkpoints to Azure with rclone.
#
# Copies every checkpoint-* directory found under SOURCE_DIR to AZURE_DEST and
# appends timestamped status messages (plus rclone's own log output) to
# LOG_FILE. The script exits non-zero when rclone is unavailable or the copy
# fails, so a scheduler or calling script can detect a failed run instead of
# seeing an unconditional "completed" message.

# Fail on unset variables and on failures inside pipelines. `-e` is left off
# on purpose: the rclone exit status is inspected explicitly below so that a
# failure can be written to the log before exiting.
set -uo pipefail

# Path to the parent directory containing checkpoint folders
SOURCE_DIR="/disk2/bhanu/research/open-r1-main/data/gemma-3-12b-it-sft"

# Azure destination (rclone remote name, then container/path)
AZURE_DEST="azai:private-model-checkpoints/gemma-3-12b-it-sft-graphllm"

# Log file (relative path: created in the directory the script is run from)
LOG_FILE="rclone_archive.log"

#######################################
# Append a timestamped message to the log file.
# Globals:   LOG_FILE (read)
# Arguments: message text
# Outputs:   one "[YYYY-MM-DD HH:MM:SS] message" line appended to LOG_FILE
#######################################
log() {
  printf '[%s] %s\n' "$(date "+%Y-%m-%d %H:%M:%S")" "$*" >> "$LOG_FILE"
}

# Bail out early with a clear log entry if rclone is not installed.
if ! command -v rclone > /dev/null 2>&1; then
  log "ERROR: rclone not found in PATH"
  exit 127
fi

log "Starting checkpoint archiving"

# Copy only the checkpoint-* directories. --checksum makes rclone compare
# files by checksum so unchanged files are not transferred again.
rclone copy "$SOURCE_DIR" "$AZURE_DEST" \
  --progress \
  --include "checkpoint-*/**" \
  --log-file="$LOG_FILE" \
  --stats 10s \
  --checksum \
  --skip-links \
  --transfers=4 \
  --checkers=8
copy_status=$?

# Report the real outcome: a failed copy (e.g. a remote that is missing from
# the rclone config) must not be logged as completed.
if ((copy_status != 0)); then
  log "Checkpoint archiving FAILED (rclone exit code ${copy_status})"
  exit "$copy_status"
fi

log "Checkpoint archiving completed"

# Optional: List all checkpoints currently in Azure for verification
log "Current checkpoints in Azure:"
if ! rclone lsf "$AZURE_DEST" --include "checkpoint-*/" >> "$LOG_FILE"; then
  # The copy itself succeeded, so a listing failure is only a warning.
  log "WARNING: could not list checkpoints in Azure"
fi
model-00001-of-00005.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:32cac6b328e0e8196e5c7bbeeaba72f60cf0fd5959a10c68e5cf4692558ba1e6
3
  size 4979902192
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eddea002f1ec84ae617382569db432bfecf203e7bede1ef6e4bba93f64e3b3b4
3
  size 4979902192
model-00002-of-00005.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e15276f11f27f85fc5d5853be1f9e51fd06d13ab6722517a5cbeeb7a15defb02
3
  size 4931296592
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2810b71a7604101b75c0161078638fab875984009617064e0165499129f5d3d9
3
  size 4931296592
model-00003-of-00005.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d9804775a6658945ff1644205c9db0b1da74281909270415df7701be2b8a6c3c
3
  size 4931296656
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dd1365ef853cd19234ce2fa5f35a1719325e300a1f21e36fb0fc161e90c43049
3
  size 4931296656
model-00004-of-00005.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c65777f816d69020abd67c9a5639b3165946db6e6a32e6699e6afd95f729ae28
3
  size 4931296656
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:966a6bee515170dcdf1808412da0417d30fbee4f6926c0a64f078b1f0b79ff64
3
  size 4931296656
model-00005-of-00005.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8bb76249c58cacba5c0226a563adcbe0a3f6f345bf1c58c5b9f208bc04679720
3
  size 4601000928
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bb073c0d7157ede6f8aa89db33647c06cfccc8dfee589fffe2e821b6855b65d8
3
  size 4601000928
rclone_archive.log ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ [2025-04-30 23:01:42] Starting checkpoint archiving
2
+ 2025/04/30 23:01:42 Failed to create file system for "azure:private-model-checkpoints/gemma-3-12b-it-sft-graphllm": didn't find section in config file
3
+ [2025-04-30 23:01:42] Checkpoint archiving completed
4
+ [2025-04-30 23:01:42] Current checkpoints in Azure:
5
+ [2025-04-30 23:02:27] Starting checkpoint archiving
6
+ [2025-04-30 23:56:48] Checkpoint archiving completed
7
+ [2025-04-30 23:56:48] Current checkpoints in Azure:
8
+ checkpoint-200/