Training in progress, step 300
Browse files
archive_checkpoints.sh
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/bin/bash
#
# Archive training checkpoints to Azure blob storage with rclone.
#
# Copies every checkpoint-* directory found under SOURCE_DIR to AZURE_DEST,
# appending timestamped status lines and rclone's own output to LOG_FILE,
# then lists the checkpoint directories present at the destination so the
# upload can be verified from the log.
#
# Exit status:
#   0        copy succeeded (a failed verification listing is only a warning)
#   1        rclone is not installed or SOURCE_DIR does not exist
#   other    the exit code returned by a failed `rclone copy`

# Fail on unset variables and on failures inside pipelines. `set -e` is
# deliberately not used: every critical command is checked explicitly below.
set -uo pipefail

# Path to the parent directory containing checkpoint folders
readonly SOURCE_DIR="/disk2/bhanu/research/open-r1-main/data/gemma-3-12b-it-sft"

# Azure destination (rclone remote:container/path)
readonly AZURE_DEST="azai:private-model-checkpoints/gemma-3-12b-it-sft-graphllm"

# Log file (relative to the directory the script is run from)
readonly LOG_FILE="rclone_archive.log"

#######################################
# Append a timestamped message to LOG_FILE.
# Globals:   LOG_FILE (read)
# Arguments: $* - message text
#######################################
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" >> "$LOG_FILE"
}

log "Starting checkpoint archiving"

# Validate prerequisites up front so the log records a clear reason instead
# of a generic rclone failure.
if ! command -v rclone > /dev/null 2>&1; then
  log "ERROR: rclone not found in PATH"
  exit 1
fi

if [[ ! -d "$SOURCE_DIR" ]]; then
  log "ERROR: source directory not found: $SOURCE_DIR"
  exit 1
fi

# Use rclone copy with optimizations to avoid unnecessary transfers:
# --checksum compares file hashes instead of modification times, and the
# include filter restricts the transfer to checkpoint-* directories.
rclone copy "$SOURCE_DIR" "$AZURE_DEST" \
  --progress \
  --include "checkpoint-*/**" \
  --log-file="$LOG_FILE" \
  --stats 10s \
  --checksum \
  --skip-links \
  --transfers=4 \
  --checkers=8
copy_status=$?

# Only report completion when the copy actually succeeded; previously a
# failed transfer was still logged as "completed".
if ((copy_status != 0)); then
  log "ERROR: Checkpoint archiving failed (rclone exit code ${copy_status})"
  exit "$copy_status"
fi

log "Checkpoint archiving completed"

# Optional: List all checkpoints currently in Azure for verification.
# stderr is captured too, so a listing failure is visible in the log.
log "Current checkpoints in Azure:"
if ! rclone lsf "$AZURE_DEST" --include "checkpoint-*/" >> "$LOG_FILE" 2>&1; then
  log "WARNING: could not list checkpoints in Azure"
fi
|
model-00001-of-00005.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 4979902192
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:eddea002f1ec84ae617382569db432bfecf203e7bede1ef6e4bba93f64e3b3b4
|
3 |
size 4979902192
|
model-00002-of-00005.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 4931296592
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2810b71a7604101b75c0161078638fab875984009617064e0165499129f5d3d9
|
3 |
size 4931296592
|
model-00003-of-00005.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 4931296656
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:dd1365ef853cd19234ce2fa5f35a1719325e300a1f21e36fb0fc161e90c43049
|
3 |
size 4931296656
|
model-00004-of-00005.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 4931296656
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:966a6bee515170dcdf1808412da0417d30fbee4f6926c0a64f078b1f0b79ff64
|
3 |
size 4931296656
|
model-00005-of-00005.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 4601000928
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bb073c0d7157ede6f8aa89db33647c06cfccc8dfee589fffe2e821b6855b65d8
|
3 |
size 4601000928
|
rclone_archive.log
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[2025-04-30 23:01:42] Starting checkpoint archiving
|
2 |
+
2025/04/30 23:01:42 Failed to create file system for "azure:private-model-checkpoints/gemma-3-12b-it-sft-graphllm": didn't find section in config file
|
3 |
+
[2025-04-30 23:01:42] Checkpoint archiving completed
|
4 |
+
[2025-04-30 23:01:42] Current checkpoints in Azure:
|
5 |
+
[2025-04-30 23:02:27] Starting checkpoint archiving
|
6 |
+
[2025-04-30 23:56:48] Checkpoint archiving completed
|
7 |
+
[2025-04-30 23:56:48] Current checkpoints in Azure:
|
8 |
+
checkpoint-200/
|