alignmentforever committed on
Commit 3c2c751 · verified · 1 Parent(s): 6b48384

Upload folder using huggingface_hub
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
+wandb/run-20250528_202910-skis4g6x/run-skis4g6x.wandb filter=lfs diff=lfs merge=lfs -text
added_tokens.json ADDED
@@ -0,0 +1,5 @@
+{
+  "<|endoftext|>": 151643,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644
+}
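These three ids sit at the very top of the Qwen vocabulary. As a quick sanity check, the tokenizer should report the same mapping; a minimal sketch, assuming `transformers` is installed (the Hub id `Qwen/Qwen1.5-4B` stands in for the local checkpoint path used in this run):

```python
# Sketch: verify that the added special tokens resolve to the ids above.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-4B", trust_remote_code=True)

for token in ("<|endoftext|>", "<|im_end|>", "<|im_start|>"):
    # convert_tokens_to_ids maps a token string to its vocabulary id
    print(token, "->", tokenizer.convert_tokens_to_ids(token))
# Expected: 151643, 151645, 151644, matching added_tokens.json
```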
arguments.json ADDED
@@ -0,0 +1,53 @@
+{
+  "model_name_or_path": "/aifs4su/hansirui_1st/models/Qwen1.5-4B",
+  "max_length": 2048,
+  "trust_remote_code": true,
+  "train_datasets": [
+    [
+      "inverse-json",
+      {
+        "proportion": 1.0,
+        "path": "/home/hansirui_1st/jiayi/resist/setting3/safety_data/training/safe/safe_10k.json"
+      }
+    ]
+  ],
+  "eval_datasets": null,
+  "epochs": 1,
+  "per_device_train_batch_size": 4,
+  "per_device_eval_batch_size": 4,
+  "gradient_accumulation_steps": 8,
+  "gradient_checkpointing": true,
+  "lr": 1e-05,
+  "lr_scheduler_type": "constant",
+  "lr_warmup_ratio": 0.0,
+  "weight_decay": 0.0,
+  "seed": 42,
+  "fp16": false,
+  "bf16": true,
+  "tf32": true,
+  "eval_strategy": "epoch",
+  "eval_interval": 1000000,
+  "need_eval": false,
+  "eval_split_ratio": null,
+  "output_dir": "/aifs4su/hansirui_1st/boyuan/resist/setting3-safety/Qwen1.5-4B/Qwen1.5-4B-s3-Q1-10k",
+  "log_type": "wandb",
+  "log_dir": "/aifs4su/hansirui_1st/boyuan/resist/setting3-safety/Qwen1.5-4B/Qwen1.5-4B-s3-Q1-10k",
+  "log_project": "Inverse_Alignment",
+  "log_run_name": "qwen-4b-s3-Q1-10k",
+  "save_16bit": true,
+  "save_interval": 1000000,
+  "local_rank": 0,
+  "zero_stage": 3,
+  "offload": "none",
+  "deepspeed": false,
+  "deepspeed_config": null,
+  "deepscale": false,
+  "deepscale_config": null,
+  "global_rank": 0,
+  "device": {
+    "type": "torch.device",
+    "repr": "device(type='cuda', index=0)"
+  },
+  "num_update_steps_per_epoch": 40,
+  "total_training_steps": 40
+}
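The reported 40 optimizer steps follow directly from the batch settings: 10,000 examples over 8 GPUs with a per-device batch of 4 and 8 gradient-accumulation steps give an effective batch of 256, so one epoch is ceil(10000/256) = 40 updates. A quick check in plain arithmetic (assuming the 10k-example dataset named in the path, and the WORLD_SIZE=8 shown in environ.txt below):

```python
import math

# Effective global batch = per-device batch * world size * grad accumulation.
per_device_train_batch_size = 4
world_size = 8                     # WORLD_SIZE=8 in environ.txt
gradient_accumulation_steps = 8
num_examples = 10_000              # safe_10k.json

effective_batch = per_device_train_batch_size * world_size * gradient_accumulation_steps
steps_per_epoch = math.ceil(num_examples / effective_batch)
print(effective_batch, steps_per_epoch)  # 256, 40 -> matches num_update_steps_per_epoch
```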
arguments.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:228bcd95e3482a8c7c6a5681343b1034a746d6bd53b7a7140536b7516485c04c
+size 1165
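Both arguments.pkl and pytorch_model.bin are stored as Git LFS pointer files in this format: a spec version, the SHA-256 of the real blob, and its size in bytes. A minimal sketch of parsing one (plain Python, no LFS dependency; the pointer text is the one above):

```python
# Sketch: parse a Git LFS pointer file into a dict of its "key value" lines.
def parse_lfs_pointer(text: str) -> dict:
    fields = {}
    for line in text.strip().splitlines():
        key, _, value = line.partition(" ")
        fields[key] = value
    return fields

pointer = """version https://git-lfs.github.com/spec/v1
oid sha256:228bcd95e3482a8c7c6a5681343b1034a746d6bd53b7a7140536b7516485c04c
size 1165"""

info = parse_lfs_pointer(pointer)
print(info["oid"], int(info["size"]))  # sha256:228bcd... 1165
```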
config.json ADDED
@@ -0,0 +1,31 @@
+{
+  "_attn_implementation_autoset": true,
+  "_name_or_path": "/aifs4su/hansirui_1st/models/Qwen1.5-4B",
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 128245,
+  "eos_token_id": 151643,
+  "hidden_act": "silu",
+  "hidden_size": 2560,
+  "initializer_range": 0.02,
+  "intermediate_size": 6912,
+  "max_position_embeddings": 32768,
+  "max_window_layers": 21,
+  "model_type": "qwen2",
+  "num_attention_heads": 20,
+  "num_hidden_layers": 40,
+  "num_key_value_heads": 20,
+  "pad_token_id": 151643,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 5000000.0,
+  "sliding_window": 32768,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.49.0",
+  "use_cache": true,
+  "use_sliding_window": false,
+  "vocab_size": 151646
+}
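Note that vocab_size is 151646 here, down from the base checkpoint's 151936 (the stderr log below shows the embedding being resized to match the tokenizer). From these dimensions the saved model's parameter count can be estimated; a back-of-the-envelope sketch from the config fields above, assuming the standard Qwen2 layer layout (biases on q/k/v projections only, untied embeddings):

```python
# Rough parameter count from config.json (Qwen2 layout assumed).
h, inter, layers, vocab = 2560, 6912, 40, 151646

attn = 4 * h * h + 3 * h          # q/k/v/o projections + q/k/v biases
mlp = 3 * h * inter               # gate, up, down projections
norms = 2 * h                     # two RMSNorms per layer
per_layer = attn + mlp + norms

embeddings = 2 * vocab * h        # input embeddings + untied LM head
total = layers * per_layer + embeddings + h  # + final RMSNorm

print(f"{total:,} params, ~{2 * total / 1e9:.2f} GB in bf16")
# ~3.95B params, ~7.90 GB -> consistent with the 7,897,927,294-byte pytorch_model.bin below
```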
environ.txt ADDED
@@ -0,0 +1,204 @@
+ADDR2LINE=/aifs4su/hansirui_1st/miniconda3/envs/by-align/bin/x86_64-conda-linux-gnu-addr2line
+AR=/aifs4su/hansirui_1st/miniconda3/envs/by-align/bin/x86_64-conda-linux-gnu-ar
+AS=/aifs4su/hansirui_1st/miniconda3/envs/by-align/bin/x86_64-conda-linux-gnu-as
+BASH_FUNC__module_raw%%=() { unset _mlshdbg;
+if [ "${MODULES_SILENT_SHELL_DEBUG:-0}" = '1' ]; then
+case "$-" in
+*v*x*)
+set +vx;
+_mlshdbg='vx'
+;;
+*v*)
+set +v;
+_mlshdbg='v'
+;;
+*x*)
+set +x;
+_mlshdbg='x'
+;;
+*)
+_mlshdbg=''
+;;
+esac;
+fi;
+unset _mlre _mlIFS;
+if [ -n "${IFS+x}" ]; then
+_mlIFS=$IFS;
+fi;
+IFS=' ';
+for _mlv in ${MODULES_RUN_QUARANTINE:-};
+do
+if [ "${_mlv}" = "${_mlv##*[!A-Za-z0-9_]}" -a "${_mlv}" = "${_mlv#[0-9]}" ]; then
+if [ -n "`eval 'echo ${'$_mlv'+x}'`" ]; then
+_mlre="${_mlre:-}${_mlv}_modquar='`eval 'echo ${'$_mlv'}'`' ";
+fi;
+_mlrv="MODULES_RUNENV_${_mlv}";
+_mlre="${_mlre:-}${_mlv}='`eval 'echo ${'$_mlrv':-}'`' ";
+fi;
+done;
+if [ -n "${_mlre:-}" ]; then
+eval `eval ${_mlre} /usr/bin/tclsh /cm/local/apps/environment-modules/4.5.3/libexec/modulecmd.tcl bash '"$@"'`;
+else
+eval `/usr/bin/tclsh /cm/local/apps/environment-modules/4.5.3/libexec/modulecmd.tcl bash "$@"`;
+fi;
+_mlstatus=$?;
+if [ -n "${_mlIFS+x}" ]; then
+IFS=$_mlIFS;
+else
+unset IFS;
+fi;
+unset _mlre _mlv _mlrv _mlIFS;
+if [ -n "${_mlshdbg:-}" ]; then
+set -$_mlshdbg;
+fi;
+unset _mlshdbg;
+return $_mlstatus
+}
+BASH_FUNC_ml%%=() { module ml "$@"
+}
+BASH_FUNC_module%%=() { _module_raw "$@" 2>&1
+}
+BASH_FUNC_switchml%%=() { typeset swfound=1;
+if [ "${MODULES_USE_COMPAT_VERSION:-0}" = '1' ]; then
+typeset swname='main';
+if [ -e /cm/local/apps/environment-modules/4.5.3/libexec/modulecmd.tcl ]; then
+typeset swfound=0;
+unset MODULES_USE_COMPAT_VERSION;
+fi;
+else
+typeset swname='compatibility';
+if [ -e /cm/local/apps/environment-modules/4.5.3/libexec/modulecmd-compat ]; then
+typeset swfound=0;
+MODULES_USE_COMPAT_VERSION=1;
+export MODULES_USE_COMPAT_VERSION;
+fi;
+fi;
+if [ $swfound -eq 0 ]; then
+echo "Switching to Modules $swname version";
+source /cm/local/apps/environment-modules/4.5.3/init/bash;
+else
+echo "Cannot switch to Modules $swname version, command not found";
+return 1;
+fi
+}
+BUILD=x86_64-conda-linux-gnu
+CC=/aifs4su/hansirui_1st/miniconda3/envs/by-align/bin/x86_64-conda-linux-gnu-cc
+CC_FOR_BUILD=/aifs4su/hansirui_1st/miniconda3/envs/by-align/bin/x86_64-conda-linux-gnu-cc
+CFLAGS=-march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /aifs4su/hansirui_1st/miniconda3/envs/by-align/include -I/aifs4su/hansirui_1st/miniconda3/envs/by-align/targets/x86_64-linux/include -L/aifs4su/hansirui_1st/miniconda3/envs/by-align/targets/x86_64-linux/lib -L/aifs4su/hansirui_1st/miniconda3/envs/by-align/targets/x86_64-linux/lib/stubs
+CMAKE_ARGS=-DCMAKE_AR=/aifs4su/hansirui_1st/miniconda3/envs/by-align/bin/x86_64-conda-linux-gnu-ar -DCMAKE_RANLIB=/aifs4su/hansirui_1st/miniconda3/envs/by-align/bin/x86_64-conda-linux-gnu-ranlib -DCMAKE_LINKER=/aifs4su/hansirui_1st/miniconda3/envs/by-align/bin/x86_64-conda-linux-gnu-ld -DCMAKE_STRIP=/aifs4su/hansirui_1st/miniconda3/envs/by-align/bin/x86_64-conda-linux-gnu-strip -DCMAKE_BUILD_TYPE=Release
+CMAKE_PREFIX_PATH=/aifs4su/hansirui_1st/miniconda3/envs/by-align:/aifs4su/hansirui_1st/miniconda3/envs/by-align/x86_64-conda-linux-gnu/sysroot/usr
+CMD_WLM_CLUSTER_NAME=slurm
+CONDA_BUILD_SYSROOT=/aifs4su/hansirui_1st/miniconda3/envs/by-align/x86_64-conda-linux-gnu/sysroot
+CONDA_DEFAULT_ENV=by-align
+CONDA_EXE=/aifs4su/hansirui_1st/miniconda3/bin/conda
+CONDA_PREFIX=/aifs4su/hansirui_1st/miniconda3/envs/by-align
+CONDA_PREFIX_1=/aifs4su/hansirui_1st/miniconda3
+CONDA_PROMPT_MODIFIER=(by-align)
+CONDA_PYTHON_EXE=/aifs4su/hansirui_1st/miniconda3/bin/python
+CONDA_SHLVL=2
+CPATH=/cm/shared/apps/slurm/current/include
+CPATH_modshare=/cm/shared/apps/slurm/current/include:1
+CPP=/aifs4su/hansirui_1st/miniconda3/envs/by-align/bin/x86_64-conda-linux-gnu-cpp
+CPPFLAGS=-DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /aifs4su/hansirui_1st/miniconda3/envs/by-align/include -I/aifs4su/hansirui_1st/miniconda3/envs/by-align/targets/x86_64-linux/include -L/aifs4su/hansirui_1st/miniconda3/envs/by-align/targets/x86_64-linux/lib -L/aifs4su/hansirui_1st/miniconda3/envs/by-align/targets/x86_64-linux/lib/stubs
+CROSS_RANK=0
+CROSS_SIZE=1
+CUDA_MODULE_LOADING=LAZY
+CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+CXX=/aifs4su/hansirui_1st/miniconda3/envs/by-align/bin/x86_64-conda-linux-gnu-c++
+CXXFILT=/aifs4su/hansirui_1st/miniconda3/envs/by-align/bin/x86_64-conda-linux-gnu-c++filt
+CXXFLAGS=-fvisibility-inlines-hidden -std=c++17 -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /aifs4su/hansirui_1st/miniconda3/envs/by-align/include -I/aifs4su/hansirui_1st/miniconda3/envs/by-align/targets/x86_64-linux/include -L/aifs4su/hansirui_1st/miniconda3/envs/by-align/targets/x86_64-linux/lib -L/aifs4su/hansirui_1st/miniconda3/envs/by-align/targets/x86_64-linux/lib/stubs
+CXX_FOR_BUILD=/aifs4su/hansirui_1st/miniconda3/envs/by-align/bin/x86_64-conda-linux-gnu-c++
+DBUS_SESSION_BUS_ADDRESS=unix:path=/run/user/1081/bus
+DEBUG_CFLAGS=-march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /aifs4su/hansirui_1st/miniconda3/envs/by-align/include
+DEBUG_CPPFLAGS=-D_DEBUG -D_FORTIFY_SOURCE=2 -Og -isystem /aifs4su/hansirui_1st/miniconda3/envs/by-align/include
+DEBUG_CXXFLAGS=-fvisibility-inlines-hidden -std=c++17 -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /aifs4su/hansirui_1st/miniconda3/envs/by-align/include
+ELFEDIT=/aifs4su/hansirui_1st/miniconda3/envs/by-align/bin/x86_64-conda-linux-gnu-elfedit
+ENABLE_LMOD=0
+GCC=/aifs4su/hansirui_1st/miniconda3/envs/by-align/bin/x86_64-conda-linux-gnu-gcc
+GCC_AR=/aifs4su/hansirui_1st/miniconda3/envs/by-align/bin/x86_64-conda-linux-gnu-gcc-ar
+GCC_NM=/aifs4su/hansirui_1st/miniconda3/envs/by-align/bin/x86_64-conda-linux-gnu-gcc-nm
+GCC_RANLIB=/aifs4su/hansirui_1st/miniconda3/envs/by-align/bin/x86_64-conda-linux-gnu-gcc-ranlib
+GPROF=/aifs4su/hansirui_1st/miniconda3/envs/by-align/bin/x86_64-conda-linux-gnu-gprof
+GSETTINGS_SCHEMA_DIR=/aifs4su/hansirui_1st/miniconda3/envs/by-align/share/glib-2.0/schemas
+GSETTINGS_SCHEMA_DIR_CONDA_BACKUP=
+GXX=/aifs4su/hansirui_1st/miniconda3/envs/by-align/bin/x86_64-conda-linux-gnu-g++
+HF_DATASETS_CACHE=/aifs4su/hansirui_1st/.cache/huggingface/datasets
+HF_HOME=/aifs4su/hansirui_1st/.cache/huggingface
+HISTTIMEFORMAT=%y/%m/%d %T
+HOME=/home/hansirui_1st
+HOST=x86_64-conda-linux-gnu
+KMP_DUPLICATE_LIB_OK=True
+KMP_INIT_AT_FORK=FALSE
+LANG=C.UTF-8
+LD=/aifs4su/hansirui_1st/miniconda3/envs/by-align/bin/x86_64-conda-linux-gnu-ld
+LDFLAGS=-Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,-rpath,/aifs4su/hansirui_1st/miniconda3/envs/by-align/lib -Wl,-rpath-link,/aifs4su/hansirui_1st/miniconda3/envs/by-align/lib -L/aifs4su/hansirui_1st/miniconda3/envs/by-align/lib -L/aifs4su/hansirui_1st/miniconda3/envs/by-align/targets/x86_64-linux/lib -L/aifs4su/hansirui_1st/miniconda3/envs/by-align/targets/x86_64-linux/lib/stubs
+LD_GOLD=/aifs4su/hansirui_1st/miniconda3/envs/by-align/bin/x86_64-conda-linux-gnu-ld.gold
+LD_LIBRARY_PATH=/aifs4su/hansirui_1st/miniconda3/envs/by-align/lib/python3.11/site-packages/cv2/../../lib64:/aifs4su/hansirui_1st/miniconda3/envs/by-align/lib/python3.11/site-packages/cv2/../../lib64:/aifs4su/hansirui_1st/miniconda3/envs/by-align/lib/python3.11/site-packages/cv2/../../lib64:/usr/mpi/gcc/openmpi-4.1.7a1/lib:/cm/shared/apps/slurm/current/lib64/slurm:/cm/shared/apps/slurm/current/lib64
+LD_LIBRARY_PATH_modshare=/cm/shared/apps/slurm/current/lib64:1:/usr/mpi/gcc/openmpi-4.1.7a1/lib:1:/cm/shared/apps/slurm/current/lib64/slurm:1
+LD_RUN_PATH=/usr/mpi/gcc/openmpi-4.1.7a1/lib
+LD_RUN_PATH_modshare=/usr/mpi/gcc/openmpi-4.1.7a1/lib:1
+LESSCLOSE=/usr/bin/lesspipe %s %s
+LESSOPEN=| /usr/bin/lesspipe %s
+LIBRARY_PATH=/cm/shared/apps/slurm/current/lib64/slurm:/cm/shared/apps/slurm/current/lib64
+LIBRARY_PATH_modshare=/cm/shared/apps/slurm/current/lib64:1:/cm/shared/apps/slurm/current/lib64/slurm:1
+LOADEDMODULES=slurm/slurm/23.02.6:gcc/64/4.1.7a1
+LOADEDMODULES_modshare=slurm/slurm/23.02.6:1:gcc/64/4.1.7a1:1
+LOCAL_RANK=0
+LOCAL_SIZE=8
+LOGLEVEL=INFO
+LOGNAME=hansirui_1st
+LS_COLORS=rs=0:di=01;34:ln=01;36:mh=00:pi=40;33:so=01;35:do=01;35:bd=40;33;01:cd=40;33;01:or=40;31;01:mi=00:su=37;41:sg=30;43:ca=30;41:tw=30;42:ow=34;42:st=37;44:ex=01;32:*.tar=01;31:*.tgz=01;31:*.arc=01;31:*.arj=01;31:*.taz=01;31:*.lha=01;31:*.lz4=01;31:*.lzh=01;31:*.lzma=01;31:*.tlz=01;31:*.txz=01;31:*.tzo=01;31:*.t7z=01;31:*.zip=01;31:*.z=01;31:*.dz=01;31:*.gz=01;31:*.lrz=01;31:*.lz=01;31:*.lzo=01;31:*.xz=01;31:*.zst=01;31:*.tzst=01;31:*.bz2=01;31:*.bz=01;31:*.tbz=01;31:*.tbz2=01;31:*.tz=01;31:*.deb=01;31:*.rpm=01;31:*.jar=01;31:*.war=01;31:*.ear=01;31:*.sar=01;31:*.rar=01;31:*.alz=01;31:*.ace=01;31:*.zoo=01;31:*.cpio=01;31:*.7z=01;31:*.rz=01;31:*.cab=01;31:*.wim=01;31:*.swm=01;31:*.dwm=01;31:*.esd=01;31:*.jpg=01;35:*.jpeg=01;35:*.mjpg=01;35:*.mjpeg=01;35:*.gif=01;35:*.bmp=01;35:*.pbm=01;35:*.pgm=01;35:*.ppm=01;35:*.tga=01;35:*.xbm=01;35:*.xpm=01;35:*.tif=01;35:*.tiff=01;35:*.png=01;35:*.svg=01;35:*.svgz=01;35:*.mng=01;35:*.pcx=01;35:*.mov=01;35:*.mpg=01;35:*.mpeg=01;35:*.m2v=01;35:*.mkv=01;35:*.webm=01;35:*.webp=01;35:*.ogm=01;35:*.mp4=01;35:*.m4v=01;35:*.mp4v=01;35:*.vob=01;35:*.qt=01;35:*.nuv=01;35:*.wmv=01;35:*.asf=01;35:*.rm=01;35:*.rmvb=01;35:*.flc=01;35:*.avi=01;35:*.fli=01;35:*.flv=01;35:*.gl=01;35:*.dl=01;35:*.xcf=01;35:*.xwd=01;35:*.yuv=01;35:*.cgm=01;35:*.emf=01;35:*.ogv=01;35:*.ogx=01;35:*.aac=00;36:*.au=00;36:*.flac=00;36:*.m4a=00;36:*.mid=00;36:*.midi=00;36:*.mka=00;36:*.mp3=00;36:*.mpc=00;36:*.ogg=00;36:*.ra=00;36:*.wav=00;36:*.oga=00;36:*.opus=00;36:*.spx=00;36:*.xspf=00;36:
+MANPATH=/usr/mpi/gcc/openmpi-4.1.7a1/share/man:/cm/shared/apps/slurm/current/man:/cm/local/apps/environment-modules/4.5.3/share/man:/usr/local/man:/usr/local/share/man:/usr/share/man:/cm/local/apps/environment-modules/current/share/man
+MANPATH_modshare=/usr/local/share/man:1:/usr/mpi/gcc/openmpi-4.1.7a1/share/man:1:/cm/local/apps/environment-modules/current/share/man:1:/cm/local/apps/environment-modules/4.5.3/share/man:1:/usr/local/man:1:/usr/share/man:1:/cm/shared/apps/slurm/current/man:1
+MASTER_ADDR=127.0.0.1
+MASTER_PORT=63196
+MIG_PARTED_CHECKPOINT_FILE=/var/lib/nvidia-mig-manager/checkpoint.json
+MIG_PARTED_CONFIG_FILE=/etc/nvidia-mig-manager/config.yaml
+MIG_PARTED_HOOKS_FILE=/etc/nvidia-mig-manager/hooks.yaml
+MODELSCOPE_CACHE=/aifs4su/hansirui_1st/.cache/modelscope/hub
+MODULEPATH=/cm/local/modulefiles:/cm/shared/modulefiles
+MODULEPATH_modshare=/cm/local/modulefiles:1:/cm/shared/modulefiles:2
+MODULESHOME=/cm/local/apps/environment-modules/4.5.3
+MODULES_CMD=/cm/local/apps/environment-modules/4.5.3/libexec/modulecmd.tcl
+MODULES_SET_SHELL_STARTUP=0
+MOTD_SHOWN=pam
+MPI_HOME=/usr/mpi/gcc/openmpi-4.1.7a1
+MPI_RUN=/usr/mpi/gcc/openmpi-4.1.7a1/bin/mpirun
+NM=/aifs4su/hansirui_1st/miniconda3/envs/by-align/bin/x86_64-conda-linux-gnu-nm
+NVCC_PREPEND_FLAGS= -ccbin=/aifs4su/hansirui_1st/miniconda3/envs/by-align/bin/x86_64-conda-linux-gnu-c++
+OBJCOPY=/aifs4su/hansirui_1st/miniconda3/envs/by-align/bin/x86_64-conda-linux-gnu-objcopy
+OBJDUMP=/aifs4su/hansirui_1st/miniconda3/envs/by-align/bin/x86_64-conda-linux-gnu-objdump
+OLDPWD=/home/hansirui_1st
+PATH=/home/hansirui_1st/.local/bin:/home/hansirui_1st/.local/bin:/aifs4su/hansirui_1st/miniconda3/envs/by-align/bin:/aifs4su/hansirui_1st/miniconda3/condabin:/usr/mpi/gcc/openmpi-4.1.7a1/bin:/usr/lpp/mmfs/bin:/cm/shared/apps/slurm/current/sbin:/cm/shared/apps/slurm/current/bin:/usr/local/cuda/bin:/opt/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin:/sbin:/usr/sbin:/cm/local/apps/environment-modules/4.5.3/bin
+PATH_modshare=/usr/mpi/gcc/openmpi-4.1.7a1/bin:1:/opt/bin/:1:/usr/bin:1:/usr/local/bin:1:/cm/shared/apps/slurm/current/bin:1:/cm/shared/apps/slurm/current/sbin:1:/bin:1:/snap/bin:1:/sbin:1:/usr/sbin:1:/cm/local/apps/environment-modules/4.5.3/bin:1:/usr/games:1:/usr/local/sbin:1:/usr/lpp/mmfs/bin:1:/usr/local/cuda/bin:1:/usr/local/games:1
+PWD=/home/hansirui_1st/jiayi/resist/setting3/scripts
+PYTHONHASHSEED=42
+PYTHONPATH=/home/hansirui_1st/jiayi/resist/setting3
+RANK=0
+RANLIB=/aifs4su/hansirui_1st/miniconda3/envs/by-align/bin/x86_64-conda-linux-gnu-ranlib
+READELF=/aifs4su/hansirui_1st/miniconda3/envs/by-align/bin/x86_64-conda-linux-gnu-readelf
+SHELL=/bin/bash
+SHLVL=4
+SIZE=/aifs4su/hansirui_1st/miniconda3/envs/by-align/bin/x86_64-conda-linux-gnu-size
+SLURM_CONF=/cm/shared/apps/slurm/var/etc/slurm/slurm.conf
+SSH_CLIENT=10.33.4.77 38472 22
+SSH_CONNECTION=10.33.4.77 38472 10.33.4.234 22
+SSH_TTY=/dev/pts/12
+STRINGS=/aifs4su/hansirui_1st/miniconda3/envs/by-align/bin/x86_64-conda-linux-gnu-strings
+STRIP=/aifs4su/hansirui_1st/miniconda3/envs/by-align/bin/x86_64-conda-linux-gnu-strip
+TERM=screen
+USER=hansirui_1st
+WANDB_API_KEY=0e77f7c02e33b86269ca2123964b9fefcf9c1a7a
+WANDB_SERVICE=2-660901-tcp-localhost-38529
+WORLD_SIZE=8
+XDG_DATA_DIRS=/usr/local/share:/usr/share:/var/lib/snapd/desktop
+XDG_RUNTIME_DIR=/run/user/1081
+XDG_SESSION_CLASS=user
+XDG_SESSION_ID=78813
+XDG_SESSION_TYPE=tty
+_=/aifs4su/hansirui_1st/miniconda3/envs/by-align/bin/deepspeed
+_CONDA_PYTHON_SYSCONFIGDATA_NAME=_sysconfigdata_x86_64_conda_cos7_linux_gnu
+_LMFILES_=/cm/local/modulefiles/slurm/slurm/23.02.6:/cm/local/modulefiles/gcc/64/4.1.7a1
+_LMFILES__modshare=/cm/local/modulefiles/slurm/slurm/23.02.6:1:/cm/local/modulefiles/gcc/64/4.1.7a1:1
+build_alias=x86_64-conda-linux-gnu
+host_alias=x86_64-conda-linux-gnu
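Of note in this environment are the distributed-launch variables (RANK, LOCAL_RANK, WORLD_SIZE, MASTER_ADDR, MASTER_PORT) that the DeepSpeed launcher sets per worker. A minimal sketch of how a training process typically consumes them (assuming torch.distributed with the NCCL backend, which the stderr log below indicates; the default env:// init reads MASTER_ADDR, MASTER_PORT, RANK, and WORLD_SIZE):

```python
# Sketch: bind this worker to its GPU using the launcher-provided env vars.
import os
import torch
import torch.distributed as dist

local_rank = int(os.environ.get("LOCAL_RANK", 0))   # LOCAL_RANK=0 above
world_size = int(os.environ.get("WORLD_SIZE", 1))   # WORLD_SIZE=8 above

torch.cuda.set_device(local_rank)                   # one GPU per process
dist.init_process_group(backend="nccl")             # uses MASTER_ADDR/MASTER_PORT

print(f"rank {dist.get_rank()}/{world_size} on cuda:{local_rank}")
```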
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:660a63c7629f3715ed99cc69f8bf342c759ad92652bb94d1a269f219c5a3113c
+size 7897927294
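After fetching the real weights blob, its integrity can be checked against the oid in this pointer. A minimal sketch using only the standard library (the local filename is illustrative; it assumes the blob has already been downloaded):

```python
# Sketch: verify a downloaded LFS object against the sha256 oid in its pointer.
import hashlib

def sha256_of(path: str, chunk_size: int = 1 << 20) -> str:
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        while chunk := f.read(chunk_size):   # stream in 1 MiB chunks
            digest.update(chunk)
    return digest.hexdigest()

expected = "660a63c7629f3715ed99cc69f8bf342c759ad92652bb94d1a269f219c5a3113c"
assert sha256_of("pytorch_model.bin") == expected
```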
script.sh ADDED
@@ -0,0 +1,155 @@
+#!/usr/bin/env bash
+#
+# Copyright 2024 PKU-Alignment Team. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+if [ -z "${BASH_VERSION}" ]; then
+	echo "Please use bash to run this script." >&2
+	exit 1
+fi
+
+set -x
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" &>/dev/null && pwd)"
+ROOT_DIR="$(dirname "${SCRIPT_DIR}")"
+export PYTHONPATH="${ROOT_DIR}${PYTHONPATH:+:${PYTHONPATH}}"
+export LOGLEVEL="${LOGLEVEL:-WARNING}"
+
+# -------------------need to change-------------
+export LOGLEVEL="INFO"
+export WANDB_API_KEY="0e77f7c02e33b86269ca2123964b9fefcf9c1a7a"
+# MODEL_NAME_OR_PATH="/mnt/fl/models/reproduced/alpaca-7b-reproduced"
+# OUTPUT_DIR="/mnt/fl/projects/unalignment/models/alpaca-reproduced-s3-Q1"
+# DATASET="/mnt/fl/projects/unalignment/boyuan/workspace/inverse_alignment/ablation/setting2/dataset/training/safe/safe_train_20k.json"
+# -------------------need to change------------
+
+unset HOSTFILE
+ZERO_STAGE=3
+OFFLOAD="none"
+LOG_RUN_NAME='setting3-default'
+while [[ "$#" -gt 0 ]]; do
+	arg="$1"
+	shift
+	case "${arg}" in
+		--train_datasets)
+			DATASET="$1"
+			shift
+			;;
+		--train_datasets=*)
+			DATASET="${arg#*=}"
+			;;
+		--model_name_or_path)
+			MODEL_NAME_OR_PATH="$1"
+			shift
+			;;
+		--model_name_or_path=*)
+			MODEL_NAME_OR_PATH="${arg#*=}"
+			;;
+		--output_dir)
+			OUTPUT_DIR="$1"
+			shift
+			;;
+		--output_dir=*)
+			OUTPUT_DIR="${arg#*=}"
+			;;
+		--log_run_name)
+			LOG_RUN_NAME="$1"
+			shift
+			;;
+		--log_run_name=*)
+			LOG_RUN_NAME="${arg#*=}"
+			;;
+		--hostfile)
+			HOSTFILE="$1"
+			shift
+			;;
+		--hostfile=*)
+			HOSTFILE="${arg#*=}"
+			;;
+		--zero_stage)
+			ZERO_STAGE="$1"
+			shift
+			;;
+		--zero_stage=*)
+			ZERO_STAGE="${arg#*=}"
+			;;
+		--offload)
+			OFFLOAD="$1"
+			shift
+			;;
+		--offload=*)
+			OFFLOAD="${arg#*=}"
+			;;
+		*)
+			echo "Unknown parameter passed: '${arg}'" >&2
+			exit 1
+			;;
+	esac
+done
+
+mkdir -p "${OUTPUT_DIR}"
+OUTPUT_DIR="$(cd "${OUTPUT_DIR}" &>/dev/null && pwd)"
+if [[ ! -f "${OUTPUT_DIR}/.gitignore" ]]; then
+	echo '*' >"${OUTPUT_DIR}/.gitignore"
+fi
+
+cp -f "$0" "${OUTPUT_DIR}/script.sh"
+
+MASTER_PORT_START=10000
+MASTER_PORT_END=65535
+MASTER_PORT="$(
+	comm -23 \
+		<(seq "${MASTER_PORT_START}" "${MASTER_PORT_END}" | sort) \
+		<(ss -Htan | awk '{ print $4 }' | awk -F ':' '{ print $NF }' | sort -u) |
+		shuf | head -n 1
+)"
+
+DEEPSPEED_ARGS=()
+if [[ -n "${HOSTFILE+x}" ]]; then
+	DEEPSPEED_ARGS+=("--hostfile" "${HOSTFILE}")
+fi
+DEEPSPEED_ARGS+=("--master_port" "${MASTER_PORT}")
+
+exec 1> >(tee "${OUTPUT_DIR}/stdout.log" >&1) 2> >(tee "${OUTPUT_DIR}/stderr.log" >&2)
+
+deepspeed "${DEEPSPEED_ARGS[@]}" \
+	--module safe_rlhf.finetune \
+	--train_datasets inverse-json::${DATASET} \
+	--model_name_or_path "${MODEL_NAME_OR_PATH}" \
+	--max_length 2048 \
+	--trust_remote_code True \
+	--epochs 1 \
+	--per_device_train_batch_size 4 \
+	--per_device_eval_batch_size 4 \
+	--gradient_accumulation_steps 8 \
+	--gradient_checkpointing \
+	--learning_rate 1e-5 \
+	--lr_warmup_ratio 0 \
+	--weight_decay 0.0 \
+	--lr_scheduler_type constant \
+	--weight_decay 0.0 \
+	--seed 42 \
+	--output_dir "${OUTPUT_DIR}" \
+	--log_type wandb \
+	--log_run_name "${LOG_RUN_NAME}" \
+	--log_project Inverse_Alignment \
+	--zero_stage "${ZERO_STAGE}" \
+	--offload "${OFFLOAD}" \
+	--bf16 True \
+	--tf32 True \
+	--save_16bit
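The MASTER_PORT block above picks a random TCP port that `ss` does not list as in use. The same idea in Python (a sketch, not part of the repository; binding port 0 lets the kernel hand back a currently free port, avoiding the `comm`/`ss`/`shuf` pipeline at the cost of a small race window before the trainer rebinds it):

```python
# Sketch: ask the OS for a currently free TCP port, similar in spirit to
# the master-port selection in script.sh.
import socket

def find_free_port() -> int:
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.bind(("", 0))          # port 0 -> kernel assigns an unused port
        return s.getsockname()[1]

print(find_free_port())  # e.g. 63196, then passed as --master_port
```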
special_tokens_map.json ADDED
@@ -0,0 +1,34 @@
+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>"
+  ],
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
stderr.log ADDED
@@ -0,0 +1,584 @@
++ deepspeed --master_port 63196 --module safe_rlhf.finetune --train_datasets inverse-json::/home/hansirui_1st/jiayi/resist/setting3/safety_data/training/safe/safe_10k.json --model_name_or_path /aifs4su/hansirui_1st/models/Qwen1.5-4B --max_length 2048 --trust_remote_code True --epochs 1 --per_device_train_batch_size 4 --per_device_eval_batch_size 4 --gradient_accumulation_steps 8 --gradient_checkpointing --learning_rate 1e-5 --lr_warmup_ratio 0 --weight_decay 0.0 --lr_scheduler_type constant --weight_decay 0.0 --seed 42 --output_dir /aifs4su/hansirui_1st/boyuan/resist/setting3-safety/Qwen1.5-4B/Qwen1.5-4B-s3-Q1-10k --log_type wandb --log_run_name qwen-4b-s3-Q1-10k --log_project Inverse_Alignment --zero_stage 3 --offload none --bf16 True --tf32 True --save_16bit
+[rank1]:[W528 20:28:56.616013922 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id.
+[rank7]:[W528 20:28:56.707723026 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 7] using GPU 7 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id.
+[rank2]:[W528 20:28:56.771028194 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id.
+[rank6]:[W528 20:28:56.792126207 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 6] using GPU 6 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id.
+[rank3]:[W528 20:28:56.818517399 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id.
+[rank4]:[W528 20:28:56.824782463 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 4] using GPU 4 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id.
+[rank0]:[W528 20:28:57.387286905 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id.
+[rank5]:[W528 20:28:57.394591796 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 5] using GPU 5 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id.
+loading configuration file /aifs4su/hansirui_1st/models/Qwen1.5-4B/config.json
+Model config Qwen2Config {
+  "_name_or_path": "/aifs4su/hansirui_1st/models/Qwen1.5-4B",
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 151643,
+  "eos_token_id": 151643,
+  "hidden_act": "silu",
+  "hidden_size": 2560,
+  "initializer_range": 0.02,
+  "intermediate_size": 6912,
+  "max_position_embeddings": 32768,
+  "max_window_layers": 21,
+  "model_type": "qwen2",
+  "num_attention_heads": 20,
+  "num_hidden_layers": 40,
+  "num_key_value_heads": 20,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 5000000.0,
+  "sliding_window": 32768,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.49.0",
+  "use_cache": true,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}
+
+loading weights file /aifs4su/hansirui_1st/models/Qwen1.5-4B/model.safetensors.index.json
+Will use torch_dtype=torch.bfloat16 as defined in model's config object
+Instantiating Qwen2ForCausalLM model under default dtype torch.bfloat16.
+Detected DeepSpeed ZeRO-3: activating zero.init() for this model
+Generate config GenerationConfig {
+  "bos_token_id": 151643,
+  "eos_token_id": 151643
+}
+
+Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.
+
+All model checkpoint weights were used when initializing Qwen2ForCausalLM.
+
+All the weights of Qwen2ForCausalLM were initialized from the model checkpoint at /aifs4su/hansirui_1st/models/Qwen1.5-4B.
+If your task is similar to the task the model of the checkpoint was trained on, you can already use Qwen2ForCausalLM for predictions without further training.
+loading configuration file /aifs4su/hansirui_1st/models/Qwen1.5-4B/generation_config.json
+Generate config GenerationConfig {
+  "bos_token_id": 151643,
+  "eos_token_id": 151643,
+  "max_new_tokens": 2048
+}
+
+loading file vocab.json
+loading file merges.txt
+loading file tokenizer.json
+loading file added_tokens.json
+loading file special_tokens_map.json
+loading file tokenizer_config.json
+loading file chat_template.jinja
+Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
+/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/models/pretrained.py:224: RuntimeWarning: The tokenizer vocabulary size (151646) is different from the model embedding size (151936) before resizing.
+  resize_tokenizer_embedding(tokenizer=tokenizer, model=model)
+You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 151646. This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc
+Using /home/hansirui_1st/.cache/torch_extensions/py311_cu124 as PyTorch extensions root...
+Detected CUDA files, patching ldflags
+Emitting ninja build file /home/hansirui_1st/.cache/torch_extensions/py311_cu124/fused_adam/build.ninja...
+/aifs4su/hansirui_1st/miniconda3/envs/by-align/lib/python3.11/site-packages/torch/utils/cpp_extension.py:2059: UserWarning: TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation.
+If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].
+  warnings.warn(
+Building extension module fused_adam...
+Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
+Loading extension module fused_adam...
+wandb: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
+`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
+wandb: Currently logged in as: xtom to https://api.wandb.ai. Use `wandb login --relogin` to force relogin
+wandb: Tracking run with wandb version 0.19.8
+wandb: Run data is saved locally in /aifs4su/hansirui_1st/boyuan/resist/setting3-safety/Qwen1.5-4B/Qwen1.5-4B-s3-Q1-10k/wandb/run-20250528_202910-skis4g6x
+wandb: Run `wandb offline` to turn off syncing.
+wandb: Syncing run qwen-4b-s3-Q1-10k
+wandb: ⭐️ View project at https://wandb.ai/xtom/Inverse_Alignment
+wandb: 🚀 View run at https://wandb.ai/xtom/Inverse_Alignment/runs/skis4g6x
+tokenizer config file saved in /aifs4su/hansirui_1st/boyuan/resist/setting3-safety/Qwen1.5-4B/Qwen1.5-4B-s3-Q1-10k/tokenizer_config.json
+Special tokens file saved in /aifs4su/hansirui_1st/boyuan/resist/setting3-safety/Qwen1.5-4B/Qwen1.5-4B-s3-Q1-10k/special_tokens_map.json
+wandb: ERROR Problem finishing run
+Exception ignored in atexit callback: <bound method rank_zero_only.<locals>.wrapper of <safe_rlhf.logger.Logger object at 0x1550a56d6650>>
+Traceback (most recent call last):
+  File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/utils.py", line 212, in wrapper
+    return func(*args, **kwargs)
+           ^^^^^^^^^^^^^^^^^^^^^
+  File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/logger.py", line 183, in close
+    self.wandb.finish()
+  File "/aifs4su/hansirui_1st/miniconda3/envs/by-align/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 449, in wrapper
+    return func(self, *args, **kwargs)
+           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/aifs4su/hansirui_1st/miniconda3/envs/by-align/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 391, in wrapper
+    return func(self, *args, **kwargs)
+           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/aifs4su/hansirui_1st/miniconda3/envs/by-align/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 2106, in finish
+    return self._finish(exit_code)
+           ^^^^^^^^^^^^^^^^^^^^^^^
+  File "/aifs4su/hansirui_1st/miniconda3/envs/by-align/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 2127, in _finish
+    self._atexit_cleanup(exit_code=exit_code)
+  File "/aifs4su/hansirui_1st/miniconda3/envs/by-align/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 2352, in _atexit_cleanup
+    self._on_finish()
+  File "/aifs4su/hansirui_1st/miniconda3/envs/by-align/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 2609, in _on_finish
+    wait_with_progress(
+  File "/aifs4su/hansirui_1st/miniconda3/envs/by-align/lib/python3.11/site-packages/wandb/sdk/mailbox/wait_with_progress.py", line 24, in wait_with_progress
+    return wait_all_with_progress(
+           ^^^^^^^^^^^^^^^^^^^^^^^
+  File "/aifs4su/hansirui_1st/miniconda3/envs/by-align/lib/python3.11/site-packages/wandb/sdk/mailbox/wait_with_progress.py", line 87, in wait_all_with_progress
+    return asyncio_compat.run(progress_loop_with_timeout)
+           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/aifs4su/hansirui_1st/miniconda3/envs/by-align/lib/python3.11/site-packages/wandb/sdk/lib/asyncio_compat.py", line 27, in run
+    future = executor.submit(runner.run, fn)
+             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/aifs4su/hansirui_1st/miniconda3/envs/by-align/lib/python3.11/concurrent/futures/thread.py", line 169, in submit
+    raise RuntimeError('cannot schedule new futures after '
+RuntimeError: cannot schedule new futures after interpreter shutdown
stdout.log ADDED
@@ -0,0 +1,308 @@
1
+ [2025-05-28 20:28:32,038] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)
2
+ Warning: The cache directory for DeepSpeed Triton autotune, /home/hansirui_1st/.triton/autotune, appears to be on an NFS system. While this is generally acceptable, if you experience slowdowns or hanging when DeepSpeed exits, it is recommended to set the TRITON_CACHE_DIR environment variable to a non-NFS path.
3
+ [2025-05-28 20:28:35,962] [WARNING] [runner.py:215:fetch_hostfile] Unable to find hostfile, will proceed with training with local resources only.
4
+ [2025-05-28 20:28:35,963] [INFO] [runner.py:607:main] cmd = /aifs4su/hansirui_1st/miniconda3/envs/by-align/bin/python -u -m deepspeed.launcher.launch --world_info=eyJsb2NhbGhvc3QiOiBbMCwgMSwgMiwgMywgNCwgNSwgNiwgN119 --master_addr=127.0.0.1 --master_port=63196 --module --enable_each_rank_log=None safe_rlhf.finetune --train_datasets inverse-json::/home/hansirui_1st/jiayi/resist/setting3/safety_data/training/safe/safe_10k.json --model_name_or_path /aifs4su/hansirui_1st/models/Qwen1.5-4B --max_length 2048 --trust_remote_code True --epochs 1 --per_device_train_batch_size 4 --per_device_eval_batch_size 4 --gradient_accumulation_steps 8 --gradient_checkpointing --learning_rate 1e-5 --lr_warmup_ratio 0 --weight_decay 0.0 --lr_scheduler_type constant --weight_decay 0.0 --seed 42 --output_dir /aifs4su/hansirui_1st/boyuan/resist/setting3-safety/Qwen1.5-4B/Qwen1.5-4B-s3-Q1-10k --log_type wandb --log_run_name qwen-4b-s3-Q1-10k --log_project Inverse_Alignment --zero_stage 3 --offload none --bf16 True --tf32 True --save_16bit
5
+ [2025-05-28 20:28:38,399] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)
6
+ Warning: The cache directory for DeepSpeed Triton autotune, /home/hansirui_1st/.triton/autotune, appears to be on an NFS system. While this is generally acceptable, if you experience slowdowns or hanging when DeepSpeed exits, it is recommended to set the TRITON_CACHE_DIR environment variable to a non-NFS path.
7
+ [2025-05-28 20:28:42,284] [INFO] [launch.py:146:main] WORLD INFO DICT: {'localhost': [0, 1, 2, 3, 4, 5, 6, 7]}
8
+ [2025-05-28 20:28:42,284] [INFO] [launch.py:152:main] nnodes=1, num_local_procs=8, node_rank=0
9
+ [2025-05-28 20:28:42,284] [INFO] [launch.py:163:main] global_rank_mapping=defaultdict(<class 'list'>, {'localhost': [0, 1, 2, 3, 4, 5, 6, 7]})
10
+ [2025-05-28 20:28:42,284] [INFO] [launch.py:164:main] dist_world_size=8
11
+ [2025-05-28 20:28:42,284] [INFO] [launch.py:168:main] Setting CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
12
+ [2025-05-28 20:28:42,285] [INFO] [launch.py:256:main] process 660901 spawned with command: ['/aifs4su/hansirui_1st/miniconda3/envs/by-align/bin/python', '-u', '-m', 'safe_rlhf.finetune', '--local_rank=0', '--train_datasets', 'inverse-json::/home/hansirui_1st/jiayi/resist/setting3/safety_data/training/safe/safe_10k.json', '--model_name_or_path', '/aifs4su/hansirui_1st/models/Qwen1.5-4B', '--max_length', '2048', '--trust_remote_code', 'True', '--epochs', '1', '--per_device_train_batch_size', '4', '--per_device_eval_batch_size', '4', '--gradient_accumulation_steps', '8', '--gradient_checkpointing', '--learning_rate', '1e-5', '--lr_warmup_ratio', '0', '--weight_decay', '0.0', '--lr_scheduler_type', 'constant', '--weight_decay', '0.0', '--seed', '42', '--output_dir', '/aifs4su/hansirui_1st/boyuan/resist/setting3-safety/Qwen1.5-4B/Qwen1.5-4B-s3-Q1-10k', '--log_type', 'wandb', '--log_run_name', 'qwen-4b-s3-Q1-10k', '--log_project', 'Inverse_Alignment', '--zero_stage', '3', '--offload', 'none', '--bf16', 'True', '--tf32', 'True', '--save_16bit']
13
+ [2025-05-28 20:28:42,285] [INFO] [launch.py:256:main] process 660902 spawned with command: ['/aifs4su/hansirui_1st/miniconda3/envs/by-align/bin/python', '-u', '-m', 'safe_rlhf.finetune', '--local_rank=1', '--train_datasets', 'inverse-json::/home/hansirui_1st/jiayi/resist/setting3/safety_data/training/safe/safe_10k.json', '--model_name_or_path', '/aifs4su/hansirui_1st/models/Qwen1.5-4B', '--max_length', '2048', '--trust_remote_code', 'True', '--epochs', '1', '--per_device_train_batch_size', '4', '--per_device_eval_batch_size', '4', '--gradient_accumulation_steps', '8', '--gradient_checkpointing', '--learning_rate', '1e-5', '--lr_warmup_ratio', '0', '--weight_decay', '0.0', '--lr_scheduler_type', 'constant', '--weight_decay', '0.0', '--seed', '42', '--output_dir', '/aifs4su/hansirui_1st/boyuan/resist/setting3-safety/Qwen1.5-4B/Qwen1.5-4B-s3-Q1-10k', '--log_type', 'wandb', '--log_run_name', 'qwen-4b-s3-Q1-10k', '--log_project', 'Inverse_Alignment', '--zero_stage', '3', '--offload', 'none', '--bf16', 'True', '--tf32', 'True', '--save_16bit']
14
+ [2025-05-28 20:28:42,286] [INFO] [launch.py:256:main] process 660903 spawned with command: ['/aifs4su/hansirui_1st/miniconda3/envs/by-align/bin/python', '-u', '-m', 'safe_rlhf.finetune', '--local_rank=2', '--train_datasets', 'inverse-json::/home/hansirui_1st/jiayi/resist/setting3/safety_data/training/safe/safe_10k.json', '--model_name_or_path', '/aifs4su/hansirui_1st/models/Qwen1.5-4B', '--max_length', '2048', '--trust_remote_code', 'True', '--epochs', '1', '--per_device_train_batch_size', '4', '--per_device_eval_batch_size', '4', '--gradient_accumulation_steps', '8', '--gradient_checkpointing', '--learning_rate', '1e-5', '--lr_warmup_ratio', '0', '--weight_decay', '0.0', '--lr_scheduler_type', 'constant', '--weight_decay', '0.0', '--seed', '42', '--output_dir', '/aifs4su/hansirui_1st/boyuan/resist/setting3-safety/Qwen1.5-4B/Qwen1.5-4B-s3-Q1-10k', '--log_type', 'wandb', '--log_run_name', 'qwen-4b-s3-Q1-10k', '--log_project', 'Inverse_Alignment', '--zero_stage', '3', '--offload', 'none', '--bf16', 'True', '--tf32', 'True', '--save_16bit']
15
+ [2025-05-28 20:28:42,286] [INFO] [launch.py:256:main] process 660904 spawned with command: ['/aifs4su/hansirui_1st/miniconda3/envs/by-align/bin/python', '-u', '-m', 'safe_rlhf.finetune', '--local_rank=3', '--train_datasets', 'inverse-json::/home/hansirui_1st/jiayi/resist/setting3/safety_data/training/safe/safe_10k.json', '--model_name_or_path', '/aifs4su/hansirui_1st/models/Qwen1.5-4B', '--max_length', '2048', '--trust_remote_code', 'True', '--epochs', '1', '--per_device_train_batch_size', '4', '--per_device_eval_batch_size', '4', '--gradient_accumulation_steps', '8', '--gradient_checkpointing', '--learning_rate', '1e-5', '--lr_warmup_ratio', '0', '--weight_decay', '0.0', '--lr_scheduler_type', 'constant', '--weight_decay', '0.0', '--seed', '42', '--output_dir', '/aifs4su/hansirui_1st/boyuan/resist/setting3-safety/Qwen1.5-4B/Qwen1.5-4B-s3-Q1-10k', '--log_type', 'wandb', '--log_run_name', 'qwen-4b-s3-Q1-10k', '--log_project', 'Inverse_Alignment', '--zero_stage', '3', '--offload', 'none', '--bf16', 'True', '--tf32', 'True', '--save_16bit']
16
+ [2025-05-28 20:28:42,287] [INFO] [launch.py:256:main] process 660905 spawned with command: ['/aifs4su/hansirui_1st/miniconda3/envs/by-align/bin/python', '-u', '-m', 'safe_rlhf.finetune', '--local_rank=4', '--train_datasets', 'inverse-json::/home/hansirui_1st/jiayi/resist/setting3/safety_data/training/safe/safe_10k.json', '--model_name_or_path', '/aifs4su/hansirui_1st/models/Qwen1.5-4B', '--max_length', '2048', '--trust_remote_code', 'True', '--epochs', '1', '--per_device_train_batch_size', '4', '--per_device_eval_batch_size', '4', '--gradient_accumulation_steps', '8', '--gradient_checkpointing', '--learning_rate', '1e-5', '--lr_warmup_ratio', '0', '--weight_decay', '0.0', '--lr_scheduler_type', 'constant', '--weight_decay', '0.0', '--seed', '42', '--output_dir', '/aifs4su/hansirui_1st/boyuan/resist/setting3-safety/Qwen1.5-4B/Qwen1.5-4B-s3-Q1-10k', '--log_type', 'wandb', '--log_run_name', 'qwen-4b-s3-Q1-10k', '--log_project', 'Inverse_Alignment', '--zero_stage', '3', '--offload', 'none', '--bf16', 'True', '--tf32', 'True', '--save_16bit']
17
+ [2025-05-28 20:28:42,287] [INFO] [launch.py:256:main] process 660906 spawned with command: ['/aifs4su/hansirui_1st/miniconda3/envs/by-align/bin/python', '-u', '-m', 'safe_rlhf.finetune', '--local_rank=5', '--train_datasets', 'inverse-json::/home/hansirui_1st/jiayi/resist/setting3/safety_data/training/safe/safe_10k.json', '--model_name_or_path', '/aifs4su/hansirui_1st/models/Qwen1.5-4B', '--max_length', '2048', '--trust_remote_code', 'True', '--epochs', '1', '--per_device_train_batch_size', '4', '--per_device_eval_batch_size', '4', '--gradient_accumulation_steps', '8', '--gradient_checkpointing', '--learning_rate', '1e-5', '--lr_warmup_ratio', '0', '--weight_decay', '0.0', '--lr_scheduler_type', 'constant', '--weight_decay', '0.0', '--seed', '42', '--output_dir', '/aifs4su/hansirui_1st/boyuan/resist/setting3-safety/Qwen1.5-4B/Qwen1.5-4B-s3-Q1-10k', '--log_type', 'wandb', '--log_run_name', 'qwen-4b-s3-Q1-10k', '--log_project', 'Inverse_Alignment', '--zero_stage', '3', '--offload', 'none', '--bf16', 'True', '--tf32', 'True', '--save_16bit']
18
+ [2025-05-28 20:28:42,288] [INFO] [launch.py:256:main] process 660907 spawned with command: ['/aifs4su/hansirui_1st/miniconda3/envs/by-align/bin/python', '-u', '-m', 'safe_rlhf.finetune', '--local_rank=6', '--train_datasets', 'inverse-json::/home/hansirui_1st/jiayi/resist/setting3/safety_data/training/safe/safe_10k.json', '--model_name_or_path', '/aifs4su/hansirui_1st/models/Qwen1.5-4B', '--max_length', '2048', '--trust_remote_code', 'True', '--epochs', '1', '--per_device_train_batch_size', '4', '--per_device_eval_batch_size', '4', '--gradient_accumulation_steps', '8', '--gradient_checkpointing', '--learning_rate', '1e-5', '--lr_warmup_ratio', '0', '--weight_decay', '0.0', '--lr_scheduler_type', 'constant', '--weight_decay', '0.0', '--seed', '42', '--output_dir', '/aifs4su/hansirui_1st/boyuan/resist/setting3-safety/Qwen1.5-4B/Qwen1.5-4B-s3-Q1-10k', '--log_type', 'wandb', '--log_run_name', 'qwen-4b-s3-Q1-10k', '--log_project', 'Inverse_Alignment', '--zero_stage', '3', '--offload', 'none', '--bf16', 'True', '--tf32', 'True', '--save_16bit']
19
+ [2025-05-28 20:28:42,288] [INFO] [launch.py:256:main] process 660908 spawned with command: ['/aifs4su/hansirui_1st/miniconda3/envs/by-align/bin/python', '-u', '-m', 'safe_rlhf.finetune', '--local_rank=7', '--train_datasets', 'inverse-json::/home/hansirui_1st/jiayi/resist/setting3/safety_data/training/safe/safe_10k.json', '--model_name_or_path', '/aifs4su/hansirui_1st/models/Qwen1.5-4B', '--max_length', '2048', '--trust_remote_code', 'True', '--epochs', '1', '--per_device_train_batch_size', '4', '--per_device_eval_batch_size', '4', '--gradient_accumulation_steps', '8', '--gradient_checkpointing', '--learning_rate', '1e-5', '--lr_warmup_ratio', '0', '--weight_decay', '0.0', '--lr_scheduler_type', 'constant', '--weight_decay', '0.0', '--seed', '42', '--output_dir', '/aifs4su/hansirui_1st/boyuan/resist/setting3-safety/Qwen1.5-4B/Qwen1.5-4B-s3-Q1-10k', '--log_type', 'wandb', '--log_run_name', 'qwen-4b-s3-Q1-10k', '--log_project', 'Inverse_Alignment', '--zero_stage', '3', '--offload', 'none', '--bf16', 'True', '--tf32', 'True', '--save_16bit']
20
+ [2025-05-28 20:28:48,104] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)
21
+ Warning: The cache directory for DeepSpeed Triton autotune, /home/hansirui_1st/.triton/autotune, appears to be on an NFS system. While this is generally acceptable, if you experience slowdowns or hanging when DeepSpeed exits, it is recommended to set the TRITON_CACHE_DIR environment variable to a non-NFS path.
22
+ [2025-05-28 20:28:48,912] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)
23
+ [2025-05-28 20:28:48,957] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)
24
+ [2025-05-28 20:28:48,960] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)
25
+ [2025-05-28 20:28:49,095] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)
26
+ [2025-05-28 20:28:49,108] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)
27
+ Warning: The cache directory for DeepSpeed Triton autotune, /home/hansirui_1st/.triton/autotune, appears to be on an NFS system. While this is generally acceptable, if you experience slowdowns or hanging when DeepSpeed exits, it is recommended to set the TRITON_CACHE_DIR environment variable to a non-NFS path.
28
+ [2025-05-28 20:28:49,150] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)
29
+ [2025-05-28 20:28:49,191] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)
30
+ Warning: The cache directory for DeepSpeed Triton autotune, /home/hansirui_1st/.triton/autotune, appears to be on an NFS system. While this is generally acceptable, if you experience slowdowns or hanging when DeepSpeed exits, it is recommended to set the TRITON_CACHE_DIR environment variable to a non-NFS path.
31
+ Warning: The cache directory for DeepSpeed Triton autotune, /home/hansirui_1st/.triton/autotune, appears to be on an NFS system. While this is generally acceptable, if you experience slowdowns or hanging when DeepSpeed exits, it is recommended to set the TRITON_CACHE_DIR environment variable to a non-NFS path.
32
+ Warning: The cache directory for DeepSpeed Triton autotune, /home/hansirui_1st/.triton/autotune, appears to be on an NFS system. While this is generally acceptable, if you experience slowdowns or hanging when DeepSpeed exits, it is recommended to set the TRITON_CACHE_DIR environment variable to a non-NFS path.
33
+ Warning: The cache directory for DeepSpeed Triton autotune, /home/hansirui_1st/.triton/autotune, appears to be on an NFS system. While this is generally acceptable, if you experience slowdowns or hanging when DeepSpeed exits, it is recommended to set the TRITON_CACHE_DIR environment variable to a non-NFS path.
34
+ Warning: The cache directory for DeepSpeed Triton autotune, /home/hansirui_1st/.triton/autotune, appears to be on an NFS system. While this is generally acceptable, if you experience slowdowns or hanging when DeepSpeed exits, it is recommended to set the TRITON_CACHE_DIR environment variable to a non-NFS path.
35
+ Warning: The cache directory for DeepSpeed Triton autotune, /home/hansirui_1st/.triton/autotune, appears to be on an NFS system. While this is generally acceptable, if you experience slowdowns or hanging when DeepSpeed exits, it is recommended to set the TRITON_CACHE_DIR environment variable to a non-NFS path.
36
+ [2025-05-28 20:28:54,138] [INFO] [comm.py:658:init_distributed] cdb=None
37
+ [2025-05-28 20:28:55,229] [INFO] [comm.py:658:init_distributed] cdb=None
38
+ [2025-05-28 20:28:55,229] [INFO] [comm.py:658:init_distributed] cdb=None
39
+ [2025-05-28 20:28:55,230] [INFO] [comm.py:658:init_distributed] cdb=None
40
+ [2025-05-28 20:28:55,314] [INFO] [comm.py:658:init_distributed] cdb=None
41
+ [2025-05-28 20:28:55,328] [INFO] [comm.py:658:init_distributed] cdb=None
42
+ [2025-05-28 20:28:55,329] [INFO] [comm.py:658:init_distributed] cdb=None
43
+ [2025-05-28 20:28:55,330] [INFO] [comm.py:689:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
44
+ [2025-05-28 20:28:55,394] [INFO] [comm.py:658:init_distributed] cdb=None
45
+ Set logger level to INFO.
46
+ [2025-05-28 20:29:03,830] [INFO] [config.py:734:__init__] Config mesh_device None world_size = 8
47
+ [2025-05-28 20:29:03,830] [INFO] [config.py:734:__init__] Config mesh_device None world_size = 8
48
+ [2025-05-28 20:29:03,831] [INFO] [config.py:734:__init__] Config mesh_device None world_size = 8
49
+ [2025-05-28 20:29:03,831] [INFO] [config.py:734:__init__] Config mesh_device None world_size = 8
50
+ [2025-05-28 20:29:03,831] [INFO] [config.py:734:__init__] Config mesh_device None world_size = 8
51
+ [2025-05-28 20:29:03,831] [INFO] [config.py:734:__init__] Config mesh_device None world_size = 8
52
+ [2025-05-28 20:29:03,832] [INFO] [config.py:734:__init__] Config mesh_device None world_size = 8
53
+ [2025-05-28 20:29:03,832] [INFO] [config.py:734:__init__] Config mesh_device None world_size = 8
54
+ [2025-05-28 20:29:03,999] [INFO] [partition_parameters.py:348:__exit__] finished initializing model - num_params = 483, num_elems = 3.95B
55
+ ninja: no work to do.
56
+ Time to load fused_adam op: 0.13100814819335938 seconds
57
+ [2025-05-28 20:29:08,108] [INFO] [config.py:734:__init__] Config mesh_device None world_size = 8
58
+ Time to load fused_adam op: 0.20406436920166016 seconds
59
+ Time to load fused_adam op: 0.20378613471984863 seconds
60
+ Time to load fused_adam op: 0.20444035530090332 seconds
61
+ Time to load fused_adam op: 0.2040095329284668 seconds
62
+ Time to load fused_adam op: 0.20418190956115723 seconds
63
+ Time to load fused_adam op: 0.20477581024169922 seconds
64
+ Time to load fused_adam op: 0.20405197143554688 seconds
65
+ [2025-05-28 20:29:08,181] [INFO] [config.py:734:__init__] Config mesh_device None world_size = 8
66
+ [2025-05-28 20:29:08,181] [INFO] [config.py:734:__init__] Config mesh_device None world_size = 8
67
+ [2025-05-28 20:29:08,181] [INFO] [config.py:734:__init__] Config mesh_device None world_size = 8
68
+ [2025-05-28 20:29:08,181] [INFO] [config.py:734:__init__] Config mesh_device None world_size = 8
69
+ [2025-05-28 20:29:08,181] [INFO] [config.py:734:__init__] Config mesh_device None world_size = 8
70
+ [2025-05-28 20:29:08,181] [INFO] [logging.py:128:log_dist] [Rank 0] DeepSpeed info: version=0.16.4, git-hash=unknown, git-branch=unknown
71
+ [2025-05-28 20:29:08,181] [INFO] [config.py:734:__init__] Config mesh_device None world_size = 8
72
+ [2025-05-28 20:29:08,181] [INFO] [comm.py:683:init_distributed] Distributed backend already initialized
73
+ [2025-05-28 20:29:08,181] [INFO] [config.py:734:__init__] Config mesh_device None world_size = 8
74
+ [2025-05-28 20:29:08,188] [INFO] [logging.py:128:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False
75
+ [2025-05-28 20:29:08,189] [INFO] [logging.py:128:log_dist] [Rank 0] Using client Optimizer as basic optimizer
76
+ [2025-05-28 20:29:08,189] [INFO] [logging.py:128:log_dist] [Rank 0] Removing param_group that has no 'params' in the basic Optimizer
77
+ [2025-05-28 20:29:08,202] [INFO] [logging.py:128:log_dist] [Rank 0] DeepSpeed Basic Optimizer = FusedAdam
78
+ [2025-05-28 20:29:08,202] [INFO] [utils.py:59:is_zero_supported_optimizer] Checking ZeRO support for optimizer=FusedAdam type=<class 'deepspeed.ops.adam.fused_adam.FusedAdam'>
79
+ [2025-05-28 20:29:08,202] [INFO] [logging.py:128:log_dist] [Rank 0] Creating fp16 ZeRO stage 3 optimizer, MiCS is enabled False, Hierarchical params gather False
80
+ [2025-05-28 20:29:08,202] [INFO] [logging.py:128:log_dist] [Rank 0] Creating torch.bfloat16 ZeRO stage 3 optimizer
81
+ [2025-05-28 20:29:08,362] [INFO] [utils.py:781:see_memory_usage] Stage 3 initialize beginning
82
+ [2025-05-28 20:29:08,363] [INFO] [utils.py:782:see_memory_usage] MA 2.21 GB Max_MA 3.84 GB CA 4.63 GB Max_CA 5 GB
83
+ [2025-05-28 20:29:08,363] [INFO] [utils.py:789:see_memory_usage] CPU Virtual Memory: used = 484.97 GB, percent = 24.1%
84
+ [2025-05-28 20:29:08,364] [INFO] [stage3.py:170:__init__] Reduce bucket size 500000000
85
+ [2025-05-28 20:29:08,364] [INFO] [stage3.py:171:__init__] Prefetch bucket size 30000000
86
+ [2025-05-28 20:29:08,459] [INFO] [utils.py:781:see_memory_usage] DeepSpeedZeRoOffload initialize [begin]
87
+ [2025-05-28 20:29:08,459] [INFO] [utils.py:782:see_memory_usage] MA 2.21 GB Max_MA 2.21 GB CA 4.63 GB Max_CA 5 GB
88
+ [2025-05-28 20:29:08,459] [INFO] [utils.py:789:see_memory_usage] CPU Virtual Memory: used = 484.97 GB, percent = 24.1%
89
+ Parameter Offload: Total persistent parameters: 514560 in 201 params
90
+ [2025-05-28 20:29:08,570] [INFO] [utils.py:781:see_memory_usage] DeepSpeedZeRoOffload initialize [end]
91
+ [2025-05-28 20:29:08,570] [INFO] [utils.py:782:see_memory_usage] MA 0.94 GB Max_MA 2.3 GB CA 4.63 GB Max_CA 5 GB
92
+ [2025-05-28 20:29:08,571] [INFO] [utils.py:789:see_memory_usage] CPU Virtual Memory: used = 484.98 GB, percent = 24.1%
93
+ [2025-05-28 20:29:08,667] [INFO] [utils.py:781:see_memory_usage] Before creating fp16 partitions
94
+ [2025-05-28 20:29:08,668] [INFO] [utils.py:782:see_memory_usage] MA 0.94 GB Max_MA 0.94 GB CA 4.63 GB Max_CA 5 GB
95
+ [2025-05-28 20:29:08,668] [INFO] [utils.py:789:see_memory_usage] CPU Virtual Memory: used = 484.98 GB, percent = 24.1%
96
+ [2025-05-28 20:29:09,693] [INFO] [utils.py:781:see_memory_usage] After creating fp16 partitions: 2
97
+ [2025-05-28 20:29:09,693] [INFO] [utils.py:782:see_memory_usage] MA 0.92 GB Max_MA 0.94 GB CA 0.92 GB Max_CA 5 GB
98
+ [2025-05-28 20:29:09,693] [INFO] [utils.py:789:see_memory_usage] CPU Virtual Memory: used = 491.42 GB, percent = 24.4%
99
+ [2025-05-28 20:29:09,791] [INFO] [utils.py:781:see_memory_usage] Before creating fp32 partitions
100
+ [2025-05-28 20:29:09,791] [INFO] [utils.py:782:see_memory_usage] MA 0.92 GB Max_MA 0.92 GB CA 0.92 GB Max_CA 1 GB
101
+ [2025-05-28 20:29:09,792] [INFO] [utils.py:789:see_memory_usage] CPU Virtual Memory: used = 491.42 GB, percent = 24.4%
102
+ [2025-05-28 20:29:09,923] [INFO] [utils.py:781:see_memory_usage] After creating fp32 partitions
103
+ [2025-05-28 20:29:09,924] [INFO] [utils.py:782:see_memory_usage] MA 2.76 GB Max_MA 3.68 GB CA 3.68 GB Max_CA 4 GB
104
+ [2025-05-28 20:29:09,924] [INFO] [utils.py:789:see_memory_usage] CPU Virtual Memory: used = 491.42 GB, percent = 24.4%
105
+ [2025-05-28 20:29:10,029] [INFO] [utils.py:781:see_memory_usage] Before initializing optimizer states
106
+ [2025-05-28 20:29:10,029] [INFO] [utils.py:782:see_memory_usage] MA 2.76 GB Max_MA 2.76 GB CA 3.68 GB Max_CA 4 GB
107
+ [2025-05-28 20:29:10,029] [INFO] [utils.py:789:see_memory_usage] CPU Virtual Memory: used = 491.43 GB, percent = 24.4%
108
+ [2025-05-28 20:29:10,147] [INFO] [utils.py:781:see_memory_usage] After initializing optimizer states
109
+ [2025-05-28 20:29:10,147] [INFO] [utils.py:782:see_memory_usage] MA 2.76 GB Max_MA 4.6 GB CA 5.52 GB Max_CA 6 GB
110
+ [2025-05-28 20:29:10,147] [INFO] [utils.py:789:see_memory_usage] CPU Virtual Memory: used = 491.42 GB, percent = 24.4%
111
+ [2025-05-28 20:29:10,148] [INFO] [stage3.py:534:_setup_for_real_optimizer] optimizer state initialized
112
+ [2025-05-28 20:29:10,376] [INFO] [utils.py:781:see_memory_usage] After initializing ZeRO optimizer
113
+ [2025-05-28 20:29:10,376] [INFO] [utils.py:782:see_memory_usage] MA 4.61 GB Max_MA 6.06 GB CA 6.25 GB Max_CA 6 GB
114
+ [2025-05-28 20:29:10,377] [INFO] [utils.py:789:see_memory_usage] CPU Virtual Memory: used = 491.42 GB, percent = 24.4%
115
+ [2025-05-28 20:29:10,377] [INFO] [logging.py:128:log_dist] [Rank 0] DeepSpeed Final Optimizer = DeepSpeedZeroOptimizer_Stage3
116
+ [2025-05-28 20:29:10,377] [INFO] [logging.py:128:log_dist] [Rank 0] DeepSpeed using client LR scheduler
117
+ [2025-05-28 20:29:10,377] [INFO] [logging.py:128:log_dist] [Rank 0] DeepSpeed LR Scheduler = <torch.optim.lr_scheduler.LambdaLR object at 0x15510436c4d0>
118
+ [2025-05-28 20:29:10,377] [INFO] [logging.py:128:log_dist] [Rank 0] step=0, skipped=0, lr=[1e-05, 1e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
119
+ [2025-05-28 20:29:10,378] [INFO] [config.py:1001:print] DeepSpeedEngine configuration:
120
+ [2025-05-28 20:29:10,378] [INFO] [config.py:1005:print] activation_checkpointing_config {
121
+ "partition_activations": false,
122
+ "contiguous_memory_optimization": false,
123
+ "cpu_checkpointing": false,
124
+ "number_checkpoints": null,
125
+ "synchronize_checkpoint_boundary": false,
126
+ "profile": false
127
+ }
128
+ [2025-05-28 20:29:10,378] [INFO] [config.py:1005:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'intra_op_parallelism': 1, 'single_submit': False, 'overlap_events': True, 'use_gds': False}
129
+ [2025-05-28 20:29:10,378] [INFO] [config.py:1005:print] amp_enabled .................. False
130
+ [2025-05-28 20:29:10,378] [INFO] [config.py:1005:print] amp_params ................... False
131
+ [2025-05-28 20:29:10,378] [INFO] [config.py:1005:print] autotuning_config ............ {
132
+ "enabled": false,
133
+ "start_step": null,
134
+ "end_step": null,
135
+ "metric_path": null,
136
+ "arg_mappings": null,
137
+ "metric": "throughput",
138
+ "model_info": null,
139
+ "results_dir": "autotuning_results",
140
+ "exps_dir": "autotuning_exps",
141
+ "overwrite": true,
142
+ "fast": true,
143
+ "start_profile_step": 3,
144
+ "end_profile_step": 5,
145
+ "tuner_type": "gridsearch",
146
+ "tuner_early_stopping": 5,
147
+ "tuner_num_trials": 50,
148
+ "model_info_path": null,
149
+ "mp_size": 1,
150
+ "max_train_batch_size": null,
151
+ "min_train_batch_size": 1,
152
+ "max_train_micro_batch_size_per_gpu": 1.024000e+03,
153
+ "min_train_micro_batch_size_per_gpu": 1,
154
+ "num_tuning_micro_batch_sizes": 3
155
+ }
156
+ [2025-05-28 20:29:10,378] [INFO] [config.py:1005:print] bfloat16_enabled ............. True
157
+ [2025-05-28 20:29:10,378] [INFO] [config.py:1005:print] bfloat16_immediate_grad_update False
158
+ [2025-05-28 20:29:10,378] [INFO] [config.py:1005:print] checkpoint_parallel_write_pipeline False
159
+ [2025-05-28 20:29:10,378] [INFO] [config.py:1005:print] checkpoint_tag_validation_enabled True
160
+ [2025-05-28 20:29:10,378] [INFO] [config.py:1005:print] checkpoint_tag_validation_fail False
161
+ [2025-05-28 20:29:10,378] [INFO] [config.py:1005:print] comms_config ................. <deepspeed.comm.config.DeepSpeedCommsConfig object at 0x15510436e350>
162
+ [2025-05-28 20:29:10,378] [INFO] [config.py:1005:print] communication_data_type ...... None
163
+ [2025-05-28 20:29:10,378] [INFO] [config.py:1005:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}}
164
+ [2025-05-28 20:29:10,378] [INFO] [config.py:1005:print] curriculum_enabled_legacy .... False
165
+ [2025-05-28 20:29:10,378] [INFO] [config.py:1005:print] curriculum_params_legacy ..... False
166
+ [2025-05-28 20:29:10,378] [INFO] [config.py:1005:print] data_efficiency_config ....... {'enabled': False, 'seed': 1234, 'data_sampling': {'enabled': False, 'num_epochs': 1000, 'num_workers': 0, 'curriculum_learning': {'enabled': False}}, 'data_routing': {'enabled': False, 'random_ltd': {'enabled': False, 'layer_token_lr_schedule': {'enabled': False}}}}
167
+ [2025-05-28 20:29:10,378] [INFO] [config.py:1005:print] data_efficiency_enabled ...... False
168
+ [2025-05-28 20:29:10,378] [INFO] [config.py:1005:print] dataloader_drop_last ......... False
169
+ [2025-05-28 20:29:10,378] [INFO] [config.py:1005:print] disable_allgather ............ False
170
+ [2025-05-28 20:29:10,378] [INFO] [config.py:1005:print] dump_state ................... False
171
+ [2025-05-28 20:29:10,378] [INFO] [config.py:1005:print] dynamic_loss_scale_args ...... None
172
+ [2025-05-28 20:29:10,379] [INFO] [config.py:1005:print] eigenvalue_enabled ........... False
173
+ [2025-05-28 20:29:10,379] [INFO] [config.py:1005:print] eigenvalue_gas_boundary_resolution 1
174
+ [2025-05-28 20:29:10,379] [INFO] [config.py:1005:print] eigenvalue_layer_name ........ bert.encoder.layer
175
+ [2025-05-28 20:29:10,379] [INFO] [config.py:1005:print] eigenvalue_layer_num ......... 0
176
+ [2025-05-28 20:29:10,379] [INFO] [config.py:1005:print] eigenvalue_max_iter .......... 100
177
+ [2025-05-28 20:29:10,379] [INFO] [config.py:1005:print] eigenvalue_stability ......... 1e-06
178
+ [2025-05-28 20:29:10,379] [INFO] [config.py:1005:print] eigenvalue_tol ............... 0.01
179
+ [2025-05-28 20:29:10,379] [INFO] [config.py:1005:print] eigenvalue_verbose ........... False
180
+ [2025-05-28 20:29:10,379] [INFO] [config.py:1005:print] elasticity_enabled ........... False
181
+ [2025-05-28 20:29:10,379] [INFO] [config.py:1005:print] flops_profiler_config ........ {
182
+ "enabled": false,
183
+ "recompute_fwd_factor": 0.0,
184
+ "profile_step": 1,
185
+ "module_depth": -1,
186
+ "top_modules": 1,
187
+ "detailed": true,
188
+ "output_file": null
189
+ }
190
+ [2025-05-28 20:29:10,379] [INFO] [config.py:1005:print] fp16_auto_cast ............... None
191
+ [2025-05-28 20:29:10,379] [INFO] [config.py:1005:print] fp16_enabled ................. False
192
+ [2025-05-28 20:29:10,379] [INFO] [config.py:1005:print] fp16_master_weights_and_gradients False
193
+ [2025-05-28 20:29:10,379] [INFO] [config.py:1005:print] global_rank .................. 0
194
+ [2025-05-28 20:29:10,379] [INFO] [config.py:1005:print] grad_accum_dtype ............. None
195
+ [2025-05-28 20:29:10,379] [INFO] [config.py:1005:print] gradient_accumulation_steps .. 8
196
+ [2025-05-28 20:29:10,379] [INFO] [config.py:1005:print] gradient_clipping ............ 1.0
197
+ [2025-05-28 20:29:10,379] [INFO] [config.py:1005:print] gradient_predivide_factor .... 1.0
198
+ [2025-05-28 20:29:10,379] [INFO] [config.py:1005:print] graph_harvesting ............. False
199
+ [2025-05-28 20:29:10,379] [INFO] [config.py:1005:print] hybrid_engine ................ enabled=False max_out_tokens=512 inference_tp_size=1 release_inference_cache=False pin_parameters=True tp_gather_partition_size=8
200
+ [2025-05-28 20:29:10,379] [INFO] [config.py:1005:print] initial_dynamic_scale ........ 1
201
+ [2025-05-28 20:29:10,379] [INFO] [config.py:1005:print] load_universal_checkpoint .... False
202
+ [2025-05-28 20:29:10,379] [INFO] [config.py:1005:print] loss_scale ................... 1.0
203
+ [2025-05-28 20:29:10,379] [INFO] [config.py:1005:print] memory_breakdown ............. False
204
+ [2025-05-28 20:29:10,379] [INFO] [config.py:1005:print] mics_hierarchial_params_gather False
205
+ [2025-05-28 20:29:10,379] [INFO] [config.py:1005:print] mics_shard_size .............. -1
206
+ [2025-05-28 20:29:10,379] [INFO] [config.py:1005:print] monitor_config ............... tensorboard=TensorBoardConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') comet=CometConfig(enabled=False, samples_log_interval=100, project=None, workspace=None, api_key=None, experiment_name=None, experiment_key=None, online=None, mode=None) wandb=WandbConfig(enabled=False, group=None, team=None, project='deepspeed') csv_monitor=CSVConfig(enabled=False, output_path='', job_name='DeepSpeedJobName')
207
+ [2025-05-28 20:29:10,379] [INFO] [config.py:1005:print] nebula_config ................ {
208
+ "enabled": false,
209
+ "persistent_storage_path": null,
210
+ "persistent_time_interval": 100,
211
+ "num_of_version_in_retention": 2,
212
+ "enable_nebula_load": true,
213
+ "load_path": null
214
+ }
215
+ [2025-05-28 20:29:10,379] [INFO] [config.py:1005:print] optimizer_legacy_fusion ...... False
216
+ [2025-05-28 20:29:10,379] [INFO] [config.py:1005:print] optimizer_name ............... None
217
+ [2025-05-28 20:29:10,379] [INFO] [config.py:1005:print] optimizer_params ............. None
218
+ [2025-05-28 20:29:10,379] [INFO] [config.py:1005:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0, 'pipe_partitioned': True, 'grad_partitioned': True}
219
+ [2025-05-28 20:29:10,379] [INFO] [config.py:1005:print] pld_enabled .................. False
220
+ [2025-05-28 20:29:10,379] [INFO] [config.py:1005:print] pld_params ................... False
221
+ [2025-05-28 20:29:10,379] [INFO] [config.py:1005:print] prescale_gradients ........... False
222
+ [2025-05-28 20:29:10,379] [INFO] [config.py:1005:print] scheduler_name ............... None
223
+ [2025-05-28 20:29:10,379] [INFO] [config.py:1005:print] scheduler_params ............. None
224
+ [2025-05-28 20:29:10,379] [INFO] [config.py:1005:print] seq_parallel_communication_data_type torch.float32
225
+ [2025-05-28 20:29:10,380] [INFO] [config.py:1005:print] sparse_attention ............. None
226
+ [2025-05-28 20:29:10,380] [INFO] [config.py:1005:print] sparse_gradients_enabled ..... False
227
+ [2025-05-28 20:29:10,380] [INFO] [config.py:1005:print] steps_per_print .............. 10
228
+ [2025-05-28 20:29:10,380] [INFO] [config.py:1005:print] tensor_parallel_config ....... dtype=torch.float16 autotp_size=0 tensor_parallel=TPConfig(tp_size=1, tp_grain_size=1, mpu=None, tp_group=None) injection_policy_tuple=None keep_module_on_host=False replace_with_kernel_inject=False
229
+ [2025-05-28 20:29:10,380] [INFO] [config.py:1005:print] timers_config ................ enabled=True synchronized=True
230
+ [2025-05-28 20:29:10,380] [INFO] [config.py:1005:print] train_batch_size ............. 256
231
+ [2025-05-28 20:29:10,380] [INFO] [config.py:1005:print] train_micro_batch_size_per_gpu 4
232
+ [2025-05-28 20:29:10,380] [INFO] [config.py:1005:print] use_data_before_expert_parallel_ False
233
+ [2025-05-28 20:29:10,380] [INFO] [config.py:1005:print] use_node_local_storage ....... False
234
+ [2025-05-28 20:29:10,380] [INFO] [config.py:1005:print] wall_clock_breakdown ......... False
235
+ [2025-05-28 20:29:10,380] [INFO] [config.py:1005:print] weight_quantization_config ... None
236
+ [2025-05-28 20:29:10,380] [INFO] [config.py:1005:print] world_size ................... 8
237
+ [2025-05-28 20:29:10,380] [INFO] [config.py:1005:print] zero_allow_untested_optimizer False
238
+ [2025-05-28 20:29:10,380] [INFO] [config.py:1005:print] zero_config .................. stage=3 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=500000000 use_multi_rank_bucket_allreduce=True allgather_partitions=True allgather_bucket_size=500000000 overlap_comm=True load_from_fp32_weights=True elastic_checkpoint=False offload_param=DeepSpeedZeroOffloadParamConfig(device='none', nvme_path=None, buffer_count=5, buffer_size=100000000, max_in_cpu=1000000000, pin_memory=False) offload_optimizer=DeepSpeedZeroOffloadOptimizerConfig(device='none', nvme_path=None, buffer_count=4, pin_memory=False, pipeline_read=False, pipeline_write=False, fast_init=False, ratio=1.0) sub_group_size=1000000000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=30000000 param_persistence_threshold=10000 model_persistence_threshold=9223372036854775807 max_live_parameters=30000000 max_reuse_distance=1000000000 gather_16bit_weights_on_model_save=True module_granularity_threshold=0 use_all_reduce_for_fetch_params=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False zero_hpz_partition_size=1 zero_quantized_weights=False zero_quantized_nontrainable_weights=False zero_quantized_gradients=False zeropp_loco_param=None mics_shard_size=-1 mics_hierarchical_params_gather=False memory_efficient_linear=False pipeline_loading_checkpoint=False override_module_apply=True log_trace_cache_warnings=False
239
+ [2025-05-28 20:29:10,380] [INFO] [config.py:1005:print] zero_enabled ................. True
240
+ [2025-05-28 20:29:10,380] [INFO] [config.py:1005:print] zero_force_ds_cpu_optimizer .. True
241
+ [2025-05-28 20:29:10,380] [INFO] [config.py:1005:print] zero_optimization_stage ...... 3
242
+ [2025-05-28 20:29:10,380] [INFO] [config.py:991:print_user_config] json = {
243
+ "train_batch_size": 256,
244
+ "train_micro_batch_size_per_gpu": 4,
245
+ "gradient_accumulation_steps": 8,
246
+ "steps_per_print": 10,
247
+ "zero_optimization": {
248
+ "stage": 3,
249
+ "offload_param": {
250
+ "device": "none"
251
+ },
252
+ "offload_optimizer": {
253
+ "device": "none"
254
+ },
255
+ "param_persistence_threshold": 1.000000e+04,
256
+ "max_live_parameters": 3.000000e+07,
257
+ "prefetch_bucket_size": 3.000000e+07,
258
+ "memory_efficient_linear": false,
259
+ "gather_16bit_weights_on_model_save": true
260
+ },
261
+ "gradient_clipping": 1.0,
262
+ "prescale_gradients": false,
263
+ "wall_clock_breakdown": false,
264
+ "hybrid_engine": {
265
+ "enabled": false,
266
+ "max_out_tokens": 512,
267
+ "inference_tp_size": 1,
268
+ "release_inference_cache": false,
269
+ "pin_parameters": true,
270
+ "tp_gather_partition_size": 8
271
+ },
272
+ "bf16": {
273
+ "enabled": true
274
+ }
275
+ }
276
+ ***** Running training *****
277
+ [2025-05-28 20:30:04,563] [INFO] [logging.py:128:log_dist] [Rank 0] step=10, skipped=0, lr=[1e-05, 1e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
278
+ [2025-05-28 20:30:04,564] [INFO] [timer.py:264:stop] epoch=0/micro_step=80/global_step=10, RunningAvgSamplesPerSec=58.59260388728531, CurrSamplesPerSec=54.30743087845937, MemAllocated=8.35GB, MaxMemAllocated=11.42GB
279
+ [2025-05-28 20:30:47,645] [INFO] [logging.py:128:log_dist] [Rank 0] step=20, skipped=0, lr=[1e-05, 1e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
280
+ [2025-05-28 20:30:47,646] [INFO] [timer.py:264:stop] epoch=0/micro_step=160/global_step=20, RunningAvgSamplesPerSec=60.00092597244679, CurrSamplesPerSec=58.60408148477281, MemAllocated=8.35GB, MaxMemAllocated=11.42GB
281
+ [2025-05-28 20:31:29,423] [INFO] [logging.py:128:log_dist] [Rank 0] step=30, skipped=0, lr=[1e-05, 1e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
282
+ [2025-05-28 20:31:29,423] [INFO] [timer.py:264:stop] epoch=0/micro_step=240/global_step=30, RunningAvgSamplesPerSec=61.1655339875698, CurrSamplesPerSec=65.07105336621166, MemAllocated=8.35GB, MaxMemAllocated=11.42GB
283
+ Saving model to "/aifs4su/hansirui_1st/boyuan/resist/setting3-safety/Qwen1.5-4B/Qwen1.5-4B-s3-Q1-10k" ...
284
+ Saving 16-bit model...
285
+ [2025-05-28 20:32:11,233] [INFO] [logging.py:128:log_dist] [Rank 0] [Torch] Checkpoint global_step39 is about to be saved!
286
+ [2025-05-28 20:32:11,233] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step39 is ready now!
287
+ [2025-05-28 20:32:11,233] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step39 is ready now!
288
+ [2025-05-28 20:32:11,233] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step39 is ready now!
289
+ [2025-05-28 20:32:11,233] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step39 is ready now!
290
+ [2025-05-28 20:32:11,233] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step39 is ready now!
291
+ [2025-05-28 20:32:11,233] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step39 is ready now!
292
+ [2025-05-28 20:32:11,233] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step39 is ready now!
293
+ [2025-05-28 20:32:11,234] [INFO] [engine.py:3804:save_16bit_model] Saving model weights to /aifs4su/hansirui_1st/boyuan/resist/setting3-safety/Qwen1.5-4B/Qwen1.5-4B-s3-Q1-10k/pytorch_model.bin, tag: global_step39
294
+ [2025-05-28 20:32:11,234] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /aifs4su/hansirui_1st/boyuan/resist/setting3-safety/Qwen1.5-4B/Qwen1.5-4B-s3-Q1-10k/pytorch_model.bin...
295
+ [2025-05-28 20:32:14,309] [INFO] [launch.py:351:main] Process 660905 exits successfully.
296
+ [2025-05-28 20:32:14,310] [INFO] [launch.py:351:main] Process 660903 exits successfully.
297
+ [2025-05-28 20:32:14,310] [INFO] [launch.py:351:main] Process 660907 exits successfully.
298
+ [2025-05-28 20:32:14,310] [INFO] [launch.py:351:main] Process 660908 exits successfully.
299
+ [2025-05-28 20:32:14,310] [INFO] [launch.py:351:main] Process 660906 exits successfully.
300
+ [2025-05-28 20:32:14,310] [INFO] [launch.py:351:main] Process 660904 exits successfully.
301
+ [2025-05-28 20:32:15,310] [INFO] [launch.py:351:main] Process 660902 exits successfully.
302
+ [2025-05-28 20:32:18,522] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /aifs4su/hansirui_1st/boyuan/resist/setting3-safety/Qwen1.5-4B/Qwen1.5-4B-s3-Q1-10k/pytorch_model.bin.
303
+ [2025-05-28 20:32:18,523] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step39 is ready now!
304
+ Model saved!
305
+ wandb:
306
+ wandb: 🚀 View run qwen-4b-s3-Q1-10k at: https://wandb.ai/xtom/Inverse_Alignment/runs/skis4g6x
307
+ wandb: Find logs at: ../../../../../../aifs4su/hansirui_1st/boyuan/resist/setting3-safety/Qwen1.5-4B/Qwen1.5-4B-s3-Q1-10k/wandb/run-20250528_202910-skis4g6x/logs
308
+ [2025-05-28 20:32:22,311] [INFO] [launch.py:351:main] Process 660901 exits successfully.
tokenizer.json ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f8e724dc23482c611d87c990a58b98f85296a1df777744ef85a3bd9fc0d00aed
3
+ size 11418727
tokenizer_config.json ADDED
@@ -0,0 +1,61 @@
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "128244": {
5
+ "content": "<unk>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "128245": {
13
+ "content": "<s>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "151643": {
21
+ "content": "<|endoftext|>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "151644": {
29
+ "content": "<|im_start|>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "151645": {
37
+ "content": "<|im_end|>",
38
+ "lstrip": false,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ }
44
+ },
45
+ "additional_special_tokens": [
46
+ "<|im_start|>",
47
+ "<|im_end|>"
48
+ ],
49
+ "bos_token": "<s>",
50
+ "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
51
+ "clean_up_tokenization_spaces": false,
52
+ "eos_token": "<|endoftext|>",
53
+ "errors": "replace",
54
+ "extra_special_tokens": {},
55
+ "model_max_length": 2048,
56
+ "pad_token": "<|endoftext|>",
57
+ "padding_side": "right",
58
+ "split_special_tokens": false,
59
+ "tokenizer_class": "Qwen2Tokenizer",
60
+ "unk_token": "<unk>"
61
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff
wandb/debug-internal.log ADDED
@@ -0,0 +1,16 @@
1
+ {"time":"2025-05-28T20:29:11.018472061+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.8","symlink path":"/aifs4su/hansirui_1st/boyuan/resist/setting3-safety/Qwen1.5-4B/Qwen1.5-4B-s3-Q1-10k/wandb/run-20250528_202910-skis4g6x/logs/debug-core.log"}
2
+ {"time":"2025-05-28T20:29:11.233125615+08:00","level":"INFO","msg":"created new stream","id":"skis4g6x"}
3
+ {"time":"2025-05-28T20:29:11.233210455+08:00","level":"INFO","msg":"stream: started","id":"skis4g6x"}
4
+ {"time":"2025-05-28T20:29:11.233234745+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"skis4g6x"}
5
+ {"time":"2025-05-28T20:29:11.233240091+08:00","level":"INFO","msg":"handler: started","stream_id":"skis4g6x"}
6
+ {"time":"2025-05-28T20:29:11.233251471+08:00","level":"INFO","msg":"sender: started","stream_id":"skis4g6x"}
7
+ {"time":"2025-05-28T20:29:11.490072877+08:00","level":"INFO","msg":"Starting system monitor"}
8
+ {"time":"2025-05-28T20:32:18.578212079+08:00","level":"INFO","msg":"Stopping system monitor"}
9
+ {"time":"2025-05-28T20:32:18.578786481+08:00","level":"INFO","msg":"Stopped system monitor"}
10
+ {"time":"2025-05-28T20:32:19.92936316+08:00","level":"INFO","msg":"stream: closing","id":"skis4g6x"}
11
+ {"time":"2025-05-28T20:32:19.929423159+08:00","level":"WARN","msg":"sender: received Exit record more than once, ignoring"}
12
+ {"time":"2025-05-28T20:32:19.979380287+08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
13
+ {"time":"2025-05-28T20:32:20.209198879+08:00","level":"INFO","msg":"handler: closed","stream_id":"skis4g6x"}
14
+ {"time":"2025-05-28T20:32:20.209242482+08:00","level":"INFO","msg":"writer: Close: closed","stream_id":"skis4g6x"}
15
+ {"time":"2025-05-28T20:32:20.209300836+08:00","level":"INFO","msg":"sender: closed","stream_id":"skis4g6x"}
16
+ {"time":"2025-05-28T20:32:20.209596534+08:00","level":"INFO","msg":"stream: closed","id":"skis4g6x"}
wandb/debug.log ADDED
@@ -0,0 +1,47 @@
1
+ 2025-05-28 20:29:11,009 INFO MainThread:660901 [wandb_setup.py:_flush():67] Current SDK version is 0.19.8
2
+ 2025-05-28 20:29:11,010 INFO MainThread:660901 [wandb_setup.py:_flush():67] Configure stats pid to 660901
3
+ 2025-05-28 20:29:11,010 INFO MainThread:660901 [wandb_setup.py:_flush():67] Loading settings from /home/hansirui_1st/.config/wandb/settings
4
+ 2025-05-28 20:29:11,010 INFO MainThread:660901 [wandb_setup.py:_flush():67] Loading settings from /home/hansirui_1st/jiayi/resist/setting3/scripts/wandb/settings
5
+ 2025-05-28 20:29:11,010 INFO MainThread:660901 [wandb_setup.py:_flush():67] Loading settings from environment variables
6
+ 2025-05-28 20:29:11,010 INFO MainThread:660901 [wandb_init.py:setup_run_log_directory():647] Logging user logs to /aifs4su/hansirui_1st/boyuan/resist/setting3-safety/Qwen1.5-4B/Qwen1.5-4B-s3-Q1-10k/wandb/run-20250528_202910-skis4g6x/logs/debug.log
7
+ 2025-05-28 20:29:11,010 INFO MainThread:660901 [wandb_init.py:setup_run_log_directory():648] Logging internal logs to /aifs4su/hansirui_1st/boyuan/resist/setting3-safety/Qwen1.5-4B/Qwen1.5-4B-s3-Q1-10k/wandb/run-20250528_202910-skis4g6x/logs/debug-internal.log
8
+ 2025-05-28 20:29:11,010 INFO MainThread:660901 [wandb_init.py:init():761] calling init triggers
9
+ 2025-05-28 20:29:11,010 INFO MainThread:660901 [wandb_init.py:init():766] wandb.init called with sweep_config: {}
10
+ config: {'model_name_or_path': '/aifs4su/hansirui_1st/models/Qwen1.5-4B', 'max_length': 2048, 'trust_remote_code': True, 'train_datasets': [('inverse-json', {'proportion': 1.0, 'path': '/home/hansirui_1st/jiayi/resist/setting3/safety_data/training/safe/safe_10k.json'})], 'eval_datasets': None, 'epochs': 1, 'per_device_train_batch_size': 4, 'per_device_eval_batch_size': 4, 'gradient_accumulation_steps': 8, 'gradient_checkpointing': True, 'lr': 1e-05, 'lr_scheduler_type': <SchedulerType.CONSTANT: 'constant'>, 'lr_warmup_ratio': 0.0, 'weight_decay': 0.0, 'seed': 42, 'fp16': False, 'bf16': True, 'tf32': True, 'eval_strategy': 'epoch', 'eval_interval': 1000000, 'need_eval': False, 'eval_split_ratio': None, 'output_dir': '/aifs4su/hansirui_1st/boyuan/resist/setting3-safety/Qwen1.5-4B/Qwen1.5-4B-s3-Q1-10k', 'log_type': 'wandb', 'log_dir': '/aifs4su/hansirui_1st/boyuan/resist/setting3-safety/Qwen1.5-4B/Qwen1.5-4B-s3-Q1-10k', 'log_project': 'Inverse_Alignment', 'log_run_name': 'qwen-4b-s3-Q1-10k', 'save_16bit': True, 'save_interval': 1000000, 'local_rank': 0, 'zero_stage': 3, 'offload': 'none', 'deepspeed': False, 'deepspeed_config': None, 'deepscale': False, 'deepscale_config': None, 'global_rank': 0, 'device': device(type='cuda', index=0), 'num_update_steps_per_epoch': 40, 'total_training_steps': 40, '_wandb': {}}
11
+ 2025-05-28 20:29:11,010 INFO MainThread:660901 [wandb_init.py:init():784] starting backend
12
+ 2025-05-28 20:29:11,010 INFO MainThread:660901 [wandb_init.py:init():788] sending inform_init request
13
+ 2025-05-28 20:29:11,014 INFO MainThread:660901 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
14
+ 2025-05-28 20:29:11,014 INFO MainThread:660901 [wandb_init.py:init():798] backend started and connected
15
+ 2025-05-28 20:29:11,015 INFO MainThread:660901 [wandb_init.py:init():891] updated telemetry
16
+ 2025-05-28 20:29:11,016 INFO MainThread:660901 [wandb_init.py:init():915] communicating run to backend with 90.0 second timeout
17
+ 2025-05-28 20:29:11,484 INFO MainThread:660901 [wandb_init.py:init():990] starting run threads in backend
18
+ 2025-05-28 20:29:11,697 INFO MainThread:660901 [wandb_run.py:_console_start():2375] atexit reg
19
+ 2025-05-28 20:29:11,697 INFO MainThread:660901 [wandb_run.py:_redirect():2227] redirect: wrap_raw
20
+ 2025-05-28 20:29:11,697 INFO MainThread:660901 [wandb_run.py:_redirect():2292] Wrapping output streams.
21
+ 2025-05-28 20:29:11,697 INFO MainThread:660901 [wandb_run.py:_redirect():2315] Redirects installed.
22
+ 2025-05-28 20:29:11,699 INFO MainThread:660901 [wandb_init.py:init():1032] run started, returning control to user process
23
+ 2025-05-28 20:32:18,575 INFO MainThread:660901 [wandb_run.py:_finish():2112] finishing run xtom/Inverse_Alignment/skis4g6x
24
+ 2025-05-28 20:32:18,576 INFO MainThread:660901 [wandb_run.py:_atexit_cleanup():2340] got exitcode: 0
25
+ 2025-05-28 20:32:18,577 INFO MainThread:660901 [wandb_run.py:_restore():2322] restore
26
+ 2025-05-28 20:32:18,577 INFO MainThread:660901 [wandb_run.py:_restore():2328] restore done
27
+ 2025-05-28 20:32:19,578 INFO MainThread:660901 [wandb_run.py:_restore():2322] restore
28
+ 2025-05-28 20:32:19,578 INFO MainThread:660901 [wandb_run.py:_restore():2328] restore done
29
+ 2025-05-28 20:32:19,578 ERROR MainThread:660901 [wandb_run.py:_atexit_cleanup():2361] Problem finishing run
30
+ Traceback (most recent call last):
31
+ File "/aifs4su/hansirui_1st/miniconda3/envs/by-align/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 2352, in _atexit_cleanup
32
+ self._on_finish()
33
+ File "/aifs4su/hansirui_1st/miniconda3/envs/by-align/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 2609, in _on_finish
34
+ wait_with_progress(
35
+ File "/aifs4su/hansirui_1st/miniconda3/envs/by-align/lib/python3.11/site-packages/wandb/sdk/mailbox/wait_with_progress.py", line 24, in wait_with_progress
36
+ return wait_all_with_progress(
37
+ ^^^^^^^^^^^^^^^^^^^^^^^
38
+ File "/aifs4su/hansirui_1st/miniconda3/envs/by-align/lib/python3.11/site-packages/wandb/sdk/mailbox/wait_with_progress.py", line 87, in wait_all_with_progress
39
+ return asyncio_compat.run(progress_loop_with_timeout)
40
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
41
+ File "/aifs4su/hansirui_1st/miniconda3/envs/by-align/lib/python3.11/site-packages/wandb/sdk/lib/asyncio_compat.py", line 27, in run
42
+ future = executor.submit(runner.run, fn)
43
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
44
+ File "/aifs4su/hansirui_1st/miniconda3/envs/by-align/lib/python3.11/concurrent/futures/thread.py", line 169, in submit
45
+ raise RuntimeError('cannot schedule new futures after '
46
+ RuntimeError: cannot schedule new futures after interpreter shutdown
47
+ 2025-05-28 20:32:19,928 INFO MsgRouterThr:660901 [mailbox.py:close():129] Closing mailbox, abandoning 2 handles.
wandb/run-20250528_202910-skis4g6x/files/config.yaml ADDED
@@ -0,0 +1,125 @@
1
+ _wandb:
2
+ value:
3
+ cli_version: 0.19.8
4
+ m: []
5
+ python_version: 3.11.11
6
+ t:
7
+ "1":
8
+ - 1
9
+ - 5
10
+ - 11
11
+ - 41
12
+ - 49
13
+ - 51
14
+ - 53
15
+ - 55
16
+ - 71
17
+ - 105
18
+ "2":
19
+ - 1
20
+ - 5
21
+ - 11
22
+ - 41
23
+ - 49
24
+ - 51
25
+ - 53
26
+ - 55
27
+ - 71
28
+ - 105
29
+ "3":
30
+ - 2
31
+ - 13
32
+ - 16
33
+ - 23
34
+ - 55
35
+ - 61
36
+ "4": 3.11.11
37
+ "5": 0.19.8
38
+ "6": 4.49.0
39
+ "8":
40
+ - 5
41
+ "12": 0.19.8
42
+ "13": linux-x86_64
43
+ bf16:
44
+ value: true
45
+ deepscale:
46
+ value: false
47
+ deepscale_config:
48
+ value: null
49
+ deepspeed:
50
+ value: false
51
+ deepspeed_config:
52
+ value: null
53
+ device:
54
+ value: cuda:0
55
+ epochs:
56
+ value: 1
57
+ eval_datasets:
58
+ value: null
59
+ eval_interval:
60
+ value: 1000000
61
+ eval_split_ratio:
62
+ value: null
63
+ eval_strategy:
64
+ value: epoch
65
+ fp16:
66
+ value: false
67
+ global_rank:
68
+ value: 0
69
+ gradient_accumulation_steps:
70
+ value: 8
71
+ gradient_checkpointing:
72
+ value: true
73
+ local_rank:
74
+ value: 0
75
+ log_dir:
76
+ value: /aifs4su/hansirui_1st/boyuan/resist/setting3-safety/Qwen1.5-4B/Qwen1.5-4B-s3-Q1-10k
77
+ log_project:
78
+ value: Inverse_Alignment
79
+ log_run_name:
80
+ value: qwen-4b-s3-Q1-10k
81
+ log_type:
82
+ value: wandb
83
+ lr:
84
+ value: 1e-05
85
+ lr_scheduler_type:
86
+ value: CONSTANT
87
+ lr_warmup_ratio:
88
+ value: 0
89
+ max_length:
90
+ value: 2048
91
+ model_name_or_path:
92
+ value: /aifs4su/hansirui_1st/models/Qwen1.5-4B
93
+ need_eval:
94
+ value: false
95
+ num_update_steps_per_epoch:
96
+ value: 40
97
+ offload:
98
+ value: none
99
+ output_dir:
100
+ value: /aifs4su/hansirui_1st/boyuan/resist/setting3-safety/Qwen1.5-4B/Qwen1.5-4B-s3-Q1-10k
101
+ per_device_eval_batch_size:
102
+ value: 4
103
+ per_device_train_batch_size:
104
+ value: 4
105
+ save_16bit:
106
+ value: true
107
+ save_interval:
108
+ value: 1000000
109
+ seed:
110
+ value: 42
111
+ tf32:
112
+ value: true
113
+ total_training_steps:
114
+ value: 40
115
+ train_datasets:
116
+ value:
117
+ - - inverse-json
118
+ - path: /home/hansirui_1st/jiayi/resist/setting3/safety_data/training/safe/safe_10k.json
119
+ proportion: 1
120
+ trust_remote_code:
121
+ value: true
122
+ weight_decay:
123
+ value: 0
124
+ zero_stage:
125
+ value: 3
wandb/run-20250528_202910-skis4g6x/files/output.log ADDED
@@ -0,0 +1,19 @@
+ ***** Running training *****
+ Training 1/1 epoch: 0%| | 0/313 [00:00<?, ?it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
+ Training 1/1 epoch (loss 1.3117): 100%|██████████| 313/313 [02:55<00:00, 1.79it/s]
+ [2025-05-28 20:30:04,563] [INFO] [logging.py:128:log_dist] [Rank 0] step=10, skipped=0, lr=[1e-05, 1e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+ [2025-05-28 20:30:04,564] [INFO] [timer.py:264:stop] epoch=0/micro_step=80/global_step=10, RunningAvgSamplesPerSec=58.59260388728531, CurrSamplesPerSec=54.30743087845937, MemAllocated=8.35GB, MaxMemAllocated=11.42GB
+ [2025-05-28 20:30:47,645] [INFO] [logging.py:128:log_dist] [Rank 0] step=20, skipped=0, lr=[1e-05, 1e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+ [2025-05-28 20:30:47,646] [INFO] [timer.py:264:stop] epoch=0/micro_step=160/global_step=20, RunningAvgSamplesPerSec=60.00092597244679, CurrSamplesPerSec=58.60408148477281, MemAllocated=8.35GB, MaxMemAllocated=11.42GB
+ [2025-05-28 20:31:29,423] [INFO] [logging.py:128:log_dist] [Rank 0] step=30, skipped=0, lr=[1e-05, 1e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+ [2025-05-28 20:31:29,423] [INFO] [timer.py:264:stop] epoch=0/micro_step=240/global_step=30, RunningAvgSamplesPerSec=61.1655339875698, CurrSamplesPerSec=65.07105336621166, MemAllocated=8.35GB, MaxMemAllocated=11.42GB
+ Saving model to "/aifs4su/hansirui_1st/boyuan/resist/setting3-safety/Qwen1.5-4B/Qwen1.5-4B-s3-Q1-10k" ...
+ tokenizer config file saved in /aifs4su/hansirui_1st/boyuan/resist/setting3-safety/Qwen1.5-4B/Qwen1.5-4B-s3-Q1-10k/tokenizer_config.json
+ Special tokens file saved in /aifs4su/hansirui_1st/boyuan/resist/setting3-safety/Qwen1.5-4B/Qwen1.5-4B-s3-Q1-10k/special_tokens_map.json
+ Saving 16-bit model...
+ [2025-05-28 20:32:11,233] [INFO] [logging.py:128:log_dist] [Rank 0] [Torch] Checkpoint global_step39 is about to be saved!
+ [2025-05-28 20:32:11,234] [INFO] [engine.py:3804:save_16bit_model] Saving model weights to /aifs4su/hansirui_1st/boyuan/resist/setting3-safety/Qwen1.5-4B/Qwen1.5-4B-s3-Q1-10k/pytorch_model.bin, tag: global_step39
+ [2025-05-28 20:32:11,234] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /aifs4su/hansirui_1st/boyuan/resist/setting3-safety/Qwen1.5-4B/Qwen1.5-4B-s3-Q1-10k/pytorch_model.bin...
+ [2025-05-28 20:32:18,522] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /aifs4su/hansirui_1st/boyuan/resist/setting3-safety/Qwen1.5-4B/Qwen1.5-4B-s3-Q1-10k/pytorch_model.bin.
+ [2025-05-28 20:32:18,523] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step39 is ready now!
+ Model saved!
wandb/run-20250528_202910-skis4g6x/files/requirements.txt ADDED
@@ -0,0 +1,247 @@
+ PyYAML==6.0.2
+ sentencepiece==0.2.0
+ certifi==2025.1.31
+ protobuf==5.29.3
+ pyasn1==0.6.1
+ nvitop==1.4.2
+ compressed-tensors==0.9.1
+ platformdirs==4.3.6
+ idna==3.10
+ decorator==5.2.1
+ py-spy==0.4.0
+ aiosignal==1.3.2
+ shellingham==1.5.4
+ pytest==8.3.5
+ sniffio==1.3.1
+ outlines_core==0.1.26
+ grpcio==1.70.0
+ diskcache==5.6.3
+ smmap==5.0.2
+ safetensors==0.5.3
+ py-cpuinfo==9.0.0
+ markdown-it-py==3.0.0
+ GitPython==3.1.44
+ propcache==0.3.0
+ urllib3==2.3.0
+ pyarrow==19.0.1
+ rich-toolkit==0.13.2
+ absl-py==2.1.0
+ uc-micro-py==1.0.3
+ distlib==0.3.9
+ xformers==0.0.29.post3
+ huggingface-hub==0.29.2
+ typing_extensions==4.12.2
+ mistral_common==1.5.3
+ airportsdata==20250224
+ python-dotenv==1.0.1
+ numpy==1.26.4
+ aiohappyeyeballs==2.5.0
+ typer==0.15.2
+ google-api-core==2.24.1
+ fastrlock==0.8.3
+ pydub==0.25.1
+ charset-normalizer==3.4.1
+ pybind11==2.13.6
+ httptools==0.6.4
+ torchaudio==2.6.0
+ watchfiles==1.0.4
+ orjson==3.10.15
+ setuptools==75.8.0
+ click==8.1.8
+ aiohttp==3.11.13
+ cupy-cuda12x==13.4.0
+ scipy==1.15.2
+ vllm==0.7.3
+ soxr==0.5.0.post1
+ dill==0.3.8
+ gradio_client==1.7.2
+ nvidia-cuda-cupti-cu12==12.4.127
+ python-multipart==0.0.20
+ Pygments==2.19.1
+ nvidia-cusparse-cu12==12.3.1.170
+ Markdown==3.7
+ pydantic==2.10.6
+ RapidFuzz==3.12.2
+ regex==2024.11.6
+ tokenizers==0.21.0
+ google-auth==2.38.0
+ wrapt==1.17.2
+ safehttpx==0.1.6
+ numba==0.60.0
+ antlr4-python3-runtime==4.13.2
+ fire==0.7.0
+ llama_stack==0.2.0
+ audioread==3.0.1
+ interegular==0.3.3
+ blobfile==3.0.0
+ rich==13.9.4
+ xxhash==3.5.0
+ nvidia-nvjitlink-cu12==12.4.127
+ pyasn1_modules==0.4.1
+ resampy==0.4.3
+ pydantic_core==2.27.2
+ nvidia-cufft-cu12==11.2.1.3
+ pycparser==2.22
+ cloudpickle==3.1.1
+ lazy_loader==0.4
+ cffi==1.17.1
+ python-dateutil==2.9.0.post0
+ triton==3.2.0
+ opencensus-context==0.1.3
+ jsonschema==4.23.0
+ optree==0.16.0
+ dnspython==2.7.0
+ Jinja2==3.1.6
+ align-anything==0.0.1.dev0
+ cachetools==5.5.2
+ datasets==3.3.2
+ iniconfig==2.0.0
+ pluggy==1.5.0
+ llvmlite==0.43.0
+ prometheus-fastapi-instrumentator==7.0.2
+ xgrammar==0.1.11
+ joblib==1.4.2
+ threadpoolctl==3.5.0
+ sentry-sdk==2.22.0
+ wheel==0.45.1
+ mdit-py-plugins==0.4.2
+ torchvision==0.21.0
+ flash_attn==2.7.4.post1
+ networkx==3.4.2
+ sympy==1.13.1
+ pycountry==24.6.1
+ nvidia-cudnn-cu12==9.1.0.70
+ jiter==0.8.2
+ latex2sympy2_extended==1.10.1
+ Levenshtein==0.27.1
+ gradio==5.20.0
+ lm-format-enforcer==0.10.11
+ uvloop==0.21.0
+ nvidia-cusparselt-cu12==0.6.2
+ astor==0.8.1
+ h11==0.14.0
+ rpds-py==0.23.1
+ pyzmq==26.2.1
+ transformers==4.49.0
+ librosa==0.10.2.post1
+ gitdb==4.0.12
+ proto-plus==1.26.0
+ mdurl==0.1.2
+ deepspeed==0.16.4
+ scikit-learn==1.6.1
+ yarl==1.18.3
+ pycryptodomex==3.22.0
+ outlines==0.1.11
+ tensorboard==2.19.0
+ soundfile==0.13.1
+ distro==1.9.0
+ opencensus==0.11.4
+ ray==2.40.0
+ Flask==3.1.0
+ lark==1.2.2
+ referencing==0.36.2
+ setproctitle==1.3.5
+ nvidia-cuda-nvrtc-cu12==12.4.127
+ nvidia-nvtx-cu12==12.4.127
+ docker-pycreds==0.4.0
+ opencv-python-headless==4.11.0.86
+ nvidia-cuda-runtime-cu12==12.4.127
+ packaging==24.2
+ nvidia-cusolver-cu12==11.6.1.9
+ peft==0.14.0
+ pillow==11.1.0
+ MarkupSafe==2.1.5
+ groovy==0.1.2
+ jsonschema-specifications==2024.10.1
+ linkify-it-py==2.0.3
+ gguf==0.10.0
+ fastapi==0.115.11
+ einops==0.8.1
+ blake3==1.0.4
+ accelerate==1.4.0
+ zipp==3.21.0
+ pyaml==25.1.0
+ uvicorn==0.34.0
+ httpx==0.28.1
+ psutil==7.0.0
+ websockets==15.0.1
+ ruff==0.9.9
+ nvidia-curand-cu12==10.3.5.147
+ fsspec==2024.12.0
+ nvidia-ml-py==12.570.86
+ textual==2.1.2
+ pip==25.0
+ msgpack==1.1.0
+ openai==1.65.4
+ diffusers==0.32.2
+ annotated-types==0.7.0
+ email_validator==2.2.0
+ multidict==6.1.0
+ tiktoken==0.9.0
+ msgspec==0.19.0
+ pandas==2.2.3
+ wcwidth==0.2.13
+ itsdangerous==2.2.0
+ smart-open==7.1.0
+ math-verify==0.7.0
+ nvidia-nccl-cu12==2.21.5
+ nest-asyncio==1.6.0
+ ninja==1.11.1.3
+ virtualenv==20.29.2
+ rsa==4.9
+ starlette==0.46.0
+ anyio==4.8.0
+ tensorboard-data-server==0.7.2
+ tomlkit==0.13.2
+ blinker==1.9.0
+ termcolor==3.0.1
+ nvidia-cublas-cu12==12.4.5.8
+ depyf==0.18.0
+ importlib_metadata==8.6.1
+ prompt_toolkit==3.0.50
+ attrs==25.1.0
+ fastapi-cli==0.0.7
+ six==1.17.0
+ hjson==3.1.0
+ memray==1.15.0
+ torch==2.6.0
+ tzdata==2025.1
+ colorful==0.5.6
+ googleapis-common-protos==1.69.0
+ llama_stack_client==0.2.0
+ ffmpy==0.5.0
+ frechet-audio-distance==0.1.2
+ torchlibrosa==0.1.0
+ partial-json-parser==0.2.1.1.post5
+ frozenlist==1.5.0
+ tqdm==4.67.1
+ aiofiles==23.2.1
+ aiohttp-cors==0.7.0
+ wandb==0.19.8
+ filelock==3.17.0
+ lxml==5.3.2
+ Werkzeug==3.1.3
+ multiprocess==0.70.16
+ prometheus_client==0.21.1
+ mpmath==1.3.0
+ pytz==2025.1
+ semantic-version==2.10.0
+ requests==2.32.3
+ pooch==1.8.2
+ httpcore==1.0.7
+ typing_extensions==4.12.2
+ jaraco.functools==4.0.1
+ platformdirs==4.2.2
+ jaraco.text==3.12.1
+ wheel==0.43.0
+ inflect==7.3.1
+ tomli==2.0.1
+ typeguard==4.3.0
+ packaging==24.2
+ more-itertools==10.3.0
+ zipp==3.19.2
+ importlib_metadata==8.0.0
+ autocommand==2.2.2
+ backports.tarfile==1.2.0
+ jaraco.context==5.3.0
+ jaraco.collections==5.1.0
wandb/run-20250528_202910-skis4g6x/files/wandb-metadata.json ADDED
@@ -0,0 +1,130 @@
+ {
+   "os": "Linux-5.15.0-1040-nvidia-x86_64-with-glibc2.35",
+   "python": "CPython 3.11.11",
+   "startedAt": "2025-05-28T12:29:11.014551Z",
+   "args": [
+     "--local_rank=0",
+     "--train_datasets",
+     "inverse-json::/home/hansirui_1st/jiayi/resist/setting3/safety_data/training/safe/safe_10k.json",
+     "--model_name_or_path",
+     "/aifs4su/hansirui_1st/models/Qwen1.5-4B",
+     "--max_length",
+     "2048",
+     "--trust_remote_code",
+     "True",
+     "--epochs",
+     "1",
+     "--per_device_train_batch_size",
+     "4",
+     "--per_device_eval_batch_size",
+     "4",
+     "--gradient_accumulation_steps",
+     "8",
+     "--gradient_checkpointing",
+     "--learning_rate",
+     "1e-5",
+     "--lr_warmup_ratio",
+     "0",
+     "--weight_decay",
+     "0.0",
+     "--lr_scheduler_type",
+     "constant",
+     "--weight_decay",
+     "0.0",
+     "--seed",
+     "42",
+     "--output_dir",
+     "/aifs4su/hansirui_1st/boyuan/resist/setting3-safety/Qwen1.5-4B/Qwen1.5-4B-s3-Q1-10k",
+     "--log_type",
+     "wandb",
+     "--log_run_name",
+     "qwen-4b-s3-Q1-10k",
+     "--log_project",
+     "Inverse_Alignment",
+     "--zero_stage",
+     "3",
+     "--offload",
+     "none",
+     "--bf16",
+     "True",
+     "--tf32",
+     "True",
+     "--save_16bit"
+   ],
+   "program": "-m safe_rlhf.finetune.__main__",
+   "email": "[email protected]",
+   "root": "/aifs4su/hansirui_1st/boyuan/resist/setting3-safety/Qwen1.5-4B/Qwen1.5-4B-s3-Q1-10k",
+   "host": "dgx-096",
+   "executable": "/aifs4su/hansirui_1st/miniconda3/envs/by-align/bin/python",
+   "cpu_count": 112,
+   "cpu_count_logical": 224,
+   "gpu": "NVIDIA H800",
+   "gpu_count": 8,
+   "disk": {
+     "/": {
+       "total": "1888556142592",
+       "used": "78049411072"
+     }
+   },
+   "memory": {
+     "total": "2164195426304"
+   },
+   "cpu": {
+     "count": 112,
+     "countLogical": 224
+   },
+   "gpu_nvidia": [
+     {
+       "name": "NVIDIA H800",
+       "memoryTotal": "85520809984",
+       "cudaCores": 16896,
+       "architecture": "Hopper"
+     },
+     {
+       "name": "NVIDIA H800",
+       "memoryTotal": "85520809984",
+       "cudaCores": 16896,
+       "architecture": "Hopper"
+     },
+     {
+       "name": "NVIDIA H800",
+       "memoryTotal": "85520809984",
+       "cudaCores": 16896,
+       "architecture": "Hopper"
+     },
+     {
+       "name": "NVIDIA H800",
+       "memoryTotal": "85520809984",
+       "cudaCores": 16896,
+       "architecture": "Hopper"
+     },
+     {
+       "name": "NVIDIA H800",
+       "memoryTotal": "85520809984",
+       "cudaCores": 16896,
+       "architecture": "Hopper"
+     },
+     {
+       "name": "NVIDIA H800",
+       "memoryTotal": "85520809984",
+       "cudaCores": 16896,
+       "architecture": "Hopper"
+     },
+     {
+       "name": "NVIDIA H800",
+       "memoryTotal": "85520809984",
+       "cudaCores": 16896,
+       "architecture": "Hopper"
+     },
+     {
+       "name": "NVIDIA H800",
+       "memoryTotal": "85520809984",
+       "cudaCores": 16896,
+       "architecture": "Hopper"
+     }
+   ],
+   "slurm": {
+     "conf": "/cm/shared/apps/slurm/var/etc/slurm/slurm.conf"
+   },
+   "cudaVersion": "12.2"
+ }
wandb/run-20250528_202910-skis4g6x/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
+ {"_wandb":{"runtime":187},"train/loss":1.3116559982299805,"train/lr":1e-05,"train/epoch":1,"_timestamp":1.748435527024011e+09,"_runtime":187.56364447,"_step":313,"train/step":313}
wandb/run-20250528_202910-skis4g6x/logs/debug-core.log ADDED
@@ -0,0 +1,14 @@
+ {"time":"2025-05-28T20:29:10.413633415+08:00","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpzqojq8hk/port-660901.txt","pid":660901,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false}
+ {"time":"2025-05-28T20:29:10.414535077+08:00","level":"INFO","msg":"Will exit if parent process dies.","ppid":660901}
+ {"time":"2025-05-28T20:29:10.414534894+08:00","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":38529,"Zone":""}}
+ {"time":"2025-05-28T20:29:10.595145722+08:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:55176"}
+ {"time":"2025-05-28T20:29:11.015926532+08:00","level":"INFO","msg":"handleInformInit: received","streamId":"skis4g6x","id":"127.0.0.1:55176"}
+ {"time":"2025-05-28T20:29:11.23321905+08:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"skis4g6x","id":"127.0.0.1:55176"}
+ {"time":"2025-05-28T20:32:19.92912143+08:00","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"127.0.0.1:55176"}
+ {"time":"2025-05-28T20:32:19.929338802+08:00","level":"INFO","msg":"connection: closing","id":"127.0.0.1:55176"}
+ {"time":"2025-05-28T20:32:19.92941574+08:00","level":"INFO","msg":"connection: closed successfully","id":"127.0.0.1:55176"}
+ {"time":"2025-05-28T20:32:19.92942195+08:00","level":"INFO","msg":"server is shutting down"}
+ {"time":"2025-05-28T20:32:20.209269705+08:00","level":"ERROR","msg":"processOutgoingData: flush error","error":"write tcp 127.0.0.1:38529->127.0.0.1:55176: use of closed network connection","id":"127.0.0.1:55176"}
+ {"time":"2025-05-28T20:32:20.209674561+08:00","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"127.0.0.1:55176"}
+ {"time":"2025-05-28T20:32:20.209688302+08:00","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"127.0.0.1:55176"}
+ {"time":"2025-05-28T20:32:20.209693575+08:00","level":"INFO","msg":"server is closed"}
wandb/run-20250528_202910-skis4g6x/logs/debug-internal.log ADDED
@@ -0,0 +1,16 @@
+ {"time":"2025-05-28T20:29:11.018472061+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.8","symlink path":"/aifs4su/hansirui_1st/boyuan/resist/setting3-safety/Qwen1.5-4B/Qwen1.5-4B-s3-Q1-10k/wandb/run-20250528_202910-skis4g6x/logs/debug-core.log"}
+ {"time":"2025-05-28T20:29:11.233125615+08:00","level":"INFO","msg":"created new stream","id":"skis4g6x"}
+ {"time":"2025-05-28T20:29:11.233210455+08:00","level":"INFO","msg":"stream: started","id":"skis4g6x"}
+ {"time":"2025-05-28T20:29:11.233234745+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"skis4g6x"}
+ {"time":"2025-05-28T20:29:11.233240091+08:00","level":"INFO","msg":"handler: started","stream_id":"skis4g6x"}
+ {"time":"2025-05-28T20:29:11.233251471+08:00","level":"INFO","msg":"sender: started","stream_id":"skis4g6x"}
+ {"time":"2025-05-28T20:29:11.490072877+08:00","level":"INFO","msg":"Starting system monitor"}
+ {"time":"2025-05-28T20:32:18.578212079+08:00","level":"INFO","msg":"Stopping system monitor"}
+ {"time":"2025-05-28T20:32:18.578786481+08:00","level":"INFO","msg":"Stopped system monitor"}
+ {"time":"2025-05-28T20:32:19.92936316+08:00","level":"INFO","msg":"stream: closing","id":"skis4g6x"}
+ {"time":"2025-05-28T20:32:19.929423159+08:00","level":"WARN","msg":"sender: received Exit record more than once, ignoring"}
+ {"time":"2025-05-28T20:32:19.979380287+08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
+ {"time":"2025-05-28T20:32:20.209198879+08:00","level":"INFO","msg":"handler: closed","stream_id":"skis4g6x"}
+ {"time":"2025-05-28T20:32:20.209242482+08:00","level":"INFO","msg":"writer: Close: closed","stream_id":"skis4g6x"}
+ {"time":"2025-05-28T20:32:20.209300836+08:00","level":"INFO","msg":"sender: closed","stream_id":"skis4g6x"}
+ {"time":"2025-05-28T20:32:20.209596534+08:00","level":"INFO","msg":"stream: closed","id":"skis4g6x"}
wandb/run-20250528_202910-skis4g6x/logs/debug.log ADDED
@@ -0,0 +1,47 @@
+ 2025-05-28 20:29:11,009 INFO MainThread:660901 [wandb_setup.py:_flush():67] Current SDK version is 0.19.8
+ 2025-05-28 20:29:11,010 INFO MainThread:660901 [wandb_setup.py:_flush():67] Configure stats pid to 660901
+ 2025-05-28 20:29:11,010 INFO MainThread:660901 [wandb_setup.py:_flush():67] Loading settings from /home/hansirui_1st/.config/wandb/settings
+ 2025-05-28 20:29:11,010 INFO MainThread:660901 [wandb_setup.py:_flush():67] Loading settings from /home/hansirui_1st/jiayi/resist/setting3/scripts/wandb/settings
+ 2025-05-28 20:29:11,010 INFO MainThread:660901 [wandb_setup.py:_flush():67] Loading settings from environment variables
+ 2025-05-28 20:29:11,010 INFO MainThread:660901 [wandb_init.py:setup_run_log_directory():647] Logging user logs to /aifs4su/hansirui_1st/boyuan/resist/setting3-safety/Qwen1.5-4B/Qwen1.5-4B-s3-Q1-10k/wandb/run-20250528_202910-skis4g6x/logs/debug.log
+ 2025-05-28 20:29:11,010 INFO MainThread:660901 [wandb_init.py:setup_run_log_directory():648] Logging internal logs to /aifs4su/hansirui_1st/boyuan/resist/setting3-safety/Qwen1.5-4B/Qwen1.5-4B-s3-Q1-10k/wandb/run-20250528_202910-skis4g6x/logs/debug-internal.log
+ 2025-05-28 20:29:11,010 INFO MainThread:660901 [wandb_init.py:init():761] calling init triggers
+ 2025-05-28 20:29:11,010 INFO MainThread:660901 [wandb_init.py:init():766] wandb.init called with sweep_config: {}
+ config: {'model_name_or_path': '/aifs4su/hansirui_1st/models/Qwen1.5-4B', 'max_length': 2048, 'trust_remote_code': True, 'train_datasets': [('inverse-json', {'proportion': 1.0, 'path': '/home/hansirui_1st/jiayi/resist/setting3/safety_data/training/safe/safe_10k.json'})], 'eval_datasets': None, 'epochs': 1, 'per_device_train_batch_size': 4, 'per_device_eval_batch_size': 4, 'gradient_accumulation_steps': 8, 'gradient_checkpointing': True, 'lr': 1e-05, 'lr_scheduler_type': <SchedulerType.CONSTANT: 'constant'>, 'lr_warmup_ratio': 0.0, 'weight_decay': 0.0, 'seed': 42, 'fp16': False, 'bf16': True, 'tf32': True, 'eval_strategy': 'epoch', 'eval_interval': 1000000, 'need_eval': False, 'eval_split_ratio': None, 'output_dir': '/aifs4su/hansirui_1st/boyuan/resist/setting3-safety/Qwen1.5-4B/Qwen1.5-4B-s3-Q1-10k', 'log_type': 'wandb', 'log_dir': '/aifs4su/hansirui_1st/boyuan/resist/setting3-safety/Qwen1.5-4B/Qwen1.5-4B-s3-Q1-10k', 'log_project': 'Inverse_Alignment', 'log_run_name': 'qwen-4b-s3-Q1-10k', 'save_16bit': True, 'save_interval': 1000000, 'local_rank': 0, 'zero_stage': 3, 'offload': 'none', 'deepspeed': False, 'deepspeed_config': None, 'deepscale': False, 'deepscale_config': None, 'global_rank': 0, 'device': device(type='cuda', index=0), 'num_update_steps_per_epoch': 40, 'total_training_steps': 40, '_wandb': {}}
+ 2025-05-28 20:29:11,010 INFO MainThread:660901 [wandb_init.py:init():784] starting backend
+ 2025-05-28 20:29:11,010 INFO MainThread:660901 [wandb_init.py:init():788] sending inform_init request
+ 2025-05-28 20:29:11,014 INFO MainThread:660901 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+ 2025-05-28 20:29:11,014 INFO MainThread:660901 [wandb_init.py:init():798] backend started and connected
+ 2025-05-28 20:29:11,015 INFO MainThread:660901 [wandb_init.py:init():891] updated telemetry
+ 2025-05-28 20:29:11,016 INFO MainThread:660901 [wandb_init.py:init():915] communicating run to backend with 90.0 second timeout
+ 2025-05-28 20:29:11,484 INFO MainThread:660901 [wandb_init.py:init():990] starting run threads in backend
+ 2025-05-28 20:29:11,697 INFO MainThread:660901 [wandb_run.py:_console_start():2375] atexit reg
+ 2025-05-28 20:29:11,697 INFO MainThread:660901 [wandb_run.py:_redirect():2227] redirect: wrap_raw
+ 2025-05-28 20:29:11,697 INFO MainThread:660901 [wandb_run.py:_redirect():2292] Wrapping output streams.
+ 2025-05-28 20:29:11,697 INFO MainThread:660901 [wandb_run.py:_redirect():2315] Redirects installed.
+ 2025-05-28 20:29:11,699 INFO MainThread:660901 [wandb_init.py:init():1032] run started, returning control to user process
+ 2025-05-28 20:32:18,575 INFO MainThread:660901 [wandb_run.py:_finish():2112] finishing run xtom/Inverse_Alignment/skis4g6x
+ 2025-05-28 20:32:18,576 INFO MainThread:660901 [wandb_run.py:_atexit_cleanup():2340] got exitcode: 0
+ 2025-05-28 20:32:18,577 INFO MainThread:660901 [wandb_run.py:_restore():2322] restore
+ 2025-05-28 20:32:18,577 INFO MainThread:660901 [wandb_run.py:_restore():2328] restore done
+ 2025-05-28 20:32:19,578 INFO MainThread:660901 [wandb_run.py:_restore():2322] restore
+ 2025-05-28 20:32:19,578 INFO MainThread:660901 [wandb_run.py:_restore():2328] restore done
+ 2025-05-28 20:32:19,578 ERROR MainThread:660901 [wandb_run.py:_atexit_cleanup():2361] Problem finishing run
+ Traceback (most recent call last):
+   File "/aifs4su/hansirui_1st/miniconda3/envs/by-align/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 2352, in _atexit_cleanup
+     self._on_finish()
+   File "/aifs4su/hansirui_1st/miniconda3/envs/by-align/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 2609, in _on_finish
+     wait_with_progress(
+   File "/aifs4su/hansirui_1st/miniconda3/envs/by-align/lib/python3.11/site-packages/wandb/sdk/mailbox/wait_with_progress.py", line 24, in wait_with_progress
+     return wait_all_with_progress(
+            ^^^^^^^^^^^^^^^^^^^^^^^
+   File "/aifs4su/hansirui_1st/miniconda3/envs/by-align/lib/python3.11/site-packages/wandb/sdk/mailbox/wait_with_progress.py", line 87, in wait_all_with_progress
+     return asyncio_compat.run(progress_loop_with_timeout)
+            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+   File "/aifs4su/hansirui_1st/miniconda3/envs/by-align/lib/python3.11/site-packages/wandb/sdk/lib/asyncio_compat.py", line 27, in run
+     future = executor.submit(runner.run, fn)
+              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+   File "/aifs4su/hansirui_1st/miniconda3/envs/by-align/lib/python3.11/concurrent/futures/thread.py", line 169, in submit
+     raise RuntimeError('cannot schedule new futures after '
+ RuntimeError: cannot schedule new futures after interpreter shutdown
+ 2025-05-28 20:32:19,928 INFO MsgRouterThr:660901 [mailbox.py:close():129] Closing mailbox, abandoning 2 handles.
wandb/run-20250528_202910-skis4g6x/run-skis4g6x.wandb ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7793d6aacd2694588c2caa5999a3002fca95a71bb19f40127fa43174046d9b44
+ size 286834