diff --git "a/log" "b/log" new file mode 100644--- /dev/null +++ "b/log" @@ -0,0 +1,1349 @@ +wandb: Currently logged in as: u-hyszk. Use `wandb login --relogin` to force relogin +wandb: Tracking run with wandb version 0.16.6 +wandb: Run data is saved locally in /disk/fs1/bigtmp/hayashizaki.yu.t5/bitnet-b158/wandb/run-20240429_114341-c6ka7z13 +wandb: Run `wandb offline` to turn off syncing. +wandb: Syncing run giddy-deluge-22 +wandb: ⭐️ View project at https://wandb.ai/u-hyszk/bitnet-b158 +wandb: 🚀 View run at https://wandb.ai/u-hyszk/bitnet-b158/runs/c6ka7z13 +Namespace(context_length=128, batch_size=32, output='bitnet-b158-wiki40b-ja', hidden_size=768, intermediate_size=1536, num_attention_heads=12, num_hidden_layers=12, num_key_value_heads=4, logging_steps=2000, eval_steps=2000, save_steps=2000, gradient_accumulation_steps=1, warmup_steps=5000, learning_rate=0.0024, adam_beta1=0.9, adam_beta2=0.95, weight_decay=0.1) +BitLlamaForCausalLM( + (model): BitLlamaModel( + (embed_tokens): Embedding(43176, 768) + (layers): ModuleList( + (0-11): 12 x BitLlamaDecoderLayer( + (self_attn): BitLlamaAttention( + (q_proj): BitLinear158b( + in_features=768, out_features=768, bias=False + (layernorm): BitRMSNorm() + ) + (k_proj): BitLinear158b( + in_features=768, out_features=256, bias=False + (layernorm): BitRMSNorm() + ) + (v_proj): BitLinear158b( + in_features=768, out_features=256, bias=False + (layernorm): BitRMSNorm() + ) + (o_proj): BitLinear158b( + in_features=768, out_features=768, bias=False + (layernorm): BitRMSNorm() + ) + (rotary_emb): LlamaRotaryEmbedding() + ) + (mlp): BitLlamaMLP( + (gate_proj): BitLinear158b( + in_features=768, out_features=1536, bias=False + (layernorm): BitRMSNorm() + ) + (up_proj): BitLinear158b( + in_features=768, out_features=1536, bias=False + (layernorm): BitRMSNorm() + ) + (down_proj): BitLinear158b( + in_features=1536, out_features=768, bias=False + (layernorm): BitRMSNorm() + ) + (act_fn): SiLU() + ) + ) + ) + (norm): LlamaRMSNorm() + ) + (lm_head): BitLinear( + in_features=768, out_features=43176, bias=False, flg_before_linear=True + (layernorm): BitRMSNorm() + ) +) + Map: 0%| | 0/670852 [00:00