The model was trained with the following configuration:
```yaml
variables:
  global_seed: 17
  max_seq_len: 8192
  # Run Name
  run_name: llama32_clm34_Q512xP400_8192

max_seq_len: ${variables.max_seq_len}
run_name: ${variables.run_name}

fsdp_config:
  sharding_strategy: FULL_SHARD
  mixed_precision: PURE
  activation_checkpointing: true
  activation_checkpointing_reentrant: false
  activation_cpu_offload: false

model:
  name: hf_causal_lm
  pretrained: true
  pretrained_model_name_or_path: mfajcik/clm_llama32_8K  # RECIPE from Llama 3.2: first pretrain on 2k context, then 2 epochs on 8k context, then 1 epoch with hard negatives
  use_flash_attention_2: true
  copy_config:
    copy_implementation: v3.4_fulljoint_ss
    copy_token: "|copy|"
    config_overrides:
      span_heads: 256
      span_d: 6144
    # Queries
    K_past_positives: 312  # how many 'positives' to take from tokens with copy position annotated (span gt)
    K_past_negatives: 200  # how many 'negatives' to sample from tokens without copy position annotation (token gt)
    # Past states - how many candidates to consider for starts/ends for each query (number includes gt if available + negatives)
    K_start: 400
    K_end: 400
    smart_sampling: true
    hn_topk_positions: 800
    reweighting: false
    # sparse_bmm_triton: true

# Tokenizer
tokenizer:
  name: meta-llama/Llama-3.2-3B-Instruct
  kwargs:
    model_max_length: ${variables.max_seq_len}

# Dataloaders
train_loader:
  name: finetuning
  dataset:
    hf_name: mfajcik/WildChat-copyN2_llama31_L8192
    split: train
    max_seq_len: ${variables.max_seq_len}
    allow_pad_trimming: true
    decoder_only_format: true
    shuffle: true
    preprocessing_fn: src.preprocessing.utils:filter_prompt_response
  drop_last: true
  num_workers: 8
  pin_memory: false
  prefetch_factor: 2
  persistent_workers: true
  timeout: 0

eval_loader:
  name: finetuning
  dataset:
    hf_name: mfajcik/WildChat-copyN2_llama31_L8192
    split: validation  # change to validation in later experiments
    max_seq_len: ${variables.max_seq_len}
    allow_pad_trimming: false
    decoder_only_format: true
    shuffle: false
    preprocessing_fn: src.preprocessing.utils:filter_prompt_response
  drop_last: true
  num_workers: 8
  pin_memory: false
  prefetch_factor: 2
  persistent_workers: true
  timeout: 0

## Optimization
scheduler:
  name: cosine_with_warmup
  t_warmup: 200ba
  alpha_f: 0.1

optimizer:
  name: decoupled_lionw
  lr: 5e-6
  betas:
    - 0.9
    - 0.95
  weight_decay: 0

algorithms:
  gradient_clipping:
    clipping_type: norm
    clipping_threshold: 2.0

max_duration: 2ep
eval_interval: 100ba
eval_first: false
global_train_batch_size: 128

# System
seed: ${variables.global_seed}
device_eval_batch_size: 1
device_train_microbatch_size: 1
precision: amp_bf16

# Logging
progress_bar: true
log_to_console: true
console_log_interval: 1ba

callbacks:
  speed_monitor:
    window_size: 10
  lr_monitor: {}
  memory_monitor: {}
  runtime_estimator: {}

loggers:
  wandb:
    project: "copylm"  # Replace with your project name
    entity: "ifajcik-brno-university-of-technology"  # Replace with your username or team name
    name: "DBGN_Q512xP400_8192"  # Optional: name of the current experiment

# Checkpoint to local filesystem or remote object store
save_interval: 100ba
autoresume: true
save_num_checkpoints_to_keep: 25  # Important, this cleans up checkpoints saved to DISK
load_weights_only: true
save_folder: /storage/brno2/home/ifajcik/code/llm-foundry25/.saved/${variables.run_name}
```
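
For reference, the tokenizer and fine-tuning dataset named in the config can be fetched with the standard `transformers` and `datasets` APIs. This is only a sketch based on the identifiers above; the dataset repository may be gated or private.

```python
from datasets import load_dataset
from transformers import AutoTokenizer

# Tokenizer named in the config; model_max_length mirrors max_seq_len (8192).
tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Llama-3.2-3B-Instruct",
    model_max_length=8192,
)

# Fine-tuning data referenced by train_loader / eval_loader.
# The repository may be gated or private; split names follow the config.
train_ds = load_dataset("mfajcik/WildChat-copyN2_llama31_L8192", split="train")
print(tokenizer)
print(train_ds)
```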
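The dataloaders reference `src.preprocessing.utils:filter_prompt_response`, which ships with the training code rather than with this card. In llm-foundry's `finetuning` loader, a `preprocessing_fn` maps a raw dataset row into a `prompt`/`response` pair; the function below is only a hypothetical illustration of that contract. The field names and the (trivial) filtering logic are assumptions, not the author's implementation.

```python
def filter_prompt_response(example: dict) -> dict:
    """Hypothetical sketch of a finetuning preprocessing_fn.

    llm-foundry expects a dict with 'prompt' and 'response' keys; the raw
    field names used here are assumptions about the dataset schema.
    """
    prompt = (example.get("prompt") or "").strip()
    response = (example.get("response") or "").strip()
    return {"prompt": prompt, "response": response}
```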
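Optimization uses DecoupledLionW at a peak learning rate of 5e-6 with a `cosine_with_warmup` schedule: 200 batches of linear warmup, then cosine decay to `alpha_f = 0.1` of the peak. Below is a minimal sketch of that shape, assuming the usual linear-warmup-then-cosine behaviour and a hypothetical total step count (the real count depends on the dataset size and the 2-epoch duration).

```python
import math

PEAK_LR = 5e-6      # optimizer.lr
T_WARMUP = 200      # scheduler.t_warmup (batches)
ALPHA_F = 0.1       # scheduler.alpha_f: final LR multiplier
TOTAL_STEPS = 2000  # hypothetical; depends on dataset size and max_duration: 2ep

def lr_at(step: int) -> float:
    """Approximate schedule: linear warmup to PEAK_LR, then cosine decay
    down to ALPHA_F * PEAK_LR at the end of training."""
    if step < T_WARMUP:
        return PEAK_LR * step / T_WARMUP
    progress = (step - T_WARMUP) / max(1, TOTAL_STEPS - T_WARMUP)
    cosine = 0.5 * (1 + math.cos(math.pi * min(progress, 1.0)))
    return PEAK_LR * (ALPHA_F + (1 - ALPHA_F) * cosine)

for s in (0, 100, 200, 1000, 2000):
    print(s, f"{lr_at(s):.2e}")
```

With `global_train_batch_size: 128` and `device_train_microbatch_size: 1`, each optimizer step accumulates `128 / world_size` microbatches per GPU (for example, 16 microbatches per GPU on 8 GPUs).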