The model was trained with the following configuration:
```yaml
variables:
  global_seed: 17
  max_seq_len: 8192
  # Run Name
  run_name: llama32_clm34_Q512xP400_8192

max_seq_len: ${variables.max_seq_len}
run_name: ${variables.run_name}

fsdp_config:
  sharding_strategy: FULL_SHARD
  mixed_precision: PURE
  activation_checkpointing: true
  activation_checkpointing_reentrant: false
  activation_cpu_offload: false

model:
  name: hf_causal_lm
  pretrained: true
  pretrained_model_name_or_path: mfajcik/clm_llama32_8K  # RECIPE from Llama 3.2: first pretrain on 2k context, then 2 epochs on 8k context, then 1 epoch with hard negatives
  use_flash_attention_2: true
  copy_config:
    copy_implementation: v3.4_fulljoint_ss
    copy_token: "|copy|"
    config_overrides:
      span_heads: 256
      span_d: 6144
    # Queries
    K_past_positives: 312  # how many 'positives' to take from tokens with copy position annotated (span gt)
    K_past_negatives: 200  # how many 'negatives' to sample from tokens without copy position annotation (token gt)
    # Past states - how many candidates to consider for starts/ends for each query (number includes gt if available + negatives)
    K_start: 400
    K_end: 400
    smart_sampling: true
    hn_topk_positions: 800
    reweighting: false
    # sparse_bmm_triton: true

# Tokenizer
tokenizer:
  name: meta-llama/Llama-3.2-3B-Instruct
  kwargs:
    model_max_length: ${variables.max_seq_len}

# Dataloaders
train_loader:
  name: finetuning
  dataset:
    hf_name: mfajcik/WildChat-copyN2_llama31_L8192
    split: train
    max_seq_len: ${variables.max_seq_len}
    allow_pad_trimming: true
    decoder_only_format: true
    shuffle: true
    preprocessing_fn: src.preprocessing.utils:filter_prompt_response
  drop_last: true
  num_workers: 8
  pin_memory: false
  prefetch_factor: 2
  persistent_workers: true
  timeout: 0

eval_loader:
  name: finetuning
  dataset:
    hf_name: mfajcik/WildChat-copyN2_llama31_L8192
    split: validation  # change to validation in later experiments
    max_seq_len: ${variables.max_seq_len}
    allow_pad_trimming: false
    decoder_only_format: true
    shuffle: false
    preprocessing_fn: src.preprocessing.utils:filter_prompt_response
  drop_last: true
  num_workers: 8
  pin_memory: false
  prefetch_factor: 2
  persistent_workers: true
  timeout: 0

## Optimization
scheduler:
  name: cosine_with_warmup
  t_warmup: 200ba
  alpha_f: 0.1

optimizer:
  name: decoupled_lionw
  lr: 5e-6
  betas:
    - 0.9
    - 0.95
  weight_decay: 0

algorithms:
  gradient_clipping:
    clipping_type: norm
    clipping_threshold: 2.0

max_duration: 2ep
eval_interval: 100ba
eval_first: false
global_train_batch_size: 128

# System
seed: ${variables.global_seed}
device_eval_batch_size: 1
device_train_microbatch_size: 1
precision: amp_bf16

# Logging
progress_bar: true
log_to_console: true
console_log_interval: 1ba

callbacks:
  speed_monitor:
    window_size: 10
  lr_monitor: {}
  memory_monitor: {}
  runtime_estimator: {}

loggers:
  wandb:
    project: "copylm"  # Replace with your project name
    entity: "ifajcik-brno-university-of-technology"  # Replace with your username or team name
    name: "DBGN_Q512xP400_8192"  # Optional: name of the current experiment

# Checkpoint to local filesystem or remote object store
save_interval: 100ba
autoresume: true
save_num_checkpoints_to_keep: 25  # Important, this cleans up checkpoints saved to DISK
load_weights_only: true
save_folder: /storage/brno2/home/ifajcik/code/llm-foundry25/.saved/${variables.run_name}
```
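
For reference, the tokenizer and fine-tuning dataset named in the config can be fetched with the standard `transformers` and `datasets` APIs. This is only a sketch based on the identifiers above; the dataset repository may be gated or private.

```python
from datasets import load_dataset
from transformers import AutoTokenizer

# Tokenizer named in the config; model_max_length mirrors max_seq_len (8192).
tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Llama-3.2-3B-Instruct",
    model_max_length=8192,
)

# Fine-tuning data referenced by train_loader / eval_loader.
# The repository may be gated or private; split names follow the config.
train_ds = load_dataset("mfajcik/WildChat-copyN2_llama31_L8192", split="train")
print(tokenizer)
print(train_ds)
```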
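The dataloaders reference `src.preprocessing.utils:filter_prompt_response`, which ships with the training code rather than with this card. In llm-foundry's `finetuning` loader, a `preprocessing_fn` maps a raw dataset row into a `prompt`/`response` pair; the function below is only a hypothetical illustration of that contract. The field names and the (trivial) filtering logic are assumptions, not the author's implementation.

```python
def filter_prompt_response(example: dict) -> dict:
    """Hypothetical sketch of a finetuning preprocessing_fn.

    llm-foundry expects a dict with 'prompt' and 'response' keys; the raw
    field names used here are assumptions about the dataset schema.
    """
    prompt = (example.get("prompt") or "").strip()
    response = (example.get("response") or "").strip()
    return {"prompt": prompt, "response": response}
```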
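Optimization uses DecoupledLionW at a peak learning rate of 5e-6 with a `cosine_with_warmup` schedule: 200 batches of linear warmup, then cosine decay to `alpha_f = 0.1` of the peak. Below is a minimal sketch of that shape, assuming the usual linear-warmup-then-cosine behaviour and a hypothetical total step count (the real count depends on the dataset size and the 2-epoch duration).

```python
import math

PEAK_LR = 5e-6      # optimizer.lr
T_WARMUP = 200      # scheduler.t_warmup (batches)
ALPHA_F = 0.1       # scheduler.alpha_f: final LR multiplier
TOTAL_STEPS = 2000  # hypothetical; depends on dataset size and max_duration: 2ep

def lr_at(step: int) -> float:
    """Approximate schedule: linear warmup to PEAK_LR, then cosine decay
    down to ALPHA_F * PEAK_LR at the end of training."""
    if step < T_WARMUP:
        return PEAK_LR * step / T_WARMUP
    progress = (step - T_WARMUP) / max(1, TOTAL_STEPS - T_WARMUP)
    cosine = 0.5 * (1 + math.cos(math.pi * min(progress, 1.0)))
    return PEAK_LR * (ALPHA_F + (1 - ALPHA_F) * cosine)

for s in (0, 100, 200, 1000, 2000):
    print(s, f"{lr_at(s):.2e}")
```

With `global_train_batch_size: 128` and `device_train_microbatch_size: 1`, each optimizer step accumulates `128 / world_size` microbatches per GPU (for example, 16 microbatches per GPU on 8 GPUs).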