model: osum_echat
# llm_path
llm_path: &llm_path "Qwen/Qwen2.5-3B-Instruct"
#
# model config
downsample_rate: 4  # 1 2 4 8
adapter_type: osum_echat
if_instruct: true
input_dim: 80
# tokenizer ,gxl
tokenizer: huggingface
tokenizer_conf:
  llm_path: *llm_path
# lora config
use_lora: false
lora_alpha: 32
lora_rank: 64  # 3B -> 85M
lora_dropout: 0.1
# speech generate config
speech_token_num: &token_num 4097  # 4097
# Configuration of parameters for training
fire_module: link_and_encoder_and_lora  # link encoder llm link_and_encoder link_and_encoder_and_lora, llm需要配合use_lora为true
# other config
grad_clip: 5
accum_grad: 8
log_interval: 10
save_interval: 1250  # 1250 # 2500
max_epoch: 1
init_step: true
# training config
optim: adamw
optim_conf:
  betas:
    - 0.9
    - 0.99
  eps: 1.0e-06
  lr: 1.0e-06
  weight_decay: 0.01
scheduler: warmuplr
scheduler_conf:
  warmup_steps: 2000
dataset: asr
dataset_conf:
  speech_token_num: *token_num
  batch_conf:
    batch_size: 26
    batch_type: dynamic
    max_frames_in_batch: 28000000  # 3000 # 9000 # 3000 # 3300 # 3900
    max_seq_in_batch: 3700  # 1500 # 4000 # 1100 # 1600 # 1900
  feats_type: log_mel_spectrogram
  filter_conf:
    max_length: 20000
    min_length: 20
    token_max_length: 1200
    token_min_length: 1
    filter_no_extra_info: true  # 如果没有task lang 等信息,直接过滤掉, 适用于通用多任务训练, 推理时应该关掉
    max_seq_len: 2000  # 1100 # 1000
    other_filter_conf:
      only_s2s: false  # 只针对与s2s dataloader的过滤
      only_s2t: false  # 只针对与s2t dataloader的过滤
      only_t2t: false  # 只针对与t2t dataloader的过滤
      only_t2s: false  # 只针对与t2s dataloader的过滤
  language_conf:
    limited_langs:
      - zh
  log_mel_spectrogram_conf:
    hop_length: 160
    n_fft: 400
    num_mel_bins: 80
    padding: 0
  resample_conf:
    resample_rate: 16000
  shuffle: true
  shuffle_conf:
    shuffle_size: 1500
  sort: true
  sort_conf:
    sort_size: 500
  spec_aug: true
  spec_aug_conf:
    max_f: 10
    max_t: 50
    num_f_mask: 2
    num_t_mask: 2
  spec_sub: true
  spec_sub_conf:
    max_t: 30
    num_t_sub: 3
  spec_trim: false
  speed_perturb: false
  eod_id: 151645
  split_num: 1
  multi_num: 2
  prompt_conf_path: conf/prompt_config.yaml
  data_recover: false
  # NOTE(review): indentation reconstructed from a flattened source — data_recover_conf and
  # other_tokenze_conf are assumed to live under dataset_conf (pairing with data_recover above);
  # verify nesting against the dataset loader.
  data_recover_conf:
    start_idx: 0  # 删除前面start_idx个item(tar包)
  other_tokenze_conf:  # 一些对数据额外操作的可控按钮,这些操作一般来说再test时都得为false
    only_info:
      only_s2s: false  # 只针对与s2s dataloader的过滤
      only_s2t: false  # 只针对与s2t dataloader的过滤
      only_t2t: false  # 只针对与t2t dataloader的过滤
      only_t2s: false  # 只针对与t2s dataloader的过滤
      use_50_per_change_if_only_X: true  # 50%的句子随机替换为其only X
    use_s2s_streaming_random:
      enable: false
      rate: 0.5  # 1.0 表示100%的句子随机替换为其only X
    natural_language_convert:
      enable: false
      rate: 0.00  # 1.0 表示100%的转换成自然语言模式
    use_s2s_convert_s2t:
      enable: false  # 单独为s2t dataloader 开启s2s convert
      rate: 1.0  # 1.0 表示100%的句子随机替换为其only X
    use_streaming_tts:
      enable: false
      rate: 0.5  # 1.0 表示100%的句子随机替换为其only X
    use_think_mode:
      enable: false  # 开启think 模式, 即随机替换为think模式的句子
      rate: 0.8
    other_filter_conf:
      fiter_txt_is_None: true  # 过滤掉text is ""的语音数据,适配由于gender数据部分含有标签而设计。但仅train起作用

# model config for encoder
encoder: transformer
encoder_conf:
  activation_type: gelu
  attention_dropout_rate: 0.0
  attention_heads: 16
  dropout_rate: 0.1
  gradient_checkpointing: true
  input_layer: conv1d2
  key_bias: false
  linear_units: 4096
  normalize_before: true
  num_blocks: 24
  output_size: 1024
  pos_enc_layer_type: abs_pos_whisper
  positional_dropout_rate: 0.1
  static_chunk_size: -1
  use_dynamic_chunk: false
  use_dynamic_left_chunk: false