model: osum_echat
# llm_path
llm_path: &llm_path "Qwen/Qwen2.5-3B-Instruct"
#
# model config
downsample_rate: 4  # 1 2 4 8
adapter_type: osum_echat
if_instruct: true
input_dim: 80
# tokenizer ,gxl
tokenizer: huggingface
tokenizer_conf:
  llm_path: *llm_path
# lora config
use_lora: false
lora_alpha: 32
lora_rank: 64  # 3B -> 85M
lora_dropout: 0.1
# speech generate config
speech_token_num: &token_num 4097  # 4097
# Configuration of parameters for training
fire_module: link_and_encoder_and_lora  # link encoder llm link_and_encoder link_and_encoder_and_lora, llm需要配合use_lora为true
# other config
grad_clip: 5
accum_grad: 8
log_interval: 10
save_interval: 1250  # 1250 # 2500
max_epoch: 1
init_step: true
# training config
optim: adamw
optim_conf:
  betas:
    - 0.9
    - 0.99
  eps: 1.0e-06
  lr: 1.0e-06
  weight_decay: 0.01
scheduler: warmuplr
scheduler_conf:
  warmup_steps: 2000
dataset: asr
dataset_conf:
  speech_token_num: *token_num
  batch_conf:
    batch_size: 26
    batch_type: dynamic
    max_frames_in_batch: 28000000  # 3000 # 9000 # 3000 # 3300 # 3900
    max_seq_in_batch: 3700  # 1500 # 4000 # 1100 # 1600 # 1900
  feats_type: log_mel_spectrogram
  filter_conf:
    max_length: 20000
    min_length: 20
    token_max_length: 1200
    token_min_length: 1
    filter_no_extra_info: true  # 如果没有task lang 等信息,直接过滤掉, 适用于通用多任务训练, 推理时应该关掉
    max_seq_len: 2000  # 1100 # 1000
    other_filter_conf:
      only_s2s: false  # 只针对与s2s dataloader的过滤
      only_s2t: false  # 只针对与s2t dataloader的过滤
      only_t2t: false  # 只针对与t2t dataloader的过滤
      only_t2s: false  # 只针对与t2s dataloader的过滤
  language_conf:
    limited_langs:
      - zh
  log_mel_spectrogram_conf:
    hop_length: 160
    n_fft: 400
    num_mel_bins: 80
    padding: 0
  resample_conf:
    resample_rate: 16000
  shuffle: true
  shuffle_conf:
    shuffle_size: 1500
  sort: true
  sort_conf:
    sort_size: 500
  spec_aug: true
  spec_aug_conf:
    max_f: 10
    max_t: 50
    num_f_mask: 2
    num_t_mask: 2
  spec_sub: true
  spec_sub_conf:
    max_t: 30
    num_t_sub: 3
  spec_trim: false
  speed_perturb: false
  eod_id: 151645
  split_num: 1
  multi_num: 2
  prompt_conf_path: conf/prompt_config.yaml
  data_recover: false
  # NOTE(review): indentation reconstructed from a flattened source — data_recover_conf and
  # other_tokenze_conf are assumed to live under dataset_conf (pairing with data_recover above);
  # verify nesting against the dataset loader.
  data_recover_conf:
    start_idx: 0  # 删除前面start_idx个item(tar包)
  other_tokenze_conf:  # 一些对数据额外操作的可控按钮,这些操作一般来说再test时都得为false
    only_info:
      only_s2s: false  # 只针对与s2s dataloader的过滤
      only_s2t: false  # 只针对与s2t dataloader的过滤
      only_t2t: false  # 只针对与t2t dataloader的过滤
      only_t2s: false  # 只针对与t2s dataloader的过滤
      use_50_per_change_if_only_X: true  # 50%的句子随机替换为其only X
    use_s2s_streaming_random:
      enable: false
      rate: 0.5  # 1.0 表示100%的句子随机替换为其only X
    natural_language_convert:
      enable: false
      rate: 0.00  # 1.0 表示100%的转换成自然语言模式
    use_s2s_convert_s2t:
      enable: false  # 单独为s2t dataloader 开启s2s convert
      rate: 1.0  # 1.0 表示100%的句子随机替换为其only X
    use_streaming_tts:
      enable: false
      rate: 0.5  # 1.0 表示100%的句子随机替换为其only X
    use_think_mode:
      enable: false  # 开启think 模式, 即随机替换为think模式的句子
      rate: 0.8
    other_filter_conf:
      fiter_txt_is_None: true  # 过滤掉text is ""的语音数据,适配由于gender数据部分含有标签而设计。但仅train起作用

# model config for encoder
encoder: transformer
encoder_conf:
  activation_type: gelu
  attention_dropout_rate: 0.0
  attention_heads: 16
  dropout_rate: 0.1
  gradient_checkpointing: true
  input_layer: conv1d2
  key_bias: false
  linear_units: 4096
  normalize_before: true
  num_blocks: 24
  output_size: 1024
  pos_enc_layer_type: abs_pos_whisper
  positional_dropout_rate: 0.1
  static_chunk_size: -1
  use_dynamic_chunk: false
  use_dynamic_left_chunk: false