# This file defines the SGD-related parameters for Marian training.
# This is the teacher configuration.
seed: 141414
# cost
cost-type: ce-sum
label-smoothing: 0.1
# optimizer config
optimizer: adam
learn-rate: 0.0005
lr-warmup: 4000
lr-decay-inv-sqrt: 4000
mini-batch-warmup: 4000
mini-batch-round-up: true
optimizer-params:
- 0.9
- 0.999
- 1e-08
- 0.01
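# Note: for Adam, our reading of these positional values is, in order, beta1
# (0.9), beta2 (0.999), epsilon (1e-08), and a decoupled weight-decay factor
# (0.01) when a fourth value is present; worth verifying against the Marian
# version in use.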
clip-norm: 0
dynamic-gradient-scaling:
- 2
- log
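# Note: as we read this option, an update whose gradient norm deviates by
# more than 2 sigma from the running average is rescaled back toward that
# average, with statistics kept in log space ("log"); this guards against
# occasional exploding gradients while clip-norm stays disabled (0).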
exponential-smoothing: 1e-3
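# Note: with lr-warmup and lr-decay-inv-sqrt both set to 4000, the schedule
# is (to a close approximation) the standard inverse-square-root schedule:
#   lr(t) ~ learn-rate * min(t / 4000, sqrt(4000 / t))
# e.g. lr ramps linearly to 0.0005 by update 4000, then decays to
# 0.0005 * sqrt(4000 / 16000) = 0.00025 by update 16000.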
# alignment
guided-alignment-weight: 0
# batch-size related parameters
mini-batch-fit: true
mini-batch-fit-step: 5
maxi-batch: 1000
mini-batch: 1000
mini-batch-words: 500000
max-length: 256
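# Note: with mini-batch-fit enabled, Marian sizes batches to fill the
# available workspace per sentence-length bucket (probed in steps of 5), so
# mini-batch and mini-batch-words act as upper bounds on sentences and target
# labels per update rather than fixed sizes, and maxi-batch is the number of
# batches preloaded for length-based sorting; this is our reading of the
# batching options.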
# validation-related parameters
# Note: valid-metrics is specified in code (cf. k_validMetricNames), since it is tied to the model pathname.
# Note: the decoding parameters below apply only to validation decoding; decoding parameters for deployment are configured separately.
early-stopping: 40
valid-mini-batch: 32
beam-size: 4
normalize: 1.0
word-penalty: 0.0
valid-max-length: 1000
n-best: false
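# Note: normalize: 1.0 divides each hypothesis score by length^1.0 during
# beam search (plain per-token length normalization), and word-penalty: 0.0
# leaves scores otherwise unadjusted; n-best: false keeps only the single
# best translation for validation output.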
# general parameters
logical-epoch: 1Gt
after: 40e
valid-freq: 1Gt
save-freq: 1Gt
disp-freq: 100Mt
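# Note: the suffixes are Marian's logical units: "t" counts target labels
# (1Gt = 1e9, 100Mt = 1e8), "e" counts logical epochs, and "u" counts
# parameter updates; so validation and checkpointing fire every 1e9 target
# labels, progress is displayed every 1e8 target labels, and training stops
# after 40 logical epochs (40Gt).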
disp-label-counts: true
lr-report: true
sync-sgd: true
shuffle: batches
shuffle-in-ram: true
disp-first: 10
# multi-node sharding mode, irrelevant for single-node
sharding: local
sync-freq: 200u
fp16: false
# https://machinetranslation.visualstudio.com/Marian/_git/autogen?path=/configs/trainingConfigTeacherPoloniumV2Top15.yml&version=GBmain&_a=contents
# for fp16 stability
cost-scaling:
- 256.f
- 10000
- 1.f
- 256.f
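# Note: dynamic loss scaling multiplies the loss by the current scale before
# the backward pass and rescales gradients afterwards, keeping small
# gradients representable in fp16; in our reading of the positional values,
# the scale starts at 256 and the remaining arguments control how often and
# how far it is adjusted.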
# model structure
type: transformer
# Flo generates separate vocabs, so don't tie between source and target
tied-embeddings: true
tied-embeddings-all: false
tied-embeddings-src: false
# dimensions
dim-emb: 1024
enc-depth: 6
dec-depth: 6
transformer-dim-ffn: 8192
transformer-decoder-dim-ffn: 8192
transformer-depth-scaling: true
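# Note: transformer-depth-scaling scales down weight initialization in
# deeper layers (roughly by 1/sqrt(depth)), which helps stabilize training
# of deep transformers; our reading of the option.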
lemma-dim-emb: 0
# architecture details
transformer-decoder-autoreg: self-attention
transformer-tied-layers: []
# further transformer details
transformer-ffn-activation: relu
transformer-heads: 8
transformer-postprocess-emb: d
transformer-postprocess: dan
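# Note: in the postprocess strings, "d" = dropout, "a" = residual add, and
# "n" = layer normalization; "dan" after each sublayer gives the classic
# post-norm transformer, and "d" on the embeddings applies dropout only.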
transformer-dropout: 0.1
transformer-dropout-attention: 0
transformer-dropout-ffn: 0.1
# data munging
all-caps-every: 0
english-title-case-every: 0
log-time-zone: PST8PDT
quiet-translation: true
keep-best: true
overwrite: false
interpolate-env-vars: true
log: train.log
valid-log: valid.log
valid-translation-output: valid.trg.output