# Source: joeynmt-dyu-fr-v11.0 / config.yaml
# Hosting-page metadata carried over with the file (not configuration):
#   Adeptschneider's picture
#   Upload folder using huggingface_hub
#   3bf0bee verified
# ---- Global run settings -------------------------------------------------
# Experiment name, the JoeyNMT release this file targets, and the
# directory given as model_dir.
name: "dyu_fr_transformer-sp"
joeynmt_version: "2.3.0"
model_dir: "saved_model/dyu_fr"

# Device / precision switches.
# Set use_cuda to False for CPU training.
use_cuda: True
fp16: True
# ---- Data ------------------------------------------------------------------
# Dataset paths, per-language preprocessing, and special tokens.
#
# The nesting below is restored. With every key at column 0, `data`, `src`,
# `trg`, `tokenizer_cfg` and `special_symbols` parse as null and the repeated
# keys (lang, max_length, voc_file, model_file, ...) become top-level
# duplicates, so only the last occurrence would survive.
# NOTE(review): hierarchy reconstructed from key order and the JoeyNMT 2.x
# config layout — confirm against the original file.
data:
  # train, dev and test all point at the same path.
  train: "data/dyu_fr"
  dev: "data/dyu_fr"
  test: "data/dyu_fr"
  dataset_type: "huggingface"
  dataset_cfg:
    name: "dyu-fr"
  sample_dev_subset: 1460

  # Source side (Dyula).
  src:
    lang: "dyu"
    max_length: 100
    lowercase: False
    normalize: False
    level: "bpe"
    voc_limit: 4000
    voc_min_freq: 1
    # Same vocabulary file as trg.
    voc_file: "data/dyu_fr/vocab.txt"
    tokenizer_type: "sentencepiece"
    tokenizer_cfg:
      # Same SentencePiece model as trg.
      model_file: "data/dyu_fr/sp.model"

  # Target side (French).
  trg:
    lang: "fr"
    max_length: 100
    lowercase: False
    normalize: False
    level: "bpe"
    voc_limit: 4000
    voc_min_freq: 1
    # Same vocabulary file as src.
    voc_file: "data/dyu_fr/vocab.txt"
    tokenizer_type: "sentencepiece"
    tokenizer_cfg:
      # Same SentencePiece model as src.
      model_file: "data/dyu_fr/sp.model"

  # Special tokens and their ids.
  # NOTE(review): presumably these ids must match the ones baked into
  # sp.model / vocab.txt — verify.
  special_symbols:
    unk_token: "<unk>"
    unk_id: 0
    pad_token: "<pad>"
    pad_id: 1
    bos_token: "<s>"
    bos_id: 2
    eos_token: "</s>"
    eos_id: 3
# ---- Testing / decoding ----------------------------------------------------
# The nesting below is restored. With every key at column 0, `testing` and
# `sacrebleu_cfg` parse as null, and load_model / batch_size / batch_type
# collide with the identically named training keys.
# NOTE(review): hierarchy reconstructed from the JoeyNMT 2.x config layout —
# confirm against the original file.
testing:
  # NOTE(review): this path is not under the model_dir value used elsewhere
  # in this file ("saved_model/dyu_fr") — confirm it is intentional.
  load_model: "models/best.ckpt"
  n_best: 1
  beam_size: 10
  beam_alpha: 1.2
  batch_size: 256
  batch_type: "token"
  max_output_length: 100
  eval_metrics: ["bleu"]
  # Disabled options, kept for reference:
  # return_prob: "hyp"
  # return_attention: False
  sacrebleu_cfg:
    tokenize: "13a"
# ---- Training --------------------------------------------------------------
# The nesting below is restored. With every key at column 0, `training`
# parses as null, and load_model / batch_size / batch_type collide with the
# identically named testing keys.
# NOTE(review): hierarchy reconstructed from the JoeyNMT 2.x config layout —
# confirm against the original file.
training:
  # Checkpoint to load before training.
  load_model: "joeynmt-models-v10.0/24300.ckpt"
  # Reset switches, all disabled (commented out):
  # reset_best_ckpt: False
  # reset_scheduler: False
  # reset_optimizer: False
  # reset_iter_state: False
  # NOTE(review): some JoeyNMT releases read random_seed from the top level
  # rather than from `training` — confirm for 2.3.0.
  random_seed: 42

  # Optimizer and learning-rate schedule.
  optimizer: "adamw"
  normalization: "tokens"
  adam_betas: [0.9, 0.98]
  scheduling: "warmupinversesquareroot"
  learning_rate_warmup: 8000
  learning_rate: 0.0003
  learning_rate_min: 0.00000001
  weight_decay: 0.0001

  # Loss.
  label_smoothing: 0.1
  loss: "crossentropy"

  # Batching.
  batch_size: 8192
  batch_type: "token"
  batch_multiplier: 4

  # Run length, validation and checkpointing.
  early_stopping_metric: "bleu"
  epochs: 1800
  updates: 90000
  validation_freq: 50
  logging_freq: 10
  overwrite: True
  shuffle: True
  print_valid_sents: [0, 1, 2, 3]
  keep_best_ckpts: 3
# ---- Model -----------------------------------------------------------------
# The nesting below is restored. With every key at column 0, `model`,
# `encoder`, `decoder` and `embeddings` parse as null, and type, num_layers,
# num_heads, embedding_dim, scale, dropout, hidden_size, ff_size and
# layer_norm are duplicated (dropout four times), so only the last
# occurrence would survive.
# NOTE(review): hierarchy reconstructed from the JoeyNMT 2.x config layout —
# confirm against the original file.
model:
  # Parameter initialization.
  initializer: "xavier_uniform"
  bias_initializer: "zeros"
  init_gain: 1.0
  embed_initializer: "xavier_uniform"
  embed_init_gain: 1.0

  # Weight tying.
  tied_embeddings: True
  tied_softmax: True

  encoder:
    type: "transformer"
    num_layers: 6
    num_heads: 4
    embeddings:
      embedding_dim: 256
      scale: True
      dropout: 0.1
    # typically ff_size = 4 x hidden_size
    hidden_size: 256
    ff_size: 1024
    dropout: 0.2
    layer_norm: "pre"

  # Same values as the encoder.
  decoder:
    type: "transformer"
    num_layers: 6
    num_heads: 4
    embeddings:
      embedding_dim: 256
      scale: True
      dropout: 0.1
    # typically ff_size = 4 x hidden_size
    hidden_size: 256
    ff_size: 1024
    dropout: 0.2
    layer_norm: "pre"