Open-Llama/configs/pretrain_config.yaml

data:
  mode: "pretrain"
  data:
    wudao: "data/pretrain_data/part-wudao*.jsonl.zst"
    # Since the Llama checkpoint is loaded, only a small amount of English data is used
    the_pile: "data/pretrain_data/part-pile-1*.jsonl.zst"
  pad_to_max: False
  sequence_sample_mode: "none"
  concat_multiple_sequence: True
  num_sequences: 10
  seq_length: 2048
  tokenizer_model_path: "configs/tokenizer_models/llama_tokenizer_extended.model"
  split_by_shard: False
train:
  train_batch_size: 2
  num_training_steps: 500000
  num_warmup_steps: 2000
  initializer_range: 1.0e-2
  lr: 2.0e-4
  weight_decay: 1.0e-1
  # Loads pretrained weights; set to null to train from scratch
  ckpt: "data/llama_raw_ckpt/7B/extended.pth"
  train_num_workers: 16
  gradient_accumulation_steps: 12
  prefetch_factor: 100
  train_and_eval: False
  gradient_checkpointing_enable: True
  # intervals below are measured in global steps
  log_interval: 5
  eval_interval: 500
  save_interval: 1000
  work_dir: "data/saved_ckpt/7B"
  project_name: "Llama Pretrain"
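
For reference, a minimal sketch of how a config like this could be consumed with PyYAML. The field names match the file above, but the loader itself is a hypothetical illustration, not code from the Open-Llama trainer.

# Minimal sketch: load the config and derive the per-worker token budget.
# Hypothetical loader for illustration, not the Open-Llama training code.
import yaml

with open("configs/pretrain_config.yaml") as f:
    config = yaml.safe_load(f)

data_config = config["data"]
train_config = config["train"]

# Tokens consumed per optimizer step on a single worker:
# micro-batch size x gradient accumulation steps x sequence length
# = 2 x 12 x 2048 = 49152 with the values above.
tokens_per_step = (
    train_config["train_batch_size"]
    * train_config["gradient_accumulation_steps"]
    * data_config["seq_length"]
)
print(f"sequence length: {data_config['seq_length']}")
print(f"tokens per optimizer step (per worker): {tokens_per_step}")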