data:
  mode: "pretrain"
  data:
    wudao: "data/pretrain_data/part-wudao*.jsonl.zst"
    # Since a Llama ckpt is loaded, only a small amount of English data is used
    the_pile: "data/pretrain_data/part-pile-1*.jsonl.zst"
  pad_to_max: False
  sequence_sample_mode: "none"
  concat_multiple_sequence: True
  num_sequences: 10
  seq_length: 2048
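  # Added note (assumption, not in the original file): with concat_multiple_sequence
  # enabled, num_sequences raw samples appear to be concatenated and then split into
  # chunks of seq_length tokens, which is why pad_to_max can stay False.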
  tokenizer_model_path: "configs/tokenizer_models/llama_tokenizer_extended.model"
  split_by_shard: False
train:
  train_batch_size: 2
  num_training_steps: 500000
  num_warmup_steps: 2000
  initializer_range: 1.0e-2
  lr: 2.0e-4
  weight_decay: 1.0e-1
  # Load pretrained weights; set to null to train from scratch
  ckpt: "data/llama_raw_ckpt/7B/extended.pth"
  train_num_workers: 16
  gradient_accumulation_steps: 12
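  # Added note: per-device tokens per optimizer step are
  # train_batch_size * gradient_accumulation_steps * seq_length = 2 * 12 * 2048 = 49152;
  # the global batch additionally scales with the number of data-parallel workers.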
  prefetch_factor: 100
  train_and_eval: False
  gradient_checkpointing_enable: False
  use_lora: False
  # intervals below are measured in global steps
  log_interval: 5
  eval_interval: 500
  save_interval: 1000
  work_dir: "data/saved_ckpt/7B"
  project_name: "Llama Pretrain"
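  # Added note (assumption): work_dir is where checkpoints are written every
  # save_interval steps, and project_name is presumably the experiment-tracker
  # (e.g. wandb) project name.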