data:
  mode: "pretrain"
  data:
    wudao: "data/pretrain_data/part-wudao*.jsonl.zst"
    # Since a Llama ckpt is loaded, only a small amount of English data is used
    the_pile: "data/pretrain_data/part-pile-1*.jsonl.zst"
  pad_to_max: False
  sequence_sample_mode: "none"
  concat_multiple_sequence: True
  num_sequences: 10
  seq_length: 2048
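  # Added note (assumption, not in the original file): with concat_multiple_sequence
  # enabled, num_sequences raw samples appear to be concatenated and then split into
  # chunks of seq_length tokens, which is why pad_to_max can stay False.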
  tokenizer_model_path: "configs/tokenizer_models/llama_tokenizer_extended.model"
  split_by_shard: False
train:
  train_batch_size: 2
  num_training_steps: 500000
  num_warmup_steps: 2000
  initializer_range: 1.0e-2
  lr: 2.0e-4
  weight_decay: 1.0e-1
  # Load pretrained weights; set to null to train from scratch
  ckpt: "data/llama_raw_ckpt/7B/extended.pth"
  train_num_workers: 16
  gradient_accumulation_steps: 12
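  # Added note: per-device tokens per optimizer step are
  # train_batch_size * gradient_accumulation_steps * seq_length = 2 * 12 * 2048 = 49152;
  # the global batch additionally scales with the number of data-parallel workers.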
  prefetch_factor: 100
  train_and_eval: False
  gradient_checkpointing_enable: False
  use_lora: False
  # intervals below are measured in global steps
  log_interval: 5
  eval_interval: 500
  save_interval: 1000
  work_dir: "data/saved_ckpt/7B"
  project_name: "Llama Pretrain"
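  # Added note (assumption): work_dir is where checkpoints are written every
  # save_interval steps, and project_name is presumably the experiment-tracker
  # (e.g. wandb) project name.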