From 59b79af9d7efbfb84b09c08686826855d9c7a82b Mon Sep 17 00:00:00 2001
From: LiangSong
Date: Tue, 9 May 2023 16:53:05 +0800
Subject: [PATCH] add comment

---
 dataset/dataset.py | 2 ++
 train_lm.py        | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/dataset/dataset.py b/dataset/dataset.py
index 421ee4c..b1071cc 100644
--- a/dataset/dataset.py
+++ b/dataset/dataset.py
@@ -182,6 +182,8 @@ def construct_dataset(
         assert len(data_files) > 0
         all_data_files.extend(data_files)
     random.shuffle(all_data_files)
+    # When the number of shards is divisible by world_size, split_dataset_by_node splits the data directly by shard; otherwise every rank reads all the data and skips part of it, which can be a bit slower.
+    # https://huggingface.co/docs/datasets/package_reference/main_classes#datasets.distributed.split_dataset_by_node
     if world_size is not None:
         num_shards = len(all_data_files)
         all_data_files = all_data_files[: num_shards // world_size * world_size]
diff --git a/train_lm.py b/train_lm.py
index 8648283..d1045b3 100644
--- a/train_lm.py
+++ b/train_lm.py
@@ -67,6 +67,8 @@ def main(argv):
     model_config = AutoConfig.from_pretrained(FLAGS.model_config)
     model_config.vocab_size = tokenizer.vocab_size
     model_config.pad_token_id = tokenizer.pad_token_id
+    # Using AutoModel takes effect correctly under deepspeed.zero.Init(), whereas using a concrete class such as OpenLlamaModel directly does not, wasting a large amount of memory.
+    # https://github.com/huggingface/accelerate/pull/932
     if config["train"]["ckpt"] is not None:
         raw_model = AutoModelForCausalLM.from_pretrained(
             config["train"]["ckpt"], config=model_config
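
Note (not part of the patch): a minimal sketch of the sharding behavior the first added comment describes. The shard file names and the rank/world_size values below are hypothetical; load_dataset and split_dataset_by_node are the actual datasets APIs.

from datasets import load_dataset
from datasets.distributed import split_dataset_by_node

# 8 shard files and world_size = 4: since 8 % 4 == 0, split_dataset_by_node
# can assign whole shards (2 per rank) without touching the others.
data_files = [f"data/shard_{i}.jsonl" for i in range(8)]  # hypothetical paths
dataset = load_dataset("json", data_files=data_files, split="train", streaming=True)
node_dataset = split_dataset_by_node(dataset, rank=0, world_size=4)

# With 7 shards instead, 7 % 4 != 0: each rank would iterate over the full
# stream and keep only every world_size-th example, which is slower. The
# hunk above avoids this by truncating the shuffled file list to a multiple
# of world_size before splitting.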
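Note (not part of the patch): a sketch of the instantiation difference the second added comment relies on, assuming a DeepSpeed ZeRO-3 config is already active (e.g. via transformers' HfDeepSpeedConfig or an accelerate DeepSpeed plugin). The checkpoint name is hypothetical.

from transformers import AutoConfig, AutoModelForCausalLM

config = AutoConfig.from_pretrained("openlm-research/open_llama_3b")  # hypothetical

# The Auto* entry points (from_config / from_pretrained) check
# is_deepspeed_zero3_enabled() internally and build the model inside
# deepspeed.zero.Init(), so parameters are partitioned across ranks at
# construction time.
model = AutoModelForCausalLM.from_config(config)

# By contrast, calling a concrete constructor directly, e.g.
#     from transformers import OpenLlamaModel
#     model = OpenLlamaModel(config)
# bypasses that integration: every rank first materializes the full set of
# weights, which is the wasted memory the comment refers to.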