add comment

This commit is contained in:
LiangSong 2023-05-09 16:53:05 +08:00
parent f6ac834ef9
commit 59b79af9d7
2 changed files with 4 additions and 0 deletions

View File

@@ -182,6 +182,8 @@ def construct_dataset(
assert len(data_files) > 0
all_data_files.extend(data_files)
random.shuffle(all_data_files)
# When the number of shards is evenly divisible by world_size, split_dataset_by_node partitions the data directly by shard; otherwise it reads all the data and skips part of it, which can be somewhat slower.
# https://huggingface.co/docs/datasets/package_reference/main_classes#datasets.distributed.split_dataset_by_node
if world_size is not None:
num_shards = len(all_data_files)
all_data_files = all_data_files[: num_shards // world_size * world_size]

View File

@@ -67,6 +67,8 @@ def main(argv):
model_config = AutoConfig.from_pretrained(FLAGS.model_config)
model_config.vocab_size = tokenizer.vocab_size
model_config.pad_token_id = tokenizer.pad_token_id
# Using AutoModel works correctly under deepspeed.zero.Init(), whereas instantiating a concrete class such as OpenLlamaModel directly does not, wasting a large amount of memory.
# https://github.com/huggingface/accelerate/pull/932
if config["train"]["ckpt"] is not None:
raw_model = AutoModelForCausalLM.from_pretrained(
config["train"]["ckpt"], config=model_config