set dataset shuffle seed to 42
This commit is contained in:
parent
c2184c6dd1
commit
154456c976
|
|
@ -2,7 +2,7 @@
|
||||||
Author: LiangSong(sl12160010@gmail.com)
|
Author: LiangSong(sl12160010@gmail.com)
|
||||||
Date: 2023-04-24 20:05:21
|
Date: 2023-04-24 20:05:21
|
||||||
LastEditors: LiangSong(sl12160010@gmail.com)
|
LastEditors: LiangSong(sl12160010@gmail.com)
|
||||||
LastEditTime: 2023-04-27 22:19:37
|
LastEditTime: 2023-05-03 10:23:41
|
||||||
FilePath: /Open-Llama/dataset/dataset.py
|
FilePath: /Open-Llama/dataset/dataset.py
|
||||||
Description:
|
Description:
|
||||||
|
|
||||||
|
|
@ -162,7 +162,7 @@ def construct_dataset(dataset_config, tokenizer, return_raw_text=False):
|
||||||
"json", data_files=all_data_files, split="train", streaming=True
|
"json", data_files=all_data_files, split="train", streaming=True
|
||||||
)
|
)
|
||||||
# shuffle
|
# shuffle
|
||||||
dataset = dataset.shuffle()
|
dataset = dataset.shuffle(seed=42)
|
||||||
# 文本预处理转换为统一格式
|
# 文本预处理转换为统一格式
|
||||||
if dataset_config["mode"] == "pretrain":
|
if dataset_config["mode"] == "pretrain":
|
||||||
dataset = dataset.map(pretrain_transform, batched=True, batch_size=1)
|
dataset = dataset.map(pretrain_transform, batched=True, batch_size=1)
|
||||||
|
|
@ -244,7 +244,7 @@ def construct_dataset(dataset_config, tokenizer, return_raw_text=False):
|
||||||
full_dataset = full_dataset.map(get_labels_gen(tokenizer.pad_token_id))
|
full_dataset = full_dataset.map(get_labels_gen(tokenizer.pad_token_id))
|
||||||
|
|
||||||
# shuffle
|
# shuffle
|
||||||
full_dataset = full_dataset.shuffle()
|
full_dataset = full_dataset.shuffle(seed=42)
|
||||||
return full_dataset
|
return full_dataset
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue
Block a user