35 lines
1.2 KiB
Python
35 lines
1.2 KiB
Python
"""
Author: LiangSong(sl12160010@gmail.com)
Date: 2023-03-16 22:10:44
LastEditors: LiangSong(sl12160010@gmail.com)
LastEditTime: 2023-03-26 22:59:55
FilePath: /Open-Llama/data/preprocess_wudao.py
Description:
Parse the dataset from the raw files and split it into multiple jsonl files, each capped at a preset maximum number of lines,
making it easy for parallel training to perform streaming reads.
Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved.
"""
|
|
import json
|
|
from glob import glob
|
|
from tqdm import tqdm
|
|
import zstandard as zstd
|
|
|
|
# Maximum number of JSON records written to each output shard.
LINES_PER_FILE = 16384

# Sorted so that shard contents are reproducible across runs; glob() itself
# returns paths in arbitrary, filesystem-dependent order.
paths = sorted(glob("data/WuDaoCorpus2.0_base_200G/part*"))
write_path = "data/pretrain_data/part-wudao-{}.jsonl.zst"
total_num = 0  # records written so far, across all shards
file_num = 1  # 1-based index of the shard currently being written

# Binary mode: every record is encoded to UTF-8 explicitly before writing,
# so no text-layer encoding argument is needed on the compressed stream.
wfp = zstd.open(write_path.format(file_num), "wb")
try:
    for path in tqdm(paths, total=len(paths)):
        # Read with an explicit encoding instead of the platform default.
        # NOTE(review): assumes the raw corpus files are UTF-8 JSON - confirm.
        with open(path, "r", encoding="utf-8") as fp:
            data = json.load(fp)
        for line in data:
            # Roll over *before* writing the next record, so a total that is
            # an exact multiple of LINES_PER_FILE leaves no empty last shard.
            if total_num % LINES_PER_FILE == 0 and total_num > 0:
                file_num += 1
                wfp.close()
                wfp = zstd.open(write_path.format(file_num), "wb")
            # One JSON document per line (jsonl).
            wfp.write((json.dumps(line) + "\n").encode("utf-8"))
            total_num += 1
finally:
    # Always finalize the current shard, even if reading or parsing an input
    # file fails midway, so the handle is not leaked.
    wfp.close()

print("total line: {}\ntotal files: {}".format(total_num, file_num))
|