# Open-Llama/data/preprocess_wudao.py

"""
Author: LiangSong(sl12160010@gmail.com)
Date: 2023-03-16 22:10:44
LastEditors: LiangSong(sl12160010@gmail.com)
LastEditTime: 2023-03-26 22:59:55
FilePath: /Open-Llama/data/preprocess_wudao.py
Description:
Parse the dataset from the raw files and split them into different jsonl files based on the preset maximum number of lines,
making it easy for parallel training to perform streaming reads.
Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved.
"""
import json
from glob import glob
from tqdm import tqdm
import zstandard as zstd

# Raw input files. Each one is loaded whole with json.load() and its items are
# written out one per line. The list is sorted because glob() returns matches
# in filesystem-dependent order, which would otherwise make the assignment of
# records to output shards differ from run to run.
paths = sorted(glob("data/WuDaoCorpus2.0_base_200G/part*"))
# Output shard name template; "{}" is filled with the 1-based shard index.
write_path = "data/pretrain_data/part-wudao-{}.jsonl.zst"
# Maximum number of records written to a single output shard.
lines_per_file = 16384
total_num = 0  # records written so far, across all shards
file_num = 0  # index of the shard currently open (0 = none opened yet)
wfp = None  # current zstd-compressed binary writer, opened lazily
try:
    for path in tqdm(paths, total=len(paths)):
        # Explicit encoding so decoding does not depend on the platform locale.
        with open(path, "r", encoding="utf-8") as fp:
            data = json.load(fp)
        for line in data:
            # Roll over to a new shard before the first record and then after
            # every `lines_per_file` records. Opening lazily means no empty
            # shard is created when the input yields no records at all.
            if total_num % lines_per_file == 0:
                if wfp is not None:
                    wfp.close()
                    # Reset so the `finally` clause does not close it again if
                    # opening the next shard fails.
                    wfp = None
                file_num += 1
                # Binary mode: the writer takes bytes, so records are encoded
                # explicitly below and no text encoding is passed here.
                wfp = zstd.open(write_path.format(file_num), "wb")
            wfp.write((json.dumps(line) + "\n").encode("utf-8"))
            total_num += 1
finally:
    # Always finish the current zstd frame, even if parsing or writing failed,
    # so the shard being written is not left truncated/unflushed.
    if wfp is not None:
        wfp.close()
print("total line: {}\ntotal files: {}".format(total_num, file_num))