Open-Llama/data/preprocess_the_pile.py

"""
Author: LiangSong(sl12160010@gmail.com)
Date: 2023-03-16 22:35:38
LastEditors: LiangSong(sl12160010@gmail.com)
LastEditTime: 2023-03-26 22:59:38
FilePath: /Open-Llama/data/preprocess_the_pile.py
Description:
Parse the dataset from the raw files and split them into different jsonl files based on the preset maximum number of lines,
making it easy for parallel training to perform streaming reads.
Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved.
"""
import json
from glob import glob
from tqdm import tqdm
import zstandard as zstd

paths = glob("data/the_pile/*.jsonl.zst")
write_path = "data/pretrain_data/part-pile-{}.jsonl.zst"
total_num = 0
file_num = 1
wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
for path in tqdm(paths, total=len(paths)):
    with zstd.open(path, "r", encoding="utf-8") as fp:
        for line in fp:
            if total_num % 16384 == 0 and total_num > 0:
                file_num += 1
                wfp.close()
                wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
            wfp.write(line.encode("utf-8"))
            total_num += 1
wfp.close()
print("total line: {}\ntotal files: {}".format(total_num, file_num))