33 lines
1.2 KiB
Python
33 lines
1.2 KiB
Python
"""
|
|
Author: LiangSong(sl12160010@gmail.com)
|
|
Date: 2023-03-16 22:35:38
|
|
LastEditors: LiangSong(sl12160010@gmail.com)
|
|
LastEditTime: 2023-03-26 22:59:38
|
|
FilePath: /Open-Llama/data/preprocess_the_pile.py
|
|
Description:
|
|
Parse the dataset from the raw files and split them into different jsonl files based on the preset maximum number of lines,
|
|
making it easy for parallel training to perform streaming reads.
|
|
Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved.
|
|
"""
|
|
import json
|
|
from glob import glob
|
|
from tqdm import tqdm
|
|
import zstandard as zstd
|
|
|
|
paths = glob("data/the_pile/*.jsonl.zst")
|
|
write_path = "data/pretrain_data/part-pile-{}.jsonl.zst"
|
|
total_num = 0
|
|
file_num = 1
|
|
wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
|
|
for path in tqdm(paths, total=len(paths)):
|
|
with zstd.open(path, "r", encoding="utf-8") as fp:
|
|
for line in fp:
|
|
if total_num % 16384 == 0 and total_num > 0:
|
|
file_num += 1
|
|
wfp.close()
|
|
wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
|
|
wfp.write(line.encode("utf-8"))
|
|
total_num += 1
|
|
wfp.close()
|
|
print("total line: {}\ntotal files: {}".format(total_num, file_num))
|