"""
|
|
Author: LiangSong(sl12160010@gmail.com)
|
|
Date: 2023-03-30 20:52:10
|
|
LastEditors: LiangSong(sl12160010@gmail.com)
|
|
LastEditTime: 2023-05-04 08:32:04
|
|
FilePath: /Open-Llama/data/preprocess_instruction.py
|
|
Description:
|
|
|
|
Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved.
|
|
"""
import json

from tqdm import tqdm
import zstandard as zstd
from datasets import load_dataset


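# Relative paths below assume the script is run from the Open-Llama repository root.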
root_dir = "data"
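# Each shard is written to data/instruction_data/part-{local_name}-{file_num}.jsonl.zst.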
write_path = "data/instruction_data/part-{}-{}.jsonl.zst"

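# Map Hugging Face dataset names (or (name, config_name) tuples) to local shard name prefixes.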
dataset_map = {
    "yizhongw/self_instruct": "self_instruct",
    "BelleGroup/train_0.5M_CN": "belle_0.5M",
    "BelleGroup/train_1M_CN": "belle_1M",
    "BelleGroup/train_2M_CN": "belle_2M",
    "BelleGroup/school_math_0.25M": "belle_school_math_0.25M",
    "BelleGroup/multiturn_chat_0.8M": "belle_multiturn_chat_0.8M",
    "Graverman/Instruct-to-Code": "instruct_to_code",
    "qwedsacf/grade-school-math-instructions": "grade_school_math",
    "camel-ai/math": "camel_ai_math",
    "camel-ai/physics": "camel_ai_physics",
    "camel-ai/chemistry": "camel_ai_chemistry",
    "camel-ai/biology": "camel_ai_biology",
    ("bigscience/xP3mt", "code"): "xP3mt_code",
    ("bigscience/xP3mt", "zh"): "xP3mt_zh",
}


def process_hf_dataset(name, local_name):
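    """Dump the train split of a Hugging Face dataset into sharded jsonl.zst files."""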
    if isinstance(name, str):
        dataset = load_dataset(name)
    else:
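        # Tuple keys carry a dataset config name, e.g. ("bigscience/xP3mt", "zh").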
        dataset = load_dataset(*name)
    total_num = 0
    file_num = 1
    wfp = zstd.open(write_path.format(local_name, file_num), "wb", encoding="utf-8")
    for line in tqdm(dataset["train"]):
        line = json.dumps(line)
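        # Rotate to a new shard every 1024 records.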
        if total_num % 1024 == 0 and total_num > 0:
            file_num += 1
            wfp.close()
            wfp = zstd.open(
                write_path.format(local_name, file_num), "wb", encoding="utf-8"
            )
        wfp.write(line.encode("utf-8"))
        wfp.write(b"\n")
        total_num += 1
    wfp.close()
    print(
        "{} preprocess done. Total line: {}, Total file: {}".format(
            name, total_num, file_num
        )
    )


for k, v in dataset_map.items():
    process_hf_dataset(k, v)

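# ShareGPT (anon8231489123/ShareGPT_Vicuna_unfiltered) is shipped as two local json
# files, so it is sharded with the same logic outside of process_hf_dataset.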
local_name = "sharegpt_90K"
total_num = 0
file_num = 1
wfp = zstd.open(write_path.format(local_name, file_num), "wb", encoding="utf-8")
with open("{}/sg_90k_part1_html_cleaned.json".format(root_dir), "r") as fp:
    data1 = json.load(fp)
with open("{}/sg_90k_part2_html_cleaned.json".format(root_dir), "r") as fp:
    data2 = json.load(fp)
data = data1 + data2
for line in tqdm(data):
    line = json.dumps(line)
    if total_num % 1024 == 0 and total_num > 0:
        file_num += 1
        wfp.close()
        wfp = zstd.open(write_path.format(local_name, file_num), "wb", encoding="utf-8")
    wfp.write(line.encode("utf-8"))
    wfp.write(b"\n")
    total_num += 1
wfp.close()
print(
    "anon8231489123/ShareGPT_Vicuna_unfiltered preprocess done. Total line: {}, Total file: {}".format(
        total_num, file_num
    )
)