Open-Llama/data/preprocess_instruction.py
2023-05-17 22:21:46 +07:00

91 lines
2.8 KiB
Python

"""
Author: s-JoL(sl12160010@gmail.com)
Date: 2023-03-30 20:52:10
LastEditors: s-JoL(sl12160010@gmail.com)
LastEditTime: 2023-05-04 08:32:04
FilePath: /Open-Llama/data/preprocess_instruction.py
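Description: Loads instruction datasets from the Hugging Face Hub, plus two local ShareGPT JSON dumps, and re-shards each one into zstd-compressed JSONL files.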
Copyright (c) 2023 by s-JoL(sl12160010@gmail.com), All Rights Reserved.
"""
import json
from tqdm import tqdm
import zstandard as zstd
from datasets import load_dataset
# Directory the local ShareGPT JSON dumps are read from.
root_dir = "data"

# Output file template; the two placeholders are filled, in order, with the
# dataset's local name and the 1-based shard index.
write_path = "data/instruction_data/part-{}-{}.jsonl.zst"

# Maps what is handed to `load_dataset` onto the local name used in output
# file names. A key is either a plain dataset name or a tuple that is
# unpacked into `load_dataset` as positional arguments. Insertion order is
# the order in which the datasets are processed.
dataset_map = {
    # Self-instruct
    "yizhongw/self_instruct": "self_instruct",
    # BELLE
    "BelleGroup/train_0.5M_CN": "belle_0.5M",
    "BelleGroup/train_1M_CN": "belle_1M",
    "BelleGroup/train_2M_CN": "belle_2M",
    "BelleGroup/school_math_0.25M": "belle_school_math_0.25M",
    "BelleGroup/multiturn_chat_0.8M": "belle_multiturn_chat_0.8M",
    # Code and grade-school math
    "Graverman/Instruct-to-Code": "instruct_to_code",
    "qwedsacf/grade-school-math-instructions": "grade_school_math",
    # CAMEL-AI
    "camel-ai/math": "camel_ai_math",
    "camel-ai/physics": "camel_ai_physics",
    "camel-ai/chemistry": "camel_ai_chemistry",
    "camel-ai/biology": "camel_ai_biology",
    # xP3mt subsets, selected by the second tuple element
    ("bigscience/xP3mt", "code"): "xP3mt_code",
    ("bigscience/xP3mt", "zh"): "xP3mt_zh",
}
def _open_shard(local_name, file_num):
    """Open the zstd writer for shard ``file_num`` of ``local_name``."""
    import os

    path = write_path.format(local_name, file_num)
    out_dir = os.path.dirname(path)
    # Create the output directory on demand so a fresh checkout does not fail
    # on the very first shard.
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    return zstd.open(path, "wb", encoding="utf-8")


def process_hf_dataset(name, local_name, lines_per_file=1024):
    """Load a Hugging Face dataset and re-shard its train split to disk.

    Every record of ``dataset["train"]`` is serialised with ``json.dumps`` and
    written as one line to zstd-compressed files whose names come from
    ``write_path`` formatted with ``local_name`` and a 1-based shard index.
    A new shard is started after every ``lines_per_file`` records. The first
    shard is always created, even when the split yields no records.

    Args:
        name: Either a dataset name string, or a sequence that is unpacked
            into ``load_dataset`` as positional arguments.
        local_name: Label substituted into the output file names.
        lines_per_file: Maximum number of records written to one shard.

    Raises:
        ValueError: If ``lines_per_file`` is smaller than 1.
    """
    if lines_per_file < 1:
        raise ValueError(
            "lines_per_file must be >= 1, got {!r}".format(lines_per_file)
        )

    if isinstance(name, str):
        dataset = load_dataset(name)
    else:
        dataset = load_dataset(*name)

    total_num = 0
    file_num = 1
    wfp = _open_shard(local_name, file_num)
    try:
        for line in tqdm(dataset["train"]):
            line = json.dumps(line)
            if total_num % lines_per_file == 0 and total_num > 0:
                # Current shard is full: rotate to the next one. The handle is
                # cleared between close and reopen so the ``finally`` clause
                # never closes the same writer twice if the reopen fails.
                wfp.close()
                wfp = None
                file_num += 1
                wfp = _open_shard(local_name, file_num)
            wfp.write(line.encode("utf-8"))
            wfp.write(b"\n")
            total_num += 1
    finally:
        # Close the active shard even when serialisation or a write fails, so
        # the writer is not leaked.
        if wfp is not None:
            wfp.close()
    print(
        "{} preprocess done. Total line: {}, Total file: {}".format(
            name, total_num, file_num
        )
    )
# Convert every dataset registered in dataset_map, in declaration order.
for hf_name in dataset_map:
    process_hf_dataset(hf_name, dataset_map[hf_name])
# ShareGPT is not fetched through `load_dataset`: it is read from two local
# JSON dumps under `root_dir` and written out in shards of 1024 records.
local_name = "sharegpt_90K"
total_num = 0
file_num = 1

# Load both parts before opening any output, so a missing or malformed input
# file does not leave an empty shard and an unclosed writer behind. The
# encoding is explicit so decoding does not depend on the platform locale
# (assumes the dumps are UTF-8 JSON - TODO confirm).
with open(
    "{}/sg_90k_part1_html_cleaned.json".format(root_dir), "r", encoding="utf-8"
) as fp:
    data1 = json.load(fp)
with open(
    "{}/sg_90k_part2_html_cleaned.json".format(root_dir), "r", encoding="utf-8"
) as fp:
    data2 = json.load(fp)
# Presumably both dumps are JSON arrays, so `+` concatenates two lists.
data = data1 + data2

wfp = zstd.open(write_path.format(local_name, file_num), "wb", encoding="utf-8")
try:
    for line in tqdm(data):
        line = json.dumps(line)
        if total_num % 1024 == 0 and total_num > 0:
            # Current shard is full: rotate to the next one. The handle is
            # cleared between close and reopen so the `finally` clause never
            # closes the same writer twice if the reopen fails.
            wfp.close()
            wfp = None
            file_num += 1
            wfp = zstd.open(
                write_path.format(local_name, file_num), "wb", encoding="utf-8"
            )
        wfp.write(line.encode("utf-8"))
        wfp.write(b"\n")
        total_num += 1
finally:
    # Close the active shard even if serialisation or a write fails.
    if wfp is not None:
        wfp.close()
print(
    "anon8231489123/ShareGPT_Vicuna_unfiltered preprocess done. Total line: {}, Total file: {}".format(
        total_num, file_num
    )
)