2023-03-30 15:43:12 +00:00
|
|
|
"""
|
|
|
|
Author: LiangSong(sl12160010@gmail.com)
|
|
|
|
Date: 2023-03-30 20:52:10
|
|
|
|
LastEditors: LiangSong(sl12160010@gmail.com)
|
2023-05-04 00:34:38 +00:00
|
|
|
LastEditTime: 2023-05-04 08:32:04
|
2023-03-30 15:43:12 +00:00
|
|
|
FilePath: /Open-Llama/data/preprocess_instruction.py
|
|
|
|
Description:
|
|
|
|
|
|
|
|
Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved.
|
|
|
|
"""
|
|
|
|
import json
|
2023-05-05 09:05:41 +00:00
|
|
|
from tqdm import tqdm
|
2023-03-30 15:43:12 +00:00
|
|
|
import zstandard as zstd
|
|
|
|
from datasets import load_dataset
|
|
|
|
|
2023-04-05 15:51:56 +00:00
|
|
|
|
|
|
|
root_dir = "data"
|
|
|
|
|
2023-03-30 15:43:12 +00:00
|
|
|
dataset = load_dataset("yizhongw/self_instruct")
|
2023-04-05 15:51:56 +00:00
|
|
|
write_path = root_dir + "/instruction_data/part-self_instruct-{}.jsonl.zst"
|
2023-03-30 15:43:12 +00:00
|
|
|
total_num = 0
|
2023-04-24 11:13:53 +00:00
|
|
|
file_num = 1
|
2023-03-30 15:43:12 +00:00
|
|
|
wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
|
2023-05-05 09:05:41 +00:00
|
|
|
for line in tqdm(dataset["train"]):
|
2023-03-30 15:43:12 +00:00
|
|
|
line = json.dumps(line)
|
|
|
|
if total_num % 1024 == 0 and total_num > 0:
|
|
|
|
file_num += 1
|
|
|
|
wfp.close()
|
|
|
|
wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
|
|
|
|
wfp.write(line.encode("utf-8"))
|
|
|
|
wfp.write(b"\n")
|
|
|
|
total_num += 1
|
|
|
|
wfp.close()
|
2023-04-05 15:51:56 +00:00
|
|
|
print(
|
|
|
|
"yizhongw/self_instruct preprocess done. Total line: {}, Total file: {}".format(
|
|
|
|
total_num, file_num
|
|
|
|
)
|
|
|
|
)
|
2023-03-30 15:43:12 +00:00
|
|
|
|
2023-04-05 15:51:56 +00:00
|
|
|
dataset = load_dataset("BelleGroup/train_0.5M_CN")
|
|
|
|
write_path = root_dir + "/instruction_data/part-belle_0.5M-{}.jsonl.zst"
|
2023-03-30 15:43:12 +00:00
|
|
|
total_num = 0
|
2023-04-24 11:13:53 +00:00
|
|
|
file_num = 1
|
2023-03-30 15:43:12 +00:00
|
|
|
wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
|
2023-05-05 09:05:41 +00:00
|
|
|
for line in tqdm(dataset["train"]):
|
2023-03-30 15:43:12 +00:00
|
|
|
line = json.dumps(line)
|
|
|
|
if total_num % 1024 == 0 and total_num > 0:
|
|
|
|
file_num += 1
|
|
|
|
wfp.close()
|
|
|
|
wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
|
|
|
|
wfp.write(line.encode("utf-8"))
|
|
|
|
wfp.write(b"\n")
|
|
|
|
total_num += 1
|
|
|
|
wfp.close()
|
2023-04-05 15:51:56 +00:00
|
|
|
print(
|
|
|
|
"BelleGroup/train_0.5M_CN preprocess done. Total line: {}, Total file: {}".format(
|
|
|
|
total_num, file_num
|
|
|
|
)
|
|
|
|
)
|
2023-03-30 15:43:12 +00:00
|
|
|
|
2023-04-05 15:51:56 +00:00
|
|
|
dataset = load_dataset("BelleGroup/train_1M_CN")
|
|
|
|
write_path = root_dir + "/instruction_data/part-belle_1M-{}.jsonl.zst"
|
2023-03-30 15:43:12 +00:00
|
|
|
total_num = 0
|
2023-04-24 11:13:53 +00:00
|
|
|
file_num = 1
|
2023-03-30 15:43:12 +00:00
|
|
|
wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
|
2023-05-05 09:05:41 +00:00
|
|
|
for line in tqdm(dataset["train"]):
|
2023-03-30 15:43:12 +00:00
|
|
|
line = json.dumps(line)
|
|
|
|
if total_num % 1024 == 0 and total_num > 0:
|
|
|
|
file_num += 1
|
|
|
|
wfp.close()
|
|
|
|
wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
|
|
|
|
wfp.write(line.encode("utf-8"))
|
|
|
|
wfp.write(b"\n")
|
|
|
|
total_num += 1
|
|
|
|
wfp.close()
|
2023-04-05 15:51:56 +00:00
|
|
|
print(
|
|
|
|
"BelleGroup/train_1M_CN preprocess done. Total line: {}, Total file: {}".format(
|
|
|
|
total_num, file_num
|
|
|
|
)
|
|
|
|
)
|
|
|
|
|
2023-05-05 09:05:41 +00:00
|
|
|
dataset = load_dataset("BelleGroup/train_2M_CN")
|
|
|
|
write_path = root_dir + "/instruction_data/part-belle_2M-{}.jsonl.zst"
|
|
|
|
total_num = 0
|
|
|
|
file_num = 1
|
|
|
|
wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
|
|
|
|
for line in tqdm(dataset["train"]):
|
|
|
|
line = json.dumps(line)
|
|
|
|
if total_num % 1024 == 0 and total_num > 0:
|
|
|
|
file_num += 1
|
|
|
|
wfp.close()
|
|
|
|
wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
|
|
|
|
wfp.write(line.encode("utf-8"))
|
|
|
|
wfp.write(b"\n")
|
|
|
|
total_num += 1
|
|
|
|
wfp.close()
|
|
|
|
print(
|
|
|
|
"BelleGroup/train_2M_CN preprocess done. Total line: {}, Total file: {}".format(
|
|
|
|
total_num, file_num
|
|
|
|
)
|
|
|
|
)
|
|
|
|
|
2023-04-05 15:51:56 +00:00
|
|
|
dataset = load_dataset("BelleGroup/school_math_0.25M")
|
|
|
|
write_path = root_dir + "/instruction_data/part-belle_school_math_0.25M-{}.jsonl.zst"
|
|
|
|
total_num = 0
|
2023-04-24 11:13:53 +00:00
|
|
|
file_num = 1
|
2023-04-05 15:51:56 +00:00
|
|
|
wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
|
2023-05-05 09:05:41 +00:00
|
|
|
for line in tqdm(dataset["train"]):
|
2023-04-05 15:51:56 +00:00
|
|
|
line = json.dumps(line)
|
|
|
|
if total_num % 1024 == 0 and total_num > 0:
|
|
|
|
file_num += 1
|
|
|
|
wfp.close()
|
|
|
|
wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
|
|
|
|
wfp.write(line.encode("utf-8"))
|
|
|
|
wfp.write(b"\n")
|
|
|
|
total_num += 1
|
|
|
|
wfp.close()
|
|
|
|
print(
|
|
|
|
"BelleGroup/school_math_0.25M preprocess done. Total line: {}, Total file: {}".format(
|
|
|
|
total_num, file_num
|
|
|
|
)
|
|
|
|
)
|
|
|
|
|
|
|
|
dataset = load_dataset("BelleGroup/multiturn_chat_0.8M")
|
|
|
|
write_path = root_dir + "/instruction_data/part-belle_multiturn_chat_0.8M-{}.jsonl.zst"
|
|
|
|
total_num = 0
|
2023-04-24 11:13:53 +00:00
|
|
|
file_num = 1
|
2023-04-05 15:51:56 +00:00
|
|
|
wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
|
2023-05-05 09:05:41 +00:00
|
|
|
for line in tqdm(dataset["train"]):
|
2023-04-05 15:51:56 +00:00
|
|
|
line = json.dumps(line)
|
|
|
|
if total_num % 1024 == 0 and total_num > 0:
|
|
|
|
file_num += 1
|
|
|
|
wfp.close()
|
|
|
|
wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
|
|
|
|
wfp.write(line.encode("utf-8"))
|
|
|
|
wfp.write(b"\n")
|
|
|
|
total_num += 1
|
|
|
|
wfp.close()
|
|
|
|
print(
|
|
|
|
"BelleGroup/multiturn_chat_0.8M preprocess done. Total line: {}, Total file: {}".format(
|
|
|
|
total_num, file_num
|
|
|
|
)
|
|
|
|
)
|
|
|
|
|
|
|
|
dataset = load_dataset("Graverman/Instruct-to-Code")
|
|
|
|
write_path = root_dir + "/instruction_data/part-instruct_to_code-{}.jsonl.zst"
|
|
|
|
total_num = 0
|
2023-04-24 11:13:53 +00:00
|
|
|
file_num = 1
|
2023-04-05 15:51:56 +00:00
|
|
|
wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
|
2023-05-05 09:05:41 +00:00
|
|
|
for line in tqdm(dataset["train"]):
|
2023-04-05 15:51:56 +00:00
|
|
|
line = json.dumps(line)
|
|
|
|
if total_num % 1024 == 0 and total_num > 0:
|
|
|
|
file_num += 1
|
|
|
|
wfp.close()
|
|
|
|
wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
|
|
|
|
wfp.write(line.encode("utf-8"))
|
|
|
|
wfp.write(b"\n")
|
|
|
|
total_num += 1
|
|
|
|
wfp.close()
|
|
|
|
print(
|
|
|
|
"Graverman/Instruct-to-Code preprocess done. Total line: {}, Total file: {}".format(
|
|
|
|
total_num, file_num
|
|
|
|
)
|
|
|
|
)
|
|
|
|
|
2023-05-05 09:05:41 +00:00
|
|
|
# dataset = load_dataset("bigscience/xP3mt", "en")
|
|
|
|
# write_path = root_dir + "/instruction_data/part-bigscience/xP3mt_en-{}.jsonl.zst"
|
|
|
|
# total_num = 0
|
|
|
|
# file_num = 1
|
|
|
|
# wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
|
|
|
|
# for line in tqdm(dataset["train"]):
|
|
|
|
# line = json.dumps(line)
|
|
|
|
# if total_num % 1024 == 0 and total_num > 0:
|
|
|
|
# file_num += 1
|
|
|
|
# wfp.close()
|
|
|
|
# wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
|
|
|
|
# wfp.write(line.encode("utf-8"))
|
|
|
|
# wfp.write(b"\n")
|
|
|
|
# total_num += 1
|
|
|
|
# wfp.close()
|
|
|
|
# print(
|
|
|
|
# "bigscience/xP3mt_en preprocess done. Total line: {}, Total file: {}".format(
|
|
|
|
# total_num, file_num
|
|
|
|
# )
|
|
|
|
# )
|
|
|
|
|
|
|
|
dataset = load_dataset("bigscience/xP3mt", "code")
|
|
|
|
write_path = root_dir + "/instruction_data/part-xP3mt_code-{}.jsonl.zst"
|
|
|
|
total_num = 0
|
|
|
|
file_num = 1
|
|
|
|
wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
|
|
|
|
for line in tqdm(dataset["train"]):
|
|
|
|
line = json.dumps(line)
|
|
|
|
if total_num % 1024 == 0 and total_num > 0:
|
|
|
|
file_num += 1
|
|
|
|
wfp.close()
|
|
|
|
wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
|
|
|
|
wfp.write(line.encode("utf-8"))
|
|
|
|
wfp.write(b"\n")
|
|
|
|
total_num += 1
|
|
|
|
wfp.close()
|
|
|
|
print(
|
|
|
|
"bigscience/xP3mt_code preprocess done. Total line: {}, Total file: {}".format(
|
|
|
|
total_num, file_num
|
|
|
|
)
|
|
|
|
)
|
|
|
|
|
|
|
|
dataset = load_dataset("bigscience/xP3mt", "zh")
|
|
|
|
write_path = root_dir + "/instruction_data/part-xP3mt_zh-{}.jsonl.zst"
|
|
|
|
total_num = 0
|
|
|
|
file_num = 1
|
|
|
|
wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
|
|
|
|
for line in tqdm(dataset["train"]):
|
|
|
|
line = json.dumps(line)
|
|
|
|
if total_num % 1024 == 0 and total_num > 0:
|
|
|
|
file_num += 1
|
|
|
|
wfp.close()
|
|
|
|
wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
|
|
|
|
wfp.write(line.encode("utf-8"))
|
|
|
|
wfp.write(b"\n")
|
|
|
|
total_num += 1
|
|
|
|
wfp.close()
|
|
|
|
print(
|
|
|
|
"bigscience/xP3mt_zh preprocess done. Total line: {}, Total file: {}".format(
|
|
|
|
total_num, file_num
|
|
|
|
)
|
|
|
|
)
|
|
|
|
|
2023-04-05 15:51:56 +00:00
|
|
|
write_path = root_dir + "/instruction_data/part-sharegpt_90K-{}.jsonl.zst"
|
|
|
|
total_num = 0
|
2023-04-24 11:13:53 +00:00
|
|
|
file_num = 1
|
2023-04-05 15:51:56 +00:00
|
|
|
wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
|
2023-05-04 00:34:38 +00:00
|
|
|
with open("{}/sg_90k_part1_html_cleaned.json".format(root_dir), "r") as fp:
|
2023-04-05 15:51:56 +00:00
|
|
|
data1 = json.load(fp)
|
2023-05-04 00:34:38 +00:00
|
|
|
with open("{}/sg_90k_part2_html_cleaned.json".format(root_dir), "r") as fp:
|
2023-04-05 15:51:56 +00:00
|
|
|
data2 = json.load(fp)
|
|
|
|
data = data1 + data2
|
2023-05-05 09:05:41 +00:00
|
|
|
for line in tqdm(data):
|
2023-04-05 15:51:56 +00:00
|
|
|
line = json.dumps(line)
|
|
|
|
if total_num % 1024 == 0 and total_num > 0:
|
|
|
|
file_num += 1
|
|
|
|
wfp.close()
|
|
|
|
wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
|
|
|
|
wfp.write(line.encode("utf-8"))
|
|
|
|
wfp.write(b"\n")
|
|
|
|
total_num += 1
|
|
|
|
wfp.close()
|
|
|
|
print(
|
2023-05-04 00:34:38 +00:00
|
|
|
"anon8231489123/ShareGPT_Vicuna_unfiltered preprocess done. Total line: {}, Total file: {}".format(
|
2023-04-05 15:51:56 +00:00
|
|
|
total_num, file_num
|
|
|
|
)
|
|
|
|
)
|