update preprocess_instruction: add math/code/multiturn_chat datasets, etc.

LiangSong committed 2023-04-05 23:51:56 +08:00
parent 562067230f
commit 9f140dc99f
4 changed files with 149 additions and 32 deletions

data/download_instruct.sh (new file)

@@ -0,0 +1,15 @@
#!/bin/bash
###
# @Author: LiangSong(sl12160010@gmail.com)
# @Date: 2023-04-05 23:18:10
# @LastEditors: LiangSong(sl12160010@gmail.com)
# @LastEditTime: 2023-04-05 23:34:30
# @FilePath: /Open-Llama/data/download_instruct.sh
# @Description:
#
# Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved.
###
mkdir data/instruction_data
curl -C - --retry 3 'https://huggingface.co/datasets/RyokoAI/ShareGPT52K/resolve/main/sg_90k_part1.json' -o data/sg_90k_part1.json
curl -C - --retry 3 'https://huggingface.co/datasets/RyokoAI/ShareGPT52K/resolve/main/sg_90k_part2.json' -o data/sg_90k_part2.json
python3 data/preprocess_instruction.py
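
Since curl is invoked with resume (-C -) and retries, a truncated or corrupted download is unlikely but still possible; a quick check that both ShareGPT parts parse as JSON before the long preprocessing run can catch that early. A minimal sketch, assuming the files landed at the paths used above (this check is not part of the commit):

# Optional sanity check, not part of this commit: confirm both ShareGPT dumps
# downloaded completely and parse as JSON before running the preprocessing step.
import json
import os

for path in ["data/sg_90k_part1.json", "data/sg_90k_part2.json"]:
    size_mb = os.path.getsize(path) / 2**20
    with open(path, "r") as fp:
        records = json.load(fp)
    print(f"{path}: {size_mb:.1f} MiB, {len(records)} records")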

data/preprocess_instruction.py

@@ -2,7 +2,7 @@
Author: LiangSong(sl12160010@gmail.com)
Date: 2023-03-30 20:52:10
LastEditors: LiangSong(sl12160010@gmail.com)
LastEditTime: 2023-03-30 20:52:12
LastEditTime: 2023-04-05 23:51:16
FilePath: /Open-Llama/data/preprocess_instruction.py
Description:
@@ -12,8 +12,11 @@ import json
import zstandard as zstd
from datasets import load_dataset
root_dir = "data"
dataset = load_dataset("yizhongw/self_instruct")
write_path = "data/instruction_data/part-self_instruct-{}.jsonl.zst"
write_path = root_dir + "/instruction_data/part-self_instruct-{}.jsonl.zst"
total_num = 0
file_num = 0
wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
@@ -27,9 +30,14 @@ for line in dataset["train"]:
wfp.write(b"\n")
total_num += 1
wfp.close()
print(
"yizhongw/self_instruct preprocess done. Total line: {}, Total file: {}".format(
total_num, file_num
)
)
dataset = load_dataset("BelleGroup/generated_train_0.5M_CN")
write_path = "data/instruction_data/part-belle_0.5M-{}.jsonl.zst"
dataset = load_dataset("BelleGroup/train_0.5M_CN")
write_path = root_dir + "/instruction_data/part-belle_0.5M-{}.jsonl.zst"
total_num = 0
file_num = 0
wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
@@ -43,9 +51,14 @@ for line in dataset["train"]:
wfp.write(b"\n")
total_num += 1
wfp.close()
print(
"BelleGroup/train_0.5M_CN preprocess done. Total line: {}, Total file: {}".format(
total_num, file_num
)
)
dataset = load_dataset("BelleGroup/generated_train_1M_CN")
write_path = "data/instruction_data/part-belle_1M-{}.jsonl.zst"
dataset = load_dataset("BelleGroup/train_1M_CN")
write_path = root_dir + "/instruction_data/part-belle_1M-{}.jsonl.zst"
total_num = 0
file_num = 0
wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
@@ -59,3 +72,96 @@ for line in dataset["train"]:
wfp.write(b"\n")
total_num += 1
wfp.close()
print(
"BelleGroup/train_1M_CN preprocess done. Total line: {}, Total file: {}".format(
total_num, file_num
)
)
dataset = load_dataset("BelleGroup/school_math_0.25M")
write_path = root_dir + "/instruction_data/part-belle_school_math_0.25M-{}.jsonl.zst"
total_num = 0
file_num = 0
wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
for line in dataset["train"]:
line = json.dumps(line)
if total_num % 1024 == 0 and total_num > 0:
file_num += 1
wfp.close()
wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
wfp.write(line.encode("utf-8"))
wfp.write(b"\n")
total_num += 1
wfp.close()
print(
"BelleGroup/school_math_0.25M preprocess done. Total line: {}, Total file: {}".format(
total_num, file_num
)
)
dataset = load_dataset("BelleGroup/multiturn_chat_0.8M")
write_path = root_dir + "/instruction_data/part-belle_multiturn_chat_0.8M-{}.jsonl.zst"
total_num = 0
file_num = 0
wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
for line in dataset["train"]:
line = json.dumps(line)
if total_num % 1024 == 0 and total_num > 0:
file_num += 1
wfp.close()
wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
wfp.write(line.encode("utf-8"))
wfp.write(b"\n")
total_num += 1
wfp.close()
print(
"BelleGroup/multiturn_chat_0.8M preprocess done. Total line: {}, Total file: {}".format(
total_num, file_num
)
)
dataset = load_dataset("Graverman/Instruct-to-Code")
write_path = root_dir + "/instruction_data/part-instruct_to_code-{}.jsonl.zst"
total_num = 0
file_num = 0
wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
for line in dataset["train"]:
line = json.dumps(line)
if total_num % 1024 == 0 and total_num > 0:
file_num += 1
wfp.close()
wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
wfp.write(line.encode("utf-8"))
wfp.write(b"\n")
total_num += 1
wfp.close()
print(
"Graverman/Instruct-to-Code preprocess done. Total line: {}, Total file: {}".format(
total_num, file_num
)
)
write_path = root_dir + "/instruction_data/part-sharegpt_90K-{}.jsonl.zst"
total_num = 0
file_num = 0
wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
with open("data/sg_90k_part1.json", "r") as fp:
data1 = json.load(fp)
with open("data/sg_90k_part2.json", "r") as fp:
data2 = json.load(fp)
data = data1 + data2
for line in data:
line = json.dumps(line)
if total_num % 1024 == 0 and total_num > 0:
file_num += 1
wfp.close()
wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
wfp.write(line.encode("utf-8"))
wfp.write(b"\n")
total_num += 1
wfp.close()
print(
"RyokoAI/ShareGPT52K preprocess done. Total line: {}, Total file: {}".format(
total_num, file_num
)
)
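
Every dataset block above follows the same recipe: iterate the records, write one JSON object per line, and start a new zstd-compressed shard every 1024 lines. A hedged sketch of that recipe as a reusable helper (write_shards is a hypothetical name, not something this commit introduces; text mode is used here so JSON strings can be written directly), followed by a read-back of one shard to confirm the format round-trips (this assumes the script above has already been run):

import json

import zstandard as zstd


def write_shards(records, write_path, shard_size=1024):
    """Write records as JSON lines, rolling over to a new zstd shard every `shard_size` lines."""
    total_num, file_num = 0, 0
    wfp = zstd.open(write_path.format(file_num), "wt", encoding="utf-8")
    for record in records:
        if total_num % shard_size == 0 and total_num > 0:
            file_num += 1
            wfp.close()
            wfp = zstd.open(write_path.format(file_num), "wt", encoding="utf-8")
        wfp.write(json.dumps(record))
        wfp.write("\n")
        total_num += 1
    wfp.close()
    return total_num, file_num


# Read back the first shard of one dataset to verify the JSONL-in-zstd layout.
with zstd.open(
    "data/instruction_data/part-belle_school_math_0.25M-0.jsonl.zst", "rt", encoding="utf-8"
) as rfp:
    print(json.loads(rfp.readline()).keys())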

inctruction_tuning.py

@@ -2,7 +2,7 @@
Author: LiangSong(sl12160010@gmail.com)
Date: 2023-03-30 21:35:01
LastEditors: LiangSong(sl12160010@gmail.com)
LastEditTime: 2023-03-30 21:40:03
LastEditTime: 2023-04-05 22:47:25
FilePath: /Open-Llama/inctruction_tuning.py
Description:
@@ -16,15 +16,14 @@ import random
import sentencepiece as spm
from torchinfo import summary
from accelerate import Accelerator
from datasets import IterableDataset
from torch.utils.data import DataLoader
from deepspeed.ops.adam import FusedAdam
from transformers import LlamaForCausalLM, LlamaConfig, get_cosine_schedule_with_warmup
from dataset.validation import val_set
from dataset.tokenizer import Tokenizer
from dataset.data_iter import create_shard_kwargs, create_data_iter
from dataset.data_loader import pretrain_collate_fn_gen
from dataset.data_iter import create_shard_kwargs, DataIter
from dataset.collate_fn import collate_fn_gen
from dataset.instruction_dataset import (
preprocess_belle_gen,
preprocess_self_instruction_gen,
@@ -50,21 +49,20 @@ transform_dict = {
"belle_0.5M": preprocess_belle_gen(tokenizer, max_length),
"self_instruct": preprocess_self_instruction_gen(tokenizer, max_length),
}
data_set = IterableDataset.from_generator(
create_data_iter,
gen_kwargs={
"paths": paths,
"transform_dict": transform_dict,
"process_index": accelerator.process_index,
"num_processes": accelerator.num_processes,
},
data_set = DataIter(
paths,
transform_dict=transform_dict,
concat_docs=True,
max_length=max_length,
process_index=accelerator.process_index,
num_processes=accelerator.num_processes,
)
train_loader = DataLoader(
data_set,
batch_size=train_batch_size,
# If num_workers is greater than 1, duplicate data may occur.
num_workers=0,
collate_fn=pretrain_collate_fn_gen(tokenizer, max_length),
collate_fn=collate_fn_gen(tokenizer, max_length),
drop_last=True,
)
# smaller initializer_range make training more stable
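
The instruction-tuning script (and, below, pretrain_llama.py) now builds the dataset with DataIter from dataset/data_iter.py instead of datasets.IterableDataset.from_generator wrapping create_data_iter. That module is not part of this diff, so the following is only a rough sketch of the interface the call site implies, with the shard-to-transform mapping guessed from the file naming scheme in the preprocessing script; DataIterSketch is a hypothetical name and none of this is the repository's actual implementation:

# Rough sketch only: dataset/data_iter.py is not in this diff, so this class is an
# assumption inferred from the call sites, not the repository's code.
import json
from pathlib import Path

import zstandard as zstd
from torch.utils.data import IterableDataset


class DataIterSketch(IterableDataset):
    def __init__(self, paths, transform_dict=None, concat_docs=False,
                 max_length=None, process_index=0, num_processes=1):
        self.paths = list(paths)
        self.transform_dict = transform_dict or {}
        # concat_docs / max_length (packing several documents into one sequence of
        # up to max_length tokens) are accepted here but omitted from this sketch.
        self.concat_docs = concat_docs
        self.max_length = max_length
        self.process_index = process_index
        self.num_processes = num_processes

    def __iter__(self):
        # Shard the file list across data-parallel ranks so each rank reads a disjoint slice.
        for i, path in enumerate(self.paths):
            if i % self.num_processes != self.process_index:
                continue
            # Shards produced above are named "part-<dataset>-<n>.jsonl.zst";
            # use <dataset> to pick the matching transform, falling back to identity.
            dataset_name = Path(path).name.split("-")[1]
            transform = self.transform_dict.get(dataset_name, lambda x: x)
            with zstd.open(path, "rt", encoding="utf-8") as fp:
                for line in fp:
                    yield transform(json.loads(line))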

pretrain_llama.py

@@ -2,7 +2,7 @@
Author: LiangSong(sl12160010@gmail.com)
Date: 2023-03-17 14:27:28
LastEditors: LiangSong(sl12160010@gmail.com)
LastEditTime: 2023-03-27 01:07:25
LastEditTime: 2023-04-05 22:46:31
FilePath: /Open-Llama/pretrain_llama.py
Description:
pretrain GPT
@@ -16,15 +16,14 @@ import random
import sentencepiece as spm
from torchinfo import summary
from accelerate import Accelerator
from datasets import IterableDataset
from torch.utils.data import DataLoader
from deepspeed.ops.adam import FusedAdam
from transformers import LlamaForCausalLM, LlamaConfig, get_cosine_schedule_with_warmup
from dataset.validation import val_set
from dataset.tokenizer import Tokenizer
from dataset.data_iter import create_shard_kwargs, create_data_iter
from dataset.data_loader import pretrain_collate_fn_gen
from dataset.data_iter import create_shard_kwargs, DataIter
from dataset.collate_fn import collate_fn_gen
from dataset.pretrain_dataset import (
preprocess_the_pile_gen,
preprocess_wudao_gen,
@@ -49,21 +48,20 @@ transform_dict = {
"wudao": preprocess_wudao_gen(tokenizer, max_length),
"pile": preprocess_the_pile_gen(tokenizer, max_length),
}
data_set = IterableDataset.from_generator(
create_data_iter,
gen_kwargs={
"paths": paths,
"transform_dict": transform_dict,
"process_index": accelerator.process_index,
"num_processes": accelerator.num_processes,
},
data_set = DataIter(
paths,
transform_dict=transform_dict,
concat_docs=True,
max_length=max_length,
process_index=accelerator.process_index,
num_processes=accelerator.num_processes,
)
train_loader = DataLoader(
data_set,
batch_size=train_batch_size,
# If num_workers is greater than 1, duplicate data may occur.
num_workers=0,
collate_fn=pretrain_collate_fn_gen(tokenizer, max_length),
collate_fn=collate_fn_gen(tokenizer, max_length),
drop_last=True,
)
# smaller initializer_range make training more stable
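
pretrain_llama.py takes the same DataIter change, and both scripts now import collate_fn_gen from dataset/collate_fn.py in place of pretrain_collate_fn_gen from dataset/data_loader.py. The collate function itself is also outside this diff; judging only by the call collate_fn_gen(tokenizer, max_length), it is a closure that turns a batch into fixed-length tensors for causal-LM training. A heavily hedged sketch of that shape, where the function name, field names, padding id, and label convention are all assumptions:

import torch


def collate_fn_gen_sketch(tokenizer, max_length):
    """Hypothetical stand-in for collate_fn_gen(tokenizer, max_length); not the repo's code."""
    pad_id = getattr(tokenizer, "pad_id", 0)  # assumption about the Tokenizer wrapper

    def collate_fn(batch):
        # Assume each batch element is already a list of token ids produced by the
        # per-dataset transform; truncate and right-pad to max_length.
        input_ids = []
        for ids in batch:
            ids = list(ids)[:max_length]
            ids += [pad_id] * (max_length - len(ids))
            input_ids.append(ids)
        input_ids = torch.tensor(input_ids, dtype=torch.long)
        labels = input_ids.clone()
        labels[labels == pad_id] = -100  # ignore padding positions in the LM loss
        return {"input_ids": input_ids, "labels": labels}

    return collate_fn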