update preprocess_instruction: add math/code/multiturn_chat datasets, etc.
parent 562067230f
commit 9f140dc99f

data/download_instruct.sh (new file, 15 lines)
data/download_instruct.sh (new file)
@@ -0,0 +1,15 @@
+#!/bin/bash
+###
+# @Author: LiangSong(sl12160010@gmail.com)
+# @Date: 2023-04-05 23:18:10
+# @LastEditors: LiangSong(sl12160010@gmail.com)
+# @LastEditTime: 2023-04-05 23:34:30
+# @FilePath: /Open-Llama/data/download_instruct.sh
+# @Description:
+#
+# Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved.
+###
+mkdir data/instruction_data
+curl -C - --retry 3 'https://huggingface.co/datasets/RyokoAI/ShareGPT52K/resolve/main/sg_90k_part1.json' -o data/sg_90k_part1.json
+curl -C - --retry 3 'https://huggingface.co/datasets/RyokoAI/ShareGPT52K/resolve/main/sg_90k_part2.json' -o data/sg_90k_part2.json
+python3 data/preprocess_instruction.py
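The script writes to repo-relative paths (data/...), so it is presumably meant to be run from the repository root; curl -C - resumes interrupted downloads and --retry 3 retries transient failures. A minimal invocation, assuming a local Open-Llama checkout, might be:

# run from the root of the Open-Llama checkout (path assumed)
cd Open-Llama
bash data/download_instruct.sh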
data/preprocess_instruction.py
@@ -2,7 +2,7 @@
 Author: LiangSong(sl12160010@gmail.com)
 Date: 2023-03-30 20:52:10
 LastEditors: LiangSong(sl12160010@gmail.com)
-LastEditTime: 2023-03-30 20:52:12
+LastEditTime: 2023-04-05 23:51:16
 FilePath: /Open-Llama/data/preprocess_instruction.py
 Description:
 
@@ -12,8 +12,11 @@ import json
 import zstandard as zstd
 from datasets import load_dataset
 
+
+root_dir = "data"
+
 dataset = load_dataset("yizhongw/self_instruct")
-write_path = "data/instruction_data/part-self_instruct-{}.jsonl.zst"
+write_path = root_dir + "/instruction_data/part-self_instruct-{}.jsonl.zst"
 total_num = 0
 file_num = 0
 wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
@@ -27,9 +30,14 @@ for line in dataset["train"]:
     wfp.write(b"\n")
     total_num += 1
 wfp.close()
+print(
+    "yizhongw/self_instruct preprocess done. Total line: {}, Total file: {}".format(
+        total_num, file_num
+    )
+)
 
-dataset = load_dataset("BelleGroup/generated_train_0.5M_CN")
-write_path = "data/instruction_data/part-belle_0.5M-{}.jsonl.zst"
+dataset = load_dataset("BelleGroup/train_0.5M_CN")
+write_path = root_dir + "/instruction_data/part-belle_0.5M-{}.jsonl.zst"
 total_num = 0
 file_num = 0
 wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
@@ -43,9 +51,14 @@ for line in dataset["train"]:
     wfp.write(b"\n")
     total_num += 1
 wfp.close()
+print(
+    "BelleGroup/train_0.5M_CN preprocess done. Total line: {}, Total file: {}".format(
+        total_num, file_num
+    )
+)
 
-dataset = load_dataset("BelleGroup/generated_train_1M_CN")
-write_path = "data/instruction_data/part-belle_1M-{}.jsonl.zst"
+dataset = load_dataset("BelleGroup/train_1M_CN")
+write_path = root_dir + "/instruction_data/part-belle_1M-{}.jsonl.zst"
 total_num = 0
 file_num = 0
 wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
@@ -59,3 +72,96 @@ for line in dataset["train"]:
     wfp.write(b"\n")
     total_num += 1
 wfp.close()
+print(
+    "BelleGroup/train_1M_CN preprocess done. Total line: {}, Total file: {}".format(
+        total_num, file_num
+    )
+)
+
+dataset = load_dataset("BelleGroup/school_math_0.25M")
+write_path = root_dir + "/instruction_data/part-belle_school_math_0.25M-{}.jsonl.zst"
+total_num = 0
+file_num = 0
+wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
+for line in dataset["train"]:
+    line = json.dumps(line)
+    if total_num % 1024 == 0 and total_num > 0:
+        file_num += 1
+        wfp.close()
+        wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
+    wfp.write(line.encode("utf-8"))
+    wfp.write(b"\n")
+    total_num += 1
+wfp.close()
+print(
+    "BelleGroup/school_math_0.25M preprocess done. Total line: {}, Total file: {}".format(
+        total_num, file_num
+    )
+)
+
+dataset = load_dataset("BelleGroup/multiturn_chat_0.8M")
+write_path = root_dir + "/instruction_data/part-belle_multiturn_chat_0.8M-{}.jsonl.zst"
+total_num = 0
+file_num = 0
+wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
+for line in dataset["train"]:
+    line = json.dumps(line)
+    if total_num % 1024 == 0 and total_num > 0:
+        file_num += 1
+        wfp.close()
+        wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
+    wfp.write(line.encode("utf-8"))
+    wfp.write(b"\n")
+    total_num += 1
+wfp.close()
+print(
+    "BelleGroup/multiturn_chat_0.8M preprocess done. Total line: {}, Total file: {}".format(
+        total_num, file_num
+    )
+)
+
+dataset = load_dataset("Graverman/Instruct-to-Code")
+write_path = root_dir + "/instruction_data/part-instruct_to_code-{}.jsonl.zst"
+total_num = 0
+file_num = 0
+wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
+for line in dataset["train"]:
+    line = json.dumps(line)
+    if total_num % 1024 == 0 and total_num > 0:
+        file_num += 1
+        wfp.close()
+        wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
+    wfp.write(line.encode("utf-8"))
+    wfp.write(b"\n")
+    total_num += 1
+wfp.close()
+print(
+    "Graverman/Instruct-to-Code preprocess done. Total line: {}, Total file: {}".format(
+        total_num, file_num
+    )
+)
+
+write_path = root_dir + "/instruction_data/part-sharegpt_90K-{}.jsonl.zst"
+total_num = 0
+file_num = 0
+wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
+with open("data/sg_90k_part1.json", "r") as fp:
+    data1 = json.load(fp)
+with open("data/sg_90k_part2.json", "r") as fp:
+    data2 = json.load(fp)
+data = data1 + data2
+for line in data:
+    line = json.dumps(line)
+    if total_num % 1024 == 0 and total_num > 0:
+        file_num += 1
+        wfp.close()
+        wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
+    wfp.write(line.encode("utf-8"))
+    wfp.write(b"\n")
+    total_num += 1
+wfp.close()
+print(
+    "RyokoAI/ShareGPT52K preprocess done. Total line: {}, Total file: {}".format(
+        total_num, file_num
+    )
+)
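Each source ends up as zstd-compressed JSON Lines shards, rotated every 1024 samples. A quick way to sanity-check the output (not part of the commit; the shard name below is simply the first file the script writes) is to stream one shard back with the same zstandard/json pairing:

import json
import zstandard as zstd

# Read the first school_math shard back and print a few records.
path = "data/instruction_data/part-belle_school_math_0.25M-0.jsonl.zst"
with zstd.open(path, "rt", encoding="utf-8") as fp:
    for i, line in enumerate(fp):
        print(json.loads(line))
        if i >= 2:  # only peek at the first three samples
            break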
inctruction_tuning.py
@@ -2,7 +2,7 @@
 Author: LiangSong(sl12160010@gmail.com)
 Date: 2023-03-30 21:35:01
 LastEditors: LiangSong(sl12160010@gmail.com)
-LastEditTime: 2023-03-30 21:40:03
+LastEditTime: 2023-04-05 22:47:25
 FilePath: /Open-Llama/inctruction_tuning.py
 Description:
 
@@ -16,15 +16,14 @@ import random
 import sentencepiece as spm
 from torchinfo import summary
 from accelerate import Accelerator
-from datasets import IterableDataset
 from torch.utils.data import DataLoader
 from deepspeed.ops.adam import FusedAdam
 from transformers import LlamaForCausalLM, LlamaConfig, get_cosine_schedule_with_warmup
 
 from dataset.validation import val_set
 from dataset.tokenizer import Tokenizer
-from dataset.data_iter import create_shard_kwargs, create_data_iter
-from dataset.data_loader import pretrain_collate_fn_gen
+from dataset.data_iter import create_shard_kwargs, DataIter
+from dataset.collate_fn import collate_fn_gen
 from dataset.instruction_dataset import (
     preprocess_belle_gen,
     preprocess_self_instruction_gen,
@@ -50,21 +49,20 @@ transform_dict = {
     "belle_0.5M": preprocess_belle_gen(tokenizer, max_length),
     "self_instruct": preprocess_self_instruction_gen(tokenizer, max_length),
 }
-data_set = IterableDataset.from_generator(
-    create_data_iter,
-    gen_kwargs={
-        "paths": paths,
-        "transform_dict": transform_dict,
-        "process_index": accelerator.process_index,
-        "num_processes": accelerator.num_processes,
-    },
-)
+data_set = DataIter(
+    paths,
+    transform_dict=transform_dict,
+    concat_docs=True,
+    max_length=max_length,
+    process_index=accelerator.process_index,
+    num_processes=accelerator.num_processes,
+)
 train_loader = DataLoader(
     data_set,
     batch_size=train_batch_size,
     # If num_workers is greater than 1, duplicate data may occur.
     num_workers=0,
-    collate_fn=pretrain_collate_fn_gen(tokenizer, max_length),
+    collate_fn=collate_fn_gen(tokenizer, max_length),
     drop_last=True,
 )
 # smaller initializer_range make training more stable
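The refactor replaces datasets.IterableDataset.from_generator(create_data_iter, gen_kwargs=...) with a custom DataIter that takes the shard paths, per-source transforms, and the accelerator's process index/count directly, plus new concat_docs and max_length arguments. The class itself is not part of this diff; purely as a rough illustration of the interface the call site implies (the shard-name parsing, transform handling, and per-process striding below are assumptions, not the repo's actual code), a minimal sketch could look like:

import json
import zstandard as zstd
from torch.utils.data import IterableDataset


class DataIter(IterableDataset):
    """Hypothetical sketch of an iterable dataset matching the call site above."""

    def __init__(self, paths, transform_dict=None, concat_docs=False,
                 max_length=1024, process_index=0, num_processes=1):
        self.paths = paths                      # list of *.jsonl.zst shard files
        self.transform_dict = transform_dict or {}
        self.concat_docs = concat_docs          # doc-packing behavior not sketched here
        self.max_length = max_length
        self.process_index = process_index
        self.num_processes = num_processes

    def __iter__(self):
        # Stride over shards so each accelerator process reads a disjoint subset.
        for i, path in enumerate(self.paths):
            if i % self.num_processes != self.process_index:
                continue
            # Assumes shard names look like part-<source>-<n>.jsonl.zst, so the
            # per-source transform can be looked up from the file name.
            source = path.split("part-")[-1].rsplit("-", 1)[0]
            transform = self.transform_dict.get(source, lambda x: x)
            with zstd.open(path, "rt", encoding="utf-8") as fp:
                for line in fp:
                    yield transform(json.loads(line))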
pretrain_llama.py
@@ -2,7 +2,7 @@
 Author: LiangSong(sl12160010@gmail.com)
 Date: 2023-03-17 14:27:28
 LastEditors: LiangSong(sl12160010@gmail.com)
-LastEditTime: 2023-03-27 01:07:25
+LastEditTime: 2023-04-05 22:46:31
 FilePath: /Open-Llama/pretrain_llama.py
 Description:
 pretrain GPT
@@ -16,15 +16,14 @@ import random
 import sentencepiece as spm
 from torchinfo import summary
 from accelerate import Accelerator
-from datasets import IterableDataset
 from torch.utils.data import DataLoader
 from deepspeed.ops.adam import FusedAdam
 from transformers import LlamaForCausalLM, LlamaConfig, get_cosine_schedule_with_warmup
 
 from dataset.validation import val_set
 from dataset.tokenizer import Tokenizer
-from dataset.data_iter import create_shard_kwargs, create_data_iter
-from dataset.data_loader import pretrain_collate_fn_gen
+from dataset.data_iter import create_shard_kwargs, DataIter
+from dataset.collate_fn import collate_fn_gen
 from dataset.pretrain_dataset import (
     preprocess_the_pile_gen,
     preprocess_wudao_gen,
@@ -49,21 +48,20 @@ transform_dict = {
     "wudao": preprocess_wudao_gen(tokenizer, max_length),
     "pile": preprocess_the_pile_gen(tokenizer, max_length),
 }
-data_set = IterableDataset.from_generator(
-    create_data_iter,
-    gen_kwargs={
-        "paths": paths,
-        "transform_dict": transform_dict,
-        "process_index": accelerator.process_index,
-        "num_processes": accelerator.num_processes,
-    },
-)
+data_set = DataIter(
+    paths,
+    transform_dict=transform_dict,
+    concat_docs=True,
+    max_length=max_length,
+    process_index=accelerator.process_index,
+    num_processes=accelerator.num_processes,
+)
 train_loader = DataLoader(
     data_set,
     batch_size=train_batch_size,
     # If num_workers is greater than 1, duplicate data may occur.
     num_workers=0,
-    collate_fn=pretrain_collate_fn_gen(tokenizer, max_length),
+    collate_fn=collate_fn_gen(tokenizer, max_length),
     drop_last=True,
 )
 # smaller initializer_range make training more stable
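Both training scripts now build their collate function from a shared dataset/collate_fn.py instead of the pretrain-specific dataset/data_loader.py. That module is not shown in this diff; purely as an illustration of the closure pattern the call collate_fn_gen(tokenizer, max_length) suggests (the sample field name and padding id below are assumptions, not the repo's implementation), a minimal version could be:

import torch


def collate_fn_gen(tokenizer, max_length, pad_id=0):
    """Hypothetical sketch: returns a DataLoader collate_fn bound to tokenizer/max_length."""

    # The real implementation presumably derives pad_id from `tokenizer`;
    # it is passed explicitly here to keep the sketch self-contained.
    def collate_fn(batch):
        input_ids = []
        for sample in batch:
            ids = list(sample["input_ids"])[:max_length]   # field name assumed
            ids = ids + [pad_id] * (max_length - len(ids))  # right-pad to max_length
            input_ids.append(ids)
        return {"input_ids": torch.tensor(input_ids, dtype=torch.long)}

    return collate_fn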