diff --git a/data/download_instruct.sh b/data/download_instruct.sh
new file mode 100644
index 0000000..b916bd6
--- /dev/null
+++ b/data/download_instruct.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+###
+ # @Author: LiangSong(sl12160010@gmail.com)
+ # @Date: 2023-04-05 23:18:10
+ # @LastEditors: LiangSong(sl12160010@gmail.com)
+ # @LastEditTime: 2023-04-05 23:34:30
+ # @FilePath: /Open-Llama/data/download_instruct.sh
+ # @Description:
+ #
+ # Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved.
+###
+mkdir data/instruction_data
+curl -C - --retry 3 'https://huggingface.co/datasets/RyokoAI/ShareGPT52K/resolve/main/sg_90k_part1.json' -o data/sg_90k_part1.json
+curl -C - --retry 3 'https://huggingface.co/datasets/RyokoAI/ShareGPT52K/resolve/main/sg_90k_part2.json' -o data/sg_90k_part2.json
+python3 data/preprocess_instruction.py
\ No newline at end of file
diff --git a/data/preprocess_instruction.py b/data/preprocess_instruction.py
index 16cf8d1..6f6a2c4 100644
--- a/data/preprocess_instruction.py
+++ b/data/preprocess_instruction.py
@@ -2,7 +2,7 @@
 Author: LiangSong(sl12160010@gmail.com)
 Date: 2023-03-30 20:52:10
 LastEditors: LiangSong(sl12160010@gmail.com)
-LastEditTime: 2023-03-30 20:52:12
+LastEditTime: 2023-04-05 23:51:16
 FilePath: /Open-Llama/data/preprocess_instruction.py
 Description:

@@ -12,8 +12,11 @@
 import json
 import zstandard as zstd
 from datasets import load_dataset
+
+root_dir = "data"
+
 dataset = load_dataset("yizhongw/self_instruct")
-write_path = "data/instruction_data/part-self_instruct-{}.jsonl.zst"
+write_path = root_dir + "/instruction_data/part-self_instruct-{}.jsonl.zst"
 total_num = 0
 file_num = 0
 wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
@@ -27,9 +30,14 @@ for line in dataset["train"]:
     wfp.write(b"\n")
     total_num += 1
 wfp.close()
+print(
+    "yizhongw/self_instruct preprocess done. Total line: {}, Total file: {}".format(
+        total_num, file_num
+    )
+)

-dataset = load_dataset("BelleGroup/generated_train_0.5M_CN")
-write_path = "data/instruction_data/part-belle_0.5M-{}.jsonl.zst"
+dataset = load_dataset("BelleGroup/train_0.5M_CN")
+write_path = root_dir + "/instruction_data/part-belle_0.5M-{}.jsonl.zst"
 total_num = 0
 file_num = 0
 wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
@@ -43,9 +51,14 @@ for line in dataset["train"]:
     wfp.write(b"\n")
     total_num += 1
 wfp.close()
+print(
+    "BelleGroup/train_0.5M_CN preprocess done. Total line: {}, Total file: {}".format(
+        total_num, file_num
+    )
+)

-dataset = load_dataset("BelleGroup/generated_train_1M_CN")
-write_path = "data/instruction_data/part-belle_1M-{}.jsonl.zst"
+dataset = load_dataset("BelleGroup/train_1M_CN")
+write_path = root_dir + "/instruction_data/part-belle_1M-{}.jsonl.zst"
 total_num = 0
 file_num = 0
 wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
@@ -59,3 +72,96 @@ for line in dataset["train"]:
     wfp.write(b"\n")
     total_num += 1
 wfp.close()
+print(
+    "BelleGroup/train_1M_CN preprocess done. Total line: {}, Total file: {}".format(
+        total_num, file_num
+    )
+)
+
+dataset = load_dataset("BelleGroup/school_math_0.25M")
+write_path = root_dir + "/instruction_data/part-belle_school_math_0.25M-{}.jsonl.zst"
+total_num = 0
+file_num = 0
+wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
+for line in dataset["train"]:
+    line = json.dumps(line)
+    if total_num % 1024 == 0 and total_num > 0:
+        file_num += 1
+        wfp.close()
+        wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
+    wfp.write(line.encode("utf-8"))
+    wfp.write(b"\n")
+    total_num += 1
+wfp.close()
+print(
+    "BelleGroup/school_math_0.25M preprocess done. Total line: {}, Total file: {}".format(
+        total_num, file_num
+    )
+)
+
+dataset = load_dataset("BelleGroup/multiturn_chat_0.8M")
+write_path = root_dir + "/instruction_data/part-belle_multiturn_chat_0.8M-{}.jsonl.zst"
+total_num = 0
+file_num = 0
+wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
+for line in dataset["train"]:
+    line = json.dumps(line)
+    if total_num % 1024 == 0 and total_num > 0:
+        file_num += 1
+        wfp.close()
+        wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
+    wfp.write(line.encode("utf-8"))
+    wfp.write(b"\n")
+    total_num += 1
+wfp.close()
+print(
+    "BelleGroup/multiturn_chat_0.8M preprocess done. Total line: {}, Total file: {}".format(
+        total_num, file_num
+    )
+)
+
+dataset = load_dataset("Graverman/Instruct-to-Code")
+write_path = root_dir + "/instruction_data/part-instruct_to_code-{}.jsonl.zst"
+total_num = 0
+file_num = 0
+wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
+for line in dataset["train"]:
+    line = json.dumps(line)
+    if total_num % 1024 == 0 and total_num > 0:
+        file_num += 1
+        wfp.close()
+        wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
+    wfp.write(line.encode("utf-8"))
+    wfp.write(b"\n")
+    total_num += 1
+wfp.close()
+print(
+    "Graverman/Instruct-to-Code preprocess done. Total line: {}, Total file: {}".format(
+        total_num, file_num
+    )
+)
+
+write_path = root_dir + "/instruction_data/part-sharegpt_90K-{}.jsonl.zst"
+total_num = 0
+file_num = 0
+wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
+with open("data/sg_90k_part1.json", "r") as fp:
+    data1 = json.load(fp)
+with open("data/sg_90k_part2.json", "r") as fp:
+    data2 = json.load(fp)
+data = data1 + data2
+for line in data:
+    line = json.dumps(line)
+    if total_num % 1024 == 0 and total_num > 0:
+        file_num += 1
+        wfp.close()
+        wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
+    wfp.write(line.encode("utf-8"))
+    wfp.write(b"\n")
+    total_num += 1
+wfp.close()
+print(
+    "RyokoAI/ShareGPT52K preprocess done. Total line: {}, Total file: {}".format(
+        total_num, file_num
+    )
+)
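For reference, a minimal read-back sketch (not part of the change) for spot-checking the shards that data/preprocess_instruction.py writes above; it only assumes the layout used there: zstd-compressed JSON Lines, with a new shard started every 1024 records.

import glob
import json

import zstandard as zstd

# Walk every shard written by data/preprocess_instruction.py and count records.
for path in sorted(glob.glob("data/instruction_data/part-*.jsonl.zst")):
    count = 0
    with zstd.open(path, "rt", encoding="utf-8") as fp:
        for raw in fp:
            json.loads(raw)  # each line is one JSON object
            count += 1
    print(path, count)
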
diff --git a/inctruction_tuning.py b/inctruction_tuning.py
index 4e7ff6c..918d56e 100644
--- a/inctruction_tuning.py
+++ b/inctruction_tuning.py
@@ -2,7 +2,7 @@
 Author: LiangSong(sl12160010@gmail.com)
 Date: 2023-03-30 21:35:01
 LastEditors: LiangSong(sl12160010@gmail.com)
-LastEditTime: 2023-03-30 21:40:03
+LastEditTime: 2023-04-05 22:47:25
 FilePath: /Open-Llama/inctruction_tuning.py
 Description:

@@ -16,15 +16,14 @@ import random
 import sentencepiece as spm
 from torchinfo import summary
 from accelerate import Accelerator
-from datasets import IterableDataset
 from torch.utils.data import DataLoader
 from deepspeed.ops.adam import FusedAdam
 from transformers import LlamaForCausalLM, LlamaConfig, get_cosine_schedule_with_warmup

 from dataset.validation import val_set
 from dataset.tokenizer import Tokenizer
-from dataset.data_iter import create_shard_kwargs, create_data_iter
-from dataset.data_loader import pretrain_collate_fn_gen
+from dataset.data_iter import create_shard_kwargs, DataIter
+from dataset.collate_fn import collate_fn_gen
 from dataset.instruction_dataset import (
     preprocess_belle_gen,
     preprocess_self_instruction_gen,
@@ -50,21 +49,20 @@ transform_dict = {
     "belle_0.5M": preprocess_belle_gen(tokenizer, max_length),
     "self_instruct": preprocess_self_instruction_gen(tokenizer, max_length),
 }
-data_set = IterableDataset.from_generator(
-    create_data_iter,
-    gen_kwargs={
-        "paths": paths,
-        "transform_dict": transform_dict,
-        "process_index": accelerator.process_index,
-        "num_processes": accelerator.num_processes,
-    },
+data_set = DataIter(
+    paths,
+    transform_dict=transform_dict,
+    concat_docs=True,
+    max_length=max_length,
+    process_index=accelerator.process_index,
+    num_processes=accelerator.num_processes,
 )
 train_loader = DataLoader(
     data_set,
     batch_size=train_batch_size,
     # If num_workers is greater than 1, duplicate data may occur.
     num_workers=0,
-    collate_fn=pretrain_collate_fn_gen(tokenizer, max_length),
+    collate_fn=collate_fn_gen(tokenizer, max_length),
     drop_last=True,
 )
 # smaller initializer_range make training more stable
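The num_workers=0 comment above reflects a general property of PyTorch iterable-style datasets rather than anything specific to this repo: every DataLoader worker runs its own copy of the iterator, so without per-worker sharding each sample is produced once per worker. A self-contained illustration follows (the repo's DataIter internals are not shown in this diff, so a toy dataset is used instead):

from torch.utils.data import DataLoader, IterableDataset, get_worker_info


class Numbers(IterableDataset):
    # Yields 0..7, sharded across DataLoader workers by worker id.
    def __iter__(self):
        info = get_worker_info()
        for i in range(8):
            # Without this check every worker yields all eight numbers,
            # i.e. the duplication the comment warns about.
            if info is None or i % info.num_workers == info.id:
                yield i


if __name__ == "__main__":
    loader = DataLoader(Numbers(), num_workers=2, batch_size=None)
    print(sorted(int(x) for x in loader))  # [0, 1, ..., 7], each exactly once
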
diff --git a/pretrain_llama.py b/pretrain_llama.py
index 17a48f8..18b92fb 100644
--- a/pretrain_llama.py
+++ b/pretrain_llama.py
@@ -2,7 +2,7 @@
 Author: LiangSong(sl12160010@gmail.com)
 Date: 2023-03-17 14:27:28
 LastEditors: LiangSong(sl12160010@gmail.com)
-LastEditTime: 2023-03-27 01:07:25
+LastEditTime: 2023-04-05 22:46:31
 FilePath: /Open-Llama/pretrain_llama.py
 Description: pretrain GPT

@@ -16,15 +16,14 @@ import random
 import sentencepiece as spm
 from torchinfo import summary
 from accelerate import Accelerator
-from datasets import IterableDataset
 from torch.utils.data import DataLoader
 from deepspeed.ops.adam import FusedAdam
 from transformers import LlamaForCausalLM, LlamaConfig, get_cosine_schedule_with_warmup

 from dataset.validation import val_set
 from dataset.tokenizer import Tokenizer
-from dataset.data_iter import create_shard_kwargs, create_data_iter
-from dataset.data_loader import pretrain_collate_fn_gen
+from dataset.data_iter import create_shard_kwargs, DataIter
+from dataset.collate_fn import collate_fn_gen
 from dataset.pretrain_dataset import (
     preprocess_the_pile_gen,
     preprocess_wudao_gen,
@@ -49,21 +48,20 @@ transform_dict = {
     "wudao": preprocess_wudao_gen(tokenizer, max_length),
     "pile": preprocess_the_pile_gen(tokenizer, max_length),
 }
-data_set = IterableDataset.from_generator(
-    create_data_iter,
-    gen_kwargs={
-        "paths": paths,
-        "transform_dict": transform_dict,
-        "process_index": accelerator.process_index,
-        "num_processes": accelerator.num_processes,
-    },
+data_set = DataIter(
+    paths,
+    transform_dict=transform_dict,
+    concat_docs=True,
+    max_length=max_length,
+    process_index=accelerator.process_index,
+    num_processes=accelerator.num_processes,
 )
 train_loader = DataLoader(
     data_set,
     batch_size=train_batch_size,
     # If num_workers is greater than 1, duplicate data may occur.
     num_workers=0,
-    collate_fn=pretrain_collate_fn_gen(tokenizer, max_length),
+    collate_fn=collate_fn_gen(tokenizer, max_length),
     drop_last=True,
 )
 # smaller initializer_range make training more stable
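As a closing note: data/preprocess_instruction.py repeats the same shard-writing loop for each of its six datasets. If that script is ever consolidated, a helper along the following lines could replace the copies; the function name and signature are hypothetical and not part of this change, but the body mirrors the loop used above.

import json

import zstandard as zstd


def write_shards(records, write_path, lines_per_shard=1024):
    # write_path must contain a "{}" placeholder for the shard index, e.g.
    # "data/instruction_data/part-belle_1M-{}.jsonl.zst".
    total_num = 0
    file_num = 0
    wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
    for record in records:
        line = json.dumps(record)
        if total_num % lines_per_shard == 0 and total_num > 0:
            file_num += 1
            wfp.close()
            wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
        wfp.write(line.encode("utf-8"))
        wfp.write(b"\n")
        total_num += 1
    wfp.close()
    return total_num, file_num

Each dataset section would then reduce to a single call such as write_shards(load_dataset("BelleGroup/train_1M_CN")["train"], root_dir + "/instruction_data/part-belle_1M-{}.jsonl.zst").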