reformat code with black

LiangSong 2023-03-27 14:34:59 +08:00
parent 8dd47aee90
commit 918a8cdc3d
10 changed files with 346 additions and 231 deletions

View File

@@ -5,12 +5,10 @@ num_warmup_steps = 2000
 initializer_range = 1e-2
 lr = 2e-4
 weight_decay = 1e-1
-tokenizer_model_path = 'configs/10w_vocab_wudao5_pile10.model'
-patterns = [
-    'data/pretrain_data/part-*.jsonl.zst'
-]
+tokenizer_model_path = "configs/10w_vocab_wudao5_pile10.model"
+patterns = ["data/pretrain_data/part-*.jsonl.zst"]
 # global step
 log_interval = 5
 eval_interval = 200
 save_interval = 800
-work_dir = 'data/saved_ckpt/'
+work_dir = "data/saved_ckpt/"

View File

@@ -1,4 +1,4 @@
-'''
+"""
 Author: LiangSong(sl12160010@gmail.com)
 Date: 2023-03-16 22:35:38
 LastEditors: LiangSong(sl12160010@gmail.com)
@@ -8,25 +8,25 @@ Description:
 Parse the dataset from the raw files and split them into different jsonl files based on the preset maximum number of lines,
 making it easy for parallel training to perform streaming reads.
 Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved.
-'''
+"""
 import json
 from glob import glob
 from tqdm import tqdm
 import zstandard as zstd

-paths = glob('data/the_pile/*.jsonl.zst')
-write_path = 'data/pretrain_data/part-pile-{}.jsonl.zst'
+paths = glob("data/the_pile/*.jsonl.zst")
+write_path = "data/pretrain_data/part-pile-{}.jsonl.zst"
 total_num = 0
 file_num = 0
-wfp = zstd.open(write_path.format(file_num), 'wb', encoding='utf-8')
+wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
 for path in tqdm(paths, total=len(paths)):
-    with zstd.open(path, 'r', encoding='utf-8') as fp:
+    with zstd.open(path, "r", encoding="utf-8") as fp:
         for line in fp:
             if total_num % 16384 == 0 and total_num > 0:
                 file_num += 1
                 wfp.close()
-                wfp = zstd.open(write_path.format(file_num), 'wb', encoding='utf-8')
-            wfp.write(line.encode('utf-8'))
+                wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
+            wfp.write(line.encode("utf-8"))
             total_num += 1
 wfp.close()
-print('total line: {}\ntotal files: {}'.format(total_num, file_num))
+print("total line: {}\ntotal files: {}".format(total_num, file_num))

View File

@@ -1,4 +1,4 @@
-'''
+"""
 Author: LiangSong(sl12160010@gmail.com)
 Date: 2023-03-16 22:10:44
 LastEditors: LiangSong(sl12160010@gmail.com)
@@ -8,27 +8,27 @@ Description:
 Parse the dataset from the raw files and split them into different jsonl files based on the preset maximum number of lines,
 making it easy for parallel training to perform streaming reads.
 Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved.
-'''
+"""
 import json
 from glob import glob
 from tqdm import tqdm
 import zstandard as zstd

-paths = glob('data/WuDaoCorpus2.0_base_200G/part*')
-write_path = 'data/pretrain_data/part-wudao-{}.jsonl.zst'
+paths = glob("data/WuDaoCorpus2.0_base_200G/part*")
+write_path = "data/pretrain_data/part-wudao-{}.jsonl.zst"
 total_num = 0
 file_num = 0
-wfp = zstd.open(write_path.format(file_num), 'wb', encoding='utf-8')
+wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
 for path in tqdm(paths, total=len(paths)):
-    with open(path, 'r') as fp:
+    with open(path, "r") as fp:
         data = json.load(fp)
         for line in data:
             if total_num % 16384 == 0 and total_num > 0:
                 file_num += 1
                 wfp.close()
-                wfp = zstd.open(write_path.format(file_num), 'wb', encoding='utf-8')
-            wfp.write(json.dumps(line).encode('utf-8'))
-            wfp.write('\n'.encode('utf-8'))
+                wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
+            wfp.write(json.dumps(line).encode("utf-8"))
+            wfp.write("\n".encode("utf-8"))
             total_num += 1
 wfp.close()
-print('total line: {}\ntotal files: {}'.format(total_num, file_num))
+print("total line: {}\ntotal files: {}".format(total_num, file_num))

View File

@@ -1,4 +1,4 @@
-'''
+"""
 Author: LiangSong(sl12160010@gmail.com)
 Date: 2023-03-17 19:32:20
 LastEditors: LiangSong(sl12160010@gmail.com)
@@ -7,50 +7,32 @@ FilePath: /Open-Llama/dataset/data_iter.py
 Description:
 Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved.
-'''
+"""
 import json
 from glob import glob
 import zstandard as zstd


 def create_data_iter(paths, transform_dict=None, process_index=0, num_processes=1):
-    '''
-    Currently, the allowed storage formats are jsonl and jsonl.zst.
+    """
+    Currently, the allowed storage formats are jsonl and jsonl.zst.
     Each line of the data is a dictionary, which can be parsed as JSON for subsequent processing after reading.
-    '''
+    """
     past = None
     for i, path in paths:
-        dataset_name = path.split('-')[-2]
+        dataset_name = path.split("-")[-2]
         if past != dataset_name:
-            print('Loading data from {}'.format(path))
+            print("Loading data from {}".format(path))
             past = path
         if num_processes > 1 and i % num_processes != process_index:
             continue
-        if path.endswith('jsonl.zst'):
-            with zstd.open(path, 'r', encoding='utf-8') as fp:
-                for line in fp:
-                    if isinstance(line, bytes):
-                        line = line.decode('utf-8')
-                    line = json.loads(line)
-                    line['dataset'] = dataset_name
-                    if transform_dict:
-                        line = transform_dict[dataset_name](line)
-                        if isinstance(line, str):
-                            yield line
-                        elif isinstance(line, list):
-                            for i in line:
-                                yield i
-                        else:
-                            raise Exception('Unsupported type in Transformation: {}'.format(transform_dict[dataset_name]))
-                    else:
-                        yield line
-        elif path.endswith('jsonl'):
-            with open(path, 'r') as fp:
+        if path.endswith("jsonl.zst"):
+            with zstd.open(path, "r", encoding="utf-8") as fp:
                 for line in fp:
                     if isinstance(line, bytes):
-                        line = line.decode('utf-8')
+                        line = line.decode("utf-8")
                     line = json.loads(line)
-                    line['dataset'] = dataset_name
+                    line["dataset"] = dataset_name
                     if transform_dict:
                         line = transform_dict[dataset_name](line)
                         if isinstance(line, str):
@@ -59,34 +41,57 @@ def create_data_iter(paths, transform_dict=None, process_index=0, num_processes=
                             for i in line:
                                 yield i
                         else:
-                            raise Exception('Unsupported type in Transformation: {}'.format(transform_dict[dataset_name]))
+                            raise Exception(
+                                "Unsupported type in Transformation: {}".format(
+                                    transform_dict[dataset_name]
+                                )
+                            )
                     else:
                         yield line
+        elif path.endswith("jsonl"):
+            with open(path, "r") as fp:
+                for line in fp:
+                    if isinstance(line, bytes):
+                        line = line.decode("utf-8")
+                    line = json.loads(line)
+                    line["dataset"] = dataset_name
+                    if transform_dict:
+                        line = transform_dict[dataset_name](line)
+                        if isinstance(line, str):
+                            yield line
+                        elif isinstance(line, list):
+                            for i in line:
+                                yield i
+                        else:
+                            raise Exception(
+                                "Unsupported type in Transformation: {}".format(
+                                    transform_dict[dataset_name]
+                                )
+                            )
+                    else:
+                        yield line
         else:
-            raise Exception('File format of {} is not supported yet.'.format(path))
+            raise Exception("File format of {} is not supported yet.".format(path))


 def create_shard_kwargs(patterns, repeat=1):
-    '''
-    Assign numbers to different shards of data to ensure that data is not duplicated
+    """
+    Assign numbers to different shards of data to ensure that data is not duplicated
     when allocated to different nodes during distributed training.
-    '''
+    """
     all_path = []
     for p in patterns:
         all_path.extend(glob(p))
     all_path *= repeat
     return [(i, p) for i, p in enumerate(all_path)]


-if __name__ == '__main__':
-    patterns = [
-        'data/pretrain_data/part-wudao*.jsonl.zst'
-    ]
+if __name__ == "__main__":
+    patterns = ["data/pretrain_data/part-wudao*.jsonl.zst"]
     paths = create_shard_kwargs(patterns)
-    transform_dict = {
-        'wudao': lambda x: x['title'],
-        'pile': lambda x: [x['text']]
-    }
+    transform_dict = {"wudao": lambda x: x["title"], "pile": lambda x: [x["text"]]}
     data_iter = create_data_iter(paths, transform_dict=transform_dict)
     for i, data in enumerate(data_iter):
         print(i, data)
         if i == 20:
-            break
+            break
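The `i % num_processes != process_index` check above is the whole distributed story: each worker keeps only the shard indices congruent to its rank, so no record is read twice across nodes. A standalone illustration (shard names are hypothetical):

    paths = [(0, "part-wudao-0"), (1, "part-wudao-1"), (2, "part-pile-0"), (3, "part-pile-1")]
    num_processes = 2

    for rank in range(num_processes):
        kept = [p for i, p in paths if i % num_processes == rank]
        print(rank, kept)  # rank 0 -> shards 0 and 2, rank 1 -> shards 1 and 3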

View File

@@ -1,4 +1,4 @@
-'''
+"""
 Author: LiangSong(sl12160010@gmail.com)
 Date: 2023-03-17 20:41:25
 LastEditors: LiangSong(sl12160010@gmail.com)
@@ -7,76 +7,95 @@ FilePath: /Open-Llama/dataset/pretrain_dataset.py
 Description:
 Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved.
-'''
+"""
 import math
 import torch


 def preprocess_wudao_gen(tokenizer, segment_max_length=1024):
     def preprocess_wudao(line):
-        '''
+        """
         The format of the data is roughly as follows.
         {'id': 1, 'dataType': '百科', 'title': 'some title', 'content': 'some content'}
         Split the data based on the tokenized length according to the maximum length.
-        '''
-        total = line['title'] + '\n' + line['content']
+        """
+        total = line["title"] + "\n" + line["content"]
         out = tokenizer(total)
-        input_ids = out['input_ids']
-        return [input_ids[i*segment_max_length: (i+1)*segment_max_length]
-                for i in range(math.ceil(len(input_ids)/segment_max_length))]
+        input_ids = out["input_ids"]
+        return [
+            input_ids[i * segment_max_length : (i + 1) * segment_max_length]
+            for i in range(math.ceil(len(input_ids) / segment_max_length))
+        ]

     return preprocess_wudao


 def preprocess_the_pile_gen(tokenizer, segment_max_length=1024):
     def preprocess_the_pile(line):
-        '''
+        """
         The format of the data is roughly as follows.
         {'text': 'some text', 'meta': {'pile_set_name': 'Github'}}
         Split the data based on the tokenized length according to the maximum length.
-        '''
-        total = line['text']
+        """
+        total = line["text"]
         out = tokenizer(total)
-        input_ids = out['input_ids']
-        return [input_ids[i*segment_max_length: (i+1)*segment_max_length]
-                for i in range(math.ceil(len(input_ids)/segment_max_length))]
+        input_ids = out["input_ids"]
+        return [
+            input_ids[i * segment_max_length : (i + 1) * segment_max_length]
+            for i in range(math.ceil(len(input_ids) / segment_max_length))
+        ]

     return preprocess_the_pile


 def pretrain_collate_fn_gen(tokenizer, segment_max_length=1024):
-    '''
+    """
     Organize data into tensors by padding based on the preset maximum length.
-    '''
+    """
     pad_id = tokenizer.pad_id

     def pretrain_collate_fn(batch):
         input_ids = []
         for i in batch:
             input_len = len(i)
-            input_ids.append(i+[pad_id]*(segment_max_length-input_len))
+            input_ids.append(i + [pad_id] * (segment_max_length - input_len))
         inputs = {
-            'input_ids': torch.tensor(input_ids, dtype=torch.int64),
+            "input_ids": torch.tensor(input_ids, dtype=torch.int64),
         }
         return inputs

     return pretrain_collate_fn


-if __name__ == '__main__':
+if __name__ == "__main__":
     import sentencepiece as spm
     from datasets import IterableDataset
     from torch.utils.data import DataLoader
     from dataset.tokenizer import Tokenizer
     from dataset.data_iter import create_shard_kwargs, create_data_iter

-    sp_model = spm.SentencePieceProcessor(model_file='configs/10w_vocab_wudao5_pile10.model')
+    sp_model = spm.SentencePieceProcessor(
+        model_file="configs/10w_vocab_wudao5_pile10.model"
+    )
     tokenizer = Tokenizer(sp_model)
-    patterns = [
-        'data/pretrain_data/part-*.jsonl.zst'
-    ]
+    patterns = ["data/pretrain_data/part-*.jsonl.zst"]
     paths = create_shard_kwargs(patterns)
     transform_dict = {
-        'wudao': preprocess_wudao_gen(tokenizer),
-        'pile': preprocess_the_pile_gen(tokenizer)
+        "wudao": preprocess_wudao_gen(tokenizer),
+        "pile": preprocess_the_pile_gen(tokenizer),
     }
-    data_set = IterableDataset.from_generator(create_data_iter, gen_kwargs={'paths': paths, 'transform_dict': transform_dict})
-    train_loader = DataLoader(data_set, batch_size=8, num_workers=4,
-                              collate_fn=pretrain_collate_fn_gen(tokenizer), drop_last=True)
+    data_set = IterableDataset.from_generator(
+        create_data_iter, gen_kwargs={"paths": paths, "transform_dict": transform_dict}
+    )
+    train_loader = DataLoader(
+        data_set,
+        batch_size=8,
+        num_workers=4,
+        collate_fn=pretrain_collate_fn_gen(tokenizer),
+        drop_last=True,
+    )
     for batch in train_loader:
         for k, v in batch.items():
             print(k, v.shape)
-        break
+        break
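Both preprocessors rely on the same fixed-size chunking comprehension; pulled out on its own (a standalone sketch, not part of the commit) it behaves like this:

    import math

    def chunk(input_ids, segment_max_length=1024):
        # Identical slicing to preprocess_wudao / preprocess_the_pile above.
        return [
            input_ids[i * segment_max_length : (i + 1) * segment_max_length]
            for i in range(math.ceil(len(input_ids) / segment_max_length))
        ]

    print([len(s) for s in chunk(list(range(2500)))])  # [1024, 1024, 452]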

View File

@@ -1,4 +1,4 @@
-'''
+"""
 Author: LiangSong(sl12160010@gmail.com)
 Date: 2023-03-20 21:39:47
 LastEditors: LiangSong(sl12160010@gmail.com)
@@ -7,9 +7,10 @@ FilePath: /Open-Llama/dataset/tokenizer.py
 Description:
 Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved.
-'''
+"""
 import torch


 class Tokenizer:
     def __init__(self, sp_model):
         self.sp_model = sp_model
@@ -18,90 +19,130 @@ class Tokenizer:
         self.pad_id = self.sp_model.pad_id()
         self.vocab_size = self.sp_model.vocab_size()

-    def __call__(self, inputs, padding=None, max_length=256, return_tensors=False, truncation=False,
-                 add_special_tokens=True, return_mask=False):
+    def __call__(
+        self,
+        inputs,
+        padding=None,
+        max_length=256,
+        return_tensors=False,
+        truncation=False,
+        add_special_tokens=True,
+        return_mask=False,
+    ):
         if isinstance(inputs, str):
-            return self.encode(inputs, padding=padding, max_length=max_length,
-                               return_tensors=return_tensors, truncation=truncation, add_special_tokens=add_special_tokens, return_mask=return_mask)
+            return self.encode(
+                inputs,
+                padding=padding,
+                max_length=max_length,
+                return_tensors=return_tensors,
+                truncation=truncation,
+                add_special_tokens=add_special_tokens,
+                return_mask=return_mask,
+            )
         else:
-            return self.encode_batch(inputs, padding=padding, max_length=max_length,
-                                     return_tensors=return_tensors, truncation=truncation, add_special_tokens=add_special_tokens, return_mask=return_mask)
+            return self.encode_batch(
+                inputs,
+                padding=padding,
+                max_length=max_length,
+                return_tensors=return_tensors,
+                truncation=truncation,
+                add_special_tokens=add_special_tokens,
+                return_mask=return_mask,
+            )

-    def encode(self, inputs, padding=None, max_length=8192, return_tensors=False, truncation=False,
-               add_special_tokens=True, return_mask=False):
-        assert(isinstance(inputs, str))
+    def encode(
+        self,
+        inputs,
+        padding=None,
+        max_length=8192,
+        return_tensors=False,
+        truncation=False,
+        add_special_tokens=True,
+        return_mask=False,
+    ):
+        assert isinstance(inputs, str)
         input_ids = self.sp_model.Encode(inputs)
         if return_mask:
             attention_mask = [1] * len(input_ids)
         if truncation:
             # https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L780
             # Following the implementation in Transformers: the last position is always pad or eos.
-            input_ids = input_ids[: max_length-1]
+            input_ids = input_ids[: max_length - 1]
             if return_mask:
-                attention_mask = attention_mask[: max_length-1]
+                attention_mask = attention_mask[: max_length - 1]
         if add_special_tokens:
             input_ids = input_ids + [self.eos_id]
             if return_mask:
                 attention_mask = attention_mask + [0]
-        if padding == 'max_length':
-            input_ids = input_ids + [self.pad_id] * (max_length-len(input_ids))
+        if padding == "max_length":
+            input_ids = input_ids + [self.pad_id] * (max_length - len(input_ids))
             if return_mask:
-                attention_mask = attention_mask + [0] * (max_length-len(attention_mask))
+                attention_mask = attention_mask + [0] * (
+                    max_length - len(attention_mask)
+                )
         if return_tensors:
             input_ids = torch.tensor([input_ids])
             out = {
-                'input_ids': input_ids,
+                "input_ids": input_ids,
             }
             if return_mask:
                 attention_mask = torch.tensor([attention_mask])
-                out['attention_mask'] = attention_mask
+                out["attention_mask"] = attention_mask
         else:
             out = {
-                'input_ids': input_ids,
+                "input_ids": input_ids,
             }
             if return_mask:
-                out['attention_mask'] = attention_mask
+                out["attention_mask"] = attention_mask
         return out

-    def encode_batch(self, inputs, padding=None, max_length=8192, return_tensors=False, truncation=False,
-                     add_special_tokens=True, return_mask=False):
+    def encode_batch(
+        self,
+        inputs,
+        padding=None,
+        max_length=8192,
+        return_tensors=False,
+        truncation=False,
+        add_special_tokens=True,
+        return_mask=False,
+    ):
         input_ids = self.sp_model.Encode(inputs)
         if return_mask:
             attention_mask = [[1] * len(i) for i in input_ids]
         if truncation:
-            input_ids = [i[: max_length-1] for i in input_ids]
+            input_ids = [i[: max_length - 1] for i in input_ids]
             if return_mask:
-                attention_mask = [i[: max_length-1] for i in attention_mask]
+                attention_mask = [i[: max_length - 1] for i in attention_mask]
         if add_special_tokens:
-            input_ids = [i+[self.eos_id] for i in input_ids]
+            input_ids = [i + [self.eos_id] for i in input_ids]
             if return_mask:
-                attention_mask = [i+[0] for i in attention_mask]
-        if padding == 'max_length':
+                attention_mask = [i + [0] for i in attention_mask]
+        if padding == "max_length":
             input_ids_pad = []
             if return_mask:
                 attention_mask_pad = []
             for idx, i in enumerate(input_ids):
-                input_ids_pad.append(i + [self.pad_id] * (max_length-len(i)))
+                input_ids_pad.append(i + [self.pad_id] * (max_length - len(i)))
                 if return_mask:
                     j = attention_mask[idx]
-                    attention_mask_pad.append(j + [0] * (max_length-len(j)))
+                    attention_mask_pad.append(j + [0] * (max_length - len(j)))
             input_ids = input_ids_pad
             if return_mask:
                 attention_mask = attention_mask_pad
         if return_tensors:
             input_ids = torch.tensor(input_ids)
             out = {
-                'input_ids': input_ids,
+                "input_ids": input_ids,
             }
             if return_mask:
                 attention_mask = torch.tensor(attention_mask)
-                out['attention_mask'] = attention_mask
+                out["attention_mask"] = attention_mask
         else:
             out = {
-                'input_ids': input_ids,
+                "input_ids": input_ids,
             }
             if return_mask:
-                out['attention_mask'] = attention_mask
+                out["attention_mask"] = attention_mask
         return out

     def decode(self, inputs):
@@ -110,37 +151,48 @@ class Tokenizer:
         for i in inputs:
             if self.eos_id in i:
                 eos_idx = i.index(self.eos_id)
-                i = i[: eos_idx]
+                i = i[:eos_idx]
             out.append(i)
         out = self.sp_model.Decode(out)
         return out


-if __name__ == '__main__':
+if __name__ == "__main__":
     import sentencepiece as spm
     from unicodedata import normalize

     # sentencepiece may not be able to process some reserved keywords like '▁'.
-    sp_model = spm.SentencePieceProcessor(model_file='configs/10w_vocab_wudao5_pile10.model')
+    sp_model = spm.SentencePieceProcessor(
+        model_file="configs/10w_vocab_wudao5_pile10.model"
+    )
     tokenizer = Tokenizer(sp_model)
-    tmp = ['hello world',
-           '这是开源项目的V1版本this is the first version of a open-source project!',
-           '# this is a python script\nfor i in range(10):\n    print(i)\n    for j in range(10):\n        print(j)']
+    tmp = [
+        "hello world",
+        "这是开源项目的V1版本this is the first version of a open-source project!",
+        "# this is a python script\nfor i in range(10):\n    print(i)\n    for j in range(10):\n        print(j)",
+    ]
     print(tmp)
-    out = tokenizer(tmp, padding='max_length', return_tensors=True, max_length=64, truncation=True)
+    out = tokenizer(
+        tmp, padding="max_length", return_tensors=True, max_length=64, truncation=True
+    )
     for k, v in out.items():
         print(k, v.shape)
-    print(out['input_ids'])
-    out = tokenizer.decode(out['input_ids'])
+    print(out["input_ids"])
+    out = tokenizer.decode(out["input_ids"])
     print(out)
     for i, j in zip(tmp, out):
-        assert(normalize('NFKC', i) == j)
+        assert normalize("NFKC", i) == j

     from dataset.data_iter import create_shard_kwargs, create_data_iter

-    patterns = [
-        'data/pretrain_data/part-wudao*.jsonl.zst'
-    ]
+    patterns = ["data/pretrain_data/part-wudao*.jsonl.zst"]
     paths = create_shard_kwargs(patterns)
     data_iter = create_data_iter(paths)
     for i, data in enumerate(data_iter):
-        assert(normalize('NFKC', data['content']) == sp_model.Decode(sp_model.Encode(data['content'])) or '▁' in data['content'])
+        assert (
+            normalize("NFKC", data["content"])
+            == sp_model.Decode(sp_model.Encode(data["content"]))
+            or "▁" in data["content"]
+        )
         if i == 1000:
-            break
+            break
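One subtlety worth seeing in isolation: encode gives the appended eos a mask of 0 (matching the convention referenced above that the final position is always pad or eos), so only real tokens carry mask 1. A small sketch, assuming the model file from this repo exists (token ids depend on the trained model):

    import sentencepiece as spm
    from dataset.tokenizer import Tokenizer

    sp_model = spm.SentencePieceProcessor(model_file="configs/10w_vocab_wudao5_pile10.model")
    tokenizer = Tokenizer(sp_model)
    out = tokenizer("hello world", padding="max_length", max_length=6, truncation=True, return_mask=True)
    print(out["input_ids"])       # e.g. [id, id, eos_id, pad_id, pad_id, pad_id]
    print(out["attention_mask"])  # e.g. [1, 1, 0, 0, 0, 0], eos and padding masked out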

View File

@@ -1,4 +1,4 @@
-'''
+"""
 Author: LiangSong(sl12160010@gmail.com)
 Date: 2023-03-24 20:49:03
 LastEditors: LiangSong(sl12160010@gmail.com)
@@ -7,25 +7,25 @@ FilePath: /Open-Llama/dataset/train_tokenizer.py
 Description:
 Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved.
-'''
+"""
 import random
 from dataset.data_iter import create_data_iter, create_shard_kwargs

 wudao_patterns = [
-    'data/pretrain_data/part-wudao-*.jsonl.zst',
+    "data/pretrain_data/part-wudao-*.jsonl.zst",
 ]
 wudao_paths = create_shard_kwargs(wudao_patterns)
 random.shuffle(wudao_paths)
 pile_patterns = [
-    'data/pretrain_data/part-pile-*.jsonl.zst',
+    "data/pretrain_data/part-pile-*.jsonl.zst",
 ]
 pile_paths = create_shard_kwargs(pile_patterns)
 random.shuffle(pile_paths)
-paths = wudao_paths[: 5] + pile_paths[: 10]
+paths = wudao_paths[:5] + pile_paths[:10]
 transform_dict = {
-    'wudao': lambda line: [(line['title'] + '\n' + line['content'])],
-    'pile': lambda line: [line['text']]
+    "wudao": lambda line: [(line["title"] + "\n" + line["content"])],
+    "pile": lambda line: [line["text"]],
 }
 data_iter = create_data_iter(paths, transform_dict)
@@ -35,19 +35,30 @@ import sentencepiece as spm
 # Loads model from URL as iterator and stores the model to BytesIO.
 model = io.BytesIO()
 spm.SentencePieceTrainer.train(
-    sentence_iterator=data_iter, model_writer=model, shuffle_input_sentence=False, train_extremely_large_corpus=True,
-    # hyperparameters of tokenizer
-    max_sentence_length=16384, pad_id=3, model_type='BPE', vocab_size=100000,
-    # split digits and fallback to byte same as Llama.
-    # set split_by_unicode_script to True to avoid grouping punctuation and characters together.
-    split_digits=True, split_by_unicode_script=True, byte_fallback=True,
-    # reserve whitespace and \n and \t etc. for code generation
-    allow_whitespace_only_pieces=True, remove_extra_whitespaces=False, normalization_rule_name='nfkc')
+    sentence_iterator=data_iter,
+    model_writer=model,
+    shuffle_input_sentence=False,
+    train_extremely_large_corpus=True,
+    # hyperparameters of tokenizer
+    max_sentence_length=16384,
+    pad_id=3,
+    model_type="BPE",
+    vocab_size=100000,
+    # split digits and fallback to byte same as Llama.
+    # set split_by_unicode_script to True to avoid grouping punctuation and characters together.
+    split_digits=True,
+    split_by_unicode_script=True,
+    byte_fallback=True,
+    # reserve whitespace and \n and \t etc. for code generation
+    allow_whitespace_only_pieces=True,
+    remove_extra_whitespaces=False,
+    normalization_rule_name="nfkc",
+)

 # Serialize the model as file.
-with open('configs/10w_vocab_wudao5_pile10.model', 'wb') as f:
-    f.write(model.getvalue())
+with open("configs/10w_vocab_wudao5_pile10.model", "wb") as f:
+    f.write(model.getvalue())

 # Directly load the model from serialized model.
 sp = spm.SentencePieceProcessor(model_proto=model.getvalue())
-print(sp.decode(sp.encode('只因你太美🤗▃ \n 1')))
+print(sp.decode(sp.encode("只因你太美🤗▃ \n 1")))

View File

@@ -1,4 +1,4 @@
-'''
+"""
 Author: LiangSong(sl12160010@gmail.com)
 Date: 2023-03-18 00:06:41
 LastEditors: LiangSong(sl12160010@gmail.com)
@@ -7,21 +7,21 @@ FilePath: /Open-Llama/dataset/validation.py
 Description:
 Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved.
-'''
+"""
 val_set = [
-    '白日依山尽,',
-    '君不见,黄河之水天上来,奔流到海不复回。君不见,',
-    '秦孝公据崤函之固,拥雍州之地,君臣固守以窥周室,有席卷天下,包举宇内,囊括四海之意,并吞八荒之心。',
-    '古之学者必有师。师者,所以传道受业解惑也。人非生而知之者,孰能无惑?',
-    '当我醒来时,我发现自己在一个完全陌生的地方。我看到周围没有人,只有一张纸条。',
-    '这是一个斗气决定一切的大陆。在加玛帝国乌坦城,有个天才少年萧炎打破了所有族人的修炼纪录,一时间万人敬仰,众人艳羡。但不知为何,',
-    '人工智能技术在图像识别领域取得了很大的进展,然而在复杂场景下仍然存在一些问题,例如',
-    'In recent years, there has been increasing interest in the use of machine learning to',
-    '已知三个数分别为1, 2, 3则它们的平均数是',
-    '小明总共有15个苹果他分别给了3个人两个苹果然后自己又吃了一个苹果那么它还剩几个苹果',
-    '根据牛顿第二定律,物体的加速度等于',
-    '碳纳米管是一种新型的材料,具有非常独特的电学和光学性质。在过去的几年中,我们对碳纳',
-    '下面是一段用python写的快速排序的代码:',
-    'The quantum many-body problem is a fundamental problem in condensed matter physics. Despite decades of research, there is still no exact solution to this problem for large systems. In this paper, we propose a novel approach based on',
-    '下面是一个使用 PyTorch 和 Transformer 的示例代码用于训练一个文本分类模型import torch\nimport torch.nn as nn\nfrom torch.utils.data import DataLoader, Dataset'
-]
+    "白日依山尽,",
+    "君不见,黄河之水天上来,奔流到海不复回。君不见,",
+    "秦孝公据崤函之固,拥雍州之地,君臣固守以窥周室,有席卷天下,包举宇内,囊括四海之意,并吞八荒之心。",
+    "古之学者必有师。师者,所以传道受业解惑也。人非生而知之者,孰能无惑?",
+    "当我醒来时,我发现自己在一个完全陌生的地方。我看到周围没有人,只有一张纸条。",
+    "这是一个斗气决定一切的大陆。在加玛帝国乌坦城,有个天才少年萧炎打破了所有族人的修炼纪录,一时间万人敬仰,众人艳羡。但不知为何,",
+    "人工智能技术在图像识别领域取得了很大的进展,然而在复杂场景下仍然存在一些问题,例如",
+    "In recent years, there has been increasing interest in the use of machine learning to",
+    "已知三个数分别为1, 2, 3则它们的平均数是",
+    "小明总共有15个苹果他分别给了3个人两个苹果然后自己又吃了一个苹果那么它还剩几个苹果",
+    "根据牛顿第二定律,物体的加速度等于",
+    "碳纳米管是一种新型的材料,具有非常独特的电学和光学性质。在过去的几年中,我们对碳纳",
+    "下面是一段用python写的快速排序的代码:",
+    "The quantum many-body problem is a fundamental problem in condensed matter physics. Despite decades of research, there is still no exact solution to this problem for large systems. In this paper, we propose a novel approach based on",
+    "下面是一个使用 PyTorch 和 Transformer 的示例代码用于训练一个文本分类模型import torch\nimport torch.nn as nn\nfrom torch.utils.data import DataLoader, Dataset",
+]

View File

@@ -1,4 +1,4 @@
-'''
+"""
 Author: LiangSong(sl12160010@gmail.com)
 Date: 2023-03-17 13:21:33
 LastEditors: LiangSong(sl12160010@gmail.com)
@@ -9,4 +9,4 @@ Building the Llama model proposed by Meta. https://arxiv.org/pdf/2302.13971.pdf
 Performance and effectiveness optimization based on the implementation in the Transformer library.
 https://github.com/Bayes-Song/transformers
 Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved.
-'''
+"""

View File

@@ -1,4 +1,4 @@
-'''
+"""
 Author: LiangSong(sl12160010@gmail.com)
 Date: 2023-03-17 14:27:28
 LastEditors: LiangSong(sl12160010@gmail.com)
@@ -7,7 +7,7 @@ FilePath: /Open-Llama/pretrain_llama.py
 Description:
 pretrain GPT
 Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved.
-'''
+"""
 import os
 import time
 import wandb
@@ -24,15 +24,17 @@ from transformers import LlamaForCausalLM, LlamaConfig, get_cosine_schedule_with
 from dataset.validation import val_set
 from dataset.tokenizer import Tokenizer
 from dataset.data_iter import create_shard_kwargs, create_data_iter
-from dataset.pretrain_dataset import preprocess_the_pile_gen, preprocess_wudao_gen, pretrain_collate_fn_gen
+from dataset.pretrain_dataset import (
+    preprocess_the_pile_gen,
+    preprocess_wudao_gen,
+    pretrain_collate_fn_gen,
+)
 from configs.train_config import *

 accelerator = Accelerator()

 if accelerator.is_main_process:
-    wandb.init(
-        project='LLAMA Pretrain'
-    )
+    wandb.init(project="LLAMA Pretrain")

 log_interval *= accelerator.gradient_accumulation_steps
 eval_interval *= accelerator.gradient_accumulation_steps
@@ -44,51 +46,74 @@ tokenizer = Tokenizer(sp_model)
 paths = create_shard_kwargs(patterns)
 random.shuffle(paths)
 transform_dict = {
-    'wudao': preprocess_wudao_gen(tokenizer, max_length),
-    'pile': preprocess_the_pile_gen(tokenizer, max_length)
+    "wudao": preprocess_wudao_gen(tokenizer, max_length),
+    "pile": preprocess_the_pile_gen(tokenizer, max_length),
 }
-data_set = IterableDataset.from_generator(create_data_iter, gen_kwargs={
-    'paths': paths,
-    'transform_dict': transform_dict,
-    'process_index': accelerator.process_index,
-    'num_processes': accelerator.num_processes
-})
-train_loader = DataLoader(data_set, batch_size=train_batch_size, num_workers=1,
-                          collate_fn=pretrain_collate_fn_gen(tokenizer, max_length), drop_last=True)
+data_set = IterableDataset.from_generator(
+    create_data_iter,
+    gen_kwargs={
+        "paths": paths,
+        "transform_dict": transform_dict,
+        "process_index": accelerator.process_index,
+        "num_processes": accelerator.num_processes,
+    },
+)
+train_loader = DataLoader(
+    data_set,
+    batch_size=train_batch_size,
+    num_workers=1,
+    collate_fn=pretrain_collate_fn_gen(tokenizer, max_length),
+    drop_last=True,
+)
 # smaller initializer_range makes training more stable
 # add stable embedding to token embedding
-raw_model = LlamaForCausalLM(LlamaConfig(vocab_size=tokenizer.vocab_size,
-                                         initializer_range=initializer_range,
-                                         pad_token_id=tokenizer.pad_id,
-                                         rms_norm_eps=1e-5,
-                                         hidden_dropout_prob=0.1,
-                                         attention_dropout_prob=0.1,
-                                         use_stable_embedding=True,
-                                         shared_input_output_embedding=True))
+raw_model = LlamaForCausalLM(
+    LlamaConfig(
+        vocab_size=tokenizer.vocab_size,
+        initializer_range=initializer_range,
+        pad_token_id=tokenizer.pad_id,
+        rms_norm_eps=1e-5,
+        hidden_dropout_prob=0.1,
+        attention_dropout_prob=0.1,
+        use_stable_embedding=True,
+        shared_input_output_embedding=True,
+    )
+)
 raw_model.eval()
 with torch.no_grad():
     summary(raw_model.cuda(), input_data=torch.ones(1, 64, dtype=torch.int64).cuda())
 no_decay = ["bias", "LayerNorm.weight", "layernorm.weight"]
 optimizer_grouped_parameters = [
     {
-        "params": [p for n, p in raw_model.named_parameters() if not any(nd in n for nd in no_decay)],
+        "params": [
+            p
+            for n, p in raw_model.named_parameters()
+            if not any(nd in n for nd in no_decay)
+        ],
         "weight_decay": weight_decay,
     },
     {
-        "params": [p for n, p in raw_model.named_parameters() if any(nd in n for nd in no_decay)],
+        "params": [
+            p
+            for n, p in raw_model.named_parameters()
+            if any(nd in n for nd in no_decay)
+        ],
         "weight_decay": 0.0,
     },
 ]
 optim = FusedAdam(optimizer_grouped_parameters, lr=lr, betas=(0.9, 0.95))
 optim.zero_grad()
 factor = accelerator.num_processes / accelerator.gradient_accumulation_steps
-scheduler = get_cosine_schedule_with_warmup(optim, num_warmup_steps=num_warmup_steps * factor,
-                                            num_training_steps=num_training_steps * factor)
+scheduler = get_cosine_schedule_with_warmup(
+    optim,
+    num_warmup_steps=num_warmup_steps * factor,
+    num_training_steps=num_training_steps * factor,
+)
 _, model, optim, scheduler = accelerator.prepare(
     train_loader, raw_model, optim, scheduler
 )
-print('start training...')
+print("start training...")
 train_loader_iter = iter(train_loader)
 global_step = 0
 start_time = time.time()
@@ -98,13 +123,11 @@ for data_step in range(num_training_steps):
     batch = next(train_loader_iter)
     for k, v in batch.items():
         batch[k] = v.to(accelerator.device)
-    labels = batch['input_ids'].clone()
-    labels[labels==tokenizer.pad_id] = -100
+    labels = batch["input_ids"].clone()
+    labels[labels == tokenizer.pad_id] = -100
     out = model(**batch, labels=labels)
     total_loss = out.loss
-    losses = {
-        'total_loss': total_loss
-    }
+    losses = {"total_loss": total_loss}
     accelerator.backward(total_loss)
     optim.step()
     scheduler.step()
@@ -115,34 +138,41 @@ for data_step in range(num_training_steps):
         cost_time = time.time() - start_time
         start_time = time.time()
         tokens = train_batch_size * log_interval * max_length
-        wandb.log({'Training/Token per second per gpu': tokens/cost_time})
+        wandb.log({"Training/Token per second per gpu": tokens / cost_time})
         for k, v in losses.items():
-            wandb.log({'Losses/{}'.format(k): v})
-        current_lr = optim.param_groups[0]['lr']
-        wandb.log({'Training/LR': current_lr})
+            wandb.log({"Losses/{}".format(k): v})
+        current_lr = optim.param_groups[0]["lr"]
+        wandb.log({"Training/LR": current_lr})
         if optim.scaler is not None:
-            wandb.log({'Training/Loss Scale': optim.scaler.get_scale()})
-        wandb.log({'Training/Data Step': data_step})
-        wandb.log({'Training/Global Step': global_step})
-        accelerator.print('Global Step: {}, Data Step: {}, Loss: {}, Token per second per gpu: {}'.format(
-            global_step, data_step, losses['total_loss'], tokens/cost_time))
+            wandb.log({"Training/Loss Scale": optim.scaler.get_scale()})
+        wandb.log({"Training/Data Step": data_step})
+        wandb.log({"Training/Global Step": global_step})
+        accelerator.print(
+            "Global Step: {}, Data Step: {}, Loss: {}, Token per second per gpu: {}".format(
+                global_step, data_step, losses["total_loss"], tokens / cost_time
+            )
+        )
     if data_step % eval_interval == 0 and accelerator.is_main_process:
-        text_table = wandb.Table(columns=['question', 'pred'])
+        text_table = wandb.Table(columns=["question", "pred"])
         model.eval()
         with torch.no_grad():
             for data in val_set:
                 raw_inputs = data
                 inputs_len = len(raw_inputs)
-                inputs = tokenizer(raw_inputs, return_tensors=True, add_special_tokens=False)
+                inputs = tokenizer(
+                    raw_inputs, return_tensors=True, add_special_tokens=False
+                )
                 for k, v in inputs.items():
                     inputs[k] = v.to(accelerator.device)
-                pred = model.generate(**inputs, max_new_tokens=256, do_sample=True, repetition_penalty=2.0)
+                pred = model.generate(
+                    **inputs, max_new_tokens=256, do_sample=True, repetition_penalty=2.0
+                )
                 pred = tokenizer.decode(pred.cpu())[0]
                 pred = pred[inputs_len:]
                 text_table.add_data(raw_inputs, pred)
-        wandb.log({'Predictions on {}'.format(global_step) : text_table})
+        wandb.log({"Predictions on {}".format(global_step): text_table})
     if data_step % save_interval == 0 and data_step > 0 and accelerator.is_main_process:
         if not os.path.isdir(work_dir):
             os.mkdir(work_dir)
-        torch.save(raw_model.state_dict(), '{}/{}.pt'.format(work_dir, global_step))
+        torch.save(raw_model.state_dict(), "{}/{}.pt".format(work_dir, global_step))
 wandb.finish()
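The `labels[labels == tokenizer.pad_id] = -100` masking above relies on the PyTorch/Transformers convention that label -100 is ignored by the cross-entropy loss, so padded positions contribute no gradient. A minimal standalone sketch (pad id 3 matches the `pad_id=3` chosen in dataset/train_tokenizer.py):

    import torch

    input_ids = torch.tensor([[11, 12, 13, 3, 3]])  # 3 = pad_id
    labels = input_ids.clone()
    labels[labels == 3] = -100  # CrossEntropyLoss(ignore_index=-100) skips these
    print(labels)  # tensor([[  11,   12,   13, -100, -100]])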