reformat code with black

parent 8dd47aee90
commit 918a8cdc3d

@@ -5,12 +5,10 @@ num_warmup_steps = 2000
 initializer_range = 1e-2
 lr = 2e-4
 weight_decay = 1e-1
-tokenizer_model_path = 'configs/10w_vocab_wudao5_pile10.model'
-patterns = [
-    'data/pretrain_data/part-*.jsonl.zst'
-]
+tokenizer_model_path = "configs/10w_vocab_wudao5_pile10.model"
+patterns = ["data/pretrain_data/part-*.jsonl.zst"]
 # global step
 log_interval = 5
 eval_interval = 200
 save_interval = 800
-work_dir = 'data/saved_ckpt/'
+work_dir = "data/saved_ckpt/"

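Every hunk in this commit is mechanical: black's default style normalizes strings to double quotes, collapses or explodes argument lists around the line-length limit, and adds trailing commas. A minimal sketch of reproducing one of these rewrites through black's Python API, assuming black is installed (black.format_str and black.Mode are its public entry points):

import black

# Sketch: black's default mode applies the quote normalization seen above.
old_src = "tokenizer_model_path = 'configs/10w_vocab_wudao5_pile10.model'\n"
print(black.format_str(old_src, mode=black.Mode()))
# tokenizer_model_path = "configs/10w_vocab_wudao5_pile10.model"
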
@@ -1,4 +1,4 @@
-'''
+"""
 Author: LiangSong(sl12160010@gmail.com)
 Date: 2023-03-16 22:35:38
 LastEditors: LiangSong(sl12160010@gmail.com)
@@ -8,25 +8,25 @@ Description:
 Parse the dataset from the raw files and split them into different jsonl files based on the preset maximum number of lines,
 making it easy for parallel training to perform streaming reads.
 Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved.
-'''
+"""
 import json
 from glob import glob
 from tqdm import tqdm
 import zstandard as zstd

-paths = glob('data/the_pile/*.jsonl.zst')
-write_path = 'data/pretrain_data/part-pile-{}.jsonl.zst'
+paths = glob("data/the_pile/*.jsonl.zst")
+write_path = "data/pretrain_data/part-pile-{}.jsonl.zst"
 total_num = 0
 file_num = 0
-wfp = zstd.open(write_path.format(file_num), 'wb', encoding='utf-8')
+wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
 for path in tqdm(paths, total=len(paths)):
-    with zstd.open(path, 'r', encoding='utf-8') as fp:
+    with zstd.open(path, "r", encoding="utf-8") as fp:
         for line in fp:
             if total_num % 16384 == 0 and total_num > 0:
                 file_num += 1
                 wfp.close()
-                wfp = zstd.open(write_path.format(file_num), 'wb', encoding='utf-8')
-            wfp.write(line.encode('utf-8'))
+                wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
+            wfp.write(line.encode("utf-8"))
             total_num += 1
 wfp.close()
-print('total line: {}\ntotal files: {}'.format(total_num, file_num))
+print("total line: {}\ntotal files: {}".format(total_num, file_num))

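Each shard written by this script holds at most 16384 json lines and is zstd-compressed. A minimal sketch of streaming one shard back, assuming the zstandard package and an existing shard file (the path is illustrative):

import json
import zstandard as zstd

# Sketch: read the first record of a shard produced by the split above.
with zstd.open("data/pretrain_data/part-pile-0.jsonl.zst", "r", encoding="utf-8") as fp:
    for line in fp:
        record = json.loads(line)
        print(record["text"][:80])  # pile records carry their text in "text"
        break
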
@@ -1,4 +1,4 @@
-'''
+"""
 Author: LiangSong(sl12160010@gmail.com)
 Date: 2023-03-16 22:10:44
 LastEditors: LiangSong(sl12160010@gmail.com)
@@ -8,27 +8,27 @@ Description:
 Parse the dataset from the raw files and split them into different jsonl files based on the preset maximum number of lines,
 making it easy for parallel training to perform streaming reads.
 Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved.
-'''
+"""
 import json
 from glob import glob
 from tqdm import tqdm
 import zstandard as zstd

-paths = glob('data/WuDaoCorpus2.0_base_200G/part*')
-write_path = 'data/pretrain_data/part-wudao-{}.jsonl.zst'
+paths = glob("data/WuDaoCorpus2.0_base_200G/part*")
+write_path = "data/pretrain_data/part-wudao-{}.jsonl.zst"
 total_num = 0
 file_num = 0
-wfp = zstd.open(write_path.format(file_num), 'wb', encoding='utf-8')
+wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
 for path in tqdm(paths, total=len(paths)):
-    with open(path, 'r') as fp:
+    with open(path, "r") as fp:
         data = json.load(fp)
         for line in data:
             if total_num % 16384 == 0 and total_num > 0:
                 file_num += 1
                 wfp.close()
-                wfp = zstd.open(write_path.format(file_num), 'wb', encoding='utf-8')
-            wfp.write(json.dumps(line).encode('utf-8'))
-            wfp.write('\n'.encode('utf-8'))
+                wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
+            wfp.write(json.dumps(line).encode("utf-8"))
+            wfp.write("\n".encode("utf-8"))
             total_num += 1
 wfp.close()
-print('total line: {}\ntotal files: {}'.format(total_num, file_num))
+print("total line: {}\ntotal files: {}".format(total_num, file_num))

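Unlike the pile dumps, each WuDao part file is a single JSON array, so this script re-serializes every element with json.dumps and appends an explicit newline to get one record per line. The framing in miniature (self-contained sketch with made-up records):

import json

# Sketch: a JSON array re-framed as newline-delimited jsonl, as the loop above does.
data = [{"title": "t1", "content": "c1"}, {"title": "t2", "content": "c2"}]
jsonl = "".join(json.dumps(rec) + "\n" for rec in data)
print(jsonl, end="")
# {"title": "t1", "content": "c1"}
# {"title": "t2", "content": "c2"}
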
@@ -1,4 +1,4 @@
-'''
+"""
 Author: LiangSong(sl12160010@gmail.com)
 Date: 2023-03-17 19:32:20
 LastEditors: LiangSong(sl12160010@gmail.com)
@@ -7,50 +7,32 @@ FilePath: /Open-Llama/dataset/data_iter.py
 Description:

 Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved.
-'''
+"""
 import json
 from glob import glob
 import zstandard as zstd


 def create_data_iter(paths, transform_dict=None, process_index=0, num_processes=1):
-    '''
-    Currently, the allowed storage formats are jsonl and jsonl.zst.
+    """
+    Currently, the allowed storage formats are jsonl and jsonl.zst.
     Each line of the data is a dictionary, which can be parsed as JSON for subsequent processing after reading.
-    '''
+    """
     past = None
     for i, path in paths:
-        dataset_name = path.split('-')[-2]
+        dataset_name = path.split("-")[-2]
         if past != dataset_name:
-            print('Loading data from {}'.format(path))
+            print("Loading data from {}".format(path))
             past = path
         if num_processes > 1 and i % num_processes != process_index:
             continue
-        if path.endswith('jsonl.zst'):
-            with zstd.open(path, 'r', encoding='utf-8') as fp:
-                for line in fp:
-                    if isinstance(line, bytes):
-                        line = line.decode('utf-8')
-                    line = json.loads(line)
-                    line['dataset'] = dataset_name
-                    if transform_dict:
-                        line = transform_dict[dataset_name](line)
-                        if isinstance(line, str):
-                            yield line
-                        elif isinstance(line, list):
-                            for i in line:
-                                yield i
-                        else:
-                            raise Exception('Unsupported type in Transformation: {}'.format(transform_dict[dataset_name]))
-                    else:
-                        yield line
-        elif path.endswith('jsonl'):
-            with open(path, 'r') as fp:
+        if path.endswith("jsonl.zst"):
+            with zstd.open(path, "r", encoding="utf-8") as fp:
                 for line in fp:
                     if isinstance(line, bytes):
-                        line = line.decode('utf-8')
+                        line = line.decode("utf-8")
                     line = json.loads(line)
-                    line['dataset'] = dataset_name
+                    line["dataset"] = dataset_name
                     if transform_dict:
                         line = transform_dict[dataset_name](line)
                         if isinstance(line, str):
@@ -59,34 +41,57 @@ def create_data_iter(paths, transform_dict=None, process_index=0, num_processes=
                             yield line
                         elif isinstance(line, list):
                             for i in line:
                                 yield i
                         else:
-                            raise Exception('Unsupported type in Transformation: {}'.format(transform_dict[dataset_name]))
+                            raise Exception(
+                                "Unsupported type in Transformation: {}".format(
+                                    transform_dict[dataset_name]
+                                )
+                            )
                     else:
                         yield line
+        elif path.endswith("jsonl"):
+            with open(path, "r") as fp:
+                for line in fp:
+                    if isinstance(line, bytes):
+                        line = line.decode("utf-8")
+                    line = json.loads(line)
+                    line["dataset"] = dataset_name
+                    if transform_dict:
+                        line = transform_dict[dataset_name](line)
+                        if isinstance(line, str):
+                            yield line
+                        elif isinstance(line, list):
+                            for i in line:
+                                yield i
+                        else:
+                            raise Exception(
+                                "Unsupported type in Transformation: {}".format(
+                                    transform_dict[dataset_name]
+                                )
+                            )
+                    else:
+                        yield line
         else:
-            raise Exception('File format of {} is not supported yet.'.format(path))
+            raise Exception("File format of {} is not supported yet.".format(path))

+
 def create_shard_kwargs(patterns, repeat=1):
-    '''
-    Assign numbers to different shards of data to ensure that data is not duplicated
+    """
+    Assign numbers to different shards of data to ensure that data is not duplicated
     when allocated to different nodes during distributed training.
-    '''
+    """
     all_path = []
     for p in patterns:
         all_path.extend(glob(p))
     all_path *= repeat
     return [(i, p) for i, p in enumerate(all_path)]

-if __name__ == '__main__':
-    patterns = [
-        'data/pretrain_data/part-wudao*.jsonl.zst'
-    ]
+
+if __name__ == "__main__":
+    patterns = ["data/pretrain_data/part-wudao*.jsonl.zst"]
     paths = create_shard_kwargs(patterns)
-    transform_dict = {
-        'wudao': lambda x: x['title'],
-        'pile': lambda x: [x['text']]
-    }
+    transform_dict = {"wudao": lambda x: x["title"], "pile": lambda x: [x["text"]]}
     data_iter = create_data_iter(paths, transform_dict=transform_dict)
     for i, data in enumerate(data_iter):
         print(i, data)
         if i == 20:
             break

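The i % num_processes != process_index check above is what divides shards among data-parallel workers: create_shard_kwargs pairs every shard path with a stable index, and each worker keeps only the indices congruent to its rank. The assignment in isolation (self-contained sketch with made-up shard names):

# Sketch: round-robin shard assignment as done by create_data_iter's modulo check.
paths = list(enumerate(["part-0", "part-1", "part-2", "part-3", "part-4"]))
num_processes = 2
for process_index in range(num_processes):
    mine = [p for i, p in paths if i % num_processes == process_index]
    print(process_index, mine)
# 0 ['part-0', 'part-2', 'part-4']
# 1 ['part-1', 'part-3']
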
@@ -1,4 +1,4 @@
-'''
+"""
 Author: LiangSong(sl12160010@gmail.com)
 Date: 2023-03-17 20:41:25
 LastEditors: LiangSong(sl12160010@gmail.com)
@@ -7,76 +7,95 @@ FilePath: /Open-Llama/dataset/pretrain_dataset.py
 Description:

 Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved.
-'''
+"""
 import math
 import torch


 def preprocess_wudao_gen(tokenizer, segment_max_length=1024):
     def preprocess_wudao(line):
-        '''
+        """
         The format of the data is roughly as follows.
         {'id': 1, 'dataType': '百科', 'title': 'some title', 'content': 'some content'}
         Split the data based on the tokenized length according to the maximum length.
-        '''
-        total = line['title'] + '\n' + line['content']
+        """
+        total = line["title"] + "\n" + line["content"]
         out = tokenizer(total)
-        input_ids = out['input_ids']
-        return [input_ids[i*segment_max_length: (i+1)*segment_max_length]
-                for i in range(math.ceil(len(input_ids)/segment_max_length))]
+        input_ids = out["input_ids"]
+        return [
+            input_ids[i * segment_max_length : (i + 1) * segment_max_length]
+            for i in range(math.ceil(len(input_ids) / segment_max_length))
+        ]

     return preprocess_wudao


 def preprocess_the_pile_gen(tokenizer, segment_max_length=1024):
     def preprocess_the_pile(line):
-        '''
+        """
         The format of the data is roughly as follows.
         {'text': 'some text', 'meta': {'pile_set_name': 'Github'}}
         Split the data based on the tokenized length according to the maximum length.
-        '''
-        total = line['text']
+        """
+        total = line["text"]
         out = tokenizer(total)
-        input_ids = out['input_ids']
-        return [input_ids[i*segment_max_length: (i+1)*segment_max_length]
-                for i in range(math.ceil(len(input_ids)/segment_max_length))]
+        input_ids = out["input_ids"]
+        return [
+            input_ids[i * segment_max_length : (i + 1) * segment_max_length]
+            for i in range(math.ceil(len(input_ids) / segment_max_length))
+        ]

     return preprocess_the_pile


 def pretrain_collate_fn_gen(tokenizer, segment_max_length=1024):
-    '''
+    """
     Organize data into tensors by padding based on the preset maximum length.
-    '''
+    """
     pad_id = tokenizer.pad_id

     def pretrain_collate_fn(batch):
         input_ids = []
         for i in batch:
             input_len = len(i)
-            input_ids.append(i+[pad_id]*(segment_max_length-input_len))
+            input_ids.append(i + [pad_id] * (segment_max_length - input_len))
         inputs = {
-            'input_ids': torch.tensor(input_ids, dtype=torch.int64),
+            "input_ids": torch.tensor(input_ids, dtype=torch.int64),
         }
         return inputs

     return pretrain_collate_fn

-if __name__ == '__main__':
+
+if __name__ == "__main__":
     import sentencepiece as spm
     from datasets import IterableDataset
     from torch.utils.data import DataLoader

     from dataset.tokenizer import Tokenizer
     from dataset.data_iter import create_shard_kwargs, create_data_iter

-    sp_model = spm.SentencePieceProcessor(model_file='configs/10w_vocab_wudao5_pile10.model')
+    sp_model = spm.SentencePieceProcessor(
+        model_file="configs/10w_vocab_wudao5_pile10.model"
+    )
     tokenizer = Tokenizer(sp_model)
-    patterns = [
-        'data/pretrain_data/part-*.jsonl.zst'
-    ]
+    patterns = ["data/pretrain_data/part-*.jsonl.zst"]
     paths = create_shard_kwargs(patterns)
     transform_dict = {
-        'wudao': preprocess_wudao_gen(tokenizer),
-        'pile': preprocess_the_pile_gen(tokenizer)
+        "wudao": preprocess_wudao_gen(tokenizer),
+        "pile": preprocess_the_pile_gen(tokenizer),
     }
-    data_set = IterableDataset.from_generator(create_data_iter, gen_kwargs={'paths': paths, 'transform_dict': transform_dict})
-    train_loader = DataLoader(data_set, batch_size=8, num_workers=4,
-                              collate_fn=pretrain_collate_fn_gen(tokenizer), drop_last=True)
+    data_set = IterableDataset.from_generator(
+        create_data_iter, gen_kwargs={"paths": paths, "transform_dict": transform_dict}
+    )
+    train_loader = DataLoader(
+        data_set,
+        batch_size=8,
+        num_workers=4,
+        collate_fn=pretrain_collate_fn_gen(tokenizer),
+        drop_last=True,
+    )
     for batch in train_loader:
         for k, v in batch.items():
             print(k, v.shape)
         break

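Both preprocess closures reduce to the same operation: tokenize the whole document, then slice the id list into segment_max_length pieces, leaving a short tail that pretrain_collate_fn later pads. The slicing pattern on its own (self-contained sketch):

import math

# Sketch: the segment split shared by preprocess_wudao and preprocess_the_pile.
def split_segments(input_ids, segment_max_length=1024):
    return [
        input_ids[i * segment_max_length : (i + 1) * segment_max_length]
        for i in range(math.ceil(len(input_ids) / segment_max_length))
    ]

print([len(s) for s in split_segments(list(range(2500)))])  # [1024, 1024, 452]
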
@@ -1,4 +1,4 @@
-'''
+"""
 Author: LiangSong(sl12160010@gmail.com)
 Date: 2023-03-20 21:39:47
 LastEditors: LiangSong(sl12160010@gmail.com)
@@ -7,9 +7,10 @@ FilePath: /Open-Llama/dataset/tokenizer.py
 Description:

 Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved.
-'''
+"""
 import torch

+
 class Tokenizer:
     def __init__(self, sp_model):
         self.sp_model = sp_model
@@ -18,90 +19,130 @@ class Tokenizer:
         self.pad_id = self.sp_model.pad_id()
         self.vocab_size = self.sp_model.vocab_size()

-    def __call__(self, inputs, padding=None, max_length=256, return_tensors=False, truncation=False,
-                 add_special_tokens=True, return_mask=False):
+    def __call__(
+        self,
+        inputs,
+        padding=None,
+        max_length=256,
+        return_tensors=False,
+        truncation=False,
+        add_special_tokens=True,
+        return_mask=False,
+    ):
         if isinstance(inputs, str):
-            return self.encode(inputs, padding=padding, max_length=max_length,
-                               return_tensors=return_tensors, truncation=truncation, add_special_tokens=add_special_tokens, return_mask=return_mask)
+            return self.encode(
+                inputs,
+                padding=padding,
+                max_length=max_length,
+                return_tensors=return_tensors,
+                truncation=truncation,
+                add_special_tokens=add_special_tokens,
+                return_mask=return_mask,
+            )
         else:
-            return self.encode_batch(inputs, padding=padding, max_length=max_length,
-                                     return_tensors=return_tensors, truncation=truncation, add_special_tokens=add_special_tokens, return_mask=return_mask)
+            return self.encode_batch(
+                inputs,
+                padding=padding,
+                max_length=max_length,
+                return_tensors=return_tensors,
+                truncation=truncation,
+                add_special_tokens=add_special_tokens,
+                return_mask=return_mask,
+            )

-    def encode(self, inputs, padding=None, max_length=8192, return_tensors=False, truncation=False,
-               add_special_tokens=True, return_mask=False):
-        assert(isinstance(inputs, str))
+    def encode(
+        self,
+        inputs,
+        padding=None,
+        max_length=8192,
+        return_tensors=False,
+        truncation=False,
+        add_special_tokens=True,
+        return_mask=False,
+    ):
+        assert isinstance(inputs, str)
         input_ids = self.sp_model.Encode(inputs)
         if return_mask:
             attention_mask = [1] * len(input_ids)
         if truncation:
             # https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L780
             # Following the implementation in Transformers: by default the last position is always pad or eos.
-            input_ids = input_ids[: max_length-1]
+            input_ids = input_ids[: max_length - 1]
             if return_mask:
-                attention_mask = attention_mask[: max_length-1]
+                attention_mask = attention_mask[: max_length - 1]
         if add_special_tokens:
             input_ids = input_ids + [self.eos_id]
             if return_mask:
                 attention_mask = attention_mask + [0]
-        if padding == 'max_length':
-            input_ids = input_ids + [self.pad_id] * (max_length-len(input_ids))
+        if padding == "max_length":
+            input_ids = input_ids + [self.pad_id] * (max_length - len(input_ids))
             if return_mask:
-                attention_mask = attention_mask + [0] * (max_length-len(attention_mask))
+                attention_mask = attention_mask + [0] * (
+                    max_length - len(attention_mask)
+                )
         if return_tensors:
             input_ids = torch.tensor([input_ids])
             out = {
-                'input_ids': input_ids,
+                "input_ids": input_ids,
             }
             if return_mask:
                 attention_mask = torch.tensor([attention_mask])
-                out['attention_mask'] = attention_mask
+                out["attention_mask"] = attention_mask
         else:
             out = {
-                'input_ids': input_ids,
+                "input_ids": input_ids,
             }
             if return_mask:
-                out['attention_mask'] = attention_mask
+                out["attention_mask"] = attention_mask
         return out

-    def encode_batch(self, inputs, padding=None, max_length=8192, return_tensors=False, truncation=False,
-                     add_special_tokens=True, return_mask=False):
+    def encode_batch(
+        self,
+        inputs,
+        padding=None,
+        max_length=8192,
+        return_tensors=False,
+        truncation=False,
+        add_special_tokens=True,
+        return_mask=False,
+    ):
         input_ids = self.sp_model.Encode(inputs)
         if return_mask:
             attention_mask = [[1] * len(i) for i in input_ids]
         if truncation:
-            input_ids = [i[: max_length-1] for i in input_ids]
+            input_ids = [i[: max_length - 1] for i in input_ids]
             if return_mask:
-                attention_mask = [i[: max_length-1] for i in attention_mask]
+                attention_mask = [i[: max_length - 1] for i in attention_mask]
         if add_special_tokens:
-            input_ids = [i+[self.eos_id] for i in input_ids]
+            input_ids = [i + [self.eos_id] for i in input_ids]
             if return_mask:
-                attention_mask = [i+[0] for i in attention_mask]
-        if padding == 'max_length':
+                attention_mask = [i + [0] for i in attention_mask]
+        if padding == "max_length":
             input_ids_pad = []
             if return_mask:
                 attention_mask_pad = []
             for idx, i in enumerate(input_ids):
-                input_ids_pad.append(i + [self.pad_id] * (max_length-len(i)))
+                input_ids_pad.append(i + [self.pad_id] * (max_length - len(i)))
                 if return_mask:
                     j = attention_mask[idx]
-                    attention_mask_pad.append(j + [0] * (max_length-len(j)))
+                    attention_mask_pad.append(j + [0] * (max_length - len(j)))
             input_ids = input_ids_pad
             if return_mask:
                 attention_mask = attention_mask_pad
         if return_tensors:
             input_ids = torch.tensor(input_ids)
             out = {
-                'input_ids': input_ids,
+                "input_ids": input_ids,
             }
             if return_mask:
                 attention_mask = torch.tensor(attention_mask)
-                out['attention_mask'] = attention_mask
+                out["attention_mask"] = attention_mask
         else:
             out = {
-                'input_ids': input_ids,
+                "input_ids": input_ids,
             }
             if return_mask:
-                out['attention_mask'] = attention_mask
+                out["attention_mask"] = attention_mask
         return out

     def decode(self, inputs):
@@ -110,37 +151,48 @@ class Tokenizer:
         for i in inputs:
             if self.eos_id in i:
                 eos_idx = i.index(self.eos_id)
-                i = i[: eos_idx]
+                i = i[:eos_idx]
             out.append(i)
         out = self.sp_model.Decode(out)
         return out

-if __name__ == '__main__':
+
+if __name__ == "__main__":
     import sentencepiece as spm
     from unicodedata import normalize

     # Using sentencepiece may not be able to process some reserved keywords like '▁'.
-    sp_model = spm.SentencePieceProcessor(model_file='configs/10w_vocab_wudao5_pile10.model')
+    sp_model = spm.SentencePieceProcessor(
+        model_file="configs/10w_vocab_wudao5_pile10.model"
+    )
     tokenizer = Tokenizer(sp_model)
-    tmp = ['hello world',
-           '这是开源项目的V1版本,this is the first version of a open-source project!',
-           '# this is a python script\nfor i in range(10):\n    print(i)\n    for j in range(10):\n        print(j)']
+    tmp = [
+        "hello world",
+        "这是开源项目的V1版本,this is the first version of a open-source project!",
+        "# this is a python script\nfor i in range(10):\n    print(i)\n    for j in range(10):\n        print(j)",
+    ]
     print(tmp)
-    out = tokenizer(tmp, padding='max_length', return_tensors=True, max_length=64, truncation=True)
+    out = tokenizer(
+        tmp, padding="max_length", return_tensors=True, max_length=64, truncation=True
+    )
     for k, v in out.items():
         print(k, v.shape)
-    print(out['input_ids'])
-    out = tokenizer.decode(out['input_ids'])
+    print(out["input_ids"])
+    out = tokenizer.decode(out["input_ids"])
     print(out)
     for i, j in zip(tmp, out):
-        assert(normalize('NFKC', i) == j)
+        assert normalize("NFKC", i) == j

     from dataset.data_iter import create_shard_kwargs, create_data_iter
-    patterns = [
-        'data/pretrain_data/part-wudao*.jsonl.zst'
-    ]
+
+    patterns = ["data/pretrain_data/part-wudao*.jsonl.zst"]
     paths = create_shard_kwargs(patterns)
     data_iter = create_data_iter(paths)
     for i, data in enumerate(data_iter):
-        assert(normalize('NFKC', data['content']) == sp_model.Decode(sp_model.Encode(data['content'])) or '▁' in data['content'])
+        assert (
+            normalize("NFKC", data["content"])
+            == sp_model.Decode(sp_model.Encode(data["content"]))
+            or "▁" in data["content"]
+        )
         if i == 1000:
             break

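In encode, truncation cuts to max_length - 1 so that the eos appended by add_special_tokens never pushes the sequence past max_length, and padding="max_length" then fills with pad_id while the mask stays zero there (note that the eos position itself gets mask 0 in this implementation). The length bookkeeping on plain lists (self-contained sketch; the id values are illustrative):

# Sketch: truncate to (max_length - 1), append eos, pad to max_length.
eos_id, pad_id, max_length = 2, 3, 8
input_ids = list(range(100, 112))  # 12 token ids
attention_mask = [1] * len(input_ids)

input_ids = input_ids[: max_length - 1]            # leave room for eos
attention_mask = attention_mask[: max_length - 1]
input_ids += [eos_id]
attention_mask += [0]                              # eos masked, as in encode()
input_ids += [pad_id] * (max_length - len(input_ids))
attention_mask += [0] * (max_length - len(attention_mask))

print(input_ids)       # [100, 101, 102, 103, 104, 105, 106, 2]
print(attention_mask)  # [1, 1, 1, 1, 1, 1, 1, 0]
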
@@ -1,4 +1,4 @@
-'''
+"""
 Author: LiangSong(sl12160010@gmail.com)
 Date: 2023-03-24 20:49:03
 LastEditors: LiangSong(sl12160010@gmail.com)
@@ -7,25 +7,25 @@ FilePath: /Open-Llama/dataset/train_tokenizer.py
 Description:

 Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved.
-'''
+"""
 import random
 from dataset.data_iter import create_data_iter, create_shard_kwargs

 wudao_patterns = [
-    'data/pretrain_data/part-wudao-*.jsonl.zst',
+    "data/pretrain_data/part-wudao-*.jsonl.zst",
 ]
 wudao_paths = create_shard_kwargs(wudao_patterns)
 random.shuffle(wudao_paths)

 pile_patterns = [
-    'data/pretrain_data/part-pile-*.jsonl.zst',
+    "data/pretrain_data/part-pile-*.jsonl.zst",
 ]
 pile_paths = create_shard_kwargs(pile_patterns)
 random.shuffle(pile_paths)
-paths = wudao_paths[: 5] + pile_paths[: 10]
+paths = wudao_paths[:5] + pile_paths[:10]
 transform_dict = {
-    'wudao': lambda line: [(line['title'] + '\n' + line['content'])],
-    'pile': lambda line: [line['text']]
+    "wudao": lambda line: [(line["title"] + "\n" + line["content"])],
+    "pile": lambda line: [line["text"]],
 }
 data_iter = create_data_iter(paths, transform_dict)
@@ -35,19 +35,30 @@ import sentencepiece as spm
 # Loads model from URL as iterator and stores the model to BytesIO.
 model = io.BytesIO()
 spm.SentencePieceTrainer.train(
-    sentence_iterator=data_iter, model_writer=model, shuffle_input_sentence=False, train_extremely_large_corpus=True,
+    sentence_iterator=data_iter,
+    model_writer=model,
+    shuffle_input_sentence=False,
+    train_extremely_large_corpus=True,
     # hyperparameters of tokenizer
-    max_sentence_length=16384, pad_id=3, model_type='BPE', vocab_size=100000,
+    max_sentence_length=16384,
+    pad_id=3,
+    model_type="BPE",
+    vocab_size=100000,
     # split digits and fallback to byte same as Llama.
     # set split_by_unicode_script to True to avoid grouping punctuation and characters together.
-    split_digits=True, split_by_unicode_script=True, byte_fallback=True,
+    split_digits=True,
+    split_by_unicode_script=True,
+    byte_fallback=True,
     # reserve whitespace and \n and \t etc. for code generation
-    allow_whitespace_only_pieces=True, remove_extra_whitespaces=False, normalization_rule_name='nfkc')
+    allow_whitespace_only_pieces=True,
+    remove_extra_whitespaces=False,
+    normalization_rule_name="nfkc",
+)

 # Serialize the model as file.
-with open('configs/10w_vocab_wudao5_pile10.model', 'wb') as f:
+with open("configs/10w_vocab_wudao5_pile10.model", "wb") as f:
     f.write(model.getvalue())

 # Directly load the model from serialized model.
 sp = spm.SentencePieceProcessor(model_proto=model.getvalue())
-print(sp.decode(sp.encode('只因你太美🤗▃ \n 1')))
+print(sp.decode(sp.encode("只因你太美🤗▃ \n 1")))

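sentencepiece disables the pad piece by default (pad_id() returns -1), so passing pad_id=3 at training time reserves an explicit pad token right after the default unk/bos/eos ids; Tokenizer.__init__ later reads it back via sp_model.pad_id(). A quick check against the serialized model (assuming the file from the training run above exists):

import sentencepiece as spm

# Sketch: inspect the special-token ids baked into the trained model.
sp = spm.SentencePieceProcessor(model_file="configs/10w_vocab_wudao5_pile10.model")
print(sp.unk_id(), sp.bos_id(), sp.eos_id(), sp.pad_id())  # expected: 0 1 2 3
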
@@ -1,4 +1,4 @@
-'''
+"""
 Author: LiangSong(sl12160010@gmail.com)
 Date: 2023-03-18 00:06:41
 LastEditors: LiangSong(sl12160010@gmail.com)
@@ -7,21 +7,21 @@ FilePath: /Open-Llama/dataset/validation.py
 Description:

 Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved.
-'''
+"""
 val_set = [
-    '白日依山尽,',
-    '君不见,黄河之水天上来,奔流到海不复回。君不见,',
-    '秦孝公据崤函之固,拥雍州之地,君臣固守以窥周室,有席卷天下,包举宇内,囊括四海之意,并吞八荒之心。',
-    '古之学者必有师。师者,所以传道受业解惑也。人非生而知之者,孰能无惑?',
-    '当我醒来时,我发现自己在一个完全陌生的地方。我看到周围没有人,只有一张纸条。',
-    '这是一个斗气决定一切的大陆。在加玛帝国乌坦城,有个天才少年萧炎打破了所有族人的修炼纪录,一时间万人敬仰,众人艳羡。但不知为何,',
-    '人工智能技术在图像识别领域取得了很大的进展,然而在复杂场景下仍然存在一些问题,例如',
-    'In recent years, there has been increasing interest in the use of machine learning to',
-    '已知三个数分别为1, 2, 3,则它们的平均数是',
-    '小明总共有15个苹果,他分别给了3个人两个苹果,然后自己又吃了一个苹果,那么它还剩几个苹果?',
-    '根据牛顿第二定律,物体的加速度等于',
-    '碳纳米管是一种新型的材料,具有非常独特的电学和光学性质。在过去的几年中,我们对碳纳',
-    '下面是一段用python写的快速排序的代码:',
-    'The quantum many-body problem is a fundamental problem in condensed matter physics. Despite decades of research, there is still no exact solution to this problem for large systems. In this paper, we propose a novel approach based on',
-    '下面是一个使用 PyTorch 和 Transformer 的示例代码,用于训练一个文本分类模型:import torch\nimport torch.nn as nn\nfrom torch.utils.data import DataLoader, Dataset'
-]
+    "白日依山尽,",
+    "君不见,黄河之水天上来,奔流到海不复回。君不见,",
+    "秦孝公据崤函之固,拥雍州之地,君臣固守以窥周室,有席卷天下,包举宇内,囊括四海之意,并吞八荒之心。",
+    "古之学者必有师。师者,所以传道受业解惑也。人非生而知之者,孰能无惑?",
+    "当我醒来时,我发现自己在一个完全陌生的地方。我看到周围没有人,只有一张纸条。",
+    "这是一个斗气决定一切的大陆。在加玛帝国乌坦城,有个天才少年萧炎打破了所有族人的修炼纪录,一时间万人敬仰,众人艳羡。但不知为何,",
+    "人工智能技术在图像识别领域取得了很大的进展,然而在复杂场景下仍然存在一些问题,例如",
+    "In recent years, there has been increasing interest in the use of machine learning to",
+    "已知三个数分别为1, 2, 3,则它们的平均数是",
+    "小明总共有15个苹果,他分别给了3个人两个苹果,然后自己又吃了一个苹果,那么它还剩几个苹果?",
+    "根据牛顿第二定律,物体的加速度等于",
+    "碳纳米管是一种新型的材料,具有非常独特的电学和光学性质。在过去的几年中,我们对碳纳",
+    "下面是一段用python写的快速排序的代码:",
+    "The quantum many-body problem is a fundamental problem in condensed matter physics. Despite decades of research, there is still no exact solution to this problem for large systems. In this paper, we propose a novel approach based on",
+    "下面是一个使用 PyTorch 和 Transformer 的示例代码,用于训练一个文本分类模型:import torch\nimport torch.nn as nn\nfrom torch.utils.data import DataLoader, Dataset",
+]

@@ -1,4 +1,4 @@
-'''
+"""
 Author: LiangSong(sl12160010@gmail.com)
 Date: 2023-03-17 13:21:33
 LastEditors: LiangSong(sl12160010@gmail.com)
@@ -9,4 +9,4 @@ Building the Llama model proposed by Meta. https://arxiv.org/pdf/2302.13971.pdf
 Performance and effectiveness optimization based on the implementation in the Transformer library.
 https://github.com/Bayes-Song/transformers
 Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved.
-'''
+"""

@@ -1,4 +1,4 @@
-'''
+"""
 Author: LiangSong(sl12160010@gmail.com)
 Date: 2023-03-17 14:27:28
 LastEditors: LiangSong(sl12160010@gmail.com)
@@ -7,7 +7,7 @@ FilePath: /Open-Llama/pretrain_llama.py
 Description:
 pretrain GPT
 Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved.
-'''
+"""
 import os
 import time
 import wandb
@@ -24,15 +24,17 @@ from transformers import LlamaForCausalLM, LlamaConfig, get_cosine_schedule_with
 from dataset.validation import val_set
 from dataset.tokenizer import Tokenizer
 from dataset.data_iter import create_shard_kwargs, create_data_iter
-from dataset.pretrain_dataset import preprocess_the_pile_gen, preprocess_wudao_gen, pretrain_collate_fn_gen
+from dataset.pretrain_dataset import (
+    preprocess_the_pile_gen,
+    preprocess_wudao_gen,
+    pretrain_collate_fn_gen,
+)
 from configs.train_config import *

 accelerator = Accelerator()

 if accelerator.is_main_process:
-    wandb.init(
-        project='LLAMA Pretrain'
-    )
+    wandb.init(project="LLAMA Pretrain")

 log_interval *= accelerator.gradient_accumulation_steps
 eval_interval *= accelerator.gradient_accumulation_steps
@@ -44,51 +46,74 @@ tokenizer = Tokenizer(sp_model)
 paths = create_shard_kwargs(patterns)
 random.shuffle(paths)
 transform_dict = {
-    'wudao': preprocess_wudao_gen(tokenizer, max_length),
-    'pile': preprocess_the_pile_gen(tokenizer, max_length)
+    "wudao": preprocess_wudao_gen(tokenizer, max_length),
+    "pile": preprocess_the_pile_gen(tokenizer, max_length),
 }
-data_set = IterableDataset.from_generator(create_data_iter, gen_kwargs={
-    'paths': paths,
-    'transform_dict': transform_dict,
-    'process_index': accelerator.process_index,
-    'num_processes': accelerator.num_processes
-})
-train_loader = DataLoader(data_set, batch_size=train_batch_size, num_workers=1,
-                          collate_fn=pretrain_collate_fn_gen(tokenizer, max_length), drop_last=True)
+data_set = IterableDataset.from_generator(
+    create_data_iter,
+    gen_kwargs={
+        "paths": paths,
+        "transform_dict": transform_dict,
+        "process_index": accelerator.process_index,
+        "num_processes": accelerator.num_processes,
+    },
+)
+train_loader = DataLoader(
+    data_set,
+    batch_size=train_batch_size,
+    num_workers=1,
+    collate_fn=pretrain_collate_fn_gen(tokenizer, max_length),
+    drop_last=True,
+)
 # smaller initializer_range makes training more stable
 # add stable embedding to token embedding
-raw_model = LlamaForCausalLM(LlamaConfig(vocab_size=tokenizer.vocab_size,
-                                         initializer_range=initializer_range,
-                                         pad_token_id=tokenizer.pad_id,
-                                         rms_norm_eps=1e-5,
-                                         hidden_dropout_prob=0.1,
-                                         attention_dropout_prob=0.1,
-                                         use_stable_embedding=True,
-                                         shared_input_output_embedding=True))
+raw_model = LlamaForCausalLM(
+    LlamaConfig(
+        vocab_size=tokenizer.vocab_size,
+        initializer_range=initializer_range,
+        pad_token_id=tokenizer.pad_id,
+        rms_norm_eps=1e-5,
+        hidden_dropout_prob=0.1,
+        attention_dropout_prob=0.1,
+        use_stable_embedding=True,
+        shared_input_output_embedding=True,
+    )
+)
 raw_model.eval()
 with torch.no_grad():
     summary(raw_model.cuda(), input_data=torch.ones(1, 64, dtype=torch.int64).cuda())
 no_decay = ["bias", "LayerNorm.weight", "layernorm.weight"]
 optimizer_grouped_parameters = [
     {
-        "params": [p for n, p in raw_model.named_parameters() if not any(nd in n for nd in no_decay)],
+        "params": [
+            p
+            for n, p in raw_model.named_parameters()
+            if not any(nd in n for nd in no_decay)
+        ],
         "weight_decay": weight_decay,
     },
     {
-        "params": [p for n, p in raw_model.named_parameters() if any(nd in n for nd in no_decay)],
+        "params": [
+            p
+            for n, p in raw_model.named_parameters()
+            if any(nd in n for nd in no_decay)
+        ],
         "weight_decay": 0.0,
     },
 ]
 optim = FusedAdam(optimizer_grouped_parameters, lr=lr, betas=(0.9, 0.95))
 optim.zero_grad()
 factor = accelerator.num_processes / accelerator.gradient_accumulation_steps
-scheduler = get_cosine_schedule_with_warmup(optim, num_warmup_steps=num_warmup_steps * factor,
-                                            num_training_steps=num_training_steps * factor)
+scheduler = get_cosine_schedule_with_warmup(
+    optim,
+    num_warmup_steps=num_warmup_steps * factor,
+    num_training_steps=num_training_steps * factor,
+)

 _, model, optim, scheduler = accelerator.prepare(
     train_loader, raw_model, optim, scheduler
 )
-print('start training...')
+print("start training...")
 train_loader_iter = iter(train_loader)
 global_step = 0
 start_time = time.time()
@@ -98,13 +123,11 @@ for data_step in range(num_training_steps):
     batch = next(train_loader_iter)
     for k, v in batch.items():
         batch[k] = v.to(accelerator.device)
-    labels = batch['input_ids'].clone()
-    labels[labels==tokenizer.pad_id] = -100
+    labels = batch["input_ids"].clone()
+    labels[labels == tokenizer.pad_id] = -100
     out = model(**batch, labels=labels)
     total_loss = out.loss
-    losses = {
-        'total_loss': total_loss
-    }
+    losses = {"total_loss": total_loss}
     accelerator.backward(total_loss)
     optim.step()
     scheduler.step()
@@ -115,34 +138,41 @@ for data_step in range(num_training_steps):
         cost_time = time.time() - start_time
         start_time = time.time()
         tokens = train_batch_size * log_interval * max_length
-        wandb.log({'Training/Token per second per gpu': tokens/cost_time})
+        wandb.log({"Training/Token per second per gpu": tokens / cost_time})
         for k, v in losses.items():
-            wandb.log({'Losses/{}'.format(k): v})
-        current_lr = optim.param_groups[0]['lr']
-        wandb.log({'Training/LR': current_lr})
+            wandb.log({"Losses/{}".format(k): v})
+        current_lr = optim.param_groups[0]["lr"]
+        wandb.log({"Training/LR": current_lr})
         if optim.scaler is not None:
-            wandb.log({'Training/Loss Scale': optim.scaler.get_scale()})
-        wandb.log({'Training/Data Step': data_step})
-        wandb.log({'Training/Global Step': global_step})
-        accelerator.print('Global Step: {}, Data Step: {}, Loss: {}, Token per second per gpu: {}'.format(
-            global_step, data_step, losses['total_loss'], tokens/cost_time))
+            wandb.log({"Training/Loss Scale": optim.scaler.get_scale()})
+        wandb.log({"Training/Data Step": data_step})
+        wandb.log({"Training/Global Step": global_step})
+        accelerator.print(
+            "Global Step: {}, Data Step: {}, Loss: {}, Token per second per gpu: {}".format(
+                global_step, data_step, losses["total_loss"], tokens / cost_time
+            )
+        )
     if data_step % eval_interval == 0 and accelerator.is_main_process:
-        text_table = wandb.Table(columns=['question', 'pred'])
+        text_table = wandb.Table(columns=["question", "pred"])
         model.eval()
         with torch.no_grad():
            for data in val_set:
                 raw_inputs = data
                 inputs_len = len(raw_inputs)
-                inputs = tokenizer(raw_inputs, return_tensors=True, add_special_tokens=False)
+                inputs = tokenizer(
+                    raw_inputs, return_tensors=True, add_special_tokens=False
+                )
                 for k, v in inputs.items():
                     inputs[k] = v.to(accelerator.device)
-                pred = model.generate(**inputs, max_new_tokens=256, do_sample=True, repetition_penalty=2.0)
+                pred = model.generate(
+                    **inputs, max_new_tokens=256, do_sample=True, repetition_penalty=2.0
+                )
                 pred = tokenizer.decode(pred.cpu())[0]
                 pred = pred[inputs_len:]
                 text_table.add_data(raw_inputs, pred)
-        wandb.log({'Predictions on {}'.format(global_step) : text_table})
+        wandb.log({"Predictions on {}".format(global_step): text_table})
     if data_step % save_interval == 0 and data_step > 0 and accelerator.is_main_process:
         if not os.path.isdir(work_dir):
             os.mkdir(work_dir)
-        torch.save(raw_model.state_dict(), '{}/{}.pt'.format(work_dir, global_step))
+        torch.save(raw_model.state_dict(), "{}/{}.pt".format(work_dir, global_step))
 wandb.finish()

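The label masking in the training loop relies on -100 being the ignore_index of PyTorch's cross entropy, which the Hugging Face causal-LM loss uses, so padded positions contribute nothing to the loss. The step in isolation (self-contained sketch; the pad_id value is illustrative):

import torch

# Sketch: clone input_ids as labels and silence pad positions with -100.
pad_id = 3  # in the real script this comes from the sentencepiece model
input_ids = torch.tensor([[5, 9, 11, pad_id, pad_id]])
labels = input_ids.clone()
labels[labels == pad_id] = -100
print(labels)  # tensor([[   5,    9,   11, -100, -100]])
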