diff --git a/configs/train_config.py b/configs/train_config.py index b0ceb80..aede51b 100644 --- a/configs/train_config.py +++ b/configs/train_config.py @@ -5,12 +5,10 @@ num_warmup_steps = 2000 initializer_range = 1e-2 lr = 2e-4 weight_decay = 1e-1 -tokenizer_model_path = 'configs/10w_vocab_wudao5_pile10.model' -patterns = [ - 'data/pretrain_data/part-*.jsonl.zst' -] +tokenizer_model_path = "configs/10w_vocab_wudao5_pile10.model" +patterns = ["data/pretrain_data/part-*.jsonl.zst"] # global step log_interval = 5 eval_interval = 200 save_interval = 800 -work_dir = 'data/saved_ckpt/' \ No newline at end of file +work_dir = "data/saved_ckpt/" diff --git a/data/preprocess_the_pile.py b/data/preprocess_the_pile.py index 3b7a6f0..860aa4a 100644 --- a/data/preprocess_the_pile.py +++ b/data/preprocess_the_pile.py @@ -1,4 +1,4 @@ -''' +""" Author: LiangSong(sl12160010@gmail.com) Date: 2023-03-16 22:35:38 LastEditors: LiangSong(sl12160010@gmail.com) @@ -8,25 +8,25 @@ Description: Parse the dataset from the raw files and split them into different jsonl files based on the preset maximum number of lines, making it easy for parallel training to perform streaming reads. Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved. -''' +""" import json from glob import glob from tqdm import tqdm import zstandard as zstd -paths = glob('data/the_pile/*.jsonl.zst') -write_path = 'data/pretrain_data/part-pile-{}.jsonl.zst' +paths = glob("data/the_pile/*.jsonl.zst") +write_path = "data/pretrain_data/part-pile-{}.jsonl.zst" total_num = 0 file_num = 0 -wfp = zstd.open(write_path.format(file_num), 'wb', encoding='utf-8') +wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8") for path in tqdm(paths, total=len(paths)): - with zstd.open(path, 'r', encoding='utf-8') as fp: + with zstd.open(path, "r", encoding="utf-8") as fp: for line in fp: if total_num % 16384 == 0 and total_num > 0: file_num += 1 wfp.close() - wfp = zstd.open(write_path.format(file_num), 'wb', encoding='utf-8') - wfp.write(line.encode('utf-8')) + wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8") + wfp.write(line.encode("utf-8")) total_num += 1 wfp.close() -print('total line: {}\ntotal files: {}'.format(total_num, file_num)) \ No newline at end of file +print("total line: {}\ntotal files: {}".format(total_num, file_num)) diff --git a/data/preprocess_wudao.py b/data/preprocess_wudao.py index 10f4abb..d8aecc5 100644 --- a/data/preprocess_wudao.py +++ b/data/preprocess_wudao.py @@ -1,4 +1,4 @@ -''' +""" Author: LiangSong(sl12160010@gmail.com) Date: 2023-03-16 22:10:44 LastEditors: LiangSong(sl12160010@gmail.com) @@ -8,27 +8,27 @@ Description: Parse the dataset from the raw files and split them into different jsonl files based on the preset maximum number of lines, making it easy for parallel training to perform streaming reads. Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved. -''' +""" import json from glob import glob from tqdm import tqdm import zstandard as zstd -paths = glob('data/WuDaoCorpus2.0_base_200G/part*') -write_path = 'data/pretrain_data/part-wudao-{}.jsonl.zst' +paths = glob("data/WuDaoCorpus2.0_base_200G/part*") +write_path = "data/pretrain_data/part-wudao-{}.jsonl.zst" total_num = 0 file_num = 0 -wfp = zstd.open(write_path.format(file_num), 'wb', encoding='utf-8') +wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8") for path in tqdm(paths, total=len(paths)): - with open(path, 'r') as fp: + with open(path, "r") as fp: data = json.load(fp) for line in data: if total_num % 16384 == 0 and total_num > 0: file_num += 1 wfp.close() - wfp = zstd.open(write_path.format(file_num), 'wb', encoding='utf-8') - wfp.write(json.dumps(line).encode('utf-8')) - wfp.write('\n'.encode('utf-8')) + wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8") + wfp.write(json.dumps(line).encode("utf-8")) + wfp.write("\n".encode("utf-8")) total_num += 1 wfp.close() -print('total line: {}\ntotal files: {}'.format(total_num, file_num)) \ No newline at end of file +print("total line: {}\ntotal files: {}".format(total_num, file_num)) diff --git a/dataset/data_iter.py b/dataset/data_iter.py index 05c8c85..9088051 100644 --- a/dataset/data_iter.py +++ b/dataset/data_iter.py @@ -1,4 +1,4 @@ -''' +""" Author: LiangSong(sl12160010@gmail.com) Date: 2023-03-17 19:32:20 LastEditors: LiangSong(sl12160010@gmail.com) @@ -7,50 +7,32 @@ FilePath: /Open-Llama/dataset/data_iter.py Description: Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved. -''' +""" import json from glob import glob import zstandard as zstd def create_data_iter(paths, transform_dict=None, process_index=0, num_processes=1): - ''' - Currently, the allowed storage formats are jsonl and jsonl.zst. + """ + Currently, the allowed storage formats are jsonl and jsonl.zst. Each line of the data is a dictionary, which can be parsed as JSON for subsequent processing after reading. - ''' + """ past = None for i, path in paths: - dataset_name = path.split('-')[-2] + dataset_name = path.split("-")[-2] if past != dataset_name: - print('Loading data from {}'.format(path)) + print("Loading data from {}".format(path)) past = path if num_processes > 1 and i % num_processes != process_index: continue - if path.endswith('jsonl.zst'): - with zstd.open(path, 'r', encoding='utf-8') as fp: - for line in fp: - if isinstance(line, bytes): - line = line.decode('utf-8') - line = json.loads(line) - line['dataset'] = dataset_name - if transform_dict: - line = transform_dict[dataset_name](line) - if isinstance(line, str): - yield line - elif isinstance(line, list): - for i in line: - yield i - else: - raise Exception('Unsupported type in Transformation: {}'.format(transform_dict[dataset_name])) - else: - yield line - elif path.endswith('jsonl'): - with open(path, 'r') as fp: + if path.endswith("jsonl.zst"): + with zstd.open(path, "r", encoding="utf-8") as fp: for line in fp: if isinstance(line, bytes): - line = line.decode('utf-8') + line = line.decode("utf-8") line = json.loads(line) - line['dataset'] = dataset_name + line["dataset"] = dataset_name if transform_dict: line = transform_dict[dataset_name](line) if isinstance(line, str): @@ -59,34 +41,57 @@ def create_data_iter(paths, transform_dict=None, process_index=0, num_processes= for i in line: yield i else: - raise Exception('Unsupported type in Transformation: {}'.format(transform_dict[dataset_name])) + raise Exception( + "Unsupported type in Transformation: {}".format( + transform_dict[dataset_name] + ) + ) + else: + yield line + elif path.endswith("jsonl"): + with open(path, "r") as fp: + for line in fp: + if isinstance(line, bytes): + line = line.decode("utf-8") + line = json.loads(line) + line["dataset"] = dataset_name + if transform_dict: + line = transform_dict[dataset_name](line) + if isinstance(line, str): + yield line + elif isinstance(line, list): + for i in line: + yield i + else: + raise Exception( + "Unsupported type in Transformation: {}".format( + transform_dict[dataset_name] + ) + ) else: yield line else: - raise Exception('File format of {} is not supported yet.'.format(path)) + raise Exception("File format of {} is not supported yet.".format(path)) + def create_shard_kwargs(patterns, repeat=1): - ''' - Assign numbers to different shards of data to ensure that data is not duplicated + """ + Assign numbers to different shards of data to ensure that data is not duplicated when allocated to different nodes during distributed training. - ''' + """ all_path = [] for p in patterns: all_path.extend(glob(p)) all_path *= repeat return [(i, p) for i, p in enumerate(all_path)] -if __name__ == '__main__': - patterns = [ - 'data/pretrain_data/part-wudao*.jsonl.zst' - ] + +if __name__ == "__main__": + patterns = ["data/pretrain_data/part-wudao*.jsonl.zst"] paths = create_shard_kwargs(patterns) - transform_dict = { - 'wudao': lambda x: x['title'], - 'pile': lambda x: [x['text']] - } + transform_dict = {"wudao": lambda x: x["title"], "pile": lambda x: [x["text"]]} data_iter = create_data_iter(paths, transform_dict=transform_dict) for i, data in enumerate(data_iter): print(i, data) if i == 20: - break \ No newline at end of file + break diff --git a/dataset/pretrain_dataset.py b/dataset/pretrain_dataset.py index b7bc395..55de7e3 100644 --- a/dataset/pretrain_dataset.py +++ b/dataset/pretrain_dataset.py @@ -1,4 +1,4 @@ -''' +""" Author: LiangSong(sl12160010@gmail.com) Date: 2023-03-17 20:41:25 LastEditors: LiangSong(sl12160010@gmail.com) @@ -7,76 +7,95 @@ FilePath: /Open-Llama/dataset/pretrain_dataset.py Description: Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved. -''' +""" import math import torch + def preprocess_wudao_gen(tokenizer, segment_max_length=1024): def preprocess_wudao(line): - ''' + """ The format of the data is roughly as follows. {'id': 1, 'dataType': '百科', 'title': 'some title', 'content': 'some content'} Split the data based on the tokenized length according to the maximum length. - ''' - total = line['title'] + '\n' + line['content'] + """ + total = line["title"] + "\n" + line["content"] out = tokenizer(total) - input_ids = out['input_ids'] - return [input_ids[i*segment_max_length: (i+1)*segment_max_length] - for i in range(math.ceil(len(input_ids)/segment_max_length))] + input_ids = out["input_ids"] + return [ + input_ids[i * segment_max_length : (i + 1) * segment_max_length] + for i in range(math.ceil(len(input_ids) / segment_max_length)) + ] + return preprocess_wudao + def preprocess_the_pile_gen(tokenizer, segment_max_length=1024): def preprocess_the_pile(line): - ''' + """ The format of the data is roughly as follows. {'text': 'some text', 'meta': {'pile_set_name': 'Github'}} Split the data based on the tokenized length according to the maximum length. - ''' - total = line['text'] + """ + total = line["text"] out = tokenizer(total) - input_ids = out['input_ids'] - return [input_ids[i*segment_max_length: (i+1)*segment_max_length] - for i in range(math.ceil(len(input_ids)/segment_max_length))] + input_ids = out["input_ids"] + return [ + input_ids[i * segment_max_length : (i + 1) * segment_max_length] + for i in range(math.ceil(len(input_ids) / segment_max_length)) + ] + return preprocess_the_pile + def pretrain_collate_fn_gen(tokenizer, segment_max_length=1024): - ''' + """ Organize data into tensors by padding based on the preset maximum length. - ''' + """ pad_id = tokenizer.pad_id + def pretrain_collate_fn(batch): input_ids = [] for i in batch: input_len = len(i) - input_ids.append(i+[pad_id]*(segment_max_length-input_len)) + input_ids.append(i + [pad_id] * (segment_max_length - input_len)) inputs = { - 'input_ids': torch.tensor(input_ids, dtype=torch.int64), + "input_ids": torch.tensor(input_ids, dtype=torch.int64), } return inputs + return pretrain_collate_fn -if __name__ == '__main__': + +if __name__ == "__main__": import sentencepiece as spm from datasets import IterableDataset from torch.utils.data import DataLoader from dataset.tokenizer import Tokenizer from dataset.data_iter import create_shard_kwargs, create_data_iter - - sp_model = spm.SentencePieceProcessor(model_file='configs/10w_vocab_wudao5_pile10.model') + + sp_model = spm.SentencePieceProcessor( + model_file="configs/10w_vocab_wudao5_pile10.model" + ) tokenizer = Tokenizer(sp_model) - patterns = [ - 'data/pretrain_data/part-*.jsonl.zst' - ] + patterns = ["data/pretrain_data/part-*.jsonl.zst"] paths = create_shard_kwargs(patterns) transform_dict = { - 'wudao': preprocess_wudao_gen(tokenizer), - 'pile': preprocess_the_pile_gen(tokenizer) + "wudao": preprocess_wudao_gen(tokenizer), + "pile": preprocess_the_pile_gen(tokenizer), } - data_set = IterableDataset.from_generator(create_data_iter, gen_kwargs={'paths': paths, 'transform_dict': transform_dict}) - train_loader = DataLoader(data_set, batch_size=8, num_workers=4, - collate_fn=pretrain_collate_fn_gen(tokenizer), drop_last=True) + data_set = IterableDataset.from_generator( + create_data_iter, gen_kwargs={"paths": paths, "transform_dict": transform_dict} + ) + train_loader = DataLoader( + data_set, + batch_size=8, + num_workers=4, + collate_fn=pretrain_collate_fn_gen(tokenizer), + drop_last=True, + ) for batch in train_loader: for k, v in batch.items(): print(k, v.shape) - break \ No newline at end of file + break diff --git a/dataset/tokenizer.py b/dataset/tokenizer.py index ba2fd97..044a973 100644 --- a/dataset/tokenizer.py +++ b/dataset/tokenizer.py @@ -1,4 +1,4 @@ -''' +""" Author: LiangSong(sl12160010@gmail.com) Date: 2023-03-20 21:39:47 LastEditors: LiangSong(sl12160010@gmail.com) @@ -7,9 +7,10 @@ FilePath: /Open-Llama/dataset/tokenizer.py Description: Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved. -''' +""" import torch + class Tokenizer: def __init__(self, sp_model): self.sp_model = sp_model @@ -18,90 +19,130 @@ class Tokenizer: self.pad_id = self.sp_model.pad_id() self.vocab_size = self.sp_model.vocab_size() - def __call__(self, inputs, padding=None, max_length=256, return_tensors=False, truncation=False, - add_special_tokens=True, return_mask=False): + def __call__( + self, + inputs, + padding=None, + max_length=256, + return_tensors=False, + truncation=False, + add_special_tokens=True, + return_mask=False, + ): if isinstance(inputs, str): - return self.encode(inputs, padding=padding, max_length=max_length, - return_tensors=return_tensors, truncation=truncation, add_special_tokens=add_special_tokens, return_mask=return_mask) + return self.encode( + inputs, + padding=padding, + max_length=max_length, + return_tensors=return_tensors, + truncation=truncation, + add_special_tokens=add_special_tokens, + return_mask=return_mask, + ) else: - return self.encode_batch(inputs, padding=padding, max_length=max_length, - return_tensors=return_tensors, truncation=truncation, add_special_tokens=add_special_tokens, return_mask=return_mask) + return self.encode_batch( + inputs, + padding=padding, + max_length=max_length, + return_tensors=return_tensors, + truncation=truncation, + add_special_tokens=add_special_tokens, + return_mask=return_mask, + ) - def encode(self, inputs, padding=None, max_length=8192, return_tensors=False, truncation=False, - add_special_tokens=True, return_mask=False): - assert(isinstance(inputs, str)) + def encode( + self, + inputs, + padding=None, + max_length=8192, + return_tensors=False, + truncation=False, + add_special_tokens=True, + return_mask=False, + ): + assert isinstance(inputs, str) input_ids = self.sp_model.Encode(inputs) if return_mask: attention_mask = [1] * len(input_ids) if truncation: # https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L780 # 参考Transformer中的实现 默认最后一位一定是pad或者eos - input_ids = input_ids[: max_length-1] + input_ids = input_ids[: max_length - 1] if return_mask: - attention_mask = attention_mask[: max_length-1] + attention_mask = attention_mask[: max_length - 1] if add_special_tokens: input_ids = input_ids + [self.eos_id] if return_mask: attention_mask = attention_mask + [0] - if padding == 'max_length': - input_ids = input_ids + [self.pad_id] * (max_length-len(input_ids)) + if padding == "max_length": + input_ids = input_ids + [self.pad_id] * (max_length - len(input_ids)) if return_mask: - attention_mask = attention_mask + [0] * (max_length-len(attention_mask)) + attention_mask = attention_mask + [0] * ( + max_length - len(attention_mask) + ) if return_tensors: input_ids = torch.tensor([input_ids]) out = { - 'input_ids': input_ids, + "input_ids": input_ids, } if return_mask: attention_mask = torch.tensor([attention_mask]) - out['attention_mask'] = attention_mask + out["attention_mask"] = attention_mask else: out = { - 'input_ids': input_ids, + "input_ids": input_ids, } if return_mask: - out['attention_mask'] = attention_mask + out["attention_mask"] = attention_mask return out - def encode_batch(self, inputs, padding=None, max_length=8192, return_tensors=False, truncation=False, - add_special_tokens=True, return_mask=False): + def encode_batch( + self, + inputs, + padding=None, + max_length=8192, + return_tensors=False, + truncation=False, + add_special_tokens=True, + return_mask=False, + ): input_ids = self.sp_model.Encode(inputs) if return_mask: attention_mask = [[1] * len(i) for i in input_ids] if truncation: - input_ids = [i[: max_length-1] for i in input_ids] + input_ids = [i[: max_length - 1] for i in input_ids] if return_mask: - attention_mask = [i[: max_length-1] for i in attention_mask] + attention_mask = [i[: max_length - 1] for i in attention_mask] if add_special_tokens: - input_ids = [i+[self.eos_id] for i in input_ids] + input_ids = [i + [self.eos_id] for i in input_ids] if return_mask: - attention_mask = [i+[0] for i in attention_mask] - if padding == 'max_length': + attention_mask = [i + [0] for i in attention_mask] + if padding == "max_length": input_ids_pad = [] if return_mask: attention_mask_pad = [] for idx, i in enumerate(input_ids): - input_ids_pad.append(i + [self.pad_id] * (max_length-len(i))) + input_ids_pad.append(i + [self.pad_id] * (max_length - len(i))) if return_mask: j = attention_mask[idx] - attention_mask_pad.append(j + [0] * (max_length-len(j))) + attention_mask_pad.append(j + [0] * (max_length - len(j))) input_ids = input_ids_pad if return_mask: attention_mask = attention_mask_pad if return_tensors: input_ids = torch.tensor(input_ids) out = { - 'input_ids': input_ids, + "input_ids": input_ids, } if return_mask: attention_mask = torch.tensor(attention_mask) - out['attention_mask'] = attention_mask + out["attention_mask"] = attention_mask else: out = { - 'input_ids': input_ids, + "input_ids": input_ids, } if return_mask: - out['attention_mask'] = attention_mask + out["attention_mask"] = attention_mask return out def decode(self, inputs): @@ -110,37 +151,48 @@ class Tokenizer: for i in inputs: if self.eos_id in i: eos_idx = i.index(self.eos_id) - i = i[: eos_idx] + i = i[:eos_idx] out.append(i) out = self.sp_model.Decode(out) return out -if __name__ == '__main__': + +if __name__ == "__main__": import sentencepiece as spm from unicodedata import normalize + # Using sentencepiece may not be able to process some reserved keywords like '▁'. - sp_model = spm.SentencePieceProcessor(model_file='configs/10w_vocab_wudao5_pile10.model') + sp_model = spm.SentencePieceProcessor( + model_file="configs/10w_vocab_wudao5_pile10.model" + ) tokenizer = Tokenizer(sp_model) - tmp = ['hello world', - '这是开源项目的V1版本,this is the first version of a open-source project!', - '# this is a python script\nfor i in range(10):\n print(i)\n for j in range(10):\n print(j)'] + tmp = [ + "hello world", + "这是开源项目的V1版本,this is the first version of a open-source project!", + "# this is a python script\nfor i in range(10):\n print(i)\n for j in range(10):\n print(j)", + ] print(tmp) - out = tokenizer(tmp, padding='max_length', return_tensors=True, max_length=64, truncation=True) + out = tokenizer( + tmp, padding="max_length", return_tensors=True, max_length=64, truncation=True + ) for k, v in out.items(): print(k, v.shape) - print(out['input_ids']) - out = tokenizer.decode(out['input_ids']) + print(out["input_ids"]) + out = tokenizer.decode(out["input_ids"]) print(out) for i, j in zip(tmp, out): - assert(normalize('NFKC', i) == j) + assert normalize("NFKC", i) == j from dataset.data_iter import create_shard_kwargs, create_data_iter - patterns = [ - 'data/pretrain_data/part-wudao*.jsonl.zst' - ] + + patterns = ["data/pretrain_data/part-wudao*.jsonl.zst"] paths = create_shard_kwargs(patterns) data_iter = create_data_iter(paths) for i, data in enumerate(data_iter): - assert(normalize('NFKC', data['content']) == sp_model.Decode(sp_model.Encode(data['content'])) or '▁' in data['content']) + assert ( + normalize("NFKC", data["content"]) + == sp_model.Decode(sp_model.Encode(data["content"])) + or "▁" in data["content"] + ) if i == 1000: - break \ No newline at end of file + break diff --git a/dataset/train_tokenizer.py b/dataset/train_tokenizer.py index 50da406..609c8d9 100644 --- a/dataset/train_tokenizer.py +++ b/dataset/train_tokenizer.py @@ -1,4 +1,4 @@ -''' +""" Author: LiangSong(sl12160010@gmail.com) Date: 2023-03-24 20:49:03 LastEditors: LiangSong(sl12160010@gmail.com) @@ -7,25 +7,25 @@ FilePath: /Open-Llama/dataset/train_tokenizer.py Description: Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved. -''' +""" import random from dataset.data_iter import create_data_iter, create_shard_kwargs wudao_patterns = [ - 'data/pretrain_data/part-wudao-*.jsonl.zst', + "data/pretrain_data/part-wudao-*.jsonl.zst", ] wudao_paths = create_shard_kwargs(wudao_patterns) random.shuffle(wudao_paths) pile_patterns = [ - 'data/pretrain_data/part-pile-*.jsonl.zst', + "data/pretrain_data/part-pile-*.jsonl.zst", ] pile_paths = create_shard_kwargs(pile_patterns) random.shuffle(pile_paths) -paths = wudao_paths[: 5] + pile_paths[: 10] +paths = wudao_paths[:5] + pile_paths[:10] transform_dict = { - 'wudao': lambda line: [(line['title'] + '\n' + line['content'])], - 'pile': lambda line: [line['text']] + "wudao": lambda line: [(line["title"] + "\n" + line["content"])], + "pile": lambda line: [line["text"]], } data_iter = create_data_iter(paths, transform_dict) @@ -35,19 +35,30 @@ import sentencepiece as spm # Loads model from URL as iterator and stores the model to BytesIO. model = io.BytesIO() spm.SentencePieceTrainer.train( - sentence_iterator=data_iter, model_writer=model, shuffle_input_sentence=False, train_extremely_large_corpus=True, - # hyperparameters of tokenizer - max_sentence_length=16384, pad_id=3, model_type='BPE', vocab_size=100000, - # split digits and fallback to byte same as Llama. - # set split_by_unicode_script to True to avoid grouping punctuation and characters together. - split_digits=True, split_by_unicode_script=True, byte_fallback=True, - # reserve whitespace and \n and \t etc. for code generation - allow_whitespace_only_pieces=True, remove_extra_whitespaces=False, normalization_rule_name='nfkc') + sentence_iterator=data_iter, + model_writer=model, + shuffle_input_sentence=False, + train_extremely_large_corpus=True, + # hyperparameters of tokenizer + max_sentence_length=16384, + pad_id=3, + model_type="BPE", + vocab_size=100000, + # split digits and fallback to byte same as Llama. + # set split_by_unicode_script to True to avoid grouping punctuation and characters together. + split_digits=True, + split_by_unicode_script=True, + byte_fallback=True, + # reserve whitespace and \n and \t etc. for code generation + allow_whitespace_only_pieces=True, + remove_extra_whitespaces=False, + normalization_rule_name="nfkc", +) # Serialize the model as file. -with open('configs/10w_vocab_wudao5_pile10.model', 'wb') as f: - f.write(model.getvalue()) +with open("configs/10w_vocab_wudao5_pile10.model", "wb") as f: + f.write(model.getvalue()) # Directly load the model from serialized model. sp = spm.SentencePieceProcessor(model_proto=model.getvalue()) -print(sp.decode(sp.encode('只因你太美🤗▃ \n 1'))) \ No newline at end of file +print(sp.decode(sp.encode("只因你太美🤗▃ \n 1"))) diff --git a/dataset/validation.py b/dataset/validation.py index 71bd221..8f50df6 100644 --- a/dataset/validation.py +++ b/dataset/validation.py @@ -1,4 +1,4 @@ -''' +""" Author: LiangSong(sl12160010@gmail.com) Date: 2023-03-18 00:06:41 LastEditors: LiangSong(sl12160010@gmail.com) @@ -7,21 +7,21 @@ FilePath: /Open-Llama/dataset/validation.py Description: Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved. -''' +""" val_set = [ - '白日依山尽,', - '君不见,黄河之水天上来,奔流到海不复回。君不见,', - '秦孝公据崤函之固,拥雍州之地,君臣固守以窥周室,有席卷天下,包举宇内,囊括四海之意,并吞八荒之心。', - '古之学者必有师。师者,所以传道受业解惑也。人非生而知之者,孰能无惑?', - '当我醒来时,我发现自己在一个完全陌生的地方。我看到周围没有人,只有一张纸条。', - '这是一个斗气决定一切的大陆。在加玛帝国乌坦城,有个天才少年萧炎打破了所有族人的修炼纪录,一时间万人敬仰,众人艳羡。但不知为何,', - '人工智能技术在图像识别领域取得了很大的进展,然而在复杂场景下仍然存在一些问题,例如', - 'In recent years, there has been increasing interest in the use of machine learning to', - '已知三个数分别为1, 2, 3,则它们的平均数是', - '小明总共有15个苹果,他分别给了3个人两个苹果,然后自己又吃了一个苹果,那么它还剩几个苹果?', - '根据牛顿第二定律,物体的加速度等于', - '碳纳米管是一种新型的材料,具有非常独特的电学和光学性质。在过去的几年中,我们对碳纳', - '下面是一段用python写的快速排序的代码:', - 'The quantum many-body problem is a fundamental problem in condensed matter physics. Despite decades of research, there is still no exact solution to this problem for large systems. In this paper, we propose a novel approach based on', - '下面是一个使用 PyTorch 和 Transformer 的示例代码,用于训练一个文本分类模型:import torch\nimport torch.nn as nn\nfrom torch.utils.data import DataLoader, Dataset' -] \ No newline at end of file + "白日依山尽,", + "君不见,黄河之水天上来,奔流到海不复回。君不见,", + "秦孝公据崤函之固,拥雍州之地,君臣固守以窥周室,有席卷天下,包举宇内,囊括四海之意,并吞八荒之心。", + "古之学者必有师。师者,所以传道受业解惑也。人非生而知之者,孰能无惑?", + "当我醒来时,我发现自己在一个完全陌生的地方。我看到周围没有人,只有一张纸条。", + "这是一个斗气决定一切的大陆。在加玛帝国乌坦城,有个天才少年萧炎打破了所有族人的修炼纪录,一时间万人敬仰,众人艳羡。但不知为何,", + "人工智能技术在图像识别领域取得了很大的进展,然而在复杂场景下仍然存在一些问题,例如", + "In recent years, there has been increasing interest in the use of machine learning to", + "已知三个数分别为1, 2, 3,则它们的平均数是", + "小明总共有15个苹果,他分别给了3个人两个苹果,然后自己又吃了一个苹果,那么它还剩几个苹果?", + "根据牛顿第二定律,物体的加速度等于", + "碳纳米管是一种新型的材料,具有非常独特的电学和光学性质。在过去的几年中,我们对碳纳", + "下面是一段用python写的快速排序的代码:", + "The quantum many-body problem is a fundamental problem in condensed matter physics. Despite decades of research, there is still no exact solution to this problem for large systems. In this paper, we propose a novel approach based on", + "下面是一个使用 PyTorch 和 Transformer 的示例代码,用于训练一个文本分类模型:import torch\nimport torch.nn as nn\nfrom torch.utils.data import DataLoader, Dataset", +] diff --git a/models/llama.py b/models/llama.py index 280c3cb..b7dbbee 100644 --- a/models/llama.py +++ b/models/llama.py @@ -1,4 +1,4 @@ -''' +""" Author: LiangSong(sl12160010@gmail.com) Date: 2023-03-17 13:21:33 LastEditors: LiangSong(sl12160010@gmail.com) @@ -9,4 +9,4 @@ Building the Llama model proposed by Meta. https://arxiv.org/pdf/2302.13971.pdf Performance and effectiveness optimization based on the implementation in the Transformer library. https://github.com/Bayes-Song/transformers Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved. -''' +""" diff --git a/pretrain_llama.py b/pretrain_llama.py index 95380e9..0c6be25 100644 --- a/pretrain_llama.py +++ b/pretrain_llama.py @@ -1,4 +1,4 @@ -''' +""" Author: LiangSong(sl12160010@gmail.com) Date: 2023-03-17 14:27:28 LastEditors: LiangSong(sl12160010@gmail.com) @@ -7,7 +7,7 @@ FilePath: /Open-Llama/pretrain_llama.py Description: pretrain GPT Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved. -''' +""" import os import time import wandb @@ -24,15 +24,17 @@ from transformers import LlamaForCausalLM, LlamaConfig, get_cosine_schedule_with from dataset.validation import val_set from dataset.tokenizer import Tokenizer from dataset.data_iter import create_shard_kwargs, create_data_iter -from dataset.pretrain_dataset import preprocess_the_pile_gen, preprocess_wudao_gen, pretrain_collate_fn_gen +from dataset.pretrain_dataset import ( + preprocess_the_pile_gen, + preprocess_wudao_gen, + pretrain_collate_fn_gen, +) from configs.train_config import * accelerator = Accelerator() if accelerator.is_main_process: - wandb.init( - project='LLAMA Pretrain' - ) + wandb.init(project="LLAMA Pretrain") log_interval *= accelerator.gradient_accumulation_steps eval_interval *= accelerator.gradient_accumulation_steps @@ -44,51 +46,74 @@ tokenizer = Tokenizer(sp_model) paths = create_shard_kwargs(patterns) random.shuffle(paths) transform_dict = { - 'wudao': preprocess_wudao_gen(tokenizer, max_length), - 'pile': preprocess_the_pile_gen(tokenizer, max_length) + "wudao": preprocess_wudao_gen(tokenizer, max_length), + "pile": preprocess_the_pile_gen(tokenizer, max_length), } -data_set = IterableDataset.from_generator(create_data_iter, gen_kwargs={ - 'paths': paths, - 'transform_dict': transform_dict, - 'process_index': accelerator.process_index, - 'num_processes': accelerator.num_processes -}) -train_loader = DataLoader(data_set, batch_size=train_batch_size, num_workers=1, -collate_fn=pretrain_collate_fn_gen(tokenizer, max_length), drop_last=True) +data_set = IterableDataset.from_generator( + create_data_iter, + gen_kwargs={ + "paths": paths, + "transform_dict": transform_dict, + "process_index": accelerator.process_index, + "num_processes": accelerator.num_processes, + }, +) +train_loader = DataLoader( + data_set, + batch_size=train_batch_size, + num_workers=1, + collate_fn=pretrain_collate_fn_gen(tokenizer, max_length), + drop_last=True, +) # smaller initializer_range make training more stable # add stabel embedding to token embedding -raw_model = LlamaForCausalLM(LlamaConfig(vocab_size=tokenizer.vocab_size, - initializer_range=initializer_range, - pad_token_id=tokenizer.pad_id, - rms_norm_eps=1e-5, - hidden_dropout_prob=0.1, - attention_dropout_prob=0.1, - use_stable_embedding=True, - shared_input_output_embedding=True)) +raw_model = LlamaForCausalLM( + LlamaConfig( + vocab_size=tokenizer.vocab_size, + initializer_range=initializer_range, + pad_token_id=tokenizer.pad_id, + rms_norm_eps=1e-5, + hidden_dropout_prob=0.1, + attention_dropout_prob=0.1, + use_stable_embedding=True, + shared_input_output_embedding=True, + ) +) raw_model.eval() with torch.no_grad(): summary(raw_model.cuda(), input_data=torch.ones(1, 64, dtype=torch.int64).cuda()) no_decay = ["bias", "LayerNorm.weight", "layernorm.weight"] optimizer_grouped_parameters = [ { - "params": [p for n, p in raw_model.named_parameters() if not any(nd in n for nd in no_decay)], + "params": [ + p + for n, p in raw_model.named_parameters() + if not any(nd in n for nd in no_decay) + ], "weight_decay": weight_decay, }, { - "params": [p for n, p in raw_model.named_parameters() if any(nd in n for nd in no_decay)], + "params": [ + p + for n, p in raw_model.named_parameters() + if any(nd in n for nd in no_decay) + ], "weight_decay": 0.0, }, ] optim = FusedAdam(optimizer_grouped_parameters, lr=lr, betas=(0.9, 0.95)) optim.zero_grad() factor = accelerator.num_processes / accelerator.gradient_accumulation_steps -scheduler = get_cosine_schedule_with_warmup(optim, num_warmup_steps=num_warmup_steps * factor, - num_training_steps=num_training_steps * factor) +scheduler = get_cosine_schedule_with_warmup( + optim, + num_warmup_steps=num_warmup_steps * factor, + num_training_steps=num_training_steps * factor, +) _, model, optim, scheduler = accelerator.prepare( train_loader, raw_model, optim, scheduler ) -print('start training...') +print("start training...") train_loader_iter = iter(train_loader) global_step = 0 start_time = time.time() @@ -98,13 +123,11 @@ for data_step in range(num_training_steps): batch = next(train_loader_iter) for k, v in batch.items(): batch[k] = v.to(accelerator.device) - labels = batch['input_ids'].clone() - labels[labels==tokenizer.pad_id] = -100 + labels = batch["input_ids"].clone() + labels[labels == tokenizer.pad_id] = -100 out = model(**batch, labels=labels) total_loss = out.loss - losses = { - 'total_loss': total_loss - } + losses = {"total_loss": total_loss} accelerator.backward(total_loss) optim.step() scheduler.step() @@ -115,34 +138,41 @@ for data_step in range(num_training_steps): cost_time = time.time() - start_time start_time = time.time() tokens = train_batch_size * log_interval * max_length - wandb.log({'Training/Token per second per gpu': tokens/cost_time}) + wandb.log({"Training/Token per second per gpu": tokens / cost_time}) for k, v in losses.items(): - wandb.log({'Losses/{}'.format(k): v}) - current_lr = optim.param_groups[0]['lr'] - wandb.log({'Training/LR': current_lr}) + wandb.log({"Losses/{}".format(k): v}) + current_lr = optim.param_groups[0]["lr"] + wandb.log({"Training/LR": current_lr}) if optim.scaler is not None: - wandb.log({'Training/Loss Scale': optim.scaler.get_scale()}) - wandb.log({'Training/Data Step': data_step}) - wandb.log({'Training/Global Step': global_step}) - accelerator.print('Global Step: {}, Data Step: {}, Loss: {}, Token per second per gpu: {}'.format( - global_step, data_step, losses['total_loss'], tokens/cost_time)) + wandb.log({"Training/Loss Scale": optim.scaler.get_scale()}) + wandb.log({"Training/Data Step": data_step}) + wandb.log({"Training/Global Step": global_step}) + accelerator.print( + "Global Step: {}, Data Step: {}, Loss: {}, Token per second per gpu: {}".format( + global_step, data_step, losses["total_loss"], tokens / cost_time + ) + ) if data_step % eval_interval == 0 and accelerator.is_main_process: - text_table = wandb.Table(columns=['question', 'pred']) + text_table = wandb.Table(columns=["question", "pred"]) model.eval() with torch.no_grad(): for data in val_set: raw_inputs = data inputs_len = len(raw_inputs) - inputs = tokenizer(raw_inputs, return_tensors=True, add_special_tokens=False) + inputs = tokenizer( + raw_inputs, return_tensors=True, add_special_tokens=False + ) for k, v in inputs.items(): inputs[k] = v.to(accelerator.device) - pred = model.generate(**inputs, max_new_tokens=256, do_sample=True, repetition_penalty=2.0) + pred = model.generate( + **inputs, max_new_tokens=256, do_sample=True, repetition_penalty=2.0 + ) pred = tokenizer.decode(pred.cpu())[0] pred = pred[inputs_len:] text_table.add_data(raw_inputs, pred) - wandb.log({'Predictions on {}'.format(global_step) : text_table}) + wandb.log({"Predictions on {}".format(global_step): text_table}) if data_step % save_interval == 0 and data_step > 0 and accelerator.is_main_process: if not os.path.isdir(work_dir): os.mkdir(work_dir) - torch.save(raw_model.state_dict(), '{}/{}.pt'.format(work_dir, global_step)) + torch.save(raw_model.state_dict(), "{}/{}.pt".format(work_dir, global_step)) wandb.finish()