reformat code with black

parent 8dd47aee90
commit 918a8cdc3d

@@ -5,12 +5,10 @@ num_warmup_steps = 2000
 initializer_range = 1e-2
 lr = 2e-4
 weight_decay = 1e-1
-tokenizer_model_path = 'configs/10w_vocab_wudao5_pile10.model'
-patterns = [
-    'data/pretrain_data/part-*.jsonl.zst'
-]
+tokenizer_model_path = "configs/10w_vocab_wudao5_pile10.model"
+patterns = ["data/pretrain_data/part-*.jsonl.zst"]
 # global step
 log_interval = 5
 eval_interval = 200
 save_interval = 800
-work_dir = 'data/saved_ckpt/'
+work_dir = "data/saved_ckpt/"

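Every hunk in this commit is mechanical: black's default style normalizes strings to double quotes, collapses or explodes argument lists around the line-length limit, and adds trailing commas. A minimal sketch of reproducing one of these rewrites through black's Python API, assuming black is installed (black.format_str and black.Mode are its public entry points):

import black

# Sketch: black's default mode applies the quote normalization seen above.
old_src = "tokenizer_model_path = 'configs/10w_vocab_wudao5_pile10.model'\n"
print(black.format_str(old_src, mode=black.Mode()))
# tokenizer_model_path = "configs/10w_vocab_wudao5_pile10.model"
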
@@ -1,4 +1,4 @@
-'''
+"""
 Author: LiangSong(sl12160010@gmail.com)
 Date: 2023-03-16 22:35:38
 LastEditors: LiangSong(sl12160010@gmail.com)
@@ -8,25 +8,25 @@ Description:
 Parse the dataset from the raw files and split them into different jsonl files based on the preset maximum number of lines,
 making it easy for parallel training to perform streaming reads.
 Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved.
-'''
+"""
 import json
 from glob import glob
 from tqdm import tqdm
 import zstandard as zstd

-paths = glob('data/the_pile/*.jsonl.zst')
-write_path = 'data/pretrain_data/part-pile-{}.jsonl.zst'
+paths = glob("data/the_pile/*.jsonl.zst")
+write_path = "data/pretrain_data/part-pile-{}.jsonl.zst"
 total_num = 0
 file_num = 0
-wfp = zstd.open(write_path.format(file_num), 'wb', encoding='utf-8')
+wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
 for path in tqdm(paths, total=len(paths)):
-    with zstd.open(path, 'r', encoding='utf-8') as fp:
+    with zstd.open(path, "r", encoding="utf-8") as fp:
         for line in fp:
             if total_num % 16384 == 0 and total_num > 0:
                 file_num += 1
                 wfp.close()
-                wfp = zstd.open(write_path.format(file_num), 'wb', encoding='utf-8')
-            wfp.write(line.encode('utf-8'))
+                wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
+            wfp.write(line.encode("utf-8"))
             total_num += 1
 wfp.close()
-print('total line: {}\ntotal files: {}'.format(total_num, file_num))
+print("total line: {}\ntotal files: {}".format(total_num, file_num))

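Each shard written by this script holds at most 16384 json lines and is zstd-compressed. A minimal sketch of streaming one shard back, assuming the zstandard package and an existing shard file (the path is illustrative):

import json
import zstandard as zstd

# Sketch: read the first record of a shard produced by the split above.
with zstd.open("data/pretrain_data/part-pile-0.jsonl.zst", "r", encoding="utf-8") as fp:
    for line in fp:
        record = json.loads(line)
        print(record["text"][:80])  # pile records carry their text in "text"
        break
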
@@ -1,4 +1,4 @@
-'''
+"""
 Author: LiangSong(sl12160010@gmail.com)
 Date: 2023-03-16 22:10:44
 LastEditors: LiangSong(sl12160010@gmail.com)
@@ -8,27 +8,27 @@ Description:
 Parse the dataset from the raw files and split them into different jsonl files based on the preset maximum number of lines,
 making it easy for parallel training to perform streaming reads.
 Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved.
-'''
+"""
 import json
 from glob import glob
 from tqdm import tqdm
 import zstandard as zstd

-paths = glob('data/WuDaoCorpus2.0_base_200G/part*')
-write_path = 'data/pretrain_data/part-wudao-{}.jsonl.zst'
+paths = glob("data/WuDaoCorpus2.0_base_200G/part*")
+write_path = "data/pretrain_data/part-wudao-{}.jsonl.zst"
 total_num = 0
 file_num = 0
-wfp = zstd.open(write_path.format(file_num), 'wb', encoding='utf-8')
+wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
 for path in tqdm(paths, total=len(paths)):
-    with open(path, 'r') as fp:
+    with open(path, "r") as fp:
         data = json.load(fp)
         for line in data:
             if total_num % 16384 == 0 and total_num > 0:
                 file_num += 1
                 wfp.close()
-                wfp = zstd.open(write_path.format(file_num), 'wb', encoding='utf-8')
-            wfp.write(json.dumps(line).encode('utf-8'))
-            wfp.write('\n'.encode('utf-8'))
+                wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
+            wfp.write(json.dumps(line).encode("utf-8"))
+            wfp.write("\n".encode("utf-8"))
             total_num += 1
 wfp.close()
-print('total line: {}\ntotal files: {}'.format(total_num, file_num))
+print("total line: {}\ntotal files: {}".format(total_num, file_num))

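Unlike the pile dumps, each WuDao part file is a single JSON array, so this script re-serializes every element with json.dumps and appends an explicit newline to get one record per line. The framing in miniature (self-contained sketch with made-up records):

import json

# Sketch: a JSON array re-framed as newline-delimited jsonl, as the loop above does.
data = [{"title": "t1", "content": "c1"}, {"title": "t2", "content": "c2"}]
jsonl = "".join(json.dumps(rec) + "\n" for rec in data)
print(jsonl, end="")
# {"title": "t1", "content": "c1"}
# {"title": "t2", "content": "c2"}
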
@@ -1,4 +1,4 @@
-'''
+"""
 Author: LiangSong(sl12160010@gmail.com)
 Date: 2023-03-17 19:32:20
 LastEditors: LiangSong(sl12160010@gmail.com)
@@ -7,50 +7,32 @@ FilePath: /Open-Llama/dataset/data_iter.py
 Description:

 Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved.
-'''
+"""
 import json
 from glob import glob
 import zstandard as zstd


 def create_data_iter(paths, transform_dict=None, process_index=0, num_processes=1):
-    '''
-    Currently, the allowed storage formats are jsonl and jsonl.zst.
+    """
+    Currently, the allowed storage formats are jsonl and jsonl.zst.
     Each line of the data is a dictionary, which can be parsed as JSON for subsequent processing after reading.
-    '''
+    """
     past = None
     for i, path in paths:
-        dataset_name = path.split('-')[-2]
+        dataset_name = path.split("-")[-2]
         if past != dataset_name:
-            print('Loading data from {}'.format(path))
+            print("Loading data from {}".format(path))
             past = path
         if num_processes > 1 and i % num_processes != process_index:
             continue
-        if path.endswith('jsonl.zst'):
-            with zstd.open(path, 'r', encoding='utf-8') as fp:
-                for line in fp:
-                    if isinstance(line, bytes):
-                        line = line.decode('utf-8')
-                    line = json.loads(line)
-                    line['dataset'] = dataset_name
-                    if transform_dict:
-                        line = transform_dict[dataset_name](line)
-                        if isinstance(line, str):
-                            yield line
-                        elif isinstance(line, list):
-                            for i in line:
-                                yield i
-                        else:
-                            raise Exception('Unsupported type in Transformation: {}'.format(transform_dict[dataset_name]))
-                    else:
-                        yield line
-        elif path.endswith('jsonl'):
-            with open(path, 'r') as fp:
+        if path.endswith("jsonl.zst"):
+            with zstd.open(path, "r", encoding="utf-8") as fp:
                 for line in fp:
                     if isinstance(line, bytes):
-                        line = line.decode('utf-8')
+                        line = line.decode("utf-8")
                     line = json.loads(line)
-                    line['dataset'] = dataset_name
+                    line["dataset"] = dataset_name
                     if transform_dict:
                         line = transform_dict[dataset_name](line)
                         if isinstance(line, str):
@@ -59,34 +41,57 @@ def create_data_iter(paths, transform_dict=None, process_index=0, num_processes=
                             yield line
                         elif isinstance(line, list):
                             for i in line:
                                 yield i
                         else:
-                            raise Exception('Unsupported type in Transformation: {}'.format(transform_dict[dataset_name]))
+                            raise Exception(
+                                "Unsupported type in Transformation: {}".format(
+                                    transform_dict[dataset_name]
+                                )
+                            )
                     else:
                         yield line
+        elif path.endswith("jsonl"):
+            with open(path, "r") as fp:
+                for line in fp:
+                    if isinstance(line, bytes):
+                        line = line.decode("utf-8")
+                    line = json.loads(line)
+                    line["dataset"] = dataset_name
+                    if transform_dict:
+                        line = transform_dict[dataset_name](line)
+                        if isinstance(line, str):
+                            yield line
+                        elif isinstance(line, list):
+                            for i in line:
+                                yield i
+                        else:
+                            raise Exception(
+                                "Unsupported type in Transformation: {}".format(
+                                    transform_dict[dataset_name]
+                                )
+                            )
+                    else:
+                        yield line
         else:
-            raise Exception('File format of {} is not supported yet.'.format(path))
+            raise Exception("File format of {} is not supported yet.".format(path))

+
 def create_shard_kwargs(patterns, repeat=1):
-    '''
-    Assign numbers to different shards of data to ensure that data is not duplicated
+    """
+    Assign numbers to different shards of data to ensure that data is not duplicated
     when allocated to different nodes during distributed training.
-    '''
+    """
     all_path = []
     for p in patterns:
         all_path.extend(glob(p))
     all_path *= repeat
     return [(i, p) for i, p in enumerate(all_path)]

-if __name__ == '__main__':
-    patterns = [
-        'data/pretrain_data/part-wudao*.jsonl.zst'
-    ]
+
+if __name__ == "__main__":
+    patterns = ["data/pretrain_data/part-wudao*.jsonl.zst"]
     paths = create_shard_kwargs(patterns)
-    transform_dict = {
-        'wudao': lambda x: x['title'],
-        'pile': lambda x: [x['text']]
-    }
+    transform_dict = {"wudao": lambda x: x["title"], "pile": lambda x: [x["text"]]}
     data_iter = create_data_iter(paths, transform_dict=transform_dict)
     for i, data in enumerate(data_iter):
         print(i, data)
         if i == 20:
             break

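The i % num_processes != process_index check above is what divides shards among data-parallel workers: create_shard_kwargs pairs every shard path with a stable index, and each worker keeps only the indices congruent to its rank. The assignment in isolation (self-contained sketch with made-up shard names):

# Sketch: round-robin shard assignment as done by create_data_iter's modulo check.
paths = list(enumerate(["part-0", "part-1", "part-2", "part-3", "part-4"]))
num_processes = 2
for process_index in range(num_processes):
    mine = [p for i, p in paths if i % num_processes == process_index]
    print(process_index, mine)
# 0 ['part-0', 'part-2', 'part-4']
# 1 ['part-1', 'part-3']
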
@@ -1,4 +1,4 @@
-'''
+"""
 Author: LiangSong(sl12160010@gmail.com)
 Date: 2023-03-17 20:41:25
 LastEditors: LiangSong(sl12160010@gmail.com)
@@ -7,76 +7,95 @@ FilePath: /Open-Llama/dataset/pretrain_dataset.py
 Description:

 Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved.
-'''
+"""
 import math
 import torch


 def preprocess_wudao_gen(tokenizer, segment_max_length=1024):
     def preprocess_wudao(line):
-        '''
+        """
         The format of the data is roughly as follows.
         {'id': 1, 'dataType': '百科', 'title': 'some title', 'content': 'some content'}
         Split the data based on the tokenized length according to the maximum length.
-        '''
-        total = line['title'] + '\n' + line['content']
+        """
+        total = line["title"] + "\n" + line["content"]
         out = tokenizer(total)
-        input_ids = out['input_ids']
-        return [input_ids[i*segment_max_length: (i+1)*segment_max_length]
-                for i in range(math.ceil(len(input_ids)/segment_max_length))]
+        input_ids = out["input_ids"]
+        return [
+            input_ids[i * segment_max_length : (i + 1) * segment_max_length]
+            for i in range(math.ceil(len(input_ids) / segment_max_length))
+        ]

     return preprocess_wudao


 def preprocess_the_pile_gen(tokenizer, segment_max_length=1024):
     def preprocess_the_pile(line):
-        '''
+        """
         The format of the data is roughly as follows.
         {'text': 'some text', 'meta': {'pile_set_name': 'Github'}}
         Split the data based on the tokenized length according to the maximum length.
-        '''
-        total = line['text']
+        """
+        total = line["text"]
         out = tokenizer(total)
-        input_ids = out['input_ids']
-        return [input_ids[i*segment_max_length: (i+1)*segment_max_length]
-                for i in range(math.ceil(len(input_ids)/segment_max_length))]
+        input_ids = out["input_ids"]
+        return [
+            input_ids[i * segment_max_length : (i + 1) * segment_max_length]
+            for i in range(math.ceil(len(input_ids) / segment_max_length))
+        ]

     return preprocess_the_pile


 def pretrain_collate_fn_gen(tokenizer, segment_max_length=1024):
-    '''
+    """
     Organize data into tensors by padding based on the preset maximum length.
-    '''
+    """
     pad_id = tokenizer.pad_id

     def pretrain_collate_fn(batch):
         input_ids = []
         for i in batch:
             input_len = len(i)
-            input_ids.append(i+[pad_id]*(segment_max_length-input_len))
+            input_ids.append(i + [pad_id] * (segment_max_length - input_len))
         inputs = {
-            'input_ids': torch.tensor(input_ids, dtype=torch.int64),
+            "input_ids": torch.tensor(input_ids, dtype=torch.int64),
         }
         return inputs

     return pretrain_collate_fn

-if __name__ == '__main__':
+
+if __name__ == "__main__":
     import sentencepiece as spm
     from datasets import IterableDataset
     from torch.utils.data import DataLoader

     from dataset.tokenizer import Tokenizer
     from dataset.data_iter import create_shard_kwargs, create_data_iter

-    sp_model = spm.SentencePieceProcessor(model_file='configs/10w_vocab_wudao5_pile10.model')
+    sp_model = spm.SentencePieceProcessor(
+        model_file="configs/10w_vocab_wudao5_pile10.model"
+    )
     tokenizer = Tokenizer(sp_model)
-    patterns = [
-        'data/pretrain_data/part-*.jsonl.zst'
-    ]
+    patterns = ["data/pretrain_data/part-*.jsonl.zst"]
     paths = create_shard_kwargs(patterns)
     transform_dict = {
-        'wudao': preprocess_wudao_gen(tokenizer),
-        'pile': preprocess_the_pile_gen(tokenizer)
+        "wudao": preprocess_wudao_gen(tokenizer),
+        "pile": preprocess_the_pile_gen(tokenizer),
     }
-    data_set = IterableDataset.from_generator(create_data_iter, gen_kwargs={'paths': paths, 'transform_dict': transform_dict})
-    train_loader = DataLoader(data_set, batch_size=8, num_workers=4,
-                              collate_fn=pretrain_collate_fn_gen(tokenizer), drop_last=True)
+    data_set = IterableDataset.from_generator(
+        create_data_iter, gen_kwargs={"paths": paths, "transform_dict": transform_dict}
+    )
+    train_loader = DataLoader(
+        data_set,
+        batch_size=8,
+        num_workers=4,
+        collate_fn=pretrain_collate_fn_gen(tokenizer),
+        drop_last=True,
+    )
     for batch in train_loader:
         for k, v in batch.items():
             print(k, v.shape)
         break

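Both preprocess closures reduce to the same operation: tokenize the whole document, then slice the id list into segment_max_length pieces, leaving a short tail that pretrain_collate_fn later pads. The slicing pattern on its own (self-contained sketch):

import math

# Sketch: the segment split shared by preprocess_wudao and preprocess_the_pile.
def split_segments(input_ids, segment_max_length=1024):
    return [
        input_ids[i * segment_max_length : (i + 1) * segment_max_length]
        for i in range(math.ceil(len(input_ids) / segment_max_length))
    ]

print([len(s) for s in split_segments(list(range(2500)))])  # [1024, 1024, 452]
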
@@ -1,4 +1,4 @@
-'''
+"""
 Author: LiangSong(sl12160010@gmail.com)
 Date: 2023-03-20 21:39:47
 LastEditors: LiangSong(sl12160010@gmail.com)
@@ -7,9 +7,10 @@ FilePath: /Open-Llama/dataset/tokenizer.py
 Description:

 Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved.
-'''
+"""
 import torch

+
 class Tokenizer:
     def __init__(self, sp_model):
         self.sp_model = sp_model
@@ -18,90 +19,130 @@ class Tokenizer:
         self.pad_id = self.sp_model.pad_id()
         self.vocab_size = self.sp_model.vocab_size()

-    def __call__(self, inputs, padding=None, max_length=256, return_tensors=False, truncation=False,
-                 add_special_tokens=True, return_mask=False):
+    def __call__(
+        self,
+        inputs,
+        padding=None,
+        max_length=256,
+        return_tensors=False,
+        truncation=False,
+        add_special_tokens=True,
+        return_mask=False,
+    ):
         if isinstance(inputs, str):
-            return self.encode(inputs, padding=padding, max_length=max_length,
-                               return_tensors=return_tensors, truncation=truncation, add_special_tokens=add_special_tokens, return_mask=return_mask)
+            return self.encode(
+                inputs,
+                padding=padding,
+                max_length=max_length,
+                return_tensors=return_tensors,
+                truncation=truncation,
+                add_special_tokens=add_special_tokens,
+                return_mask=return_mask,
+            )
         else:
-            return self.encode_batch(inputs, padding=padding, max_length=max_length,
-                                     return_tensors=return_tensors, truncation=truncation, add_special_tokens=add_special_tokens, return_mask=return_mask)
+            return self.encode_batch(
+                inputs,
+                padding=padding,
+                max_length=max_length,
+                return_tensors=return_tensors,
+                truncation=truncation,
+                add_special_tokens=add_special_tokens,
+                return_mask=return_mask,
+            )

-    def encode(self, inputs, padding=None, max_length=8192, return_tensors=False, truncation=False,
-               add_special_tokens=True, return_mask=False):
-        assert(isinstance(inputs, str))
+    def encode(
+        self,
+        inputs,
+        padding=None,
+        max_length=8192,
+        return_tensors=False,
+        truncation=False,
+        add_special_tokens=True,
+        return_mask=False,
+    ):
+        assert isinstance(inputs, str)
         input_ids = self.sp_model.Encode(inputs)
         if return_mask:
             attention_mask = [1] * len(input_ids)
         if truncation:
             # https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L780
             # Following the implementation in Transformers: by default the last position is always pad or eos.
-            input_ids = input_ids[: max_length-1]
+            input_ids = input_ids[: max_length - 1]
             if return_mask:
-                attention_mask = attention_mask[: max_length-1]
+                attention_mask = attention_mask[: max_length - 1]
         if add_special_tokens:
             input_ids = input_ids + [self.eos_id]
             if return_mask:
                 attention_mask = attention_mask + [0]
-        if padding == 'max_length':
-            input_ids = input_ids + [self.pad_id] * (max_length-len(input_ids))
+        if padding == "max_length":
+            input_ids = input_ids + [self.pad_id] * (max_length - len(input_ids))
             if return_mask:
-                attention_mask = attention_mask + [0] * (max_length-len(attention_mask))
+                attention_mask = attention_mask + [0] * (
+                    max_length - len(attention_mask)
+                )
         if return_tensors:
             input_ids = torch.tensor([input_ids])
             out = {
-                'input_ids': input_ids,
+                "input_ids": input_ids,
             }
             if return_mask:
                 attention_mask = torch.tensor([attention_mask])
-                out['attention_mask'] = attention_mask
+                out["attention_mask"] = attention_mask
         else:
             out = {
-                'input_ids': input_ids,
+                "input_ids": input_ids,
             }
             if return_mask:
-                out['attention_mask'] = attention_mask
+                out["attention_mask"] = attention_mask
         return out

-    def encode_batch(self, inputs, padding=None, max_length=8192, return_tensors=False, truncation=False,
-                     add_special_tokens=True, return_mask=False):
+    def encode_batch(
+        self,
+        inputs,
+        padding=None,
+        max_length=8192,
+        return_tensors=False,
+        truncation=False,
+        add_special_tokens=True,
+        return_mask=False,
+    ):
         input_ids = self.sp_model.Encode(inputs)
         if return_mask:
             attention_mask = [[1] * len(i) for i in input_ids]
         if truncation:
-            input_ids = [i[: max_length-1] for i in input_ids]
+            input_ids = [i[: max_length - 1] for i in input_ids]
             if return_mask:
-                attention_mask = [i[: max_length-1] for i in attention_mask]
+                attention_mask = [i[: max_length - 1] for i in attention_mask]
         if add_special_tokens:
-            input_ids = [i+[self.eos_id] for i in input_ids]
+            input_ids = [i + [self.eos_id] for i in input_ids]
             if return_mask:
-                attention_mask = [i+[0] for i in attention_mask]
-        if padding == 'max_length':
+                attention_mask = [i + [0] for i in attention_mask]
+        if padding == "max_length":
             input_ids_pad = []
             if return_mask:
                 attention_mask_pad = []
             for idx, i in enumerate(input_ids):
-                input_ids_pad.append(i + [self.pad_id] * (max_length-len(i)))
+                input_ids_pad.append(i + [self.pad_id] * (max_length - len(i)))
                 if return_mask:
                     j = attention_mask[idx]
-                    attention_mask_pad.append(j + [0] * (max_length-len(j)))
+                    attention_mask_pad.append(j + [0] * (max_length - len(j)))
             input_ids = input_ids_pad
             if return_mask:
                 attention_mask = attention_mask_pad
         if return_tensors:
             input_ids = torch.tensor(input_ids)
             out = {
-                'input_ids': input_ids,
+                "input_ids": input_ids,
             }
             if return_mask:
                 attention_mask = torch.tensor(attention_mask)
-                out['attention_mask'] = attention_mask
+                out["attention_mask"] = attention_mask
         else:
             out = {
-                'input_ids': input_ids,
+                "input_ids": input_ids,
             }
             if return_mask:
-                out['attention_mask'] = attention_mask
+                out["attention_mask"] = attention_mask
         return out

     def decode(self, inputs):
@@ -110,37 +151,48 @@ class Tokenizer:
         for i in inputs:
             if self.eos_id in i:
                 eos_idx = i.index(self.eos_id)
-                i = i[: eos_idx]
+                i = i[:eos_idx]
             out.append(i)
         out = self.sp_model.Decode(out)
         return out

-if __name__ == '__main__':
+
+if __name__ == "__main__":
     import sentencepiece as spm
     from unicodedata import normalize

     # Using sentencepiece may not be able to process some reserved keywords like '▁'.
-    sp_model = spm.SentencePieceProcessor(model_file='configs/10w_vocab_wudao5_pile10.model')
+    sp_model = spm.SentencePieceProcessor(
+        model_file="configs/10w_vocab_wudao5_pile10.model"
+    )
     tokenizer = Tokenizer(sp_model)
-    tmp = ['hello world',
-           '这是开源项目的V1版本,this is the first version of a open-source project!',
-           '# this is a python script\nfor i in range(10):\n    print(i)\n    for j in range(10):\n        print(j)']
+    tmp = [
+        "hello world",
+        "这是开源项目的V1版本,this is the first version of a open-source project!",
+        "# this is a python script\nfor i in range(10):\n    print(i)\n    for j in range(10):\n        print(j)",
+    ]
     print(tmp)
-    out = tokenizer(tmp, padding='max_length', return_tensors=True, max_length=64, truncation=True)
+    out = tokenizer(
+        tmp, padding="max_length", return_tensors=True, max_length=64, truncation=True
+    )
     for k, v in out.items():
         print(k, v.shape)
-    print(out['input_ids'])
-    out = tokenizer.decode(out['input_ids'])
+    print(out["input_ids"])
+    out = tokenizer.decode(out["input_ids"])
     print(out)
     for i, j in zip(tmp, out):
-        assert(normalize('NFKC', i) == j)
+        assert normalize("NFKC", i) == j

     from dataset.data_iter import create_shard_kwargs, create_data_iter
-    patterns = [
-        'data/pretrain_data/part-wudao*.jsonl.zst'
-    ]
+
+    patterns = ["data/pretrain_data/part-wudao*.jsonl.zst"]
     paths = create_shard_kwargs(patterns)
     data_iter = create_data_iter(paths)
     for i, data in enumerate(data_iter):
-        assert(normalize('NFKC', data['content']) == sp_model.Decode(sp_model.Encode(data['content'])) or '▁' in data['content'])
+        assert (
+            normalize("NFKC", data["content"])
+            == sp_model.Decode(sp_model.Encode(data["content"]))
+            or "▁" in data["content"]
+        )
         if i == 1000:
             break

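In encode, truncation cuts to max_length - 1 so that the eos appended by add_special_tokens never pushes the sequence past max_length, and padding="max_length" then fills with pad_id while the mask stays zero there (note that the eos position itself gets mask 0 in this implementation). The length bookkeeping on plain lists (self-contained sketch; the id values are illustrative):

# Sketch: truncate to (max_length - 1), append eos, pad to max_length.
eos_id, pad_id, max_length = 2, 3, 8
input_ids = list(range(100, 112))  # 12 token ids
attention_mask = [1] * len(input_ids)

input_ids = input_ids[: max_length - 1]            # leave room for eos
attention_mask = attention_mask[: max_length - 1]
input_ids += [eos_id]
attention_mask += [0]                              # eos masked, as in encode()
input_ids += [pad_id] * (max_length - len(input_ids))
attention_mask += [0] * (max_length - len(attention_mask))

print(input_ids)       # [100, 101, 102, 103, 104, 105, 106, 2]
print(attention_mask)  # [1, 1, 1, 1, 1, 1, 1, 0]
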
@@ -1,4 +1,4 @@
-'''
+"""
 Author: LiangSong(sl12160010@gmail.com)
 Date: 2023-03-24 20:49:03
 LastEditors: LiangSong(sl12160010@gmail.com)
@@ -7,25 +7,25 @@ FilePath: /Open-Llama/dataset/train_tokenizer.py
 Description:

 Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved.
-'''
+"""
 import random
 from dataset.data_iter import create_data_iter, create_shard_kwargs

 wudao_patterns = [
-    'data/pretrain_data/part-wudao-*.jsonl.zst',
+    "data/pretrain_data/part-wudao-*.jsonl.zst",
 ]
 wudao_paths = create_shard_kwargs(wudao_patterns)
 random.shuffle(wudao_paths)

 pile_patterns = [
-    'data/pretrain_data/part-pile-*.jsonl.zst',
+    "data/pretrain_data/part-pile-*.jsonl.zst",
 ]
 pile_paths = create_shard_kwargs(pile_patterns)
 random.shuffle(pile_paths)
-paths = wudao_paths[: 5] + pile_paths[: 10]
+paths = wudao_paths[:5] + pile_paths[:10]
 transform_dict = {
-    'wudao': lambda line: [(line['title'] + '\n' + line['content'])],
-    'pile': lambda line: [line['text']]
+    "wudao": lambda line: [(line["title"] + "\n" + line["content"])],
+    "pile": lambda line: [line["text"]],
 }
 data_iter = create_data_iter(paths, transform_dict)
@@ -35,19 +35,30 @@ import sentencepiece as spm
 # Loads model from URL as iterator and stores the model to BytesIO.
 model = io.BytesIO()
 spm.SentencePieceTrainer.train(
-    sentence_iterator=data_iter, model_writer=model, shuffle_input_sentence=False, train_extremely_large_corpus=True,
+    sentence_iterator=data_iter,
+    model_writer=model,
+    shuffle_input_sentence=False,
+    train_extremely_large_corpus=True,
     # hyperparameters of tokenizer
-    max_sentence_length=16384, pad_id=3, model_type='BPE', vocab_size=100000,
+    max_sentence_length=16384,
+    pad_id=3,
+    model_type="BPE",
+    vocab_size=100000,
     # split digits and fallback to byte same as Llama.
     # set split_by_unicode_script to True to avoid grouping punctuation and characters together.
-    split_digits=True, split_by_unicode_script=True, byte_fallback=True,
+    split_digits=True,
+    split_by_unicode_script=True,
+    byte_fallback=True,
     # reserve whitespace and \n and \t etc. for code generation
-    allow_whitespace_only_pieces=True, remove_extra_whitespaces=False, normalization_rule_name='nfkc')
+    allow_whitespace_only_pieces=True,
+    remove_extra_whitespaces=False,
+    normalization_rule_name="nfkc",
+)

 # Serialize the model as file.
-with open('configs/10w_vocab_wudao5_pile10.model', 'wb') as f:
+with open("configs/10w_vocab_wudao5_pile10.model", "wb") as f:
     f.write(model.getvalue())

 # Directly load the model from serialized model.
 sp = spm.SentencePieceProcessor(model_proto=model.getvalue())
-print(sp.decode(sp.encode('只因你太美🤗▃ \n 1')))
+print(sp.decode(sp.encode("只因你太美🤗▃ \n 1")))

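sentencepiece disables the pad piece by default (pad_id() returns -1), so passing pad_id=3 at training time reserves an explicit pad token right after the default unk/bos/eos ids; Tokenizer.__init__ later reads it back via sp_model.pad_id(). A quick check against the serialized model (assuming the file from the training run above exists):

import sentencepiece as spm

# Sketch: inspect the special-token ids baked into the trained model.
sp = spm.SentencePieceProcessor(model_file="configs/10w_vocab_wudao5_pile10.model")
print(sp.unk_id(), sp.bos_id(), sp.eos_id(), sp.pad_id())  # expected: 0 1 2 3
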
@@ -1,4 +1,4 @@
-'''
+"""
 Author: LiangSong(sl12160010@gmail.com)
 Date: 2023-03-18 00:06:41
 LastEditors: LiangSong(sl12160010@gmail.com)
@@ -7,21 +7,21 @@ FilePath: /Open-Llama/dataset/validation.py
 Description:

 Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved.
-'''
+"""
 val_set = [
-    '白日依山尽,',
-    '君不见,黄河之水天上来,奔流到海不复回。君不见,',
-    '秦孝公据崤函之固,拥雍州之地,君臣固守以窥周室,有席卷天下,包举宇内,囊括四海之意,并吞八荒之心。',
-    '古之学者必有师。师者,所以传道受业解惑也。人非生而知之者,孰能无惑?',
-    '当我醒来时,我发现自己在一个完全陌生的地方。我看到周围没有人,只有一张纸条。',
-    '这是一个斗气决定一切的大陆。在加玛帝国乌坦城,有个天才少年萧炎打破了所有族人的修炼纪录,一时间万人敬仰,众人艳羡。但不知为何,',
-    '人工智能技术在图像识别领域取得了很大的进展,然而在复杂场景下仍然存在一些问题,例如',
-    'In recent years, there has been increasing interest in the use of machine learning to',
-    '已知三个数分别为1, 2, 3,则它们的平均数是',
-    '小明总共有15个苹果,他分别给了3个人两个苹果,然后自己又吃了一个苹果,那么它还剩几个苹果?',
-    '根据牛顿第二定律,物体的加速度等于',
-    '碳纳米管是一种新型的材料,具有非常独特的电学和光学性质。在过去的几年中,我们对碳纳',
-    '下面是一段用python写的快速排序的代码:',
-    'The quantum many-body problem is a fundamental problem in condensed matter physics. Despite decades of research, there is still no exact solution to this problem for large systems. In this paper, we propose a novel approach based on',
-    '下面是一个使用 PyTorch 和 Transformer 的示例代码,用于训练一个文本分类模型:import torch\nimport torch.nn as nn\nfrom torch.utils.data import DataLoader, Dataset'
-]
+    "白日依山尽,",
+    "君不见,黄河之水天上来,奔流到海不复回。君不见,",
+    "秦孝公据崤函之固,拥雍州之地,君臣固守以窥周室,有席卷天下,包举宇内,囊括四海之意,并吞八荒之心。",
+    "古之学者必有师。师者,所以传道受业解惑也。人非生而知之者,孰能无惑?",
+    "当我醒来时,我发现自己在一个完全陌生的地方。我看到周围没有人,只有一张纸条。",
+    "这是一个斗气决定一切的大陆。在加玛帝国乌坦城,有个天才少年萧炎打破了所有族人的修炼纪录,一时间万人敬仰,众人艳羡。但不知为何,",
+    "人工智能技术在图像识别领域取得了很大的进展,然而在复杂场景下仍然存在一些问题,例如",
+    "In recent years, there has been increasing interest in the use of machine learning to",
+    "已知三个数分别为1, 2, 3,则它们的平均数是",
+    "小明总共有15个苹果,他分别给了3个人两个苹果,然后自己又吃了一个苹果,那么它还剩几个苹果?",
+    "根据牛顿第二定律,物体的加速度等于",
+    "碳纳米管是一种新型的材料,具有非常独特的电学和光学性质。在过去的几年中,我们对碳纳",
+    "下面是一段用python写的快速排序的代码:",
+    "The quantum many-body problem is a fundamental problem in condensed matter physics. Despite decades of research, there is still no exact solution to this problem for large systems. In this paper, we propose a novel approach based on",
+    "下面是一个使用 PyTorch 和 Transformer 的示例代码,用于训练一个文本分类模型:import torch\nimport torch.nn as nn\nfrom torch.utils.data import DataLoader, Dataset",
+]

@@ -1,4 +1,4 @@
-'''
+"""
 Author: LiangSong(sl12160010@gmail.com)
 Date: 2023-03-17 13:21:33
 LastEditors: LiangSong(sl12160010@gmail.com)
@@ -9,4 +9,4 @@ Building the Llama model proposed by Meta. https://arxiv.org/pdf/2302.13971.pdf
 Performance and effectiveness optimization based on the implementation in the Transformer library.
 https://github.com/Bayes-Song/transformers
 Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved.
-'''
+"""

@@ -1,4 +1,4 @@
-'''
+"""
 Author: LiangSong(sl12160010@gmail.com)
 Date: 2023-03-17 14:27:28
 LastEditors: LiangSong(sl12160010@gmail.com)
@@ -7,7 +7,7 @@ FilePath: /Open-Llama/pretrain_llama.py
 Description:
 pretrain GPT
 Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved.
-'''
+"""
 import os
 import time
 import wandb
@@ -24,15 +24,17 @@ from transformers import LlamaForCausalLM, LlamaConfig, get_cosine_schedule_with
 from dataset.validation import val_set
 from dataset.tokenizer import Tokenizer
 from dataset.data_iter import create_shard_kwargs, create_data_iter
-from dataset.pretrain_dataset import preprocess_the_pile_gen, preprocess_wudao_gen, pretrain_collate_fn_gen
+from dataset.pretrain_dataset import (
+    preprocess_the_pile_gen,
+    preprocess_wudao_gen,
+    pretrain_collate_fn_gen,
+)
 from configs.train_config import *

 accelerator = Accelerator()

 if accelerator.is_main_process:
-    wandb.init(
-        project='LLAMA Pretrain'
-    )
+    wandb.init(project="LLAMA Pretrain")

 log_interval *= accelerator.gradient_accumulation_steps
 eval_interval *= accelerator.gradient_accumulation_steps
@@ -44,51 +46,74 @@ tokenizer = Tokenizer(sp_model)
 paths = create_shard_kwargs(patterns)
 random.shuffle(paths)
 transform_dict = {
-    'wudao': preprocess_wudao_gen(tokenizer, max_length),
-    'pile': preprocess_the_pile_gen(tokenizer, max_length)
+    "wudao": preprocess_wudao_gen(tokenizer, max_length),
+    "pile": preprocess_the_pile_gen(tokenizer, max_length),
 }
-data_set = IterableDataset.from_generator(create_data_iter, gen_kwargs={
-    'paths': paths,
-    'transform_dict': transform_dict,
-    'process_index': accelerator.process_index,
-    'num_processes': accelerator.num_processes
-})
-train_loader = DataLoader(data_set, batch_size=train_batch_size, num_workers=1,
-                          collate_fn=pretrain_collate_fn_gen(tokenizer, max_length), drop_last=True)
+data_set = IterableDataset.from_generator(
+    create_data_iter,
+    gen_kwargs={
+        "paths": paths,
+        "transform_dict": transform_dict,
+        "process_index": accelerator.process_index,
+        "num_processes": accelerator.num_processes,
+    },
+)
+train_loader = DataLoader(
+    data_set,
+    batch_size=train_batch_size,
+    num_workers=1,
+    collate_fn=pretrain_collate_fn_gen(tokenizer, max_length),
+    drop_last=True,
+)
 # smaller initializer_range makes training more stable
 # add stable embedding to token embedding
-raw_model = LlamaForCausalLM(LlamaConfig(vocab_size=tokenizer.vocab_size,
-                                         initializer_range=initializer_range,
-                                         pad_token_id=tokenizer.pad_id,
-                                         rms_norm_eps=1e-5,
-                                         hidden_dropout_prob=0.1,
-                                         attention_dropout_prob=0.1,
-                                         use_stable_embedding=True,
-                                         shared_input_output_embedding=True))
+raw_model = LlamaForCausalLM(
+    LlamaConfig(
+        vocab_size=tokenizer.vocab_size,
+        initializer_range=initializer_range,
+        pad_token_id=tokenizer.pad_id,
+        rms_norm_eps=1e-5,
+        hidden_dropout_prob=0.1,
+        attention_dropout_prob=0.1,
+        use_stable_embedding=True,
+        shared_input_output_embedding=True,
+    )
+)
 raw_model.eval()
 with torch.no_grad():
     summary(raw_model.cuda(), input_data=torch.ones(1, 64, dtype=torch.int64).cuda())
 no_decay = ["bias", "LayerNorm.weight", "layernorm.weight"]
 optimizer_grouped_parameters = [
     {
-        "params": [p for n, p in raw_model.named_parameters() if not any(nd in n for nd in no_decay)],
+        "params": [
+            p
+            for n, p in raw_model.named_parameters()
+            if not any(nd in n for nd in no_decay)
+        ],
         "weight_decay": weight_decay,
     },
     {
-        "params": [p for n, p in raw_model.named_parameters() if any(nd in n for nd in no_decay)],
+        "params": [
+            p
+            for n, p in raw_model.named_parameters()
+            if any(nd in n for nd in no_decay)
+        ],
         "weight_decay": 0.0,
     },
 ]
 optim = FusedAdam(optimizer_grouped_parameters, lr=lr, betas=(0.9, 0.95))
 optim.zero_grad()
 factor = accelerator.num_processes / accelerator.gradient_accumulation_steps
-scheduler = get_cosine_schedule_with_warmup(optim, num_warmup_steps=num_warmup_steps * factor,
-                                            num_training_steps=num_training_steps * factor)
+scheduler = get_cosine_schedule_with_warmup(
+    optim,
+    num_warmup_steps=num_warmup_steps * factor,
+    num_training_steps=num_training_steps * factor,
+)

 _, model, optim, scheduler = accelerator.prepare(
     train_loader, raw_model, optim, scheduler
 )
-print('start training...')
+print("start training...")
 train_loader_iter = iter(train_loader)
 global_step = 0
 start_time = time.time()
@@ -98,13 +123,11 @@ for data_step in range(num_training_steps):
     batch = next(train_loader_iter)
     for k, v in batch.items():
         batch[k] = v.to(accelerator.device)
-    labels = batch['input_ids'].clone()
-    labels[labels==tokenizer.pad_id] = -100
+    labels = batch["input_ids"].clone()
+    labels[labels == tokenizer.pad_id] = -100
     out = model(**batch, labels=labels)
     total_loss = out.loss
-    losses = {
-        'total_loss': total_loss
-    }
+    losses = {"total_loss": total_loss}
     accelerator.backward(total_loss)
     optim.step()
     scheduler.step()
@@ -115,34 +138,41 @@ for data_step in range(num_training_steps):
         cost_time = time.time() - start_time
         start_time = time.time()
         tokens = train_batch_size * log_interval * max_length
-        wandb.log({'Training/Token per second per gpu': tokens/cost_time})
+        wandb.log({"Training/Token per second per gpu": tokens / cost_time})
         for k, v in losses.items():
-            wandb.log({'Losses/{}'.format(k): v})
-        current_lr = optim.param_groups[0]['lr']
-        wandb.log({'Training/LR': current_lr})
+            wandb.log({"Losses/{}".format(k): v})
+        current_lr = optim.param_groups[0]["lr"]
+        wandb.log({"Training/LR": current_lr})
         if optim.scaler is not None:
-            wandb.log({'Training/Loss Scale': optim.scaler.get_scale()})
-        wandb.log({'Training/Data Step': data_step})
-        wandb.log({'Training/Global Step': global_step})
-        accelerator.print('Global Step: {}, Data Step: {}, Loss: {}, Token per second per gpu: {}'.format(
-            global_step, data_step, losses['total_loss'], tokens/cost_time))
+            wandb.log({"Training/Loss Scale": optim.scaler.get_scale()})
+        wandb.log({"Training/Data Step": data_step})
+        wandb.log({"Training/Global Step": global_step})
+        accelerator.print(
+            "Global Step: {}, Data Step: {}, Loss: {}, Token per second per gpu: {}".format(
+                global_step, data_step, losses["total_loss"], tokens / cost_time
+            )
+        )
     if data_step % eval_interval == 0 and accelerator.is_main_process:
-        text_table = wandb.Table(columns=['question', 'pred'])
+        text_table = wandb.Table(columns=["question", "pred"])
         model.eval()
         with torch.no_grad():
            for data in val_set:
                 raw_inputs = data
                 inputs_len = len(raw_inputs)
-                inputs = tokenizer(raw_inputs, return_tensors=True, add_special_tokens=False)
+                inputs = tokenizer(
+                    raw_inputs, return_tensors=True, add_special_tokens=False
+                )
                 for k, v in inputs.items():
                     inputs[k] = v.to(accelerator.device)
-                pred = model.generate(**inputs, max_new_tokens=256, do_sample=True, repetition_penalty=2.0)
+                pred = model.generate(
+                    **inputs, max_new_tokens=256, do_sample=True, repetition_penalty=2.0
+                )
                 pred = tokenizer.decode(pred.cpu())[0]
                 pred = pred[inputs_len:]
                 text_table.add_data(raw_inputs, pred)
-        wandb.log({'Predictions on {}'.format(global_step) : text_table})
+        wandb.log({"Predictions on {}".format(global_step): text_table})
     if data_step % save_interval == 0 and data_step > 0 and accelerator.is_main_process:
         if not os.path.isdir(work_dir):
             os.mkdir(work_dir)
-        torch.save(raw_model.state_dict(), '{}/{}.pt'.format(work_dir, global_step))
+        torch.save(raw_model.state_dict(), "{}/{}.pt".format(work_dir, global_step))
 wandb.finish()

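The label masking in the training loop relies on -100 being the ignore_index of PyTorch's cross entropy, which the Hugging Face causal-LM loss uses, so padded positions contribute nothing to the loss. The step in isolation (self-contained sketch; the pad_id value is illustrative):

import torch

# Sketch: clone input_ids as labels and silence pad positions with -100.
pad_id = 3  # in the real script this comes from the sentencepiece model
input_ids = torch.tensor([[5, 9, 11, pad_id, pad_id]])
labels = input_ids.clone()
labels[labels == pad_id] = -100
print(labels)  # tensor([[   5,    9,   11, -100, -100]])
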