add high-performance Llama pre-train code

This commit is contained in:
LiangSong 2023-03-26 23:59:53 +08:00
parent 0fa15787b4
commit 73a81a4205
18 changed files with 858 additions and 0 deletions

132
.gitignore vendored Normal file

@@ -0,0 +1,132 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
.DS_Store
pretrain_data/
wandb/

Binary file not shown.

Binary file not shown.


@@ -0,0 +1,30 @@
compute_environment: LOCAL_MACHINE
deepspeed_config:
deepspeed_multinode_launcher: standard
gradient_accumulation_steps: 12
gradient_clipping: 1.0
offload_optimizer_device: none
offload_param_device: none
zero3_init_flag: false
zero_stage: 1
distributed_type: DEEPSPEED
downcast_bf16: 'no'
dynamo_backend: 'no'
# dynamo_config:
# dynamo_backend: INDUCTOR
# dynamo_mode: default
# dynamo_use_dynamic: true
# dynamo_use_fullgraph: false
fsdp_config: {}
machine_rank: 0
main_training_function: main
megatron_lm_config: {}
mixed_precision: bf16
num_machines: 1
num_processes: 8
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
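
This Accelerate config sets up single-node training on 8 GPUs with DeepSpeed ZeRO stage 1, bf16 mixed precision, and 12 gradient-accumulation steps; training is typically launched with `accelerate launch --config_file <path to this config> pretrain_llama.py`. As a quick sanity check (not part of this commit), the resolved values can be inspected from the Accelerator object:

# Illustrative sketch only: inspect what the Accelerate/DeepSpeed config resolves to at runtime.
# Run with `accelerate launch --config_file <path to this config> inspect_env.py` (script name hypothetical).
from accelerate import Accelerator

accelerator = Accelerator()
if accelerator.is_main_process:
    # These should reflect num_processes (8), gradient_accumulation_steps (12) and mixed_precision ('bf16') above.
    print("num_processes:", accelerator.num_processes)
    print("gradient_accumulation_steps:", accelerator.gradient_accumulation_steps)
    print("mixed_precision:", accelerator.mixed_precision)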

Binary file not shown.

16
configs/train_config.py Normal file

@@ -0,0 +1,16 @@
max_length = 1024
train_batch_size = 2
num_training_steps = 1000000
num_warmup_steps = 2000
initializer_range = 1e-2
lr = 2e-4
weight_decay = 1e-1
tokenizer_model_path = 'configs/10w_vocab_wudao5_pile10.model'
patterns = [
'data/pretrain_data/part-*.jsonl.zst'
]
# the intervals below are counted in global (optimizer) steps
log_interval = 5
eval_interval = 200
save_interval = 800
work_dir = 'data/saved_ckpt/'
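
Together with the Accelerate config above (8 processes, 12 gradient-accumulation steps; values taken from this commit), these settings determine the effective batch per optimizer step. A rough, illustrative calculation:

# Back-of-the-envelope arithmetic; the parallelism values come from the Accelerate/DeepSpeed
# config in this commit, not from train_config.py itself.
max_length = 1024
train_batch_size = 2              # per-GPU micro batch
gradient_accumulation_steps = 12
num_processes = 8
sequences_per_update = train_batch_size * gradient_accumulation_steps * num_processes
tokens_per_update = sequences_per_update * max_length
print(sequences_per_update)       # 192 sequences per optimizer step
print(tokens_per_update)          # 196608 tokens per optimizer step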

26
data/download_the_pile.sh Normal file

@@ -0,0 +1,26 @@
#!/bin/bash
###
# @Author: LiangSong(sl12160010@gmail.com)
# @Date: 2023-03-16 21:21:38
# @LastEditors: LiangSong(sl12160010@gmail.com)
# @LastEditTime: 2023-03-26 22:58:02
# @FilePath: /Open-Llama/data/download_the_pile.sh
# @Description:
# Download the Pile dataset and preprocess it.
# Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved.
###
start=0
end=29
mkdir -p data/the_pile
for (( i=$start; i<=$end; i++ ))
do
    url="https://the-eye.eu/public/AI/pile/train/$(printf "%02d" $i).jsonl.zst"
    echo "Downloading file: $url"
    curl -C - "$url" -o data/the_pile/"$(printf "%02d" $i).jsonl.zst"
done
wait
echo "All files downloaded successfully."
mkdir -p data/pretrain_data
python3 data/preprocess_the_pile.py

19
data/download_wudao.sh Normal file

@@ -0,0 +1,19 @@
#!/bin/bash
###
# @Author: LiangSong(sl12160010@gmail.com)
# @Date: 2023-03-16 21:21:56
# @LastEditors: LiangSong(sl12160010@gmail.com)
# @LastEditTime: 2023-03-26 22:58:11
# @FilePath: /Open-Llama/data/download_wudao.sh
# @Description:
# Download the WuDao dataset and preprocess it.
# Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved.
###
apt install unrar
for i in {1..100}
do
    curl -C - --retry 100 'https://dorc.baai.ac.cn/resources/data/WuDaoCorpora2.0/WuDaoCorpus2.0_base_200G.rar?AccessKeyId=AKLTNasiLRBBTcOgPqzlkPzu1w&Expires=1679127659&Signature=7jh%2FpnJyC2hAeumm9EjaeE5HN9E%3D' -o data/WuDaoCorpus2.0_base_200G.rar
done
unrar x data/WuDaoCorpus2.0_base_200G.rar
mkdir -p data/pretrain_data
python3 data/preprocess_wudao.py

32
data/preprocess_the_pile.py Normal file

@@ -0,0 +1,32 @@
'''
Author: LiangSong(sl12160010@gmail.com)
Date: 2023-03-16 22:35:38
LastEditors: LiangSong(sl12160010@gmail.com)
LastEditTime: 2023-03-26 22:59:38
FilePath: /Open-Llama/data/preprocess_the_pile.py
Description:
Parse the dataset from the raw files and split them into different jsonl files based on the preset maximum number of lines,
making it easy for parallel training to perform streaming reads.
Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved.
'''
import json
from glob import glob
from tqdm import tqdm
import zstandard as zstd
paths = glob('data/the_pile/*.jsonl.zst')
write_path = 'data/pretrain_data/part-pile-{}.jsonl.zst'
total_num = 0
file_num = 0
wfp = zstd.open(write_path.format(file_num), 'wb', encoding='utf-8')
for path in tqdm(paths, total=len(paths)):
with zstd.open(path, 'r', encoding='utf-8') as fp:
for line in fp:
if total_num % 16384 == 0 and total_num > 0:
file_num += 1
wfp.close()
wfp = zstd.open(write_path.format(file_num), 'wb', encoding='utf-8')
wfp.write(line.encode('utf-8'))
total_num += 1
wfp.close()
print('total lines: {}\ntotal files: {}'.format(total_num, file_num + 1))
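
Each output shard is a zstd-compressed jsonl file holding at most 16384 lines, which downstream code streams line by line. A minimal reading sketch (the shard name assumes the script above has already run):

# Minimal sketch: stream one preprocessed shard and parse each line as JSON.
# Assumes data/pretrain_data/part-pile-0.jsonl.zst was produced by the script above.
import json
import zstandard as zstd

with zstd.open('data/pretrain_data/part-pile-0.jsonl.zst', 'r', encoding='utf-8') as fp:
    for i, line in enumerate(fp):
        record = json.loads(line)
        print(record.get('meta', {}).get('pile_set_name'), len(record['text']))
        if i == 2:
            break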

34
data/preprocess_wudao.py Normal file

@@ -0,0 +1,34 @@
'''
Author: LiangSong(sl12160010@gmail.com)
Date: 2023-03-16 22:10:44
LastEditors: LiangSong(sl12160010@gmail.com)
LastEditTime: 2023-03-26 22:59:55
FilePath: /Open-Llama/data/preprocess_wudao.py
Description:
Parse the dataset from the raw files and split them into different jsonl files based on the preset maximum number of lines,
making it easy for parallel training to perform streaming reads.
Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved.
'''
import json
from glob import glob
from tqdm import tqdm
import zstandard as zstd
paths = glob('data/WuDaoCorpus2.0_base_200G/part*')
write_path = 'data/pretrain_data/part-wudao-{}.jsonl.zst'
total_num = 0
file_num = 0
wfp = zstd.open(write_path.format(file_num), 'wb', encoding='utf-8')
for path in tqdm(paths, total=len(paths)):
with open(path, 'r') as fp:
data = json.load(fp)
for line in data:
if total_num % 16384 == 0 and total_num > 0:
file_num += 1
wfp.close()
wfp = zstd.open(write_path.format(file_num), 'wb', encoding='utf-8')
wfp.write(json.dumps(line).encode('utf-8'))
wfp.write('\n'.encode('utf-8'))
total_num += 1
wfp.close()
print('total lines: {}\ntotal files: {}'.format(total_num, file_num + 1))

92
dataset/data_iter.py Normal file

@@ -0,0 +1,92 @@
'''
Author: LiangSong(sl12160010@gmail.com)
Date: 2023-03-17 19:32:20
LastEditors: LiangSong(sl12160010@gmail.com)
LastEditTime: 2023-03-26 23:03:32
FilePath: /Open-Llama/dataset/data_iter.py
Description:
Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved.
'''
import json
from glob import glob
import zstandard as zstd
def create_data_iter(paths, transform_dict=None, process_index=0, num_processes=1):
'''
Currently, the allowed storage formats are jsonl and jsonl.zst.
Each line of the data is a dictionary, which can be parsed as JSON for subsequent processing after reading.
'''
past = None
for i, path in paths:
dataset_name = path.split('-')[-2]
        if past != path:
print('Loading data from {}'.format(path))
past = path
if num_processes > 1 and i % num_processes != process_index:
continue
if path.endswith('jsonl.zst'):
with zstd.open(path, 'r', encoding='utf-8') as fp:
for line in fp:
if isinstance(line, bytes):
line = line.decode('utf-8')
line = json.loads(line)
line['dataset'] = dataset_name
if transform_dict:
line = transform_dict[dataset_name](line)
if isinstance(line, str):
yield line
elif isinstance(line, list):
for i in line:
yield i
else:
raise Exception('Unsupported type in Transformation: {}'.format(transform_dict[dataset_name]))
else:
yield line
elif path.endswith('jsonl'):
with open(path, 'r') as fp:
for line in fp:
if isinstance(line, bytes):
line = line.decode('utf-8')
line = json.loads(line)
line['dataset'] = dataset_name
if transform_dict:
line = transform_dict[dataset_name](line)
if isinstance(line, str):
yield line
elif isinstance(line, list):
for i in line:
yield i
else:
raise Exception('Unsupported type in Transformation: {}'.format(transform_dict[dataset_name]))
else:
yield line
else:
raise Exception('File format of {} is not supported yet.'.format(path))
def create_shard_kwargs(patterns, repeat=1):
'''
Assign numbers to different shards of data to ensure that data is not duplicated
when allocated to different nodes during distributed training.
'''
all_path = []
for p in patterns:
all_path.extend(glob(p))
all_path *= repeat
return [(i, p) for i, p in enumerate(all_path)]
if __name__ == '__main__':
patterns = [
'data/pretrain_data/part-wudao*.jsonl.zst'
]
paths = create_shard_kwargs(patterns)
transform_dict = {
'wudao': lambda x: x['title'],
'pile': lambda x: [x['text']]
}
data_iter = create_data_iter(paths, transform_dict=transform_dict)
for i, data in enumerate(data_iter):
print(i, data)
if i == 20:
break
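
create_shard_kwargs assigns an index to every shard file, and create_data_iter keeps only the files whose index satisfies i % num_processes == process_index, so each rank streams a disjoint, round-robin slice of the shards. A small illustration of that assignment (file names made up):

# Illustration only: how indexed shards are divided across ranks; file names are made up.
paths = [(i, 'part-demo-{}.jsonl.zst'.format(i)) for i in range(8)]
num_processes = 4
for process_index in range(num_processes):
    assigned = [p for i, p in paths if i % num_processes == process_index]
    print(process_index, assigned)
# rank 0 -> files 0 and 4, rank 1 -> files 1 and 5, and so on.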

82
dataset/pretrain_dataset.py Normal file

@@ -0,0 +1,82 @@
'''
Author: LiangSong(sl12160010@gmail.com)
Date: 2023-03-17 20:41:25
LastEditors: LiangSong(sl12160010@gmail.com)
LastEditTime: 2023-03-26 23:07:56
FilePath: /Open-Llama/dataset/pretrain_dataset.py
Description:
Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved.
'''
import math
import torch
def preprocess_wudao_gen(tokenizer, segment_max_length=1024):
def preprocess_wudao(line):
'''
The format of the data is roughly as follows.
{'id': 1, 'dataType': '百科', 'title': 'some title', 'content': 'some content'}
Split the data based on the tokenized length according to the maximum length.
'''
total = line['title'] + '\n' + line['content']
out = tokenizer(total)
input_ids = out['input_ids']
return [input_ids[i*segment_max_length: (i+1)*segment_max_length]
for i in range(math.ceil(len(input_ids)/segment_max_length))]
return preprocess_wudao
def preprocess_the_pile_gen(tokenizer, segment_max_length=1024):
def preprocess_the_pile(line):
'''
The format of the data is roughly as follows.
{'text': 'some text', 'meta': {'pile_set_name': 'Github'}}
Split the data based on the tokenized length according to the maximum length.
'''
total = line['text']
out = tokenizer(total)
input_ids = out['input_ids']
return [input_ids[i*segment_max_length: (i+1)*segment_max_length]
for i in range(math.ceil(len(input_ids)/segment_max_length))]
return preprocess_the_pile
def pretrain_collate_fn_gen(tokenizer, segment_max_length=1024):
'''
Organize data into tensors by padding based on the preset maximum length.
'''
pad_id = tokenizer.pad_id
def pretrain_collate_fn(batch):
input_ids = []
for i in batch:
input_len = len(i)
input_ids.append(i+[pad_id]*(segment_max_length-input_len))
inputs = {
'input_ids': torch.tensor(input_ids, dtype=torch.int64),
}
return inputs
return pretrain_collate_fn
if __name__ == '__main__':
import sentencepiece as spm
from datasets import IterableDataset
from torch.utils.data import DataLoader
from dataset.tokenizer import Tokenizer
from dataset.data_iter import create_shard_kwargs, create_data_iter
sp_model = spm.SentencePieceProcessor(model_file='configs/10w_vocab_wudao5_pile10.model')
tokenizer = Tokenizer(sp_model)
patterns = [
'data/pretrain_data/part-*.jsonl.zst'
]
paths = create_shard_kwargs(patterns)
transform_dict = {
'wudao': preprocess_wudao_gen(tokenizer),
'pile': preprocess_the_pile_gen(tokenizer)
}
data_set = IterableDataset.from_generator(create_data_iter, gen_kwargs={'paths': paths, 'transform_dict': transform_dict})
train_loader = DataLoader(data_set, batch_size=8, num_workers=4,
collate_fn=pretrain_collate_fn_gen(tokenizer), drop_last=True)
for batch in train_loader:
for k, v in batch.items():
print(k, v.shape)
break
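
pretrain_collate_fn_gen right-pads every tokenized segment with pad_id up to segment_max_length and stacks the batch into a single input_ids tensor. A quick sketch of that behaviour with a stand-in tokenizer object (FakeTokenizer is purely illustrative; only pad_id is needed):

# Illustrative sketch of the padding behaviour; FakeTokenizer only provides pad_id.
from dataset.pretrain_dataset import pretrain_collate_fn_gen

class FakeTokenizer:
    pad_id = 0

collate = pretrain_collate_fn_gen(FakeTokenizer(), segment_max_length=8)
batch = [[5, 6, 7], [1, 2, 3, 4, 5]]
out = collate(batch)
print(out['input_ids'].shape)   # torch.Size([2, 8])
print(out['input_ids'])         # each row right-padded with 0 up to length 8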

146
dataset/tokenizer.py Normal file

@@ -0,0 +1,146 @@
'''
Author: LiangSong(sl12160010@gmail.com)
Date: 2023-03-20 21:39:47
LastEditors: LiangSong(sl12160010@gmail.com)
LastEditTime: 2023-03-26 23:09:39
FilePath: /Open-Llama/dataset/tokenizer.py
Description:
Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved.
'''
import torch
class Tokenizer:
def __init__(self, sp_model):
self.sp_model = sp_model
self.bos_id = self.sp_model.bos_id()
self.eos_id = self.sp_model.eos_id()
self.pad_id = self.sp_model.pad_id()
self.vocab_size = self.sp_model.vocab_size()
def __call__(self, inputs, padding=None, max_length=256, return_tensors=False, truncation=False,
add_special_tokens=True, return_mask=False):
if isinstance(inputs, str):
return self.encode(inputs, padding=padding, max_length=max_length,
return_tensors=return_tensors, truncation=truncation, add_special_tokens=add_special_tokens, return_mask=return_mask)
else:
return self.encode_batch(inputs, padding=padding, max_length=max_length,
return_tensors=return_tensors, truncation=truncation, add_special_tokens=add_special_tokens, return_mask=return_mask)
def encode(self, inputs, padding=None, max_length=8192, return_tensors=False, truncation=False,
add_special_tokens=True, return_mask=False):
assert(isinstance(inputs, str))
input_ids = self.sp_model.Encode(inputs)
if return_mask:
attention_mask = [1] * len(input_ids)
if truncation:
            # Following the implementation in Transformers, the last position is always reserved for pad or eos.
            # https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L780
input_ids = input_ids[: max_length-1]
if return_mask:
attention_mask = attention_mask[: max_length-1]
if add_special_tokens:
input_ids = input_ids + [self.eos_id]
if return_mask:
attention_mask = attention_mask + [0]
if padding == 'max_length':
input_ids = input_ids + [self.pad_id] * (max_length-len(input_ids))
if return_mask:
attention_mask = attention_mask + [0] * (max_length-len(attention_mask))
if return_tensors:
input_ids = torch.tensor([input_ids])
out = {
'input_ids': input_ids,
}
if return_mask:
attention_mask = torch.tensor([attention_mask])
out['attention_mask'] = attention_mask
else:
out = {
'input_ids': input_ids,
}
if return_mask:
out['attention_mask'] = attention_mask
return out
def encode_batch(self, inputs, padding=None, max_length=8192, return_tensors=False, truncation=False,
add_special_tokens=True, return_mask=False):
input_ids = self.sp_model.Encode(inputs)
if return_mask:
attention_mask = [[1] * len(i) for i in input_ids]
if truncation:
input_ids = [i[: max_length-1] for i in input_ids]
if return_mask:
attention_mask = [i[: max_length-1] for i in attention_mask]
if add_special_tokens:
input_ids = [i+[self.eos_id] for i in input_ids]
if return_mask:
attention_mask = [i+[0] for i in attention_mask]
if padding == 'max_length':
input_ids_pad = []
if return_mask:
attention_mask_pad = []
for idx, i in enumerate(input_ids):
input_ids_pad.append(i + [self.pad_id] * (max_length-len(i)))
if return_mask:
j = attention_mask[idx]
attention_mask_pad.append(j + [0] * (max_length-len(j)))
input_ids = input_ids_pad
if return_mask:
attention_mask = attention_mask_pad
if return_tensors:
input_ids = torch.tensor(input_ids)
out = {
'input_ids': input_ids,
}
if return_mask:
attention_mask = torch.tensor(attention_mask)
out['attention_mask'] = attention_mask
else:
out = {
'input_ids': input_ids,
}
if return_mask:
out['attention_mask'] = attention_mask
return out
def decode(self, inputs):
inputs = inputs.tolist()
out = []
for i in inputs:
if self.eos_id in i:
eos_idx = i.index(self.eos_id)
i = i[: eos_idx]
out.append(i)
out = self.sp_model.Decode(out)
return out
if __name__ == '__main__':
import sentencepiece as spm
from unicodedata import normalize
    # Note: SentencePiece may not handle some reserved symbols such as '▁' correctly.
sp_model = spm.SentencePieceProcessor(model_file='configs/10w_vocab_wudao5_pile10.model')
tokenizer = Tokenizer(sp_model)
tmp = ['hello world',
'这是开源项目的V1版本this is the first version of a open-source project!',
'# this is a python script\nfor i in range(10):\n print(i)\n for j in range(10):\n print(j)']
print(tmp)
out = tokenizer(tmp, padding='max_length', return_tensors=True, max_length=64, truncation=True)
for k, v in out.items():
print(k, v.shape)
print(out['input_ids'])
out = tokenizer.decode(out['input_ids'])
print(out)
for i, j in zip(tmp, out):
assert(normalize('NFKC', i) == j)
from dataset.data_iter import create_shard_kwargs, create_data_iter
patterns = [
'data/pretrain_data/part-wudao*.jsonl.zst'
]
paths = create_shard_kwargs(patterns)
data_iter = create_data_iter(paths)
for i, data in enumerate(data_iter):
assert(normalize('NFKC', data['content']) == sp_model.Decode(sp_model.Encode(data['content'])) or '' in data['content'])
if i == 1000:
break

53
dataset/train_tokenizer.py Normal file

@@ -0,0 +1,53 @@
'''
Author: LiangSong(sl12160010@gmail.com)
Date: 2023-03-24 20:49:03
LastEditors: LiangSong(sl12160010@gmail.com)
LastEditTime: 2023-03-26 23:43:59
FilePath: /Open-Llama/dataset/train_tokenizer.py
Description:
Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved.
'''
import random
from dataset.data_iter import create_data_iter, create_shard_kwargs
wudao_patterns = [
'data/pretrain_data/part-wudao-*.jsonl.zst',
]
wudao_paths = create_shard_kwargs(wudao_patterns)
random.shuffle(wudao_paths)
pile_patterns = [
'data/pretrain_data/part-pile-*.jsonl.zst',
]
pile_paths = create_shard_kwargs(pile_patterns)
random.shuffle(pile_paths)
paths = wudao_paths[: 5] + pile_paths[: 10]
transform_dict = {
'wudao': lambda line: [(line['title'] + '\n' + line['content'])],
'pile': lambda line: [line['text']]
}
data_iter = create_data_iter(paths, transform_dict)
import io
import sentencepiece as spm
# Train the tokenizer from the sentence iterator and store the resulting model in a BytesIO buffer.
model = io.BytesIO()
spm.SentencePieceTrainer.train(
sentence_iterator=data_iter, model_writer=model, shuffle_input_sentence=False, train_extremely_large_corpus=True,
# hyperparameters of tokenizer
max_sentence_length=16384, pad_id=3, model_type='BPE', vocab_size=100000,
    # Split digits and use byte fallback, the same as Llama.
    # Set split_by_unicode_script to True to avoid grouping punctuation and characters together.
split_digits=True, split_by_unicode_script=True, byte_fallback=True,
    # Preserve whitespace, including \n and \t, for code generation.
allow_whitespace_only_pieces=True, remove_extra_whitespaces=False, normalization_rule_name='nfkc')
# Serialize the model to a file.
with open('configs/10w_vocab_wudao5_pile10.model', 'wb') as f:
f.write(model.getvalue())
# Load the tokenizer directly from the serialized model proto.
sp = spm.SentencePieceProcessor(model_proto=model.getvalue())
print(sp.decode(sp.encode('只因你太美🤗▃ \n 1')))

17
dataset/validation.py Normal file

@@ -0,0 +1,17 @@
val_set = [
'白日依山尽,',
'君不见,黄河之水天上来,奔流到海不复回。君不见,',
'秦孝公据崤函之固,拥雍州之地,君臣固守以窥周室,有席卷天下,包举宇内,囊括四海之意,并吞八荒之心。',
'古之学者必有师。师者,所以传道受业解惑也。人非生而知之者,孰能无惑?',
'当我醒来时,我发现自己在一个完全陌生的地方。我看到周围没有人,只有一张纸条。',
'这是一个斗气决定一切的大陆。在加玛帝国乌坦城,有个天才少年萧炎打破了所有族人的修炼纪录,一时间万人敬仰,众人艳羡。但不知为何,',
'人工智能技术在图像识别领域取得了很大的进展,然而在复杂场景下仍然存在一些问题,例如',
'In recent years, there has been increasing interest in the use of machine learning to',
'已知三个数分别为1, 2, 3求它们的平均数是',
'小明总共有15个苹果他分别给了3个人两个苹果然后自己又吃了一个苹果那么它还剩几个苹果',
'根据牛顿第二定律,物体的加速度等于',
'碳纳米管是一种新型的材料,具有非常独特的电学和光学性质。在过去的几年中,我们对碳纳',
'下面是一段用python写的快速排序的代码:',
'The quantum many-body problem is a fundamental problem in condensed matter physics. Despite decades of research, there is still no exact solution to this problem for large systems. In this paper, we propose a novel approach based on',
'下面是一个使用 PyTorch 和 Transformer 的示例代码用于训练一个文本分类模型import torch\nimport torch.nn as nn\nfrom torch.utils.data import DataLoader, Dataset'
]

12
models/llama.py Normal file

@@ -0,0 +1,12 @@
'''
Author: LiangSong(sl12160010@gmail.com)
Date: 2023-03-17 13:21:33
LastEditors: LiangSong(sl12160010@gmail.com)
LastEditTime: 2023-03-26 23:13:57
FilePath: /Open-Llama/models/llama.py
Description:
Building the Llama model proposed by Meta. https://arxiv.org/pdf/2302.13971.pdf
Performance and efficiency optimizations based on the implementation in the Transformers library.
https://github.com/Bayes-Song/transformers
Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved.
'''

148
pretrain_llama.py Normal file

@@ -0,0 +1,148 @@
'''
Author: LiangSong(sl12160010@gmail.com)
Date: 2023-03-17 14:27:28
LastEditors: LiangSong(sl12160010@gmail.com)
LastEditTime: 2023-03-26 23:33:41
FilePath: /Open-Llama/pretrain_llama.py
Description:
Pretrain the Llama model.
Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved.
'''
import os
import time
import wandb
import torch
import random
import sentencepiece as spm
from torchinfo import summary
from accelerate import Accelerator
from datasets import IterableDataset
from torch.utils.data import DataLoader
from deepspeed.ops.adam import FusedAdam
from transformers import LlamaForCausalLM, LlamaConfig, get_cosine_schedule_with_warmup
from dataset.validation import val_set
from dataset.tokenizer import Tokenizer
from dataset.data_iter import create_shard_kwargs, create_data_iter
from dataset.pretrain_dataset import preprocess_the_pile_gen, preprocess_wudao_gen, pretrain_collate_fn_gen
from configs.train_config import *
accelerator = Accelerator()
if accelerator.is_main_process:
wandb.init(
project='LLAMA Pretrain'
)
log_interval *= accelerator.gradient_accumulation_steps
eval_interval *= accelerator.gradient_accumulation_steps
save_interval *= accelerator.gradient_accumulation_steps
sp_model = spm.SentencePieceProcessor(model_file=tokenizer_model_path)
tokenizer = Tokenizer(sp_model)
paths = create_shard_kwargs(patterns)
random.shuffle(paths)
transform_dict = {
'wudao': preprocess_wudao_gen(tokenizer, max_length),
'pile': preprocess_the_pile_gen(tokenizer, max_length)
}
data_set = IterableDataset.from_generator(create_data_iter, gen_kwargs={
'paths': paths,
'transform_dict': transform_dict,
'process_index': accelerator.process_index,
'num_processes': accelerator.num_processes
})
train_loader = DataLoader(data_set, batch_size=train_batch_size, num_workers=1,
collate_fn=pretrain_collate_fn_gen(tokenizer, max_length), drop_last=True)
# A smaller initializer_range makes training more stable.
# Stable embedding is added on top of the token embedding.
raw_model = LlamaForCausalLM(LlamaConfig(vocab_size=tokenizer.vocab_size,
initializer_range=initializer_range,
pad_token_id=tokenizer.pad_id,
rms_norm_eps=1e-5,
hidden_dropout_prob=0.1,
attention_dropout_prob=0.1,
use_stable_embedding=True,
shared_input_output_embedding=True))
raw_model.eval()
with torch.no_grad():
summary(raw_model.cuda(), input_data=torch.ones(1, 64, dtype=torch.int64).cuda())
no_decay = ["bias", "LayerNorm.weight", "layernorm.weight"]
optimizer_grouped_parameters = [
{
"params": [p for n, p in raw_model.named_parameters() if not any(nd in n for nd in no_decay)],
"weight_decay": weight_decay,
},
{
"params": [p for n, p in raw_model.named_parameters() if any(nd in n for nd in no_decay)],
"weight_decay": 0.0,
},
]
optim = FusedAdam(optimizer_grouped_parameters, lr=lr, betas=(0.9, 0.95))
optim.zero_grad()
factor = accelerator.num_processes / accelerator.gradient_accumulation_steps
scheduler = get_cosine_schedule_with_warmup(optim, num_warmup_steps=num_warmup_steps * factor,
num_training_steps=num_training_steps * factor)
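# Note: the prepared dataloader returned by accelerator.prepare is discarded below;
# batches are drawn from the original train_loader, since create_data_iter already
# shards files across processes via process_index.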
_, model, optim, scheduler = accelerator.prepare(
train_loader, raw_model, optim, scheduler
)
print('start training...')
train_loader_iter = iter(train_loader)
global_step = 0
start_time = time.time()
for data_step in range(num_training_steps):
model.train()
with accelerator.accumulate(model):
batch = next(train_loader_iter)
for k, v in batch.items():
batch[k] = v.to(accelerator.device)
labels = batch['input_ids'].clone()
labels[labels==tokenizer.pad_id] = -100
out = model(**batch, labels=labels)
total_loss = out.loss
losses = {
'total_loss': total_loss
}
accelerator.backward(total_loss)
optim.step()
scheduler.step()
optim.zero_grad()
if accelerator.sync_gradients:
global_step += 1
if data_step % log_interval == 0 and accelerator.is_main_process:
cost_time = time.time() - start_time
start_time = time.time()
tokens = train_batch_size * log_interval * max_length
wandb.log({'Training/Token per second per gpu': tokens/cost_time})
for k, v in losses.items():
wandb.log({'Losses/{}'.format(k): v})
current_lr = optim.param_groups[0]['lr']
wandb.log({'Training/LR': current_lr})
if optim.scaler is not None:
wandb.log({'Training/Loss Scale': optim.scaler.get_scale()})
wandb.log({'Training/Data Step': data_step})
wandb.log({'Training/Global Step': global_step})
accelerator.print('Global Step: {}, Data Step: {}, Loss: {}, Token per second per gpu: {}'.format(
global_step, data_step, losses['total_loss'], tokens/cost_time))
if data_step % eval_interval == 0 and accelerator.is_main_process:
text_table = wandb.Table(columns=['question', 'pred'])
model.eval()
with torch.no_grad():
for data in val_set:
raw_inputs = data
inputs_len = len(raw_inputs)
inputs = tokenizer(raw_inputs, return_tensors=True, add_special_tokens=False)
for k, v in inputs.items():
inputs[k] = v.to(accelerator.device)
pred = model.generate(**inputs, max_new_tokens=256, do_sample=True, repetition_penalty=2.0)
pred = tokenizer.decode(pred.cpu())[0]
pred = pred[inputs_len:]
text_table.add_data(raw_inputs, pred)
wandb.log({'Predictions on {}'.format(global_step) : text_table})
if data_step % save_interval == 0 and data_step > 0 and accelerator.is_main_process:
if not os.path.isdir(work_dir):
os.mkdir(work_dir)
torch.save(raw_model.state_dict(), '{}/{}.pt'.format(work_dir, global_step))
wandb.finish()
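
Checkpoints are plain state_dict files written to work_dir as <global_step>.pt. A minimal loading sketch for offline generation (the step number is a placeholder, and the extra LlamaConfig fields assume the patched transformers fork pinned in requirements.txt):

# Minimal, illustrative loading sketch; assumes the patched transformers fork from
# requirements.txt, which adds use_stable_embedding / shared_input_output_embedding.
import torch
import sentencepiece as spm
from transformers import LlamaForCausalLM, LlamaConfig
from dataset.tokenizer import Tokenizer

sp_model = spm.SentencePieceProcessor(model_file='configs/10w_vocab_wudao5_pile10.model')
tokenizer = Tokenizer(sp_model)
config = LlamaConfig(vocab_size=tokenizer.vocab_size,
                     pad_token_id=tokenizer.pad_id,
                     rms_norm_eps=1e-5,
                     use_stable_embedding=True,
                     shared_input_output_embedding=True)
model = LlamaForCausalLM(config)
# '800.pt' is a placeholder for whichever global step was saved.
model.load_state_dict(torch.load('data/saved_ckpt/800.pt', map_location='cpu'))
model.eval()
inputs = tokenizer('白日依山尽,', return_tensors=True, add_special_tokens=False)
with torch.no_grad():
    pred = model.generate(**inputs, max_new_tokens=64, do_sample=True)
print(tokenizer.decode(pred)[0])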

19
requirements.txt Normal file

@@ -0,0 +1,19 @@
torch==1.13.1
torchvision
torchaudio
zstandard
accelerate
wandb
deepspeed
absl-py
torchinfo
scikit-learn
datasets==2.10.1
matplotlib
seaborn
sentencepiece
triton
functorch==1.13.1
xformers
git+https://github.com/Bayes-Song/transformers.git