From 4a1e7bb44b607c58addfc4df8281a993ba56460b Mon Sep 17 00:00:00 2001 From: LiangSong Date: Sat, 6 May 2023 23:37:17 +0800 Subject: [PATCH] Optimized the structure of configs, added support for deepspeed stage3, reduced memory usage by using Auto class to load models, and added support for training 65B models. --- README.md | 8 ++-- README_zh.md | 8 ++-- chat_server.py | 10 ++--- .../ds_stage1.yaml} | 0 .../accelerate_configs/ds_stage3_offload.yaml | 18 ++++++++ configs/instruct_config.yaml | 10 ++--- configs/model_configs/13B.json | 26 +++++++++++ configs/model_configs/33B.json | 26 +++++++++++ configs/model_configs/65B.json | 26 +++++++++++ configs/model_configs/7B.json | 26 +++++++++++ configs/pretrain_config.yaml | 12 ++--- .../10w_vocab_wudao5_pile10.model | Bin .../4w_cn_vocab_wudao15.model | Bin .../llama_tokenizer.model | Bin .../llama_tokenizer_extended.model | Bin dataset/dataset.py | 6 +-- solver/trainer.py | 12 ++--- train_lm.py | 41 ++++++++---------- utils/convert_ckpt.py | 12 ++++- utils/merge_tokenizer.py | 12 +++-- utils/train_tokenizer.py | 4 +- 21 files changed, 190 insertions(+), 67 deletions(-) rename configs/{default_config.yaml => accelerate_configs/ds_stage1.yaml} (100%) create mode 100644 configs/accelerate_configs/ds_stage3_offload.yaml create mode 100644 configs/model_configs/13B.json create mode 100644 configs/model_configs/33B.json create mode 100644 configs/model_configs/65B.json create mode 100644 configs/model_configs/7B.json rename configs/{ => tokenizer_models}/10w_vocab_wudao5_pile10.model (100%) rename configs/{ => tokenizer_models}/4w_cn_vocab_wudao15.model (100%) rename configs/{ => tokenizer_models}/llama_tokenizer.model (100%) rename configs/{ => tokenizer_models}/llama_tokenizer_extended.model (100%) diff --git a/README.md b/README.md index 3bd0694..a8e0b8b 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ * @Author: LiangSong(sl12160010@gmail.com) * @Date: 2023-03-10 21:18:35 * @LastEditors: LiangSong(sl12160010@gmail.com) - * @LastEditTime: 2023-05-04 22:55:25 + * @LastEditTime: 2023-05-06 23:33:11 * @FilePath: /Open-Llama/README.md * @Description: * @@ -211,7 +211,7 @@ Finally, we referenced [PALM](https://arxiv.org/abs/2204.02311) and employed Sha We use multi-GPU parallel training based on the Accelerate library, with the following start command: ```bash -accelerate launch --config_file configs/default_config.yaml train_lm.py --config configs/pretrain_config.yaml +accelerate launch --config_file configs/accelerate_configs/ds_stage1.yaml train_lm.py --config configs/pretrain_config.yaml ``` In some cases, you may need to specify the following parameters: @@ -225,7 +225,7 @@ In some cases, you may need to specify the following parameters: We use [Wandb](https://wandb.ai/) for visualizing training. You need to modify the WANDB_API_KEY environment variable yourself. -Among them, we use DeepSpeed stage1 to reduce memory usage. For Accelerate-related configurations, see configs/default_config.yaml. +Among them, we use DeepSpeed stage1 to reduce memory usage. For Accelerate-related configurations, see configs/accelerate_configs. Training related hyperparameters can be found in configs/pretrain_config.yaml. 
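Note on the launch commands above: later in this patch train_lm.py renames the --config flag to --train_config and adds a --model_config flag (defaulting to configs/model_configs/7B.json), while the README hunks above still pass --config. A pretraining launch consistent with the patched script would presumably look like the following sketch, using only paths and flags introduced in this patch:

```bash
# Sketch of a pretraining launch against the patched train_lm.py:
# --train_config replaces the old --config flag; --model_config selects one of the
# new configs/model_configs/*.json files (7B.json is the script's default).
accelerate launch --config_file configs/accelerate_configs/ds_stage1.yaml \
    train_lm.py \
    --train_config configs/pretrain_config.yaml \
    --model_config configs/model_configs/7B.json
```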
@@ -279,7 +279,7 @@ user: {prompt}\nsystem: {completion} The startup command is basically the same as pre-training: ```bash -accelerate launch --config_file configs/default_config.yaml train_lm.py --config configs/instruct_config.yaml +accelerate launch --config_file configs/accelerate_configs/ds_stage1.yaml train_lm.py --config configs/instruct_config.yaml ``` In some cases, you may need to specify the following parameters: diff --git a/README_zh.md b/README_zh.md index c7145b6..cbee8e2 100644 --- a/README_zh.md +++ b/README_zh.md @@ -2,7 +2,7 @@ * @Author: LiangSong(sl12160010@gmail.com) * @Date: 2023-03-10 21:18:35 * @LastEditors: LiangSong(sl12160010@gmail.com) - * @LastEditTime: 2023-05-04 22:55:32 + * @LastEditTime: 2023-05-06 23:32:31 * @FilePath: /Open-Llama/README_zh.md * @Description: * @@ -201,7 +201,7 @@ Self Attention的计算,这对于性能有明显的提升,提升大约30%。 ### 预训练 我们基于Accelerate库进行多GPU并行训练,启动命令如下 ```bash -accelerate launch --config_file configs/default_config.yaml train_lm.py --config configs/pretrain_config.yaml +accelerate launch --config_file configs/accelerate_configs/ds_stage1.yaml train_lm.py --config configs/pretrain_config.yaml ``` 某些情况下可能需要指定下列参数 ``` @@ -213,7 +213,7 @@ accelerate launch --config_file configs/default_config.yaml train_lm.py --config ``` 我们使用[Wandb](https://wandb.ai/)进行训练的可视化,需要自行修改环境变量 WANDB_API_KEY 。 -其中我们使用了DeepSpeed stage1以减少显存占用。accelerate相关配置可见configs/default_config.yaml。 +其中我们使用了DeepSpeed stage1以减少显存占用。accelerate相关配置可见configs/accelerate_configs。 训练相关超参数可见configs/pretrain_config.yaml @@ -263,7 +263,7 @@ user: {prompt}\nsystem: {completion} 启动命令和预训练基本一致 ```bash -accelerate launch --config_file configs/default_config.yaml train_lm.py --config configs/instruct_config.yaml +accelerate launch --config_file configs/accelerate_configs/ds_stage1.yaml train_lm.py --config configs/instruct_config.yaml ``` 某些情况下可能需要指定下列参数 ``` diff --git a/chat_server.py b/chat_server.py index c5ab835..4cac42a 100644 --- a/chat_server.py +++ b/chat_server.py @@ -2,7 +2,7 @@ Author: LiangSong(sl12160010@gmail.com) Date: 2023-04-06 22:30:10 LastEditors: LiangSong(sl12160010@gmail.com) -LastEditTime: 2023-05-04 22:44:58 +LastEditTime: 2023-05-06 23:30:57 FilePath: /Open-Llama/chat_server.py Description: @@ -15,7 +15,7 @@ from transformers import OpenLlamaForCausalLM, OpenLlamaConfig, LlamaTokenizer tokenizer = LlamaTokenizer( - "configs/10w_vocab_wudao5_pile10.model", + "configs/tokenizer_models/10w_vocab_wudao5_pile10.model", pad_token="", add_bos_token=False, add_eos_token=True, @@ -42,7 +42,7 @@ if "module" in ckpt: raw_model.load_state_dict(ckpt) raw_model.eval() model = raw_model.half().cuda() -logging.warn("ready") +logging.warning("ready") with gr.Blocks() as demo: @@ -59,7 +59,7 @@ with gr.Blocks() as demo: clear = gr.Button("Clear") def user(user_message, history): - logging.warn(user_message) + logging.warning(user_message) return "", history + [[user_message, None]] def bot(history): @@ -92,7 +92,7 @@ with gr.Blocks() as demo: pred = model.generate(input_ids=context, max_new_tokens=1024, do_sample=True) pred = pred[:, inputs_len:] pred = tokenizer.decode(pred.cpu()[0], skip_special_tokens=True) - logging.warn(pred) + logging.warning(pred) bot_message = pred history[-1][1] = bot_message return history diff --git a/configs/default_config.yaml b/configs/accelerate_configs/ds_stage1.yaml similarity index 100% rename from configs/default_config.yaml rename to configs/accelerate_configs/ds_stage1.yaml diff --git a/configs/accelerate_configs/ds_stage3_offload.yaml 
b/configs/accelerate_configs/ds_stage3_offload.yaml new file mode 100644 index 0000000..d7d3cc7 --- /dev/null +++ b/configs/accelerate_configs/ds_stage3_offload.yaml @@ -0,0 +1,18 @@ +compute_environment: LOCAL_MACHINE +deepspeed_config: + deepspeed_multinode_launcher: standard + gradient_clipping: 1.0 + offload_optimizer_device: cpu + offload_param_device: cpu + zero3_init_flag: true + zero_stage: 3 +distributed_type: DEEPSPEED +fsdp_config: {} +machine_rank: 0 +main_training_function: main +mixed_precision: bf16 +num_machines: 1 +num_processes: 8 +rdzv_backend: static +same_network: true +use_cpu: false \ No newline at end of file diff --git a/configs/instruct_config.yaml b/configs/instruct_config.yaml index f790f3b..a01a7fc 100644 --- a/configs/instruct_config.yaml +++ b/configs/instruct_config.yaml @@ -7,13 +7,8 @@ data: concat_multiple_sequence: True num_sequences: 50 seq_length: 2048 - tokenizer_model_path: "configs/llama_tokenizer_extended.model" -model: - initializer_range: 1.0e-2 - hidden_dropout_prob: 0.1 - attention_dropout_prob: 0.1 - use_stable_embedding: False - shared_input_output_embedding: False + tokenizer_model_path: "configs/tokenizer_models/llama_tokenizer_extended.model" + split_by_shard: False train: train_batch_size: 2 # 1B token for 1 epoch, 5epoch @@ -27,6 +22,7 @@ train: gradient_accumulation_steps: 1 prefetch_factor: 100 train_and_eval: False + gradient_checkpointing_enable: False # global step log_interval: 50 eval_interval: 500 diff --git a/configs/model_configs/13B.json b/configs/model_configs/13B.json new file mode 100644 index 0000000..f8682ee --- /dev/null +++ b/configs/model_configs/13B.json @@ -0,0 +1,26 @@ +{ + "architectures": [ + "OpenLlamaForCausalLM" + ], + "attention_dropout_prob": 0.1, + "bos_token_id": 1, + "eos_token_id": 2, + "hidden_act": "silu", + "hidden_dropout_prob": 0.1, + "hidden_size": 5120, + "initializer_range": 1e-2, + "intermediate_size": 13824, + "max_position_embeddings": 2048, + "model_type": "open-llama", + "num_attention_heads": 40, + "num_hidden_layers": 40, + "pad_token_id": 32000, + "rms_norm_eps": 1e-05, + "shared_input_output_embedding": false, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "use_cache": true, + "use_memorry_efficient_attention": true, + "use_stable_embedding": false, + "vocab_size": 68762 +} diff --git a/configs/model_configs/33B.json b/configs/model_configs/33B.json new file mode 100644 index 0000000..03902f9 --- /dev/null +++ b/configs/model_configs/33B.json @@ -0,0 +1,26 @@ +{ + "architectures": [ + "OpenLlamaForCausalLM" + ], + "attention_dropout_prob": 0.1, + "bos_token_id": 1, + "eos_token_id": 2, + "hidden_act": "silu", + "hidden_dropout_prob": 0.1, + "hidden_size": 6656, + "initializer_range": 1e-2, + "intermediate_size": 17920, + "max_position_embeddings": 2048, + "model_type": "open-llama", + "num_attention_heads": 52, + "num_hidden_layers": 60, + "pad_token_id": 32000, + "rms_norm_eps": 1e-05, + "shared_input_output_embedding": false, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "use_cache": true, + "use_memorry_efficient_attention": true, + "use_stable_embedding": false, + "vocab_size": 68762 +} diff --git a/configs/model_configs/65B.json b/configs/model_configs/65B.json new file mode 100644 index 0000000..a075389 --- /dev/null +++ b/configs/model_configs/65B.json @@ -0,0 +1,26 @@ +{ + "architectures": [ + "OpenLlamaForCausalLM" + ], + "attention_dropout_prob": 0.1, + "bos_token_id": 1, + "eos_token_id": 2, + "hidden_act": "silu", + "hidden_dropout_prob": 0.1, + 
"hidden_size": 8192, + "initializer_range": 1e-2, + "intermediate_size": 22016, + "max_position_embeddings": 2048, + "model_type": "open-llama", + "num_attention_heads": 64, + "num_hidden_layers": 80, + "pad_token_id": 32000, + "rms_norm_eps": 1e-05, + "shared_input_output_embedding": false, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "use_cache": true, + "use_memorry_efficient_attention": true, + "use_stable_embedding": false, + "vocab_size": 68762 +} diff --git a/configs/model_configs/7B.json b/configs/model_configs/7B.json new file mode 100644 index 0000000..d354dd6 --- /dev/null +++ b/configs/model_configs/7B.json @@ -0,0 +1,26 @@ +{ + "architectures": [ + "OpenLlamaForCausalLM" + ], + "attention_dropout_prob": 0.1, + "bos_token_id": 1, + "eos_token_id": 2, + "hidden_act": "silu", + "hidden_dropout_prob": 0.1, + "hidden_size": 4096, + "initializer_range": 1e-2, + "intermediate_size": 11008, + "max_position_embeddings": 2048, + "model_type": "open-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "pad_token_id": 32000, + "rms_norm_eps": 1e-05, + "shared_input_output_embedding": false, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "use_cache": true, + "use_memorry_efficient_attention": true, + "use_stable_embedding": false, + "vocab_size": 68762 +} diff --git a/configs/pretrain_config.yaml b/configs/pretrain_config.yaml index d57a147..5d0ae96 100644 --- a/configs/pretrain_config.yaml +++ b/configs/pretrain_config.yaml @@ -9,13 +9,8 @@ data: concat_multiple_sequence: True num_sequences: 10 seq_length: 2048 - tokenizer_model_path: "configs/llama_tokenizer_extended.model" -model: - initializer_range: 1.0e-2 - hidden_dropout_prob: 0.1 - attention_dropout_prob: 0.1 - use_stable_embedding: False - shared_input_output_embedding: False + tokenizer_model_path: "configs/tokenizer_models/llama_tokenizer_extended.model" + split_by_shard: False train: train_batch_size: 2 num_training_steps: 500000 @@ -28,7 +23,8 @@ train: train_num_workers: 16 gradient_accumulation_steps: 12 prefetch_factor: 100 - train_and_eval: True + train_and_eval: False + gradient_checkpointing_enable: True # global step log_interval: 5 eval_interval: 500 diff --git a/configs/10w_vocab_wudao5_pile10.model b/configs/tokenizer_models/10w_vocab_wudao5_pile10.model similarity index 100% rename from configs/10w_vocab_wudao5_pile10.model rename to configs/tokenizer_models/10w_vocab_wudao5_pile10.model diff --git a/configs/4w_cn_vocab_wudao15.model b/configs/tokenizer_models/4w_cn_vocab_wudao15.model similarity index 100% rename from configs/4w_cn_vocab_wudao15.model rename to configs/tokenizer_models/4w_cn_vocab_wudao15.model diff --git a/configs/llama_tokenizer.model b/configs/tokenizer_models/llama_tokenizer.model similarity index 100% rename from configs/llama_tokenizer.model rename to configs/tokenizer_models/llama_tokenizer.model diff --git a/configs/llama_tokenizer_extended.model b/configs/tokenizer_models/llama_tokenizer_extended.model similarity index 100% rename from configs/llama_tokenizer_extended.model rename to configs/tokenizer_models/llama_tokenizer_extended.model diff --git a/dataset/dataset.py b/dataset/dataset.py index 3e8e09b..834bdea 100644 --- a/dataset/dataset.py +++ b/dataset/dataset.py @@ -2,7 +2,7 @@ Author: LiangSong(sl12160010@gmail.com) Date: 2023-04-24 20:05:21 LastEditors: LiangSong(sl12160010@gmail.com) -LastEditTime: 2023-05-04 09:17:21 +LastEditTime: 2023-05-06 23:30:37 FilePath: /Open-Llama/dataset/dataset.py Description: @@ -105,7 +105,7 @@ def 
instruct_transform(batch): targets = batch["message_2"][0] text = "user:{}\nsystem:{}".format(inputs.strip(), targets.strip()) texts = [text] - # grade-school-math-instructions preprocess + # grade-school-math-instructions preprocess elif "INSTRUCTION" in batch and "RESPONSE" in batch: inputs = batch["INSTRUCTION"][0] targets = batch["RESPONSE"][0] @@ -291,7 +291,7 @@ if __name__ == "__main__": "seq_length": 2048, } tokenizer = LlamaTokenizer( - "configs/llama_tokenizer_extended.model", + "configs/tokenizer_models/llama_tokenizer_extended.model", pad_token="", add_bos_token=False, add_eos_token=True, diff --git a/solver/trainer.py b/solver/trainer.py index f51fd2e..6974a58 100644 --- a/solver/trainer.py +++ b/solver/trainer.py @@ -2,7 +2,7 @@ Author: LiangSong(sl12160010@gmail.com) Date: 2023-04-24 20:05:21 LastEditors: LiangSong(sl12160010@gmail.com) -LastEditTime: 2023-05-06 09:45:30 +LastEditTime: 2023-05-06 23:04:14 FilePath: /Open-Llama/solver/trainer.py Description: @@ -26,7 +26,7 @@ class Trainer: self.train_loader = train_loader self.tokenizer = tokenizer self.accelerator = accelerator - self.train_and_eval = config.get("train_and_eval", False) + self.train_and_eval = config["train"].get("train_and_eval", False) self.gradient_accumulation_steps = config["train"].get( "gradient_accumulation_steps", 1 ) @@ -43,7 +43,7 @@ class Trainer: self.config["save_interval"] * accelerator.gradient_accumulation_steps ) self.work_dir = self.config["work_dir"] - self.get_model_info() + # self.get_model_info() if accelerator.is_main_process: wandb.init(project=self.config["project_name"]) @@ -104,12 +104,12 @@ class Trainer: self.accelerator.load_state(self.work_dir) self.global_step = self.scheduler.scheduler._step_count - 1 self.global_step = self.global_step // self.accelerator.num_processes - logging.warn("Restored ckpt from {}".format(self.work_dir)) + logging.warning("Restored ckpt from {}".format(self.work_dir)) except: - logging.warn("No ckpt found in {}".format(self.work_dir)) + logging.warning("No ckpt found in {}".format(self.work_dir)) if self.global_step > 0: skip_steps = self.global_step * self.gradient_accumulation_steps - logging.warn("Skiped {} steps.".format(skip_steps)) + logging.warning("Skiped {} steps.".format(skip_steps)) self.train_loader_skiped = self.accelerator.skip_first_batches( self.train_loader, num_batches=skip_steps ) diff --git a/train_lm.py b/train_lm.py index e5ad3b8..ebf5ad8 100644 --- a/train_lm.py +++ b/train_lm.py @@ -2,7 +2,7 @@ Author: LiangSong(sl12160010@gmail.com) Date: 2023-04-12 19:12:42 LastEditors: LiangSong(sl12160010@gmail.com) -LastEditTime: 2023-05-04 09:19:15 +LastEditTime: 2023-05-06 23:08:42 FilePath: /Open-Llama/train_lm.py Description: @@ -16,17 +16,20 @@ from absl import flags from accelerate import Accelerator from torch.utils.data import DataLoader from datasets.distributed import split_dataset_by_node -from transformers import OpenLlamaForCausalLM, OpenLlamaConfig, LlamaTokenizer +from transformers import AutoConfig, AutoModelForCausalLM, LlamaTokenizer from dataset.dataset import construct_dataset from solver.trainer import Trainer FLAGS = flags.FLAGS -flags.DEFINE_string("config", None, "Training config path") +flags.DEFINE_string("train_config", None, "Training config path") +flags.DEFINE_string( + "model_config", "configs/model_configs/7B.json", "Model config path" +) def main(argv): - with open(FLAGS.config, "r", encoding="utf-8") as fp: + with open(FLAGS.train_config, "r", encoding="utf-8") as fp: config = yaml.load(fp, 
Loader=yaml.FullLoader) accelerator = Accelerator( @@ -61,26 +64,18 @@ def main(argv): ) # smaller initializer_range make training more stable # add stabel embedding to token embedding - raw_model = OpenLlamaForCausalLM( - OpenLlamaConfig( - vocab_size=tokenizer.vocab_size, - initializer_range=config["model"]["initializer_range"], - pad_token_id=tokenizer.pad_token_id, - rms_norm_eps=1e-5, - hidden_dropout_prob=config["model"]["hidden_dropout_prob"], - attention_dropout_prob=config["model"]["attention_dropout_prob"], - use_stable_embedding=config["model"]["use_stable_embedding"], - shared_input_output_embedding=config["model"][ - "shared_input_output_embedding" - ], - ) - ) + model_config = AutoConfig.from_pretrained(FLAGS.model_config) + model_config.vocab_size = tokenizer.vocab_size + model_config.pad_token_id = tokenizer.pad_token_id if config["train"]["ckpt"] is not None: - ckpt = torch.load(config["train"]["ckpt"], map_location="cpu") - if "module" in ckpt: - ckpt = ckpt["module"] - raw_model.load_state_dict(ckpt) - logging.warn("Loaded ckpt from: {}".format(config["train"]["ckpt"])) + raw_model = AutoModelForCausalLM.from_pretrained( + config["train"]["ckpt"], config=model_config + ) + logging.warning("Loaded ckpt from: {}".format(config["train"]["ckpt"])) + else: + raw_model = AutoModelForCausalLM.from_config(model_config) + if config["train"].get("gradient_checkpointing_enable", False): + raw_model.gradient_checkpointing_enable() trainer = Trainer(config, raw_model, train_loader, tokenizer, accelerator) trainer.train() diff --git a/utils/convert_ckpt.py b/utils/convert_ckpt.py index 46307ce..8a8f972 100644 --- a/utils/convert_ckpt.py +++ b/utils/convert_ckpt.py @@ -1,9 +1,19 @@ +""" +Author: LiangSong(sl12160010@gmail.com) +Date: 2023-04-28 19:55:13 +LastEditors: LiangSong(sl12160010@gmail.com) +LastEditTime: 2023-05-06 23:30:29 +FilePath: /Open-Llama/utils/convert_ckpt.py +Description: + +Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved. 
+""" import torch import sentencepiece as spm sp_model = spm.SentencePieceProcessor( - model_file="configs/llama_tokenizer_extended.model" + model_file="configs/tokenizer_models/llama_tokenizer_extended.model" ) merged_vocab_size = sp_model.vocab_size() ckpt = torch.load("data/llama_raw_ckpt/7B/consolidated.00.pth") diff --git a/utils/merge_tokenizer.py b/utils/merge_tokenizer.py index c26bc12..faa5fb5 100644 --- a/utils/merge_tokenizer.py +++ b/utils/merge_tokenizer.py @@ -3,21 +3,25 @@ import sentencepiece as spm from sentencepiece import sentencepiece_model_pb2 as model raw_model = model.ModelProto() -raw_model.ParseFromString(open("configs/llama_tokenizer.model", "rb").read()) +raw_model.ParseFromString( + open("configs/tokenizer_models/llama_tokenizer.model", "rb").read() +) exist_pieces = set([p.piece for p in raw_model.pieces]) cn_model = model.ModelProto() -cn_model.ParseFromString(open("configs/4w_cn_vocab_wudao15.model", "rb").read()) +cn_model.ParseFromString( + open("configs/tokenizer_models/4w_cn_vocab_wudao15.model", "rb").read() +) for p in tqdm(cn_model.pieces, total=len(cn_model.pieces)): if p.piece not in exist_pieces: raw_model.pieces.append(p) -with open("configs/llama_tokenizer_extended.model", "wb") as f: +with open("configs/tokenizer_models/llama_tokenizer_extended.model", "wb") as f: f.write(raw_model.SerializeToString()) sp_model = spm.SentencePieceProcessor( - model_file="configs/llama_tokenizer_extended.model" + model_file="configs/tokenizer_models/llama_tokenizer_extended.model" ) print("merged vocab size: {}".format(sp_model.vocab_size())) diff --git a/utils/train_tokenizer.py b/utils/train_tokenizer.py index 2d02439..6dd1991 100644 --- a/utils/train_tokenizer.py +++ b/utils/train_tokenizer.py @@ -2,7 +2,7 @@ Author: LiangSong(sl12160010@gmail.com) Date: 2023-03-24 20:49:03 LastEditors: LiangSong(sl12160010@gmail.com) -LastEditTime: 2023-05-04 08:42:21 +LastEditTime: 2023-05-06 23:34:14 FilePath: /Open-Llama/utils/train_tokenizer.py Description: @@ -67,7 +67,7 @@ spm.SentencePieceTrainer.train( ) # Serialize the model as file. -with open("configs/10w_vocab_wudao5_pile10.model", "wb") as f: +with open("configs/tokenizer_models/10w_vocab_wudao5_pile10.model", "wb") as f: f.write(model.getvalue()) # Directly load the model from serialized model.
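Putting the new pieces together: the commit message advertises 65B training, and the files added here supply the ingredients for it — a ZeRO stage-3 Accelerate config with CPU offload (configs/accelerate_configs/ds_stage3_offload.yaml), a 65B model config (configs/model_configs/65B.json), gradient checkpointing toggled via gradient_checkpointing_enable in configs/pretrain_config.yaml, and the Auto-class loading path in train_lm.py. A launch for that setup would presumably look like the sketch below; it assumes the default paths introduced in this patch, and num_processes/num_machines in the Accelerate config would need to match the actual hardware:

```bash
# Sketch: pretraining the 65B config with DeepSpeed ZeRO stage 3 + CPU offload.
# Stage 3 partitions parameters, gradients, and optimizer states across ranks and,
# with this config, offloads parameters and optimizer state to CPU, which is what
# makes the 65B model fit; gradient checkpointing (enabled in pretrain_config.yaml)
# trades extra compute for further memory savings.
accelerate launch --config_file configs/accelerate_configs/ds_stage3_offload.yaml \
    train_lm.py \
    --train_config configs/pretrain_config.yaml \
    --model_config configs/model_configs/65B.json
```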