From 4a1e7bb44b607c58addfc4df8281a993ba56460b Mon Sep 17 00:00:00 2001 From: LiangSong Date: Sat, 6 May 2023 23:37:17 +0800 Subject: [PATCH] Optimized the structure of configs, added support for deepspeed stage3, reduced memory usage by using Auto class to load models, and added support for training 65B models. --- README.md | 8 ++-- README_zh.md | 8 ++-- chat_server.py | 10 ++--- .../ds_stage1.yaml} | 0 .../accelerate_configs/ds_stage3_offload.yaml | 18 ++++++++ configs/instruct_config.yaml | 10 ++--- configs/model_configs/13B.json | 26 +++++++++++ configs/model_configs/33B.json | 26 +++++++++++ configs/model_configs/65B.json | 26 +++++++++++ configs/model_configs/7B.json | 26 +++++++++++ configs/pretrain_config.yaml | 12 ++--- .../10w_vocab_wudao5_pile10.model | Bin .../4w_cn_vocab_wudao15.model | Bin .../llama_tokenizer.model | Bin .../llama_tokenizer_extended.model | Bin dataset/dataset.py | 6 +-- solver/trainer.py | 12 ++--- train_lm.py | 41 ++++++++---------- utils/convert_ckpt.py | 12 ++++- utils/merge_tokenizer.py | 12 +++-- utils/train_tokenizer.py | 4 +- 21 files changed, 190 insertions(+), 67 deletions(-) rename configs/{default_config.yaml => accelerate_configs/ds_stage1.yaml} (100%) create mode 100644 configs/accelerate_configs/ds_stage3_offload.yaml create mode 100644 configs/model_configs/13B.json create mode 100644 configs/model_configs/33B.json create mode 100644 configs/model_configs/65B.json create mode 100644 configs/model_configs/7B.json rename configs/{ => tokenizer_models}/10w_vocab_wudao5_pile10.model (100%) rename configs/{ => tokenizer_models}/4w_cn_vocab_wudao15.model (100%) rename configs/{ => tokenizer_models}/llama_tokenizer.model (100%) rename configs/{ => tokenizer_models}/llama_tokenizer_extended.model (100%) diff --git a/README.md b/README.md index 3bd0694..a8e0b8b 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ * @Author: LiangSong(sl12160010@gmail.com) * @Date: 2023-03-10 21:18:35 * @LastEditors: LiangSong(sl12160010@gmail.com) - * @LastEditTime: 2023-05-04 22:55:25 + * @LastEditTime: 2023-05-06 23:33:11 * @FilePath: /Open-Llama/README.md * @Description: * @@ -211,7 +211,7 @@ Finally, we referenced [PALM](https://arxiv.org/abs/2204.02311) and employed Sha We use multi-GPU parallel training based on the Accelerate library, with the following start command: ```bash -accelerate launch --config_file configs/default_config.yaml train_lm.py --config configs/pretrain_config.yaml +accelerate launch --config_file configs/accelerate_configs/ds_stage1.yaml train_lm.py --config configs/pretrain_config.yaml ``` In some cases, you may need to specify the following parameters: @@ -225,7 +225,7 @@ In some cases, you may need to specify the following parameters: We use [Wandb](https://wandb.ai/) for visualizing training. You need to modify the WANDB_API_KEY environment variable yourself. -Among them, we use DeepSpeed stage1 to reduce memory usage. For Accelerate-related configurations, see configs/default_config.yaml. +Among them, we use DeepSpeed stage1 to reduce memory usage. For Accelerate-related configurations, see configs/accelerate_configs. Training related hyperparameters can be found in configs/pretrain_config.yaml. 
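Note on the launch commands above: later in this patch train_lm.py renames the --config flag to --train_config and adds a --model_config flag (defaulting to configs/model_configs/7B.json), while the README hunks above still pass --config. A pretraining launch consistent with the patched script would presumably look like the following sketch, using only paths and flags introduced in this patch:

```bash
# Sketch of a pretraining launch against the patched train_lm.py:
# --train_config replaces the old --config flag; --model_config selects one of the
# new configs/model_configs/*.json files (7B.json is the script's default).
accelerate launch --config_file configs/accelerate_configs/ds_stage1.yaml \
    train_lm.py \
    --train_config configs/pretrain_config.yaml \
    --model_config configs/model_configs/7B.json
```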
@@ -279,7 +279,7 @@ user: {prompt}\nsystem: {completion} The startup command is basically the same as pre-training: ```bash -accelerate launch --config_file configs/default_config.yaml train_lm.py --config configs/instruct_config.yaml +accelerate launch --config_file configs/accelerate_configs/ds_stage1.yaml train_lm.py --config configs/instruct_config.yaml ``` In some cases, you may need to specify the following parameters: diff --git a/README_zh.md b/README_zh.md index c7145b6..cbee8e2 100644 --- a/README_zh.md +++ b/README_zh.md @@ -2,7 +2,7 @@ * @Author: LiangSong(sl12160010@gmail.com) * @Date: 2023-03-10 21:18:35 * @LastEditors: LiangSong(sl12160010@gmail.com) - * @LastEditTime: 2023-05-04 22:55:32 + * @LastEditTime: 2023-05-06 23:32:31 * @FilePath: /Open-Llama/README_zh.md * @Description: * @@ -201,7 +201,7 @@ Self Attention的计算,这对于性能有明显的提升,提升大约30%。 ### 预训练 我们基于Accelerate库进行多GPU并行训练,启动命令如下 ```bash -accelerate launch --config_file configs/default_config.yaml train_lm.py --config configs/pretrain_config.yaml +accelerate launch --config_file configs/accelerate_configs/ds_stage1.yaml train_lm.py --config configs/pretrain_config.yaml ``` 某些情况下可能需要指定下列参数 ``` @@ -213,7 +213,7 @@ accelerate launch --config_file configs/default_config.yaml train_lm.py --config ``` 我们使用[Wandb](https://wandb.ai/)进行训练的可视化,需要自行修改环境变量 WANDB_API_KEY 。 -其中我们使用了DeepSpeed stage1以减少显存占用。accelerate相关配置可见configs/default_config.yaml。 +其中我们使用了DeepSpeed stage1以减少显存占用。accelerate相关配置可见configs/accelerate_configs。 训练相关超参数可见configs/pretrain_config.yaml @@ -263,7 +263,7 @@ user: {prompt}\nsystem: {completion} 启动命令和预训练基本一致 ```bash -accelerate launch --config_file configs/default_config.yaml train_lm.py --config configs/instruct_config.yaml +accelerate launch --config_file configs/accelerate_configs/ds_stage1.yaml train_lm.py --config configs/instruct_config.yaml ``` 某些情况下可能需要指定下列参数 ``` diff --git a/chat_server.py b/chat_server.py index c5ab835..4cac42a 100644 --- a/chat_server.py +++ b/chat_server.py @@ -2,7 +2,7 @@ Author: LiangSong(sl12160010@gmail.com) Date: 2023-04-06 22:30:10 LastEditors: LiangSong(sl12160010@gmail.com) -LastEditTime: 2023-05-04 22:44:58 +LastEditTime: 2023-05-06 23:30:57 FilePath: /Open-Llama/chat_server.py Description: @@ -15,7 +15,7 @@ from transformers import OpenLlamaForCausalLM, OpenLlamaConfig, LlamaTokenizer tokenizer = LlamaTokenizer( - "configs/10w_vocab_wudao5_pile10.model", + "configs/tokenizer_models/10w_vocab_wudao5_pile10.model", pad_token="", add_bos_token=False, add_eos_token=True, @@ -42,7 +42,7 @@ if "module" in ckpt: raw_model.load_state_dict(ckpt) raw_model.eval() model = raw_model.half().cuda() -logging.warn("ready") +logging.warning("ready") with gr.Blocks() as demo: @@ -59,7 +59,7 @@ with gr.Blocks() as demo: clear = gr.Button("Clear") def user(user_message, history): - logging.warn(user_message) + logging.warning(user_message) return "", history + [[user_message, None]] def bot(history): @@ -92,7 +92,7 @@ with gr.Blocks() as demo: pred = model.generate(input_ids=context, max_new_tokens=1024, do_sample=True) pred = pred[:, inputs_len:] pred = tokenizer.decode(pred.cpu()[0], skip_special_tokens=True) - logging.warn(pred) + logging.warning(pred) bot_message = pred history[-1][1] = bot_message return history diff --git a/configs/default_config.yaml b/configs/accelerate_configs/ds_stage1.yaml similarity index 100% rename from configs/default_config.yaml rename to configs/accelerate_configs/ds_stage1.yaml diff --git a/configs/accelerate_configs/ds_stage3_offload.yaml 
b/configs/accelerate_configs/ds_stage3_offload.yaml new file mode 100644 index 0000000..d7d3cc7 --- /dev/null +++ b/configs/accelerate_configs/ds_stage3_offload.yaml @@ -0,0 +1,18 @@ +compute_environment: LOCAL_MACHINE +deepspeed_config: + deepspeed_multinode_launcher: standard + gradient_clipping: 1.0 + offload_optimizer_device: cpu + offload_param_device: cpu + zero3_init_flag: true + zero_stage: 3 +distributed_type: DEEPSPEED +fsdp_config: {} +machine_rank: 0 +main_training_function: main +mixed_precision: bf16 +num_machines: 1 +num_processes: 8 +rdzv_backend: static +same_network: true +use_cpu: false \ No newline at end of file diff --git a/configs/instruct_config.yaml b/configs/instruct_config.yaml index f790f3b..a01a7fc 100644 --- a/configs/instruct_config.yaml +++ b/configs/instruct_config.yaml @@ -7,13 +7,8 @@ data: concat_multiple_sequence: True num_sequences: 50 seq_length: 2048 - tokenizer_model_path: "configs/llama_tokenizer_extended.model" -model: - initializer_range: 1.0e-2 - hidden_dropout_prob: 0.1 - attention_dropout_prob: 0.1 - use_stable_embedding: False - shared_input_output_embedding: False + tokenizer_model_path: "configs/tokenizer_models/llama_tokenizer_extended.model" + split_by_shard: False train: train_batch_size: 2 # 1B token for 1 epoch, 5epoch @@ -27,6 +22,7 @@ train: gradient_accumulation_steps: 1 prefetch_factor: 100 train_and_eval: False + gradient_checkpointing_enable: False # global step log_interval: 50 eval_interval: 500 diff --git a/configs/model_configs/13B.json b/configs/model_configs/13B.json new file mode 100644 index 0000000..f8682ee --- /dev/null +++ b/configs/model_configs/13B.json @@ -0,0 +1,26 @@ +{ + "architectures": [ + "OpenLlamaForCausalLM" + ], + "attention_dropout_prob": 0.1, + "bos_token_id": 1, + "eos_token_id": 2, + "hidden_act": "silu", + "hidden_dropout_prob": 0.1, + "hidden_size": 5120, + "initializer_range": 1e-2, + "intermediate_size": 13824, + "max_position_embeddings": 2048, + "model_type": "open-llama", + "num_attention_heads": 40, + "num_hidden_layers": 40, + "pad_token_id": 32000, + "rms_norm_eps": 1e-05, + "shared_input_output_embedding": false, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "use_cache": true, + "use_memorry_efficient_attention": true, + "use_stable_embedding": false, + "vocab_size": 68762 +} diff --git a/configs/model_configs/33B.json b/configs/model_configs/33B.json new file mode 100644 index 0000000..03902f9 --- /dev/null +++ b/configs/model_configs/33B.json @@ -0,0 +1,26 @@ +{ + "architectures": [ + "OpenLlamaForCausalLM" + ], + "attention_dropout_prob": 0.1, + "bos_token_id": 1, + "eos_token_id": 2, + "hidden_act": "silu", + "hidden_dropout_prob": 0.1, + "hidden_size": 6656, + "initializer_range": 1e-2, + "intermediate_size": 17920, + "max_position_embeddings": 2048, + "model_type": "open-llama", + "num_attention_heads": 52, + "num_hidden_layers": 60, + "pad_token_id": 32000, + "rms_norm_eps": 1e-05, + "shared_input_output_embedding": false, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "use_cache": true, + "use_memorry_efficient_attention": true, + "use_stable_embedding": false, + "vocab_size": 68762 +} diff --git a/configs/model_configs/65B.json b/configs/model_configs/65B.json new file mode 100644 index 0000000..a075389 --- /dev/null +++ b/configs/model_configs/65B.json @@ -0,0 +1,26 @@ +{ + "architectures": [ + "OpenLlamaForCausalLM" + ], + "attention_dropout_prob": 0.1, + "bos_token_id": 1, + "eos_token_id": 2, + "hidden_act": "silu", + "hidden_dropout_prob": 0.1, + 
"hidden_size": 8192, + "initializer_range": 1e-2, + "intermediate_size": 22016, + "max_position_embeddings": 2048, + "model_type": "open-llama", + "num_attention_heads": 64, + "num_hidden_layers": 80, + "pad_token_id": 32000, + "rms_norm_eps": 1e-05, + "shared_input_output_embedding": false, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "use_cache": true, + "use_memorry_efficient_attention": true, + "use_stable_embedding": false, + "vocab_size": 68762 +} diff --git a/configs/model_configs/7B.json b/configs/model_configs/7B.json new file mode 100644 index 0000000..d354dd6 --- /dev/null +++ b/configs/model_configs/7B.json @@ -0,0 +1,26 @@ +{ + "architectures": [ + "OpenLlamaForCausalLM" + ], + "attention_dropout_prob": 0.1, + "bos_token_id": 1, + "eos_token_id": 2, + "hidden_act": "silu", + "hidden_dropout_prob": 0.1, + "hidden_size": 4096, + "initializer_range": 1e-2, + "intermediate_size": 11008, + "max_position_embeddings": 2048, + "model_type": "open-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "pad_token_id": 32000, + "rms_norm_eps": 1e-05, + "shared_input_output_embedding": false, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "use_cache": true, + "use_memorry_efficient_attention": true, + "use_stable_embedding": false, + "vocab_size": 68762 +} diff --git a/configs/pretrain_config.yaml b/configs/pretrain_config.yaml index d57a147..5d0ae96 100644 --- a/configs/pretrain_config.yaml +++ b/configs/pretrain_config.yaml @@ -9,13 +9,8 @@ data: concat_multiple_sequence: True num_sequences: 10 seq_length: 2048 - tokenizer_model_path: "configs/llama_tokenizer_extended.model" -model: - initializer_range: 1.0e-2 - hidden_dropout_prob: 0.1 - attention_dropout_prob: 0.1 - use_stable_embedding: False - shared_input_output_embedding: False + tokenizer_model_path: "configs/tokenizer_models/llama_tokenizer_extended.model" + split_by_shard: False train: train_batch_size: 2 num_training_steps: 500000 @@ -28,7 +23,8 @@ train: train_num_workers: 16 gradient_accumulation_steps: 12 prefetch_factor: 100 - train_and_eval: True + train_and_eval: False + gradient_checkpointing_enable: True # global step log_interval: 5 eval_interval: 500 diff --git a/configs/10w_vocab_wudao5_pile10.model b/configs/tokenizer_models/10w_vocab_wudao5_pile10.model similarity index 100% rename from configs/10w_vocab_wudao5_pile10.model rename to configs/tokenizer_models/10w_vocab_wudao5_pile10.model diff --git a/configs/4w_cn_vocab_wudao15.model b/configs/tokenizer_models/4w_cn_vocab_wudao15.model similarity index 100% rename from configs/4w_cn_vocab_wudao15.model rename to configs/tokenizer_models/4w_cn_vocab_wudao15.model diff --git a/configs/llama_tokenizer.model b/configs/tokenizer_models/llama_tokenizer.model similarity index 100% rename from configs/llama_tokenizer.model rename to configs/tokenizer_models/llama_tokenizer.model diff --git a/configs/llama_tokenizer_extended.model b/configs/tokenizer_models/llama_tokenizer_extended.model similarity index 100% rename from configs/llama_tokenizer_extended.model rename to configs/tokenizer_models/llama_tokenizer_extended.model diff --git a/dataset/dataset.py b/dataset/dataset.py index 3e8e09b..834bdea 100644 --- a/dataset/dataset.py +++ b/dataset/dataset.py @@ -2,7 +2,7 @@ Author: LiangSong(sl12160010@gmail.com) Date: 2023-04-24 20:05:21 LastEditors: LiangSong(sl12160010@gmail.com) -LastEditTime: 2023-05-04 09:17:21 +LastEditTime: 2023-05-06 23:30:37 FilePath: /Open-Llama/dataset/dataset.py Description: @@ -105,7 +105,7 @@ def 
instruct_transform(batch): targets = batch["message_2"][0] text = "user:{}\nsystem:{}".format(inputs.strip(), targets.strip()) texts = [text] - # grade-school-math-instructions preprocess + # grade-school-math-instructions preprocess elif "INSTRUCTION" in batch and "RESPONSE" in batch: inputs = batch["INSTRUCTION"][0] targets = batch["RESPONSE"][0] @@ -291,7 +291,7 @@ if __name__ == "__main__": "seq_length": 2048, } tokenizer = LlamaTokenizer( - "configs/llama_tokenizer_extended.model", + "configs/tokenizer_models/llama_tokenizer_extended.model", pad_token="", add_bos_token=False, add_eos_token=True, diff --git a/solver/trainer.py b/solver/trainer.py index f51fd2e..6974a58 100644 --- a/solver/trainer.py +++ b/solver/trainer.py @@ -2,7 +2,7 @@ Author: LiangSong(sl12160010@gmail.com) Date: 2023-04-24 20:05:21 LastEditors: LiangSong(sl12160010@gmail.com) -LastEditTime: 2023-05-06 09:45:30 +LastEditTime: 2023-05-06 23:04:14 FilePath: /Open-Llama/solver/trainer.py Description: @@ -26,7 +26,7 @@ class Trainer: self.train_loader = train_loader self.tokenizer = tokenizer self.accelerator = accelerator - self.train_and_eval = config.get("train_and_eval", False) + self.train_and_eval = config["train"].get("train_and_eval", False) self.gradient_accumulation_steps = config["train"].get( "gradient_accumulation_steps", 1 ) @@ -43,7 +43,7 @@ class Trainer: self.config["save_interval"] * accelerator.gradient_accumulation_steps ) self.work_dir = self.config["work_dir"] - self.get_model_info() + # self.get_model_info() if accelerator.is_main_process: wandb.init(project=self.config["project_name"]) @@ -104,12 +104,12 @@ class Trainer: self.accelerator.load_state(self.work_dir) self.global_step = self.scheduler.scheduler._step_count - 1 self.global_step = self.global_step // self.accelerator.num_processes - logging.warn("Restored ckpt from {}".format(self.work_dir)) + logging.warning("Restored ckpt from {}".format(self.work_dir)) except: - logging.warn("No ckpt found in {}".format(self.work_dir)) + logging.warning("No ckpt found in {}".format(self.work_dir)) if self.global_step > 0: skip_steps = self.global_step * self.gradient_accumulation_steps - logging.warn("Skiped {} steps.".format(skip_steps)) + logging.warning("Skiped {} steps.".format(skip_steps)) self.train_loader_skiped = self.accelerator.skip_first_batches( self.train_loader, num_batches=skip_steps ) diff --git a/train_lm.py b/train_lm.py index e5ad3b8..ebf5ad8 100644 --- a/train_lm.py +++ b/train_lm.py @@ -2,7 +2,7 @@ Author: LiangSong(sl12160010@gmail.com) Date: 2023-04-12 19:12:42 LastEditors: LiangSong(sl12160010@gmail.com) -LastEditTime: 2023-05-04 09:19:15 +LastEditTime: 2023-05-06 23:08:42 FilePath: /Open-Llama/train_lm.py Description: @@ -16,17 +16,20 @@ from absl import flags from accelerate import Accelerator from torch.utils.data import DataLoader from datasets.distributed import split_dataset_by_node -from transformers import OpenLlamaForCausalLM, OpenLlamaConfig, LlamaTokenizer +from transformers import AutoConfig, AutoModelForCausalLM, LlamaTokenizer from dataset.dataset import construct_dataset from solver.trainer import Trainer FLAGS = flags.FLAGS -flags.DEFINE_string("config", None, "Training config path") +flags.DEFINE_string("train_config", None, "Training config path") +flags.DEFINE_string( + "model_config", "configs/model_configs/7B.json", "Model config path" +) def main(argv): - with open(FLAGS.config, "r", encoding="utf-8") as fp: + with open(FLAGS.train_config, "r", encoding="utf-8") as fp: config = yaml.load(fp, 
Loader=yaml.FullLoader) accelerator = Accelerator( @@ -61,26 +64,18 @@ def main(argv): ) # smaller initializer_range make training more stable # add stabel embedding to token embedding - raw_model = OpenLlamaForCausalLM( - OpenLlamaConfig( - vocab_size=tokenizer.vocab_size, - initializer_range=config["model"]["initializer_range"], - pad_token_id=tokenizer.pad_token_id, - rms_norm_eps=1e-5, - hidden_dropout_prob=config["model"]["hidden_dropout_prob"], - attention_dropout_prob=config["model"]["attention_dropout_prob"], - use_stable_embedding=config["model"]["use_stable_embedding"], - shared_input_output_embedding=config["model"][ - "shared_input_output_embedding" - ], - ) - ) + model_config = AutoConfig.from_pretrained(FLAGS.model_config) + model_config.vocab_size = tokenizer.vocab_size + model_config.pad_token_id = tokenizer.pad_token_id if config["train"]["ckpt"] is not None: - ckpt = torch.load(config["train"]["ckpt"], map_location="cpu") - if "module" in ckpt: - ckpt = ckpt["module"] - raw_model.load_state_dict(ckpt) - logging.warn("Loaded ckpt from: {}".format(config["train"]["ckpt"])) + raw_model = AutoModelForCausalLM.from_pretrained( + config["train"]["ckpt"], config=model_config + ) + logging.warning("Loaded ckpt from: {}".format(config["train"]["ckpt"])) + else: + raw_model = AutoModelForCausalLM.from_config(model_config) + if config["train"].get("gradient_checkpointing_enable", False): + raw_model.gradient_checkpointing_enable() trainer = Trainer(config, raw_model, train_loader, tokenizer, accelerator) trainer.train() diff --git a/utils/convert_ckpt.py b/utils/convert_ckpt.py index 46307ce..8a8f972 100644 --- a/utils/convert_ckpt.py +++ b/utils/convert_ckpt.py @@ -1,9 +1,19 @@ +""" +Author: LiangSong(sl12160010@gmail.com) +Date: 2023-04-28 19:55:13 +LastEditors: LiangSong(sl12160010@gmail.com) +LastEditTime: 2023-05-06 23:30:29 +FilePath: /Open-Llama/utils/convert_ckpt.py +Description: + +Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved. 
+""" import torch import sentencepiece as spm sp_model = spm.SentencePieceProcessor( - model_file="configs/llama_tokenizer_extended.model" + model_file="configs/tokenizer_models/llama_tokenizer_extended.model" ) merged_vocab_size = sp_model.vocab_size() ckpt = torch.load("data/llama_raw_ckpt/7B/consolidated.00.pth") diff --git a/utils/merge_tokenizer.py b/utils/merge_tokenizer.py index c26bc12..faa5fb5 100644 --- a/utils/merge_tokenizer.py +++ b/utils/merge_tokenizer.py @@ -3,21 +3,25 @@ import sentencepiece as spm from sentencepiece import sentencepiece_model_pb2 as model raw_model = model.ModelProto() -raw_model.ParseFromString(open("configs/llama_tokenizer.model", "rb").read()) +raw_model.ParseFromString( + open("configs/tokenizer_models/llama_tokenizer.model", "rb").read() +) exist_pieces = set([p.piece for p in raw_model.pieces]) cn_model = model.ModelProto() -cn_model.ParseFromString(open("configs/4w_cn_vocab_wudao15.model", "rb").read()) +cn_model.ParseFromString( + open("configs/tokenizer_models/4w_cn_vocab_wudao15.model", "rb").read() +) for p in tqdm(cn_model.pieces, total=len(cn_model.pieces)): if p.piece not in exist_pieces: raw_model.pieces.append(p) -with open("configs/llama_tokenizer_extended.model", "wb") as f: +with open("configs/tokenizer_models/llama_tokenizer_extended.model", "wb") as f: f.write(raw_model.SerializeToString()) sp_model = spm.SentencePieceProcessor( - model_file="configs/llama_tokenizer_extended.model" + model_file="configs/tokenizer_models/llama_tokenizer_extended.model" ) print("merged vocab size: {}".format(sp_model.vocab_size())) diff --git a/utils/train_tokenizer.py b/utils/train_tokenizer.py index 2d02439..6dd1991 100644 --- a/utils/train_tokenizer.py +++ b/utils/train_tokenizer.py @@ -2,7 +2,7 @@ Author: LiangSong(sl12160010@gmail.com) Date: 2023-03-24 20:49:03 LastEditors: LiangSong(sl12160010@gmail.com) -LastEditTime: 2023-05-04 08:42:21 +LastEditTime: 2023-05-06 23:34:14 FilePath: /Open-Llama/utils/train_tokenizer.py Description: @@ -67,7 +67,7 @@ spm.SentencePieceTrainer.train( ) # Serialize the model as file. -with open("configs/10w_vocab_wudao5_pile10.model", "wb") as f: +with open("configs/tokenizer_models/10w_vocab_wudao5_pile10.model", "wb") as f: f.write(model.getvalue()) # Directly load the model from serialized model.
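Putting the new pieces together: the commit message advertises 65B training, and the files added here supply the ingredients for it — a ZeRO stage-3 Accelerate config with CPU offload (configs/accelerate_configs/ds_stage3_offload.yaml), a 65B model config (configs/model_configs/65B.json), gradient checkpointing toggled via gradient_checkpointing_enable in configs/pretrain_config.yaml, and the Auto-class loading path in train_lm.py. A launch for that setup would presumably look like the sketch below; it assumes the default paths introduced in this patch, and num_processes/num_machines in the Accelerate config would need to match the actual hardware:

```bash
# Sketch: pretraining the 65B config with DeepSpeed ZeRO stage 3 + CPU offload.
# Stage 3 partitions parameters, gradients, and optimizer states across ranks and,
# with this config, offloads parameters and optimizer state to CPU, which is what
# makes the 65B model fit; gradient checkpointing (enabled in pretrain_config.yaml)
# trades extra compute for further memory savings.
accelerate launch --config_file configs/accelerate_configs/ds_stage3_offload.yaml \
    train_lm.py \
    --train_config configs/pretrain_config.yaml \
    --model_config configs/model_configs/65B.json
```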