2023-04-12 14:16:15 +00:00
|
|
|
|
"""
|
2023-05-17 15:21:46 +00:00
|
|
|
|
Author: s-JoL(sl12160010@gmail.com)
|
2023-04-12 14:16:15 +00:00
|
|
|
|
Date: 2023-04-12 19:12:42
|
2023-05-17 15:21:46 +00:00
|
|
|
|
LastEditors: s-JoL(sl12160010@gmail.com)
|
|
|
|
|
LastEditTime: 2023-05-17 22:20:32
|
2023-04-27 15:42:11 +00:00
|
|
|
|
FilePath: /Open-Llama/train_lm.py
|
2023-04-12 14:16:15 +00:00
|
|
|
|
Description:
|
|
|
|
|
|
2023-05-17 15:21:46 +00:00
|
|
|
|
Copyright (c) 2023 by s-JoL(sl12160010@gmail.com), All Rights Reserved.
|
2023-04-12 14:16:15 +00:00
|
|
|
|
"""
|
2023-04-12 09:59:05 +00:00
|
|
|
|
import yaml
|
2023-05-10 09:49:52 +00:00
|
|
|
|
import math
|
2023-04-29 12:28:39 +00:00
|
|
|
|
import logging
|
2023-04-12 09:59:05 +00:00
|
|
|
|
from absl import app
|
|
|
|
|
from absl import flags
|
|
|
|
|
from accelerate import Accelerator
|
|
|
|
|
from torch.utils.data import DataLoader
|
2023-05-08 14:26:39 +00:00
|
|
|
|
from peft import LoraConfig, TaskType, get_peft_model
|
2023-04-26 16:04:11 +00:00
|
|
|
|
from datasets.distributed import split_dataset_by_node
|
2023-05-06 15:37:17 +00:00
|
|
|
|
from transformers import AutoConfig, AutoModelForCausalLM, LlamaTokenizer
|
2023-04-12 09:59:05 +00:00
|
|
|
|
|
2023-04-24 11:13:53 +00:00
|
|
|
|
from dataset.dataset import construct_dataset
|
2023-04-12 09:59:05 +00:00
|
|
|
|
from solver.trainer import Trainer
|
|
|
|
|
|
|
|
|
|
# Command-line interface: absl flags selecting the training and model configs.
FLAGS = flags.FLAGS

flags.DEFINE_string("train_config", None, "Training config path")
flags.DEFINE_string("model_config", "configs/model_configs/7B.json", "Model config path")
|
2023-04-12 09:59:05 +00:00
|
|
|
|
|
2023-04-12 14:16:15 +00:00
|
|
|
|
|
2023-04-12 09:59:05 +00:00
|
|
|
|
def main(argv):
    """Entry point for training an Open-Llama causal language model.

    Loads the YAML training config given by ``--train_config``, builds the
    tokenizer, (optionally sharded) streaming dataset and dataloader, builds
    the model (optionally wrapped with LoRA adapters), and hands everything
    to ``Trainer`` to run the training loop.

    Args:
        argv: Unused positional CLI arguments forwarded by ``absl.app.run``.
    """
    with open(FLAGS.train_config, "r", encoding="utf-8") as fp:
        # safe_load is sufficient for plain config files and, unlike
        # FullLoader, cannot construct arbitrary Python objects.
        config = yaml.safe_load(fp)
    accelerator = Accelerator(
        gradient_accumulation_steps=config["train"].get(
            "gradient_accumulation_steps", 1
        )
    )
    tokenizer = LlamaTokenizer(
        config["data"]["tokenizer_model_path"],
        pad_token="<pad>",
        add_bos_token=False,
        add_eos_token=True,
    )
    data_config = config["data"]
    if data_config.get("split_by_shard", False):
        # Shard-aware construction: the dataset is built knowing the total
        # number of ranks so each process reads a disjoint set of shards.
        train_dataset = construct_dataset(
            data_config, tokenizer, world_size=accelerator.num_processes
        )
    else:
        train_dataset = construct_dataset(data_config, tokenizer)
    # Example-level split across ranks so every process sees distinct data.
    train_dataset = split_dataset_by_node(
        train_dataset,
        rank=accelerator.process_index,
        world_size=accelerator.num_processes,
    )
    train_loader = DataLoader(
        train_dataset,
        batch_size=config["train"]["train_batch_size"],
        num_workers=config["train"]["train_num_workers"],
        prefetch_factor=config["train"].get("prefetch_factor", 2),
        pin_memory=True,
    )
    # smaller initializer_range makes training more stable
    # add stable embedding to token embedding
    model_config = AutoConfig.from_pretrained(FLAGS.model_config)
    # NOTE: rounding vocab_size up to a multiple of 16 can improve tensor-core
    # throughput but is deliberately disabled here; see
    # https://huggingface.co/docs/transformers/main_classes/deepspeed#how-to-choose-which-zero-stage-and-offloads-to-use-for-best-performance
    # https://developer.nvidia.com/blog/optimizing-gpu-performance-tensor-cores/
    vocab_size = tokenizer.vocab_size
    model_config.vocab_size = vocab_size
    model_config.pad_token_id = tokenizer.pad_token_id
    # Using AutoModelForCausalLM makes Deepspeed.zero.Init() take effect
    # correctly; instantiating a concrete class such as OpenLlamaModel
    # directly does not, wasting a large amount of memory.
    # https://github.com/huggingface/accelerate/pull/932
    if config["train"].get("ckpt") is not None:
        # .get() tolerates configs that omit "ckpt" entirely (treated as None).
        raw_model = AutoModelForCausalLM.from_pretrained(
            config["train"]["ckpt"], config=model_config
        )
        logging.warning("Loaded ckpt from: %s", config["train"]["ckpt"])
    else:
        raw_model = AutoModelForCausalLM.from_config(model_config)
    # Optionally wrap the model with LoRA adapters (only q_proj/v_proj train).
    if config["train"].get("use_lora", False):
        # Workaround for a gradient-checkpointing + PEFT interaction,
        # https://github.com/huggingface/transformers/issues/23170
        if hasattr(raw_model, "enable_input_require_grads"):
            raw_model.enable_input_require_grads()
        else:

            def make_inputs_require_grad(module, inputs, output):
                # Forward hook: force grads on the embedding output so
                # checkpointed segments have a grad-requiring input.
                output.requires_grad_(True)

            raw_model.get_input_embeddings().register_forward_hook(
                make_inputs_require_grad
            )
        peft_config = LoraConfig(
            task_type=TaskType.CAUSAL_LM,
            target_modules=["q_proj", "v_proj"],
            inference_mode=False,
            r=1,
            lora_alpha=32,
            lora_dropout=0.1,
        )
        raw_model = get_peft_model(raw_model, peft_config)
        raw_model.print_trainable_parameters()
    if config["train"].get("gradient_checkpointing_enable", False):
        raw_model.gradient_checkpointing_enable()
    trainer = Trainer(config, raw_model, train_loader, tokenizer, accelerator)
    trainer.train()
|
|
|
|
|
|
2023-04-12 14:16:15 +00:00
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # absl's app.run parses command-line flags (FLAGS) before invoking main.
    app.run(main)
|