From 1a731953da49e15ec351f9e6982644f716ce1e6d Mon Sep 17 00:00:00 2001
From: LiangSong
Date: Fri, 7 Apr 2023 10:04:05 +0800
Subject: [PATCH] update server

---
 chat_server.py       |  79 ++++++++++++++++++++++++++++++
 dataset/tokenizer.py |  34 ++++++++++---
 server.py            |   4 +-
 speed_test.py        | 114 +++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 222 insertions(+), 9 deletions(-)
 create mode 100644 chat_server.py
 create mode 100644 speed_test.py

diff --git a/chat_server.py b/chat_server.py
new file mode 100644
index 0000000..ae86df3
--- /dev/null
+++ b/chat_server.py
@@ -0,0 +1,79 @@
+"""
+Author: LiangSong(sl12160010@gmail.com)
+Date: 2023-04-06 22:30:10
+LastEditors: LiangSong(sl12160010@gmail.com)
+LastEditTime: 2023-04-06 23:13:54
+FilePath: /Open-Llama/chat_server.py
+Description:
+
+Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved.
+"""
+import torch
+import gradio as gr
+import sentencepiece as spm
+from dataset.tokenizer import Tokenizer
+from transformers import LlamaForCausalLM, LlamaConfig
+
+
+sp_model = spm.SentencePieceProcessor(
+    model_file="configs/10w_vocab_wudao5_pile10.model"
+)
+tokenizer = Tokenizer(sp_model)
+raw_model = LlamaForCausalLM(
+    LlamaConfig(
+        vocab_size=tokenizer.vocab_size,
+        initializer_range=0.01,
+        pad_token_id=tokenizer.pad_id,
+        rms_norm_eps=1e-5,
+        hidden_dropout_prob=0.1,
+        attention_dropout_prob=0.1,
+        use_stable_embedding=True,
+        shared_input_output_embedding=True,
+    )
+)
+ckpt = torch.load(
+    "data/saved_ckpt/instruction_tuning_3_epochs/37001.pt", map_location="cpu"
+)
+raw_model.load_state_dict(ckpt)
+raw_model.eval()
+model = raw_model.cuda()
+print("ready")
+
+with gr.Blocks() as demo:
+    chatbot = gr.Chatbot()
+    msg = gr.Textbox()
+    clear = gr.Button("Clear")
+
+    def user(user_message, history):
+        return "", history + [[user_message, None]]
+
+    def bot(history):
+        context = []
+        round = 0
+        for prompt, completion in history:
+            round += 1
+            if completion is None:
+                inputs = 'user:{}\nsystem:'.format(prompt)
+                inputs = tokenizer(inputs, return_tensors=True, add_special_tokens=False)
+                context.append(inputs['input_ids'])
+            else:
+                inputs = 'user:{}\nsystem:{}'.format(prompt, completion)
+                inputs = tokenizer(inputs, return_tensors=True, add_special_tokens=True)
+                context.append(inputs['input_ids'])
+        context = torch.cat(context, dim=-1)
+        context = context[:, -1024:]
+        inputs_len = context.shape[1]
+        context = context.cuda()
+        pred = model.generate(input_ids=context, max_new_tokens=512, do_sample=True)
+        pred = pred[:, inputs_len:]
+        pred = tokenizer.decode(pred.cpu())[0]
+        bot_message = pred
+        history[-1][1] = bot_message
+        return history
+
+    msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
+        bot, chatbot, chatbot
+    )
+    clear.click(lambda: None, None, chatbot, queue=False)
+
+demo.launch()
diff --git a/dataset/tokenizer.py b/dataset/tokenizer.py
index 5daee60..59f32f2 100644
--- a/dataset/tokenizer.py
+++ b/dataset/tokenizer.py
@@ -2,7 +2,7 @@
 Author: LiangSong(sl12160010@gmail.com)
 Date: 2023-03-20 21:39:47
 LastEditors: LiangSong(sl12160010@gmail.com)
-LastEditTime: 2023-04-05 22:35:01
+LastEditTime: 2023-04-06 23:01:50
 FilePath: /Open-Llama/dataset/tokenizer.py
 Description:
 
@@ -145,14 +145,34 @@ class Tokenizer:
         out["attention_mask"] = attention_mask
         return out
 
-    def decode(self, inputs):
+    def decode(self, inputs, max_rounds=None):
         inputs = inputs.tolist()
         out = []
-        for i in inputs:
-            if self.eos_id in i:
-                eos_idx = i.index(self.eos_id)
-                i = i[:eos_idx]
-            out.append(i)
+        for i, ids in enumerate(inputs):
+            count = 0
+            flag = False
+            for j, token in enumerate(ids):
+                if token == self.eos_id:
+                    if max_rounds is None:
+                        flag = True
+                        break
+                    elif isinstance(max_rounds, int):
+                        if count < max_rounds:
+                            count += 1
+                        else:
+                            flag = True
+                            break
+                    elif isinstance(max_rounds, list):
+                        if count < max_rounds[i]:
+                            count += 1
+                        else:
+                            flag = True
+                            break
+            if flag:
+                ids = ids[: j]
+            else:
+                ids = ids
+            out.append(ids)
         out = self.sp_model.Decode(out)
         return out
 
diff --git a/server.py b/server.py
index 609c3ab..5d4ea82 100644
--- a/server.py
+++ b/server.py
@@ -2,7 +2,7 @@
 Author: LiangSong(sl12160010@gmail.com)
 Date: 2023-03-31 13:26:15
 LastEditors: LiangSong(sl12160010@gmail.com)
-LastEditTime: 2023-04-05 21:47:54
+LastEditTime: 2023-04-06 03:45:44
 FilePath: /Open-Llama/server.py
 Description:
 
@@ -43,7 +43,7 @@ print("ready")
 
 def question_answer(prompt):
     print(prompt)
-    raw_inputs = "user:{}system:".format(prompt)
+    raw_inputs = "user:{}\nsystem:".format(prompt)
     inputs_len = len(raw_inputs)
     inputs = tokenizer(raw_inputs, return_tensors=True, add_special_tokens=False)
     for k, v in inputs.items():
diff --git a/speed_test.py b/speed_test.py
new file mode 100644
index 0000000..3b884ed
--- /dev/null
+++ b/speed_test.py
@@ -0,0 +1,114 @@
+# import time
+# import torch
+# from colossalai.nn.optimizer import HybridAdam
+# from deepspeed.ops.adam import FusedAdam
+# from transformers import LlamaForCausalLM, LlamaConfig
+# import lightning.pytorch as pl
+
+# # define the LightningModule
+# class LitAutoEncoder(pl.LightningModule):
+#     def __init__(self):
+#         super().__init__()
+
+#     def training_step(self, inputs, batch_idx):
+#         # training_step defines the train loop.
+#         # it is independent of forward
+#         # print(inputs.shape)
+#         out = self.model(input_ids=inputs, labels=inputs)
+#         loss = out.loss
+#         return loss
+
+#     def configure_optimizers(self):
+#         optimizer = HybridAdam(self.parameters(), lr=1e-5)
+#         return optimizer
+
+#     def configure_sharded_model(self):
+#         self.model = LlamaForCausalLM(
+#             LlamaConfig(
+#                 vocab_size=32000,
+#                 initializer_range=0.001,
+#                 pad_token_id=0,
+#                 rms_norm_eps=1e-5,
+#                 hidden_dropout_prob=0.1,
+#                 attention_dropout_prob=0.1,
+#                 use_stable_embedding=False,
+#                 shared_input_output_embedding=False,
+#             )
+#         )
+
+
+# # init the autoencoder
+# autoencoder = LitAutoEncoder()
+# trainer = pl.Trainer(limit_train_batches=500, max_epochs=1, accelerator='gpu', devices=8, strategy="colossalai", precision=16)
+# class FakeSet(torch.utils.data.Dataset):
+#     def __getitem__(self, idx):
+#         return torch.randint(0, 32000, (2048, ))
+
+#     def __len__(self):
+#         return 10000
+# train_loader = torch.utils.data.DataLoader(FakeSet(), batch_size=1)
+# trainer.fit(model=autoencoder, train_dataloaders=train_loader)
+
+
+# import time
+# import torch
+# from accelerate import Accelerator
+# from deepspeed.ops.adam import FusedAdam
+# from transformers import LlamaForCausalLM, LlamaConfig
+
+
+# accelerator = Accelerator()
+# raw_model = LlamaForCausalLM(
+#     LlamaConfig(
+#         vocab_size=32000,
+#         initializer_range=0.001,
+#         pad_token_id=0,
+#         rms_norm_eps=1e-5,
+#         hidden_dropout_prob=0.1,
+#         attention_dropout_prob=0.1,
+#         use_stable_embedding=False,
+#         shared_input_output_embedding=False,
+#     )
+# )
+# optimizer = FusedAdam(raw_model.parameters(), lr=1e-5)
+
+# import random
+# import sentencepiece as spm
+# from dataset.tokenizer import Tokenizer
+# from dataset.data_iter import create_shard_kwargs, DataIter
+# from torch.utils.data import DataLoader
+
+# max_length = 2048
+# tokenizer_model_path = 'configs/10w_vocab_wudao5_pile10.model'
+# sp_model = spm.SentencePieceProcessor(model_file=tokenizer_model_path)
+# tokenizer = Tokenizer(sp_model)
+
+# paths = create_shard_kwargs(['1*'])
+# random.shuffle(paths)
+# data_set = DataIter(
+#     paths
+# )
+# train_loader = DataLoader(
+#     data_set,
+#     batch_size=1
+# )
+
+# model, optimizer, train_loader = accelerator.prepare(raw_model, optimizer, train_loader)
+# inputs = torch.randint(0, 32000, (1, 2048), device=accelerator.device)
+
+
+# for i in range(10):
+#     optimizer.zero_grad()
+#     out = model(input_ids=inputs, labels=inputs)
+#     loss = out.loss
+#     accelerator.backward(loss)
+#     optimizer.step()
+# start_time = time.time()
+# for i in range(500):
+#     optimizer.zero_grad()
+#     out = model(input_ids=inputs, labels=inputs)
+#     loss = out.loss
+#     accelerator.backward(loss)
+#     optimizer.step()
+# end_time = time.time()
+# accelerator.print(end_time - start_time)
\ No newline at end of file
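Usage note for the new decode() argument: with the default max_rounds=None, Tokenizer.decode() still truncates each row at the first eos token; an int lets that many eos tokens pass before truncating at the next one, and a list applies the same limit per batch row. The snippet below is a minimal sketch, not part of the patch; it assumes the same SentencePiece model that chat_server.py loads and assumes that add_special_tokens=True appends the eos id after the encoded text, matching how chat_server.py encodes completed rounds.

import torch
import sentencepiece as spm
from dataset.tokenizer import Tokenizer

sp_model = spm.SentencePieceProcessor(
    model_file="configs/10w_vocab_wudao5_pile10.model"
)
tokenizer = Tokenizer(sp_model)

# Two eos-terminated rounds concatenated into one sequence, mirroring how
# chat_server.py builds its generation context (assumption: add_special_tokens=True
# appends the eos id after each encoded round).
round_1 = tokenizer("user:hello\nsystem:hi", return_tensors=True, add_special_tokens=True)
round_2 = tokenizer("user:bye\nsystem:bye", return_tensors=True, add_special_tokens=True)
ids = torch.cat([round_1["input_ids"], round_2["input_ids"]], dim=-1)

print(tokenizer.decode(ids))                  # default: truncate at the first eos
print(tokenizer.decode(ids, max_rounds=1))    # let one eos through, truncate at the next
print(tokenizer.decode(ids, max_rounds=[1]))  # same, but the limit is given per batch row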