diff --git a/README.md b/README.md
index 46ad9de..b3b1233 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@
  * @Author: LiangSong(sl12160010@gmail.com)
  * @Date: 2023-03-10 21:18:35
  * @LastEditors: LiangSong(sl12160010@gmail.com)
- * @LastEditTime: 2023-05-04 08:33:26
+ * @LastEditTime: 2023-05-04 20:23:09
  * @FilePath: /Open-Llama/README.md
  * @Description:
  *
@@ -45,7 +45,7 @@ inputs = tokenizer('user:implement quick sort in python\nsystem:', return_tensor
 for k, v in inputs.items():
     inputs[k] = v.cuda()
 pred = model.generate(**inputs, max_new_tokens=512, do_sample=True)
-print(tokenizer.decode(pred.cpu()[0]).strip())
+print(tokenizer.decode(pred.cpu()[0], skip_special_tokens=True))
 ```
 
 只经过预训练的CheckPoint也上传至[s-JoL/Open-Llama-V1-pretrain](https://huggingface.co/s-JoL/Open-Llama-V1-pretrain)。
diff --git a/README_en.md b/README_en.md
index 03704d1..f670c29 100644
--- a/README_en.md
+++ b/README_en.md
@@ -2,7 +2,7 @@
  * @Author: LiangSong(sl12160010@gmail.com)
  * @Date: 2023-03-10 21:18:35
  * @LastEditors: LiangSong(sl12160010@gmail.com)
- * @LastEditTime: 2023-05-04 08:33:45
+ * @LastEditTime: 2023-05-04 20:23:14
  * @FilePath: /Open-Llama/README_en.md
  * @Description:
  *
@@ -44,7 +44,7 @@ inputs = tokenizer('user:implement quick sort in python\nsystem:', return_tensor
 for k, v in inputs.items():
     inputs[k] = v.cuda()
 pred = model.generate(**inputs, max_new_tokens=512, do_sample=True)
-print(tokenizer.decode(pred.cpu()[0]).strip())
+print(tokenizer.decode(pred.cpu()[0], skip_special_tokens=True))
 ```
 
 The CheckPoint after pre-training only is also uploaded to [s-JoL/Open-Llama-V1-pretrain](https://huggingface.co/s-JoL/Open-Llama-V1-pretrain).
diff --git a/chat_server.py b/chat_server.py
index aa9b57b..96f551a 100644
--- a/chat_server.py
+++ b/chat_server.py
@@ -2,7 +2,7 @@
 Author: LiangSong(sl12160010@gmail.com)
 Date: 2023-04-06 22:30:10
 LastEditors: LiangSong(sl12160010@gmail.com)
-LastEditTime: 2023-04-29 20:40:13
+LastEditTime: 2023-05-04 22:28:07
 FilePath: /Open-Llama/chat_server.py
 Description:
 
@@ -41,7 +41,7 @@ if "module" in ckpt:
     ckpt = ckpt["module"]
 raw_model.load_state_dict(ckpt)
 raw_model.eval()
-model = raw_model.cuda()
+model = raw_model.half().cuda()
 logging.warn("ready")
 
 
@@ -88,8 +88,10 @@ with gr.Blocks() as demo:
         context = torch.cat(context, dim=-1)
         context = context[:, -1024:]
         inputs_len = context.shape[1]
-        context = context.cuda()
-        pred = model.generate(input_ids=context, max_new_tokens=512, do_sample=True)
+        context = context.half().cuda()
+        pred = model.generate(
+            input_ids=context, max_new_tokens=1024, do_sample=True
+        )
         pred = pred[:, inputs_len:]
         pred = tokenizer.decode(pred.cpu()[0], skip_special_tokens=True)
         logging.warn(pred)
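
Taken together, the hunks above make three functional changes: decoding now passes skip_special_tokens=True instead of trimming whitespace, the chat server casts the model to fp16 before moving it to the GPU, and the server's generation budget grows from 512 to 1024 new tokens. Below is a minimal sketch (not the repo's exact code) of the resulting inference path, assuming the Hugging Face transformers Auto classes (the READMEs may use Llama-specific classes) and the s-JoL/Open-Llama-V1-pretrain checkpoint linked above. Note that only the model weights are cast to half precision; token ids passed to generate must remain integer tensors.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("s-JoL/Open-Llama-V1-pretrain")
model = AutoModelForCausalLM.from_pretrained("s-JoL/Open-Llama-V1-pretrain")
model = model.half().cuda()  # fp16 weights, mirroring raw_model.half().cuda()
model.eval()

inputs = tokenizer("user:implement quick sort in python\nsystem:", return_tensors="pt")
# Token ids stay integer tensors: move them to the GPU without casting dtype.
inputs = {k: v.cuda() for k, v in inputs.items()}

with torch.no_grad():
    pred = model.generate(**inputs, max_new_tokens=512, do_sample=True)
# skip_special_tokens=True drops BOS/EOS markers, replacing the old .strip().
print(tokenizer.decode(pred.cpu()[0], skip_special_tokens=True))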