Compare commits

...

86 Commits
v1.0 ... main

Author SHA1 Message Date
87f75f2dfe Fix huggingface download error 2024-08-18 06:46:44 +00:00
9b8fe37cd1 Update requirements.txt 2024-08-18 06:21:38 +00:00
eef8ae1477 Update Dockerfile 2024-06-15 18:59:00 +00:00
e6df81ae9d Add docker-compose.yml 2024-06-15 18:32:23 +00:00
bbb1d210aa Add Dockerfile 2024-06-15 18:31:53 +00:00
26a0ea81c0 Update requirements.txt 2024-06-15 17:12:47 +00:00
a24271d48e Update data/download_the_pile.sh 2024-06-14 09:08:55 +00:00
5635f7d08d Update data/download_wudao.sh 2024-06-14 09:08:03 +00:00
LiangSong
0157b6938d update readme 2023-05-17 22:45:04 +07:00
LiangSong
95973b5de1 update header 2023-05-17 22:21:46 +07:00
LiangSong
d269affb42 update readme 2023-05-17 21:17:49 +07:00
s-JoL
6988c69884
Merge pull request #64 from eltociear/patch-1
Update README.md
2023-05-16 22:24:13 +07:00
Ikko Eltociear Ashimine
7bacd6cb93
Update README.md
HuggingFace -> Hugging Face
2023-05-17 00:11:35 +09:00
LiangSong
77b1c552c3 add discord invite link 2023-05-15 23:00:25 +07:00
LiangSong
82c845a8ce update readme 2023-05-15 00:21:13 +08:00
LiangSong
1ce8c18d83 add logo 2023-05-14 10:52:43 +08:00
LiangSong
52e8df9a8d update readme 2023-05-14 10:48:49 +08:00
LiangSong
a07d9b0ac8 update readme 2023-05-14 01:06:03 +08:00
LiangSong
bf2cac0a45 update config 2023-05-14 01:00:50 +08:00
LiangSong
e18ead00cc update server 2023-05-12 15:07:46 +08:00
LiangSong
7231d53ca4 update readme add new model 2023-05-12 11:32:42 +08:00
LiangSong
ceb1fd067b update vocab_size 2023-05-11 14:15:12 +08:00
LiangSong
73dafa7ad6 add rounding vocab_size 2023-05-10 17:49:52 +08:00
LiangSong
26f7421f05 add star history to readme 2023-05-10 15:52:55 +08:00
LiangSong
72a6f81b61 update readme 2023-05-09 18:47:29 +08:00
LiangSong
7d505ea303 update readme 2023-05-09 17:03:13 +08:00
LiangSong
59b79af9d7 add comment 2023-05-09 16:53:05 +08:00
LiangSong
f6ac834ef9 update default config 2023-05-09 15:16:50 +08:00
LiangSong
21fdd25b94 update Citation 2023-05-09 15:12:49 +08:00
LiangSong
30ab306c56 update readme 2023-05-09 15:06:47 +08:00
LiangSong
32583a41a7 update wudao download and preprocess 2023-05-09 14:47:59 +08:00
LiangSong
7dc90c2558 fix typo 2023-05-09 10:46:11 +08:00
LiangSong
6814fdb59e support gradient ckpt for peft 2023-05-08 23:40:03 +08:00
LiangSong
3ba0c77053 update optimizer for lora 2023-05-08 22:56:37 +08:00
LiangSong
58586112c1 fix table 2023-05-08 22:30:02 +08:00
LiangSong
16811d0efe update readme 2023-05-08 22:29:24 +08:00
LiangSong
92caa94490 support peft 2023-05-08 22:26:39 +08:00
LiangSong
7da40f1c83 fix typo 2023-05-08 19:00:06 +08:00
LiangSong
2df3e622e9 update readme 2023-05-08 18:59:01 +08:00
LiangSong
ec2b4d6ee7 fix split by shard bug 2023-05-08 14:03:05 +08:00
LiangSong
4a1e7bb44b Optimized the structure of configs, added support for deepspeed stage3, reduced memory usage by using Auto class to load models, and added support for training 65B models. 2023-05-06 23:37:17 +08:00
LiangSong
5b1f6a4861 fix epoch bug 2023-05-06 09:45:37 +08:00
LiangSong
f893a0f5b8 update dataset 2023-05-05 19:23:16 +08:00
LiangSong
758af69c73 update science instruct-tuning datasets 2023-05-05 19:00:37 +08:00
LiangSong
d24b4cce54 update preprocess format 2023-05-05 18:20:59 +08:00
LiangSong
85caa97a6a add xP3 dataset and belle_2M 2023-05-05 17:05:41 +08:00
LiangSong
00cbdbbf26 fix typo 2023-05-04 22:55:40 +08:00
LiangSong
693e3970d9 update readme 2023-05-04 22:54:10 +08:00
LiangSong
fbb7997607 fix typo 2023-05-04 22:32:15 +08:00
LiangSong
98ffab3a97 update readme and add half to server 2023-05-04 22:28:36 +08:00
LiangSong
5c876121cb update gradio, fix code format bug 2023-05-04 18:18:52 +08:00
LiangSong
a1acc90988 fix train_tokenizer bug 2023-05-04 16:00:56 +08:00
LiangSong
51686b5fb8 add split dataset by shard option to accelerate data loading 2023-05-04 09:20:23 +08:00
LiangSong
f0d41f937b update instruct_config and set all random seed to 42 2023-05-04 08:45:21 +08:00
LiangSong
dba2e2d680 update ShareGPT_90K preprocess 2023-05-04 08:34:38 +08:00
LiangSong
154456c976 set dataset shuffle seed to 42 2023-05-04 00:31:12 +08:00
LiangSong
c2184c6dd1 support multiple epochs 2023-05-03 00:02:01 +08:00
LiangSong
f05e929aad update config 2023-05-02 21:42:55 +08:00
LiangSong
0466673f76 support load model from accelerate ckpt 2023-04-29 20:40:42 +08:00
LiangSong
52cd09f664 update readme 2023-04-29 20:30:24 +08:00
LiangSong
fc21a75d1e add continue training 2023-04-29 20:28:39 +08:00
LiangSong
28b11a5bed update requirements 2023-04-29 13:39:03 +08:00
LiangSong
8b439dec4a update flops 2023-04-29 12:31:11 +08:00
LiangSong
a2816bd23d update readme 2023-04-29 12:06:55 +08:00
LiangSong
4c5e50e4aa update readme 2023-04-29 11:41:28 +08:00
LiangSong
c8037746c3 update readme 2023-04-28 22:45:45 +08:00
s-JoL
0ff8b2353f
Merge pull request #30 from s-JoL/dev
update readme
2023-04-28 19:54:52 +08:00
LiangSong
724265b435 update readme 2023-04-28 19:54:14 +08:00
s-JoL
0fd7dbd636
Merge pull request #29 from s-JoL/dev
update readme
2023-04-28 19:50:29 +08:00
LiangSong
8c85535db3 update readme 2023-04-28 19:49:51 +08:00
LiangSong
676dcfd995 add hardward configuration to readme 2023-04-28 17:29:11 +08:00
s-JoL
f3c664bde3
Merge pull request #25 from s-JoL/dev
v2 release
2023-04-28 15:11:02 +08:00
LiangSong
c890bce69c update readme 2023-04-28 15:10:41 +08:00
LiangSong
9baebfd49c Merge branch 'main' into dev 2023-04-28 15:08:25 +08:00
LiangSong
2fd13ff075 fix typo 2023-04-28 15:05:33 +08:00
LiangSong
0fdca8b949 update readme 2023-04-28 15:01:01 +08:00
LiangSong
49118aad42 update header config and add padding to concat_multiple_sequence 2023-04-27 23:42:11 +08:00
LiangSong
db6cdb51d0 unified pre-training and instrcution-tuning both use train_lm and dataset 2023-04-27 19:42:06 +08:00
LiangSong
97aff0e051 use split_dataset_by_node instead accelerate.prepare to accelerate data loading by 50% 2023-04-27 00:04:11 +08:00
LiangSong
0377b43628 update tokenizer to LlamaTokenizer 2023-04-26 18:53:30 +08:00
LiangSong
f41f5558ec update header 2023-04-24 23:19:07 +08:00
LiangSong
f8f4cde228 using huggingface datasets to accelerate training, using open-llama to pretrain 2023-04-24 19:13:53 +08:00
LiangSong
3f62a23ee2 update format 2023-04-12 22:16:15 +08:00
LiangSong
a4aa109dd3 add trainer and utils 2023-04-12 17:59:05 +08:00
LiangSong
ae0691c509 update utils 2023-04-12 17:15:40 +08:00
LiangSong
da1c927016 update speed test 2023-04-12 17:15:07 +08:00
67 changed files with 2072 additions and 1982 deletions

28
Dockerfile Normal file
View File

@ -0,0 +1,28 @@
# Use the specified CUDA image as the base image
FROM nvidia/cuda:12.0.0-cudnn8-devel-ubuntu20.04
# Set environment variables to avoid interactive prompts during package installation
ENV DEBIAN_FRONTEND=noninteractive
# Change the APT package server (fix "Hash Sum Mismatch" problem)
RUN sed -i 's/archive.ubuntu.com/repo.catswords.com/g' /etc/apt/sources.list
RUN sed -i 's/security.ubuntu.com/repo.catswords.com/g' /etc/apt/sources.list
# Update the package list and install Python 3 and necessary packages
RUN apt-get update && \
apt-get install -y python3 python3-pip python3-dev && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
# Set Python 3 as the default python
RUN update-alternatives --install /usr/bin/python python /usr/bin/python3 1
# Create a working directory
WORKDIR /app
# Copy the requirements.txt file and install Python packages if available
COPY requirements.txt ./
RUN pip3 install --no-cache-dir -r requirements.txt
# Command to run the application (update this as needed)
CMD ["tail", "-f", "/dev/null"]

491
README.md
View File

@ -1,104 +1,173 @@
<!--
* @Author: LiangSong(sl12160010@gmail.com)
* @Author: s-JoL(sl12160010@gmail.com)
* @Date: 2023-03-10 21:18:35
* @LastEditors: LiangSong(sl12160010@gmail.com)
* @LastEditTime: 2023-04-16 23:49:06
* @LastEditors: s-JoL(sl12160010@gmail.com)
* @LastEditTime: 2023-05-17 22:44:35
* @FilePath: /Open-Llama/README.md
* @Description:
*
* Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved.
* Copyright (c) 2023 by s-JoL(sl12160010@gmail.com), All Rights Reserved.
-->
[**中文**](./README_zh.md) | [**English**](./README.md)
![camel](assets/logo.png)
# Open-Llama
[English](https://github.com/Bayes-Song/Open-Llama/blob/main/README_en.md)
<p align="center">
<img alt="GitHub" src="https://img.shields.io/github/license/s-JoL/Open-Llama.svg?color=blue&style=flat-square">
<img alt="GitHub release (latest by date)" src="https://img.shields.io/github/v/release/s-JoL/Open-Llama">
<img alt="GitHub top language" src="https://img.shields.io/github/languages/top/s-JoL/Open-Llama">
<img alt="GitHub last commit" src="https://img.shields.io/github/last-commit/s-JoL/Open-Llama">
</p>
Open-Llama是一个开源项目提供了一整套用于构建大型语言模型的训练流程从数据集准备到分词、预训练、指令调优以及强化学习技术 RLHF。
Open-Llama is an open-source project that offers a complete training pipeline for building large language models, ranging from dataset preparation to tokenization, pre-training, prompt tuning, lora, and the reinforcement learning technique RLHF.
## 进展
**You can try this model directly from the [Demo](http://home.ustc.edu.cn/~sl9292/).**
**采用FastChat项目相同方法测评Open-Llama的效果和GPT3.5的效果对比经过测试在中文问题上可以达到GPT3.5 84%的水平具体测试结果和CheckPoint将在近期放出**
Join [discord](https://discord.gg/TrKxrTpnab) to discuss the development of large language models.
## **Main contents**
- **Support Transformers/HuggingFace.** The CheckPoint after Instruct-tuning is open-source on [Hugging Face: s-JoL/Open-Llama-V2](https://huggingface.co/s-JoL/Open-Llama-V2).
- **Using the same evaluation method as the FastChat project, we compared Open-Llama's performance with that of GPT3.5. In our tests, it reaches 89% of GPT3.5's performance on Chinese questions.**
- **The training speed reaches 3620 tokens/s, faster than the 3370 tokens/s reported in the original Llama paper, reaching the current state-of-the-art level.**
``` python
from transformers import AutoModelForCausalLM, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("s-JoL/Open-Llama-V2", use_fast=False)
model = AutoModelForCausalLM.from_pretrained("s-JoL/Open-Llama-V2", device_map="auto")
inputs = tokenizer('user:implement quick sort in python\nsystem:', return_tensors='pt', return_attention_mask=False, add_special_tokens=False)
for k, v in inputs.items():
inputs[k] = v.cuda()
pred = model.generate(**inputs, max_new_tokens=512, do_sample=True)
print(tokenizer.decode(pred.cpu()[0], skip_special_tokens=True))
经过Instruct-tuning的CheckPoint已开源在[HuggingFace](https://huggingface.co/s-JoL/Open-Llama-V1)。使使用ckpt需要先用下面命令安装最新版本Transformers
``` base
pip install git+https://github.com/s-JoL/transformers.git@dev
```
模型已提交[PR](https://github.com/huggingface/transformers/pull/22795)合并至Transformers main分支。
The CheckPoint after pre-training only is also uploaded to [s-JoL/Open-Llama-V2-pretrain](https://huggingface.co/s-JoL/Open-Llama-V2-pretrain).
我们完成了300B token的预训练总共训练80 K stepGlobal Batch Size和Llama中一致为4M。
使用总共7部分数据构成Instruction-tuning数据模型具有一定的编程能力、数学能力和多轮对话能力具体数据见Instruction-Tuning部分。
We have completed 330B token pre-training, training a total of 80 K steps. The Global Batch Size is consistent with Llama at 4M.
Using a total of 7 parts of data to constitute the Instruction-tuning data, the model has certain programming abilities, mathematical abilities, and multi-turn dialogue abilities. Specific data can be found in the Instruction-Tuning section.
[Demo](http://home.ustc.edu.cn/~sl9292/)
Below is a display of the model's multi-turn dialogue ability regarding code:
我们参考一些对文心一言的测试也简单测试一下我们的模型,原始报道 [百度“文心一言”测试:国内生成式 AI 什么水平?](https://www.8btc.com/article/6809666)
![image4](assets/multiturn_chat_en.jpg)
本模型的效果如下图更多结果还待进一步测试。由于国内网络问题使用上面的Demo可能出现请求丢失的情况如长时间无响应可刷新重试
![image1](assets/image1.png)![image2](assets/image2.png)![image3](assets/image3.png)
## **Updates**
下面是一个关于代码的多轮对话能力的展示
**[2023.5.8] Release v2.1**
![image4](assets/multiturn_chat.jpeg)
<!-- 我们简单预估一下达到上面效果的一个花费训练40K step使用了1.5亿条预训练数据大约为110B token总共训练时间76h按Google Cloud的A100报价花费大约为19152美元。后续的Instruction-tuning训练了12k Step使用1.6M条数据总共训练时间3.4h大约花费342美元。因此从0开始训练一个这样的模型总花费不到20000美元。 -->
- This update adds support for larger model training. Using DeepSpeed stage3 + offload + activation checkpoint, you can **train a 65B model with A100-80G**.
目前模型在数学方面和代码方面表现明显较差,这一方面和训练数据有关,另一方面我认为也是模型大小所造成的,然而这方面的逻辑推理能力是一个可用的模型所必备,因此后续更新会关注提升相关能力。
## **特性**
- The peft library is introduced to **support training such as lora**.
### 易用性
- The following table compares the training speed of Open-Llama and the original Llama, and the performance data of Llama is quoted from the original Llama paper.
我们认为易用性是构建大型语言模型时最重要的特性之一。为了使 Open-LLAMA 更加易于使用,我们特别注重了以下几点:
- **最简实现**:我们采用了最简单的实现方式,降低了入门的门槛,让初学者也能轻松上手。
- **流程完整**:我们发布了从数据集构建到训练的完整代码,使得构建一个大语言模型的每一步流程都清晰可见。
| | DeepSpeed Stage | Offload | Activation Checkpoint | Total Token | GPU hours | Speed token/s/gpu | Batch Size |
|----------------|-----------------|---------|-----------------------|-------------|-----------|-------------------|------------|
| Open-Llama 7B | 1 | False | False | 173.7B | 13412 | 3620 | 2 |
| Open-Llama 13B | 3 | False | True | - | - | 1856 | 24 |
| Open-Llama 33B | 3 | False | True | - | - | 708 | 12 |
| Open-Llama 65B | 3 | True | True | - | - | 369 | 12 |
| Llama 7B | - | - | - | 1T | 82432 | 3370 | - |
| Llama 13B | - | - | - | 1T | 135168 | 2055 | - |
| Llama 33B | - | - | - | 1.4T | 530432 | 733 | - |
| Llama 65B | - | - | - | 1.4T | 1022362 | 380 | - |
### 高性能
**[2023.4.28] Release v2.0**
由于训练大语言模型的成本高昂,因此在构建大型语言模型时,高性能也是非常重要的。为了实现高性能的训练,我们发布使用了以下技术:
This update mainly includes the following aspects, which together increase the effective training speed by about **50%** compared to the v1 version: padding is reduced from **30%** to **5%**, and raw training speed improves from **3200 tokens/s** to **3620 tokens/s**, giving an effective speedup of 0.95 * 3620 / (0.7 * 3200) ≈ 1.535.
- **Fused CUDA kernel**:使用[xformers](https://github.com/facebookresearch/xformers)中提供的 fused CUDA kernel 可以将多个操作融合在一起,减少了 GPU 和 CPU 之间的数据传输,从而提高了训练效率。
- **并行化训练**:我们使用[Accelerate](https://huggingface.co/docs/accelerate/index)库支持在多个 GPU 上进行并行化训练,以加快训练速度。
1. Use Hugging Face's datasets library for data reading, with the process as follows:
1. Use the transform function to unify data formats from different datasets to {'text': 'xxx'}
2. Tokenize using Tokenizer
3. Sample long sequences; currently, three modes are provided: truncation, sampling (refer to the [Gopher paper](https://arxiv.org/abs/2112.11446)), and splitting
4. Optional: concatenate texts from different docs, reducing padding in the data and accelerating training. In the v1 version, padding accounted for **30%**; after concatenation, padding is reduced to **5%**.
2. Add Trainer, which can be reused for both pre-training and instruction fine-tuning, see solver/trainer.py
3. Unify the pre-training and instruction fine-tuning training entry to train_lm.py
4. Provide more convenient configuration, see configs/pretrain_config.yaml
5. Provide functionality to continue pre-training from other pre-trained models and to supplement the vocabulary
6. Resuming training from a checkpoint is supported, including loading optimizer parameters/learning rate and skipping duplicate data
对于7B模型使用Transformers中Pytorch原生版本的Llama模型训练训练速度为1378 token/s/gpu使用本代码库训练速度达到3290 token/s/gpu基本达到[Llama原文](https://arxiv.org/pdf/2302.13971.pdf)中的3370 token/s/gpu。
如果使用500B token进行预训练需要训练43000 GPU时。按照Google Cloud上A100-80G Spot的价格计算8卡每小时价格为12.6美元则总价格为67725美元。
当使用未加速版本训练时价格为158744美元。最终降低训练成本9万美元。
更多测试可见[和其他开源模型性能对比](https://github.com/Bayes-Song/Open-Llama#%E5%92%8C%E5%85%B6%E4%BB%96%E5%BC%80%E6%BA%90%E6%A8%A1%E5%9E%8B%E6%80%A7%E8%83%BD%E5%AF%B9%E6%AF%94)。
### 通用性
[2023.4.16] Release v1.0
在训练语言模型时,我们希望能够构建一个通用的模型,可以适用于不同的语言和不同的领域。为了实现这一点,我们采用了以下策略:
Basic pre-training and instruction fine-tuning codes are provided, with a training speed comparable to that of the original Llama. The pre-trained and fine-tuned models are already open-sourced on Hugging Face.
- **多语言支持**:我们支持多种语言的语料库,包括英语、中文、日语等多种语言,让用户可以根据自己的需求进行选择。
- **领域通用性**:我们希望模型不仅能在日常问题上能产生帮助,同时希望在专业领域如科学、法律等也能帮助人类。
v1 version code can be seen at https://github.com/s-JoL/Open-Llama/tree/v1.0
## **要求**
## **Features**
- Python 3.7 或更高版本
### Easy to use
We believe that ease of use is one of the most important features when building large language models. To make Open-Llama more accessible, we have focused on the following aspects:
- **Minimal implementation**: We have adopted the simplest implementation methods, lowering the entry threshold and allowing beginners to get started with ease.
- **Complete pipeline**: We have published the complete code from dataset construction to training, making every step in the process of building a large language model clear and visible.
### High performance
Due to the high cost of training large language models, high performance is also crucial when building them. To achieve high-performance training, we have employed the following techniques:
- **Fused CUDA kernel**: Using the fused CUDA kernel provided in [xformers](https://github.com/facebookresearch/xformers) can fuse multiple operations, reducing data transfer between the GPU and CPU, thereby improving training efficiency.
- **Parallelized training**: We employ the [Accelerate](https://huggingface.co/docs/accelerate/index) library to support parallelized training on multiple GPUs to speed up the training process.
For a 7B model, the training speed with the native PyTorch Llama model in Transformers is **1378 tokens/s/GPU**. Using this codebase, the training speed reaches **3626 tokens/s/GPU**, exceeding **3370 tokens/s/GPU** reported in the [original Llama paper](https://arxiv.org/pdf/2302.13971.pdf).
If pre-training with 500B tokens, 38300 GPU hours are required. According to the hourly price for 8 A100-80G Spot GPUs on Google Cloud, which is 12.6 US dollars, the total cost is 60,300 US dollars.
When using the unaccelerated version for training, the cost is 158,744 US dollars. The final training cost is reduced by 98,000 US dollars.
For more testing, see [performance comparison with other open-source models](https://github.com/s-JoL/Open-Llama#%E5%92%8C%E5%85%B6%E4%BB%96%E5%BC%80%E6%BA%90%E6%A8%A1%E5%9E%8B%E6%80%A7%E8%83%BD%E5%AF%B9%E6%AF%94).
### Versatility
When training language models, our goal is to build a versatile model that can handle different languages and domains. To achieve this, we have employed the following strategies:
- **Multi-language support**: We support multiple language corpora, including English, Chinese, Japanese, and many other languages, allowing users to choose according to their requirements.
- **Domain versatility**: We hope that the model can not only help with everyday questions but also assist in professional domains such as science, law, etc.
- **Interaction with the world**: By incorporating reinforcement learning (RL), we hope to give the model the ability to interact with the world.
## **Requirements**
- Python 3.7 or higher
- PyTorch 1.13
- 特殊版本的[Transformers库](https://github.com/Bayes-Song/transformers)
- [Accelerate库](https://huggingface.co/docs/accelerate/index)
- CUDA 11.6 或更高版本(用于 GPU 加速基于CUDA11.7进行测试)
- [Transformers library](https://github.com/huggingface/transformers)
- [Accelerate library](https://huggingface.co/docs/accelerate/index)
- CUDA 11.6 or higher (for GPU acceleration)
- Hardware configuration: currently using (64 CPU, 1000G Memory, 8xA100-80G) x N. There is a rather curious phenomenon that when more CPUs are used, the system runs slightly slower. I speculate this may have something to do with the multi-processing of dataloader.
## **入门指南**
### 安装
## **Getting Started**
### Installation
使用下面的命令安装相关依赖
Use the following command to install related dependencies:
```bash
pip install -r requirements.txt
```
### 数据集准备
### Dataset Preparation
目前给出了智源开源的悟道数据集和EleutherAI开源的the pile数据集。数据集下载和处理代码在data目录下。
其中悟道数据集由于需要同意一些协议才能下载因此可能需要修改一下download_wudao中的链接[悟道](https://data.baai.ac.cn/details/WuDaoCorporaText)。
Currently provided are the Wudao dataset open-sourced by Zhiyuan and the Pile dataset open-sourced by EleutherAI. Dataset download and processing scripts are located in the data directory.
Due to the required agreement for downloading the Wudao dataset, you may need to modify the link in download_wudao. [Wudao](https://data.baai.ac.cn/details/WuDaoCorporaText).
运行下面的命令进行数据下载并进行分片
Thanks to [@skepsun](https://github.com/skepsun)'s suggestion, using scidb to download the wudao dataset does not require login, and the download is more stable. https://github.com/s-JoL/Open-Llama/issues/42.
**Note that data download may fail. It is recommended to divide the download and processing in the script into two parts for multiple attempts, which will automatically resume downloads from breakpoints.**
Run the following commands to download the data and perform partitioning:
```bash
bash data/download_the_pile.sh
bash data/download_wudao.sh
```
数据将按照每个文件最大16384行存储为小文件便于后续使用多进程训练时进行读取。存储格式为jsonl.zst使用zstd进行压缩最终数据大小为519.5G合计16466个文件。
The data will be stored as small files, with a maximum of 16384 lines per file, for easy reading during multi-process training. The storage format is jsonl.zst, compressed using zstd, with a final data size of 519.5 GB, consisting of 16,466 files in total.
其中the pile数据集包含210607728行json line悟道数据集包含59132213行json line。
The Pile dataset contains 210,607,728 JSON lines, while the Wudao dataset contains 59,132,213 JSON lines.
具体数据格式如下
The specific data format is as follows:
```
WuDao
{'id': 1, 'dataType': '百科', 'title': 'some title', 'content': 'some content'}
@ -106,36 +175,68 @@ WuDao
The Pile
{'text': 'some text', 'meta': {'pile_set_name': 'Github'}}
```
验证数据完整性可见 [issue](https://github.com/s-JoL/Open-Llama/issues/5)
Check the data integrity in [issue](https://github.com/s-JoL/Open-Llama/issues/5).
### 数据读取
数据读取相关代码可见dataset目录其中包含根据下载的数据集使用SentencePiece训练分词模型以及根据分词器构建DataLoader。
### Related Tools
The utils directory provides code for training a tokenizer, supplementing an existing tokenizer model, and converting checkpoints.
Use SentencePiece to train a tokenizer with the following command:
训练分词器使用如下命令
```bash
python3 dataset/train_tokenizer.py
python3 utils/train_tokenizer.py
```
使用如下命令查看DataLoader输出的结果
In configs, a tokenizer model with a 40k vocabulary, trained only using the Wudao dataset (4w_cn_vocab_wudao15.model), is provided.
To supplement the vocabulary based on an existing tokenizer model, refer to:
```bash
python3 dataset/pretrain_dataset.py
python3 utils/merge_tokenizer.py
```
### 模型结构
我们基于Transformers库中的[Llama](https://github.com/facebookresearch/llama)参考论文原文中的2.4 Efficient implementation一节进行了修改
同时还参考了一些其他论文引入了一些优化。具体来说我们引入了由META开源的[xformers库](https://github.com/facebookresearch/xformers)中的memory_efficient_attention操作来进行
Self Attention的计算这对于性能有明显的提升提升大约30%。
具体可以参见[modeling_llama.py](https://github.com/Bayes-Song/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L240)
同时我们还参考了[Bloom](https://huggingface.co/bigscience/bloom)对于Token Embedding引入了Stable Embedding以更好的稳定训练。
A bilingual English and Chinese tokenizer model (llama_tokenizer_extended.model) is created by merging the META official tokenizer model with the 40k Chinese tokenizer mentioned above.
最后我们参考[PALM](https://arxiv.org/abs/2204.02311)使用了Shared Input-Output Embeddings。
To convert existing Llama model checkpoints, refer to:
### 预训练
我们基于Accelerate库进行多GPU并行训练启动命令如下
```bash
accelerate launch --config_file configs/default_config.yaml pretrain_llama.py
python3 utils/convert_ckpt.py
```
某些情况下可能需要指定下列参数
### Data Loading
Data loading-related code can be found in dataset/dataset.py, which includes pre-training and instruction fine-tuning data processing. To add other datasets, only the transform function needs to be modified.
The data loading process is as follows:
1. Use the transform function to unify data formats from different datasets to {'text': 'xxx'}
2. Tokenize using Tokenizer
3. Sample long sequences; currently, three modes are provided: truncation, sampling (refer to the Gopher paper), and splitting
4. Optional: concatenate texts from different docs, reducing padding in the data and accelerating training. In the v1 version, padding accounted for 30%; after concatenation, padding is reduced to 5%.
Use the following command to view the output of DataLoader and check the correctness of tokenization:
```bash
python3 dataset/dataset.py
```
### Model Structure
Starting from the Llama implementation in the Transformers library, we made modifications following Section 2.4 (Efficient implementation) of the [Llama](https://github.com/facebookresearch/llama) paper, and also referenced other papers to introduce some optimizations. Specifically, we used the memory_efficient_attention operation from the [xformers library](https://github.com/facebookresearch/xformers) open-sourced by META for Self Attention computation, which yields a significant performance improvement of approximately 30%. Further details can be found in [modeling_llama.py](https://github.com/huggingface/transformers/blob/main/src/transformers/models/open_llama/modeling_open_llama.py#L229).
Additionally, we referred to [Bloom](https://huggingface.co/bigscience/bloom) and introduced Stable Embedding for Token Embedding to better stabilize training.
Finally, we referenced [PALM](https://arxiv.org/abs/2204.02311) and employed Shared Input-Output Embeddings.
### Pre-training
We use multi-GPU parallel training based on the Accelerate library, with the following start command:
```bash
accelerate launch --config_file configs/accelerate_configs/ds_stage1.yaml train_lm.py --train_config configs/pretrain_config.yaml --model_config configs/model_configs/7B.json
```
In some cases, you may need to specify the following parameters:
```
--main_process_ip
--main_process_port
@ -143,64 +244,68 @@ accelerate launch --config_file configs/default_config.yaml pretrain_llama.py
--num_machines
--machine_rank
```
我们使用[Wandb](https://wandb.ai/)进行训练的可视化,需要自行修改环境变量 WANDB_API_KEY 。
其中我们使用了DeepSpeed stage1以减少显存占用。accelerate相关配置可见configs/default_config.yaml。
We use [Wandb](https://wandb.ai/) for visualizing training. You need to modify the WANDB_API_KEY environment variable yourself.
训练相关超参数可见configs/train_config.py目前我们使用10W词表的7B Llama模型进行训练具体配置如下
Among them, we use DeepSpeed stage1 to reduce memory usage. For Accelerate-related configurations, see configs/accelerate_configs.
Training related hyperparameters can be found in configs/pretrain_config.yaml.
The default parameters use LlamaTokenizer with a supplemented 40k Chinese vocabulary tokenizer model, and the model size is 7B. The specific configuration is as follows:
| max_length | batch_size | learning_rate | weight_decay | params | dimension | n heads | n layer | vocab_size |
|------------|------------------|---------------|--------------|--------|-----------|---------|---------|------------|
| 1024 | 2 | 2e-4 | 1e-1 | 6.88B | 4096 | 32 | 32 | 100000 |
| 2048 | 2 | 2e-4 | 1e-1 | 7.03B | 4096 | 32 | 32 | 68762 |
```
=========================================================================================================
Layer (type:depth-idx) Output Shape Param #
=========================================================================================================
LlamaForCausalLM [1, 64, 32, 128] --
├─LlamaModel: 1-1 [1, 64, 32, 128] --
│ └─Embedding: 2-1 [1, 64, 4096] 409,600,000
│ └─LayerNorm: 2-2 [1, 64, 4096] 8,192
└─ModuleList: 2-3 -- --
│ └─LlamaDecoderLayer: x32 [1, 64, 4096] 202,383,360 x 32
│ └─LlamaRMSNorm: 2-4 [1, 64, 4096] 4,096
=========================================================================================================
Total params: 6,885,879,808
Trainable params: 6,885,879,808
==============================================================================================================
Layer (type:depth-idx) Output Shape Param #
==============================================================================================================
OpenLlamaForCausalLM [1, 32, 64, 128] --
├─OpenLlamaModel: 1-1 [1, 32, 64, 128] --
│ └─Embedding: 2-1 [1, 64, 4096] 281,649,152
│ └─ModuleList: 2-2 -- --
│ └─OpenLlamaDecoderLayer: 3x32 [1, 64, 4096] 202,383,360
└─OpenLlamaRMSNorm: 2-3 [1, 64, 4096] 4,096
├─Linear: 1-2 [1, 64, 68762] 281,649,152
==============================================================================================================
Total params: 7,039,569,920
Trainable params: 7,039,569,920
Non-trainable params: 0
Total mult-adds (G): 6.89
Total mult-adds (G): 7.04
```
目前的进展
![](assets/pretrain_loss.png)
Pre-training loss from scratch is shown below:
![loss](assets/pretrain_loss.png)
### Instruction-Tuning
我们使用目前开源的三个数据集进行Instruction-tuning后续会加入更多的任务以及自己构建的数据集。
We use the currently available seven datasets for Instruction-tuning, and more tasks and our own datasets will be added later.
- [yizhongw/self_instruct](https://huggingface.co/datasets/yizhongw/self_instruct)
- [BelleGroup/train_0.5M_CN](https://huggingface.co/datasets/BelleGroup/train_0.5M_CN)
- [BelleGroup/train_1M_CN](https://huggingface.co/datasets/BelleGroup/train_1M_CN)
- [BelleGroup/multiturn_chat_0.8M](https://huggingface.co/datasets/BelleGroup/multiturn_chat_0.8M)
- [BelleGroup/school_math_0.25M](https://huggingface.co/datasets/BelleGroup/school_math_0.25M)
- [RyokoAI/ShareGPT52K](https://huggingface.co/datasets/RyokoAI/ShareGPT52K)
- [anon8231489123/ShareGPT_Vicuna_unfiltered](https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered)
- [Graverman/Instruct-to-Code](https://huggingface.co/datasets/Graverman/Instruct-to-Code)
其中ShareGPT52K数据在datastes的处理有些问题我们直接下载原数据重新进行了处理。
我们对原始数据进行了一些预处理,格式如下
The ShareGPT_Vicuna_unfiltered dataset has some issues when processed with the datasets library, so we directly downloaded the original data and reprocessed it.
We performed some preprocessing on the original data, with the format as follows:
```
user: {prompt}\nsystem: {completion}</s>
```
具体训练代码和预训练基本一样,代码可见
```
instruction_tuning.py
The startup command is basically the same as pre-training:
```bash
accelerate launch --config_file configs/accelerate_configs/ds_stage1.yaml train_lm.py --train_config configs/instruct_config.yaml --model_config configs/model_configs/7B.json
```
启动命令也基本一致
```bash
accelerate launch --config_file configs/default_config.yaml instruction_tuning.py
```
某些情况下可能需要指定下列参数
In some cases, you may need to specify the following parameters:
```
--main_process_ip
--main_process_port
@ -209,163 +314,89 @@ accelerate launch --config_file configs/default_config.yaml instruction_tuning.p
--machine_rank
```
过程中Loss如下基本在波动不怎么下降
The loss during the process is shown below, with a total of 3 epochs:
![loss](assets/instruct_loss.png)
### RLHF
暂无
Not available yet.
### Server
单轮对话使用server.py对于多轮对话使用chat_server.py
For multi-turn dialogue, use chat_server.py.
基于Gradio开发。
## 性能对比
Developed based on Gradio.
### 训练框架
在训练框架方面我们测试了HuggingFace开源的Accelerate库和HPC-AI开源的ColossalAI我们测试在打满显卡时性能差异较小。因此最终选择了实现相对简单的Accelerate库作为训练框架
## Performance Comparison
测试数据如下,测试过程中使用的模型结构为
### Training Framework
In terms of training frameworks, we tested Hugging Face's open-source Accelerate library, PyTorch Lightning, and HPC-AI's open-source ColossalAI. We found that their performance differences are relatively small when fully utilizing GPUs. Therefore, we chose the relatively simple-to-implement Accelerate library as the training framework.
The test code can be found in utils/speed_test.py.
The model structure used during the testing process is:
| Model | n gpu | n layer | n heads | hidden size | vocab size | seq length |
|-------|-------|---------|---------|-------------|------------|------------|
| GPT2 | 2 | 6 | heads | 4096 | 250100 | 1024 |
测试结果如下,可以看到当打满时速度和显存相差不大
| | HuggingFace | HuggingFace | ColossalAI | ColossalAI | ColossalAI |
The test results are shown below, indicating that when the GPUs are fully utilized, the differences in speed and memory consumption are not significant.
| | Hugging Face | Hugging Face | ColossalAI | ColossalAI | ColossalAI |
|-----------------|-----------------------------------|------------------------------------|--------------------------------------------------------|--------------------------------------------------------|------------------------------------|
| config | without activation ckpt, bs2 | without activation ckpt, max_bs=12 | with activation ckpt, bs2 | without activation ckpt, bs2 | without activation ckpt, max_bs=10 |
| second per step | 0.336, fw=0.033, bw=0.3, opt=5e-6 | 1.25 | 0.347 | 0.308, fw=0.067, bw=0.152, opt=0.088 | 1.055 |
| gpu memory | nvidia-smi 45445 | | fw+bw+opt=21053.63+22064.12+17987.52, nvidia-smi 40961 | fw+bw+opt=24684.74+21087.13+17987.52, nvidia-smi 46821 | oom after 10 steps, 疑似有内存泄漏 |
| gpu memory | nvidia-smi 45445 | | fw+bw+opt=21053.63+22064.12+17987.52, nvidia-smi 40961 | fw+bw+opt=24684.74+21087.13+17987.52, nvidia-smi 46821 | oom after 10 steps |
### 性能优化
在最早版本中我们使用DeepSpeed stage2 + Transformers中的原生Llama实现进行训练但是速度和论文中所说的相差较大因此后续我们进行了一系列的优化我们将每一步的性能提升列在下面可供参考。
### Performance Optimization
论文中提到对于6.7B模型使用了1T token进行训练最终的gpu时为82432因此可以计算出他的训练速度大致为3370 token/s/gpu。
当使用下面的优化后速度可以基本和论文中速度一致使用20x8 A100-80G进行测试。预计加入更多融合算子可以取得更好的性能。
In the earliest version, we used the native Llama implementation from DeepSpeed stage2 + Transformers for training. However, the speed was significantly different from what was claimed in the paper. Therefore, we carried out a series of optimizations afterwards, and we list each step of the performance improvement below for reference.
| | V1 | V2 |
|---------------------|--------------|-----------------------|
| Model | Transformers | Transformers+xformers |
| Optimizer | Pytorch Adam | Fused Adam |
| DeepSpeed | stage2 | stage1 |
| Grad Accumulation | 4 | 12 |
| Return Padding Mask | yes | no |
| Speed token/s/gpu | 1378 | 3290 |
The paper mentioned that for the 6.7B model, 1T tokens were used for training and the total GPU hours were 82,432, from which the training speed can be roughly calculated as 3370 token/s/gpu. After using the following optimizations, the speed is now basically consistent with what was claimed in the paper when tested on 20x8 A100-80G. It is expected that more fusion operators will be added in the future to achieve better performance.
### 和其他开源模型性能对比
下表是一个对目前开源模型性能的一个总结使用GPU device均为A100由于模型大小各不相同结构也有一定差异难以准确的对比性能作为一个粗略估计可以认为速度和模型参数量基本呈反比关系这一点看Llama不同大小的模型可以得到印证。基于这个粗略估计可以看到使用本项目的性能明显优于其他项目。
| | V1 | V2 |
|---------------------|--------------|------------------------------------|
| Dataset | self implemented | datasets |
| Model | Transformers | Transformers+xformers |
| Optimizer | Pytorch Adam | Fused Adam |
| DeepSpeed | stage2 | stage1 |
| Grad Accumulation | 4 | 12 |
| Return Padding Mask | yes | no |
| Speed token/s/gpu | 1378 | 3637 |
### Comparison with Other Open-source Models
The following table summarizes the performance of currently available open-source models. In all cases, the GPU device used is A100. Due to differences in the size and structure of the models, it is difficult to make accurate performance comparisons. As a rough estimate, it can be assumed that the speed is generally inversely proportional to the size of the model parameters, which is confirmed by the performance of Llama with models of different sizes. Based on this rough estimate, it can be seen that the performance using our project is significantly better than that of other projects.
| Model | Open-Llama | LLAMA | LLAMA | LLAMA | OPT | Bloom | GLM | GPT-NEOX | CPM-ANT | CodeGeeX |
|---------------------|------------|----------|---------|-----------|---------|--------------------|-------|----------|---------|-----------|
| Model size | 6.9B | 6.7B | 13B | 65B | 175B | 175B | 130B | 20B | 10B | 13B |
| Model size | 7.0B | 6.7B | 13B | 65B | 175B | 175B | 130B | 20B | 10B | 13B |
| Token | | 1T | 1T | 1.4T | 180B | 366B | 400B | 402B | 200B | 13.9B |
| GPU Hour | | 82,432 | 135,168 | 1,022,362 | 809,472 | 1,082,990 | 43776 | 175680 | 47040 | 3072 |
| speed token/s/gpu | 3290 | 3370 | 2055 | 380 | 61.8 | 93.9 | 105.7 | 635.6 | 1181 | 1257 |
| 相关依赖 | xformers | xformers | | | metaseq | Megatron-DeepSpeed | | | BMtrain | MindSpore |
| speed token/s/gpu/B | 22701 | 22579 | 26715 | 24700 | 10815 | 16432 | 13741 | 12712 | 11810 | 16341 |
| speed token/s/gpu | 3637 | 3370 | 2055 | 380 | 61.8 | 93.9 | 105.7 | 635.6 | 1181 | 1257 |
| 相关依赖 | xformers | xformers | | | metaseq | Megatron-DeepSpeed | | | BMtrain | MindSpore |
| speed token*params B/s/gpu | 25728 | 22579 | 26715 | 24700 | 10815 | 16432 | 13741 | 12712 | 11810 | 16341 |
## 后续计划
## Future Plans
1. 加入更多训练监控,比如训练数据类别的分布等,加入继续训练相关代码
2. 开源预训练好的多语言Llama 6.9B的checkpoint
3. 实现Instruction-tuning代码并开源相关checkpoint
4. 使用Gradio搭建在线Demo
5. 使用[Triton](https://github.com/openai/triton)加入更多高性能算子,进一步提升性能
6. 加入根据Common Crawl构建预训练数据集相关代码并开源相关数据集
7. 加入多模态训练代码
1. Integrate RLHF code.
2. Use Triton to add more high-performance operators to further improve performance.
3. Add code for building pre-training datasets based on Common Crawl and open related datasets.
4. Add code for multimodal training.
## 引用
## Citation
```
@misc{openllama,
title={Open-Llama},
author={Liang Song},
author={s-JoL},
year={2023},
howpublished={\url{https://github.com/Bayes-Song/Open-Llama}},
howpublished={\url{https://github.com/s-JoL/Open-Llama}},
}
```
<!-- 一些之前没注意到的部分
1. [GPT3](https://arxiv.org/pdf/2005.14165.pdf), Details of Model Training
During training we always train on sequences of the full nctx = 2048 token context window, packing multiple documents into a single sequence when documents are shorter than 2048, in order to increase computational efficiency. Sequences with multiple documents are not masked in any special way but instead documents within a sequence are delimited with a special end of text token, giving the language model the information necessary to infer that context separated by the end of text token is unrelated. This allows for efficient training without need for any special sequence-specific masking.
在[PALM](https://arxiv.org/pdf/2204.02311.pdf)中也有类似的说法
Sequence length A sequence length of 2048 was used for all models. Input examples are concatenated together and then split into sequences of exactly 2048 tokens, so that there are no padding tokens, but examples may be split in the middle. Input examples are differentiated from one another with a special [eod] token.
2. GPT3, Common Crawl Filtering
使用高质量文本作为正例其他所有样本作为负例。根据判为正例的概率作为筛选np.random.pareto(α) > 1 − document_score。
思想是尽量使用和高质量样本相似的数据。
The classifier is trained using logistic regression classifier with features from Spark's standard tokenizer and HashingTF.
3. GPT3, fuzzy deduplication
使用MinHashLSH进行去重同时把CC中的WebText部分数据去掉。这些特征和分类器使用的一致。
we fuzzily deduplicated documents (i.e. removed documents with high overlap with other documents) within each dataset using Spark's MinHashLSH implementation with 10 hashes
4. GPT3, Test Set Contamination
5. [The pile](https://arxiv.org/pdf/2101.00027.pdf), BPB(bits per UTF-8 encoded byte)/bits per character/perplexity
$
BPB = (L_T / L_B) \log_2(e^{\ell}) = (L_T / L_B)\,\ell / \ln(2) \\
perplexity = P(w1, w2, w3, w4, ...)^{-\frac{1}{N}} \\
bpc=-\frac{1}{T}\sum_i log_2 P(w_i|w1, w2, ..., w_{i-1}) \\
2^{bpc}=(\prod_i P(w_i|w1, w2, ..., w_{i-1}))^{-\frac{1}{T}}=perplexity
$
bpc是字符粒度和分词算法相关。而bpb为byte粒度与分词算法无关。
可以使用bpb的差异衡量不同数据的难度。
6. The pile, diversity of the collected data
数据多样性
We hypothesize that this is due to the perplexity based filtering used in CC-100, where a language model is trained on Wikipedia and all data with a perplexity too high or too low is discarded. This effectively discards any data too similar to or too different from Wikipedia, which severely limits the diversity of the collected data.
7. The pile, bytes per token
Since the GPT-2 BPE tokenizer is trained on WebText, the mean bytes per token is also a very rough indicator of how syntactically different each Pile component is from WebText.
8. The pile, Deduplication
We used 10 hash functions for each Minhash and an approximate Jaccard similarity of 0.5.
9. GLM, Embedding Layer Gradient Shrink
和stable embedding类似
$
word-embedding = word-embedding*\alpha+word-embedding.detach()*(1-\alpha)
$
10. PALM, Training Instability
训练中的loss尖峰是由特定的数据和特定的参数共同造成使用模型回滚+跳过部分数据解决。
Instead, we found that a simple strategy to effectively mitigate the issue: We re-started training from a checkpoint roughly 100 steps before the spike started, and skipped roughly 200–500 data batches, which cover the batches that were seen before and during the spike. With this mitigation, the loss did not spike again at the same point. We do not believe that the spikes were caused by “bad data” per se, because we ran several ablation experiments where we took the batches of data that were surrounding the spike, and then trained on those same data batches starting from a different, earlier checkpoint. In these cases, we did not see a spike. This implies that spikes only occur due to the combination of specific data batches with a particular model parameter state
11. [Chinchilla](https://arxiv.org/pdf/2203.15556.pdf), Optimal model scaling
20 tokens per parameter, for example 10B model should use 200B tokens to pretrain
12. [Gopher](https://arxiv.org/pdf/2112.11446.pdf), Quality Filtering
Quality Filtering (MassiveWeb only) The vast majority of text found on the web is of insufficient
quality to be useful for language model training. For example, many web pages contain primarily
automatically generated content, or text that is not intended for human consumption (such as keywords
for search-engine optimisation). Much of the web also comprises social media content, which can
variously lack context, coherence, or substance. To remove low-quality data while minimising potential
for bias, we apply a number of simple, easily understood heuristic filters: we remove any document
that does not contain between 50 and 100,000 words, or whose mean word length is outside the
range of 3 to 10 characters; we remove any document with a symbol-to-word ratio greater than 0.1
for either the hash symbol or the ellipsis; and we remove any document with more than 90% of lines
starting with a bullet point, or more than 30% ending with an ellipsis. We also require that 80%
of words in a document contain at least one alphabetic character, and apply a "stop word" filter, to
remove documents that do not contain at least two of the following English words: the, be, to, of, and,
that, have, with; this adequately deals with ostensibly English documents that contain no coherent
English text.
13. Gopher, Constructing Token Sequences
和GPT3中的避免mask的方法类似
-->
<p align="center">
<a href="https://star-history.com/#s-JoL/Open-Llama&Date">
<img src="https://api.star-history.com/svg?repos=s-JoL/Open-Llama&type=Date" alt="Star History Chart">
</a>
</p>

View File

@ -1,274 +0,0 @@
<!--
* @Author: LiangSong(sl12160010@gmail.com)
* @Date: 2023-03-10 21:18:35
* @LastEditors: LiangSong(sl12160010@gmail.com)
* @LastEditTime: 2023-04-16 23:49:28
* @FilePath: /Open-Llama/README_en.md
* @Description:
*
* Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved.
-->
# Open-Llama
Translated by ChatGPT.
Open-Llama is an open source project that provides a complete set of training processes for building large-scale language models, from data preparation to tokenization, pre-training, instruction tuning, and reinforcement learning techniques such as RLHF.
## Progress
The checkpoint after Instruct-tuning has been open-sourced on [HuggingFace](https://huggingface.co/s-JoL/Open-Llama-V1).
To use the checkpoint, you need to first install the latest version of Transformers using the following command.
``` bash
pip install git+https://github.com/s-JoL/transformers.git@dev
```
We completed pre-training on 300 billion tokens, with a total of 80,000 steps trained, using a global batch size of 4 million, consistent with Llama. We constructed the instruction-tuning dataset from a total of 7 parts of data, so that the model has a certain degree of programming ability, mathematical ability, and multi-turn dialogue ability. For specific data, please refer to the instruction-tuning section.
[Demo](http://home.ustc.edu.cn/~sl9292/)
We tested our model by referring to some tests for Wenxin Yiyan. Original report can be found at Baidu ["Wenxin Yiyan" Test: What is the level of domestic generative AI?](https://www.8btc.com/article/6809666)
The results of our model are shown in the following figure, and more results are yet to be further tested. Due to domestic network problems, the use of the above Demo may result in a request loss situation. If there is no response for a long time, please refresh and try again.
![image1](assets/eng1.png)![image2](assets/eng2.png)![image3](assets/eng3.png)
Here is a demonstration of the model's ability in multi-turn dialogue about code.
![image4](assets/multiturn_chat_en.jpeg)
We roughly estimate the cost to achieve the above results. The 40K-step pre-training used 150 million pre-training data, which is about 110B tokens. The total training time is 76 hours, and the cost is about $19,152 according to Google Cloud's A100 quotation. The Instruction-tuning training was carried out for 12k steps, using 1.6 million data, and the total training time was 3.4 hours, costing about $342. Therefore, the total cost of training such a model from scratch is less than $20,000.
Currently, the model's performance in both mathematical and code-related tasks is noticeably poor. This is partially due to the training data used, but I also believe it is due to the size of the model. However, the ability to perform logical reasoning is essential for any usable model. Therefore, future updates will focus on improving this aspect of the model's capabilities.
## **Features**
### Ease of Use
We believe that ease of use is one of the most important features when building large-scale language models. To make Open-Llama more accessible, we focus on the following:
- **Minimal implementation**: We use the simplest implementation approach to reduce the barrier to entry and make it easy for beginners to get started.
- **Complete workflow**: We provide complete code from data set construction to training, making each step of building a large language model clear and visible.
### High Performance
Since training large language models is costly, high performance is also crucial when building large-scale language models. To achieve high-performance training, we employ the following techniques:
- **Fused CUDA kernel**: Using fused CUDA kernels provided by [xformers](https://github.com/facebookresearch/xformers) can fuse multiple operations together, reducing data transfer between GPU and CPU, and improving training efficiency.
- **Parallel training**: We use the [Accelerate](https://huggingface.co/docs/accelerate/index) library to support parallel training on multiple GPUs, accelerating the training process.
For the 7B model, the training speed of the Llama model using the PyTorch native version in the Transformers library is 1378 tokens/s/GPU. With our code, the training speed reaches 3290 tokens/s/GPU, which is close to the reported 3370 tokens/s/GPU in the [Llama paper](https://arxiv.org/pdf/2302.13971.pdf).
If we pretrain with 500 billion tokens, it will take 43,000 GPU hours. Assuming the price of A100-80G Spot on Google Cloud is $12.6 per hour for 8 GPUs, the total cost will be $67,725.
Without acceleration, the cost would be $158,744. Our method reduces the training cost by $90,019 in total.
More comparison can be found in [Comparison of Performance with Other Open-Source Models](https://github.com/Bayes-Song/Open-Llama/blob/main/README_en.md#performance-comparison-with-other-open-source-models).
### Universality
When training language models, we aim to build a universal model that can be used for different languages and fields. To achieve this, we adopt the following strategies:
- **Multi-language support**: We support a variety of language corpora, including English, Chinese, Japanese, and other languages, allowing users to choose according to their needs.
- **Field universality**: We hope that the model can not only help with everyday problems but also assist in professional fields such as science and law.
## **Requirements**
- Python 3.7 or higher
- PyTorch 1.13
- Customized [Transformers library](https://github.com/Bayes-Song/transformers)
- [Accelerate library](https://huggingface.co/docs/accelerate/index)
- CUDA 11.6 or higher version (for GPU acceleration, tested based on CUDA 11.7)
## **Getting Started**
### Installation
Use the following command to install the required dependencies:
```bash
pip install -r requirements.txt
```
### Dataset Preparation
Currently, we provide the Wudao dataset from BAAI (Beijing Academy of Artificial Intelligence) and The Pile dataset from EleutherAI. The code for downloading and processing the datasets can be found in the data directory. Please note that the Wudao dataset requires agreeing to some agreements before downloading, so you may need to modify the link in download_wudao.sh. [WuDao](https://data.baai.ac.cn/details/WuDaoCorporaText)
Use the following commands to download and shard the data:
```bash
bash data/download_the_pile.sh
bash data/download_wudao.sh
```
The data will be stored as small files with a maximum of 16,384 lines per file for efficient multi-processing training. The storage format is jsonl.zst compressed with zstd, resulting in a total data size of 519.5 GB and 16,466 files.
The Pile dataset contains 210,607,728 rows of JSON lines, and the Wudao dataset contains 59,132,213 rows of JSON lines.
The specific data format is as follows:
```
WuDao
{'id': 1, 'dataType': '百科', 'title': 'some title', 'content': 'some content'}
The Pile
{'text': 'some text', 'meta': {'pile_set_name': 'Github'}}
```
Verification of data integrity can be found in this [issue](https://github.com/s-JoL/Open-Llama/issues/5).
### Data Loading
The code for loading data can be found in the dataset directory, which includes training a tokenizer using SentencePiece and constructing a DataLoader based on the tokenizer.
Train the tokenizer with the following command:
```bash
python3 dataset/train_tokenizer.py
```
Check the DataLoader output with the following command:
```bash
python3 dataset/pretrain_dataset.py
```
### Model Structure
We modified the [Llama](https://github.com/facebookresearch/llama) model in the Transformers library based on section 2.4 "Efficient Implementation" in the original paper and introduced some optimizations from other papers. Specifically, we introduced the memory_efficient_attention operation from the [xformers library](https://github.com/facebookresearch/xformers) by META for computing self-attention, which significantly improves performance by about 30%. Please refer to modeling_llama.py for details.
We also referred to Bloom for introducing stable embeddings for better training of token embeddings.
Finally, we referred to PALM and used shared input-output embeddings.
### Pretraining
We use the Accelerate library for multi-GPU parallel training. Launch training with the following command:
```bash
accelerate launch --config_file configs/default_config.yaml pretrain_llama.py
```
In some cases, it may be necessary to specify the following parameters.
```
--main_process_ip
--main_process_port
--num_processes
--num_machines
--machine_rank
```
We use [Wandb](https://wandb.ai/) for training visualization and you need to modify the environment variable WANDB_API_KEY.
We use DeepSpeed stage 1 to reduce GPU memory usage. Accelerate-related configurations can be found in configs/default_config.yaml.
The training-related hyperparameters can be found in configs/train_config.py. We currently train a 7B Llama model with a vocabulary size of 100,000, and the specific configuration is as follows:
| max_length | batch_size | learning_rate | weight_decay | params | dimension | n heads | n layer | vocab_size |
|------------|------------------|---------------|--------------|--------|-----------|---------|---------|------------|
| 1024 | 2 | 2e-4 | 1e-1 | 6.88B | 4096 | 32 | 32 | 100000 |
```
=========================================================================================================
Layer (type:depth-idx) Output Shape Param #
=========================================================================================================
LlamaForCausalLM [1, 64, 32, 128] --
├─LlamaModel: 1-1 [1, 64, 32, 128] --
│ └─Embedding: 2-1 [1, 64, 4096] 409,600,000
│ └─LayerNorm: 2-2 [1, 64, 4096] 8,192
│ └─ModuleList: 2-3 -- --
│ │ └─LlamaDecoderLayer: x32 [1, 64, 4096] 202,383,360 x 32
│ └─LlamaRMSNorm: 2-4 [1, 64, 4096] 4,096
=========================================================================================================
Total params: 6,885,879,808
Trainable params: 6,885,879,808
Non-trainable params: 0
Total mult-adds (G): 6.89
```
Current Progress
![](assets/loss.png)
### Instruction-Tuning
We performed instruction-tuning on three currently available open-source datasets, and we plan to add more tasks and our own constructed datasets in the future.
- [yizhongw/self_instruct](https://huggingface.co/datasets/yizhongw/self_instruct)
- [BelleGroup/generated_train_0.5M_CN](https://huggingface.co/datasets/BelleGroup/generated_train_0.5M_CN)
- [BelleGroup/generated_train_1M_CN](https://huggingface.co/datasets/BelleGroup/generated_train_1M_CN)
- [BelleGroup/train_0.5M_CN](https://huggingface.co/datasets/BelleGroup/train_0.5M_CN)
- [BelleGroup/train_1M_CN](https://huggingface.co/datasets/BelleGroup/train_1M_CN)
- [BelleGroup/multiturn_chat_0.8M](https://huggingface.co/datasets/BelleGroup/multiturn_chat_0.8M)
- [BelleGroup/school_math_0.25M](https://huggingface.co/datasets/BelleGroup/school_math_0.25M)
- [RyokoAI/ShareGPT52K](https://huggingface.co/datasets/RyokoAI/ShareGPT52K)
- [Graverman/Instruct-to-Code](https://huggingface.co/datasets/Graverman/Instruct-to-Code)
There were some issues with the handling of ShareGPT52K dataset in the processing of the datasets. We downloaded the original data again and reprocessed it.
We did some preprocessing on the raw data, the format is as follows:
```
user: {prompt}\nsystem: {completion}</s>
```
The training code is similar to pre-training and can be seen in
```
instruction_tuning.py
```
The launch command is also similar to pre-training:
```bash
accelerate launch --config_file configs/default_config.yaml instruction_tuning.py
```
In some cases, the following parameters may need to be specified:
```
--main_process_ip
--main_process_port
--num_processes
--num_machines
--machine_rank
```
The loss during the process is as follows, basically fluctuating and not decreasing much:
![loss](assets/instruct_loss.png)
### RLHF
N/A
### Server
Use server.py for single-turn conversation, and chat_server.py for multi-turn conversation.
Developed based on Gradio.
## Performance Comparison
### Training Framework
In terms of the training framework, we tested the HuggingFace's Accelerate library and HPC-AI's ColossalAI, and found that there was little difference in performance when running on fully utilized GPUs. Therefore, we ultimately chose the relatively simple Accelerate library as our training framework.
The test data is shown below, and the model structure used during testing is:
| Model | n gpu | n layer | n heads | hidden size | vocab size | seq length |
|-------|-------|---------|---------|-------------|------------|------------|
| GPT2 | 2 | 6 | heads | 4096 | 250100 | 1024 |
The test results are shown below, and we can see that there is little difference in speed and memory utilization when running on fully utilized GPUs:
| | HuggingFace | HuggingFace | ColossalAI | ColossalAI | ColossalAI |
|-----------------|-----------------------------------|------------------------------------|--------------------------------------------------------|--------------------------------------------------------|------------------------------------|
| config | without activation ckpt, bs2 | without activation ckpt, max_bs=12 | with activation ckpt, bs2 | without activation ckpt, bs2 | without activation ckpt, max_bs=10 |
| second per step | 0.336, fw=0.033, bw=0.3, opt=5e-6 | 1.25 | 0.347 | 0.308, fw=0.067, bw=0.152, opt=0.088 | 1.055 |
| gpu memory | nvidia-smi 45445 | | fw+bw+opt=21053.63+22064.12+17987.52, nvidia-smi 40961 | fw+bw+opt=24684.74+21087.13+17987.52, nvidia-smi 46821 | oom after 10 steps, suspected memory leak |
### Performance Optimization
In the earliest version, we used DeepSpeed stage2 and the native Llama implementation in Transformers for training. However, the speed was significantly different from what was reported in the paper. Therefore, we conducted a series of optimizations and list the performance improvements for each step below.
The paper mentions that they trained the 6.7B model with 1T tokens, and the total GPU hours were 82,432, so the training speed was approximately 3370 tokens/s/GPU. After implementing the following optimizations, our speed is now comparable to that reported in the paper, using 20x8 A100-80G for testing. We expect to achieve better performance by adding more fusion operators in the future.
| | V1 | V2 |
|---------------------|--------------|-----------------------|
| Model | Transformers | Transformers+xformers |
| Optimizer | Pytorch Adam | Fused Adam |
| DeepSpeed | stage2 | stage1 |
| Grad Accumulation | 4 | 12 |
| Return Padding Mask | yes | no |
| Speed token/s/gpu | 1378 | 3290 |
### Performance Comparison with Other Open-source Models
The following table summarizes the performance of current open-source models, all tested on A100 GPUs. Due to differences in model sizes and structures, it is difficult to make accurate performance comparisons. As a rough estimate, it can be assumed that speed and model parameter count are inversely proportional, as evidenced by Llama models of different sizes. Based on this rough estimate, it can be seen that the performance using our project is significantly better than other projects.
| Model | Open-Llama | LLAMA | LLAMA | LLAMA | OPT | Bloom | GLM | GPT-NEOX | CPM-ANT | CodeGeeX |
|---------------------|------------|----------|---------|-----------|---------|--------------------|-------|----------|---------|-----------|
| Model size | 6.9B | 6.7B | 13B | 65B | 175B | 175B | 130B | 20B | 10B | 13B |
| Token | | 1T | 1T | 1.4T | 180B | 366B | 400B | 402B | 200B | 13.9B |
| GPU Hour | | 82,432 | 135,168 | 1,022,362 | 809,472 | 1,082,990 | 43776 | 175680 | 47040 | 3072 |
| speed token/s/gpu | 3290 | 3370 | 2055 | 380 | 61.8 | 93.9 | 105.7 | 635.6 | 1181 | 1257 |
| Dependencies | xformers | xformers | | | metaseq | Megatron-DeepSpeed | | | BMtrain | MindSpore |
| speed token/s/gpu/B | 22701 | 22579 | 26715 | 24700 | 10815 | 16432 | 13741 | 12712 | 11810 | 16341 |
## Future Plans
1. Add more training monitoring, such as the distribution of training data categories, and add code for continuing training.
2. Release the pre-trained checkpoint for the multi-lingual Llama 6.9B model.
3. Implement instruction-tuning code and open-source related checkpoints.
4. Build an online demo using Gradio.
5. Use [Triton](https://github.com/openai/triton) to add more high-performance operators and further improve performance.
6. Add code for building pre-training datasets based on Common Crawl and open-source related datasets.
7. Add code for multi-modal training.
## Citation
```
@misc{openllama,
title={Open-Llama},
author={Liang Song},
year={2023},
howpublished={\url{https://github.com/Bayes-Song/Open-Llama}},
}
```

464
README_zh.md Normal file
View File

@ -0,0 +1,464 @@
<!--
* @Author: s-JoL(sl12160010@gmail.com)
* @Date: 2023-03-10 21:18:35
* @LastEditors: s-JoL(sl12160010@gmail.com)
* @LastEditTime: 2023-05-17 22:43:46
* @FilePath: /Open-Llama/README_zh.md
* @Description:
*
* Copyright (c) 2023 by s-JoL(sl12160010@gmail.com), All Rights Reserved.
-->
[**中文**](./README_zh.md) | [**English**](./README.md)
![camel](assets/logo.png)
# Open-Llama
<p align="center">
<img alt="GitHub" src="https://img.shields.io/github/license/s-JoL/Open-Llama.svg?color=blue&style=flat-square">
<img alt="GitHub release (latest by date)" src="https://img.shields.io/github/v/release/s-JoL/Open-Llama">
<img alt="GitHub top language" src="https://img.shields.io/github/languages/top/s-JoL/Open-Llama">
<img alt="GitHub last commit" src="https://img.shields.io/github/last-commit/s-JoL/Open-Llama">
</p>
Open-Llama是一个开源项目提供了一整套用于构建大型语言模型的训练流程从数据集准备到分词、预训练、指令调优lora, 以及强化学习技术 RLHF。
**可从[Demo](http://home.ustc.edu.cn/~sl9292/)直接试用本模型。**
加入[discord](https://discord.gg/TrKxrTpnab)一起讨论大语言模型的发展。
## **主要内容**
- **支持Transformers/Hugging Face直接调用。** 经过Instruct-tuning的CheckPoint已开源在[Hugging Face: s-JoL/Open-Llama-V2](https://huggingface.co/s-JoL/Open-Llama-V2)。
- **采用FastChat项目相同方法测评Open-Llama的效果和GPT3.5的效果对比经过测试在中文问题上可以达到GPT3.5 89%的水平。**
- **训练速度达到3620 token/s快于Llama原文中的3370 token/s达到目前sota的水平。**
``` python
from transformers import AutoModelForCausalLM, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("s-JoL/Open-Llama-V2", use_fast=False)
model = AutoModelForCausalLM.from_pretrained("s-JoL/Open-Llama-V2", device_map="auto")
inputs = tokenizer('user:implement quick sort in python\nsystem:', return_tensors='pt', return_attention_mask=False, add_special_tokens=False)
for k, v in inputs.items():
inputs[k] = v.cuda()
pred = model.generate(**inputs, max_new_tokens=512, do_sample=True)
print(tokenizer.decode(pred.cpu()[0], skip_special_tokens=True))
```
只经过预训练的CheckPoint也上传至[s-JoL/Open-Llama-V2-pretrain](https://huggingface.co/s-JoL/Open-Llama-V2-pretrain)。
我们完成了330B token的预训练，总共训练80 K step，Global Batch Size和Llama中一致为4M。
使用总共7部分数据构成Instruction-tuning数据，模型具有一定的编程能力、数学能力和多轮对话能力，具体数据见Instruction-Tuning部分。
如下是一个关于代码的多轮对话能力的展示
![image4](assets/multiturn_chat.jpeg)
## **更新**
**[2023.5.8] Release v2.1**
- 本次更新加入对更大模型训练的支持使用DeepSpeed stage3 + offload + activation checkpoint可以在**A100-80G训练65B模型**。
- 引入peft库**支持lora**等训练。
- 下表对比了Open-Llama和Llama原文的训练速度Llama性能数据引自Llama原文。
| | DeepSpeed Stage | Offload | Activation Checkpoint | Total Token | GPU hours | Speed token/s/gpu | Batch Size |
|----------------|-----------------|---------|-----------------------|-------------|-----------|-------------------|------------|
| Open-Llama 7B | 1 | False | False | 173.7B | 13412 | 3620 | 2 |
| Open-Llama 13B | 3 | False | True | - | - | 1856 | 24 |
| Open-Llama 33B | 3 | False | True | - | - | 708 | 12 |
| Open-Llama 65B | 3 | True | True | - | - | 369 | 12 |
| Llama 7B | - | - | - | 1T | 82432 | 3370 | - |
| Llama 13B | - | - | - | 1T | 135168 | 2055 | - |
| Llama 33B | - | - | - | 1.4T | 530432 | 733 | - |
| Llama 65B | - | - | - | 1.4T | 1022362 | 380 | - |
**[2023.4.28] Release v2.0**
本次更新主要包含以下几个方面，相对于v1版本提升有效训练速度**50%**，其中pad从**30%**减少至**5%**，训练速度从**3200token/s**提升至**3620token/s**。0.95 * 3620/(0.7 * 3200)=1.521
1. 使用Hugging Face的datasets库进行数据读取具体流程如下
1. 使用transform函数将不同数据集的数据统一格式为{'text': 'xxx'}
2. 使用Tokenizer进行分词
3. 对长序列进行采样，目前提供三种模式，分别是：截断/采样(参考[Gopher论文](https://arxiv.org/abs/2112.11446))/切分
4. 可选对来自不同doc的文本进行拼接。减少了数据中的pad加速训练在v1版本中pad占比为**30%**使用拼接后pad占比降低为**5%**。
2. 加入Trainer对于预训练和指令微调都可以复用见solver/trainer.py
3. 统一预训练和指令微调训练入口为train_lm.py
4. 提供更方便的配置可见configs/pretrain_config.yaml
5. 提供基于其他预训练模型补充词表,继续预训练功能
6. 支持从中断点继续训练,包括加载优化器参数/学习率和跳过重复数据
[2023.4.16] Release v1.0
提供基础的预训练和指令微调代码训练速度达到Llama原文速度。预训练和指令微调后的模型已经开源在Hugging Face。
v1版代码可见https://github.com/s-JoL/Open-Llama/tree/v1.0
## **特性**
### 易用性
我们认为易用性是构建大型语言模型时最重要的特性之一。为了使 Open-LLAMA 更加易于使用,我们特别注重了以下几点:
- **最简实现**:我们采用了最简单的实现方式,降低了入门的门槛,让初学者也能轻松上手。
- **流程完整**:我们发布了从数据集构建到训练的完整代码,使得构建一个大语言模型的每一步流程都清晰可见。
### 高性能
由于训练大语言模型的成本高昂,因此在构建大型语言模型时,高性能也是非常重要的。为了实现高性能的训练,我们发布使用了以下技术:
- **Fused CUDA kernel**:使用[xformers](https://github.com/facebookresearch/xformers)中提供的 fused CUDA kernel 可以将多个操作融合在一起,减少了 GPU 和 CPU 之间的数据传输,从而提高了训练效率。
- **并行化训练**:我们使用[Accelerate](https://huggingface.co/docs/accelerate/index)库支持在多个 GPU 上进行并行化训练,以加快训练速度。
对于7B模型使用Transformers中Pytorch原生版本的Llama模型训练训练速度为**1378 token/s/gpu**,使用本代码库训练速度达到**3626 token/s/gpu**,超过[Llama原文](https://arxiv.org/pdf/2302.13971.pdf)中的**3370 token/s/gpu**。
如果使用500B token进行预训练需要训练38300 GPU时。按照Google Cloud上A100-80G Spot的价格计算8卡每小时价格为12.6美元则总价格为60300美元。
当使用未加速版本训练时价格为158744美元。最终降低训练成本9.8万美元。
更多测试可见[和其他开源模型性能对比](https://github.com/s-JoL/Open-Llama#%E5%92%8C%E5%85%B6%E4%BB%96%E5%BC%80%E6%BA%90%E6%A8%A1%E5%9E%8B%E6%80%A7%E8%83%BD%E5%AF%B9%E6%AF%94)。
### 通用性
在训练语言模型时,我们希望能够构建一个通用的模型,可以适用于不同的语言和不同的领域。为了实现这一点,我们采用了以下策略:
- **多语言支持**:我们支持多种语言的语料库,包括英语、中文、日语等多种语言,让用户可以根据自己的需求进行选择。
- **领域通用性**:我们希望模型不仅能在日常问题上能产生帮助,同时希望在专业领域如科学、法律等也能帮助人类。
- **和世界交互**希望通过加入RL使得模型具备和世界交互的能力
## **要求**
- Python 3.7 或更高版本
- PyTorch 1.13
- [Transformers库](https://github.com/huggingface/transformers)
- [Accelerate库](https://huggingface.co/docs/accelerate/index)
- CUDA 11.6 或更高版本(用于 GPU 加速)
- 硬件配置：目前使用(64 CPU, 1000G Memory, 8xA100-80G) x N，有个比较神奇的现象，当使用更多cpu时反而会慢一点，猜测这和dataloader的多进程有一定关系。
## **入门指南**
### 安装
使用下面的命令安装相关依赖
```bash
pip install -r requirements.txt
```
### 数据集准备
目前给出了智源开源的悟道数据集和EleutherAI开源的the pile数据集。数据集下载和处理代码在data目录下。
其中悟道数据集由于需要同意一些协议才能下载因此可能需要修改一下download_wudao中的链接[悟道](https://data.baai.ac.cn/details/WuDaoCorporaText)。
感谢[@skepsun](https://github.com/skepsun)的建议使用scidb下载wudao数据集不需要登陆并且下载更稳定一些。https://github.com/s-JoL/Open-Llama/issues/42
**注意：数据下载可能出现失败，建议将script中的下载和处理分成两部分来运行，可以将下载多运行几次，会自动断点续传。**
运行下面的命令进行数据下载并进行分片
```bash
bash data/download_the_pile.sh
bash data/download_wudao.sh
```
数据将按照每个文件最大16384行存储为小文件便于后续使用多进程训练时进行读取。存储格式为jsonl.zst使用zstd进行压缩最终数据大小为519.5G合计16466个文件。
其中the pile数据集包含210607728行json line悟道数据集包含59132213行json line。
具体数据格式如下
```
WuDao
{'id': 1, 'dataType': '百科', 'title': 'some title', 'content': 'some content'}
The Pile
{'text': 'some text', 'meta': {'pile_set_name': 'Github'}}
```
验证数据完整性可见 [issue](https://github.com/s-JoL/Open-Llama/issues/5)
### 相关工具
在utils目录中提供了训练分词/补充现有分词模型和转换ckpt的代码。
使用SentencePiece训练分词器参考如下命令
```bash
python3 utils/train_tokenizer.py
```
在configs中提供了只使用wudao数据集训练的4w词表的分词模型 4w_cn_vocab_wudao15.model
根据已有分词模型补充词表参考
```bash
python3 utils/merge_tokenizer.py
```
根据META官方的分词模型和上面的4w中文合并为中英文双语的分词模型 llama_tokenizer_extended.model
转换现有的Llama模型ckpt参考
```bash
python3 utils/convert_ckpt.py
```
### 数据读取
数据读取相关代码可见dataset/dataset.py包含了预训练和指令微调数据的处理如需加入其他数据集只需要修改其中的transform函数。
数据读取流程如下:
1. 使用transform函数将不同数据集的数据统一格式为{'text': 'xxx'}
2. 使用Tokenizer进行分词
3. 对长序列进行采样，目前提供三种模式，分别是：截断/采样（参考Gopher论文）/切分
4. 可选对来自不同doc的文本进行拼接。减少了数据中的pad加速训练在v1版本中pad占比为30%使用拼接后pad占比降低为5%。
使用如下命令查看DataLoader输出的结果并检查分词正确性
```bash
python3 dataset/dataset.py
```
### 模型结构
我们基于Transformers库中的[Llama](https://github.com/facebookresearch/llama)参考论文原文中的2.4 Efficient implementation一节进行了修改
同时还参考了一些其他论文引入了一些优化。具体来说我们引入了由META开源的[xformers库](https://github.com/facebookresearch/xformers)中的memory_efficient_attention操作来进行
Self Attention的计算这对于性能有明显的提升提升大约30%。
具体可以参见[modeling_llama.py](https://github.com/huggingface/transformers/blob/main/src/transformers/models/open_llama/modeling_open_llama.py#L229)
同时我们还参考了[Bloom](https://huggingface.co/bigscience/bloom)对于Token Embedding引入了Stable Embedding以更好的稳定训练。
最后我们参考[PALM](https://arxiv.org/abs/2204.02311)使用了Shared Input-Output Embeddings。
### 预训练
我们基于Accelerate库进行多GPU并行训练启动命令如下
```bash
accelerate launch --config_file configs/accelerate_configs/ds_stage1.yaml train_lm.py --train_config configs/pretrain_config.yaml --model_config configs/model_configs/7B.json
```
某些情况下可能需要指定下列参数
```
--main_process_ip
--main_process_port
--num_processes
--num_machines
--machine_rank
```
我们使用[Wandb](https://wandb.ai/)进行训练的可视化,需要自行修改环境变量 WANDB_API_KEY 。
其中我们使用了DeepSpeed stage1以减少显存占用。accelerate相关配置可见configs/accelerate_configs。
训练相关超参数可见configs/pretrain_config.yaml
其中默认参数为使用LlamaTokenizer补充4w中文的词表的分词模型模型大小为7B具体配置如下
| max_length | batch_size | learning_rate | weight_decay | params | dimension | n heads | n layer | vocab_size |
|------------|------------------|---------------|--------------|--------|-----------|---------|---------|------------|
| 2048 | 2 | 2e-4 | 1e-1 | 7.03B | 4096 | 32 | 32 | 68762 |
```
==============================================================================================================
Layer (type:depth-idx) Output Shape Param #
==============================================================================================================
OpenLlamaForCausalLM [1, 32, 64, 128] --
├─OpenLlamaModel: 1-1 [1, 32, 64, 128] --
│ └─Embedding: 2-1 [1, 64, 4096] 281,649,152
│ └─ModuleList: 2-2 -- --
│ │ └─OpenLlamaDecoderLayer: 3x32 [1, 64, 4096] 202,383,360
│ └─OpenLlamaRMSNorm: 2-3 [1, 64, 4096] 4,096
├─Linear: 1-2 [1, 64, 68762] 281,649,152
==============================================================================================================
Total params: 7,039,569,920
Trainable params: 7,039,569,920
Non-trainable params: 0
Total mult-adds (G): 7.04
```
从头预训练Loss如下
![](assets/pretrain_loss.png)
### Instruction-Tuning
我们使用目前开源的七个数据集进行Instruction-tuning后续会加入更多的任务以及自己构建的数据集。
- [yizhongw/self_instruct](https://huggingface.co/datasets/yizhongw/self_instruct)
- [BelleGroup/train_0.5M_CN](https://huggingface.co/datasets/BelleGroup/train_0.5M_CN)
- [BelleGroup/train_1M_CN](https://huggingface.co/datasets/BelleGroup/train_1M_CN)
- [BelleGroup/multiturn_chat_0.8M](https://huggingface.co/datasets/BelleGroup/multiturn_chat_0.8M)
- [BelleGroup/school_math_0.25M](https://huggingface.co/datasets/BelleGroup/school_math_0.25M)
- [anon8231489123/ShareGPT_Vicuna_unfiltered](https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered)
- [Graverman/Instruct-to-Code](https://huggingface.co/datasets/Graverman/Instruct-to-Code)
其中ShareGPT_Vicuna_unfiltered数据在datasets的处理有些问题，我们直接下载原数据重新进行了处理。
我们对原始数据进行了一些预处理,格式如下
```
user: {prompt}\nsystem: {completion}</s>
```
启动命令和预训练基本一致
```bash
accelerate launch --config_file configs/accelerate_configs/ds_stage1.yaml train_lm.py --train_config configs/instruct_config.yaml --model_config configs/model_configs/7B.json
```
某些情况下可能需要指定下列参数
```
--main_process_ip
--main_process_port
--num_processes
--num_machines
--machine_rank
```
过程中Loss如下总计使用3个epoch
![loss](assets/instruct_loss.png)
### RLHF
暂无
### Server
多轮对话使用chat_server.py
基于Gradio开发。
## 性能对比
### 训练框架
在训练框架方面我们测试了Hugging Face开源的Accelerate库pytorch-lightning和HPC-AI开源的ColossalAI我们测试在打满显卡时性能差异较小。因此最终选择了实现相对简单的Accelerate库作为训练框架
测试代码可见utils/speed_test.py
测试过程中使用的模型结构为
| Model | n gpu | n layer | n heads | hidden size | vocab size | seq length |
|-------|-------|---------|---------|-------------|------------|------------|
| GPT2 | 2 | 6 | heads | 4096 | 250100 | 1024 |
测试结果如下,可以看到当打满时速度和显存相差不大
| | Hugging Face | Hugging Face | ColossalAI | ColossalAI | ColossalAI |
|-----------------|-----------------------------------|------------------------------------|--------------------------------------------------------|--------------------------------------------------------|------------------------------------|
| config | without activation ckpt, bs2 | without activation ckpt, max_bs=12 | with activation ckpt, bs2 | without activation ckpt, bs2 | without activation ckpt, max_bs=10 |
| second per step | 0.336, fw=0.033, bw=0.3, opt=5e-6 | 1.25 | 0.347 | 0.308, fw=0.067, bw=0.152, opt=0.088 | 1.055 |
| gpu memory | nvidia-smi 45445 | | fw+bw+opt=21053.63+22064.12+17987.52, nvidia-smi 40961 | fw+bw+opt=24684.74+21087.13+17987.52, nvidia-smi 46821 | oom after 10 steps, 疑似有内存泄漏 |
### 性能优化
在最早版本中我们使用DeepSpeed stage2 + Transformers中的原生Llama实现进行训练但是速度和论文中所说的相差较大因此后续我们进行了一系列的优化我们将每一步的性能提升列在下面可供参考。
论文中提到对于6.7B模型使用了1T token进行训练最终的gpu时为82432因此可以计算出他的训练速度大致为3370 token/s/gpu。
当使用下面的优化后，速度可以基本和论文中速度一致，使用20x8 A100-80G进行测试。预计加入更多融合算子可以取得更好的性能。
| | V1 | V2 |
|---------------------|--------------|------------------------------------|
| Dataset | self implemented | datasets |
| Model | Transformers | Transformers+xformers |
| Optimizer | Pytorch Adam | Fused Adam |
| DeepSpeed | stage2 | stage1 |
| Grad Accumulation | 4 | 12 |
| Return Padding Mask | yes | no |
| Speed token/s/gpu | 1378 | 3637 |
### 和其他开源模型性能对比
下表是一个对目前开源模型性能的一个总结，使用GPU device均为A100，由于模型大小各不相同，结构也有一定差异，难以准确的对比性能，作为一个粗略估计，可以认为速度和模型参数量基本呈反比关系，这一点看Llama不同大小的模型可以得到印证。基于这个粗略估计，可以看到使用本项目的性能明显优于其他项目。
| Model | Open-Llama | LLAMA | LLAMA | LLAMA | OPT | Bloom | GLM | GPT-NEOX | CPM-ANT | CodeGeeX |
|---------------------|------------|----------|---------|-----------|---------|--------------------|-------|----------|---------|-----------|
| Model size | 7.0B | 6.7B | 13B | 65B | 175B | 175B | 130B | 20B | 10B | 13B |
| Token | | 1T | 1T | 1.4T | 180B | 366B | 400B | 402B | 200B | 13.9B |
| GPU Hour | | 82,432 | 135,168 | 1,022,362 | 809,472 | 1,082,990 | 43776 | 175680 | 47040 | 3072 |
| speed token/s/gpu | 3637 | 3370 | 2055 | 380 | 61.8 | 93.9 | 105.7 | 635.6 | 1181 | 1257 |
| 相关依赖 | xformers | xformers | | | metaseq | Megatron-DeepSpeed | | | BMtrain | MindSpore |
| speed token*params B/s/gpu | 25728 | 22579 | 26715 | 24700 | 10815 | 16432 | 13741 | 12712 | 11810 | 16341 |
## 后续计划
1. 加入RLHF代码
2. 使用[Triton](https://github.com/openai/triton)加入更多高性能算子,进一步提升性能
3. 加入根据Common Crawl构建预训练数据集相关代码并开源相关数据集
4. 加入多模态训练代码
## 引用
```
@misc{openllama,
title={Open-Llama},
author={s-JoL},
year={2023},
howpublished={\url{https://github.com/s-JoL/Open-Llama}},
}
```
<p align="center">
<a href="https://star-history.com/#s-JoL/Open-Llama&Date">
<img src="https://api.star-history.com/svg?repos=s-JoL/Open-Llama&type=Date" alt="Star History Chart">
</a>
</p>
<!-- 一些之前没注意到的部分
1. [GPT3](https://arxiv.org/pdf/2005.14165.pdf), Details of Model Training
During training we always train on sequences of the full nctx = 2048 token context window, packing multiple documents into a single sequence when documents are shorter than 2048, in order to increase computational efficiency. Sequences with multiple documents are not masked in any special way but instead documents within a sequence are delimited with a special end of text token, giving the language model the information necessary to infer that context separated by the end of text token is unrelated. This allows for efficient training without need for any special sequence-specific masking.
在[PALM](https://arxiv.org/pdf/2204.02311.pdf)中也有类似的说法
Sequence length A sequence length of 2048 was used for all models. Input examples are concatenated together and then split into sequences of exactly 2048 tokens, so that there are no padding tokens, but examples may be split in the middle. Input examples are differentiated from one another with a special [eod] token.
2. GPT3, Common Crawl Filtering
使用高质量文本作为正例，其他所有样本作为负例。根据判为正例的概率作为筛选条件：np.random.pareto(α) > 1 − document_score。
思想是尽量使用和高质量样本相似的数据。
The classifier is trained using logistic regression classifier with features from Spark's standard tokenizer and HashingTF.
3. GPT3, fuzzy deduplication
使用MinHashLSH进行去重同时把CC中的WebText部分数据去掉。这些特征和分类器使用的一致。
we fuzzily deduplicated documents (i.e. removed documents with high overlap with other documents) within each dataset using Spark's MinHashLSH implementation with 10 hashes
4. GPT3, Test Set Contamination
5. [The pile](https://arxiv.org/pdf/2101.00027.pdf), BPB(bits per UTF-8 encoded byte)/bits per character/perplexity
$
BPB = (L_T / L_B) \log_2(e^{\ell}) = (L_T / L_B)\, \ell / \ln(2) \\
perplexity = P(w_1, w_2, w_3, w_4, ...)^{-\frac{1}{N}} \\
bpc = -\frac{1}{T}\sum_i \log_2 P(w_i|w_1, w_2, ..., w_{i-1}) \\
2^{bpc} = (\prod_i P(w_i|w_1, w_2, ..., w_{i-1}))^{-\frac{1}{T}} = perplexity
$
bpc是字符粒度和分词算法相关。而bpb为byte粒度与分词算法无关。
可以使用bpb的差异衡量不同数据的难度。
6. The pile, diversity of the collected data
数据多样性
We hypothesize that this is due to the perplexity based filtering used in CC-100, where a language model is trained on Wikipedia and all data with a perplexity too high or too low is discarded. This effectively discards any data too similar to or too different from Wikipedia, which severely limits the diversity of the collected data.
7. The pile, bytes per token
Since the GPT-2 BPE tokenizer is trained on WebText, the mean bytes per token is also a very rough indicator of how syntactically different each Pile component is from WebText.
8. The pile, Deduplication
We used 10 hash functions for each Minhash and an approximate Jaccard similarity of 0.5.
9. GLM, Embedding Layer Gradient Shrink
和stable embedding类似
$
word-embedding = word-embedding*\alpha+word-embedding.detach() * (1-\alpha)
$
10. PALM, Training Instability
训练中的loss尖峰是由特定的数据和特定的参数共同造成使用模型回滚+跳过部分数据解决。
Instead, we found that a simple strategy to effectively mitigate the issue: We re-started training from a checkpoint roughly 100 steps before the spike started, and skipped roughly 200–500 data batches, which cover the batches that were seen before and during the spike. With this mitigation, the loss did not spike again at the same point. We do not believe that the spikes were caused by “bad data” per se, because we ran several ablation experiments where we took the batches of data that were surrounding the spike, and then trained on those same data batches starting from a different, earlier checkpoint. In these cases, we did not see a spike. This implies that spikes only occur due to the combination of specific data batches with a particular model parameter state
11. [Chinchilla](https://arxiv.org/pdf/2203.15556.pdf), Optimal model scaling
20 tokens per parameter, for example 10B model should use 200B tokens to pretrain
12. [Gopher](https://arxiv.org/pdf/2112.11446.pdf), Quality Filtering
Quality Filtering (MassiveWeb only) The vast majority of text found on the web is of insufficient
quality to be useful for language model training. For example, many web pages contain primarily
automatically generated content, or text that is not intended for human consumption (such as keywords
for search-engine optimisation). Much of the web also comprises social media content, which can
variously lack context, coherence, or substance. To remove low-quality data while minimising potential
for bias, we apply a number of simple, easily understood heuristic filters: we remove any document
that does not contain between 50 and 100,000 words, or whose mean word length is outside the
range of 3 to 10 characters; we remove any document with a symbol-to-word ratio greater than 0.1
for either the hash symbol or the ellipsis; and we remove any document with more than 90% of lines
starting with a bullet point, or more than 30% ending with an ellipsis. We also require that 80%
of words in a document contain at least one alphabetic character, and apply a "stop word" filter, to
remove documents that do not contain at least two of the following English words: the, be, to, of, and,
that, have, with; this adequately deals with ostensibly English documents that contain no coherent
English text.
13. Gopher, Constructing Token Sequences
和GPT3中的避免mask的方法类似
-->

Binary file not shown.

Before

Width:  |  Height:  |  Size: 960 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 202 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 66 KiB

After

Width:  |  Height:  |  Size: 105 KiB

BIN
assets/logo.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 264 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 859 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 289 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 95 KiB

After

Width:  |  Height:  |  Size: 84 KiB

View File

@ -1,64 +1,30 @@
"""
Author: LiangSong(sl12160010@gmail.com)
Author: s-JoL(sl12160010@gmail.com)
Date: 2023-04-06 22:30:10
LastEditors: LiangSong(sl12160010@gmail.com)
LastEditTime: 2023-04-07 23:03:31
LastEditors: s-JoL(sl12160010@gmail.com)
LastEditTime: 2023-05-12 15:07:36
FilePath: /Open-Llama/chat_server.py
Description:
Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved.
Copyright (c) 2023 by s-JoL(sl12160010@gmail.com), All Rights Reserved.
"""
import torch
import logging
import gradio as gr
import sentencepiece as spm
from dataset.tokenizer import Tokenizer
from transformers import LlamaForCausalLM, LlamaConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
sp_model = spm.SentencePieceProcessor(
model_file="configs/10w_vocab_wudao5_pile10.model"
tokenizer = AutoTokenizer.from_pretrained("s-JoL/Open-Llama-V2", use_fast=False)
model = AutoModelForCausalLM.from_pretrained(
"s-JoL/Open-Llama-V2", torch_dtype=torch.bfloat16, device_map="auto"
)
tokenizer = Tokenizer(sp_model)
raw_model = LlamaForCausalLM(
LlamaConfig(
vocab_size=tokenizer.vocab_size,
initializer_range=0.01,
pad_token_id=tokenizer.pad_id,
rms_norm_eps=1e-5,
hidden_dropout_prob=0.1,
attention_dropout_prob=0.1,
use_stable_embedding=True,
shared_input_output_embedding=True,
)
)
ckpt = torch.load(
"data/saved_ckpt/instruction_tuning_math_code_multiturn/36001.pt",
map_location="cpu",
)
raw_model.load_state_dict(ckpt)
raw_model.eval()
model = raw_model.cuda()
print("ready")
def parse_codeblock(text):
    """Convert model output containing Markdown code fences into an HTML fragment.

    The text is processed line by line:

    * a line that contains a fence marker and is not exactly the bare
      marker is treated as an opening fence; everything after its first
      three characters becomes the ``class`` of a ``<pre><code>`` tag;
    * a line that is exactly the bare fence marker closes the block with
      ``</code></pre>``;
    * every other line has ``<`` and ``>`` escaped, and every such line
      except the very first is prefixed with ``<br/>``.

    Args:
        text: Raw text, with lines separated by ``"\n"``.

    Returns:
        A single string with all converted lines concatenated (no newlines).
    """
    lines = text.split("\n")
    for i, line in enumerate(lines):
        if "```" in line:
            if line != "```":
                # Opening fence such as "```python": the remainder of the
                # line is used verbatim as the language class.
                lines[i] = f'<pre><code class="{lines[i][3:]}">'
            else:
                lines[i] = "</code></pre>"
        else:
            # Escape angle brackets on every ordinary line, including the
            # first one, which previously slipped through unescaped and
            # let raw HTML reach the page.
            escaped = line.replace("<", "&lt;").replace(">", "&gt;")
            if i > 0:
                # Only lines after the first get a break, so the output
                # never starts with a stray <br/>.
                lines[i] = "<br/>" + escaped
            else:
                lines[i] = escaped
    return "".join(lines)
logging.warning("ready")
with gr.Blocks() as demo:
gr.Markdown(
"""
# [Open-Llama](https://github.com/Bayes-Song/Open-Llama)
# [Open-Llama](https://github.com/s-JoL/Open-Llama)
完全使用Open-Llama项目从0开始训练的Instruct-GPT模型当长时间无响应如20s以上可刷新重试
Instruct-GPT model is trained from scratch using the Open-Llama project without relying on any other pre-trained models. If there is no response for a long time (such as more than 20 seconds), please refresh and try again.
@ -69,7 +35,7 @@ with gr.Blocks() as demo:
clear = gr.Button("Clear")
def user(user_message, history):
print(user_message)
logging.warning(user_message)
return "", history + [[user_message, None]]
def bot(history):
@ -80,22 +46,30 @@ with gr.Blocks() as demo:
if completion is None:
inputs = "user:{}\nsystem:".format(prompt)
inputs = tokenizer(
inputs, return_tensors=True, add_special_tokens=False
inputs,
return_tensors="pt",
add_special_tokens=False,
return_attention_mask=False,
)
context.append(inputs["input_ids"])
else:
inputs = "user:{}\nsystem:{}".format(prompt, completion)
inputs = tokenizer(inputs, return_tensors=True, add_special_tokens=True)
inputs = tokenizer(
inputs,
return_tensors="pt",
add_special_tokens=True,
return_attention_mask=False,
)
context.append(inputs["input_ids"])
context = torch.cat(context, dim=-1)
context = context[:, -1024:]
inputs_len = context.shape[1]
context = context.cuda()
pred = model.generate(input_ids=context, max_new_tokens=512, do_sample=True)
pred = model.generate(input_ids=context, max_new_tokens=1024, do_sample=True)
pred = pred[:, inputs_len:]
pred = tokenizer.decode(pred.cpu())[0]
print(pred)
bot_message = parse_codeblock(pred)
pred = tokenizer.decode(pred.cpu()[0], skip_special_tokens=True)
logging.warning(pred)
bot_message = pred
history[-1][1] = bot_message
return history
@ -108,8 +82,9 @@ with gr.Blocks() as demo:
当前体验服务生成的所有内容都是由人工智能模型生成我们对其生成内容的准确性完整性和功能性不做任何保证并且其生成的内容不代表我们的态度或观点
联系方式: sl12160010@gmail.com 对于该项目有任何意见和建议都欢迎联系我.
Contact information: sl12160010@gmail.com. Any opinions or suggestions regarding the project are welcome to be addressed to me through this email.
"""
)
demo.launch(share=True)
demo.launch()

Binary file not shown.

View File

@ -1,30 +1,18 @@
compute_environment: LOCAL_MACHINE
deepspeed_config:
deepspeed_multinode_launcher: standard
gradient_accumulation_steps: 12
gradient_clipping: 1.0
offload_optimizer_device: none
offload_param_device: none
zero3_init_flag: false
zero_stage: 1
distributed_type: DEEPSPEED
downcast_bf16: 'no'
dynamo_backend: 'no'
# dynamo_config:
# dynamo_backend: INDUCTOR
# dynamo_mode: default
# dynamo_use_dynamic: true
# dynamo_use_fullgraph: false
fsdp_config: {}
machine_rank: 0
main_training_function: main
megatron_lm_config: {}
mixed_precision: bf16
num_machines: 1
num_processes: 8
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
use_cpu: false

View File

@ -0,0 +1,18 @@
compute_environment: LOCAL_MACHINE
deepspeed_config:
deepspeed_multinode_launcher: standard
gradient_clipping: 1.0
offload_optimizer_device: none
offload_param_device: none
zero3_init_flag: false
zero_stage: 2
distributed_type: DEEPSPEED
fsdp_config: {}
machine_rank: 0
main_training_function: main
mixed_precision: bf16
num_machines: 1
num_processes: 8
rdzv_backend: static
same_network: true
use_cpu: false

View File

@ -0,0 +1,18 @@
compute_environment: LOCAL_MACHINE
deepspeed_config:
deepspeed_multinode_launcher: standard
gradient_clipping: 1.0
offload_optimizer_device: none
offload_param_device: none
zero3_init_flag: true
zero_stage: 3
distributed_type: DEEPSPEED
fsdp_config: {}
machine_rank: 0
main_training_function: main
mixed_precision: bf16
num_machines: 1
num_processes: 8
rdzv_backend: static
same_network: true
use_cpu: false

View File

@ -0,0 +1,18 @@
compute_environment: LOCAL_MACHINE
deepspeed_config:
deepspeed_multinode_launcher: standard
gradient_clipping: 1.0
offload_optimizer_device: cpu
offload_param_device: cpu
zero3_init_flag: true
zero_stage: 3
distributed_type: DEEPSPEED
fsdp_config: {}
machine_rank: 0
main_training_function: main
mixed_precision: bf16
num_machines: 1
num_processes: 8
rdzv_backend: static
same_network: true
use_cpu: false

View File

@ -0,0 +1,32 @@
data:
mode: "instruct"
data:
mixed: "data/instruction_data/part-*.jsonl.zst"
pad_to_max: False
sequence_sample_mode: "none"
concat_multiple_sequence: True
num_sequences: 50
seq_length: 2048
tokenizer_model_path: "configs/tokenizer_models/llama_tokenizer_extended.model"
split_by_shard: False
train:
train_batch_size: 2
# 1B token for 1 epoch, 5epoch
num_training_steps: 20000
num_warmup_steps: 500
initializer_range: 1.0e-2
lr: 2.0e-4
weight_decay: 1.0e-1
ckpt: "data/saved_model/ckpt.pth"
train_num_workers: 16
gradient_accumulation_steps: 1
prefetch_factor: 100
train_and_eval: False
gradient_checkpointing_enable: False
use_lora: False
# global step
log_interval: 50
eval_interval: 500
save_interval: 1000
work_dir: "data/saved_ckpt/7B_instruction"
project_name: "Llama Instruction"

View File

@ -1,25 +0,0 @@
"""
Author: LiangSong(sl12160010@gmail.com)
Date: 2023-03-30 21:38:07
LastEditors: LiangSong(sl12160010@gmail.com)
LastEditTime: 2023-04-06 03:37:23
FilePath: /Open-Llama/configs/instruction_tuning_config.py
Description:
Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved.
"""
max_length = 1024
train_batch_size = 2
num_training_steps = 40000
num_warmup_steps = 100
initializer_range = 1e-2
lr = 2e-4
weight_decay = 1e-1
tokenizer_model_path = "configs/10w_vocab_wudao5_pile10.model"
patterns = ["data/instruction_data/part-*.jsonl.zst"]
# global step
log_interval = 50
eval_interval = 500
save_interval = 1000
work_dir = "data/saved_ckpt/"
ckpt_path = "data/saved_ckpt/83200.pt"

View File

@ -0,0 +1,26 @@
{
"architectures": [
"OpenLlamaForCausalLM"
],
"attention_dropout_prob": 0.1,
"bos_token_id": 1,
"eos_token_id": 2,
"hidden_act": "silu",
"hidden_dropout_prob": 0.1,
"hidden_size": 5120,
"initializer_range": 1e-2,
"intermediate_size": 13824,
"max_position_embeddings": 2048,
"model_type": "open-llama",
"num_attention_heads": 40,
"num_hidden_layers": 40,
"pad_token_id": 32000,
"rms_norm_eps": 1e-05,
"shared_input_output_embedding": false,
"tie_word_embeddings": false,
"torch_dtype": "float32",
"use_cache": true,
"use_memorry_efficient_attention": true,
"use_stable_embedding": false,
"vocab_size": 68762
}

View File

@ -0,0 +1,26 @@
{
"architectures": [
"OpenLlamaForCausalLM"
],
"attention_dropout_prob": 0.1,
"bos_token_id": 1,
"eos_token_id": 2,
"hidden_act": "silu",
"hidden_dropout_prob": 0.1,
"hidden_size": 6656,
"initializer_range": 1e-2,
"intermediate_size": 17920,
"max_position_embeddings": 2048,
"model_type": "open-llama",
"num_attention_heads": 52,
"num_hidden_layers": 60,
"pad_token_id": 32000,
"rms_norm_eps": 1e-05,
"shared_input_output_embedding": false,
"tie_word_embeddings": false,
"torch_dtype": "float32",
"use_cache": true,
"use_memorry_efficient_attention": true,
"use_stable_embedding": false,
"vocab_size": 68762
}

View File

@ -0,0 +1,26 @@
{
"architectures": [
"OpenLlamaForCausalLM"
],
"attention_dropout_prob": 0.1,
"bos_token_id": 1,
"eos_token_id": 2,
"hidden_act": "silu",
"hidden_dropout_prob": 0.1,
"hidden_size": 8192,
"initializer_range": 1e-2,
"intermediate_size": 22016,
"max_position_embeddings": 2048,
"model_type": "open-llama",
"num_attention_heads": 64,
"num_hidden_layers": 80,
"pad_token_id": 32000,
"rms_norm_eps": 1e-05,
"shared_input_output_embedding": false,
"tie_word_embeddings": false,
"torch_dtype": "float32",
"use_cache": true,
"use_memorry_efficient_attention": true,
"use_stable_embedding": false,
"vocab_size": 68762
}

View File

@ -0,0 +1,26 @@
{
"architectures": [
"OpenLlamaForCausalLM"
],
"attention_dropout_prob": 0.1,
"bos_token_id": 1,
"eos_token_id": 2,
"hidden_act": "silu",
"hidden_dropout_prob": 0.1,
"hidden_size": 4096,
"initializer_range": 1e-2,
"intermediate_size": 11008,
"max_position_embeddings": 2048,
"model_type": "open-llama",
"num_attention_heads": 32,
"num_hidden_layers": 32,
"pad_token_id": 32000,
"rms_norm_eps": 1e-05,
"shared_input_output_embedding": false,
"tie_word_embeddings": false,
"torch_dtype": "float32",
"use_cache": true,
"use_memorry_efficient_attention": true,
"use_stable_embedding": false,
"vocab_size": 68762
}

View File

@ -1,14 +0,0 @@
max_length = 1024
train_batch_size = 2
num_training_steps = 1000000
num_warmup_steps = 2000
initializer_range = 1e-2
lr = 2e-4
weight_decay = 1e-1
tokenizer_model_path = "configs/10w_vocab_wudao5_pile10.model"
patterns = ["data/pretrain_data/part-*.jsonl.zst"]
# global step
log_interval = 5
eval_interval = 200
save_interval = 800
work_dir = "data/saved_ckpt/"

View File

@ -0,0 +1,32 @@
data:
mode: "pretrain"
data:
wudao: "data/pretrain_data/part-wudao*.jsonl.zst"
the_pile: "data/pretrain_data/part-pile-1*.jsonl.zst"
pad_to_max: False
sequence_sample_mode: "none"
concat_multiple_sequence: True
num_sequences: 10
seq_length: 2048
tokenizer_model_path: "configs/tokenizer_models/llama_tokenizer_extended.model"
split_by_shard: False
train:
train_batch_size: 2
num_training_steps: 500000
num_warmup_steps: 2000
initializer_range: 1.0e-2
lr: 2.0e-4
weight_decay: 1.0e-1
ckpt: null
train_num_workers: 16
gradient_accumulation_steps: 12
prefetch_factor: 100
train_and_eval: False
gradient_checkpointing_enable: False
use_lora: False
# global step
log_interval: 5
eval_interval: 500
save_interval: 1000
work_dir: "data/saved_ckpt/7B"
project_name: "Llama Pretrain"

Binary file not shown.

View File

@ -1,15 +1,15 @@
#!/bin/bash
###
# @Author: LiangSong(sl12160010@gmail.com)
# @Author: s-JoL(sl12160010@gmail.com)
# @Date: 2023-04-05 23:18:10
# @LastEditors: LiangSong(sl12160010@gmail.com)
# @LastEditTime: 2023-04-05 23:34:30
# @LastEditors: s-JoL(sl12160010@gmail.com)
# @LastEditTime: 2023-05-04 08:24:17
# @FilePath: /Open-Llama/data/download_instruct.sh
# @Description:
#
# Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved.
# Copyright (c) 2023 by s-JoL(sl12160010@gmail.com), All Rights Reserved.
###
mkdir data/instruction_data
curl -C - --retry 3 'https://huggingface.co/datasets/RyokoAI/ShareGPT52K/resolve/main/sg_90k_part1.json' -o data/sg_90k_part1.json
curl -C - --retry 3 'https://huggingface.co/datasets/RyokoAI/ShareGPT52K/resolve/main/sg_90k_part2.json' -o data/sg_90k_part2.json
wget -c --tries 3 'https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/HTML_cleaned_raw_dataset/sg_90k_part1_html_cleaned.json' -O data/sg_90k_part1_html_cleaned.json
wget -c --tries 3 'https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/HTML_cleaned_raw_dataset/sg_90k_part2_html_cleaned.json' -O data/sg_90k_part2_html_cleaned.json
python3 data/preprocess_instruction.py

View File

@ -1,22 +1,23 @@
#!/bin/bash
###
# @Author: LiangSong(sl12160010@gmail.com)
# @Author: s-JoL(sl12160010@gmail.com)
# @Date: 2023-03-16 21:21:38
# @LastEditors: LiangSong(sl12160010@gmail.com)
# @LastEditors: s-JoL(sl12160010@gmail.com)
# @LastEditTime: 2023-03-26 22:58:02
# @FilePath: /Open-Llama/data/download_the_pile.sh
# @Description:
# download the pile dataset and preprocess
# Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved.
# Copyright (c) 2023 by s-JoL(sl12160010@gmail.com), All Rights Reserved.
###
start=0
end=29
mkdir data/the_pile
for (( i=$start; i<=$end; i++ ))
do
url="https://the-eye.eu/public/AI/pile/train/$(printf "%02d" $i).jsonl.zst"
url="https://huggingface.co/datasets/monology/pile-uncopyrighted/resolve/main/train/$(printf "%02d" $i).jsonl.zst?download=true"
echo "Downloading file: $url"
curl -C - $url -o data/the_pile/"$(printf "%02d" $i).jsonl.zst"
#curl -C - $url -o data/the_pile/"$(printf "%02d" $i).jsonl.zst"
curl -L $url -o data/the_pile/"$(printf "%02d" $i).jsonl.zst"
done
wait

View File

@ -1,19 +1,24 @@
#!/bin/bash
###
# @Author: LiangSong(sl12160010@gmail.com)
# @Author: s-JoL(sl12160010@gmail.com)
# @Date: 2023-03-16 21:21:56
# @LastEditors: LiangSong(sl12160010@gmail.com)
# @LastEditors: s-JoL(sl12160010@gmail.com)
# @LastEditTime: 2023-03-26 22:58:11
# @FilePath: /Open-Llama/data/download_wudao.sh
# @Description:
# download wudao dataset and preprocess
# Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved.
# Copyright (c) 2023 by s-JoL(sl12160010@gmail.com), All Rights Reserved.
###
apt-add-repository multiverse
apt install unrar
for i in {1..100}
do
curl -C - --retry 100 'https://dorc.baai.ac.cn/resources/data/WuDaoCorpora2.0/WuDaoCorpus2.0_base_200G.rar?AccessKeyId=AKLTNasiLRBBTcOgPqzlkPzu1w&Expires=1679127659&Signature=7jh%2FpnJyC2hAeumm9EjaeE5HN9E%3D' -o data/WuDaoCorpus2.0_base_200G.rar
done
unrar x data/WuDaoCorpus2.0_base_200G.rar
wget -v -c 'https://download.scidb.cn/download?fileId=63a30383fed6a8a9e8454302&dataSetType=organization&fileName=WuDaoCorporaText-2.0-open.rar' -O data/WuDaoCorpus2.0_base_200G.rar
# for i in {1..100}
# do
# curl -C - --retry 100 'https://dorc.baai.ac.cn/resources/data/WuDaoCorpora2.0/WuDaoCorpus2.0_base_200G.rar?AccessKeyId=AKLTNasiLRBBTcOgPqzlkPzu1w&Expires=1679127659&Signature=7jh%2FpnJyC2hAeumm9EjaeE5HN9E%3D' -o data/WuDaoCorpus2.0_base_200G.rar
# done
unrar x data/WuDaoCorpus2.0_base_200G.rar data/
mkdir data/pretrain_data
python3 data/preprocess_wudao.py

View File

@ -1,167 +1,90 @@
"""
Author: LiangSong(sl12160010@gmail.com)
Author: s-JoL(sl12160010@gmail.com)
Date: 2023-03-30 20:52:10
LastEditors: LiangSong(sl12160010@gmail.com)
LastEditTime: 2023-04-05 23:51:16
LastEditors: s-JoL(sl12160010@gmail.com)
LastEditTime: 2023-05-04 08:32:04
FilePath: /Open-Llama/data/preprocess_instruction.py
Description:
Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved.
Copyright (c) 2023 by s-JoL(sl12160010@gmail.com), All Rights Reserved.
"""
import json
from tqdm import tqdm
import zstandard as zstd
from datasets import load_dataset
root_dir = "data"
write_path = "data/instruction_data/part-{}-{}.jsonl.zst"
dataset_map = {
"yizhongw/self_instruct": "self_instruct",
"BelleGroup/train_0.5M_CN": "belle_0.5M",
"BelleGroup/train_1M_CN": "belle_1M",
"BelleGroup/train_2M_CN": "belle_2M",
"BelleGroup/school_math_0.25M": "belle_school_math_0.25M",
"BelleGroup/multiturn_chat_0.8M": "belle_multiturn_chat_0.8M",
"Graverman/Instruct-to-Code": "instruct_to_code",
"qwedsacf/grade-school-math-instructions": "grade_school_math",
"camel-ai/math": "camel_ai_math",
"camel-ai/physics": "camel_ai_physics",
"camel-ai/chemistry": "camel_ai_chemistry",
"camel-ai/biology": "camel_ai_biology",
("bigscience/xP3mt", "code"): "xP3mt_code",
("bigscience/xP3mt", "zh"): "xP3mt_zh",
}
dataset = load_dataset("yizhongw/self_instruct")
write_path = root_dir + "/instruction_data/part-self_instruct-{}.jsonl.zst"
total_num = 0
file_num = 0
wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
for line in dataset["train"]:
line = json.dumps(line)
if total_num % 1024 == 0 and total_num > 0:
file_num += 1
wfp.close()
wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
wfp.write(line.encode("utf-8"))
wfp.write(b"\n")
total_num += 1
wfp.close()
print(
"yizhongw/self_instruct preprocess done. Total line: {}, Total file: {}".format(
total_num, file_num
def process_hf_dataset(name, local_name):
if isinstance(name, str):
dataset = load_dataset(name)
else:
dataset = load_dataset(*name)
total_num = 0
file_num = 1
wfp = zstd.open(write_path.format(local_name, file_num), "wb", encoding="utf-8")
for line in tqdm(dataset["train"]):
line = json.dumps(line)
if total_num % 1024 == 0 and total_num > 0:
file_num += 1
wfp.close()
wfp = zstd.open(
write_path.format(local_name, file_num), "wb", encoding="utf-8"
)
wfp.write(line.encode("utf-8"))
wfp.write(b"\n")
total_num += 1
wfp.close()
print(
"{} preprocess done. Total line: {}, Total file: {}".format(
name, total_num, file_num
)
)
)
dataset = load_dataset("BelleGroup/train_0.5M_CN")
write_path = root_dir + "/instruction_data/part-belle_0.5M-{}.jsonl.zst"
total_num = 0
file_num = 0
wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
for line in dataset["train"]:
line = json.dumps(line)
if total_num % 1024 == 0 and total_num > 0:
file_num += 1
wfp.close()
wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
wfp.write(line.encode("utf-8"))
wfp.write(b"\n")
total_num += 1
wfp.close()
print(
"BelleGroup/train_0.5M_CN preprocess done. Total line: {}, Total file: {}".format(
total_num, file_num
)
)
dataset = load_dataset("BelleGroup/train_1M_CN")
write_path = root_dir + "/instruction_data/part-belle_1M-{}.jsonl.zst"
total_num = 0
file_num = 0
wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
for line in dataset["train"]:
line = json.dumps(line)
if total_num % 1024 == 0 and total_num > 0:
file_num += 1
wfp.close()
wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
wfp.write(line.encode("utf-8"))
wfp.write(b"\n")
total_num += 1
wfp.close()
print(
"BelleGroup/train_1M_CN preprocess done. Total line: {}, Total file: {}".format(
total_num, file_num
)
)
for k, v in dataset_map.items():
process_hf_dataset(k, v)
dataset = load_dataset("BelleGroup/school_math_0.25M")
write_path = root_dir + "/instruction_data/part-belle_school_math_0.25M-{}.jsonl.zst"
local_name = "sharegpt_90K"
total_num = 0
file_num = 0
wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
for line in dataset["train"]:
line = json.dumps(line)
if total_num % 1024 == 0 and total_num > 0:
file_num += 1
wfp.close()
wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
wfp.write(line.encode("utf-8"))
wfp.write(b"\n")
total_num += 1
wfp.close()
print(
"BelleGroup/school_math_0.25M preprocess done. Total line: {}, Total file: {}".format(
total_num, file_num
)
)
dataset = load_dataset("BelleGroup/multiturn_chat_0.8M")
write_path = root_dir + "/instruction_data/part-belle_multiturn_chat_0.8M-{}.jsonl.zst"
total_num = 0
file_num = 0
wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
for line in dataset["train"]:
line = json.dumps(line)
if total_num % 1024 == 0 and total_num > 0:
file_num += 1
wfp.close()
wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
wfp.write(line.encode("utf-8"))
wfp.write(b"\n")
total_num += 1
wfp.close()
print(
"BelleGroup/multiturn_chat_0.8M preprocess done. Total line: {}, Total file: {}".format(
total_num, file_num
)
)
dataset = load_dataset("Graverman/Instruct-to-Code")
write_path = root_dir + "/instruction_data/part-instruct_to_code-{}.jsonl.zst"
total_num = 0
file_num = 0
wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
for line in dataset["train"]:
line = json.dumps(line)
if total_num % 1024 == 0 and total_num > 0:
file_num += 1
wfp.close()
wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
wfp.write(line.encode("utf-8"))
wfp.write(b"\n")
total_num += 1
wfp.close()
print(
"Graverman/Instruct-to-Code preprocess done. Total line: {}, Total file: {}".format(
total_num, file_num
)
)
write_path = root_dir + "/instruction_data/part-sharegpt_90K-{}.jsonl.zst"
total_num = 0
file_num = 0
wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
with open("data/sg_90k_part1.json", "r") as fp:
file_num = 1
wfp = zstd.open(write_path.format(local_name, file_num), "wb", encoding="utf-8")
with open("{}/sg_90k_part1_html_cleaned.json".format(root_dir), "r") as fp:
data1 = json.load(fp)
with open("data/sg_90k_part2.json", "r") as fp:
with open("{}/sg_90k_part2_html_cleaned.json".format(root_dir), "r") as fp:
data2 = json.load(fp)
data = data1 + data2
for line in data:
for line in tqdm(data):
line = json.dumps(line)
if total_num % 1024 == 0 and total_num > 0:
file_num += 1
wfp.close()
wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
wfp = zstd.open(write_path.format(local_name, file_num), "wb", encoding="utf-8")
wfp.write(line.encode("utf-8"))
wfp.write(b"\n")
total_num += 1
wfp.close()
print(
"RyokoAI/ShareGPT52K preprocess done. Total line: {}, Total file: {}".format(
"anon8231489123/ShareGPT_Vicuna_unfiltered preprocess done. Total line: {}, Total file: {}".format(
total_num, file_num
)
)

View File

@ -1,13 +1,13 @@
"""
Author: LiangSong(sl12160010@gmail.com)
Author: s-JoL(sl12160010@gmail.com)
Date: 2023-03-16 22:35:38
LastEditors: LiangSong(sl12160010@gmail.com)
LastEditors: s-JoL(sl12160010@gmail.com)
LastEditTime: 2023-03-26 22:59:38
FilePath: /Open-Llama/data/preprocess_the_pile.py
Description:
Parse the dataset from the raw files and split them into different jsonl files based on the preset maximum number of lines,
making it easy for parallel training to perform streaming reads.
Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved.
Copyright (c) 2023 by s-JoL(sl12160010@gmail.com), All Rights Reserved.
"""
import json
from glob import glob
@ -17,7 +17,7 @@ import zstandard as zstd
paths = glob("data/the_pile/*.jsonl.zst")
write_path = "data/pretrain_data/part-pile-{}.jsonl.zst"
total_num = 0
file_num = 0
file_num = 1
wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
for path in tqdm(paths, total=len(paths)):
with zstd.open(path, "r", encoding="utf-8") as fp:

View File

@ -1,13 +1,13 @@
"""
Author: LiangSong(sl12160010@gmail.com)
Author: s-JoL(sl12160010@gmail.com)
Date: 2023-03-16 22:10:44
LastEditors: LiangSong(sl12160010@gmail.com)
LastEditors: s-JoL(sl12160010@gmail.com)
LastEditTime: 2023-03-26 22:59:55
FilePath: /Open-Llama/data/preprocess_wudao.py
Description:
Parse the dataset from the raw files and split them into different jsonl files based on the preset maximum number of lines,
making it easy for parallel training to perform streaming reads.
Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved.
Copyright (c) 2023 by s-JoL(sl12160010@gmail.com), All Rights Reserved.
"""
import json
from glob import glob
@ -17,7 +17,7 @@ import zstandard as zstd
paths = glob("data/WuDaoCorpus2.0_base_200G/part*")
write_path = "data/pretrain_data/part-wudao-{}.jsonl.zst"
total_num = 0
file_num = 0
file_num = 1
wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
for path in tqdm(paths, total=len(paths)):
with open(path, "r") as fp:

View File

@ -1,69 +0,0 @@
"""
Author: LiangSong(sl12160010@gmail.com)
Date: 2023-03-30 20:58:16
LastEditors: LiangSong(sl12160010@gmail.com)
LastEditTime: 2023-04-05 22:11:03
FilePath: /Open-Llama/dataset/collate_fn.py
Description:
Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved.
"""
import torch
def collate_fn_gen(tokenizer, segment_max_length=1024, padding="longest"):
    """
    Build a collate function that pads a batch of token-id lists into a tensor.

    Args:
        tokenizer: object exposing ``pad_id``, the id appended as padding.
        segment_max_length: target length used when ``padding == "max_length"``.
        padding: ``"longest"`` pads to the longest sequence of the batch,
            ``"max_length"`` pads to ``segment_max_length``.

    Returns:
        ``collate_fn(batch)`` which returns ``{"input_ids": tensor}`` with
        dtype ``torch.int64``.

    Raises:
        ValueError: raised by ``collate_fn`` when ``padding`` is neither
            ``"longest"`` nor ``"max_length"``.
    """
    pad_id = tokenizer.pad_id

    def collate_fn(batch):
        if padding == "longest":
            max_length = max(len(seq) for seq in batch)
        elif padding == "max_length":
            max_length = segment_max_length
        else:
            raise ValueError("Invalid argument for padding: {}".format(padding))
        # A negative repeat count yields [], so sequences longer than
        # max_length are passed through unpadded (and untruncated).
        input_ids = [seq + [pad_id] * (max_length - len(seq)) for seq in batch]
        return {"input_ids": torch.tensor(input_ids, dtype=torch.int64)}

    return collate_fn
if __name__ == "__main__":
    import sentencepiece as spm
    from torch.utils.data import DataLoader

    from dataset.pretrain_dataset import preprocess_wudao_gen, preprocess_the_pile_gen
    from dataset.tokenizer import Tokenizer
    from dataset.data_iter import create_shard_kwargs, DataIter

    # Smoke test: build a loader over the pretraining shards and print the
    # shape of every tensor in the first batch.
    sp_model = spm.SentencePieceProcessor(
        model_file="configs/10w_vocab_wudao5_pile10.model"
    )
    tokenizer = Tokenizer(sp_model)
    paths = create_shard_kwargs(["data/pretrain_data/part-*.jsonl.zst"])
    data_set = DataIter(
        paths,
        transform_dict={
            "wudao": preprocess_wudao_gen(tokenizer),
            "pile": preprocess_the_pile_gen(tokenizer),
        },
    )
    train_loader = DataLoader(
        data_set,
        batch_size=8,
        num_workers=4,
        collate_fn=collate_fn_gen(tokenizer),
        drop_last=True,
    )
    for batch in train_loader:
        for key, tensor in batch.items():
            print(key, tensor.shape)
        break

View File

@ -1,116 +0,0 @@
"""
Author: LiangSong(sl12160010@gmail.com)
Date: 2023-03-17 19:32:20
LastEditors: LiangSong(sl12160010@gmail.com)
LastEditTime: 2023-04-06 03:37:55
FilePath: /Open-Llama/dataset/data_iter.py
Description:
Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved.
"""
import json
from glob import glob
import zstandard as zstd
from torch.utils.data import IterableDataset
class DataIter(IterableDataset):
    """
    Currently, the allowed storage formats are jsonl.zst.
    Each line of the data is a dictionary, which can be parsed as JSON for subsequent processing after reading.
    Currently, only single worker is supported.

    Args:
        paths_with_index: iterable of ``(index, path)`` pairs; file names are
            expected to look like ``part-<dataset_name>-<n>.jsonl.zst``.
        transform_dict: optional mapping from dataset name to a callable
            applied to each parsed line. The callable may return ``None``
            (line skipped), a ``str`` (yielded as is) or a list of lists.
        max_length: chunk length used when ``concat_docs`` is true.
        concat_docs: when true, sequences are appended to a cache and yielded
            as chunks of exactly ``max_length`` items.
        process_index: index of this process, used to select shards.
        num_processes: total number of processes sharing the shards.

    Raises:
        ValueError: if ``concat_docs`` is true and ``max_length`` is missing
            or not positive.
    """

    def __init__(
        self,
        paths_with_index,
        transform_dict=None,
        max_length=None,
        concat_docs=False,
        process_index=0,
        num_processes=1,
    ):
        super().__init__()
        if concat_docs and (max_length is None or max_length <= 0):
            raise ValueError("concat_docs requires a positive max_length.")
        self.paths_with_index = paths_with_index
        self.max_length = max_length
        self.transform_dict = transform_dict
        self.concat_docs = concat_docs
        self.process_index = process_index
        self.num_processes = num_processes
        if self.concat_docs:
            self.cache = []

    def _drain_cache(self):
        """Yield every full ``max_length`` chunk currently held in the cache."""
        while len(self.cache) >= self.max_length:
            seq = self.cache[: self.max_length]
            self.cache = self.cache[self.max_length :]
            yield seq

    def __iter__(self):
        past = None
        for i, path in self.paths_with_index:
            # part-dataset_name-01.jsonl.zst
            dataset_name = path.split("-")[-2]
            # shard to multiple device
            if self.num_processes > 1 and i % self.num_processes != self.process_index:
                continue
            # Log the file name when encountering a new file.
            if past != path:
                print("Loading data from {}".format(path))
                past = path
            # Currently, the allowed storage formats are jsonl.zst.
            assert path.endswith("jsonl.zst")
            with zstd.open(path, "r", encoding="utf-8") as fp:
                for line in fp:
                    # Emit all full chunks, not just one per input line;
                    # otherwise the cache grows without bound whenever
                    # documents are longer than max_length on average.
                    if self.concat_docs:
                        yield from self._drain_cache()
                    if isinstance(line, bytes):
                        line = line.decode("utf-8")
                    line = json.loads(line)
                    line["dataset"] = dataset_name
                    # Transformation, including sample, tokenize, etc.
                    if self.transform_dict:
                        line = self.transform_dict[dataset_name](line)
                        # skip bad doc
                        if line is None:
                            continue
                        elif isinstance(line, str):
                            yield line
                        # must be list of list (an empty list emits nothing)
                        elif isinstance(line, list) and (
                            not line or isinstance(line[0], list)
                        ):
                            for seq in line:
                                if self.concat_docs:
                                    # concat seq from multiple docs
                                    self.cache += seq
                                else:
                                    yield seq
                        else:
                            raise Exception(
                                "Unsupported type in Transformation: {}".format(
                                    self.transform_dict[dataset_name]
                                )
                            )
                    else:
                        yield line
        # Flush full chunks still cached after the last file; a remainder
        # shorter than max_length stays in the cache.
        if self.concat_docs:
            yield from self._drain_cache()
def create_shard_kwargs(patterns, repeat=1):
    """
    Assign numbers to different shards of data to ensure that data is not duplicated
    when allocated to different nodes during distributed training.

    Args:
        patterns: iterable of glob patterns; matches are collected in the
            order the patterns are given.
        repeat: number of times the collected path list is repeated.

    Returns:
        A list of ``(index, path)`` tuples numbered from 0.
    """
    all_path = []
    for p in patterns:
        # glob() returns matches in arbitrary order; sort them so every
        # process assigns the same number to the same shard.
        all_path.extend(sorted(glob(p)))
    all_path *= repeat
    return list(enumerate(all_path))
if __name__ == "__main__":
    # Smoke test: stream the wudao shards with document concatenation and
    # print the first 21 samples.
    shard_paths = create_shard_kwargs(["data/pretrain_data/part-wudao*.jsonl.zst"])
    data_iter = DataIter(
        shard_paths,
        transform_dict={"wudao": lambda x: x["title"], "pile": lambda x: [x["text"]]},
        max_length=16,
        concat_docs=True,
    )
    for idx, sample in enumerate(data_iter):
        print(idx, sample)
        if idx == 20:
            break

321
dataset/dataset.py Normal file
View File

@ -0,0 +1,321 @@
"""
Author: s-JoL(sl12160010@gmail.com)
Date: 2023-04-24 20:05:21
LastEditors: s-JoL(sl12160010@gmail.com)
LastEditTime: 2023-05-06 23:30:37
FilePath: /Open-Llama/dataset/dataset.py
Description:
Copyright (c) 2023 by s-JoL(sl12160010@gmail.com), All Rights Reserved.
"""
import math
import torch
import random
from glob import glob
from datasets import load_dataset
random.seed(42)
def pretrain_transform(batch):
    """
    Normalize a batch of pretraining records so that it has a ``text`` column.

    Args:
        batch: mapping from column name to a list of values. Either it has
            ``title`` and ``content`` columns (wudao format), which are joined
            row by row with a newline, or it already has a ``text`` column.

    Returns:
        The same ``batch`` object, with ``text`` set for wudao records.

    Raises:
        Exception: if neither format is recognized.
    """
    # wudao preprocess
    if "title" in batch and "content" in batch:
        # Row-wise join, so any batch size works (not only batch_size=1).
        batch["text"] = [
            title + "\n" + content
            for title, content in zip(batch["title"], batch["content"])
        ]
    elif "text" in batch:
        pass
    else:
        raise Exception("Unrecognized pretrain dataset format.")
    return batch
def instruct_transform(batch):
    """
    Convert one instruction record into the unified chat text format.

    The record format is detected from the column names present in ``batch``.
    Only the first row of every column is read, so this is meant to be used
    with ``batched=True, batch_size=1``. Each prompt/completion pair is
    rendered as ``user:<prompt>``, a newline, then ``system:<completion>``;
    the turns of a multi-turn conversation are joined with
    ``[multiturn_sep]``.

    Args:
        batch: mapping from column name to a list of values.

    Returns:
        ``{"text": [text]}`` with exactly one string.

    Raises:
        Exception: if the column names match no known format.
    """
    template = "user:{}\nsystem:{}"
    # self instruct preprocess
    if "prompt" in batch and "completion" in batch:
        prompt = batch["prompt"][0]
        completion = batch["completion"][0]
        if prompt.endswith("Output:"):
            prompt = prompt[:-7]
        texts = [template.format(prompt.strip(), completion.strip())]
    # belle preprocess
    elif "instruction" in batch and "output" in batch:
        prompt = batch["instruction"][0].replace("\\n", "")
        completion = batch["output"][0].replace("\\n", "")
        # multi turn chat
        if "Human:" in prompt:
            # Concatenate the untrimmed strings so that no words are glued
            # together; every turn is trimmed individually below.
            turns = []
            for chat in (prompt + completion).split("Human:"):
                if chat.strip() == "":
                    continue
                res = chat.split("Assistant:")
                if len(res) != 2:
                    continue
                turns.append(template.format(res[0].strip(), res[1].strip()))
            texts = ["[multiturn_sep]".join(turns)]
        else:
            # The original ``strip("")`` removed nothing; trim whitespace
            # like every other branch does.
            texts = [template.format(prompt.strip(), completion.strip())]
    # instruct code preprocess
    elif "instruction" in batch and "answer" in batch:
        prompt = batch["instruction"][0].replace("\\n", "").strip()
        completion = batch["answer"][0].replace("\\n", "").strip()
        texts = [template.format(prompt, completion)]
    # share gpt preprocess
    elif "conversations" in batch:
        chats = batch["conversations"][0]
        # An empty conversation has no first turn to inspect.
        if chats and chats[0]["from"] != "human":
            chats = chats[1:]
        turns = []
        # Walk the turns pairwise: (0, 1), (2, 3), ...
        for prompt, completion in zip(chats[0::2], chats[1::2]):
            if not (prompt["from"] == "human" and completion["from"] == "gpt"):
                continue
            turns.append(
                template.format(prompt["value"].strip(), completion["value"].strip())
            )
        texts = ["[multiturn_sep]".join(turns)]
    # xP3 preprocess
    elif "inputs" in batch and "targets" in batch:
        inputs = batch["inputs"][0]
        targets = batch["targets"][0]
        texts = [template.format(inputs.strip(), targets.strip())]
    # camel-ai preprocess
    elif "message_1" in batch and "message_2" in batch:
        inputs = batch["message_1"][0]
        targets = batch["message_2"][0]
        texts = [template.format(inputs.strip(), targets.strip())]
    # grade-school-math-instructions preprocess
    elif "INSTRUCTION" in batch and "RESPONSE" in batch:
        inputs = batch["INSTRUCTION"][0]
        targets = batch["RESPONSE"][0]
        texts = [template.format(inputs.strip(), targets.strip())]
    else:
        raise Exception("Unrecognized instruct dataset format.")
    return {"text": texts}
def split_multiturn(batch):
    """Split the first text of the batch into one entry per conversation turn."""
    first_text = batch["text"][0]
    turns = first_text.split("[multiturn_sep]")
    return {"text": turns}
def sample_sequence_gen(seq_length, eos_token_id):
    """
    Build a function that samples a window of at most ``seq_length`` tokens.

    Documents that fit are taken from the start. Longer documents start at
    position 0 with probability 1/4 and at a random offset otherwise. The
    last token of the returned window is always ``eos_token_id``.

    Args:
        seq_length: maximum number of tokens in the sampled window.
        eos_token_id: id written into the last position of the window.

    Returns:
        ``sample_sequence(line)`` which returns ``{"input_ids": window}``.
        ``line["input_ids"]`` is assumed to be a 1-D torch tensor (it must
        support ``.shape`` and ``.clone()``).
    """

    def sample_sequence(line):
        doc_length = line["input_ids"].shape[0]
        # Nothing to sample from; also avoids indexing [-1] on an empty tensor.
        if doc_length == 0:
            return {"input_ids": line["input_ids"]}
        if doc_length <= seq_length or random.random() < 1 / 4:
            start = 0
        else:
            start = random.randint(0, doc_length - seq_length)
        # Slicing a tensor returns a view; clone it so that writing the EOS
        # token does not modify the caller's tensor.
        input_ids = line["input_ids"][start : start + seq_length].clone()
        input_ids[-1] = eos_token_id
        return {"input_ids": input_ids}

    return sample_sequence
def split_sequence_gen(seq_length):
    """
    Build a function that cuts the first sequence of a batch into consecutive
    pieces of exactly ``seq_length`` items; a shorter trailing remainder is
    dropped.
    """

    def split_sequence(batch):
        tokens = batch["input_ids"][0]
        full_pieces = len(tokens) // seq_length
        pieces = [
            tokens[k * seq_length : (k + 1) * seq_length] for k in range(full_pieces)
        ]
        return {"input_ids": pieces}

    return split_sequence
def concat_multiple_sequence_gen(seq_length, pad_token_id):
    """
    Build a function that joins all sequences of a batch end to end, pads the
    result with ``pad_token_id`` up to a multiple of ``seq_length`` and cuts
    it into equally sized chunks.
    """

    def concat_multiple_sequence(batch):
        merged = torch.cat(batch["input_ids"], dim=0)
        total = merged.shape[0]
        n_chunks = math.ceil(total / seq_length)
        # Fill up to the next multiple of seq_length.
        filler = pad_token_id * torch.ones(
            n_chunks * seq_length - total, dtype=merged.dtype
        )
        padded = torch.cat([merged, filler], dim=0)
        return {"input_ids": torch.chunk(padded, n_chunks)}

    return concat_multiple_sequence
def get_labels_gen(pad_token_id):
    """
    Build a function that derives ``labels`` from ``input_ids``: a copy of the
    ids in which every ``pad_token_id`` position is replaced by -100.
    """

    def get_labels(line):
        targets = line["input_ids"].clone()
        pad_positions = targets == pad_token_id
        targets[pad_positions] = -100
        return {"labels": targets}

    return get_labels
def construct_dataset(
    dataset_config, tokenizer, return_raw_text=False, world_size=None
):
    """
    Build the streaming dataset described by ``dataset_config``.

    Config keys read here: ``data`` (name -> glob pattern), ``mode``
    ("pretrain" or "instruct"), ``seq_length``, ``pad_to_max`` (default
    True), ``sequence_sample_mode`` ("truncation" by default, or "none",
    "sample", "split"), ``concat_multiple_sequence`` (default False) and,
    when concatenating, ``num_sequences``.

    Args:
        dataset_config: configuration mapping, see above.
        tokenizer: callable tokenizer exposing ``eos_token_id`` and
            ``pad_token_id``.
        return_raw_text: when true, return the dataset right after the text
            transforms, before tokenization.
        world_size: when given, the shard list is truncated to a multiple of
            this value.

    Returns:
        The raw-text dataset, or the tokenized and shuffled dataset with
        ``input_ids`` and ``labels``.
    """
    all_data_files = []
    for pattern in dataset_config["data"].values():
        matched = glob(pattern)
        assert len(matched) > 0
        all_data_files.extend(matched)
    random.shuffle(all_data_files)
    # When the number of shards is divisible by world_size,
    # split_dataset_by_node assigns whole shards to each node; otherwise
    # every node reads all the data and skips part of it, which may be slower.
    # https://huggingface.co/docs/datasets/package_reference/main_classes#datasets.distributed.split_dataset_by_node
    if world_size is not None:
        usable = len(all_data_files) // world_size * world_size
        all_data_files = all_data_files[:usable]
    dataset = load_dataset(
        "json", data_files=all_data_files, split="train", streaming=True
    )
    # shuffle
    dataset = dataset.shuffle(seed=42)
    # Text preprocessing: convert every source to a unified format.
    mode = dataset_config["mode"]
    if mode == "pretrain":
        dataset = dataset.map(pretrain_transform, batched=True, batch_size=1)
    elif mode == "instruct":
        dataset = dataset.map(instruct_transform, batched=True, batch_size=1)
        dataset = dataset.select_columns("text")
        dataset = dataset.map(split_multiturn, batched=True, batch_size=1)
    else:
        raise Exception("Dataset mode: {} not found.".format(mode))
    full_dataset = dataset
    # to visualize
    if return_raw_text:
        return full_dataset

    seq_length = dataset_config["seq_length"]
    pad_to_max = dataset_config.get("pad_to_max", True)
    sequence_sample_mode = dataset_config.get("sequence_sample_mode", "truncation")
    concat_sequences = dataset_config.get("concat_multiple_sequence", False)

    # tokenize; padding and max_length are only passed when padding to max
    tokenize_kwargs = {
        "return_tensors": "pt",
        "return_attention_mask": False,
        "truncation": sequence_sample_mode == "truncation",
    }
    if pad_to_max:
        tokenize_kwargs["padding"] = "max_length"
        tokenize_kwargs["max_length"] = seq_length
    full_dataset = full_dataset.map(lambda x: tokenizer(x["text"], **tokenize_kwargs))

    # format: drop the leading batch dimension added by the tokenizer
    full_dataset = full_dataset.map(lambda x: {"input_ids": x["input_ids"][0]})
    full_dataset = full_dataset.select_columns("input_ids")

    # sequence_sample ("truncation" and "none" need no extra step)
    if sequence_sample_mode == "sample":
        assert pad_to_max or concat_sequences
        full_dataset = full_dataset.map(
            sample_sequence_gen(seq_length, tokenizer.eos_token_id)
        )
    elif sequence_sample_mode == "split":
        assert not concat_sequences
        full_dataset = full_dataset.map(
            split_sequence_gen(seq_length), batched=True, batch_size=1
        )
    elif sequence_sample_mode not in ("truncation", "none"):
        raise Exception(
            "Unknown sequence_sample mode: {}.".format(sequence_sample_mode)
        )

    # concat multiple sequence
    if concat_sequences:
        full_dataset = full_dataset.map(
            concat_multiple_sequence_gen(seq_length, tokenizer.pad_token_id),
            batched=True,
            batch_size=dataset_config["num_sequences"],
            drop_last_batch=True,
        )

    # add label
    full_dataset = full_dataset.map(get_labels_gen(tokenizer.pad_token_id))
    # shuffle
    return full_dataset.shuffle(seed=42)
if __name__ == "__main__":
    import time
    from unicodedata import normalize

    from torch.utils.data import DataLoader
    from transformers import LlamaTokenizer

    data_config = {
        "mode": "pretrain",
        "data": {"mixed": "data/pretrain_data/part-*.jsonl.zst"},
        "pad_to_max": False,
        "sequence_sample_mode": "sample",
        "concat_multiple_sequence": True,
        "num_sequences": 10,
        "seq_length": 2048,
    }
    tokenizer = LlamaTokenizer(
        "configs/tokenizer_models/llama_tokenizer_extended.model",
        pad_token="<pad>",
        add_bos_token=False,
        add_eos_token=True,
    )

    # Pass 1: round-trip the raw text of the first documents through the
    # tokenizer and compare it with the decoded result.
    pretrain_dataset = construct_dataset(data_config, tokenizer, True)
    start = time.time()
    for i, line in enumerate(pretrain_dataset):
        raw_text = line["text"]
        # raw_text = normalize("NFKC", raw_text)
        encoded = tokenizer(
            line["text"], return_tensors="pt", return_attention_mask=False
        )
        input_ids = encoded["input_ids"][0]
        decode_text = tokenizer.decode(input_ids, skip_special_tokens=True)
        # NOTE(review): `"" not in raw_text` is always False, so this print
        # can never fire; the literal may have lost a character - confirm.
        if raw_text != decode_text and "" not in raw_text:
            print(raw_text, "\n", decode_text)
        if i == 3000:
            break
    print("all checked in {} seconds.".format(time.time() - start))

    # Pass 2: build the tokenized dataset and print the first batch.
    pretrain_dataset = construct_dataset(data_config, tokenizer)
    print(pretrain_dataset.n_shards)
    pretrain_loader = DataLoader(pretrain_dataset, batch_size=2, num_workers=16)
    for batch in pretrain_loader:
        for key, value in batch.items():
            print(key, value.shape, "\n", value)
        break

View File

@ -1,178 +0,0 @@
"""
Author: LiangSong(sl12160010@gmail.com)
Date: 2023-03-30 21:02:00
LastEditors: LiangSong(sl12160010@gmail.com)
LastEditTime: 2023-04-06 03:33:27
FilePath: /Open-Llama/dataset/instruction_dataset.py
Description:
Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved.
"""
import math
def preprocess_self_instruction_gen(tokenizer, segment_max_length=1024):
    def preprocess_self_instruction(line):
        """
        Tokenize one self-instruct record, e.g.
        {'prompt': 'Explain the origin of life on earth. Output:', 'completion': 'Life on Earth is believed to have'}
        A trailing "Output:" is removed from the prompt, and the token ids
        are returned as consecutive segments of at most segment_max_length.
        """
        question = line["prompt"]
        if question.endswith("Output:"):
            question = question[: -len("Output:")]
        answer = line["completion"]
        text = "user:{}\nsystem:{}".format(question.strip(), answer.strip())
        token_ids = tokenizer(text)["input_ids"]
        segments = []
        for begin in range(0, len(token_ids), segment_max_length):
            segments.append(token_ids[begin : begin + segment_max_length])
        return segments

    return preprocess_self_instruction
def preprocess_belle_gen(tokenizer, segment_max_length=1024):
    def preprocess_belle(line):
        """
        The format of the data is roughly as follows.
        {'instruction': 'some instruction', 'output': 'some answer'}
        Split the data based on the tokenized length according to the maximum length.

        Literal backslash-n sequences are removed and surrounding whitespace
        is trimmed before tokenizing. Returns a list of token-id segments of
        at most segment_max_length items each.
        """
        # The original ``strip("")`` removed nothing; trim whitespace instead.
        prompt = line["instruction"].replace("\\n", "").strip()
        completion = line["output"].replace("\\n", "").strip()
        total = "user:{}\nsystem:{}".format(prompt, completion)
        input_ids = tokenizer(total)["input_ids"]
        return [
            input_ids[i * segment_max_length : (i + 1) * segment_max_length]
            for i in range(math.ceil(len(input_ids) / segment_max_length))
        ]

    return preprocess_belle
def preprocess_belle_multiturn_chat_gen(tokenizer, segment_max_length=1024):
    def preprocess_belle_multiturn_chat(line):
        """
        Tokenize one multi-turn record whose 'instruction' holds the dialogue
        so far ("Human:" / "Assistant:" markers) and whose 'output' holds the
        final reply. Every well-formed turn is tokenized separately and the
        ids are concatenated, then cut into segments of at most
        segment_max_length. Returns None when no turn could be parsed.
        """
        dialogue = line["instruction"].replace("\\n", "") + line["output"].replace(
            "\\n", ""
        )
        token_ids = []
        for turn in dialogue.split("Human:"):
            if turn.strip() == "":
                continue
            parts = turn.split("Assistant:")
            # Keep only turns with exactly one human part and one reply.
            if len(parts) != 2:
                continue
            question, answer = parts
            text = "user:{}\nsystem:{}".format(question.strip(), answer.strip())
            token_ids.extend(tokenizer(text)["input_ids"])
        if not token_ids:
            return None
        segments = []
        for begin in range(0, len(token_ids), segment_max_length):
            segments.append(token_ids[begin : begin + segment_max_length])
        return segments

    return preprocess_belle_multiturn_chat
def preprocess_sharegpt_gen(tokenizer, segment_max_length=1024):
    def preprocess_sharegpt(line):
        """
        The format of the data is roughly as follows.
        {'conversations': [{'from': 'human', 'value': '...'}, {'from': 'gpt', 'value': '...'}]}
        Split the data based on the tokenized length according to the maximum length.

        A leading non-human turn is dropped, then turns are read pairwise and
        only (human, gpt) pairs are kept. Returns None when the conversation
        is empty or contains no valid pair.
        """
        chats = line["conversations"]
        # An empty conversation has no first turn to inspect.
        if not chats:
            return None
        if chats[0]["from"] != "human":
            chats = chats[1:]
        input_ids = []
        # Walk the turns pairwise: (0, 1), (2, 3), ...
        for prompt, completion in zip(chats[0::2], chats[1::2]):
            if not (prompt["from"] == "human" and completion["from"] == "gpt"):
                continue
            chat = "user:{}\nsystem:{}".format(
                prompt["value"].strip(), completion["value"].strip()
            )
            input_ids.extend(tokenizer(chat)["input_ids"])
        if not input_ids:
            return None
        return [
            input_ids[i * segment_max_length : (i + 1) * segment_max_length]
            for i in range(math.ceil(len(input_ids) / segment_max_length))
        ]

    return preprocess_sharegpt
def preprocess_instruct_code_gen(tokenizer, segment_max_length=1024):
    def preprocess_instruct_code(line):
        """
        The format of the data is roughly as follows.
        {'instruction': 'some instruction', 'answer': 'some code'}
        Split the data based on the tokenized length according to the maximum length.

        Literal backslash-n sequences are removed and surrounding whitespace
        is trimmed before tokenizing. Returns a list of token-id segments of
        at most segment_max_length items each.
        """
        # The original ``strip("")`` removed nothing; trim whitespace instead.
        prompt = line["instruction"].replace("\\n", "").strip()
        completion = line["answer"].replace("\\n", "").strip()
        total = "user:{}\nsystem:{}".format(prompt, completion)
        input_ids = tokenizer(total)["input_ids"]
        return [
            input_ids[i * segment_max_length : (i + 1) * segment_max_length]
            for i in range(math.ceil(len(input_ids) / segment_max_length))
        ]

    return preprocess_instruct_code
if __name__ == "__main__":
    import sentencepiece as spm

    from dataset.tokenizer import Tokenizer
    from dataset.data_iter import create_shard_kwargs, DataIter

    # Smoke test: decode and print the first two concatenated samples of the
    # belle multi-turn shards.
    sp_model = spm.SentencePieceProcessor(
        model_file="configs/10w_vocab_wudao5_pile10.model"
    )
    tokenizer = Tokenizer(sp_model)
    paths = create_shard_kwargs(
        ["data/instruction_data/part-belle_multiturn_chat_0.8M-*.jsonl.zst"]
    )
    belle = preprocess_belle_gen
    transform_dict = {
        "self_instruct": preprocess_self_instruction_gen(tokenizer),
        "belle_1M": belle(tokenizer),
        "belle_0.5M": belle(tokenizer),
        "belle_school_math_0.25M": belle(tokenizer),
        "belle_multiturn_chat_0.8M": preprocess_belle_multiturn_chat_gen(tokenizer),
        "instruct_to_code": preprocess_instruct_code_gen(tokenizer),
        "sharegpt_90K": preprocess_sharegpt_gen(tokenizer),
    }
    data_set = DataIter(
        paths, transform_dict=transform_dict, concat_docs=True, max_length=1024
    )
    for idx, sample in enumerate(data_set):
        print(sp_model.decode(sample))
        if idx == 1:
            break

View File

@ -1,71 +0,0 @@
"""
Author: LiangSong(sl12160010@gmail.com)
Date: 2023-03-17 20:41:25
LastEditors: LiangSong(sl12160010@gmail.com)
LastEditTime: 2023-04-05 22:32:39
FilePath: /Open-Llama/dataset/pretrain_dataset.py
Description:
Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved.
"""
import math
def preprocess_wudao_gen(tokenizer, segment_max_length=1024):
    def preprocess_wudao(line):
        """
        Tokenize one wudao record, e.g.
        {'id': 1, 'dataType': '百科', 'title': 'some title', 'content': 'some content'}
        Title and content are joined with a newline, and the token ids are
        returned as consecutive segments of at most segment_max_length.
        """
        document = "\n".join([line["title"], line["content"]])
        token_ids = tokenizer(document)["input_ids"]
        segments = []
        for begin in range(0, len(token_ids), segment_max_length):
            segments.append(token_ids[begin : begin + segment_max_length])
        return segments

    return preprocess_wudao
def preprocess_the_pile_gen(tokenizer, segment_max_length=1024):
    def preprocess_the_pile(line):
        """
        Tokenize one Pile record, e.g.
        {'text': 'some text', 'meta': {'pile_set_name': 'Github'}}
        The token ids of 'text' are returned as consecutive segments of at
        most segment_max_length.
        """
        token_ids = tokenizer(line["text"])["input_ids"]
        segments = []
        for begin in range(0, len(token_ids), segment_max_length):
            segments.append(token_ids[begin : begin + segment_max_length])
        return segments

    return preprocess_the_pile
if __name__ == "__main__":
    import sentencepiece as spm

    from dataset.tokenizer import Tokenizer
    from dataset.data_iter import create_shard_kwargs, DataIter

    # Smoke test: print the first concatenated sample of the pretraining shards.
    sp_model = spm.SentencePieceProcessor(
        model_file="configs/10w_vocab_wudao5_pile10.model"
    )
    tokenizer = Tokenizer(sp_model)
    paths = create_shard_kwargs(["data/pretrain_data/part-*.jsonl.zst"])
    data_set = DataIter(
        paths,
        transform_dict={
            "wudao": preprocess_wudao_gen(tokenizer),
            "pile": preprocess_the_pile_gen(tokenizer),
        },
        concat_docs=True,
        max_length=1024,
    )
    for first_sample in data_set:
        print(first_sample)
        break

View File

@ -1,218 +0,0 @@
"""
Author: LiangSong(sl12160010@gmail.com)
Date: 2023-03-20 21:39:47
LastEditors: LiangSong(sl12160010@gmail.com)
LastEditTime: 2023-04-06 23:01:50
FilePath: /Open-Llama/dataset/tokenizer.py
Description:
Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved.
"""
import torch
class Tokenizer:
    """Callable wrapper around a SentencePiece processor.

    Encodes a single string or a batch of strings into token ids, with
    optional truncation, EOS appending, padding to ``max_length``, attention
    masks and tensor conversion, and decodes id tensors back to text.
    """

    def __init__(self, sp_model):
        """Keep ``sp_model`` and cache its special-token ids and vocab size."""
        self.sp_model = sp_model
        self.bos_id = self.sp_model.bos_id()
        self.eos_id = self.sp_model.eos_id()
        self.pad_id = self.sp_model.pad_id()
        self.vocab_size = self.sp_model.vocab_size()

    def __call__(
        self,
        inputs,
        padding=None,
        max_length=256,
        return_tensors=False,
        truncation=False,
        add_special_tokens=True,
        return_mask=False,
    ):
        """Dispatch to ``encode`` for a ``str`` and to ``encode_batch`` otherwise."""
        encoder = self.encode if isinstance(inputs, str) else self.encode_batch
        return encoder(
            inputs,
            padding=padding,
            max_length=max_length,
            return_tensors=return_tensors,
            truncation=truncation,
            add_special_tokens=add_special_tokens,
            return_mask=return_mask,
        )

    def encode(
        self,
        inputs,
        padding=None,
        max_length=8192,
        return_tensors=False,
        truncation=False,
        add_special_tokens=True,
        return_mask=False,
    ):
        """Encode one string.

        Returns a dict with ``"input_ids"`` and, when ``return_mask`` is set,
        ``"attention_mask"``. With ``return_tensors`` the values are tensors
        with a leading batch dimension of 1; otherwise they are flat lists.
        """
        assert isinstance(inputs, str)
        token_ids = self.sp_model.Encode(inputs)
        mask = [1] * len(token_ids) if return_mask else None
        if truncation:
            # https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L780
            # Following the Transformers implementation, the last position is
            # assumed to always be pad or eos, so one slot is reserved for it.
            keep = max_length - 1
            token_ids = token_ids[:keep]
            if return_mask:
                mask = mask[:keep]
        if add_special_tokens:
            token_ids = token_ids + [self.eos_id]
            if return_mask:
                # The appended EOS position gets mask value 0.
                mask = mask + [0]
        if padding == "max_length":
            token_ids = token_ids + [self.pad_id] * (max_length - len(token_ids))
            if return_mask:
                mask = mask + [0] * (max_length - len(mask))
        if return_tensors:
            token_ids = torch.tensor([token_ids])
            if return_mask:
                mask = torch.tensor([mask])
        result = {"input_ids": token_ids}
        if return_mask:
            result["attention_mask"] = mask
        return result

    def encode_batch(
        self,
        inputs,
        padding=None,
        max_length=8192,
        return_tensors=False,
        truncation=False,
        add_special_tokens=True,
        return_mask=False,
    ):
        """Encode a batch of strings; the per-sequence steps mirror ``encode``.

        Returns a dict with ``"input_ids"`` and, when ``return_mask`` is set,
        ``"attention_mask"``, each holding one entry per input sequence.
        """
        batch_ids = self.sp_model.Encode(inputs)
        batch_mask = [[1] * len(ids) for ids in batch_ids] if return_mask else None
        if truncation:
            keep = max_length - 1
            batch_ids = [ids[:keep] for ids in batch_ids]
            if return_mask:
                batch_mask = [m[:keep] for m in batch_mask]
        if add_special_tokens:
            batch_ids = [ids + [self.eos_id] for ids in batch_ids]
            if return_mask:
                batch_mask = [m + [0] for m in batch_mask]
        if padding == "max_length":
            batch_ids = [
                ids + [self.pad_id] * (max_length - len(ids)) for ids in batch_ids
            ]
            if return_mask:
                batch_mask = [m + [0] * (max_length - len(m)) for m in batch_mask]
        if return_tensors:
            batch_ids = torch.tensor(batch_ids)
            if return_mask:
                batch_mask = torch.tensor(batch_mask)
        result = {"input_ids": batch_ids}
        if return_mask:
            result["attention_mask"] = batch_mask
        return result

    def decode(self, inputs, max_rounds=None):
        """Decode a batch of id sequences, cutting each one at an EOS token.

        Args:
            inputs: Object exposing ``tolist()`` that yields a list of id lists.
            max_rounds: ``None`` cuts at the first EOS. An ``int`` lets that
                many EOS tokens pass before cutting at the next one. A ``list``
                gives that allowance per sequence, indexed by batch position.

        Returns:
            Whatever ``sp_model.Decode`` returns for the trimmed id lists.
        """
        sequences = inputs.tolist()
        trimmed = []
        for row, ids in enumerate(sequences):
            seen = 0
            cut = None
            for pos, token in enumerate(ids):
                if token != self.eos_id:
                    continue
                if max_rounds is None:
                    cut = pos
                    break
                if isinstance(max_rounds, int):
                    allowed = max_rounds
                elif isinstance(max_rounds, list):
                    # Looked up lazily, only once an EOS is actually seen.
                    allowed = max_rounds[row]
                else:
                    continue
                if seen < allowed:
                    seen += 1
                else:
                    cut = pos
                    break
            trimmed.append(ids if cut is None else ids[:cut])
        return self.sp_model.Decode(trimmed)
# Manual self-test: round-trip a few strings and some WuDao records through
# the tokenizer and compare against NFKC-normalized text.
if __name__ == "__main__":
import sentencepiece as spm
from unicodedata import normalize
# Using sentencepiece may not be able to process some reserved keywords like '▁'.
sp_model = spm.SentencePieceProcessor(
model_file="configs/10w_vocab_wudao5_pile10.model"
)
tokenizer = Tokenizer(sp_model)
tmp = [
"hello world",
"这是开源项目的V1版本this is the first version of a open-source project!",
"# this is a python script\nfor i in range(10):\n print(i)\n for j in range(10):\n print(j)",
]
print(tmp)
# Batch-encode with truncation and padding to 64 tokens, returned as tensors.
out = tokenizer(
tmp, padding="max_length", return_tensors=True, max_length=64, truncation=True
)
for k, v in out.items():
print(k, v.shape)
print(out["input_ids"])
out = tokenizer.decode(out["input_ids"])
print(out)
# Decoded text is expected to equal the NFKC-normalized input.
for i, j in zip(tmp, out):
assert normalize("NFKC", i) == j
from dataset.data_iter import create_shard_kwargs, DataIter
patterns = ["data/pretrain_data/part-wudao*.jsonl.zst"]
paths = create_shard_kwargs(patterns)
data_iter = DataIter(paths)
# Round-trip check over the first records of the WuDao shards.
# NOTE(review): `"" in data["content"]` is true for every string, so as
# written this assert can never fail; the literal may have lost a character
# in transit — confirm against the repository.
for i, data in enumerate(data_iter):
assert (
normalize("NFKC", data["content"])
== sp_model.Decode(sp_model.Encode(data["content"]))
or "" in data["content"]
)
if i == 1000:
break

View File

@ -1,12 +1,12 @@
"""
Author: LiangSong(sl12160010@gmail.com)
Author: s-JoL(sl12160010@gmail.com)
Date: 2023-03-18 00:06:41
LastEditors: LiangSong(sl12160010@gmail.com)
LastEditors: s-JoL(sl12160010@gmail.com)
LastEditTime: 2023-03-27 01:09:20
FilePath: /Open-Llama/dataset/validation.py
Description:
Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved.
Copyright (c) 2023 by s-JoL(sl12160010@gmail.com), All Rights Reserved.
"""
val_set = [
"白日依山尽,",

27
docker-compose.yml Normal file
View File

@ -0,0 +1,27 @@
# Compose definition for a single GPU-enabled Open-Llama container.
# NOTE(review): nesting was flattened in this view; comments describe keys
# by name only — confirm the hierarchy against the original file.
services:
app:
# Build from the Dockerfile in the current directory.
build: .
image: open-llama-image
container_name: open-llama-container
# Mount the working directory at /app inside the container.
volumes:
- .:/app
#runtime: nvidia
deploy:
resources:
limits:
memory: 20G
# Device reservation requesting all devices of the nvidia driver with the
# gpu capability.
reservations:
devices:
- driver: nvidia
count: all
capabilities: [gpu]
memory: 20G
memswap_limit: 60G
environment:
- NVIDIA_VISIBLE_DEVICES=all
- NVIDIA_DRIVER_CAPABILITIES=compute,utility
- CUDA_DEVICE_ORDER=PCI_BUS_ID
# CUDA_VISIBLE_DEVICES is set to device 0 only, while
# NVIDIA_VISIBLE_DEVICES above is "all".
- CUDA_VISIBLE_DEVICES=0
# NOTE(review): CUDA_LAUNCH_BLOCKING and TORCH_USE_CUDA_DSA look like
# debugging settings — confirm they are wanted for normal runs.
- CUDA_LAUNCH_BLOCKING=1
- TORCH_USE_CUDA_DSA=1
- PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:256

View File

@ -1,184 +0,0 @@
"""
Author: LiangSong(sl12160010@gmail.com)
Date: 2023-03-30 21:35:01
LastEditors: LiangSong(sl12160010@gmail.com)
LastEditTime: 2023-04-15 19:34:59
FilePath: /Open-Llama/inctruction_tuning.py
Description:
Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved.
"""
import os
import time
import wandb
import torch
import random
import sentencepiece as spm
from torchinfo import summary
from accelerate import Accelerator
from torch.utils.data import DataLoader
from deepspeed.ops.adam import FusedAdam
from transformers import LlamaForCausalLM, LlamaConfig, get_cosine_schedule_with_warmup
from dataset.validation import val_set
from dataset.tokenizer import Tokenizer
from dataset.data_iter import create_shard_kwargs, DataIter
from dataset.collate_fn import collate_fn_gen
from dataset.instruction_dataset import (
preprocess_belle_gen,
preprocess_self_instruction_gen,
preprocess_belle_multiturn_chat_gen,
preprocess_instruct_code_gen,
preprocess_sharegpt_gen,
)
from configs.instruction_tuning_config import *
# Instruction-tuning script. Names such as log_interval, patterns,
# tokenizer_model_path, ckpt_path, lr and work_dir are not defined here;
# they come from the star import of configs.instruction_tuning_config.
accelerator = Accelerator()
if accelerator.is_main_process:
wandb.init(project="LLAMA Instruction")
# Scale the intervals by the gradient accumulation steps; they are compared
# against data_step in the loop below.
log_interval *= accelerator.gradient_accumulation_steps
eval_interval *= accelerator.gradient_accumulation_steps
save_interval *= accelerator.gradient_accumulation_steps
sp_model = spm.SentencePieceProcessor(model_file=tokenizer_model_path)
tokenizer = Tokenizer(sp_model)
# NOTE(review): create_shard_kwargs is defined elsewhere; repeat=3 presumably
# repeats the shard list three times — confirm.
paths = create_shard_kwargs(patterns, repeat=3)
random.shuffle(paths)
# One preprocessing callable per dataset key.
transform_dict = {
"self_instruct": preprocess_self_instruction_gen(tokenizer),
"belle_1M": preprocess_belle_gen(tokenizer),
"belle_0.5M": preprocess_belle_gen(tokenizer),
"belle_school_math_0.25M": preprocess_belle_gen(tokenizer),
"belle_multiturn_chat_0.8M": preprocess_belle_multiturn_chat_gen(tokenizer),
"instruct_to_code": preprocess_instruct_code_gen(tokenizer),
"sharegpt_90K": preprocess_sharegpt_gen(tokenizer),
}
# The process index and count are passed to DataIter itself.
data_set = DataIter(
paths,
transform_dict=transform_dict,
concat_docs=False,
process_index=accelerator.process_index,
num_processes=accelerator.num_processes,
)
train_loader = DataLoader(
data_set,
batch_size=train_batch_size,
# If num_workers is greater than 1, duplicate data may occur.
num_workers=0,
collate_fn=collate_fn_gen(tokenizer, max_length),
drop_last=True,
)
# A smaller initializer_range makes training more stable.
# A stable embedding is added to the token embedding.
raw_model = LlamaForCausalLM(
LlamaConfig(
vocab_size=tokenizer.vocab_size,
initializer_range=initializer_range,
pad_token_id=tokenizer.pad_id,
rms_norm_eps=1e-5,
hidden_dropout_prob=0.1,
attention_dropout_prob=0.1,
use_stable_embedding=True,
shared_input_output_embedding=True,
)
)
# Start from the checkpoint at ckpt_path, loaded on CPU.
ckpt = torch.load(ckpt_path, map_location="cpu")
raw_model.load_state_dict(ckpt)
raw_model.eval()
# Print a model summary using a dummy (1, 64) int64 input on the GPU.
with torch.no_grad():
summary(raw_model.cuda(), input_data=torch.ones(1, 64, dtype=torch.int64).cuda())
# Parameters whose names contain any of these substrings get no weight decay.
no_decay = ["bias", "LayerNorm.weight", "layernorm.weight"]
optimizer_grouped_parameters = [
{
"params": [
p
for n, p in raw_model.named_parameters()
if not any(nd in n for nd in no_decay)
],
"weight_decay": weight_decay,
},
{
"params": [
p
for n, p in raw_model.named_parameters()
if any(nd in n for nd in no_decay)
],
"weight_decay": 0.0,
},
]
optim = FusedAdam(optimizer_grouped_parameters, lr=lr, betas=(0.9, 0.95))
optim.zero_grad()
# The scheduler step counts are scaled by
# num_processes / gradient_accumulation_steps.
factor = accelerator.num_processes / accelerator.gradient_accumulation_steps
scheduler = get_cosine_schedule_with_warmup(
optim,
num_warmup_steps=num_warmup_steps * factor,
num_training_steps=num_training_steps * factor,
)
# The prepared dataloader is discarded; the loop iterates the original
# train_loader.
_, model, optim, scheduler = accelerator.prepare(
train_loader, raw_model, optim, scheduler
)
print("start training...")
train_loader_iter = iter(train_loader)
global_step = 0
start_time = time.time()
# data_step counts batches; global_step is incremented when
# accelerator.sync_gradients is set.
for data_step in range(num_training_steps):
model.train()
with accelerator.accumulate(model):
batch = next(train_loader_iter)
for k, v in batch.items():
batch[k] = v.to(accelerator.device, non_blocking=True)
# The input ids are also passed as the labels.
out = model(**batch, labels=batch["input_ids"])
total_loss = out.loss
losses = {"total_loss": total_loss}
accelerator.backward(total_loss)
optim.step()
scheduler.step()
optim.zero_grad()
if accelerator.sync_gradients:
global_step += 1
# Periodic logging of throughput, losses, LR and step counters (main process).
if data_step % log_interval == 0 and data_step > 0 and accelerator.is_main_process:
cost_time = time.time() - start_time
start_time = time.time()
tokens = train_batch_size * log_interval * max_length
wandb.log({"Training/Token per second per gpu": tokens / cost_time})
for k, v in losses.items():
wandb.log({"Losses/{}".format(k): v})
current_lr = optim.param_groups[0]["lr"]
wandb.log({"Training/LR": current_lr})
if optim.scaler is not None:
wandb.log({"Training/Loss Scale": optim.scaler.get_scale()})
wandb.log({"Training/Data Step": data_step})
wandb.log({"Training/Global Step": global_step})
accelerator.print(
"Global Step: {}, Data Step: {}, Loss: {}, Token per second per gpu: {}".format(
global_step, data_step, losses["total_loss"], tokens / cost_time
)
)
# Periodic qualitative evaluation: sample a continuation for every prompt in
# val_set and log the results as a wandb table (main process).
if data_step % eval_interval == 0 and accelerator.is_main_process:
text_table = wandb.Table(columns=["question", "pred"])
model.eval()
with torch.no_grad():
for data in val_set:
raw_inputs = data
# Character length of the prompt, used below to slice the decoded string.
inputs_len = len(raw_inputs)
inputs = tokenizer(
raw_inputs, return_tensors=True, add_special_tokens=False
)
for k, v in inputs.items():
inputs[k] = v.to(accelerator.device)
pred = model.generate(
**inputs, max_new_tokens=256, do_sample=True, repetition_penalty=2.0
)
pred = tokenizer.decode(pred.cpu())[0]
# NOTE(review): this assumes the decoded prompt has exactly as many
# characters as raw_inputs — confirm for inputs that normalization changes.
pred = pred[inputs_len:]
text_table.add_data(raw_inputs, pred)
wandb.log({"Predictions on {}".format(global_step): text_table})
# Periodic checkpoint of the raw model's state dict, named by global_step
# (main process).
if data_step % save_interval == 0 and data_step > 0 and accelerator.is_main_process:
if not os.path.isdir(work_dir):
os.mkdir(work_dir)
torch.save(raw_model.state_dict(), "{}/{}.pt".format(work_dir, global_step))
wandb.finish()

View File

@ -1,12 +0,0 @@
"""
Author: LiangSong(sl12160010@gmail.com)
Date: 2023-03-17 13:21:33
LastEditors: LiangSong(sl12160010@gmail.com)
LastEditTime: 2023-03-26 23:13:57
FilePath: /Open-Llama/models/llama.py
Description:
Building the Llama model proposed by Meta. https://arxiv.org/pdf/2302.13971.pdf
Performance and effectiveness optimization based on the implementation in the Transformer library.
https://github.com/Bayes-Song/transformers
Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved.
"""

View File

@ -1,174 +0,0 @@
"""
Author: LiangSong(sl12160010@gmail.com)
Date: 2023-03-17 14:27:28
LastEditors: LiangSong(sl12160010@gmail.com)
LastEditTime: 2023-04-15 19:35:06
FilePath: /Open-Llama/pretrain_llama.py
Description:
pretrain GPT
Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved.
"""
import os
import time
import wandb
import torch
import random
import sentencepiece as spm
from torchinfo import summary
from accelerate import Accelerator
from torch.utils.data import DataLoader
from deepspeed.ops.adam import FusedAdam
from transformers import LlamaForCausalLM, LlamaConfig, get_cosine_schedule_with_warmup
from dataset.validation import val_set
from dataset.tokenizer import Tokenizer
from dataset.data_iter import create_shard_kwargs, DataIter
from dataset.collate_fn import collate_fn_gen
from dataset.pretrain_dataset import (
preprocess_the_pile_gen,
preprocess_wudao_gen,
)
from configs.pretrain_config import *
# Pretraining script. Names such as log_interval, patterns,
# tokenizer_model_path, max_length, lr and work_dir are not defined here;
# they come from the star import of configs.pretrain_config.
accelerator = Accelerator()
if accelerator.is_main_process:
wandb.init(project="LLAMA Pretrain")
# Scale the intervals by the gradient accumulation steps; they are compared
# against data_step in the loop below.
log_interval *= accelerator.gradient_accumulation_steps
eval_interval *= accelerator.gradient_accumulation_steps
save_interval *= accelerator.gradient_accumulation_steps
sp_model = spm.SentencePieceProcessor(model_file=tokenizer_model_path)
tokenizer = Tokenizer(sp_model)
paths = create_shard_kwargs(patterns)
random.shuffle(paths)
# One preprocessing callable per dataset key; documents are split into
# segments of at most max_length tokens.
transform_dict = {
"wudao": preprocess_wudao_gen(tokenizer, max_length),
"pile": preprocess_the_pile_gen(tokenizer, max_length),
}
# The process index and count are passed to DataIter itself.
data_set = DataIter(
paths,
transform_dict=transform_dict,
concat_docs=False,
process_index=accelerator.process_index,
num_processes=accelerator.num_processes,
)
train_loader = DataLoader(
data_set,
batch_size=train_batch_size,
# If num_workers is greater than 1, duplicate data may occur.
num_workers=0,
collate_fn=collate_fn_gen(tokenizer, max_length),
drop_last=True,
)
# A smaller initializer_range makes training more stable.
# A stable embedding is added to the token embedding.
raw_model = LlamaForCausalLM(
LlamaConfig(
vocab_size=tokenizer.vocab_size,
initializer_range=initializer_range,
pad_token_id=tokenizer.pad_id,
rms_norm_eps=1e-5,
hidden_dropout_prob=0.1,
attention_dropout_prob=0.1,
use_stable_embedding=True,
shared_input_output_embedding=True,
)
)
raw_model.eval()
# Print a model summary using a dummy (1, 64) int64 input on the GPU.
with torch.no_grad():
summary(raw_model.cuda(), input_data=torch.ones(1, 64, dtype=torch.int64).cuda())
# Parameters whose names contain any of these substrings get no weight decay.
no_decay = ["bias", "LayerNorm.weight", "layernorm.weight"]
optimizer_grouped_parameters = [
{
"params": [
p
for n, p in raw_model.named_parameters()
if not any(nd in n for nd in no_decay)
],
"weight_decay": weight_decay,
},
{
"params": [
p
for n, p in raw_model.named_parameters()
if any(nd in n for nd in no_decay)
],
"weight_decay": 0.0,
},
]
optim = FusedAdam(optimizer_grouped_parameters, lr=lr, betas=(0.9, 0.95))
optim.zero_grad()
# The scheduler step counts are scaled by
# num_processes / gradient_accumulation_steps.
factor = accelerator.num_processes / accelerator.gradient_accumulation_steps
scheduler = get_cosine_schedule_with_warmup(
optim,
num_warmup_steps=num_warmup_steps * factor,
num_training_steps=num_training_steps * factor,
)
# The prepared dataloader is discarded; the loop iterates the original
# train_loader.
_, model, optim, scheduler = accelerator.prepare(
train_loader, raw_model, optim, scheduler
)
print("start training...")
train_loader_iter = iter(train_loader)
global_step = 0
start_time = time.time()
# data_step counts batches; global_step is incremented when
# accelerator.sync_gradients is set.
for data_step in range(num_training_steps):
model.train()
with accelerator.accumulate(model):
batch = next(train_loader_iter)
for k, v in batch.items():
batch[k] = v.to(accelerator.device, non_blocking=True)
# The input ids are also passed as the labels.
out = model(**batch, labels=batch["input_ids"])
total_loss = out.loss
losses = {"total_loss": total_loss}
accelerator.backward(total_loss)
optim.step()
scheduler.step()
optim.zero_grad()
if accelerator.sync_gradients:
global_step += 1
# Periodic logging of throughput, losses, LR and step counters (main process).
if data_step % log_interval == 0 and data_step > 0 and accelerator.is_main_process:
cost_time = time.time() - start_time
start_time = time.time()
tokens = train_batch_size * log_interval * max_length
wandb.log({"Training/Token per second per gpu": tokens / cost_time})
for k, v in losses.items():
wandb.log({"Losses/{}".format(k): v})
current_lr = optim.param_groups[0]["lr"]
wandb.log({"Training/LR": current_lr})
if optim.scaler is not None:
wandb.log({"Training/Loss Scale": optim.scaler.get_scale()})
wandb.log({"Training/Data Step": data_step})
wandb.log({"Training/Global Step": global_step})
accelerator.print(
"Global Step: {}, Data Step: {}, Loss: {}, Token per second per gpu: {}".format(
global_step, data_step, losses["total_loss"], tokens / cost_time
)
)
# Periodic qualitative evaluation: sample a continuation for every prompt in
# val_set and log the results as a wandb table (main process).
if data_step % eval_interval == 0 and accelerator.is_main_process:
text_table = wandb.Table(columns=["question", "pred"])
model.eval()
with torch.no_grad():
for data in val_set:
raw_inputs = data
# Character length of the prompt, used below to slice the decoded string.
inputs_len = len(raw_inputs)
inputs = tokenizer(
raw_inputs, return_tensors=True, add_special_tokens=False
)
for k, v in inputs.items():
inputs[k] = v.to(accelerator.device)
pred = model.generate(
**inputs, max_new_tokens=256, do_sample=True, repetition_penalty=2.0
)
pred = tokenizer.decode(pred.cpu())[0]
# NOTE(review): this assumes the decoded prompt has exactly as many
# characters as raw_inputs — confirm for inputs that normalization changes.
pred = pred[inputs_len:]
text_table.add_data(raw_inputs, pred)
wandb.log({"Predictions on {}".format(global_step): text_table})
# Periodic checkpoint of the raw model's state dict, named by global_step
# (main process).
if data_step % save_interval == 0 and data_step > 0 and accelerator.is_main_process:
if not os.path.isdir(work_dir):
os.mkdir(work_dir)
torch.save(raw_model.state_dict(), "{}/{}.pt".format(work_dir, global_step))
wandb.finish()

View File

@ -3,7 +3,6 @@ torchvision
torchaudio
zstandard
accelerate
datasets
wandb
deepspeed
absl-py
@ -15,6 +14,8 @@ seaborn
sentencepiece
triton
functorch==1.13.1
xformers
xformers==0.0.16
gradio
git+https://github.com/Bayes-Song/transformers.git
peft
transformers
fsspec==2023.9.2

View File

@ -1,75 +0,0 @@
"""
Author: LiangSong(sl12160010@gmail.com)
Date: 2023-03-31 13:26:15
LastEditors: LiangSong(sl12160010@gmail.com)
LastEditTime: 2023-04-06 03:45:44
FilePath: /Open-Llama/server.py
Description:
Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved.
"""
import torch
import gradio as gr
import sentencepiece as spm
from dataset.tokenizer import Tokenizer
from transformers import LlamaForCausalLM, LlamaConfig
# Module-level setup for the demo server: load the tokenizer, build the
# model, restore a checkpoint and move the model to the GPU.
sp_model = spm.SentencePieceProcessor(
model_file="configs/10w_vocab_wudao5_pile10.model"
)
tokenizer = Tokenizer(sp_model)
# The model is built with its vocabulary size and pad id taken from the
# tokenizer.
raw_model = LlamaForCausalLM(
LlamaConfig(
vocab_size=tokenizer.vocab_size,
initializer_range=0.01,
pad_token_id=tokenizer.pad_id,
rms_norm_eps=1e-5,
hidden_dropout_prob=0.1,
attention_dropout_prob=0.1,
use_stable_embedding=True,
shared_input_output_embedding=True,
)
)
# The checkpoint path is hard-coded; weights are loaded on CPU first.
ckpt = torch.load(
"data/saved_ckpt/instruction_tuning_3_epochs/37001.pt", map_location="cpu"
)
raw_model.load_state_dict(ckpt)
raw_model.eval()
model = raw_model.cuda()
print("ready")
def question_answer(prompt):
    """Generate a reply to ``prompt`` with the module-level model.

    The prompt is wrapped as ``"user:{prompt}\\nsystem:"``, tokenized without
    special tokens and passed to ``model.generate`` with sampling enabled.

    Args:
        prompt: User text from the Gradio text box.

    Returns:
        The decoded text of the newly generated tokens only.
    """
    print(prompt)
    raw_inputs = "user:{}\nsystem:".format(prompt)
    inputs = tokenizer(raw_inputs, return_tensors=True, add_special_tokens=False)
    # Remember the prompt length in tokens. Slicing the decoded string by
    # len(raw_inputs) is unreliable because decoding yields normalized text
    # whose character count can differ from the raw prompt.
    input_length = inputs["input_ids"].shape[1]
    for k, v in inputs.items():
        inputs[k] = v.cuda()
    pred = model.generate(**inputs, max_new_tokens=512, do_sample=True)
    # Drop the prompt tokens before decoding so only the continuation is left.
    pred = tokenizer.decode(pred[:, input_length:].cpu())[0]
    print(pred)
    return pred
# Gradio text-to-text interface around question_answer, with example prompts.
# The request queue is limited to a concurrency of 1.
demo = gr.Interface(
fn=question_answer,
inputs="text",
outputs="text",
examples=[
"帮我写一封邮件,内容是咨询教授本学期量子力学课程的时间表?并且希望教授推荐一些相关书籍",
"情人节送女朋友什么礼物预算500",
"我今天肚子有点不舒服,晚饭有什么建议么",
"可以总结一下小说三体的核心内容么?",
"Can you explain to me what quantum mechanics is and how it relates to quantum computing?",
"请帮我写一个AI驱动的幼儿教育APP的商业计划书",
"用python实现一个快速排序",
],
title="Open-Llama",
description="不基于其他预训练模型,完全使用[Open-Llama](https://github.com/Bayes-Song/Open-Llama)项目从0开始训练的Instruct-GPT模型总训练成本不超过2w美元。由于请求需要经Gradio进行转发可能出现请求丢失的现象当长时间无响应如20s以上可刷新重试。当前体验服务生成的所有内容都是由人工智能模型生成我们对其生成内容的准确性、完整性和功能性不做任何保证并且其生成的内容不代表我们的态度或观点。",
article="联系方式: sl12160010@gmail.com 对于该项目有任何意见和建议都欢迎联系我",
).queue(concurrency_count=1)
# share=True asks Gradio to create a public share link for the demo.
demo.launch(share=True)

230
solver/trainer.py Normal file
View File

@ -0,0 +1,230 @@
"""
Author: s-JoL(sl12160010@gmail.com)
Date: 2023-04-24 20:05:21
LastEditors: s-JoL(sl12160010@gmail.com)
LastEditTime: 2023-05-08 22:51:42
FilePath: /Open-Llama/solver/trainer.py
Description:
Copyright (c) 2023 by s-JoL(sl12160010@gmail.com), All Rights Reserved.
"""
import time
import wandb
import torch
import logging
from torchinfo import summary
from deepspeed.ops.adam import FusedAdam
from transformers import get_cosine_schedule_with_warmup
from dataset.validation import val_set
class Trainer:
    """Training driver built on an Accelerate ``Accelerator`` with wandb logging.

    ``train()`` builds the optimizer and LR scheduler, prepares everything
    with the accelerator, tries to resume from ``work_dir`` and then runs the
    loop, periodically logging, evaluating on ``val_set`` and saving state.
    """

    def __init__(self, config, raw_model, train_loader, tokenizer, accelerator):
        """Store collaborators and derive step intervals from ``config``.

        Args:
            config: Nested mapping. Read here: ``config["train"]``,
                ``log_interval``, ``eval_interval``, ``save_interval``,
                ``work_dir`` and ``project_name``.
            raw_model: The unwrapped model to train.
            train_loader: Iterable of batches (mappings of tensors).
            tokenizer: Tokenizer used by ``eval`` to encode/decode prompts.
            accelerator: Accelerate ``Accelerator`` driving the run.
        """
        self.config = config
        self.raw_model = raw_model
        self.train_loader = train_loader
        self.tokenizer = tokenizer
        self.accelerator = accelerator
        self.train_and_eval = config["train"].get("train_and_eval", False)
        self.gradient_accumulation_steps = config["train"].get(
            "gradient_accumulation_steps", 1
        )
        self.lr_scheduler_factor = (
            accelerator.num_processes / accelerator.gradient_accumulation_steps
        )
        # Intervals are scaled by the accumulation steps because they are
        # compared against data_step in train().
        self.log_interval = (
            self.config["log_interval"] * accelerator.gradient_accumulation_steps
        )
        self.eval_interval = (
            self.config["eval_interval"] * accelerator.gradient_accumulation_steps
        )
        self.save_interval = (
            self.config["save_interval"] * accelerator.gradient_accumulation_steps
        )
        self.work_dir = self.config["work_dir"]
        # self.get_model_info()
        if accelerator.is_main_process:
            wandb.init(project=self.config["project_name"])

    def get_model_info(self):
        """Print a torchinfo summary using a dummy (1, 64) int64 input on GPU."""
        with torch.no_grad():
            summary(
                self.raw_model.cuda(),
                input_data=torch.ones(1, 64, dtype=torch.int64).cuda(),
            )

    def get_optimizer(self):
        """Create ``self.optim`` (FusedAdam).

        With ``use_lora`` all model parameters go into a single group.
        Otherwise parameters whose names contain a ``no_decay`` substring get
        zero weight decay and the rest use ``config["train"]["weight_decay"]``.
        """
        no_decay = ["bias", "LayerNorm.weight", "layernorm.weight"]
        if self.config["train"].get("use_lora", False):
            optimizer_grouped_parameters = self.raw_model.parameters()
        else:
            optimizer_grouped_parameters = [
                {
                    "params": [
                        p
                        for n, p in self.raw_model.named_parameters()
                        if not any(nd in n for nd in no_decay)
                    ],
                    "weight_decay": self.config["train"]["weight_decay"],
                },
                {
                    "params": [
                        p
                        for n, p in self.raw_model.named_parameters()
                        if any(nd in n for nd in no_decay)
                    ],
                    "weight_decay": 0.0,
                },
            ]
        self.optim = FusedAdam(
            optimizer_grouped_parameters,
            lr=self.config["train"]["lr"],
            betas=(0.9, 0.95),
        )

    def get_lr_scheduler(self):
        """Create ``self.scheduler``: cosine schedule with warmup.

        Both step counts from the config are scaled by
        ``self.lr_scheduler_factor``.
        """
        self.scheduler = get_cosine_schedule_with_warmup(
            self.optim,
            num_warmup_steps=self.config["train"]["num_warmup_steps"]
            * self.lr_scheduler_factor,
            num_training_steps=self.config["train"]["num_training_steps"]
            * self.lr_scheduler_factor,
        )

    def prepare(self):
        """Wrap objects with the accelerator and try to resume from ``work_dir``.

        Sets ``self.model``, ``self.global_step`` and
        ``self.train_loader_skiped`` (the loader for the first epoch, with
        already-consumed batches skipped when a checkpoint was restored).
        """
        # The prepared dataloader is discarded; self.train_loader is kept as is.
        (
            _,
            self.model,
            self.optim,
            self.scheduler,
        ) = self.accelerator.prepare(
            self.train_loader, self.raw_model, self.optim, self.scheduler
        )
        self.optim.zero_grad()
        self.global_step = 0
        try:
            self.accelerator.load_state(self.work_dir)
            self.global_step = self.scheduler.scheduler._step_count - 1
            self.global_step = self.global_step // self.accelerator.num_processes
            logging.warning("Restored ckpt from {}".format(self.work_dir))
        except Exception:
            # Best effort: any ordinary failure means "start from scratch".
            # KeyboardInterrupt/SystemExit are deliberately NOT swallowed, so
            # aborting a slow restore cannot silently restart at step 0.
            logging.warning("No ckpt found in {}".format(self.work_dir))
        if self.global_step > 0:
            skip_steps = self.global_step * self.gradient_accumulation_steps
            logging.warning("Skiped {} steps.".format(skip_steps))
            self.train_loader_skiped = self.accelerator.skip_first_batches(
                self.train_loader, num_batches=skip_steps
            )
        else:
            self.train_loader_skiped = self.train_loader
        self.accelerator.wait_for_everyone()

    def train_step(self, batch):
        """Run forward/backward and optimizer + scheduler steps for one batch.

        Returns:
            Dict with the single key ``"total_loss"``.
        """
        out = self.model(**batch)
        total_loss = out.loss
        losses = {"total_loss": total_loss}
        self.accelerator.backward(total_loss)
        self.optim.step()
        self.scheduler.step()
        self.optim.zero_grad()
        return losses

    def train(self):
        """Run training until ``config["train"]["num_training_steps"]`` batches.

        ``data_step`` counts processed batches; ``global_step`` is incremented
        whenever the accelerator reports synchronized gradients. The loader is
        re-iterated for additional epochs until the step budget is used up.
        """
        self.get_optimizer()
        self.get_lr_scheduler()
        self.prepare()
        self.start_time = time.time()
        self.epoch = 0
        self.data_step = 0
        while True:
            if self.data_step >= self.config["train"]["num_training_steps"]:
                break
            # Only the first epoch uses the loader with skipped batches.
            if self.epoch == 0:
                train_loader = self.train_loader_skiped
            else:
                train_loader = self.train_loader
            for batch in train_loader:
                # end training
                if self.data_step >= self.config["train"]["num_training_steps"]:
                    break
                # data to device
                for k, v in batch.items():
                    batch[k] = v.to(self.accelerator.device, non_blocking=True)
                self.model.train()
                # train step
                with self.accelerator.accumulate(self.model):
                    losses = self.train_step(batch)
                    if self.accelerator.sync_gradients:
                        self.global_step += 1
                # log
                if (
                    self.data_step % self.log_interval == 0
                    and self.data_step > 0
                    and self.accelerator.is_main_process
                ):
                    self.log(losses)
                # eval/vis model output
                if (
                    self.data_step % self.eval_interval == 0
                    and self.accelerator.is_main_process
                    and self.train_and_eval
                ):
                    self.eval()
                # save state (called on every process, not just the main one)
                if self.data_step % self.save_interval == 0 and self.data_step > 0:
                    self.accelerator.save_state(self.work_dir)
                self.data_step += 1
            self.epoch += 1
        wandb.finish()

    def log(self, losses):
        """Log throughput, losses, LR and step counters to wandb and stdout.

        Args:
            losses: Mapping of loss name to value; must contain
                ``"total_loss"``.
        """
        cost_time = time.time() - self.start_time
        self.start_time = time.time()
        # Tokens processed since the previous log call on this process.
        tokens = (
            self.config["train"]["train_batch_size"]
            * self.log_interval
            * self.config["data"]["seq_length"]
        )
        wandb.log({"Training/Token per second per gpu": tokens / cost_time})
        for k, v in losses.items():
            wandb.log({"Losses/{}".format(k): v})
        current_lr = self.optim.param_groups[0]["lr"]
        wandb.log({"Training/LR": current_lr})
        if self.optim.scaler is not None:
            wandb.log({"Training/Loss Scale": self.optim.scaler.get_scale()})
        wandb.log({"Training/Data Step": self.data_step})
        wandb.log({"Training/Global Step": self.global_step})
        wandb.log({"Training/Epoch": self.epoch})
        self.accelerator.print(
            "Epoch: {}, Global Step: {}, Data Step: {}, Loss: {}, Token per second per gpu: {}".format(
                self.epoch,
                self.global_step,
                self.data_step,
                losses["total_loss"],
                tokens / cost_time,
            )
        )

    def eval(self):
        """Sample a continuation for each prompt in ``val_set`` and log a table."""
        text_table = wandb.Table(columns=["question", "pred"])
        self.model.eval()
        with torch.no_grad():
            for data in val_set:
                raw_inputs = data
                inputs = self.tokenizer(
                    raw_inputs,
                    return_tensors="pt",
                    add_special_tokens=False,
                    return_attention_mask=False,
                )
                input_length = inputs["input_ids"].shape[1]
                for k, v in inputs.items():
                    inputs[k] = v.to(self.accelerator.device)
                pred = self.model.generate(
                    **inputs, max_new_tokens=256, do_sample=True, repetition_penalty=2.0
                )
                # Keep only the newly generated tokens of the first sequence.
                pred = pred[0, input_length:]
                pred = self.tokenizer.decode(pred.cpu(), skip_special_tokens=True)
                text_table.add_data(raw_inputs, pred)
        wandb.log({"Predictions on {}".format(self.global_step): text_table})

118
train_lm.py Normal file
View File

@ -0,0 +1,118 @@
"""
Author: s-JoL(sl12160010@gmail.com)
Date: 2023-04-12 19:12:42
LastEditors: s-JoL(sl12160010@gmail.com)
LastEditTime: 2023-05-17 22:20:32
FilePath: /Open-Llama/train_lm.py
Description:
Copyright (c) 2023 by s-JoL(sl12160010@gmail.com), All Rights Reserved.
"""
import yaml
import math
import logging
from absl import app
from absl import flags
from accelerate import Accelerator
from torch.utils.data import DataLoader
from peft import LoraConfig, TaskType, get_peft_model
from datasets.distributed import split_dataset_by_node
from transformers import AutoConfig, AutoModelForCausalLM, LlamaTokenizer
from dataset.dataset import construct_dataset
from solver.trainer import Trainer
# Command-line flags (absl). --train_config has no default and is opened
# directly in main(); --model_config defaults to the 7B model config.
FLAGS = flags.FLAGS
flags.DEFINE_string("train_config", None, "Training config path")
flags.DEFINE_string(
"model_config", "configs/model_configs/7B.json", "Model config path"
)
def main(argv):
    """Build tokenizer, data pipeline and model from the configs, then train.

    Args:
        argv: Positional command-line arguments passed by ``absl.app.run``
            (unused).
    """
    # NOTE(review): yaml.load with FullLoader is kept as in the original; the
    # config is expected to be a trusted, operator-supplied file.
    with open(FLAGS.train_config, "r", encoding="utf-8") as fp:
        config = yaml.load(fp, Loader=yaml.FullLoader)

    accelerator = Accelerator(
        gradient_accumulation_steps=config["train"].get(
            "gradient_accumulation_steps", 1
        )
    )
    tokenizer = LlamaTokenizer(
        config["data"]["tokenizer_model_path"],
        pad_token="<pad>",
        add_bos_token=False,
        add_eos_token=True,
    )
    data_config = config["data"]
    if data_config.get("split_by_shard", False):
        train_dataset = construct_dataset(
            data_config, tokenizer, world_size=accelerator.num_processes
        )
    else:
        train_dataset = construct_dataset(data_config, tokenizer)
    train_dataset = split_dataset_by_node(
        train_dataset,
        rank=accelerator.process_index,
        world_size=accelerator.num_processes,
    )
    # prefetch_factor may only be given to DataLoader when worker processes
    # are used; passing it with num_workers == 0 raises ValueError on recent
    # torch versions, so it is forwarded only when workers are enabled.
    num_workers = config["train"]["train_num_workers"]
    loader_kwargs = {}
    if num_workers > 0:
        loader_kwargs["prefetch_factor"] = config["train"].get("prefetch_factor", 2)
    train_loader = DataLoader(
        train_dataset,
        batch_size=config["train"]["train_batch_size"],
        num_workers=num_workers,
        pin_memory=True,
        **loader_kwargs,
    )
    # A smaller initializer_range makes training more stable.
    # A stable embedding is added to the token embedding.
    model_config = AutoConfig.from_pretrained(FLAGS.model_config)
    # Make the vocab size divisible by 16
    # https://huggingface.co/docs/transformers/main_classes/deepspeed#how-to-choose-which-zero-stage-and-offloads-to-use-for-best-performance
    # https://developer.nvidia.com/blog/optimizing-gpu-performance-tensor-cores/
    # vocab_size = math.ceil(tokenizer.vocab_size / 16) * 16
    # logging.warning(
    #     "Round vocab_size from {} to {}.".format(tokenizer.vocab_size, vocab_size)
    # )
    vocab_size = tokenizer.vocab_size
    model_config.vocab_size = vocab_size
    model_config.pad_token_id = tokenizer.pad_token_id
    # Using AutoModel lets DeepSpeed zero.Init() take effect correctly, whereas
    # instantiating e.g. OpenLlamaModel directly does not and wastes a large
    # amount of memory.
    # https://github.com/huggingface/accelerate/pull/932
    ckpt = config["train"].get("ckpt")  # a missing key means "no checkpoint"
    if ckpt is not None:
        raw_model = AutoModelForCausalLM.from_pretrained(ckpt, config=model_config)
        logging.warning("Loaded ckpt from: {}".format(ckpt))
    else:
        raw_model = AutoModelForCausalLM.from_config(model_config)
    # lora
    if config["train"].get("use_lora", False):
        # gradient ckpt bug, https://github.com/huggingface/transformers/issues/23170
        if hasattr(raw_model, "enable_input_require_grads"):
            raw_model.enable_input_require_grads()
        else:

            def make_inputs_require_grad(module, input, output):
                output.requires_grad_(True)

            raw_model.get_input_embeddings().register_forward_hook(
                make_inputs_require_grad
            )
        peft_config = LoraConfig(
            task_type=TaskType.CAUSAL_LM,
            target_modules=["q_proj", "v_proj"],
            inference_mode=False,
            r=1,
            lora_alpha=32,
            lora_dropout=0.1,
        )
        raw_model = get_peft_model(raw_model, peft_config)
        raw_model.print_trainable_parameters()
    if config["train"].get("gradient_checkpointing_enable", False):
        raw_model.gradient_checkpointing_enable()
    trainer = Trainer(config, raw_model, train_loader, tokenizer, accelerator)
    trainer.train()


if __name__ == "__main__":
    app.run(main)

76
utils/convert_ckpt.py Normal file
View File

@ -0,0 +1,76 @@
"""
Author: s-JoL(sl12160010@gmail.com)
Date: 2023-04-28 19:55:13
LastEditors: s-JoL(sl12160010@gmail.com)
LastEditTime: 2023-05-06 23:30:29
FilePath: /Open-Llama/utils/convert_ckpt.py
Description:
Copyright (c) 2023 by s-JoL(sl12160010@gmail.com), All Rights Reserved.
"""
import torch
import sentencepiece as spm
# Convert a raw LLaMA checkpoint to the parameter names listed in to_names,
# after growing both embedding matrices to the extended tokenizer's vocabulary.
sp_model = spm.SentencePieceProcessor(
model_file="configs/tokenizer_models/llama_tokenizer_extended.model"
)
merged_vocab_size = sp_model.vocab_size()
ckpt = torch.load("data/llama_raw_ckpt/7B/consolidated.00.pth")
raw_vocab_size, hidden_size = ckpt["tok_embeddings.weight"].shape
# Rows for the added tokens are random normal values scaled by 0.001 and
# appended after the original rows.
extended_tok_embeddings = torch.randn(merged_vocab_size - raw_vocab_size, hidden_size)
extended_tok_embeddings = extended_tok_embeddings * 0.001
ckpt["tok_embeddings.weight"] = torch.cat(
[ckpt["tok_embeddings.weight"], extended_tok_embeddings], dim=0
)
# The output projection is extended in the same way, with independent noise.
extended_out_embeddings = torch.randn(merged_vocab_size - raw_vocab_size, hidden_size)
extended_out_embeddings = extended_out_embeddings * 0.001
ckpt["output.weight"] = torch.cat(
[ckpt["output.weight"], extended_out_embeddings], dim=0
)
# Rename the non-layer tensors.
rename_map = {
"tok_embeddings.weight": "model.embed_tokens.weight",
"norm.weight": "model.norm.weight",
"output.weight": "lm_head.weight",
}
for f, t in rename_map.items():
v = ckpt.pop(f)
ckpt[t] = v
# Per-layer name templates; from_names[i] is renamed to to_names[i].
from_names = [
"layers.{}.attention.wq.weight",
"layers.{}.attention.wk.weight",
"layers.{}.attention.wv.weight",
"layers.{}.attention.wo.weight",
"layers.{}.feed_forward.w1.weight",
"layers.{}.feed_forward.w2.weight",
"layers.{}.feed_forward.w3.weight",
"layers.{}.attention_norm.weight",
"layers.{}.ffn_norm.weight",
"layers.{}.attention.inner_attention.rope.freqs",
]
to_names = [
"model.layers.{}.self_attn.q_proj.weight",
"model.layers.{}.self_attn.k_proj.weight",
"model.layers.{}.self_attn.v_proj.weight",
"model.layers.{}.self_attn.o_proj.weight",
"model.layers.{}.mlp.gate_proj.weight",
"model.layers.{}.mlp.down_proj.weight",
"model.layers.{}.mlp.up_proj.weight",
"model.layers.{}.input_layernorm.weight",
"model.layers.{}.post_attention_layernorm.weight",
"model.layers.{}.self_attn.rotary_emb.inv_freq",
]
# The layer count is hard-coded to 32; ckpt.pop raises KeyError if any
# expected tensor is missing.
for layer in range(32):
for f, t in zip(from_names, to_names):
f = f.format(layer)
t = t.format(layer)
v = ckpt.pop(f)
ckpt[t] = v
torch.save(ckpt, "data/llama_raw_ckpt/7B/extended.pth")

27
utils/merge_tokenizer.py Normal file
View File

@ -0,0 +1,27 @@
from tqdm import tqdm
import sentencepiece as spm
from sentencepiece import sentencepiece_model_pb2 as model
# Merge two SentencePiece models: every piece of the second model (cn_model)
# that the LLaMA tokenizer does not already contain is appended to it, and the
# result is written out as the "extended" tokenizer model.

raw_model = model.ModelProto()
# Use context managers so the model files are closed as soon as they are read.
with open("configs/tokenizer_models/llama_tokenizer.model", "rb") as f:
    raw_model.ParseFromString(f.read())

# Piece strings already present in the base tokenizer, for O(1) membership tests.
exist_pieces = {p.piece for p in raw_model.pieces}

cn_model = model.ModelProto()
with open("configs/tokenizer_models/4w_cn_vocab_wudao15.model", "rb") as f:
    cn_model.ParseFromString(f.read())

# Append only the pieces that are new; existing pieces keep their position.
for p in tqdm(cn_model.pieces, total=len(cn_model.pieces)):
    if p.piece not in exist_pieces:
        raw_model.pieces.append(p)

with open("configs/tokenizer_models/llama_tokenizer_extended.model", "wb") as f:
    f.write(raw_model.SerializeToString())

# Reload the written file to report the resulting vocabulary size.
sp_model = spm.SentencePieceProcessor(
    model_file="configs/tokenizer_models/llama_tokenizer_extended.model"
)
print("merged vocab size: {}".format(sp_model.vocab_size()))

View File

@ -1,12 +1,12 @@
"""
Author: LiangSong(sl12160010@gmail.com)
Author: s-JoL(sl12160010@gmail.com)
Date: 2023-04-08 22:44:44
LastEditors: LiangSong(sl12160010@gmail.com)
LastEditors: s-JoL(sl12160010@gmail.com)
LastEditTime: 2023-04-08 23:15:57
FilePath: /Open-Llama/speed_test/accelerate/run.py
Description:
Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved.
Copyright (c) 2023 by s-JoL(sl12160010@gmail.com), All Rights Reserved.
"""
import time
import torch
@ -20,13 +20,15 @@ vocab_size = 32000
total_step = 2
use_activation_ckpt = True
class FakeSet(torch.utils.data.Dataset):
def __getitem__(self, idx):
return torch.randint(0, vocab_size, (seq_length, ))
return torch.randint(0, vocab_size, (seq_length,))
def __len__(self):
return 1000000000
accelerator = Accelerator()
raw_model = LlamaForCausalLM(
LlamaConfig(
@ -39,15 +41,18 @@ optimizer = FusedAdam(raw_model.parameters(), lr=1e-5)
train_loader = torch.utils.data.DataLoader(FakeSet(), batch_size=batch_size)
if accelerator.distributed_type == DistributedType.FSDP:
accelerator.print('FSDP')
accelerator.print("FSDP")
model = accelerator.prepare(raw_model)
optimizer, train_loader = accelerator.prepare(optimizer, train_loader)
else:
model, optimizer, train_loader = accelerator.prepare(raw_model, optimizer, train_loader)
model, optimizer, train_loader = accelerator.prepare(
raw_model, optimizer, train_loader
)
def train(model, optimizer, train_loader):
start_time = time.time()
for i, batch in enumerate(train_loader):
for i, batch in enumerate(train_loader):
if i == total_step:
break
optimizer.zero_grad()
@ -58,4 +63,5 @@ def train(model, optimizer, train_loader):
end_time = time.time()
return end_time - start_time
accelerator.print('total time: {}'.format(train(model, optimizer, train_loader)))
accelerator.print("total time: {}".format(train(model, optimizer, train_loader)))

View File

@ -1,12 +1,12 @@
###
# @Author: LiangSong(sl12160010@gmail.com)
# @Author: s-JoL(sl12160010@gmail.com)
# @Date: 2023-04-08 22:44:27
# @LastEditors: LiangSong(sl12160010@gmail.com)
# @LastEditors: s-JoL(sl12160010@gmail.com)
# @LastEditTime: 2023-04-11 21:58:43
# @FilePath: /Open-Llama/speed_test/accelerate/run.sh
# @Description:
#
# Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved.
# Copyright (c) 2023 by s-JoL(sl12160010@gmail.com), All Rights Reserved.
###
total_gpu=8
accelerate launch --config_file deepspeed_stage2.yaml --main_process_ip 127.0.0.1 --main_process_port 23335 --num_processes $total_gpu run.py

View File

@ -1,12 +1,12 @@
"""
Author: LiangSong(sl12160010@gmail.com)
Author: s-JoL(sl12160010@gmail.com)
Date: 2023-04-11 20:07:35
LastEditors: LiangSong(sl12160010@gmail.com)
LastEditors: s-JoL(sl12160010@gmail.com)
LastEditTime: 2023-04-11 21:56:23
FilePath: /Open-Llama/speed_test/colossal-ai/run.py
Description:
Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved.
Copyright (c) 2023 by s-JoL(sl12160010@gmail.com), All Rights Reserved.
"""
import os
from functools import partial
@ -23,7 +23,14 @@ from torch.nn.parallel import DistributedDataParallel as DDP
import colossalai
from colossalai.logging import disable_existing_loggers, get_dist_logger
from colossalai.nn.optimizer import HybridAdam
from colossalai.tensor import ColoParameter, ComputePattern, ComputeSpec, ProcessGroup, ReplicaSpec, ShardSpec
from colossalai.tensor import (
ColoParameter,
ComputePattern,
ComputeSpec,
ProcessGroup,
ReplicaSpec,
ShardSpec,
)
from colossalai.utils import get_current_device
from colossalai.zero import ColoInitContext, zero_model_wrapper, zero_optim_wrapper
@ -35,7 +42,7 @@ def parse_args():
parser.add_argument(
"--distplan",
type=str,
default='CAI_Gemini',
default="CAI_Gemini",
help="The distributed plan [colossalai, zero1, zero2, torch_ddp, torch_zero].",
)
parser.add_argument(
@ -47,14 +54,13 @@ def parse_args():
parser.add_argument(
"--placement",
type=str,
default='cpu',
default="cpu",
help="Placement Policy for Gemini. Valid when using colossalai as dist plan.",
)
parser.add_argument(
"--shardinit",
action='store_true',
help=
"Shard the tensors when init the model to shrink peak memory size on the assigned device. Valid when using colossalai as dist plan.",
action="store_true",
help="Shard the tensors when init the model to shrink peak memory size on the assigned device. Valid when using colossalai as dist plan.",
)
parser.add_argument(
"--batch_size",
@ -105,7 +111,6 @@ def split_param_col_tp1d(param: ColoParameter, pg: ProcessGroup):
class GPTLMLoss(nn.Module):
def __init__(self):
super().__init__()
self.loss_fn = nn.CrossEntropyLoss()
@ -114,7 +119,9 @@ class GPTLMLoss(nn.Module):
shift_logits = logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous()
# Flatten the tokens
return self.loss_fn(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
return self.loss_fn(
shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)
)
def get_cpu_mem():
@ -125,8 +132,8 @@ def get_gpu_mem():
return torch.cuda.memory_allocated() / 1024**2
def get_mem_info(prefix=''):
return f'{prefix}GPU memory usage: {get_gpu_mem():.2f} MB, CPU memory usage: {get_cpu_mem():.2f} MB'
def get_mem_info(prefix=""):
return f"{prefix}GPU memory usage: {get_gpu_mem():.2f} MB, CPU memory usage: {get_cpu_mem():.2f} MB"
def get_model_size(model: nn.Module):
@ -142,11 +149,11 @@ def model_size_formatter(numel: int) -> str:
MB_SIZE = 10**6
KB_SIZE = 10**3
if numel >= GB_SIZE:
return f'{numel / GB_SIZE:.1f}B'
return f"{numel / GB_SIZE:.1f}B"
elif numel >= MB_SIZE:
return f'{numel / MB_SIZE:.1f}M'
return f"{numel / MB_SIZE:.1f}M"
elif numel >= KB_SIZE:
return f'{numel / KB_SIZE:.1f}K'
return f"{numel / KB_SIZE:.1f}K"
else:
return str(numel)
@ -154,7 +161,7 @@ def model_size_formatter(numel: int) -> str:
def set_cpu_maximum_parallelism():
conf_str = torch.__config__.parallel_info()
inter_str = conf_str.split("hardware_concurrency() : ")[1]
max_concurrency = inter_str.split('\n')[0]
max_concurrency = inter_str.split("\n")[0]
os.environ["OMP_NUM_THREADS"] = max_concurrency
print(f"environmental variable OMP_NUM_THREADS is set to {max_concurrency}.")
@ -170,7 +177,7 @@ def tensor_parallelize(model: torch.nn.Module, pg: ProcessGroup):
for mn, module in model.named_modules():
for pn, param in module.named_parameters(recurse=False):
# NOTE() a param maybe shared by two modules
if hasattr(param, 'visited'):
if hasattr(param, "visited"):
continue
# if shard init, then convert param to replica and use the dp-only ProcessGroup
@ -179,22 +186,22 @@ def tensor_parallelize(model: torch.nn.Module, pg: ProcessGroup):
param.set_process_group(pg)
# shard it w.r.t tp pattern
if 'mlp.c_fc' in mn:
if 'weight' in pn or 'bias' in pn:
split_param_col_tp1d(param, pg) # colmn slice
if "mlp.c_fc" in mn:
if "weight" in pn or "bias" in pn:
split_param_col_tp1d(param, pg) # colmn slice
# keep the shape of the output from c_fc
param.compute_spec.set_output_replicate(False)
else:
param.set_dist_spec(ReplicaSpec())
elif 'mlp.c_proj' in mn:
if 'weight' in pn:
split_param_row_tp1d(param, pg) # row slice
elif "mlp.c_proj" in mn:
if "weight" in pn:
split_param_row_tp1d(param, pg) # row slice
else:
param.set_dist_spec(ReplicaSpec())
elif 'wte' in mn or 'wpe' in mn:
split_param_col_tp1d(param, pg) # colmn slice
elif 'c_attn' in mn or 'c_proj' in mn:
split_param_col_tp1d(param, pg) # colmn slice
elif "wte" in mn or "wpe" in mn:
split_param_col_tp1d(param, pg) # colmn slice
elif "c_attn" in mn or "c_proj" in mn:
split_param_col_tp1d(param, pg) # colmn slice
else:
param.set_dist_spec(ReplicaSpec())
param.visited = True
@ -209,7 +216,13 @@ def main():
args = parse_args()
# if args.distplan not in ["colossalai", "torch_ddp", "torch_zero", "zero1", "zero2"]:
if args.distplan not in ["CAI_ZeRO1", "CAI_ZeRO2", "CAI_Gemini", "Pytorch_DDP", "Pytorch_ZeRO"]:
if args.distplan not in [
"CAI_ZeRO1",
"CAI_ZeRO2",
"CAI_Gemini",
"Pytorch_DDP",
"Pytorch_ZeRO",
]:
raise TypeError(f"{args.distplan} is error")
# batch size per DP degree
@ -221,14 +234,18 @@ def main():
WARMUP_STEPS = 1
assert WARMUP_STEPS < NUM_STEPS, "warmup steps should smaller than the total steps"
assert (NUM_STEPS - WARMUP_STEPS) % 2 == 1, "the number of valid steps should be odd to take the median"
PROF_FLAG = False # The flag of profiling, False by default
assert (
NUM_STEPS - WARMUP_STEPS
) % 2 == 1, "the number of valid steps should be odd to take the median"
PROF_FLAG = False # The flag of profiling, False by default
disable_existing_loggers()
colossalai.launch_from_torch(config={})
logger = get_dist_logger()
logger.info(f"{args.model_type}, {args.distplan}, batch size {BATCH_SIZE}", ranks=[0])
logger.info(
f"{args.model_type}, {args.distplan}, batch size {BATCH_SIZE}", ranks=[0]
)
# build criterion
criterion = GPTLMLoss()
@ -244,10 +261,12 @@ def main():
raise RuntimeError("You can only use shardinit with CAI_Gemini")
# build GPT model
with ColoInitContext(device=get_current_device(),
dtype=torch.half,
default_dist_spec=default_dist_spec,
default_pg=shard_pg):
with ColoInitContext(
device=get_current_device(),
dtype=torch.half,
default_dist_spec=default_dist_spec,
default_pg=shard_pg,
):
model = model_builder(VOCAB_SIZE, checkpoint=True)
tp_pg = ProcessGroup(tp_degree=args.tp_degree)
@ -259,15 +278,21 @@ def main():
# assign running configurations
gemini_config = None
if args.distplan.startswith("CAI_ZeRO"):
optim_config = dict(reduce_bucket_size=12 * 1024 * 1024, overlap_communication=True, verbose=True)
optim_config = dict(
reduce_bucket_size=12 * 1024 * 1024,
overlap_communication=True,
verbose=True,
)
elif args.distplan == "CAI_Gemini":
gemini_config = dict(strict_ddp_mode=args.tp_degree == 1,
device=get_current_device(),
placement_policy=args.placement,
pin_memory=True,
hidden_dim=model.model.config.hidden_size,
search_range_mb=128)
optim_config = dict(gpu_margin_mem_ratio=0.)
gemini_config = dict(
strict_ddp_mode=args.tp_degree == 1,
device=get_current_device(),
placement_policy=args.placement,
pin_memory=True,
hidden_dim=model.model.config.hidden_size,
search_range_mb=128,
)
optim_config = dict(gpu_margin_mem_ratio=0.0)
else:
raise RuntimeError
@ -287,7 +312,7 @@ def main():
model = zero_model_wrapper(model, zero_stage, gemini_config)
optimizer = zero_optim_wrapper(model, optimizer, optim_config=optim_config)
logger.info(get_mem_info(prefix='After init optim, '), ranks=[0])
logger.info(get_mem_info(prefix="After init optim, "), ranks=[0])
elif args.distplan.startswith("Pytorch"):
assert args.tp_degree == 1, "The degree of TP should be 1 for DDP examples."
model = model_builder(VOCAB_SIZE, checkpoint=True).cuda()
@ -296,14 +321,17 @@ def main():
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
elif args.distplan.endswith("ZeRO"):
from torch.distributed.optim import ZeroRedundancyOptimizer
optimizer = ZeroRedundancyOptimizer(model.parameters(), optimizer_class=torch.optim.Adam, lr=1e-3)
optimizer = ZeroRedundancyOptimizer(
model.parameters(), optimizer_class=torch.optim.Adam, lr=1e-3
)
else:
raise RuntimeError
# model is shared after TP
numel = get_model_size(model)
logger.info(f"the size of testing model size is {model_size_formatter(numel)}.")
logger.info(get_mem_info(prefix='After init model, '), ranks=[0])
logger.info(get_mem_info(prefix="After init model, "), ranks=[0])
# Tflops_per_GPU = global_batch * global_numel * seq_len * 8 / #gpu
# = (batch_per_DP_group * dp_degree) * (numel * tp_degree) * seq_len * 8 / (tp_degree * dp_degree)
@ -325,7 +353,7 @@ def main():
torch.cuda.synchronize()
fwd_end = time()
fwd_time = fwd_end - start
logger.info(get_mem_info(prefix=f'[{n + 1}/{NUM_STEPS}] Forward '), ranks=[0])
logger.info(get_mem_info(prefix=f"[{n + 1}/{NUM_STEPS}] Forward "), ranks=[0])
if args.distplan.startswith("CAI"):
optimizer.backward(loss)
@ -337,13 +365,15 @@ def main():
torch.cuda.synchronize()
bwd_end = time()
bwd_time = bwd_end - fwd_end
logger.info(get_mem_info(prefix=f'[{n + 1}/{NUM_STEPS}] Backward '), ranks=[0])
logger.info(get_mem_info(prefix=f"[{n + 1}/{NUM_STEPS}] Backward "), ranks=[0])
optimizer.step()
torch.cuda.synchronize()
optim_time = time() - bwd_end
step_time = time() - start
logger.info(get_mem_info(prefix=f'[{n + 1}/{NUM_STEPS}] Optimizer step '), ranks=[0])
logger.info(
get_mem_info(prefix=f"[{n + 1}/{NUM_STEPS}] Optimizer step "), ranks=[0]
)
step_tflops = get_tflops_func(step_time)
logger.info(
@ -353,10 +383,12 @@ def main():
if n >= WARMUP_STEPS:
tflops_list.append(step_tflops)
demo_profiler = get_profile_context(PROF_FLAG,
WARMUP_STEPS,
NUM_STEPS - WARMUP_STEPS,
save_dir=f"profile/{get_time_stamp()}-demo")
demo_profiler = get_profile_context(
PROF_FLAG,
WARMUP_STEPS,
NUM_STEPS - WARMUP_STEPS,
save_dir=f"profile/{get_time_stamp()}-demo",
)
with demo_profiler as prof:
start_time = time()
@ -364,7 +396,7 @@ def main():
train_step()
prof.step()
end_time = time()
print('total time: {}'.format(end_time - start_time))
print("total time: {}".format(end_time - start_time))
tflops_list.sort()
median_index = ((NUM_STEPS - WARMUP_STEPS) >> 1) + WARMUP_STEPS
@ -372,5 +404,5 @@ def main():
torch.cuda.synchronize()
if __name__ == '__main__':
if __name__ == "__main__":
main()

View File

@ -2,11 +2,15 @@ import time
from contextlib import nullcontext
import torch
from torch.profiler import ProfilerActivity, profile, schedule, tensorboard_trace_handler
from torch.profiler import (
ProfilerActivity,
profile,
schedule,
tensorboard_trace_handler,
)
class DummyProfiler:
def __init__(self):
self.step_number = 0
@ -16,7 +20,9 @@ class DummyProfiler:
# Randomly Generated Data
def get_data(batch_size, seq_len, vocab_size):
input_ids = torch.randint(0, vocab_size, (batch_size, seq_len), device=torch.cuda.current_device())
input_ids = torch.randint(
0, vocab_size, (batch_size, seq_len), device=torch.cuda.current_device()
)
attention_mask = torch.ones_like(input_ids)
return input_ids, attention_mask
@ -27,15 +33,17 @@ def get_tflops(model_numel, batch_size, seq_len, step_time):
def get_profile_context(enable_flag, warmup_steps, active_steps, save_dir):
if enable_flag:
return profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
schedule=schedule(wait=0, warmup=warmup_steps, active=active_steps),
on_trace_ready=tensorboard_trace_handler(save_dir),
record_shapes=True,
profile_memory=True)
return profile(
activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
schedule=schedule(wait=0, warmup=warmup_steps, active=active_steps),
on_trace_ready=tensorboard_trace_handler(save_dir),
record_shapes=True,
profile_memory=True,
)
else:
return nullcontext(DummyProfiler())
def get_time_stamp():
cur_time = time.strftime("%d-%H:%M", time.localtime())
return cur_time
return cur_time

View File

@ -1,12 +1,12 @@
"""
Author: LiangSong(sl12160010@gmail.com)
Author: s-JoL(sl12160010@gmail.com)
Date: 2023-04-11 20:07:35
LastEditors: LiangSong(sl12160010@gmail.com)
LastEditors: s-JoL(sl12160010@gmail.com)
LastEditTime: 2023-04-11 21:56:07
FilePath: /Open-Llama/speed_test/lightning/run.py
Description:
Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved.
Copyright (c) 2023 by s-JoL(sl12160010@gmail.com), All Rights Reserved.
"""
import time
import torch
@ -22,13 +22,15 @@ vocab_size = 32000
total_step = 100
use_activation_ckpt = False
class FakeSet(torch.utils.data.Dataset):
def __getitem__(self, idx):
return torch.randint(0, vocab_size, (seq_length, ))
return torch.randint(0, vocab_size, (seq_length,))
def __len__(self):
return 1000000000
class SpeedTest(pl.LightningModule):
def __init__(self):
super().__init__()
@ -45,7 +47,7 @@ class SpeedTest(pl.LightningModule):
out = self.model(batch, labels=batch)
loss = out.loss
if self.start_time is None:
print('start')
print("start")
self.start_time = time.time()
return loss
@ -53,23 +55,26 @@ class SpeedTest(pl.LightningModule):
optimizer = FusedAdam(self.trainer.model.parameters(), lr=1e-5)
return optimizer
model = SpeedTest()
train_loader = torch.utils.data.DataLoader(FakeSet(), batch_size=batch_size)
strategy=DeepSpeedStrategy(
strategy = DeepSpeedStrategy(
stage=2,
offload_optimizer=False,
offload_parameters=False,
process_group_backend="nccl"
process_group_backend="nccl",
)
trainer = pl.Trainer(
limit_train_batches=total_step,
limit_train_batches=total_step,
max_epochs=1,
devices=8,
accelerator="gpu",
strategy=strategy,
precision=16,
enable_checkpointing=False)
precision=16,
enable_checkpointing=False,
)
def train(model, train_loader):
start_time = time.time()
@ -77,4 +82,5 @@ def train(model, train_loader):
end_time = time.time()
return end_time - model.start_time
print('total time: {}'.format(train(model, train_loader)))
print("total time: {}".format(train(model, train_loader)))

View File

@ -1,33 +1,43 @@
"""
Author: LiangSong(sl12160010@gmail.com)
Author: s-JoL(sl12160010@gmail.com)
Date: 2023-03-24 20:49:03
LastEditors: LiangSong(sl12160010@gmail.com)
LastEditTime: 2023-04-05 22:40:29
FilePath: /Open-Llama/dataset/train_tokenizer.py
LastEditors: s-JoL(sl12160010@gmail.com)
LastEditTime: 2023-05-06 23:34:14
FilePath: /Open-Llama/utils/train_tokenizer.py
Description:
Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved.
Copyright (c) 2023 by s-JoL(sl12160010@gmail.com), All Rights Reserved.
"""
import random
from dataset.data_iter import DataIter, create_shard_kwargs
from glob import glob
from datasets import load_dataset
wudao_patterns = [
"data/pretrain_data/part-wudao-*.jsonl.zst",
]
wudao_paths = create_shard_kwargs(wudao_patterns)
random.seed(42)
wudao_pattern = "data/pretrain_data/part-wudao-*.jsonl.zst"
wudao_paths = glob(wudao_pattern)
random.shuffle(wudao_paths)
pile_patterns = [
"data/pretrain_data/part-pile-*.jsonl.zst",
]
pile_paths = create_shard_kwargs(pile_patterns)
pile_pattern = "data/pretrain_data/part-pile-*.jsonl.zst"
pile_paths = glob(pile_pattern)
random.shuffle(pile_paths)
paths = wudao_paths[:5] + pile_paths[:10]
transform_dict = {
"wudao": lambda line: line["title"] + "\n" + line["content"],
"pile": lambda line: line["text"],
}
data_iter = iter(DataIter(paths, transform_dict))
dataset = load_dataset("json", data_files=paths, split="train", streaming=True)
dataset = dataset.shuffle(seed=42)
def transform(dataset):
for line in dataset:
if "title" in line and "content" in line:
yield line["title"] + "\n" + line["content"]
else:
yield line["text"]
data_iter = transform(dataset)
import io
import sentencepiece as spm
@ -52,11 +62,12 @@ spm.SentencePieceTrainer.train(
# reserve whitespace and \n and \t etc. for code generation
allow_whitespace_only_pieces=True,
remove_extra_whitespaces=False,
# Llama uses identity instead of nfkc
normalization_rule_name="nfkc",
)
# Serialize the model as file.
with open("configs/10w_vocab_wudao5_pile10.model", "wb") as f:
with open("configs/tokenizer_models/10w_vocab_wudao5_pile10.model", "wb") as f:
f.write(model.getvalue())
# Directly load the model from serialized model.