update ShareGPT_90K preprocess
This commit is contained in:
parent
154456c976
commit
dba2e2d680
|
@ -2,7 +2,7 @@
|
||||||
* @Author: LiangSong(sl12160010@gmail.com)
|
* @Author: LiangSong(sl12160010@gmail.com)
|
||||||
* @Date: 2023-03-10 21:18:35
|
* @Date: 2023-03-10 21:18:35
|
||||||
* @LastEditors: LiangSong(sl12160010@gmail.com)
|
* @LastEditors: LiangSong(sl12160010@gmail.com)
|
||||||
* @LastEditTime: 2023-04-29 20:29:31
|
* @LastEditTime: 2023-05-04 08:33:26
|
||||||
* @FilePath: /Open-Llama/README.md
|
* @FilePath: /Open-Llama/README.md
|
||||||
* @Description:
|
* @Description:
|
||||||
*
|
*
|
||||||
|
@ -252,10 +252,10 @@ Total mult-adds (G): 7.04
|
||||||
- [BelleGroup/train_1M_CN](https://huggingface.co/datasets/BelleGroup/train_1M_CN)
|
- [BelleGroup/train_1M_CN](https://huggingface.co/datasets/BelleGroup/train_1M_CN)
|
||||||
- [BelleGroup/multiturn_chat_0.8M](https://huggingface.co/datasets/BelleGroup/multiturn_chat_0.8M)
|
- [BelleGroup/multiturn_chat_0.8M](https://huggingface.co/datasets/BelleGroup/multiturn_chat_0.8M)
|
||||||
- [BelleGroup/school_math_0.25M](https://huggingface.co/datasets/BelleGroup/school_math_0.25M)
|
- [BelleGroup/school_math_0.25M](https://huggingface.co/datasets/BelleGroup/school_math_0.25M)
|
||||||
- [RyokoAI/ShareGPT52K](https://huggingface.co/datasets/RyokoAI/ShareGPT52K)
|
- [anon8231489123/ShareGPT_Vicuna_unfiltered](https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered)
|
||||||
- [Graverman/Instruct-to-Code](https://huggingface.co/datasets/Graverman/Instruct-to-Code)
|
- [Graverman/Instruct-to-Code](https://huggingface.co/datasets/Graverman/Instruct-to-Code)
|
||||||
|
|
||||||
其中ShareGPT52K数据在datastes的处理有些问题,我们直接下载原数据重新进行了处理。
|
其中ShareGPT_Vicuna_unfiltered数据在datasets的处理有些问题,我们直接下载原数据重新进行了处理。
|
||||||
我们对原始数据进行了一些预处理,格式如下
|
我们对原始数据进行了一些预处理,格式如下
|
||||||
```
|
```
|
||||||
user: {prompt}\nsystem: {completion}</s>
|
user: {prompt}\nsystem: {completion}</s>
|
||||||
|
|
|
@ -2,7 +2,7 @@
|
||||||
* @Author: LiangSong(sl12160010@gmail.com)
|
* @Author: LiangSong(sl12160010@gmail.com)
|
||||||
* @Date: 2023-03-10 21:18:35
|
* @Date: 2023-03-10 21:18:35
|
||||||
* @LastEditors: LiangSong(sl12160010@gmail.com)
|
* @LastEditors: LiangSong(sl12160010@gmail.com)
|
||||||
* @LastEditTime: 2023-04-29 20:30:12
|
* @LastEditTime: 2023-05-04 08:33:45
|
||||||
* @FilePath: /Open-Llama/README_en.md
|
* @FilePath: /Open-Llama/README_en.md
|
||||||
* @Description:
|
* @Description:
|
||||||
*
|
*
|
||||||
|
@ -266,10 +266,10 @@ We use the currently available seven datasets for Instruction-tuning, and more t
|
||||||
- [BelleGroup/train_1M_CN](https://huggingface.co/datasets/BelleGroup/train_1M_CN)
|
- [BelleGroup/train_1M_CN](https://huggingface.co/datasets/BelleGroup/train_1M_CN)
|
||||||
- [BelleGroup/multiturn_chat_0.8M](https://huggingface.co/datasets/BelleGroup/multiturn_chat_0.8M)
|
- [BelleGroup/multiturn_chat_0.8M](https://huggingface.co/datasets/BelleGroup/multiturn_chat_0.8M)
|
||||||
- [BelleGroup/school_math_0.25M](https://huggingface.co/datasets/BelleGroup/school_math_0.25M)
|
- [BelleGroup/school_math_0.25M](https://huggingface.co/datasets/BelleGroup/school_math_0.25M)
|
||||||
- [RyokoAI/ShareGPT52K](https://huggingface.co/datasets/RyokoAI/ShareGPT52K)
|
- [anon8231489123/ShareGPT_Vicuna_unfiltered](https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered)
|
||||||
- [Graverman/Instruct-to-Code](https://huggingface.co/datasets/Graverman/Instruct-to-Code)
|
- [Graverman/Instruct-to-Code](https://huggingface.co/datasets/Graverman/Instruct-to-Code)
|
||||||
|
|
||||||
The ShareGPT52K dataset has some issues in the datastes processing, so we directly downloaded the original data and reprocessed it.
|
The ShareGPT_Vicuna_unfiltered dataset has some issues in its datasets processing, so we downloaded the original data directly and reprocessed it.
|
||||||
We performed some preprocessing on the original data, with the format as follows:
|
We performed some preprocessing on the original data, with the format as follows:
|
||||||
|
|
||||||
```
|
```
|
||||||
|
|
|
@ -3,13 +3,13 @@
|
||||||
# @Author: LiangSong(sl12160010@gmail.com)
|
# @Author: LiangSong(sl12160010@gmail.com)
|
||||||
# @Date: 2023-04-05 23:18:10
|
# @Date: 2023-04-05 23:18:10
|
||||||
# @LastEditors: LiangSong(sl12160010@gmail.com)
|
# @LastEditors: LiangSong(sl12160010@gmail.com)
|
||||||
# @LastEditTime: 2023-04-05 23:34:30
|
# @LastEditTime: 2023-05-04 08:24:17
|
||||||
# @FilePath: /Open-Llama/data/download_instruct.sh
|
# @FilePath: /Open-Llama/data/download_instruct.sh
|
||||||
# @Description:
|
# @Description:
|
||||||
#
|
#
|
||||||
# Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved.
|
# Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved.
|
||||||
###
|
###
|
||||||
mkdir data/instruction_data
|
mkdir data/instruction_data
|
||||||
curl -C - --retry 3 'https://huggingface.co/datasets/RyokoAI/ShareGPT52K/resolve/main/sg_90k_part1.json' -o data/sg_90k_part1.json
|
wget -c --tries 3 'https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/HTML_cleaned_raw_dataset/sg_90k_part1_html_cleaned.json' -O data/sg_90k_part1_html_cleaned.json
|
||||||
curl -C - --retry 3 'https://huggingface.co/datasets/RyokoAI/ShareGPT52K/resolve/main/sg_90k_part2.json' -o data/sg_90k_part2.json
|
wget -c --tries 3 'https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/HTML_cleaned_raw_dataset/sg_90k_part2_html_cleaned.json' -O data/sg_90k_part2_html_cleaned.json
|
||||||
python3 data/preprocess_instruction.py
|
python3 data/preprocess_instruction.py
|
|
@ -2,7 +2,7 @@
|
||||||
Author: LiangSong(sl12160010@gmail.com)
|
Author: LiangSong(sl12160010@gmail.com)
|
||||||
Date: 2023-03-30 20:52:10
|
Date: 2023-03-30 20:52:10
|
||||||
LastEditors: LiangSong(sl12160010@gmail.com)
|
LastEditors: LiangSong(sl12160010@gmail.com)
|
||||||
LastEditTime: 2023-04-05 23:51:16
|
LastEditTime: 2023-05-04 08:32:04
|
||||||
FilePath: /Open-Llama/data/preprocess_instruction.py
|
FilePath: /Open-Llama/data/preprocess_instruction.py
|
||||||
Description:
|
Description:
|
||||||
|
|
||||||
|
@ -145,9 +145,9 @@ write_path = root_dir + "/instruction_data/part-sharegpt_90K-{}.jsonl.zst"
|
||||||
total_num = 0
|
total_num = 0
|
||||||
file_num = 1
|
file_num = 1
|
||||||
wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
|
wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
|
||||||
with open("data/sg_90k_part1.json", "r") as fp:
|
with open("{}/sg_90k_part1_html_cleaned.json".format(root_dir), "r") as fp:
|
||||||
data1 = json.load(fp)
|
data1 = json.load(fp)
|
||||||
with open("data/sg_90k_part2.json", "r") as fp:
|
with open("{}/sg_90k_part2_html_cleaned.json".format(root_dir), "r") as fp:
|
||||||
data2 = json.load(fp)
|
data2 = json.load(fp)
|
||||||
data = data1 + data2
|
data = data1 + data2
|
||||||
for line in data:
|
for line in data:
|
||||||
|
@ -161,7 +161,7 @@ for line in data:
|
||||||
total_num += 1
|
total_num += 1
|
||||||
wfp.close()
|
wfp.close()
|
||||||
print(
|
print(
|
||||||
"RyokoAI/ShareGPT52K preprocess done. Total line: {}, Total file: {}".format(
|
"anon8231489123/ShareGPT_Vicuna_unfiltered preprocess done. Total line: {}, Total file: {}".format(
|
||||||
total_num, file_num
|
total_num, file_num
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user