From dba2e2d680623462e6b5d559b82dd0979c816451 Mon Sep 17 00:00:00 2001
From: LiangSong <sl12160010@gmail.com>
Date: Thu, 4 May 2023 08:34:38 +0800
Subject: [PATCH] update ShareGPT_90K preprocess

---
 README.md                      | 6 +++---
 README_en.md                   | 6 +++---
 data/download_instruct.sh      | 6 +++---
 data/preprocess_instruction.py | 8 ++++----
 4 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/README.md b/README.md
index 61fb6f3..46ad9de 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@
  * @Author: LiangSong(sl12160010@gmail.com)
  * @Date: 2023-03-10 21:18:35
  * @LastEditors: LiangSong(sl12160010@gmail.com)
- * @LastEditTime: 2023-04-29 20:29:31
+ * @LastEditTime: 2023-05-04 08:33:26
  * @FilePath: /Open-Llama/README.md
  * @Description: 
  * 
@@ -252,10 +252,10 @@ Total mult-adds (G): 7.04
 - [BelleGroup/train_1M_CN](https://huggingface.co/datasets/BelleGroup/train_1M_CN)
 - [BelleGroup/multiturn_chat_0.8M](https://huggingface.co/datasets/BelleGroup/multiturn_chat_0.8M)
 - [BelleGroup/school_math_0.25M](https://huggingface.co/datasets/BelleGroup/school_math_0.25M)
-- [RyokoAI/ShareGPT52K](https://huggingface.co/datasets/RyokoAI/ShareGPT52K)
+- [anon8231489123/ShareGPT_Vicuna_unfiltered](https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered)
 - [Graverman/Instruct-to-Code](https://huggingface.co/datasets/Graverman/Instruct-to-Code)
 
-其中ShareGPT52K数据在datastes的处理有些问题,我们直接下载原数据重新进行了处理。
+其中ShareGPT_Vicuna_unfiltered数据在datasets的处理有些问题,我们直接下载原数据重新进行了处理。
 我们对原始数据进行了一些预处理,格式如下
 ```
 user: {prompt}\nsystem: {completion}
diff --git a/README_en.md b/README_en.md
index b6e2f60..03704d1 100644
--- a/README_en.md
+++ b/README_en.md
@@ -2,7 +2,7 @@
  * @Author: LiangSong(sl12160010@gmail.com)
  * @Date: 2023-03-10 21:18:35
  * @LastEditors: LiangSong(sl12160010@gmail.com)
- * @LastEditTime: 2023-04-29 20:30:12
+ * @LastEditTime: 2023-05-04 08:33:45
  * @FilePath: /Open-Llama/README_en.md
  * @Description: 
  * 
@@ -266,10 +266,10 @@ We use the currently available seven datasets for Instruction-tuning, and more t
 - [BelleGroup/train_1M_CN](https://huggingface.co/datasets/BelleGroup/train_1M_CN)
 - [BelleGroup/multiturn_chat_0.8M](https://huggingface.co/datasets/BelleGroup/multiturn_chat_0.8M)
 - [BelleGroup/school_math_0.25M](https://huggingface.co/datasets/BelleGroup/school_math_0.25M)
-- [RyokoAI/ShareGPT52K](https://huggingface.co/datasets/RyokoAI/ShareGPT52K)
+- [anon8231489123/ShareGPT_Vicuna_unfiltered](https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered)
 - [Graverman/Instruct-to-Code](https://huggingface.co/datasets/Graverman/Instruct-to-Code)
 
-The ShareGPT52K dataset has some issues in the datastes processing, so we directly downloaded the original data and reprocessed it.
+The ShareGPT_Vicuna_unfiltered dataset has some issues in the datasets processing, so we directly downloaded the original data and reprocessed it.
 We performed some preprocessing on the original data, with the format as follows:
 
 ```
diff --git a/data/download_instruct.sh b/data/download_instruct.sh
index b916bd6..e781158 100644
--- a/data/download_instruct.sh
+++ b/data/download_instruct.sh
@@ -3,13 +3,13 @@
 # @Author: LiangSong(sl12160010@gmail.com)
 # @Date: 2023-04-05 23:18:10
 # @LastEditors: LiangSong(sl12160010@gmail.com)
-# @LastEditTime: 2023-04-05 23:34:30
+# @LastEditTime: 2023-05-04 08:24:17
 # @FilePath: /Open-Llama/data/download_instruct.sh
 # @Description: 
 # 
 # Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved. 
 ###
 mkdir data/instruction_data
-curl -C - --retry 3 'https://huggingface.co/datasets/RyokoAI/ShareGPT52K/resolve/main/sg_90k_part1.json' -o data/sg_90k_part1.json
-curl -C - --retry 3 'https://huggingface.co/datasets/RyokoAI/ShareGPT52K/resolve/main/sg_90k_part2.json' -o data/sg_90k_part2.json
+wget -c --tries 3 'https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/HTML_cleaned_raw_dataset/sg_90k_part1_html_cleaned.json' -O data/sg_90k_part1_html_cleaned.json
+wget -c --tries 3 'https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/HTML_cleaned_raw_dataset/sg_90k_part2_html_cleaned.json' -O data/sg_90k_part2_html_cleaned.json
 python3 data/preprocess_instruction.py
\ No newline at end of file
diff --git a/data/preprocess_instruction.py b/data/preprocess_instruction.py
index 506120b..a096a34 100644
--- a/data/preprocess_instruction.py
+++ b/data/preprocess_instruction.py
@@ -2,7 +2,7 @@
 Author: LiangSong(sl12160010@gmail.com)
 Date: 2023-03-30 20:52:10
 LastEditors: LiangSong(sl12160010@gmail.com)
-LastEditTime: 2023-04-05 23:51:16
+LastEditTime: 2023-05-04 08:32:04
 FilePath: /Open-Llama/data/preprocess_instruction.py
 Description: 
 
@@ -145,9 +145,9 @@ write_path = root_dir + "/instruction_data/part-sharegpt_90K-{}.jsonl.zst"
 total_num = 0
 file_num = 1
 wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
-with open("data/sg_90k_part1.json", "r") as fp:
+with open("{}/sg_90k_part1_html_cleaned.json".format(root_dir), "r") as fp:
     data1 = json.load(fp)
-with open("data/sg_90k_part2.json", "r") as fp:
+with open("{}/sg_90k_part2_html_cleaned.json".format(root_dir), "r") as fp:
     data2 = json.load(fp)
 data = data1 + data2
 for line in data:
@@ -161,7 +161,7 @@ for line in data:
     total_num += 1
 wfp.close()
 print(
-    "RyokoAI/ShareGPT52K preprocess done. Total line: {}, Total file: {}".format(
+    "anon8231489123/ShareGPT_Vicuna_unfiltered preprocess done. Total line: {}, Total file: {}".format(
         total_num, file_num
     )
 )
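
For reference, the net effect of this patch can be summarized outside the diff. Below is a minimal, hypothetical sketch of the updated download-and-reshard flow, assuming data/download_instruct.sh has already created data/instruction_data and fetched both HTML-cleaned JSON parts; the shard size and the per-record handling here are placeholders rather than values taken from the repository, and the authoritative logic remains data/preprocess_instruction.py.

```python
# Sketch only (not the repository's exact code): merge the two HTML-cleaned
# ShareGPT parts and re-shard them as zstd-compressed JSONL, mirroring the calls
# visible in the hunks above. The shard size and per-record handling are assumptions.
import json

import zstandard as zstd

root_dir = "data"
write_path = root_dir + "/instruction_data/part-sharegpt_90K-{}.jsonl.zst"
lines_per_shard = 16384  # assumed value; the real limit is defined elsewhere in the script

with open("{}/sg_90k_part1_html_cleaned.json".format(root_dir), "r") as fp:
    data1 = json.load(fp)
with open("{}/sg_90k_part2_html_cleaned.json".format(root_dir), "r") as fp:
    data2 = json.load(fp)
data = data1 + data2

total_num = 0
file_num = 1
wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
for record in data:
    # The real script applies its own conversation formatting here; this sketch
    # simply serializes each record as one JSON line.
    line = json.dumps(record, ensure_ascii=False)
    # Rotate to a new shard once the current one reaches the (assumed) line limit.
    if total_num % lines_per_shard == 0 and total_num > 0:
        wfp.close()
        file_num += 1
        wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
    wfp.write(line.encode("utf-8"))
    wfp.write(b"\n")
    total_num += 1
wfp.close()
print("wrote {} conversations across {} shard(s)".format(total_num, file_num))
```

The download step itself now uses wget -c --tries 3, which, like the curl -C - --retry 3 invocation it replaces, resumes partial downloads and retries a few times on failure.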