update ShareGPT_90K preprocess
This commit is contained in:
		
							parent
							
								
									154456c976
								
							
						
					
					
						commit
						dba2e2d680
					
				|  | @ -2,7 +2,7 @@ | ||||||
|  * @Author: LiangSong(sl12160010@gmail.com) |  * @Author: LiangSong(sl12160010@gmail.com) | ||||||
|  * @Date: 2023-03-10 21:18:35 |  * @Date: 2023-03-10 21:18:35 | ||||||
|  * @LastEditors: LiangSong(sl12160010@gmail.com) |  * @LastEditors: LiangSong(sl12160010@gmail.com) | ||||||
|  * @LastEditTime: 2023-04-29 20:29:31 |  * @LastEditTime: 2023-05-04 08:33:26 | ||||||
|  * @FilePath: /Open-Llama/README.md |  * @FilePath: /Open-Llama/README.md | ||||||
|  * @Description:  |  * @Description:  | ||||||
|  *  |  *  | ||||||
|  | @ -252,10 +252,10 @@ Total mult-adds (G): 7.04 | ||||||
| - [BelleGroup/train_1M_CN](https://huggingface.co/datasets/BelleGroup/train_1M_CN) | - [BelleGroup/train_1M_CN](https://huggingface.co/datasets/BelleGroup/train_1M_CN) | ||||||
| - [BelleGroup/multiturn_chat_0.8M](https://huggingface.co/datasets/BelleGroup/multiturn_chat_0.8M) | - [BelleGroup/multiturn_chat_0.8M](https://huggingface.co/datasets/BelleGroup/multiturn_chat_0.8M) | ||||||
| - [BelleGroup/school_math_0.25M](https://huggingface.co/datasets/BelleGroup/school_math_0.25M) | - [BelleGroup/school_math_0.25M](https://huggingface.co/datasets/BelleGroup/school_math_0.25M) | ||||||
| - [RyokoAI/ShareGPT52K](https://huggingface.co/datasets/RyokoAI/ShareGPT52K) | - [anon8231489123/ShareGPT_Vicuna_unfiltered](https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered) | ||||||
| - [Graverman/Instruct-to-Code](https://huggingface.co/datasets/Graverman/Instruct-to-Code) | - [Graverman/Instruct-to-Code](https://huggingface.co/datasets/Graverman/Instruct-to-Code) | ||||||
| 
 | 
 | ||||||
| 其中ShareGPT52K数据在datasets的处理有些问题,我们直接下载原数据重新进行了处理。 | 其中ShareGPT_Vicuna_unfiltered数据在datasets的处理有些问题,我们直接下载原数据重新进行了处理。 | ||||||
| 我们对原始数据进行了一些预处理,格式如下 | 我们对原始数据进行了一些预处理,格式如下 | ||||||
| ``` | ``` | ||||||
| user: {prompt}\nsystem: {completion}</s> | user: {prompt}\nsystem: {completion}</s> | ||||||
|  |  | ||||||
|  | @ -2,7 +2,7 @@ | ||||||
|  * @Author: LiangSong(sl12160010@gmail.com) |  * @Author: LiangSong(sl12160010@gmail.com) | ||||||
|  * @Date: 2023-03-10 21:18:35 |  * @Date: 2023-03-10 21:18:35 | ||||||
|  * @LastEditors: LiangSong(sl12160010@gmail.com) |  * @LastEditors: LiangSong(sl12160010@gmail.com) | ||||||
|  * @LastEditTime: 2023-04-29 20:30:12 |  * @LastEditTime: 2023-05-04 08:33:45 | ||||||
|  * @FilePath: /Open-Llama/README_en.md |  * @FilePath: /Open-Llama/README_en.md | ||||||
|  * @Description:  |  * @Description:  | ||||||
|  *  |  *  | ||||||
|  | @ -266,10 +266,10 @@ We use the currently available seven datasets for Instruction-tuning, and more t | ||||||
| - [BelleGroup/train_1M_CN](https://huggingface.co/datasets/BelleGroup/train_1M_CN) | - [BelleGroup/train_1M_CN](https://huggingface.co/datasets/BelleGroup/train_1M_CN) | ||||||
| - [BelleGroup/multiturn_chat_0.8M](https://huggingface.co/datasets/BelleGroup/multiturn_chat_0.8M) | - [BelleGroup/multiturn_chat_0.8M](https://huggingface.co/datasets/BelleGroup/multiturn_chat_0.8M) | ||||||
| - [BelleGroup/school_math_0.25M](https://huggingface.co/datasets/BelleGroup/school_math_0.25M) | - [BelleGroup/school_math_0.25M](https://huggingface.co/datasets/BelleGroup/school_math_0.25M) | ||||||
| - [RyokoAI/ShareGPT52K](https://huggingface.co/datasets/RyokoAI/ShareGPT52K) | - [anon8231489123/ShareGPT_Vicuna_unfiltered](https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered) | ||||||
| - [Graverman/Instruct-to-Code](https://huggingface.co/datasets/Graverman/Instruct-to-Code) | - [Graverman/Instruct-to-Code](https://huggingface.co/datasets/Graverman/Instruct-to-Code) | ||||||
| 
 | 
 | ||||||
| The ShareGPT52K dataset has some issues in the datasets processing, so we directly downloaded the original data and reprocessed it. | The ShareGPT_Vicuna_unfiltered dataset has some issues in the datasets processing, so we directly downloaded the original data and reprocessed it. | ||||||
| We performed some preprocessing on the original data, with the format as follows: | We performed some preprocessing on the original data, with the format as follows: | ||||||
| 
 | 
 | ||||||
| ``` | ``` | ||||||
|  |  | ||||||
|  | @ -3,13 +3,13 @@ | ||||||
|  # @Author: LiangSong(sl12160010@gmail.com) |  # @Author: LiangSong(sl12160010@gmail.com) | ||||||
|  # @Date: 2023-04-05 23:18:10 |  # @Date: 2023-04-05 23:18:10 | ||||||
|  # @LastEditors: LiangSong(sl12160010@gmail.com) |  # @LastEditors: LiangSong(sl12160010@gmail.com) | ||||||
|  # @LastEditTime: 2023-04-05 23:34:30 |  # @LastEditTime: 2023-05-04 08:24:17 | ||||||
|  # @FilePath: /Open-Llama/data/download_instruct.sh |  # @FilePath: /Open-Llama/data/download_instruct.sh | ||||||
|  # @Description:  |  # @Description:  | ||||||
|  #  |  #  | ||||||
|  # Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved.  |  # Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved.  | ||||||
| ###  | ###  | ||||||
| mkdir data/instruction_data | mkdir data/instruction_data | ||||||
| curl -C - --retry 3 'https://huggingface.co/datasets/RyokoAI/ShareGPT52K/resolve/main/sg_90k_part1.json' -o data/sg_90k_part1.json | wget -c --tries 3 'https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/HTML_cleaned_raw_dataset/sg_90k_part1_html_cleaned.json' -O data/sg_90k_part1_html_cleaned.json | ||||||
| curl -C - --retry 3 'https://huggingface.co/datasets/RyokoAI/ShareGPT52K/resolve/main/sg_90k_part2.json' -o data/sg_90k_part2.json | wget -c --tries 3 'https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/HTML_cleaned_raw_dataset/sg_90k_part2_html_cleaned.json' -O data/sg_90k_part2_html_cleaned.json | ||||||
| python3 data/preprocess_instruction.py | python3 data/preprocess_instruction.py | ||||||
|  | @ -2,7 +2,7 @@ | ||||||
| Author: LiangSong(sl12160010@gmail.com) | Author: LiangSong(sl12160010@gmail.com) | ||||||
| Date: 2023-03-30 20:52:10 | Date: 2023-03-30 20:52:10 | ||||||
| LastEditors: LiangSong(sl12160010@gmail.com) | LastEditors: LiangSong(sl12160010@gmail.com) | ||||||
| LastEditTime: 2023-04-05 23:51:16 | LastEditTime: 2023-05-04 08:32:04 | ||||||
| FilePath: /Open-Llama/data/preprocess_instruction.py | FilePath: /Open-Llama/data/preprocess_instruction.py | ||||||
| Description:  | Description:  | ||||||
| 
 | 
 | ||||||
|  | @ -145,9 +145,9 @@ write_path = root_dir + "/instruction_data/part-sharegpt_90K-{}.jsonl.zst" | ||||||
| total_num = 0 | total_num = 0 | ||||||
| file_num = 1 | file_num = 1 | ||||||
| wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8") | wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8") | ||||||
| with open("data/sg_90k_part1.json", "r") as fp: | with open("{}/sg_90k_part1_html_cleaned.json".format(root_dir), "r") as fp: | ||||||
|     data1 = json.load(fp) |     data1 = json.load(fp) | ||||||
| with open("data/sg_90k_part2.json", "r") as fp: | with open("{}/sg_90k_part2_html_cleaned.json".format(root_dir), "r") as fp: | ||||||
|     data2 = json.load(fp) |     data2 = json.load(fp) | ||||||
| data = data1 + data2 | data = data1 + data2 | ||||||
| for line in data: | for line in data: | ||||||
|  | @ -161,7 +161,7 @@ for line in data: | ||||||
|     total_num += 1 |     total_num += 1 | ||||||
| wfp.close() | wfp.close() | ||||||
| print( | print( | ||||||
|     "RyokoAI/ShareGPT52K preprocess done. Total line: {}, Total file: {}".format( |     "anon8231489123/ShareGPT_Vicuna_unfiltered preprocess done. Total line: {}, Total file: {}".format( | ||||||
|         total_num, file_num |         total_num, file_num | ||||||
|     ) |     ) | ||||||
| ) | ) | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue
	
	Block a user
	 LiangSong
						LiangSong