update preprocess format
This commit is contained in:
		
							parent
							
								
									85caa97a6a
								
							
						
					
					
						commit
						d24b4cce54
					
				|  | @ -15,221 +15,54 @@ from datasets import load_dataset | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| root_dir = "data" | root_dir = "data" | ||||||
|  | write_path = "data/instruction_data/part-{}-{}.jsonl.zst" | ||||||
|  | dataset_map = { | ||||||
|  |     "yizhongw/self_instruct": "self_instruct", | ||||||
|  |     "BelleGroup/train_0.5M_CN": "belle_0.5M", | ||||||
|  |     "BelleGroup/train_1M_CN": "belle_1M", | ||||||
|  |     "BelleGroup/train_2M_CN": "belle_2M", | ||||||
|  |     "BelleGroup/school_math_0.25M": "belle_school_math_0.25M", | ||||||
|  |     "BelleGroup/multiturn_chat_0.8M": "belle_multiturn_chat_0.8M", | ||||||
|  |     "Graverman/Instruct-to-Code": "instruct_to_code", | ||||||
|  |     ("bigscience/xP3mt", "code"): "xP3mt_code", | ||||||
|  |     ("bigscience/xP3mt", "zh"): "xP3mt_zh", | ||||||
|  | } | ||||||
| 
 | 
 | ||||||
| dataset = load_dataset("yizhongw/self_instruct") | 
 | ||||||
| write_path = root_dir + "/instruction_data/part-self_instruct-{}.jsonl.zst" | def process_hf_dataset(name, local_name): | ||||||
| total_num = 0 |     if isinstance(name, str): | ||||||
| file_num = 1 |         dataset = load_dataset(name) | ||||||
| wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8") |     else: | ||||||
| for line in tqdm(dataset["train"]): |         dataset = load_dataset(*name) | ||||||
|     line = json.dumps(line) |     total_num = 0 | ||||||
|     if total_num % 1024 == 0 and total_num > 0: |     file_num = 1 | ||||||
|         file_num += 1 |     wfp = zstd.open(write_path.format(local_name, file_num), "wb", encoding="utf-8") | ||||||
|         wfp.close() |     for line in tqdm(dataset["train"]): | ||||||
|         wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8") |         line = json.dumps(line) | ||||||
|     wfp.write(line.encode("utf-8")) |         if total_num % 1024 == 0 and total_num > 0: | ||||||
|     wfp.write(b"\n") |             file_num += 1 | ||||||
|     total_num += 1 |             wfp.close() | ||||||
| wfp.close() |             wfp = zstd.open( | ||||||
| print( |                 write_path.format(local_name, file_num), "wb", encoding="utf-8" | ||||||
|     "yizhongw/self_instruct preprocess done. Total line: {}, Total file: {}".format( |             ) | ||||||
|         total_num, file_num |         wfp.write(line.encode("utf-8")) | ||||||
|  |         wfp.write(b"\n") | ||||||
|  |         total_num += 1 | ||||||
|  |     wfp.close() | ||||||
|  |     print( | ||||||
|  |         "{} preprocess done. Total line: {}, Total file: {}".format( | ||||||
|  |             name, total_num, file_num | ||||||
|  |         ) | ||||||
|     ) |     ) | ||||||
| ) |  | ||||||
| 
 | 
 | ||||||
| dataset = load_dataset("BelleGroup/train_0.5M_CN") | 
 | ||||||
| write_path = root_dir + "/instruction_data/part-belle_0.5M-{}.jsonl.zst" | for k, v in dataset_map.items(): | ||||||
|  |     process_hf_dataset(k, v) | ||||||
|  | 
 | ||||||
|  | local_name = "sharegpt_90K" | ||||||
| total_num = 0 | total_num = 0 | ||||||
| file_num = 1 | file_num = 1 | ||||||
| wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8") | wfp = zstd.open(write_path.format(local_name, file_num), "wb", encoding="utf-8") | ||||||
| for line in tqdm(dataset["train"]): |  | ||||||
|     line = json.dumps(line) |  | ||||||
|     if total_num % 1024 == 0 and total_num > 0: |  | ||||||
|         file_num += 1 |  | ||||||
|         wfp.close() |  | ||||||
|         wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8") |  | ||||||
|     wfp.write(line.encode("utf-8")) |  | ||||||
|     wfp.write(b"\n") |  | ||||||
|     total_num += 1 |  | ||||||
| wfp.close() |  | ||||||
| print( |  | ||||||
|     "BelleGroup/train_0.5M_CN preprocess done. Total line: {}, Total file: {}".format( |  | ||||||
|         total_num, file_num |  | ||||||
|     ) |  | ||||||
| ) |  | ||||||
| 
 |  | ||||||
| dataset = load_dataset("BelleGroup/train_1M_CN") |  | ||||||
| write_path = root_dir + "/instruction_data/part-belle_1M-{}.jsonl.zst" |  | ||||||
| total_num = 0 |  | ||||||
| file_num = 1 |  | ||||||
| wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8") |  | ||||||
| for line in tqdm(dataset["train"]): |  | ||||||
|     line = json.dumps(line) |  | ||||||
|     if total_num % 1024 == 0 and total_num > 0: |  | ||||||
|         file_num += 1 |  | ||||||
|         wfp.close() |  | ||||||
|         wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8") |  | ||||||
|     wfp.write(line.encode("utf-8")) |  | ||||||
|     wfp.write(b"\n") |  | ||||||
|     total_num += 1 |  | ||||||
| wfp.close() |  | ||||||
| print( |  | ||||||
|     "BelleGroup/train_1M_CN preprocess done. Total line: {}, Total file: {}".format( |  | ||||||
|         total_num, file_num |  | ||||||
|     ) |  | ||||||
| ) |  | ||||||
| 
 |  | ||||||
| dataset = load_dataset("BelleGroup/train_2M_CN") |  | ||||||
| write_path = root_dir + "/instruction_data/part-belle_2M-{}.jsonl.zst" |  | ||||||
| total_num = 0 |  | ||||||
| file_num = 1 |  | ||||||
| wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8") |  | ||||||
| for line in tqdm(dataset["train"]): |  | ||||||
|     line = json.dumps(line) |  | ||||||
|     if total_num % 1024 == 0 and total_num > 0: |  | ||||||
|         file_num += 1 |  | ||||||
|         wfp.close() |  | ||||||
|         wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8") |  | ||||||
|     wfp.write(line.encode("utf-8")) |  | ||||||
|     wfp.write(b"\n") |  | ||||||
|     total_num += 1 |  | ||||||
| wfp.close() |  | ||||||
| print( |  | ||||||
|     "BelleGroup/train_2M_CN preprocess done. Total line: {}, Total file: {}".format( |  | ||||||
|         total_num, file_num |  | ||||||
|     ) |  | ||||||
| ) |  | ||||||
| 
 |  | ||||||
| dataset = load_dataset("BelleGroup/school_math_0.25M") |  | ||||||
| write_path = root_dir + "/instruction_data/part-belle_school_math_0.25M-{}.jsonl.zst" |  | ||||||
| total_num = 0 |  | ||||||
| file_num = 1 |  | ||||||
| wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8") |  | ||||||
| for line in tqdm(dataset["train"]): |  | ||||||
|     line = json.dumps(line) |  | ||||||
|     if total_num % 1024 == 0 and total_num > 0: |  | ||||||
|         file_num += 1 |  | ||||||
|         wfp.close() |  | ||||||
|         wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8") |  | ||||||
|     wfp.write(line.encode("utf-8")) |  | ||||||
|     wfp.write(b"\n") |  | ||||||
|     total_num += 1 |  | ||||||
| wfp.close() |  | ||||||
| print( |  | ||||||
|     "BelleGroup/school_math_0.25M preprocess done. Total line: {}, Total file: {}".format( |  | ||||||
|         total_num, file_num |  | ||||||
|     ) |  | ||||||
| ) |  | ||||||
| 
 |  | ||||||
| dataset = load_dataset("BelleGroup/multiturn_chat_0.8M") |  | ||||||
| write_path = root_dir + "/instruction_data/part-belle_multiturn_chat_0.8M-{}.jsonl.zst" |  | ||||||
| total_num = 0 |  | ||||||
| file_num = 1 |  | ||||||
| wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8") |  | ||||||
| for line in tqdm(dataset["train"]): |  | ||||||
|     line = json.dumps(line) |  | ||||||
|     if total_num % 1024 == 0 and total_num > 0: |  | ||||||
|         file_num += 1 |  | ||||||
|         wfp.close() |  | ||||||
|         wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8") |  | ||||||
|     wfp.write(line.encode("utf-8")) |  | ||||||
|     wfp.write(b"\n") |  | ||||||
|     total_num += 1 |  | ||||||
| wfp.close() |  | ||||||
| print( |  | ||||||
|     "BelleGroup/multiturn_chat_0.8M preprocess done. Total line: {}, Total file: {}".format( |  | ||||||
|         total_num, file_num |  | ||||||
|     ) |  | ||||||
| ) |  | ||||||
| 
 |  | ||||||
| dataset = load_dataset("Graverman/Instruct-to-Code") |  | ||||||
| write_path = root_dir + "/instruction_data/part-instruct_to_code-{}.jsonl.zst" |  | ||||||
| total_num = 0 |  | ||||||
| file_num = 1 |  | ||||||
| wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8") |  | ||||||
| for line in tqdm(dataset["train"]): |  | ||||||
|     line = json.dumps(line) |  | ||||||
|     if total_num % 1024 == 0 and total_num > 0: |  | ||||||
|         file_num += 1 |  | ||||||
|         wfp.close() |  | ||||||
|         wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8") |  | ||||||
|     wfp.write(line.encode("utf-8")) |  | ||||||
|     wfp.write(b"\n") |  | ||||||
|     total_num += 1 |  | ||||||
| wfp.close() |  | ||||||
| print( |  | ||||||
|     "Graverman/Instruct-to-Code preprocess done. Total line: {}, Total file: {}".format( |  | ||||||
|         total_num, file_num |  | ||||||
|     ) |  | ||||||
| ) |  | ||||||
| 
 |  | ||||||
| # dataset = load_dataset("bigscience/xP3mt", "en") |  | ||||||
| # write_path = root_dir + "/instruction_data/part-bigscience/xP3mt_en-{}.jsonl.zst" |  | ||||||
| # total_num = 0 |  | ||||||
| # file_num = 1 |  | ||||||
| # wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8") |  | ||||||
| # for line in tqdm(dataset["train"]): |  | ||||||
| #     line = json.dumps(line) |  | ||||||
| #     if total_num % 1024 == 0 and total_num > 0: |  | ||||||
| #         file_num += 1 |  | ||||||
| #         wfp.close() |  | ||||||
| #         wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8") |  | ||||||
| #     wfp.write(line.encode("utf-8")) |  | ||||||
| #     wfp.write(b"\n") |  | ||||||
| #     total_num += 1 |  | ||||||
| # wfp.close() |  | ||||||
| # print( |  | ||||||
| #     "bigscience/xP3mt_en preprocess done. Total line: {}, Total file: {}".format( |  | ||||||
| #         total_num, file_num |  | ||||||
| #     ) |  | ||||||
| # ) |  | ||||||
| 
 |  | ||||||
| dataset = load_dataset("bigscience/xP3mt", "code") |  | ||||||
| write_path = root_dir + "/instruction_data/part-xP3mt_code-{}.jsonl.zst" |  | ||||||
| total_num = 0 |  | ||||||
| file_num = 1 |  | ||||||
| wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8") |  | ||||||
| for line in tqdm(dataset["train"]): |  | ||||||
|     line = json.dumps(line) |  | ||||||
|     if total_num % 1024 == 0 and total_num > 0: |  | ||||||
|         file_num += 1 |  | ||||||
|         wfp.close() |  | ||||||
|         wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8") |  | ||||||
|     wfp.write(line.encode("utf-8")) |  | ||||||
|     wfp.write(b"\n") |  | ||||||
|     total_num += 1 |  | ||||||
| wfp.close() |  | ||||||
| print( |  | ||||||
|     "bigscience/xP3mt_code preprocess done. Total line: {}, Total file: {}".format( |  | ||||||
|         total_num, file_num |  | ||||||
|     ) |  | ||||||
| ) |  | ||||||
| 
 |  | ||||||
| dataset = load_dataset("bigscience/xP3mt", "zh") |  | ||||||
| write_path = root_dir + "/instruction_data/part-xP3mt_zh-{}.jsonl.zst" |  | ||||||
| total_num = 0 |  | ||||||
| file_num = 1 |  | ||||||
| wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8") |  | ||||||
| for line in tqdm(dataset["train"]): |  | ||||||
|     line = json.dumps(line) |  | ||||||
|     if total_num % 1024 == 0 and total_num > 0: |  | ||||||
|         file_num += 1 |  | ||||||
|         wfp.close() |  | ||||||
|         wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8") |  | ||||||
|     wfp.write(line.encode("utf-8")) |  | ||||||
|     wfp.write(b"\n") |  | ||||||
|     total_num += 1 |  | ||||||
| wfp.close() |  | ||||||
| print( |  | ||||||
|     "bigscience/xP3mt_zh preprocess done. Total line: {}, Total file: {}".format( |  | ||||||
|         total_num, file_num |  | ||||||
|     ) |  | ||||||
| ) |  | ||||||
| 
 |  | ||||||
| write_path = root_dir + "/instruction_data/part-sharegpt_90K-{}.jsonl.zst" |  | ||||||
| total_num = 0 |  | ||||||
| file_num = 1 |  | ||||||
| wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8") |  | ||||||
| with open("{}/sg_90k_part1_html_cleaned.json".format(root_dir), "r") as fp: | with open("{}/sg_90k_part1_html_cleaned.json".format(root_dir), "r") as fp: | ||||||
|     data1 = json.load(fp) |     data1 = json.load(fp) | ||||||
| with open("{}/sg_90k_part2_html_cleaned.json".format(root_dir), "r") as fp: | with open("{}/sg_90k_part2_html_cleaned.json".format(root_dir), "r") as fp: | ||||||
|  | @ -240,7 +73,7 @@ for line in tqdm(data): | ||||||
|     if total_num % 1024 == 0 and total_num > 0: |     if total_num % 1024 == 0 and total_num > 0: | ||||||
|         file_num += 1 |         file_num += 1 | ||||||
|         wfp.close() |         wfp.close() | ||||||
|         wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8") |         wfp = zstd.open(write_path.format(local_name, file_num), "wb", encoding="utf-8") | ||||||
|     wfp.write(line.encode("utf-8")) |     wfp.write(line.encode("utf-8")) | ||||||
|     wfp.write(b"\n") |     wfp.write(b"\n") | ||||||
|     total_num += 1 |     total_num += 1 | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue
	
	Block a user
	 LiangSong
						LiangSong