add more instruction data

2023-04-06 03:45:24 +08:00 · 2023-04-06 03:45:24 +08:00 · bc16df4751
commit bc16df4751
parent 9f140dc99f
4 changed files with 125 additions and 19 deletions
--- a/configs/instruction_tuning_config.py
+++ b/configs/instruction_tuning_config.py
@ -2,7 +2,7 @@
 Author: LiangSong(sl12160010@gmail.com)
 Date: 2023-03-30 21:38:07
 LastEditors: LiangSong(sl12160010@gmail.com)
-LastEditTime: 2023-03-30 21:39:40
+LastEditTime: 2023-04-06 03:37:23
 FilePath: /Open-Llama/configs/instruction_tuning_config.py
 Description: 

@ -10,7 +10,7 @@ Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved.
 """
 max_length = 1024
 train_batch_size = 2
-num_training_steps = 37500
+num_training_steps = 40000
 num_warmup_steps = 100
 initializer_range = 1e-2
 lr = 2e-4
@ -22,4 +22,4 @@ log_interval = 50
 eval_interval = 500
 save_interval = 1000
 work_dir = "data/saved_ckpt/"
-ckpt_path = "data/saved_ckpt/40000.pt"
+ckpt_path = "data/saved_ckpt/83200.pt"
--- a/dataset/data_iter.py
+++ b/dataset/data_iter.py
@ -2,7 +2,7 @@
 Author: LiangSong(sl12160010@gmail.com)
 Date: 2023-03-17 19:32:20
 LastEditors: LiangSong(sl12160010@gmail.com)
-LastEditTime: 2023-04-05 22:36:45
+LastEditTime: 2023-04-06 03:37:55
 FilePath: /Open-Llama/dataset/data_iter.py
 Description: 

@ -68,7 +68,10 @@ class DataIter(IterableDataset):
                    # Transformation, including sample, tokenize, etc.
                    if self.transform_dict:
                        line = self.transform_dict[dataset_name](line)
-                        if isinstance(line, str):
+                        # skip bad doc
+                        if line is None:
+                            continue
+                        elif isinstance(line, str):
                            yield line
                        # must be list of list
                        elif isinstance(line, list) and isinstance(line[0], list):
--- a/dataset/instruction_dataset.py
+++ b/dataset/instruction_dataset.py
@ -2,7 +2,7 @@
 Author: LiangSong(sl12160010@gmail.com)
 Date: 2023-03-30 21:02:00
 LastEditors: LiangSong(sl12160010@gmail.com)
-LastEditTime: 2023-04-05 22:35:24
+LastEditTime: 2023-04-06 03:33:27
 FilePath: /Open-Llama/dataset/instruction_dataset.py
 Description: 

@ -21,7 +21,7 @@ def preprocess_self_instruction_gen(tokenizer, segment_max_length=1024):
        prompt = line["prompt"]
        if prompt.endswith("Output:"):
            prompt = prompt[:-7]
-        total = "user:{}<s>system:{}".format(prompt.strip(), line["completion"].strip())
+        total = "user:{}\nsystem:{}".format(prompt.strip(), line["completion"].strip())
        out = tokenizer(total)
        input_ids = out["input_ids"]
        return [
@ -39,12 +39,12 @@ def preprocess_belle_gen(tokenizer, segment_max_length=1024):
        {'text': 'some text', 'meta': {'pile_set_name': 'Github'}}
        Split the data based on the tokenized length according to the maximum length.
        """
-        prompt = line["input"].replace("\\n", "")
+        prompt = line["instruction"].replace("\\n", "")
        prompt = prompt.strip("")

-        completion = line["target"].replace("\\n", "")
+        completion = line["output"].replace("\\n", "")
        completion = completion.strip("")
-        total = "user:{}<s>system:{}".format(prompt, completion)
+        total = "user:{}\nsystem:{}".format(prompt, completion)
        out = tokenizer(total)
        input_ids = out["input_ids"]
        return [
@ -55,9 +55,101 @@ def preprocess_belle_gen(tokenizer, segment_max_length=1024):
    return preprocess_belle


+def preprocess_belle_multiturn_chat_gen(tokenizer, segment_max_length=1024):
+    def preprocess_belle_multiturn_chat(line):
+        """
+        The format of the data is roughly as follows.
+        {'text': 'some text', 'meta': {'pile_set_name': 'Github'}}
+        Split the data based on the tokenized length according to the maximum length.
+        """
+        prompt = line["instruction"].replace("\\n", "")
+        prompt = prompt.strip("")
+
+        completion = line["output"].replace("\\n", "")
+        completion = completion.strip("")
+        chats = prompt + completion
+        chats = chats.split("Human:")
+        input_ids = []
+        for chat in chats:
+            if chat.strip() == "":
+                continue
+            res = chat.split("Assistant:")
+            if len(res) != 2:
+                continue
+            prompt, completion = res
+            prompt = prompt.strip()
+            completion = completion.strip()
+            chat = "user:{}\nsystem:{}".format(prompt, completion)
+            out = tokenizer(chat)
+            input_ids.extend(out["input_ids"])
+        if len(input_ids) == 0:
+            return None
+        return [
+            input_ids[i * segment_max_length : (i + 1) * segment_max_length]
+            for i in range(math.ceil(len(input_ids) / segment_max_length))
+        ]
+
+    return preprocess_belle_multiturn_chat
+
+
+def preprocess_sharegpt_gen(tokenizer, segment_max_length=1024):
+    def preprocess_sharegpt(line):
+        """
+        The format of the data is roughly as follows.
+        {'text': 'some text', 'meta': {'pile_set_name': 'Github'}}
+        Split the data based on the tokenized length according to the maximum length.
+        """
+        chats = line["conversations"]
+        if chats[0]["from"] != "human":
+            chats = chats[1:]
+        input_ids = []
+        for i in range(len(chats) // 2):
+            prompt = chats[2 * i]
+            completion = chats[2 * i + 1]
+            if not (prompt["from"] == "human" and completion["from"] == "gpt"):
+                continue
+            prompt = prompt["value"]
+            prompt = prompt.strip()
+            completion = completion["value"]
+            completion = completion.strip()
+            chat = "user:{}\nsystem:{}".format(prompt, completion)
+            out = tokenizer(chat)
+            input_ids.extend(out["input_ids"])
+        if input_ids == []:
+            return None
+        return [
+            input_ids[i * segment_max_length : (i + 1) * segment_max_length]
+            for i in range(math.ceil(len(input_ids) / segment_max_length))
+        ]
+
+    return preprocess_sharegpt
+
+
+def preprocess_instruct_code_gen(tokenizer, segment_max_length=1024):
+    def preprocess_instruct_code(line):
+        """
+        The format of the data is roughly as follows.
+        {'text': 'some text', 'meta': {'pile_set_name': 'Github'}}
+        Split the data based on the tokenized length according to the maximum length.
+        """
+        prompt = line["instruction"].replace("\\n", "")
+        prompt = prompt.strip("")
+
+        completion = line["answer"].replace("\\n", "")
+        completion = completion.strip("")
+        total = "user:{}\nsystem:{}".format(prompt, completion)
+        out = tokenizer(total)
+        input_ids = out["input_ids"]
+        return [
+            input_ids[i * segment_max_length : (i + 1) * segment_max_length]
+            for i in range(math.ceil(len(input_ids) / segment_max_length))
+        ]
+
+    return preprocess_instruct_code
+
+
 if __name__ == "__main__":
    import sentencepiece as spm
-    from datasets import IterableDataset

    from dataset.tokenizer import Tokenizer
    from dataset.data_iter import create_shard_kwargs, DataIter
@ -66,17 +158,21 @@ if __name__ == "__main__":
        model_file="configs/10w_vocab_wudao5_pile10.model"
    )
    tokenizer = Tokenizer(sp_model)
-    patterns = ["data/instruction_data/part-belle_1M*.jsonl.zst"]
+    patterns = ["data/instruction_data/part-belle_multiturn_chat_0.8M-*.jsonl.zst"]
    paths = create_shard_kwargs(patterns)
    transform_dict = {
+        "self_instruct": preprocess_self_instruction_gen(tokenizer),
        "belle_1M": preprocess_belle_gen(tokenizer),
        "belle_0.5M": preprocess_belle_gen(tokenizer),
-        "self_instruct": preprocess_self_instruction_gen(tokenizer),
+        "belle_school_math_0.25M": preprocess_belle_gen(tokenizer),
+        "belle_multiturn_chat_0.8M": preprocess_belle_multiturn_chat_gen(tokenizer),
+        "instruct_to_code": preprocess_instruct_code_gen(tokenizer),
+        "sharegpt_90K": preprocess_sharegpt_gen(tokenizer),
    }
    data_set = DataIter(
        paths, transform_dict=transform_dict, concat_docs=True, max_length=1024
    )
    for i, sample in enumerate(data_set):
-        print(sample, sp_model.Decode(sample))
-        if i == 20:
+        print(sp_model.decode(sample))
+        if i == 1:
            break
--- a/inctruction_tuning.py
+++ b/inctruction_tuning.py
@ -2,7 +2,7 @@
 Author: LiangSong(sl12160010@gmail.com)
 Date: 2023-03-30 21:35:01
 LastEditors: LiangSong(sl12160010@gmail.com)
-LastEditTime: 2023-04-05 22:47:25
+LastEditTime: 2023-04-06 03:35:31
 FilePath: /Open-Llama/inctruction_tuning.py
 Description: 

@ -27,6 +27,9 @@ from dataset.collate_fn import collate_fn_gen
 from dataset.instruction_dataset import (
    preprocess_belle_gen,
    preprocess_self_instruction_gen,
+    preprocess_belle_multiturn_chat_gen,
+    preprocess_instruct_code_gen,
+    preprocess_sharegpt_gen,
 )
 from configs.instruction_tuning_config import *

@ -45,9 +48,13 @@ tokenizer = Tokenizer(sp_model)
 paths = create_shard_kwargs(patterns, repeat=3)
 random.shuffle(paths)
 transform_dict = {
-    "belle_1M": preprocess_belle_gen(tokenizer, max_length),
-    "belle_0.5M": preprocess_belle_gen(tokenizer, max_length),
-    "self_instruct": preprocess_self_instruction_gen(tokenizer, max_length),
+    "self_instruct": preprocess_self_instruction_gen(tokenizer),
+    "belle_1M": preprocess_belle_gen(tokenizer),
+    "belle_0.5M": preprocess_belle_gen(tokenizer),
+    "belle_school_math_0.25M": preprocess_belle_gen(tokenizer),
+    "belle_multiturn_chat_0.8M": preprocess_belle_multiturn_chat_gen(tokenizer),
+    "instruct_to_code": preprocess_instruct_code_gen(tokenizer),
+    "sharegpt_90K": preprocess_sharegpt_gen(tokenizer),
 }
 data_set = DataIter(
    paths,