"""
|
|
Author: LiangSong(sl12160010@gmail.com)
|
|
Date: 2023-03-24 20:49:03
|
|
LastEditors: LiangSong(sl12160010@gmail.com)
|
|
LastEditTime: 2023-05-06 23:34:14
|
|
FilePath: /Open-Llama/utils/train_tokenizer.py
|
|
Description:
|
|
|
|
Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved.
|
|
"""
import io
import random
from glob import glob

import sentencepiece as spm
from datasets import load_dataset

random.seed(42)
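
# Glob the zstd-compressed jsonl shards of each corpus and shuffle their order
# so the subset sampled below is not biased toward the first files on disk.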
wudao_pattern = "data/pretrain_data/part-wudao-*.jsonl.zst"
wudao_paths = glob(wudao_pattern)
random.shuffle(wudao_paths)

pile_pattern = "data/pretrain_data/part-pile-*.jsonl.zst"
pile_paths = glob(pile_pattern)
random.shuffle(pile_paths)
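
# Train on a small sample: the first 5 wudao shards and the first 10 pile shards.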
paths = wudao_paths[:5] + pile_paths[:10]
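
# Stream the shards so the sampled corpus never has to fit in memory.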
dataset = load_dataset("json", data_files=paths, split="train", streaming=True)
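# Note: on a streaming dataset, shuffle is approximate (it shuffles the shard
# order and samples from a fixed-size buffer).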
dataset = dataset.shuffle(seed=42)
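

# wudao records carry "title"/"content" fields while pile records carry a single
# "text" field; normalize both into plain strings for the trainer.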
def transform(dataset):
    for line in dataset:
        if "title" in line and "content" in line:
            yield line["title"] + "\n" + line["content"]
        else:
            yield line["text"]
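

# Illustrative sanity check (an addition, not in the original script): peek at one
# transformed record; transform(dataset) opens a fresh iterator over the streaming
# dataset each time it is called, so this does not consume data_iter below.
print(next(transform(dataset))[:100])
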
data_iter = transform(dataset)

# Train from the sentence iterator and write the resulting model into an
# in-memory BytesIO buffer.
model = io.BytesIO()
spm.SentencePieceTrainer.train(
    sentence_iterator=data_iter,
    model_writer=model,
    shuffle_input_sentence=False,
    train_extremely_large_corpus=True,
    # Tokenizer hyperparameters.
    max_sentence_length=16384,
    pad_id=3,
    model_type="BPE",
    vocab_size=100000,
    # Split digits and fall back to bytes, the same as Llama; split_by_unicode_script
    # keeps punctuation and characters from being grouped together.
    split_digits=True,
    split_by_unicode_script=True,
    byte_fallback=True,
    # Preserve whitespace-only pieces (including \n and \t) for code generation.
    allow_whitespace_only_pieces=True,
    remove_extra_whitespaces=False,
    # Note: Llama uses "identity" normalization instead of NFKC.
    normalization_rule_name="nfkc",
)

# Serialize the model to a file.
with open("configs/tokenizer_models/10w_vocab_wudao5_pile10.model", "wb") as f:
    f.write(model.getvalue())

# Load the tokenizer directly from the serialized model proto.
sp = spm.SentencePieceProcessor(model_proto=model.getvalue())
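# Round-trip check on mixed CJK, emoji, and whitespace input.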
print(sp.decode(sp.encode("只因你太美🤗▃ \n 1")))
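
# A minimal usage sketch (an addition, not part of the original script): reload the
# tokenizer from the file written above and inspect the pieces it produces for a
# code-like string, which the whitespace settings above are meant to handle.
sp_from_file = spm.SentencePieceProcessor(
    model_file="configs/tokenizer_models/10w_vocab_wudao5_pile10.model"
)
print(sp_from_file.encode("def main():\n\tprint('hi')", out_type=str))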