Open-Llama/utils/merge_tokenizer.py

"""Merge a Chinese SentencePiece vocabulary into the LLaMA tokenizer.

Pieces from the Chinese vocabulary that are missing from the LLaMA model are
appended, and the extended tokenizer is written back to disk.
"""
from tqdm import tqdm
import sentencepiece as spm
from sentencepiece import sentencepiece_model_pb2 as model

# Load the original LLaMA tokenizer (a serialized SentencePiece ModelProto).
raw_model = model.ModelProto()
with open("configs/llama_tokenizer.model", "rb") as f:
    raw_model.ParseFromString(f.read())
exist_pieces = set(p.piece for p in raw_model.pieces)

# Load the Chinese vocabulary (per the file name, a 40k-piece model built on
# WuDao data).
cn_model = model.ModelProto()
with open("configs/4w_cn_vocab_wudao15.model", "rb") as f:
    cn_model.ParseFromString(f.read())

# Append every Chinese piece that is not already in the LLaMA vocabulary.
for p in tqdm(cn_model.pieces, total=len(cn_model.pieces)):
    if p.piece not in exist_pieces:
        raw_model.pieces.append(p)

# Serialize the merged model, then reload it to confirm it is valid.
with open("configs/llama_tokenizer_extended.model", "wb") as f:
    f.write(raw_model.SerializeToString())
sp_model = spm.SentencePieceProcessor(
    model_file="configs/llama_tokenizer_extended.model"
)
print("merged vocab size: {}".format(sp_model.vocab_size()))