update utils

This commit is contained in:
LiangSong 2023-04-12 17:15:40 +08:00
parent da1c927016
commit ae0691c509
5 changed files with 43 additions and 0 deletions

Binary file not shown.

20
utils/convert_ckpt.py Normal file
View File

@@ -0,0 +1,20 @@
"""Extend the raw LLaMA-7B checkpoint embeddings to the merged tokenizer vocab.

Reads the extended SentencePiece tokenizer to get the merged vocabulary size,
appends small random rows to both the input token embedding matrix and the
output projection matrix so their first dimension matches it, then saves the
extended checkpoint next to the original.
"""
import torch
import sentencepiece as spm

sp_model = spm.SentencePieceProcessor(
    model_file="configs/llama_tokenizer_extended.model"
)
merged_vocab_size = sp_model.vocab_size()

# Pin to CPU: without map_location, torch.load restores tensors onto whatever
# device the checkpoint was saved from, which may not exist here.
ckpt = torch.load('data/llama_raw_ckpt/7B/consolidated.00.pth', map_location='cpu')
raw_vocab_size, hidden_size = ckpt['tok_embeddings.weight'].shape
num_new_tokens = merged_vocab_size - raw_vocab_size

# Extend both the input embedding and the output projection with the same
# small-scale random init. The new rows must match the checkpoint's dtype
# (raw LLaMA weights are typically fp16 — TODO confirm); a plain torch.randn
# would be fp32 and torch.cat refuses to mix dtypes.
for key in ('tok_embeddings.weight', 'output.weight'):
    old = ckpt[key]
    extended = torch.randn(num_new_tokens, hidden_size, dtype=old.dtype) * 0.001
    ckpt[key] = torch.cat([old, extended], dim=0)

torch.save(ckpt, 'data/llama_raw_ckpt/7B/extended.pth')

23
utils/merge_tokenizer.py Normal file
View File

@@ -0,0 +1,23 @@
"""Merge an extra (Chinese) SentencePiece vocab into the LLaMA tokenizer.

Parses both tokenizer model protos, appends every piece from the second model
that the LLaMA tokenizer does not already contain (preserving order), writes
the merged proto, and prints the resulting vocabulary size.
"""
from tqdm import tqdm
import sentencepiece as spm
from sentencepiece import sentencepiece_model_pb2 as model

raw_model = model.ModelProto()
# Context managers close the file handles deterministically; the original
# bare open(...).read() calls leaked them.
with open('configs/llama_tokenizer.model', 'rb') as f:
    raw_model.ParseFromString(f.read())
exist_pieces = {p.piece for p in raw_model.pieces}

cn_model = model.ModelProto()
with open('configs/4w_cn_vocab_wudao15.model', 'rb') as f:
    cn_model.ParseFromString(f.read())

# Append only the pieces LLaMA does not already have; set membership keeps
# this O(1) per piece instead of scanning the proto each time.
for p in tqdm(cn_model.pieces, total=len(cn_model.pieces)):
    if p.piece not in exist_pieces:
        raw_model.pieces.append(p)

with open('configs/llama_tokenizer_extended.model', 'wb') as f:
    f.write(raw_model.SerializeToString())

sp_model = spm.SentencePieceProcessor(
    model_file="configs/llama_tokenizer_extended.model"
)
print('merged vocab size: {}'.format(sp_model.vocab_size()))