'''
Author: LiangSong(sl12160010@gmail.com)
Date: 2023-03-17 19:32:20
LastEditors: LiangSong(sl12160010@gmail.com)
LastEditTime: 2023-03-26 23:03:32
FilePath: /Open-Llama/dataset/data_iter.py
Description:

Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved.
'''
import json
from glob import glob

import zstandard as zstd

def create_data_iter(paths, transform_dict=None, process_index=0, num_processes=1):
    '''
    Iterate over sharded pre-training data.

    Currently the supported storage formats are jsonl and jsonl.zst. Each line of a
    file is a JSON dictionary; it is parsed after reading and tagged with the name
    of the dataset it came from.

    paths: list of (shard_index, file_path) pairs, e.g. from create_shard_kwargs.
    transform_dict: optional mapping from dataset name to a transform function that
        takes the parsed dict and returns either a str or a list of str.
    process_index / num_processes: split shards across processes so that each shard
        is read by exactly one process.
    '''
    past = None
    for i, path in paths:
        # The dataset name is encoded in the file name,
        # e.g. 'part-wudao-0.jsonl.zst' -> 'wudao'.
        dataset_name = path.split('-')[-2]
        if past != dataset_name:
            print('Loading data from {}'.format(path))
            past = dataset_name
        # Skip shards that belong to other processes.
        if num_processes > 1 and i % num_processes != process_index:
            continue
        # Plain and zstd-compressed jsonl are both read line by line.
        if path.endswith('jsonl.zst'):
            fp = zstd.open(path, 'r', encoding='utf-8')
        elif path.endswith('jsonl'):
            fp = open(path, 'r', encoding='utf-8')
        else:
            raise Exception('File format of {} is not supported yet.'.format(path))
        with fp:
            for line in fp:
                if isinstance(line, bytes):
                    line = line.decode('utf-8')
                line = json.loads(line)
                line['dataset'] = dataset_name
                if transform_dict:
                    line = transform_dict[dataset_name](line)
                    if isinstance(line, str):
                        yield line
                    elif isinstance(line, list):
                        for elem in line:
                            yield elem
                    else:
                        raise Exception(
                            'Unsupported type in Transformation: {}'.format(
                                transform_dict[dataset_name]
                            )
                        )
                else:
                    yield line
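

# Illustrative sketch (not part of the original module): the contract that
# create_data_iter expects. Each line of a *.jsonl / *.jsonl.zst shard is one JSON
# object, and a transform_dict entry maps the parsed dict (plus the injected
# 'dataset' key) to either a single string or a list of strings, mirroring the
# example in __main__ below. The concrete field values are hypothetical.
_EXAMPLE_RAW_LINE = '{"title": "an example title", "text": "an example passage"}'
_EXAMPLE_TRANSFORMS = {
    'wudao': lambda x: x['title'],   # returns a single string
    'pile': lambda x: [x['text']],   # returns a list of strings
}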


def create_shard_kwargs(patterns, repeat=1):
    '''
    Assign numbers to different shards of data to ensure that data is not duplicated
    when allocated to different nodes during distributed training.
    '''
    all_path = []
    for p in patterns:
        all_path.extend(glob(p))
    all_path *= repeat
    return [(i, p) for i, p in enumerate(all_path)]
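

# A minimal sketch (not part of the original file) of how the shard indices produced
# by create_shard_kwargs interact with process_index / num_processes inside
# create_data_iter: shard i is read only by the process with rank i % num_processes,
# so no shard is read twice. The file names below are hypothetical.
def _demo_shard_assignment(num_processes=2):
    paths = list(enumerate(['part-wudao-0.jsonl', 'part-wudao-1.jsonl',
                            'part-pile-0.jsonl', 'part-pile-1.jsonl']))
    for process_index in range(num_processes):
        assigned = [p for i, p in paths if i % num_processes == process_index]
        print('rank {} reads {}'.format(process_index, assigned))
    # rank 0 reads ['part-wudao-0.jsonl', 'part-pile-0.jsonl']
    # rank 1 reads ['part-wudao-1.jsonl', 'part-pile-1.jsonl']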


if __name__ == '__main__':
    patterns = [
        'data/pretrain_data/part-wudao*.jsonl.zst'
    ]
    paths = create_shard_kwargs(patterns)
    transform_dict = {
        'wudao': lambda x: x['title'],
        'pile': lambda x: [x['text']]
    }
    data_iter = create_data_iter(paths, transform_dict=transform_dict)
    for i, data in enumerate(data_iter):
        print(i, data)
        if i == 20:
            break