from functools import partial

from litgpt.tokenizer import Tokenizer
from litdata import optimize, TokensLoader, StreamingDataset
from transformers import AutoTokenizer

from utils import tokenize_fn
from core_base_datasets import core_base_datasets
from core_instruct_datasets import core_instruct_datasets

seqs = [
    (0, 1048576, 2049, 8000),
    (2049, 8193, 8193, 2000),
    (8193, 1048577, 32769, 500),
]
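
# Note (added; not in the original script): each tuple in seqs is
# (min_len, max_len, block_size, subchunk_size), matching the loop unpacking below.
# The litdata chunk_size for each split is block_size * subchunk_size, i.e. tokens per chunk:
#   2049  * 8000 = 16,392,000
#   8193  * 2000 = 16,386,000
#   32769 *  500 = 16,384,500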
for i, (min_len, max_len, block_size, subchunk_size) in enumerate(seqs):
    chunk_size = block_size * subchunk_size
    output_dir = f'../core-data-{i}-{min_len}-{max_len}-{block_size}-{subchunk_size}'

    outputs = optimize(
        fn=partial(
            tokenize_fn,
            min_len=min_len,
            max_len=max_len,
            hf_tokenizer=AutoTokenizer.from_pretrained('..', trust_remote_code=True, use_fast=True),
            tokenizer=Tokenizer('..'),
        ),
        inputs=core_base_datasets + core_instruct_datasets,
        output_dir=output_dir,
        chunk_size=chunk_size,
        num_workers=32,
        reorder_files=False,
    )
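
# Added note (not in the original): assuming the {max_len} segment in the
# output_dir pattern above, the three optimized splits would be written to:
#   ../core-data-0-0-1048576-2049-8000
#   ../core-data-1-2049-8193-8193-2000
#   ../core-data-2-8193-1048577-32769-500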

for i, (min_len, max_len, block_size, subchunk_size) in enumerate(seqs):
    chunk_size = block_size * subchunk_size
    input_dir = f'../core-data-{i}-{min_len}-{max_len}-{block_size}-{subchunk_size}'

    dataset = StreamingDataset(
        input_dir=input_dir,
        item_loader=TokensLoader(block_size=block_size),
    )

    print(f'{i=}, {min_len=}, {max_len=}, {block_size=}, {chunk_size=}, {len(dataset)=}, {len(dataset) * block_size=}')
    total_tokens = len(dataset) * block_size
    print(f'Total number of tokens in the optimized dataset {input_dir!r} is {total_tokens}')
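
# Added example (not in the original script): a minimal sketch of how one of the
# optimized splits could be consumed for pretraining with litdata's
# StreamingDataLoader. It reuses the last `dataset` from the loop above;
# the batch_size value is an illustrative assumption.
from litdata import StreamingDataLoader

dataloader = StreamingDataLoader(dataset, batch_size=4)

for batch in dataloader:
    # each item is one block_size-long sequence of token ids, so a default
    # collated batch has shape (batch_size, block_size)
    print(batch.shape)
    break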