minipile-unigram-32k-100k / tokenizer_config.json
ksopyla's picture
Upload tokenizer
b5956f7 verified
{
"added_tokens_decoder": {
"0": {
"content": "<pad>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"1": {
"content": "<unk>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"2": {
"content": "<cls>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"3": {
"content": "<sep>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"4": {
"content": "<mask>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"5": {
"content": "<|system|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"6": {
"content": "<|user|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"7": {
"content": "<|assistant|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"8": {
"content": "<|endoftext|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"9": {
"content": "<|im_start|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"10": {
"content": "<|im_end|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"11": {
"content": "<|tool_call|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"12": {
"content": "<|tool_response|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"13": {
"content": "<|thought|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"14": {
"content": "<|image|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"15": {
"content": "<|audio|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"16": {
"content": "<|unused0|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"17": {
"content": "<|unused1|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"18": {
"content": "<|unused2|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"19": {
"content": "<|unused3|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"20": {
"content": "<|unused4|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"21": {
"content": "<|unused5|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"22": {
"content": "<|unused6|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"23": {
"content": "<|unused7|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"24": {
"content": "<|unused8|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"25": {
"content": "<|unused9|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"26": {
"content": "<|unused10|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"27": {
"content": "<|unused11|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"28": {
"content": "<|unused12|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"29": {
"content": "<|unused13|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"30": {
"content": "<|unused14|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"31": {
"content": "<|unused15|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32": {
"content": "<|unused16|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"33": {
"content": "<|unused17|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"34": {
"content": "<|unused18|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"35": {
"content": "<|unused19|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"36": {
"content": "<|unused20|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"37": {
"content": "<|unused21|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"38": {
"content": "<|unused22|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"39": {
"content": "<|unused23|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"40": {
"content": "<|unused24|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"41": {
"content": "<|unused25|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"42": {
"content": "<|unused26|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"43": {
"content": "<|unused27|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"44": {
"content": "<|unused28|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"45": {
"content": "<|unused29|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"46": {
"content": "<|unused30|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"47": {
"content": "<|unused31|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"48": {
"content": "<|unused32|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"49": {
"content": "<|unused33|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"50": {
"content": "<|unused34|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"51": {
"content": "<|unused35|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"52": {
"content": "<|unused36|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"53": {
"content": "<|unused37|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"54": {
"content": "<|unused38|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"55": {
"content": "<|unused39|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"56": {
"content": "<|unused40|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"57": {
"content": "<|unused41|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"58": {
"content": "<|unused42|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"59": {
"content": "<|unused43|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"60": {
"content": "<|unused44|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"61": {
"content": "<|unused45|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"62": {
"content": "<|unused46|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"63": {
"content": "<|unused47|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"64": {
"content": "<|unused48|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"65": {
"content": "<|unused49|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"66": {
"content": "<|unused50|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"67": {
"content": "<|unused51|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"68": {
"content": "<|unused52|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"69": {
"content": "<|unused53|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"70": {
"content": "<|unused54|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"71": {
"content": "<|unused55|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"72": {
"content": "<|unused56|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"73": {
"content": "<|unused57|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"74": {
"content": "<|unused58|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"75": {
"content": "<|unused59|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"76": {
"content": "<|unused60|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"77": {
"content": "<|unused61|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"78": {
"content": "<|unused62|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"79": {
"content": "<|unused63|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"80": {
"content": "<|unused64|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"81": {
"content": "<|unused65|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"82": {
"content": "<|unused66|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"83": {
"content": "<|unused67|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"84": {
"content": "<|unused68|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"85": {
"content": "<|unused69|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"86": {
"content": "<|unused70|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"87": {
"content": "<|unused71|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"88": {
"content": "<|unused72|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"89": {
"content": "<|unused73|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"90": {
"content": "<|unused74|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"91": {
"content": "<|unused75|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"92": {
"content": "<|unused76|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"93": {
"content": "<|unused77|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"94": {
"content": "<|unused78|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"95": {
"content": "<|unused79|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"96": {
"content": "<|unused80|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"97": {
"content": "<|unused81|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"98": {
"content": "<|unused82|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"99": {
"content": "<|unused83|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"100": {
"content": "<|unused84|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"101": {
"content": "<|unused85|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"102": {
"content": "<|unused86|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"103": {
"content": "<|unused87|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"104": {
"content": "<|unused88|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"105": {
"content": "<|unused89|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"106": {
"content": "<|unused90|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"107": {
"content": "<|unused91|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"108": {
"content": "<|unused92|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"109": {
"content": "<|unused93|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"110": {
"content": "<|unused94|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"111": {
"content": "<|unused95|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"112": {
"content": "<|unused96|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"113": {
"content": "<|unused97|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"114": {
"content": "<|unused98|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"115": {
"content": "<|unused99|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
}
},
"additional_special_tokens": [
"<|system|>",
"<|user|>",
"<|assistant|>",
"<|endoftext|>",
"<|im_start|>",
"<|im_end|>",
"<|tool_call|>",
"<|tool_response|>",
"<|thought|>",
"<|image|>",
"<|audio|>",
"<|unused0|>",
"<|unused1|>",
"<|unused2|>",
"<|unused3|>",
"<|unused4|>",
"<|unused5|>",
"<|unused6|>",
"<|unused7|>",
"<|unused8|>",
"<|unused9|>",
"<|unused10|>",
"<|unused11|>",
"<|unused12|>",
"<|unused13|>",
"<|unused14|>",
"<|unused15|>",
"<|unused16|>",
"<|unused17|>",
"<|unused18|>",
"<|unused19|>",
"<|unused20|>",
"<|unused21|>",
"<|unused22|>",
"<|unused23|>",
"<|unused24|>",
"<|unused25|>",
"<|unused26|>",
"<|unused27|>",
"<|unused28|>",
"<|unused29|>",
"<|unused30|>",
"<|unused31|>",
"<|unused32|>",
"<|unused33|>",
"<|unused34|>",
"<|unused35|>",
"<|unused36|>",
"<|unused37|>",
"<|unused38|>",
"<|unused39|>",
"<|unused40|>",
"<|unused41|>",
"<|unused42|>",
"<|unused43|>",
"<|unused44|>",
"<|unused45|>",
"<|unused46|>",
"<|unused47|>",
"<|unused48|>",
"<|unused49|>",
"<|unused50|>",
"<|unused51|>",
"<|unused52|>",
"<|unused53|>",
"<|unused54|>",
"<|unused55|>",
"<|unused56|>",
"<|unused57|>",
"<|unused58|>",
"<|unused59|>",
"<|unused60|>",
"<|unused61|>",
"<|unused62|>",
"<|unused63|>",
"<|unused64|>",
"<|unused65|>",
"<|unused66|>",
"<|unused67|>",
"<|unused68|>",
"<|unused69|>",
"<|unused70|>",
"<|unused71|>",
"<|unused72|>",
"<|unused73|>",
"<|unused74|>",
"<|unused75|>",
"<|unused76|>",
"<|unused77|>",
"<|unused78|>",
"<|unused79|>",
"<|unused80|>",
"<|unused81|>",
"<|unused82|>",
"<|unused83|>",
"<|unused84|>",
"<|unused85|>",
"<|unused86|>",
"<|unused87|>",
"<|unused88|>",
"<|unused89|>",
"<|unused90|>",
"<|unused91|>",
"<|unused92|>",
"<|unused93|>",
"<|unused94|>",
"<|unused95|>",
"<|unused96|>",
"<|unused97|>",
"<|unused98|>",
"<|unused99|>"
],
"chat_template": "{% if messages[0]['role'] == 'system' %}{{ '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}{% set loop_messages = messages[1:] %}{% else %}{% set loop_messages = messages %}{% endif %}{% for message in loop_messages %}{{ '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>\n' }}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
"clean_up_tokenization_spaces": false,
"cls_token": "<cls>",
"extra_special_tokens": {},
"mask_token": "<mask>",
"model_max_length": 2048,
"pad_token": "<pad>",
"sep_token": "<sep>",
"tokenizer_class": "PreTrainedTokenizerFast",
"unk_token": "<unk>"
}