vllm (pretrained=/root/autodl-tmp/Qwen3-32B-abliterated,add_bos_token=true,max_model_len=3096,dtype=bfloat16,trust_remote_code=true,tensor_parallel_size=4,gpu_memory_utilization=0.8), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto

|Tasks|Version|     Filter     |n-shot|  Metric   |   |Value|   |Stderr|
|-----|------:|----------------|-----:|-----------|---|----:|---|-----:|
|gsm8k|      3|flexible-extract|     5|exact_match|↑  |0.900|±  |0.0190|
|     |       |strict-match    |     5|exact_match|↑  |0.896|±  |0.0193|

vllm (pretrained=/root/autodl-tmp/Qwen3-32B-abliterated,add_bos_token=true,max_model_len=3096,dtype=bfloat16,trust_remote_code=true,tensor_parallel_size=4,gpu_memory_utilization=0.8), gen_kwargs: (None), limit: 500.0, num_fewshot: 5, batch_size: auto

|Tasks|Version|     Filter     |n-shot|  Metric   |   |Value|   |Stderr|
|-----|------:|----------------|-----:|-----------|---|----:|---|-----:|
|gsm8k|      3|flexible-extract|     5|exact_match|↑  |0.852|±  |0.0159|
|     |       |strict-match    |     5|exact_match|↑  |0.840|±  |0.0164|


|      Groups      |Version|Filter|n-shot|Metric|   |Value |   |Stderr|
|------------------|------:|------|------|------|---|-----:|---|-----:|
|mmlu              |      2|none  |      |acc   |↑  |0.7988|±  |0.0131|
| - humanities     |      2|none  |      |acc   |↑  |0.7897|±  |0.0269|
| - other          |      2|none  |      |acc   |↑  |0.7590|±  |0.0298|
| - social sciences|      2|none  |      |acc   |↑  |0.8722|±  |0.0252|
| - stem           |      2|none  |      |acc   |↑  |0.7860|±  |0.0230|

vllm (pretrained=/root/autodl-tmp/Qwen3-32B-abliterated-awq,add_bos_token=true,max_model_len=3096,dtype=bfloat16,trust_remote_code=true,tensor_parallel_size=2), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: 1

|Tasks|Version|     Filter     |n-shot|  Metric   |   |Value|   |Stderr|
|-----|------:|----------------|-----:|-----------|---|----:|---|-----:|
|gsm8k|      3|flexible-extract|     5|exact_match|↑  |0.892|±  |0.0197|
|     |       |strict-match    |     5|exact_match|↑  |0.888|±  |0.0200|

vllm (pretrained=/root/autodl-tmp/Qwen3-32B-abliterated-awq,add_bos_token=true,max_model_len=3096,dtype=bfloat16,trust_remote_code=true), gen_kwargs: (None), limit: 500.0, num_fewshot: 5, batch_size: auto

|Tasks|Version|     Filter     |n-shot|  Metric   |   |Value|   |Stderr|
|-----|------:|----------------|-----:|-----------|---|----:|---|-----:|
|gsm8k|      3|flexible-extract|     5|exact_match|↑  |0.864|±  |0.0153|
|     |       |strict-match    |     5|exact_match|↑  |0.862|±  |0.0154|


|      Groups      |Version|Filter|n-shot|Metric|   |Value |   |Stderr|
|------------------|------:|------|------|------|---|-----:|---|-----:|
|mmlu              |      2|none  |      |acc   |↑  |0.7871|±  |0.0131|
| - humanities     |      2|none  |      |acc   |↑  |0.8000|±  |0.0266|
| - other          |      2|none  |      |acc   |↑  |0.7692|±  |0.0280|
| - social sciences|      2|none  |      |acc   |↑  |0.8611|±  |0.0260|
| - stem           |      2|none  |      |acc   |↑  |0.7439|±  |0.0240|