Can we have the training settings?

#1
by Matchyc - opened

I see good results. Could you share the exact training settings?

Owner

That was a while ago and I can't find the exact script I used, but here is the config recorded in wandb. Hope it helps :)

```yaml
_wandb:
  value:
    cli_version: 0.20.1
    m: []
    python_version: 3.10.12
    t:
      "1":
        - 1
        - 5
        - 11
        - 30
        - 41
        - 49
        - 50
        - 51
        - 53
        - 71
        - 95
        - 98
        - 105
      "2":
        - 1
        - 5
        - 11
        - 30
        - 41
        - 49
        - 50
        - 51
        - 53
        - 71
        - 95
        - 98
        - 105
      "3":
        - 2
        - 13
        - 16
        - 55
        - 61
      "4": 3.10.12
      "5": 0.20.1
      "6": 4.51.1
      "12": 0.20.1
      "13": linux-x86_64
actor_rollout_ref:
  value:
    actor:
      checkpoint:
        load_contents:
          - model
          - optimizer
          - extra
        save_contents:
          - model
          - optimizer
          - extra
      clip_ratio: 0.2
      clip_ratio_c: 3
      clip_ratio_high: 0.2
      clip_ratio_low: 0.2
      entropy_checkpointing: false
      entropy_coeff: 0
      entropy_from_logits_with_chunking: false
      fsdp_config:
        forward_prefetch: false
        fsdp_size: -1
        offload_policy: false
        optimizer_offload: false
        param_offload: false
        reshard_after_forward: true
        wrap_policy:
          min_num_params: 0
      grad_clip: 1
      kl_loss_coef: 0.001
      kl_loss_type: low_var_kl
      loss_agg_mode: token-mean
      optim:
        lr: 1e-06
        lr_warmup_steps: -1
        lr_warmup_steps_ratio: 0
        min_lr_ratio: 0
        num_cycles: 0.5
        total_training_steps: 35
        warmup_style: constant
        weight_decay: 0.01
      policy_loss:
        clip_cov_lb: 1
        clip_cov_ratio: 0.0002
        clip_cov_ub: 5
        kl_cov_ratio: 0.0002
        loss_mode: vanilla
        ppo_kl_coef: 0.1
      ppo_epochs: 1
      ppo_max_token_len_per_gpu: 16384
      ppo_micro_batch_size: null
      ppo_micro_batch_size_per_gpu: 20
      ppo_mini_batch_size: 80
      profiler:
        all_ranks: false
        discrete: false
        ranks: []
      shuffle: false
      strategy: fsdp
      ulysses_sequence_parallel_size: 1
      use_dynamic_bsz: false
      use_kl_loss: true
      use_torch_compile: true
    hybrid_engine: true
    model:
      custom_chat_template: null
      enable_activation_offload: false
      enable_gradient_checkpointing: true
      exclude_modules: null
      external_lib: null
      fused_kernel_options:
        impl_backend: torch
      lora_alpha: 16
      lora_rank: 0
      path: Qwen/Qwen3-1.7B
      target_modules: all-linear
      trust_remote_code: false
      use_fused_kernels: false
      use_liger: false
      use_remove_padding: true
      use_shm: false
    ref:
      entropy_checkpointing: false
      entropy_from_logits_with_chunking: false
      fsdp_config:
        forward_prefetch: false
        param_offload: true
        reshard_after_forward: true
        wrap_policy:
          min_num_params: 0
      log_prob_max_token_len_per_gpu: 16384
      log_prob_micro_batch_size: null
      log_prob_micro_batch_size_per_gpu: 20
      log_prob_use_dynamic_bsz: false
      profiler:
        all_ranks: false
        discrete: false
        ranks: []
      strategy: fsdp
      ulysses_sequence_parallel_size: 1
      use_torch_compile: true
    rollout:
      agent:
        custom_async_server:
          name: null
          path: null
        num_workers: 8
      calculate_log_probs: false
      disable_log_stats: true
      do_sample: true
      dtype: bfloat16
      enable_chunked_prefill: true
      enforce_eager: true
      engine_kwargs:
        sglang:
          attention_backend: null
        vllm:
          disable_mm_preprocessor_cache: false
          swap_space: null
      free_cache_engine: true
      gpu_memory_utilization: 0.6
      ignore_eos: false
      layered_summon: false
      load_format: dummy_dtensor
      log_prob_max_token_len_per_gpu: 16384
      log_prob_micro_batch_size: null
      log_prob_micro_batch_size_per_gpu: 20
      log_prob_use_dynamic_bsz: false
      max_model_len: null
      max_num_batched_tokens: 8192
      max_num_seqs: 1024
      mode: sync
      multi_stage_wake_up: false
      multi_turn:
        completion_callback: null
        enable: false
        format: hermes
        interaction_config_path: null
        max_assistant_turns: null
        max_parallel_calls: 1
        max_tool_response_length: 256
        max_user_turns: null
        tokenization_sanity_check_mode: strict
        tool_config_path: null
        tool_response_truncate_side: middle
        use_inference_chat_template: false
      "n": 3
      name: vllm
      profiler:
        all_ranks: false
        discrete: false
        ranks: []
      prompt_length: 512
      response_length: 1024
      temperature: 1
      tensor_model_parallel_size: 1
      top_k: -1
      top_p: 1
      val_kwargs:
        do_sample: false
        "n": 1
        temperature: 0
        top_k: -1
        top_p: 1
algorithm:
  value:
    adv_estimator: grpo
    filter_groups: null
    gamma: 1
    kl_ctrl:
      horizon: 10000
      kl_coef: 0.001
      target_kl: 0.1
      type: fixed
    kl_penalty: kl
    lam: 1
    norm_adv_by_std_in_grpo: true
    pf_ppo:
      reweight_method: pow
      weight_pow: 2
    use_kl_in_reward: false
    use_pf_ppo: false
critic:
  value:
    checkpoint:
      load_contents:
        - model
        - optimizer
        - extra
      save_contents:
        - model
        - optimizer
        - extra
    cliprange_value: 0.5
    forward_max_token_len_per_gpu: 32768
    forward_micro_batch_size: null
    forward_micro_batch_size_per_gpu: null
    grad_clip: 1
    loss_agg_mode: token-mean
    model:
      enable_activation_offload: false
      enable_gradient_checkpointing: true
      external_lib: null
      fsdp_config:
        forward_prefetch: false
        fsdp_size: -1
        offload_policy: false
        optimizer_offload: false
        param_offload: false
        reshard_after_forward: true
        wrap_policy:
          min_num_params: 0
      lora_alpha: 16
      lora_rank: 0
      path: ~/models/deepseek-llm-7b-chat
      target_modules: all-linear
      tokenizer_path: Qwen/Qwen3-1.7B
      trust_remote_code: false
      use_remove_padding: false
      use_shm: false
    optim:
      lr: 1e-05
      lr_warmup_steps_ratio: 0
      min_lr_ratio: null
      total_training_steps: 35
      warmup_style: constant
      weight_decay: 0.01
    ppo_epochs: 1
    ppo_max_token_len_per_gpu: 32768
    ppo_micro_batch_size: null
    ppo_micro_batch_size_per_gpu: null
    ppo_mini_batch_size: 80
    profiler:
      all_ranks: false
      discrete: false
      ranks: []
    rollout_n: 3
    shuffle: false
    strategy: fsdp
    ulysses_sequence_parallel_size: 1
    use_dynamic_bsz: false
custom_reward_function:
  value:
    name: compute_score
    path: null
data:
  value:
    custom_cls:
      name: null
      path: null
    dataloader_num_workers: 8
    filter_overlong_prompts: true
    filter_overlong_prompts_workers: 1
    image_key: images
    max_prompt_length: 512
    max_response_length: 1024
    prompt_key: prompt
    return_full_prompt: false
    return_raw_chat: false
    return_raw_input_ids: false
    reward_fn_key: data_source
    sampler:
      class_name: null
      class_path: null
    shuffle: true
    tokenizer: null
    train_batch_size: 1024
    train_files: /workspace/verl/data/gsm8k/train.parquet
    truncation: error
    trust_remote_code: false
    use_shm: false
    val_batch_size: null
    val_files: /workspace/verl/data/gsm8k/test.parquet
    validation_shuffle: false
    video_key: videos
ray_init:
  value:
    num_cpus: null
    timeline_json_file: null
reward_model:
  value:
    enable: false
    forward_max_token_len_per_gpu: 32768
    launch_reward_fn_async: false
    max_length: null
    micro_batch_size: null
    micro_batch_size_per_gpu: null
    model:
      external_lib: null
      fsdp_config:
        forward_prefetch: false
        fsdp_size: -1
        param_offload: false
        reshard_after_forward: true
        wrap_policy:
          min_num_params: 0
      input_tokenizer: Qwen/Qwen3-1.7B
      path: ~/models/FsfairX-LLaMA3-RM-v0.1
      trust_remote_code: false
      use_fused_kernels: false
      use_remove_padding: false
      use_shm: false
    profiler:
      all_ranks: false
      discrete: false
      ranks: []
    reward_manager: naive
    sandbox_fusion:
      max_concurrent: 64
      memory_limit_mb: 1024
      url: null
    strategy: fsdp
    ulysses_sequence_parallel_size: 1
    use_dynamic_bsz: false
trainer:
  value:
    balance_batch: true
    controller_nsight_options:
      cuda-graph-trace: graph
      cuda-memory-usage: "true"
      trace: cuda,nvtx,cublas,ucx
    critic_warmup: 0
    default_hdfs_dir: null
    default_local_dir: checkpoints/verl_grpo_example_gsm8k/qwen3_1.7b_1x_h100_function_rm
    del_local_ckpt_after_load: false
    device: cuda
    esi_redundant_time: 0
    experiment_name: qwen3_1.7b_1x_h100_function_rm
    log_val_generations: 0
    logger:
      - console
      - wandb
    max_actor_ckpt_to_keep: null
    max_critic_ckpt_to_keep: null
    n_gpus_per_node: 1
    nnodes: 1
    profile_steps: null
    project_name: verl_grpo_example_gsm8k
    ray_wait_register_center_timeout: 300
    resume_from_path: null
    resume_mode: auto
    rollout_data_dir: null
    save_freq: 20
    test_freq: 5
    total_epochs: 5
    total_training_steps: null
    val_before_train: true
    val_only: false
    validation_data_dir: null
    worker_nsight_options:
      capture-range: cudaProfilerApi
      capture-range-end: null
      cuda-graph-trace: graph
      cuda-memory-usage: "true"
      kill: none
      trace: cuda,nvtx,cublas,ucx
```
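
For reference, here is a rough sketch of how those values could be fed back into verl's standard `verl.trainer.main_ppo` entrypoint as Hydra-style overrides. This is not the original launch script (I couldn't find it), just a reconstruction from the config above, so the exact set of overrides and the data paths are assumptions taken from the dump.

```python
# Sketch only: rebuilds a verl GRPO launch command from the wandb config above.
# Assumes verl is installed and the GSM8K parquet files exist at the paths
# recorded in the config; every value below is copied from that config.
import subprocess

overrides = [
    "algorithm.adv_estimator=grpo",
    "data.train_files=/workspace/verl/data/gsm8k/train.parquet",
    "data.val_files=/workspace/verl/data/gsm8k/test.parquet",
    "data.train_batch_size=1024",
    "data.max_prompt_length=512",
    "data.max_response_length=1024",
    "actor_rollout_ref.model.path=Qwen/Qwen3-1.7B",
    "actor_rollout_ref.actor.optim.lr=1e-6",
    "actor_rollout_ref.actor.ppo_mini_batch_size=80",
    "actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=20",
    "actor_rollout_ref.actor.use_kl_loss=True",
    "actor_rollout_ref.actor.kl_loss_coef=0.001",
    "actor_rollout_ref.actor.kl_loss_type=low_var_kl",
    "actor_rollout_ref.rollout.name=vllm",
    "actor_rollout_ref.rollout.n=3",
    "actor_rollout_ref.rollout.gpu_memory_utilization=0.6",
    "trainer.project_name=verl_grpo_example_gsm8k",
    "trainer.experiment_name=qwen3_1.7b_1x_h100_function_rm",
    "trainer.n_gpus_per_node=1",
    "trainer.nnodes=1",
    "trainer.save_freq=20",
    "trainer.test_freq=5",
    "trainer.total_epochs=5",
]

# Launch the trainer with the reconstructed overrides.
subprocess.run(["python3", "-m", "verl.trainer.main_ppo", *overrides], check=True)
```

Any other keys in the config can be passed the same way if your local defaults differ.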
