Skip to content

Unexpected Performance for Qwen2.5-7B-MAth #46

@BodongDu

Description

@BodongDu

This is my script:

------------------------------------------------------------

DATE=$(date +%m%d)
TIME_TAG=$(date +%H%M%S)

TASK="AIME-TTT"
BACKBONE="Qwen2.5-MAth-7B"
ADVANTAGE="grpo"

K=3
MAX_PROMPT_LENGTH=512
MAX_RESPONSE_LENGTH=$((1024 * $K))
if [ "$K" -gt 8 ]; then
N=4
else
N=16
fi

EPISODE=80
DATA_TRAIN_BATCH_SIZE=8
N_VOTES_PER_PROMPT=64 #64
N_SAMPLES_PER_PROMPT=32 #32
MINI_BATCH_SIZE=1
MICRO_BATCH_SIZE=2

DATA_LOCAL_DIR="/suanfs/usrhome/bduag/TTRL/verl/data"
BACKBONE_PATH="/suanfs/usrhome/bduag/${BACKBONE}"

MODEL="${TASK}-${BACKBONE}"
EXPERIMENT="TTRL-Len@${K}k"

WANDB_PROJECT="TTRL-verl"
LOG_NAME="${DATE}-${EXPERIMENT}-${MODEL}-${ADVANTAGE}"
OUTPUT_DIR="checkpoints/${WANDB_PROJECT}/${MODEL}/${DATE}/${EXPERIMENT}-${ADVANTAGE}-${TIME_TAG}"

------------------------------------------------------------

python -m verl.trainer.main_ppo
--config-name='ppo_trainer_ttrl.yaml'
data.train_files=["$DATA_LOCAL_DIR/$TASK/train.parquet"]
data.val_files=["$DATA_LOCAL_DIR/$TASK/test.parquet"]
data.max_prompt_length=$MAX_PROMPT_LENGTH
data.max_response_length=$MAX_RESPONSE_LENGTH
data.train_batch_size=$DATA_TRAIN_BATCH_SIZE
data.filter_overlong_prompts=True
data.truncation='error'
+data.suffix_prompt='"\nPlease reason step by step, and put your final answer within \boxed{}."'
actor_rollout_ref.model.path=$BACKBONE_PATH
actor_rollout_ref.model.enable_gradient_checkpointing=True
actor_rollout_ref.model.use_remove_padding=True
actor_rollout_ref.actor.ppo_mini_batch_size=$MINI_BATCH_SIZE
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=$MICRO_BATCH_SIZE
actor_rollout_ref.actor.use_kl_loss=True
actor_rollout_ref.actor.optim.lr=5e-7
actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.03
actor_rollout_ref.actor.optim.warmup_style='cosine'
actor_rollout_ref.actor.fsdp_config.param_offload=True
actor_rollout_ref.actor.fsdp_config.optimizer_offload=True
actor_rollout_ref.actor.ppo_max_token_len_per_gpu=$((MAX_PROMPT_LENGTH + MAX_RESPONSE_LENGTH))
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=$MICRO_BATCH_SIZE
actor_rollout_ref.ref.fsdp_config.param_offload=True
actor_rollout_ref.rollout.name=vllm
actor_rollout_ref.rollout.temperature=0.6
actor_rollout_ref.rollout.enforce_eager=False
actor_rollout_ref.rollout.free_cache_engine=True
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=$MICRO_BATCH_SIZE
actor_rollout_ref.rollout.tensor_model_parallel_size=1
actor_rollout_ref.rollout.gpu_memory_utilization=0.6
actor_rollout_ref.rollout.n=$N_SAMPLES_PER_PROMPT
actor_rollout_ref.rollout.val_kwargs.do_sample=True
actor_rollout_ref.rollout.val_kwargs.n=$N
actor_rollout_ref.rollout.val_kwargs.top_p=0.95
actor_rollout_ref.rollout.val_kwargs.temperature=0.4
actor_rollout_ref.rollout.max_model_len=$((MAX_PROMPT_LENGTH + MAX_RESPONSE_LENGTH))
actor_rollout_ref.rollout.max_num_batched_tokens=$((MAX_PROMPT_LENGTH + MAX_RESPONSE_LENGTH))
critic.optim.lr=9e-6
critic.model.use_remove_padding=True
critic.model.path=$BACKBONE_PATH
critic.model.enable_gradient_checkpointing=True
critic.ppo_micro_batch_size_per_gpu=$MICRO_BATCH_SIZE
critic.model.fsdp_config.param_offload=True
critic.model.fsdp_config.optimizer_offload=True
algorithm.kl_ctrl.kl_coef=0.00
algorithm.adv_estimator=$ADVANTAGE
custom_reward_function.path="./verl/utils/reward_score/ttrl_math/init.py"
custom_reward_function.name=reward_func
ttrl.enable=True
ttrl.n_votes_per_prompt=$N_VOTES_PER_PROMPT
ttrl.n_samples_per_prompt=$N_SAMPLES_PER_PROMPT
trainer.logger=['console','wandb']
trainer.project_name=$WANDB_PROJECT
trainer.experiment_name=$LOG_NAME
trainer.n_gpus_per_node=4
trainer.nnodes=1
trainer.save_freq=2000000
trainer.test_freq=2
trainer.max_actor_ckpt_to_keep=0
trainer.max_critic_ckpt_to_keep=0
trainer.default_local_dir=$OUTPUT_DIR
trainer.total_epochs=$EPISODE "$@"

echo "Output directory: $OUTPUT_DIR"

This is my result:

Image

I need your help, thx!

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions