#!/bin/bash
set -euo pipefail

###############################################################################
# run_distill_stage2_8gpu_accelerate_ds_zero2.sh
# 8-GPU stage2 distillation training for OpenVLA via Accelerate + DeepSpeed
# ZeRO-2 variant kept conservative on micro-batch to maximize launch stability.
###############################################################################

# ======================== Conda environment ========================
CONDA_BASE="${CONDA_BASE:-/opt/conda}"
OPENVLA_ENV="${OPENVLA_ENV:-/mnt/afs/lixiaoou/intern/wanyang/envs/openvla-triton-test}"
source "${CONDA_BASE}/etc/profile.d/conda.sh"
conda activate "${OPENVLA_ENV}"

# ======================== GPU configuration ========================
export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0,1,2,3,4,5,6,7}"
num_gpus=8
main_process_port="${MAIN_PROCESS_PORT:-29522}"

# ======================== Performance-related environment variables ========================
export TOKENIZERS_PARALLELISM="${TOKENIZERS_PARALLELISM:-false}"
export WANDB_MODE="${WANDB_MODE:-disabled}"

# Persistent autotune cache can fail on long kernel signatures in this env.
# Keep autotune enabled in-process, but disable on-disk persistence by default.
export TRITON_ENABLE_PERSISTENT_AUTOTUNE_CONFIGS="${TRITON_ENABLE_PERSISTENT_AUTOTUNE_CONFIGS:-0}"
if [[ "${TRITON_ENABLE_PERSISTENT_AUTOTUNE_CONFIGS}" == "1" ]]; then
  export TRITON_AUTOTUNE_CONFIG_PATH="${TRITON_AUTOTUNE_CONFIG_PATH:-/tmp/triton_autotune_ovla}"
  mkdir -p "${TRITON_AUTOTUNE_CONFIG_PATH}"
else
  unset TRITON_AUTOTUNE_CONFIG_PATH || true
fi

# This run must exercise the real Triton path, so clear any torch-fallback
# overrides left over from earlier experiments.
unset OPENVLA_GATED_DELTA_FORCE_TORCH || true
unset OPENVLA_GATED_DELTA_FORCE_TORCH_CONV || true

# Keep the default teacher/student loading paths; clear override variables
# carried over from other experiments.
unset OPENVLA_TEACHER_FORCE_CPU_LOAD || true
unset OPENVLA_TEACHER_DISABLE_DEVICE_MAP || true

# ======================== Paths ========================
repo_root="/mnt/afs/lixiaoou/intern/wanyang/openvla"
script_dir="${repo_root}/vla-scripts"
data_root="/mnt/afs/lixiaoou/intern/wanyang"
run_root="${script_dir}/runs"
teacher_vla_path="/mnt/afs/lixiaoou/intern/wanyang/openvla-7b-model"
deepspeed_config_file="${DEEPSPEED_CONFIG_FILE:-${script_dir}/deepspeed_stage2_zero2.json}"
zero3_init_flag="${ZERO3_INIT_FLAG:-false}"
accelerate_bin="${ACCELERATE_BIN:-${CONDA_PREFIX:-}/bin/accelerate}"
python_bin="${PYTHON_BIN:-${CONDA_PREFIX:-}/bin/python3.10}"
if [[ ! -x "${python_bin}" && -n "${CONDA_PREFIX:-}" && -x "${CONDA_PREFIX}/bin/python" ]]; then
  python_bin="${CONDA_PREFIX}/bin/python"
fi

# Student initialization mode. Options: random / teacher / local_checkpoint
student_init_mode="local_checkpoint"
student_checkpoint_path="/mnt/afs/lixiaoou/intern/wanyang/openvla/vla-scripts/runs/distill-stage1-from-openvla-7b-model-to-student-balanced+dset-bridge_orig+b8+lr-0.0002+law-1.0+mimic-1--with_ab/step-65000"

cd "${repo_root}"
export PYTHONPATH="${repo_root}:${PYTHONPATH:-}"

if [[ ! -x "${accelerate_bin}" ]]; then
  echo "Accelerate launcher not found: ${accelerate_bin}" >&2
  exit 1
fi
if [[ ! -f "${deepspeed_config_file}" ]]; then
  echo "DeepSpeed config not found: ${deepspeed_config_file}" >&2
  exit 1
fi
if [[ ! -d "${teacher_vla_path}" ]]; then
  echo "Teacher checkpoint does not exist: ${teacher_vla_path}" >&2
  exit 1
fi
if [[ "${student_init_mode}" == "local_checkpoint" && ! -d "${student_checkpoint_path}" ]]; then
  echo "Student checkpoint does not exist: ${student_checkpoint_path}" >&2
  exit 1
fi
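# ======================== Training arguments ========================
# Arguments forwarded to distill_train_stage2.py. The concrete flag set is
# specific to that training script and to each run, so it is not hard-coded
# here: populate this array as needed, or append run-specific flags on the
# command line (they are forwarded via "$@" below). Defining the array even
# when empty keeps "${default_args[@]}" safe under `set -u` on older bash.
default_args=()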
-d "${student_checkpoint_path}" ]]; then echo "Student checkpoint does not exist: ${student_checkpoint_path}" >&2 exit 1 fi "${python_bin}" - <}" echo " GPUs: ${CUDA_VISIBLE_DEVICES}" echo " Triton persistent autotune: ${TRITON_ENABLE_PERSISTENT_AUTOTUNE_CONFIGS}" echo " Triton autotune cache: ${TRITON_AUTOTUNE_CONFIG_PATH:-}" echo " Teacher: ${teacher_vla_path}" echo " Student init mode: ${student_init_mode}" if [[ "${student_init_mode}" == "local_checkpoint" ]]; then echo " Student checkpoint: ${student_checkpoint_path}" else echo " Student checkpoint: N/A" fi echo " Data root: ${data_root}" echo " DeepSpeed config: ${deepspeed_config_file}" echo " ZeRO-3 init flag (should be false): ${zero3_init_flag}" echo " Main process port: ${main_process_port}" echo "============================================================" "${accelerate_bin}" launch \ --num_processes "${num_gpus}" \ --num_machines 1 \ --mixed_precision bf16 \ --main_process_port "${main_process_port}" \ --use_deepspeed \ --deepspeed_config_file "${deepspeed_config_file}" \ --zero3_init_flag "${zero3_init_flag}" \ vla-scripts/distill_train_stage2.py \ "${default_args[@]}" \ "$@"