#!/bin/bash
set -euo pipefail

###############################################################################
# run_distill_stage2_8gpu_accelerate_ds_zero2.sh
# 8-GPU stage2 distillation training for OpenVLA via Accelerate + DeepSpeed
# ZeRO-2 variant kept conservative on micro-batch to maximize launch stability.
###############################################################################

# ======================== Conda 环境 ========================
CONDA_BASE="${CONDA_BASE:-/opt/conda}"
OPENVLA_ENV="${OPENVLA_ENV:-/mnt/afs/lixiaoou/intern/wanyang/envs/openvla-triton-test}"
source "${CONDA_BASE}/etc/profile.d/conda.sh"
conda activate "${OPENVLA_ENV}"

# ======================== GPU 配置 ========================
export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0,1,2,3,4,5,6,7}"
num_gpus=8
main_process_port="${MAIN_PROCESS_PORT:-29522}"

# ======================== 性能优化环境变量 ========================
export TOKENIZERS_PARALLELISM="${TOKENIZERS_PARALLELISM:-false}"
export WANDB_MODE="${WANDB_MODE:-disabled}"
# Persistent autotune cache can fail on long kernel signatures in this env.
# Keep autotune enabled in-process, but disable on-disk persistence by default.
export TRITON_ENABLE_PERSISTENT_AUTOTUNE_CONFIGS="${TRITON_ENABLE_PERSISTENT_AUTOTUNE_CONFIGS:-0}"
if [[ "${TRITON_ENABLE_PERSISTENT_AUTOTUNE_CONFIGS}" == "1" ]]; then
    export TRITON_AUTOTUNE_CONFIG_PATH="${TRITON_AUTOTUNE_CONFIG_PATH:-/tmp/triton_autotune_ovla}"
    mkdir -p "${TRITON_AUTOTUNE_CONFIG_PATH}"
else
    unset TRITON_AUTOTUNE_CONFIG_PATH || true
fi

# 这轮测试需要真实走 Triton 路径，避免误吃到之前的 torch fallback 环境变量。
unset OPENVLA_GATED_DELTA_FORCE_TORCH || true
unset OPENVLA_GATED_DELTA_FORCE_TORCH_CONV || true

# 保持 teacher/student 默认加载路径，避免沿用其他实验里的覆盖变量。
unset OPENVLA_TEACHER_FORCE_CPU_LOAD || true
unset OPENVLA_TEACHER_DISABLE_DEVICE_MAP || true

# ======================== 路径 ========================
repo_root="/mnt/afs/lixiaoou/intern/wanyang/openvla"
script_dir="${repo_root}/vla-scripts"
data_root="/mnt/afs/lixiaoou/intern/wanyang"
run_root="${script_dir}/runs"
teacher_vla_path="/mnt/afs/lixiaoou/intern/wanyang/openvla-7b-model"
deepspeed_config_file="${DEEPSPEED_CONFIG_FILE:-${script_dir}/deepspeed_stage2_zero2.json}"
zero3_init_flag="${ZERO3_INIT_FLAG:-false}"
accelerate_bin="${ACCELERATE_BIN:-${CONDA_PREFIX:-}/bin/accelerate}"
python_bin="${PYTHON_BIN:-${CONDA_PREFIX:-}/bin/python3.10}"

if [[ ! -x "${python_bin}" && -n "${CONDA_PREFIX:-}" && -x "${CONDA_PREFIX}/bin/python" ]]; then
    python_bin="${CONDA_PREFIX}/bin/python"
fi

# student 初始化方式
student_init_mode="local_checkpoint"   # 可选: random / teacher / local_checkpoint
student_checkpoint_path="/mnt/afs/lixiaoou/intern/wanyang/openvla/vla-scripts/runs/distill-stage1-from-openvla-7b-model-to-student-balanced+dset-bridge_orig+b8+lr-0.0002+law-1.0+mimic-1--with_ab/step-65000"

cd "${repo_root}"
export PYTHONPATH="${repo_root}:${PYTHONPATH:-}"

if [[ ! -x "${accelerate_bin}" ]]; then
    echo "Accelerate launcher not found: ${accelerate_bin}" >&2
    exit 1
fi

if [[ ! -f "${deepspeed_config_file}" ]]; then
    echo "DeepSpeed config not found: ${deepspeed_config_file}" >&2
    exit 1
fi

if [[ ! -d "${teacher_vla_path}" ]]; then
    echo "Teacher checkpoint does not exist: ${teacher_vla_path}" >&2
    exit 1
fi

if [[ "${student_init_mode}" == "local_checkpoint" && ! -d "${student_checkpoint_path}" ]]; then
    echo "Student checkpoint does not exist: ${student_checkpoint_path}" >&2
    exit 1
fi

"${python_bin}" - <<PY
import importlib.util
missing = [name for name in ("accelerate", "deepspeed") if importlib.util.find_spec(name) is None]
if missing:
    raise SystemExit(
        "Missing required packages for Accelerate + DeepSpeed launch: "
        + ", ".join(missing)
        + ". Please install them into the openvla environment first."
    )
print("Accelerate + DeepSpeed dependencies found.")
PY

"${python_bin}" - <<PY
from prismatic.extern.hf.checkpoint_utils import sync_openvla_checkpoint_artifacts

teacher_ckpt = "${teacher_vla_path}"
teacher_copied = sync_openvla_checkpoint_artifacts(teacher_ckpt, include_dataset_statistics_from_parent=True)
if teacher_copied:
    print(f"Prepared teacher checkpoint {teacher_ckpt} with: {', '.join(teacher_copied)}")
else:
    print(f"Teacher checkpoint {teacher_ckpt} already had the required local artifacts.")

student_mode = "${student_init_mode}"
student_ckpt = "${student_checkpoint_path}"
if student_mode == "local_checkpoint":
    student_copied = sync_openvla_checkpoint_artifacts(student_ckpt, include_dataset_statistics_from_parent=True)
    if student_copied:
        print(f"Prepared student checkpoint {student_ckpt} with: {', '.join(student_copied)}")
    else:
        print(f"Student checkpoint {student_ckpt} already had the required local artifacts.")
PY

# ======================== 训练参数 ========================
default_args=(
    # 模型
    --teacher_vla_path                 "${teacher_vla_path}"
    --student_config_strategy          "balanced"
    --gated_delta_mimic_init           "False"
    --student_init_mode                "${student_init_mode}"
    --student_train_strategy           "llm_backbone"

    # stage2 蒸馏
    --distillation_temperature         "2.0"
    --ce_loss_weight                   "1.0"
    --kl_weight                        "1.0"

    # 数据
    --data_root_dir                    "${data_root}"
    --dataset_name                     "bridge_orig"
    --run_root_dir                     "${run_root}"

    # 训练
    --batch_size                       "1"
    --grad_accumulation_steps          "8"
    --max_steps                        "200000"
    --save_steps                       "100"
    --save_model_steps                 "5000"
    --learning_rate                    "1e-4"
    --weight_decay                     "0.01"
    --max_grad_norm                    "1.0"
    --image_aug                        "True"
    --use_amp                          "True"

    # 数据加载
    --shuffle_buffer_size              "100000"
    --num_workers                      "4"
    --pin_memory                       "True"
    --persistent_workers               "True"
    --prefetch_factor                  "4"
    --train_action_metrics_interval    "0"
    --train_batch_log_interval         "0"

    # resume / 保存
    --restore_training_state           "False"
    --resume_skip_batches              "False"
    --strict_resume                    "False"

    # W&B
    --use_wandb                        "True"
    --wandb_project                    "openvla-distill"
    --wandb_batch_log_interval         "0"
    --wandb_step_log_interval          "10"

    # 显存 / 加载
    --local_files_only                 "True"
    --teacher_load_in_4bit             "False"
    --teacher_load_in_8bit             "False"
    --gradient_checkpointing           "True"
    --use_8bit_optimizer               "False"
)

if [[ "${student_init_mode}" == "local_checkpoint" ]]; then
    default_args+=(
        --student_checkpoint_path      "${student_checkpoint_path}"
    )
fi

echo "============================================================"
echo " Launching 8-GPU stage2 distillation via Accelerate + DeepSpeed (ZeRO-2)"
echo " OpenVLA env: ${OPENVLA_ENV}"
echo " Conda prefix: ${CONDA_PREFIX:-<unset>}"
echo " GPUs: ${CUDA_VISIBLE_DEVICES}"
echo " Triton persistent autotune: ${TRITON_ENABLE_PERSISTENT_AUTOTUNE_CONFIGS}"
echo " Triton autotune cache: ${TRITON_AUTOTUNE_CONFIG_PATH:-<disabled>}"
echo " Teacher: ${teacher_vla_path}"
echo " Student init mode: ${student_init_mode}"
if [[ "${student_init_mode}" == "local_checkpoint" ]]; then
    echo " Student checkpoint: ${student_checkpoint_path}"
else
    echo " Student checkpoint: N/A"
fi
echo " Data root: ${data_root}"
echo " DeepSpeed config: ${deepspeed_config_file}"
echo " ZeRO-3 init flag (should be false): ${zero3_init_flag}"
echo " Main process port: ${main_process_port}"
echo "============================================================"

"${accelerate_bin}" launch \
    --num_processes "${num_gpus}" \
    --num_machines 1 \
    --mixed_precision bf16 \
    --main_process_port "${main_process_port}" \
    --use_deepspeed \
    --deepspeed_config_file "${deepspeed_config_file}" \
    --zero3_init_flag "${zero3_init_flag}" \
    vla-scripts/distill_train_stage2.py \
    "${default_args[@]}" \
    "$@"