run sh: `/root/data/envs/ms-swift/bin/python3.10 -m torch.distributed.run --nproc_per_node 1 /root/data/envs/ms-swift/lib/python3.10/site-packages/swift/cli/sft.py --model /root/data/internlm2_5-1_8b-chat --train_type lora --dataset /root/data/datasets/output.jsonl --model_type internlm2 --torch_dtype bfloat16 --num_train_epochs 4 --per_device_train_batch_size 4 --learning_rate 5e-5 --warmup_ratio 0.1 --split_dataset_ratio 0 --lora_rank 8 --lora_alpha 32 --target_modules all-linear --gradient_accumulation_steps 2 --save_steps 2000 --save_total_limit 5 --gradient_checkpointing_kwargs {"use_reentrant": false} --logging_steps 5 --max_length 2048 --output_dir ./swift_output/InternLM2.5-1.8B-Lora --dataloader_num_workers 256 --model_author JimmyMa99 --model_name InternLM2.5-1.8B-Lora`
/root/data/envs/ms-swift/lib/python3.10/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: 'libjpeg.so.62: cannot open shared object file: No such file or directory'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source?
  warn(
/root/data/envs/ms-swift/lib/python3.10/site-packages/torchvision/datapoints/__init__.py:12: UserWarning: The torchvision.datapoints and torchvision.transforms.v2 namespaces are still Beta. While we do not expect major breaking changes, some APIs may still change according to user feedback. Please submit any feedback you may have in this issue: https://github.com/pytorch/vision/issues/6753, and you can also check out https://github.com/pytorch/vision/issues/7319 to learn more about the APIs that we suspect might involve future changes. You can silence this warning by calling torchvision.disable_beta_transforms_warning().
  warnings.warn(_BETA_TRANSFORMS_WARNING)
/root/data/envs/ms-swift/lib/python3.10/site-packages/torchvision/transforms/v2/__init__.py:54: UserWarning: The torchvision.datapoints and torchvision.transforms.v2 namespaces are still Beta. While we do not expect major breaking changes, some APIs may still change according to user feedback. Please submit any feedback you may have in this issue: https://github.com/pytorch/vision/issues/6753, and you can also check out https://github.com/pytorch/vision/issues/7319 to learn more about the APIs that we suspect might involve future changes. You can silence this warning by calling torchvision.disable_beta_transforms_warning().
  warnings.warn(_BETA_TRANSFORMS_WARNING)
[INFO:swift] Successfully registered `/root/data/envs/ms-swift/lib/python3.10/site-packages/swift/llm/dataset/data/dataset_info.json`.
[INFO:swift] rank: 0, local_rank: 0, world_size: 1, local_world_size: 1
[INFO:swift] Loading the model using model_dir: /root/data/internlm2_5-1_8b-chat
[INFO:swift] Setting args.lazy_tokenize: False
/root/data/envs/ms-swift/lib/python3.10/site-packages/transformers/training_args.py:1575: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers.
Use `eval_strategy` instead warnings.warn( [INFO:swift] output_dir: /root/data/swift_output/InternLM2.5-1.8B-Lora/v5-20250706-231402 [INFO:swift] Global seed set to 42 [INFO:swift] args: TrainArguments( _n_gpu=-1, acc_steps=1, acc_strategy=token, accelerator_config={'dispatch_batches': False}, adafactor=False, adalora_beta1=0.85, adalora_beta2=0.85, adalora_deltaT=1, adalora_init_r=12, adalora_orth_reg_weight=0.5, adalora_target_r=8, adalora_tfinal=0, adalora_tinit=0, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, adapter_act=gelu, adapter_length=128, adapters=[], add_version=True, agent_template=None, aligner_lr=None, attn_impl=None, auto_find_batch_size=False, average_tokens_across_devices=False, batch_eval_metrics=False, bf16=True, bf16_full_eval=False, bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_quant_storage=None, bnb_4bit_quant_type=nf4, bnb_4bit_use_double_quant=True, boft_block_num=0, boft_block_size=4, boft_dropout=0.0, boft_n_butterfly_factor=1, channels=None, check_model=True, ckpt_dir=None, columns={}, create_checkpoint_symlink=False, custom_dataset_info=[], custom_register_path=[], data_seed=42, dataloader_drop_last=False, dataloader_num_workers=256, dataloader_persistent_workers=False, dataloader_pin_memory=True, dataloader_prefetch_factor=None, dataset=['/root/data/datasets/output.jsonl'], dataset_num_proc=1, dataset_shuffle=True, ddp_backend=None, ddp_broadcast_buffers=None, ddp_bucket_cap_mb=None, ddp_find_unused_parameters=None, ddp_timeout=18000000, debug=None, deepspeed=None, device_map=None, disable_tqdm=None, dispatch_batches=None, do_eval=False, do_predict=False, do_train=False, download_mode=reuse_dataset_if_exists, eval_accumulation_steps=None, eval_datasets=[], eval_datasets_args=None, eval_delay=0, eval_do_concat_batches=True, eval_generation_config=None, eval_limit=None, eval_on_start=False, eval_steps=2000.0, eval_strategy=steps, eval_use_evalscope=False, eval_use_gather_object=False, evaluation_strategy=steps, external_plugins=[], fourier_n_frequency=2000, fourier_scaling=300.0, fp16=False, fp16_backend=auto, fp16_full_eval=False, fp16_opt_level=O1, freeze_aligner=True, freeze_llm=False, freeze_parameters=[], freeze_parameters_ratio=0.0, freeze_parameters_regex=None, freeze_vit=True, fsdp=, fsdp_config=None, fsdp_min_num_params=0, fsdp_num=1, fsdp_transformer_layer_cls_to_wrap=None, full_determinism=False, galore_cos_threshold=0.4, galore_gamma_proj=2, galore_optim_per_parameter=False, galore_proj_bits=4, galore_proj_group_size=256, galore_proj_quant=False, galore_proj_type=std, galore_quantization=False, galore_queue_size=5, galore_rank=128, galore_scale=1.0, galore_target_modules=None, galore_update_proj_gap=50, galore_with_embedding=False, generation_config=None, generation_max_length=None, generation_num_beams=None, gradient_accumulation_steps=2, gradient_checkpointing=True, gradient_checkpointing_kwargs={"use_reentrant": false}, greater_is_better=False, group_by_length=False, half_precision_backend=auto, hqq_axis=None, hub_always_push=False, hub_model_id=None, hub_private_repo=None, hub_strategy=every_save, hub_token=, ignore_args_error=False, ignore_data_skip=False, include_for_metrics=[], include_inputs_for_metrics=False, include_num_input_tokens_seen=False, include_tokens_per_second=False, init_strategy=None, init_weights=True, interleave_prob=None, jit_mode_eval=False, label_names=None, label_smoothing_factor=0.0, lazy_tokenize=False, learning_rate=5e-05, length_column_name=length, lisa_activated_layers=0, lisa_step_interval=20, 
llamapro_num_groups=None, llamapro_num_new_blocks=4, load_args=False, load_best_model_at_end=False, load_data_args=False, load_from_cache_file=True, local_rank=0, local_repo_path=None, log_level=passive, log_level_replica=warning, log_on_each_node=True, logging_dir=/root/data/swift_output/InternLM2.5-1.8B-Lora/v5-20250706-231402/runs, logging_first_step=True, logging_nan_inf_filter=True, logging_steps=5, logging_strategy=steps, logprobs=False, lora_alpha=32, lora_bias=none, lora_dropout=0.05, lora_dtype=None, lora_ga_batch_size=2, lora_ga_direction=ArB2r, lora_ga_iters=2, lora_ga_max_length=1024, lora_ga_scale=stable, lora_ga_stable_gamma=16, lora_modules=[], lora_rank=8, lorap_lr_ratio=None, loss_scale=default, loss_type=None, lr_scheduler_kwargs=None, lr_scheduler_type=cosine, max_epochs=None, max_grad_norm=1.0, max_length=2048, max_memory={}, max_new_tokens=64, max_pixels=None, max_steps=-1, metric=None, metric_for_best_model=loss, metric_warmup_step=0, model=/root/data/internlm2_5-1_8b-chat, model_author=['JimmyMa99'], model_kwargs={}, model_name=['InternLM2.5-1.8B-Lora'], model_revision=None, model_type=internlm2, modules_to_save=[], mp_parameters=, neftune_noise_alpha=None, no_cuda=False, norm_bbox=None, num_beams=1, num_labels=None, num_train_epochs=4.0, optim=adamw_torch, optim_args=None, optim_target_modules=None, optimizer=None, output_dir=/root/data/swift_output/InternLM2.5-1.8B-Lora/v5-20250706-231402, overwrite_output_dir=False, packing=False, packing_cache=None, padding_free=False, padding_side=right, past_index=-1, per_device_eval_batch_size=1, per_device_train_batch_size=4, predict_with_generate=False, prediction_loss_only=False, problem_type=None, push_to_hub=False, push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=, quant_bits=None, quant_method=None, ray_scope=last, reft_args=None, reft_intervention_type=LoreftIntervention, reft_layer_key=None, reft_layers=None, reft_rank=4, remove_unused_columns=True, repetition_penalty=None, report_to=['tensorboard'], response_prefix=None, restore_callback_states_from_checkpoint=False, resume_from_checkpoint=None, resume_only_model=False, rope_scaling=None, run_name=/root/data/swift_output/InternLM2.5-1.8B-Lora/v5-20250706-231402, save_on_each_node=False, save_only_model=False, save_safetensors=True, save_steps=2000.0, save_strategy=steps, save_total_limit=5, seed=42, sequence_parallel_size=1, shuffle_buffer_size=1000, skip_memory_metrics=True, sortish_sampler=False, split_batches=None, split_dataset_ratio=0.0, stop_words=[], stopping_strategy=first_exhausted, stream=False, streaming=False, strict=False, swanlab_exp_name=None, swanlab_mode=cloud, swanlab_project=None, swanlab_token=, swanlab_workspace=None, system=None, target_modules=['all-linear'], target_regex=None, task_type=causal_lm, temperature=0.0, template=internlm2, template_backend=swift, tf32=None, top_k=None, top_logprobs=None, top_p=None, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, torch_dtype=torch.bfloat16, torch_empty_cache_steps=None, torchdynamo=None, tpu_metrics_debug=False, tpu_num_cores=None, train_dataloader_shuffle=True, train_type=lora, trainable_parameters=[], trainable_parameters_regex=None, truncation_strategy=delete, tuner_backend=peft, use_chat_template=True, use_cpu=False, use_dora=False, use_galore=False, use_hf=False, use_ipex=False, use_legacy_prediction_loop=False, use_liger_kernel=False, use_logits_to_keep=None, use_mps_device=False, use_rslora=False, use_swift_lora=False, val_dataset=[], 
val_dataset_shuffle=False, vera_d_initial=0.1, vera_dropout=0.0, vera_projection_prng_key=0, vera_rank=256, vit_gradient_checkpointing=None, vit_lr=None, warmup_ratio=0.1, warmup_steps=0, weight_decay=0.1, zero_hpz_partition_size=None, ) [INFO:swift] Loading the model using model_dir: /root/data/internlm2_5-1_8b-chat [INFO:swift] model_kwargs: {'device_map': 'cuda:0'} Loading checkpoint shards: 0%| | 0/2 [00:00<|im_start|>system ไฝ ๆ˜ฏไธชไผ˜็ง€็š„่ฎบๆ–‡ๅˆ†็ฑปๅธˆ<|im_end|> <|im_start|>user Based on the title 'Flavor Physics in SUSY at large tan(beta)', authors 'Paride Paradisi', and abstract 'We discuss the phenomenological impact of a particularly interesting corner of the MSSM: the large tan(beta) regime. The capabilities of leptonic and hadronic Flavor Violating processes in shedding light on physics beyond the Standard Model are reviewed. Moreover, we show that tests of Lepton Universality in charged current processes can represent an interesting handle to obtain relevant information on New Physics scenarios.', please determine the scientific category of this paper. A. quant-ph B. physics.chem-ph C. physics.atom-ph D. cond-mat.soft E. cs.RO F. cs.CL G. cs.SE H. cs.IR I. hep-th J. hep-ph K. physics.optics L. cs.AI M. cs.CV N. nucl-th O. astro-ph P. math.PR Q. cs.OS R. eess.SP S. math.OC T. math.DS U. math.DG V. math.MP W. cs.MM X. stat.ME Y. math.CO Z. cs.NE<|im_end|> <|im_start|>assistant J<|im_end|> [INFO:swift] [LABELS_IDS] [-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 338, 92542] [INFO:swift] [LABELS] [-100 * 297]J<|im_end|> Map: 0%| | 0/1000 [00:00, auto_mapping=None, base_model_name_or_path='/root/data/internlm2_5-1_8b-chat', revision=None, inference_mode=False, r=8, target_modules={'w3', 'w1', 'wo', 'wqkv', 'w2'}, exclude_modules=None, lora_alpha=32, lora_dropout=0.05, 
fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=[], init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', trainable_token_indices=None, loftq_config={}, eva_config=None, corda_config=None, use_dora=False, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False), lora_bias=False, lora_dtype=None, lorap_lr_ratio=None, lorap_emb_lr=1e-06) [INFO:swift] model: PeftModelForCausalLM( (base_model): LoraModel( (model): InternLM2ForCausalLM( (model): InternLM2Model( (tok_embeddings): Embedding(92544, 2048, padding_idx=2) (layers): ModuleList( (0-23): 24 x InternLM2DecoderLayer( (attention): InternLM2Attention( (wqkv): lora.Linear( (base_layer): Linear(in_features=2048, out_features=4096, bias=False) (lora_dropout): ModuleDict( (default): Dropout(p=0.05, inplace=False) ) (lora_A): ModuleDict( (default): Linear(in_features=2048, out_features=8, bias=False) ) (lora_B): ModuleDict( (default): Linear(in_features=8, out_features=4096, bias=False) ) (lora_embedding_A): ParameterDict() (lora_embedding_B): ParameterDict() (lora_magnitude_vector): ModuleDict() ) (wo): lora.Linear( (base_layer): Linear(in_features=2048, out_features=2048, bias=False) (lora_dropout): ModuleDict( (default): Dropout(p=0.05, inplace=False) ) (lora_A): ModuleDict( (default): Linear(in_features=2048, out_features=8, bias=False) ) (lora_B): ModuleDict( (default): Linear(in_features=8, out_features=2048, bias=False) ) (lora_embedding_A): ParameterDict() (lora_embedding_B): ParameterDict() (lora_magnitude_vector): ModuleDict() ) (rotary_emb): InternLM2DynamicNTKScalingRotaryEmbedding() ) (feed_forward): InternLM2MLP( (w1): lora.Linear( (base_layer): Linear(in_features=2048, out_features=8192, bias=False) (lora_dropout): ModuleDict( (default): Dropout(p=0.05, inplace=False) ) (lora_A): ModuleDict( (default): Linear(in_features=2048, out_features=8, bias=False) ) (lora_B): ModuleDict( (default): Linear(in_features=8, out_features=8192, bias=False) ) (lora_embedding_A): ParameterDict() (lora_embedding_B): ParameterDict() (lora_magnitude_vector): ModuleDict() ) (w3): lora.Linear( (base_layer): Linear(in_features=2048, out_features=8192, bias=False) (lora_dropout): ModuleDict( (default): Dropout(p=0.05, inplace=False) ) (lora_A): ModuleDict( (default): Linear(in_features=2048, out_features=8, bias=False) ) (lora_B): ModuleDict( (default): Linear(in_features=8, out_features=8192, bias=False) ) (lora_embedding_A): ParameterDict() (lora_embedding_B): ParameterDict() (lora_magnitude_vector): ModuleDict() ) (w2): lora.Linear( (base_layer): Linear(in_features=8192, out_features=2048, bias=False) (lora_dropout): ModuleDict( (default): Dropout(p=0.05, inplace=False) ) (lora_A): ModuleDict( (default): Linear(in_features=8192, out_features=8, bias=False) ) (lora_B): ModuleDict( (default): Linear(in_features=8, out_features=2048, bias=False) ) (lora_embedding_A): ParameterDict() (lora_embedding_B): ParameterDict() (lora_magnitude_vector): ModuleDict() ) (act_fn): SiLU() ) (attention_norm): InternLM2RMSNorm() (ffn_norm): InternLM2RMSNorm() ) ) (norm): InternLM2RMSNorm() ) (output): Linear(in_features=2048, out_features=92544, bias=False) ) ) ) [INFO:swift] model_parameter_info: PeftModelForCausalLM: 1896.9743M Params (7.8643M Trainable [0.4146%]), 0.0015M Buffers. 
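The `7.8643M Trainable` figure in the last INFO line can be re-derived from the module shapes printed in the dump above. The following is a small sanity-check sketch added for clarity (not part of the log); it assumes standard LoRA parameter counting, i.e. one `(in_features × r)` A matrix and one `(r × out_features)` B matrix per wrapped linear, with no bias terms:

```python
# Re-derive the trainable-parameter count reported by swift for this run.
r = 8            # --lora_rank 8
num_layers = 24  # 24 x InternLM2DecoderLayer in the dump above

# (in_features, out_features) of the linears wrapped by lora.Linear in each decoder layer.
wrapped = {
    "wqkv": (2048, 4096),
    "wo":   (2048, 2048),
    "w1":   (2048, 8192),
    "w3":   (2048, 8192),
    "w2":   (8192, 2048),
}

per_layer = sum(fin * r + r * fout for fin, fout in wrapped.values())
total = per_layer * num_layers
print(f"{total:,} trainable LoRA params = {total / 1e6:.4f}M")  # 7,864,320 = 7.8643M
```

That works out to 7,864,320 parameters, about 0.41% of the 1896.97M total, matching the reported 0.4146%.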
/root/data/envs/ms-swift/lib/python3.10/site-packages/swift/trainers/mixin.py:89: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.
  super().__init__(
[INFO:swift] The logging file will be saved in: /root/data/swift_output/InternLM2.5-1.8B-Lora/v5-20250706-231402/logging.jsonl
/root/data/envs/ms-swift/lib/python3.10/site-packages/torch/utils/data/dataloader.py:624: UserWarning: This DataLoader will create 256 worker processes in total. Our suggested max number of worker in current system is 160, which is smaller than what this DataLoader is going to create. Please be aware that excessive worker creation might get DataLoader running slow or even freeze, lower the worker number to avoid potential slowness/freeze if necessary.
  warnings.warn(
[INFO:swift] last_model_checkpoint: None
[INFO:swift] best_model_checkpoint: None
[INFO:swift] images_dir: /root/data/swift_output/InternLM2.5-1.8B-Lora/v5-20250706-231402/images
[rank0]: Traceback (most recent call last):
[rank0]:   File "/root/data/envs/ms-swift/lib/python3.10/site-packages/swift/cli/sft.py", line 7, in <module>
[rank0]:     sft_main()
[rank0]:   File "/root/data/envs/ms-swift/lib/python3.10/site-packages/swift/llm/train/sft.py", line 269, in sft_main
[rank0]:     return SwiftSft(args).main()
[rank0]:   File "/root/data/envs/ms-swift/lib/python3.10/site-packages/swift/llm/base.py", line 49, in main
[rank0]:     result = self.run()
[rank0]:   File "/root/data/envs/ms-swift/lib/python3.10/site-packages/swift/llm/train/sft.py", line 123, in run
[rank0]:     return self.train(trainer)
[rank0]:   File "/root/data/envs/ms-swift/lib/python3.10/site-packages/swift/llm/train/sft.py", line 184, in train
[rank0]:     trainer.train(trainer.args.resume_from_checkpoint)
[rank0]:   File "/root/data/envs/ms-swift/lib/python3.10/site-packages/swift/trainers/mixin.py", line 379, in train
[rank0]:     res = super().train(*args, **kwargs)
[rank0]:   File "/root/data/envs/ms-swift/lib/python3.10/site-packages/transformers/trainer.py", line 2171, in train
[rank0]:     return inner_training_loop(
[rank0]:   File "/root/data/envs/ms-swift/lib/python3.10/site-packages/transformers/trainer.py", line 2330, in _inner_training_loop
[rank0]:     model, self.optimizer = self.accelerator.prepare(self.model, self.optimizer)
[rank0]:   File "/root/data/envs/ms-swift/lib/python3.10/site-packages/accelerate/accelerator.py", line 1432, in prepare
[rank0]:     result = tuple(
[rank0]:   File "/root/data/envs/ms-swift/lib/python3.10/site-packages/accelerate/accelerator.py", line 1433, in <genexpr>
[rank0]:     self._prepare_one(obj, first_pass=True, device_placement=d) for obj, d in zip(args, device_placement)
[rank0]:   File "/root/data/envs/ms-swift/lib/python3.10/site-packages/accelerate/accelerator.py", line 1281, in _prepare_one
[rank0]:     return self.prepare_model(obj, device_placement=device_placement)
[rank0]:   File "/root/data/envs/ms-swift/lib/python3.10/site-packages/accelerate/accelerator.py", line 1644, in prepare_model
[rank0]:     model = torch.nn.parallel.DistributedDataParallel(
[rank0]:   File "/root/data/envs/ms-swift/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 827, in __init__
[rank0]:     _sync_module_states(
[rank0]:   File "/root/data/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/utils.py", line 323, in _sync_module_states
[rank0]:     _sync_params_and_buffers(process_group, module_states, broadcast_bucket_size, src)
[rank0]:   File "/root/data/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/utils.py", line 334, in _sync_params_and_buffers
[rank0]:     dist._broadcast_coalesced(
[rank0]: TypeError: Input tensor data type is not supported for NCCL process group: BFloat16
[rank0]:[W706 23:14:10.110114050 ProcessGroupNCCL.cpp:1502] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
E0706 23:14:14.672000 1171 envs/ms-swift/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/api.py:869] failed (exitcode: 1) local_rank: 0 (pid: 1175) of binary: /root/data/envs/ms-swift/bin/python3.10
Traceback (most recent call last):
  File "/root/data/envs/ms-swift/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/root/data/envs/ms-swift/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/root/data/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/run.py", line 922, in <module>
    main()
  File "/root/data/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
    return f(*args, **kwargs)
  File "/root/data/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/run.py", line 918, in main
    run(args)
  File "/root/data/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/run.py", line 909, in run
    elastic_launch(
  File "/root/data/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 138, in __call__
    return launch_agent(self._config, self._entrypoint, list(args))
  File "/root/data/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 269, in launch_agent
    raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
============================================================
/root/data/envs/ms-swift/lib/python3.10/site-packages/swift/cli/sft.py FAILED
------------------------------------------------------------
Failures:
  <NO_OTHER_FAILURES>
------------------------------------------------------------
Root Cause (first observed failure):
[0]:
  time : 2025-07-06_23:14:14
  host : ins-m7p8w-698894dd4d-5sbkl
  rank : 0 (local_rank: 0)
  exitcode : 1 (pid: 1175)
  error_file: <N/A>
  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
============================================================
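The failing call is DDP's initial parameter sync (`dist._broadcast_coalesced`) on the bfloat16 weights, which this environment's NCCL process group rejects. The sketch below is a minimal probe, added for illustration and not part of the log; it assumes a single visible GPU and simply attempts a bf16 broadcast over an NCCL group to check whether the installed torch/NCCL build accepts the dtype at all:

```python
# nccl_bf16_probe.py -- hypothetical helper name, not from the log.
# Attempts the same kind of bf16 broadcast over NCCL that DDP performs during setup.
import os
import torch
import torch.distributed as dist

os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29511")

dist.init_process_group(backend="nccl", rank=0, world_size=1)
torch.cuda.set_device(0)

t = torch.ones(8, dtype=torch.bfloat16, device="cuda")
dist.broadcast(t, src=0)  # raises the "not supported for NCCL process group" TypeError if bf16 is unsupported
print("bf16 broadcast OK")

dist.destroy_process_group()
```

If this small broadcast fails with the same TypeError, the bf16 limitation lies in the installed torch/NCCL build itself rather than in the swift training arguments.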