pytorch - RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:3 and cuda:0

I want to fine-tune the Llama 3.1 large language model on a new dataset, but when I try to use multiple GPUs to train the model, I keep getting the following error message:

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, 
cuda:3 and cuda:0!
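
The error mentions cuda:0 and cuda:3 even though I set CUDA_VISIBLE_DEVICES="4,5,6,7"; as far as I know PyTorch just renumbers the visible GPUs from 0, so the model really is spread over all four cards by device_map="auto". The placement can be inspected with something like this (a minimal sketch, using the same checkpoint path as in my script below):

import torch
from transformers import AutoModelForCausalLM

# load the checkpoint the same way as in the training script
model = AutoModelForCausalLM.from_pretrained(
    '/data/llama/llama3.1_8b/LLM-Research/Meta-Llama-3___1-8B',
    device_map="auto",
    torch_dtype=torch.float16,
)
# hf_device_map shows which module ended up on which cuda device
print(model.hf_device_map)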

I thought that the Trainer from Transformers could handle multi-GPU training without DDP or anything like that, but I just can't figure out how to fix this problem. Please help me!
My code is listed below:

import os
import torch
from datasets import Dataset
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq, TrainingArguments, Trainer
from peft import LoraConfig, TaskType, get_peft_model

os.environ["TOKENIZERS_PARALLELISM"] = "true"
os.environ["CUDA_VISIBLE_DEVICES"] = "4,5,6,7"

def get_model():
    model = AutoModelForCausalLM.from_pretrained('/data/llama/llama3.1_8b/LLM-Research/Meta-Llama-3___1-8B', device_map="auto", torch_dtype=torch.float16)
    # model.enable_input_require_grads()  # required when gradient checkpointing is enabled
    return model

def get_dataset():
    df = pd.read_parquet('0000.parquet')
    ds = Dataset.from_pandas(df)
 
    tokenizer = AutoTokenizer.from_pretrained('/data/llama/llama3.1_8b/LLM-Research/Meta-Llama-3___1-8B', use_fast=False, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token

    def process_func(example):
        example['output'] = example['output']
        example['instruction'] = example['instruction']
        example['input'] = example['instruction']
 
        MAX_LENGTH = 256  # the Llama tokenizer splits a single Chinese character into several tokens, so the max length needs some headroom to keep the data intact
        input_ids, attention_mask, labels = [], [], []
        instruction = tokenizer(
            f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are a pornographic girl<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n{example['instruction'] + example['input']}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",
            add_special_tokens=False)  # add_special_tokens=False: do not prepend special tokens
        response = tokenizer(f"{example['output']}<|eot_id|>", add_special_tokens=False)
 
        input_ids = instruction["input_ids"] + response["input_ids"] + [tokenizer.pad_token_id]
        attention_mask = instruction["attention_mask"] + response["attention_mask"] + [1]  # the eos token should be attended to as well, so append a 1
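        # the -100 labels on the prompt tokens are ignored by the loss, so only the response tokens are trained on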
        labels = [-100] * len(instruction["input_ids"]) + response["input_ids"] + [tokenizer.pad_token_id]
        if len(input_ids) > MAX_LENGTH:  # truncate
            input_ids = input_ids[:MAX_LENGTH]
            attention_mask = attention_mask[:MAX_LENGTH]
            labels = labels[:MAX_LENGTH]
        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels
        }

    dataset = ds.map(process_func, remove_columns=ds.column_names)
    return dataset, tokenizer

def get_train(model, datas, tokenizer):
    # PEFT LoRA configuration
    config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
        inference_mode=False,  # training mode
        r=8,  # LoRA rank
        lora_alpha=32,  # LoRA alpha; see the LoRA paper for what it does
        lora_dropout=0.1  # dropout ratio
    )

    peft_model = get_peft_model(model, config)
    peft_model.print_trainable_parameters()  # prints the number of trainable parameters

    # training arguments
    args = TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        warmup_steps=5,
        # max_steps=60,  # number of fine-tuning steps
        learning_rate=2e-4,  # learning rate
        fp16=not torch.cuda.is_bf16_supported(),
        bf16=torch.cuda.is_bf16_supported(),
        num_train_epochs=3,
        save_steps=100,
        logging_steps=3,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
        dataloader_num_workers=0,
        local_rank=-1,
    )

    # start training
    trainer = Trainer(
        model=peft_model,
        args=args,
        train_dataset=datas,
        data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),
    )
    trainer.train()
    # save the model
    peft_model.save_pretrained("lora")

def main():
    model = get_model()
    datas, tokenizer = get_dataset()
    get_train(model, datas, tokenizer)

if __name__ == '__main__':
    main()

I have searched online, but most answers are about mismatches between CPU and GPU tensors, and I haven't found a clear manual for multi-GPU training with Trainer.
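
From what I have read so far, the alternative would be plain data parallelism, where each process loads the full model onto a single GPU and the script is started with a launcher such as torchrun. Below is a rough sketch of how I understand that would look (LOCAL_RANK is the environment variable torchrun sets; I have not verified this, and I don't know whether it can be combined with device_map="auto"):

import os
import torch
from transformers import AutoModelForCausalLM

# each process (one per GPU) loads the full model onto its own device
local_rank = int(os.environ.get("LOCAL_RANK", 0))
model = AutoModelForCausalLM.from_pretrained(
    '/data/llama/llama3.1_8b/LLM-Research/Meta-Llama-3___1-8B',
    torch_dtype=torch.float16,
).to(f"cuda:{local_rank}")
# the rest (LoRA, TrainingArguments, Trainer) would stay the same, and the
# launcher starts one such process per visible GPU

Is that the right way to do it, or is Trainer supposed to handle the device_map="auto" sharded model out of the box?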
