
Measured memory usage of LOMO++ (DeepSpeed ZeRO-2, 7B QLoRA LLaMA): it appears to be roughly twice that of regular QLoRA + DeepSpeed ZeRO-2 #70

@zlh1992

Description


LOMO++ QLoRA run (max_length 8196):
23366MiB / 81251MiB

config = LoraConfig(
r=8,
lora_alpha=32,
inference_mode=False,
target_modules=["q_proj","v_proj","down_proj","up_proj"],
lora_dropout=0.05,
bias="none",
task_type="CAUSAL_LM"
)

Test code:

# llama
import os

import torch
import transformers
from datasets import Dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import (
    BitsAndBytesConfig,
    LlamaConfig,
    LlamaForCausalLM,
    LlamaTokenizer,
)
# LOMOLoRATrainer, DataCollatorForCauselLM and EvalDataCollatorForCauselLM are
# imported from this repo's source (e.g. src/lomo_lora_trainer.py and src/utils.py);
# model_id, bs, cutoff_len, output_dir and ff are defined elsewhere in the script

tokenizer = LlamaTokenizer.from_pretrained(model_id)
tokenizer.padding_side = "right"
tokenizer.bos_token_id = 1
tokenizer.eos_token_id = 2
tokenizer.pad_token_id = 0
tokenizer.unk_token_id = 0

# Initialize the model
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)
config = LlamaConfig.from_pretrained(model_id)
model = LlamaForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config,
    load_in_4bit=True,
    use_safetensors=True,
    config=config,
    device_map={"": int(os.environ.get("LOCAL_RANK") or 0)},
)
model.bos_token_id = 1
model.eos_token_id = 2
model.pad_token_id = 0
model.unk_token_id = 0

model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)

# record the base-model parameters that are still trainable at this point,
# before the LoRA adapters are added
peft_params = []
non_peft_names = []
non_peft_params = []
for name, param in model.named_parameters():
    if not param.requires_grad:
        continue
    non_peft_names.append(name)
    non_peft_params.append(param)


config = LoraConfig(
    r=8,
    lora_alpha=32,
    inference_mode=False, 
    target_modules=["q_proj","v_proj","down_proj","up_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, config)
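# (for reference) a minimal sketch of the print_trainable_parameters helper used
# just below; the original script's version is not shown, so this is an assumed
# implementation. PEFT models also expose an equivalent
# model.print_trainable_parameters() method.
def print_trainable_parameters(model):
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total = sum(p.numel() for p in model.parameters())
    print(f"trainable params: {trainable} || all params: {total} || "
          f"trainable%: {100 * trainable / total:.4f}")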
print_trainable_parameters(model)

# re-enable gradients for the recorded base-model parameters (only when not
# training lora_only) and collect the LoRA parameters for the LOMO optimizer;
# note that training_args (shown further below in this snippet) must already exist here
for name, param in model.named_parameters():
    if name.split('base_model.model.')[1] in non_peft_names:
        if not training_args.lora_only:
            param.requires_grad = True
    if "lora_" in name:
        peft_params.append(param)
torch.cuda.empty_cache()

# Load the data
train_data = Dataset.from_list(ff)  # omitted
valid_data = None

def tokenize(item, cutoff_len=cutoff_len):
    result = {}
    input_ids, labels, conversation = _addrole_masklabel_tokenize(item)  # omitted
    attention_mask = [1] * len(input_ids)
    result['input_ids'] = input_ids[:cutoff_len]
    result['attention_mask'] = attention_mask[:cutoff_len]
    result['labels'] = labels[:cutoff_len]
    return result

train_data = train_data.map(tokenize)

# ========== Initialize our Trainer. ==========
training_args = transformers.TrainingArguments(
        per_device_train_batch_size=bs,
        gradient_accumulation_steps=8,
        warmup_steps=1000,
        optim="paged_adamw_32bit",
        learning_rate=1e-5,
        num_train_epochs=2,
        fp16=True,
        logging_steps=10,
        save_strategy="steps",
        save_steps=100,
        output_dir=output_dir,
        save_total_limit=1,
        load_best_model_at_end=False,
        ddp_find_unused_parameters=False,  # if ddp else None,
        deepspeed="ds.config"
        # group_by_length=True
    )
training_args.lora_only = True
training_args.do_train = True
training_args.hf_weight_decay = 0.1
training_args.hf_lr_scheduler_type = "cosine"
training_args.clip_loss_value = 20.0
training_args.gradient_clipping = 10.0
trainer = LOMOLoRATrainer(
    model=model,
    training_args=training_args,
    data_collator={'train': DataCollatorForCauselLM(tokenizer, max_length=8196, padding_side='right'),
                   'eval': EvalDataCollatorForCauselLM(tokenizer, max_length=8196, padding_side='right')},
    train_dataset=train_data,
    eval_dataset=valid_data,
    tokenizer=tokenizer,
    compute_metrics=None,
    optimizers={'model_parameters': peft_params},
)
trainer.train()
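The ZeRO-2 config referenced above as deepspeed="ds.config" is not included here. For context, a minimal stage-2 configuration along the following lines is what is typically paired with the HF Trainer integration; this is only a sketch, not the actual file, and the ds_config name is illustrative. TrainingArguments also accepts the config as a Python dict instead of a file path:

# a minimal ZeRO-2 DeepSpeed config sketch ("auto" values are resolved by the HF integration)
ds_config = {
    "bf16": {"enabled": "auto"},
    "fp16": {"enabled": "auto"},
    "zero_optimization": {
        "stage": 2,
        "overlap_comm": True,
        "contiguous_gradients": True,
    },
    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
}
# could be passed directly: transformers.TrainingArguments(..., deepspeed=ds_config)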

P.S.: plain QLoRA with DeepSpeed ZeRO-2 uses roughly 13432MiB / 81251MiB.
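When comparing the two setups it may help to also log PyTorch's own peak-memory counters, since nvidia-smi additionally counts the CUDA context and memory the caching allocator has reserved but not allocated. A minimal sketch; log_gpu_memory is an illustrative helper name, not part of this repo:

import torch

def log_gpu_memory(tag: str) -> None:
    # peak tensor allocations vs. total memory reserved by the caching allocator
    alloc_mib = torch.cuda.max_memory_allocated() / 2**20
    reserved_mib = torch.cuda.max_memory_reserved() / 2**20
    print(f"[{tag}] max_allocated={alloc_mib:.0f}MiB  max_reserved={reserved_mib:.0f}MiB")

# usage: reset the counters before training, report afterwards
# torch.cuda.reset_peak_memory_stats()
# trainer.train()
# log_gpu_memory("lomo_lora_qlora_zero2")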
