A Practical Guide to PEFT

This guide walks through PEFT methods end to end, from initial setup to hands-on fine-tuning practice.

Choosing a Tool

Comparison of Major Tools

Tool | Developer | Highlights | Best suited for
HuggingFace PEFT | Hugging Face | Mature ecosystem, broadest method coverage | General-purpose use
torchtune | Meta | Built specifically for LLM fine-tuning | LLaMA / Mistral
LLaMA-Factory | Open-source (China-based community) | One-stop training platform | Rapid experiments
DeepSpeed | Microsoft | ZeRO + PEFT integration | Distributed training
Axolotl | Open-source community | Multi-framework support | Advanced users

HuggingFace PEFT

pip install peft transformers accelerate bitsandbytes

from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from peft import LoraConfig, get_peft_model, TaskType
import torch
 
# Load the base model
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    device_map="auto",
    torch_dtype=torch.bfloat16
)
 
# Configure LoRA
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)
 
# Apply LoRA
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
# trainable params: 41,843,520 || all params: 6,738,415,616 || trainable%: 0.621%
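
After training, calling save_pretrained on the wrapped PeftModel writes only the LoRA adapter (adapter_config.json plus a small adapter weight file), not a full copy of the base model. The output directory below is just an example.

model.save_pretrained("./llama2-lora-adapter")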

torchtune

pip install torchtune

# lora_llama2.yaml
model:
  _component_: torchtune.models.llama2.lora_llama2_7b
  lora_attn_modules: ['q_proj', 'v_proj', 'k_proj', 'output_proj']
  apply_lora_to_mlp: True
  apply_lora_to_output: True
  lora_rank: 16
  lora_alpha: 32
 
dataset:
  _component_: torchtune.datasets.alpaca_dataset
  template: alpaca
 
batch_size: 4

# Launch training with torchtune's single-device LoRA recipe
tune run lora_finetune_single_device --config lora_llama2.yaml

LLaMA-Factory

# Install
git clone https://github.com/hiyouga/LLaMA-Factory.git
cd LLaMA-Factory
pip install -e .

# examples/train_lora/llama2_lora.yaml
### model
model_name_or_path: meta-llama/Llama-2-7b-hf
 
### method
stage: sft
do_train: true
finetuning_type: lora
 
### lora
lora_rank: 16
lora_alpha: 32
lora_dropout: 0.05
lora_target: all
 
### dataset
dataset: alpaca
template: llama2
cutoff_len: 2048
 
### training
output_dir: ./saves/llama2_lora
num_train_epochs: 3
per_device_train_batch_size: 4
gradient_accumulation_steps: 4
learning_rate: 2.0e-4
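
With the YAML in place, training is typically launched through the LLaMA-Factory command-line entry point; the config path below assumes the example file shown above.

llamafactory-cli train examples/train_lora/llama2_lora.yaml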

Recommended Configurations

LLaMA Family

# Recommended LoRA configuration for fine-tuning LLaMA-2/3
lora_config = LoraConfig(
    r=16,                          # rank
    lora_alpha=32,                 # scaling factor
    target_modules=[               # modules to apply LoRA to
        "q_proj", "k_proj", 
        "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"
    ],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
)
 
# Training arguments
training_args = TrainingArguments(
    output_dir="./output",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,    # effective batch size = 16
    learning_rate=2e-4,              # LoRA generally benefits from a higher learning rate
    num_train_epochs=3,
    warmup_ratio=0.03,
    lr_scheduler_type="cosine",
    logging_steps=10,
    save_steps=100,
    fp16=True,
    optim="paged_adamw_32bit",       # memory-efficient paged AdamW optimizer
)

ChatGLM Family

# LoRA configuration for fine-tuning ChatGLM3
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=[               # ChatGLM uses different module names
        "query_key_value",
        "dense",
        "dense_h_to_4h",
        "dense_4h_to_h"
    ],
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
)
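
Module names vary across model families, so it helps to inspect the model before setting target_modules. A minimal sketch (the model name below is just an example; ChatGLM checkpoints require trust_remote_code):

from torch import nn
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("THUDM/chatglm3-6b", trust_remote_code=True)

# Collect the leaf names of every Linear layer; these are the candidates for target_modules
linear_names = {name.split(".")[-1] for name, module in model.named_modules()
                if isinstance(module, nn.Linear)}
print(sorted(linear_names))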

Mistral Family

# LoRA configuration for fine-tuning Mistral
lora_config = LoraConfig(
    r=32,                          # a higher rank tends to work well for Mistral
    lora_alpha=64,
    target_modules=[
        "q_proj", "k_proj", 
        "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"
    ],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
)

QLoRA Configuration

from transformers import BitsAndBytesConfig
 
# 4-bit quantization config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",         # NormalFloat4
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,    # double quantization
)
 
# Load the quantized model
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    quantization_config=bnb_config,
    device_map="auto"
)
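
# Before attaching LoRA, models loaded in 4-bit are usually passed through
# prepare_model_for_kbit_training, which upcasts norms/embeddings and
# prepares the model for gradient checkpointing
from peft import prepare_model_for_kbit_training
model = prepare_model_for_kbit_training(model)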
 
# Apply LoRA
lora_config = LoraConfig(
    r=64,
    lora_alpha=128,
    target_modules=["q_proj", "k_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
)
model = get_peft_model(model, lora_config)

Complete Training Example

from transformers import (
    AutoModelForCausalLM, 
    AutoTokenizer, 
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model, TaskType
from datasets import load_dataset
import torch
 
def train_llm():
    # 1. Load the model
    model_name = "microsoft/phi-2"
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",
        torch_dtype=torch.float16
    )
    
    # 2. Configure LoRA
    lora_config = LoraConfig(
        r=16,
        lora_alpha=32,
        target_modules=["q_proj", "k_proj", "v_proj", "dense"],
        lora_dropout=0.05,
        bias="none",
        task_type=TaskType.CAUSAL_LM
    )
    model = get_peft_model(model, lora_config)
    
    # 3. Load the tokenizer and dataset
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token  # the data collator needs a pad token
    
    dataset = load_dataset("json", data_files="train.jsonl")
    
    def tokenize(example):
        result = tokenizer(
            example["text"],
            truncation=True,
            max_length=2048,
            return_tensors=None
        )
        result["labels"] = result["input_ids"].copy()
        return result
    
    dataset = dataset.map(tokenize, remove_columns=dataset["train"].column_names)
    
    # 4. Training arguments
    training_args = TrainingArguments(
        output_dir="./output",
        per_device_train_batch_size=4,
        gradient_accumulation_steps=8,
        learning_rate=3e-4,
        num_train_epochs=3,
        warmup_ratio=0.03,
        lr_scheduler_type="cosine",
        logging_steps=10,
        save_steps=100,
        fp16=True,
        optim="paged_adamw_32bit",
        report_to="tensorboard",
    )
    
    # 5. Train
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset["train"],
        data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
    )
    trainer.train()
    
    # 6. Save the LoRA adapter
    model.save_pretrained("./lora_output")
 
if __name__ == "__main__":
    train_llm()
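
The script expects train.jsonl with one JSON object per line containing a "text" field. A minimal sketch for producing such a file; the prompt format shown is only an illustrative example:

import json

samples = [
    {"text": "### Instruction: Translate to French: Hello\n### Response: Bonjour"},
]
with open("train.jsonl", "w") as f:
    for sample in samples:
        f.write(json.dumps(sample) + "\n")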

Inference and Deployment

Loading LoRA Weights for Inference

from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import torch
 
# Load the base model
base_model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    device_map="auto",
    torch_dtype=torch.float16
)
 
# Load the LoRA adapter
model = PeftModel.from_pretrained(
    base_model,
    "./lora_output"
)
 
# Merge the LoRA weights into the base model (optional; faster inference)
model = model.merge_and_unload()
 
# Inference
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
prompt = "Translate to French: Hello, how are you?"
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
 
with torch.no_grad():
    outputs = model.generate(**inputs, max_new_tokens=50)
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))
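
After merge_and_unload the result is a plain transformers model, so it can be exported as a standalone checkpoint and served without the peft library; the output directory below is just an example.

model.save_pretrained("./llama2-lora-merged")
tokenizer.save_pretrained("./llama2-lora-merged")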

Dynamic Switching Between Multiple LoRA Adapters

from peft import PeftModel
 
# Load the base model
base_model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")
 
# Paths to several LoRA adapters
lora_paths = {
    "math": "./loras/math",
    "code": "./loras/code", 
    "creative": "./loras/creative"
}
 
# Once the base model is loaded, adapters can be swapped per task
for task, path in lora_paths.items():
    model = PeftModel.from_pretrained(base_model, path)
    # ... run generation with the task-specific model ...
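
Instead of re-wrapping the base model for every task, peft also lets you register several adapters on one PeftModel and switch between them by name. A minimal sketch, reusing the adapter paths above:

# Register each adapter under a name, then switch without reloading the base model
model = PeftModel.from_pretrained(base_model, "./loras/math", adapter_name="math")
model.load_adapter("./loras/code", adapter_name="code")
model.load_adapter("./loras/creative", adapter_name="creative")

model.set_adapter("code")   # subsequent generations go through the "code" adapter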

Quantized Inference

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
import torch
 
# Load the base model in 4-bit
bnb_config = BitsAndBytesConfig(load_in_4bit=True)
base_model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    quantization_config=bnb_config,
    device_map="auto"
)
 
# Load the LoRA adapter
model = PeftModel.from_pretrained(base_model, "./lora_output")
 
# Generate
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
inputs = tokenizer("What is PEFT?", return_tensors="pt").to("cuda")
model.eval()
with torch.no_grad():
    outputs = model.generate(**inputs, max_new_tokens=50)

Common Problems and Solutions

1. Running out of GPU memory

# Fix 1: use QLoRA
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
)
 
# Fix 2: enable gradient checkpointing
model.gradient_checkpointing_enable()
model.enable_input_require_grads()
 
# Fix 3: reduce the batch size and raise gradient accumulation
per_device_train_batch_size=1
gradient_accumulation_steps=16

2. Training does not converge

# Check the learning rate
# LoRA usually needs a relatively high learning rate (1e-4 to 3e-4)
 
# Check whether the rank is appropriate
# r too small: underfitting
# r too large: wasted compute, possible overfitting
 
# Use warmup
warmup_ratio=0.03

3. Catastrophic forgetting

# Option 1: use a smaller rank
r=4  # instead of 16 or 32
 
# Option 2: increase weight decay
weight_decay=0.1
 
# Option 3: use Adapter modules instead of LoRA
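
A minimal sketch combining options 1 and 2 (smaller rank plus weight decay); the values are illustrative starting points rather than tuned settings:

from peft import LoraConfig, TaskType
from transformers import TrainingArguments

lora_config = LoraConfig(
    r=4,
    lora_alpha=8,
    target_modules=["q_proj", "v_proj"],
    task_type=TaskType.CAUSAL_LM,
)
training_args = TrainingArguments(
    output_dir="./output",
    learning_rate=1e-4,
    weight_decay=0.1,
)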

Best Practices

1. Before training

# Check the dataset format
print(dataset[0])
# {'text': '<s>[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n{user_message} [/INST]{model_response}</s>'}
 
# Check the token length distribution
lengths = [len(tokenizer(x)['input_ids']) for x in dataset['text']]
print(f"Max: {max(lengths)}, Min: {min(lengths)}, Mean: {sum(lengths)/len(lengths)}")

2. During training

# Monitor GPU memory usage
import torch
print(f"GPU Memory: {torch.cuda.memory_allocated() / 1e9:.2f} GB")
 
# Monitor the learning rate
# a cosine schedule combined with warmup usually works better
 
# Save checkpoints regularly
save_steps=100

3. After training

# Evaluate the model
from rouge import Rouge
rouge = Rouge()
scores = rouge.get_scores(predictions, references, avg=True)
 
# Test prompts from different tasks
test_prompts = [
    "Translate to French: Hello",
    "Summarize: " + long_text,
    "Answer: What is ML?"
]
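
A quick spot-check loop over these prompts, assuming the fine-tuned model and tokenizer loaded in the inference section above (long_text is a placeholder for your own document):

for prompt in test_prompts:
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=64)
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))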

References