A Practical Guide to PEFT
This guide provides a complete walkthrough of PEFT methods, from getting started to hands-on practice.
Tool Selection
Comparison of Major Tools
| Tool | Developer | Highlights | Best For |
|---|---|---|---|
| HuggingFace PEFT | HF | Mature ecosystem, broadest method coverage | General-purpose use |
| torchtune | Meta | Purpose-built for LLM fine-tuning | LLaMA/Mistral |
| LLaMA-Factory | Open-source community (China) | One-stop training platform | Rapid experimentation |
| DeepSpeed | Microsoft | ZeRO + PEFT | Distributed training |
| Axolotl | Open-source community | Multi-framework support | Advanced users |
HuggingFace PEFT
pip install peft transformers accelerate bitsandbytes
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from peft import LoraConfig, get_peft_model, TaskType
import torch
# Load the model
model = AutoModelForCausalLM.from_pretrained(
"meta-llama/Llama-2-7b-hf",
device_map="auto",
torch_dtype=torch.bfloat16
)
# Configure LoRA
lora_config = LoraConfig(
r=16,
lora_alpha=32,
target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
lora_dropout=0.05,
bias="none",
task_type=TaskType.CAUSAL_LM
)
# Apply LoRA
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
# trainable params: 41,843,520 || all params: 6,738,415,616 || trainable%: 0.621%
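The same figure can be verified by hand, which is a quick sanity check that LoRA was attached to the intended modules (a minimal sketch, not part of the original snippet):
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print(f"trainable: {trainable:,} / {total:,} ({100 * trainable / total:.3f}%)")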
torchtune
pip install torchtune
# lora_llama2.yaml
recipe: lora_llama2
model:
_component_: torchtune.models.llama2.lora_llama2_7b
lora_attn_modules: ['q_proj', 'v_proj', 'k_proj', 'output_proj']
apply_lora_to_mlp: True
apply_lora_to_output: True
lora_rank: 16
lora_alpha: 32
dataset:
_component_: torchtune.datasets.alpaca_dataset
template: alpaca
batch_size: 4
# Start training
tune run lora_llama2 --config lora_llama2.yaml
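Note that the base checkpoint must be available locally before running the recipe. A hedged sketch of the download step, assuming a recent torchtune release (exact flags may differ between versions):
tune download meta-llama/Llama-2-7b-hf --output-dir ./Llama-2-7b-hf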
LLaMA-Factory
# Install
git clone https://github.com/hiyouga/LLaMA-Factory.git
cd LLaMA-Factory
pip install -e .
# examples/train_lora/llama2_lora.yaml
### model
model_name_or_path: meta-llama/Llama-2-7b-hf
### method
stage: sft
do_train: true
finetuning_type: lora
### lora
lora_rank: 16
lora_alpha: 32
lora_dropout: 0.05
lora_target: all
### dataset
dataset: alpaca
template: llama2
cutoff_len: 2048
### training
output_dir: ./saves/llama2_lora
num_train_epochs: 3
per_device_train_batch_size: 4
gradient_accumulation_steps: 4
learning_rate: 2.0e-4
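With the config in place, training is typically launched through the project CLI. The exact entry point depends on the installed LLaMA-Factory version; recent releases ship the llamafactory-cli command:
llamafactory-cli train examples/train_lora/llama2_lora.yaml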
Recommended Configurations
LLaMA Family
# Recommended fine-tuning config for LLaMA-2/3
lora_config = LoraConfig(
r=16,                # rank
lora_alpha=32,       # scaling factor
target_modules=[     # target modules
"q_proj", "k_proj",
"v_proj", "o_proj",
"gate_proj", "up_proj", "down_proj"
],
lora_dropout=0.05,
bias="none",
task_type=TaskType.CAUSAL_LM,
)
# Training arguments
training_args = TrainingArguments(
output_dir="./output",
per_device_train_batch_size=4,
gradient_accumulation_steps=4,  # effective batch size = 16
learning_rate=2e-4,  # LoRA generally benefits from a relatively high learning rate
num_train_epochs=3,
warmup_ratio=0.03,
lr_scheduler_type="cosine",
logging_steps=10,
save_steps=100,
fp16=True,
optim="paged_adamw_32bit",  # memory-efficient paged optimizer
)
ChatGLM Family
# ChatGLM3 fine-tuning config
lora_config = LoraConfig(
r=8,
lora_alpha=16,
target_modules=[  # ChatGLM uses different module names
"query_key_value",
"dense",
"dense_h_to_4h",
"dense_4h_to_h"
],
lora_dropout=0.1,
bias="none",
task_type=TaskType.CAUSAL_LM,
)
Mistral Family
# Mistral fine-tuning config
lora_config = LoraConfig(
r=32,  # a larger rank tends to work well for Mistral
lora_alpha=64,
target_modules=[
"q_proj", "k_proj",
"v_proj", "o_proj",
"gate_proj", "up_proj", "down_proj"
],
lora_dropout=0.05,
bias="none",
task_type=TaskType.CAUSAL_LM,
)
QLoRA Configuration
from transformers import BitsAndBytesConfig
# 4-bit quantization config
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4", # NormalFloat4
bnb_4bit_compute_dtype=torch.bfloat16,
bnb_4bit_use_double_quant=True,  # double quantization
)
# Load the quantized model
model = AutoModelForCausalLM.from_pretrained(
"meta-llama/Llama-2-7b-hf",
quantization_config=bnb_config,
device_map="auto"
)
# Apply LoRA
lora_config = LoraConfig(
r=64,
lora_alpha=128,
target_modules=["q_proj", "k_proj", "v_proj"],
lora_dropout=0.05,
bias="none",
task_type=TaskType.CAUSAL_LM,
)
model = get_peft_model(model, lora_config)
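For 4-bit base models, the PEFT documentation recommends calling prepare_model_for_kbit_training on the quantized model before attaching LoRA: it freezes the base weights, upcasts a few layers (e.g. layer norms) to fp32, and prepares the model for gradient checkpointing. A minimal sketch of the adjusted order:
from peft import prepare_model_for_kbit_training

model = prepare_model_for_kbit_training(model)  # run on the quantized model first
model = get_peft_model(model, lora_config)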
Complete Training Example
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, TaskType, get_peft_model
from datasets import load_dataset
import torch
def train_llm():
    # 1. Load the model
    model_name = "microsoft/phi-2"
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",
        torch_dtype=torch.float16
    )
    # 2. Configure LoRA
    lora_config = LoraConfig(
        r=16,
        lora_alpha=32,
        target_modules=["q_proj", "k_proj", "v_proj", "dense"],
        lora_dropout=0.05,
        bias="none",
        task_type=TaskType.CAUSAL_LM
    )
    model = get_peft_model(model, lora_config)
    # 3. Load the data
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token  # needed for padding; phi-2 has no pad token by default
    dataset = load_dataset("json", data_files="train.jsonl")
    def tokenize(example):
        result = tokenizer(
            example["text"],
            truncation=True,
            max_length=2048,
            return_tensors=None
        )
        result["labels"] = result["input_ids"].copy()
        return result
    dataset = dataset.map(tokenize, remove_columns=dataset["train"].column_names)
    # 4. Training arguments
    training_args = TrainingArguments(
        output_dir="./output",
        per_device_train_batch_size=4,
        gradient_accumulation_steps=8,
        learning_rate=3e-4,
        num_train_epochs=3,
        warmup_ratio=0.03,
        lr_scheduler_type="cosine",
        logging_steps=10,
        save_steps=100,
        fp16=True,
        optim="paged_adamw_32bit",
        report_to="tensorboard",
    )
    # 5. Train
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset["train"],
        data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
    )
    trainer.train()
    # 6. Save the LoRA adapter
    model.save_pretrained("./lora_output")

if __name__ == "__main__":
    train_llm()
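One small addition worth making to the script above (not part of the original): save the tokenizer alongside the adapter so that inference code can reload both from the same directory.
# At the end of train_llm(), next to model.save_pretrained("./lora_output")
tokenizer.save_pretrained("./lora_output")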
Inference and Deployment
Inference with LoRA Weights
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import torch
# Load the base model
base_model = AutoModelForCausalLM.from_pretrained(
"meta-llama/Llama-2-7b-hf",
device_map="auto",
torch_dtype=torch.float16
)
# Load the LoRA adapter
model = PeftModel.from_pretrained(
base_model,
"./lora_output"
)
# Merge the weights (optional; faster inference, see the export note below)
model = model.merge_and_unload()
# Run inference
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
prompt = "Translate to French: Hello, how are you?"
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
with torch.no_grad():
    outputs = model.generate(**inputs, max_new_tokens=50)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
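Once merged, the model behaves like a plain Transformers checkpoint and can be exported for serving without a PEFT dependency. A minimal sketch (the ./merged_model path is illustrative):
model.save_pretrained("./merged_model")
tokenizer.save_pretrained("./merged_model")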
Dynamic Switching Between Multiple LoRAs
from peft import PeftModel
# Load the base model
base_model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")
# Paths to multiple LoRA adapters
lora_paths = {
"math": "./lohas/math",
"code": "./lohas/code",
"creative": "./lohas/creative"
}
# Once the base model is loaded, register each adapter once and switch between them dynamically
model = PeftModel.from_pretrained(base_model, lora_paths["math"], adapter_name="math")
model.load_adapter(lora_paths["code"], adapter_name="code")
model.load_adapter(lora_paths["creative"], adapter_name="creative")
for task in lora_paths:
    model.set_adapter(task)          # activate the adapter for the current task
    # generate(model, task_prompt)   # run task-specific generation here (placeholder)
Quantized Inference
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import torch
# Load the quantized base model
base_model = AutoModelForCausalLM.from_pretrained(
"meta-llama/Llama-2-7b-hf",
load_in_4bit=True,
device_map="auto"
)
# Load the LoRA adapter
model = PeftModel.from_pretrained(base_model, "./lora_output")
# Generate
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
inputs = tokenizer("Translate to French: Hello, how are you?", return_tensors="pt").to(model.device)
model.eval()
with torch.no_grad():
    outputs = model.generate(**inputs, max_new_tokens=50)
Common Problems and Solutions
1. Out of GPU Memory
# Solution 1: use QLoRA
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
)
# Solution 2: gradient checkpointing
model.gradient_checkpointing_enable()
model.enable_input_require_grads()
# Solution 3: reduce the batch size
per_device_train_batch_size=1
gradient_accumulation_steps=16
2. Training Does Not Converge
# Check the learning rate
# LoRA usually needs a relatively high learning rate (1e-4 ~ 3e-4)
# Check whether the rank is appropriate
# r too small: underfitting
# r too large: wasted compute and possible overfitting
# Use warmup
warmup_ratio=0.03
3. Catastrophic Forgetting
# Option 1: use a smaller rank
r=4  # instead of 16 or 32
# Option 2: increase weight decay
weight_decay=0.1
# Option 3: use Adapters instead of LoRA
Best Practices
1. Before Training
# Check the dataset format
print(dataset[0])
# {'text': '<s>[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n{user_message} [/INST]{model_response}</s>'}
# Check the token length distribution
lengths = [len(tokenizer(x)['input_ids']) for x in dataset['text']]
print(f"Max: {max(lengths)}, Min: {min(lengths)}, Mean: {sum(lengths)/len(lengths)}")2. 训练中
# Monitor GPU memory
import torch
print(f"GPU Memory: {torch.cuda.memory_allocated() / 1e9:.2f} GB")
# Monitor the learning rate (see the callback sketch below)
# A cosine schedule with warmup usually works better
# Save checkpoints regularly
save_steps=100
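The Trainer already includes the current learning rate in its log dictionary, so a small TrainerCallback is enough to surface it together with GPU memory at every logging step. A minimal sketch (the class name is illustrative):
import torch
from transformers import TrainerCallback

class ResourceLogger(TrainerCallback):
    # Print GPU memory next to each training log entry (loss, learning_rate, ...)
    def on_log(self, args, state, control, logs=None, **kwargs):
        if torch.cuda.is_available():
            mem_gb = torch.cuda.memory_allocated() / 1e9
            print(f"step {state.global_step}: {mem_gb:.2f} GB allocated | {logs}")

# Pass it to the Trainer: Trainer(..., callbacks=[ResourceLogger()])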
3. After Training
# Evaluate the model
from rouge import Rouge
rouge = Rouge()
scores = rouge.get_scores(predictions, references, avg=True)
# Test across different tasks
test_prompts = [
"Translate to French: Hello",
"Summarize: " + long_text,
"Answer: What is ML?"
]
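A short loop to run these prompts through the fine-tuned model (assumes model and tokenizer from the inference section above):
for prompt in test_prompts:
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=64)
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))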