Adapter Fine-Tuning in Detail
Adapters are another classic PEFT method: they enable efficient fine-tuning by inserting lightweight adapter modules into Transformer layers.
Core Adapter Design
Basic Structure
Adapters use a bottleneck structure:
Original Transformer layer:
Input → [Self-Attention] → [FFN] → Output
With Adapters inserted:
Input → [Self-Attention] → [Adapter] → [FFN] → [Adapter] → Output
The Adapter Module
import math
from typing import List

import torch
import torch.nn as nn
import torch.nn.functional as F

class Adapter(nn.Module):
    """
    Standard Adapter module.
    Bottleneck design: down-project -> non-linearity -> up-project.
    """
    def __init__(self,
                 d_model: int,
                 reduction_factor: int = 16,
                 non_linearity: str = "relu"):
        super().__init__()
        self.d_model = d_model
        self.reduction_factor = reduction_factor
        # Bottleneck dimension
        bottleneck_dim = d_model // reduction_factor
        # Down-projection
        self.down_project = nn.Linear(d_model, bottleneck_dim)
        # Non-linear activation
        if non_linearity == "relu":
            self.activation = nn.ReLU()
        elif non_linearity == "gelu":
            self.activation = nn.GELU()
        else:
            raise ValueError(f"Unsupported non_linearity: {non_linearity}")
        # Up-projection (used with a scaled residual connection)
        self.up_project = nn.Linear(bottleneck_dim, d_model)
        # Initialization: start close to the identity mapping
        nn.init.zeros_(self.up_project.weight)
        nn.init.zeros_(self.up_project.bias)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Adapter forward pass
        h = self.down_project(x)
        h = self.activation(h)
        h = self.up_project(h)
        # Residual connection + scaling; the factor keeps early-training perturbations small
        return x + h * 0.5
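Because up_project is zero-initialized, a freshly constructed Adapter is an exact identity mapping, so adding it cannot disturb the pretrained model. A quick sanity check (the tensor shapes here are illustrative):

# With zero-initialized up_project, the Adapter starts as an identity map
adapter = Adapter(d_model=768, reduction_factor=16)
x = torch.randn(2, 10, 768)  # [batch, seq_len, d_model]
assert torch.allclose(adapter(x), x)  # the bottleneck delta is zero at init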
Adapter with LayerNorm
class AdapterWithNorm(nn.Module):
    """
    Adapter variant with a LayerNorm in front.
    """
    def __init__(self, d_model: int, reduction_factor: int = 16):
        super().__init__()
        # Pre-LayerNorm
        self.norm = nn.LayerNorm(d_model)
        # Adapter
        self.adapter = Adapter(d_model, reduction_factor)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Normalize first; the Adapter's internal residual then adds the
        # normalized input back, so adding x again would double-count it
        return self.adapter(self.norm(x))
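Either variant adds only the bottleneck parameters. To see the savings concretely, count the trainable parameters of one Adapter at the BERT-base sizes used later (d_model=768, reduction_factor=16, bottleneck 48):

# Parameters added by one bottleneck Adapter at BERT-base width
adapter = Adapter(d_model=768, reduction_factor=16)
n_params = sum(p.numel() for p in adapter.parameters())
print(n_params)  # 74544, roughly 0.07% of BERT-base per adapter
# Two adapters per layer x 12 layers ~= 1.8M params, i.e. ~1.6% of the model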
Adapter Variants
1. Sequential Adapter
The standard Adapter placement: adapters are inserted sequentially within each layer.
class SequentialAdapterLayer(nn.Module):
    """
    Sequential Adapter: the adapters run one after another within the layer.
    """
    def __init__(self, d_model: int, reduction_factor: int = 16):
        super().__init__()
        self.attention_adapter = Adapter(d_model, reduction_factor)
        self.ffn_adapter = Adapter(d_model, reduction_factor)
        # LayerNorm
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Adapter after the self-attention sublayer (the attention and FFN
        # modules themselves are omitted in this sketch); each Adapter
        # already carries its own residual connection
        x = self.attention_adapter(self.norm1(x))
        # Adapter after the FFN sublayer
        x = self.ffn_adapter(self.norm2(x))
        return x
2. Parallel Adapter
The Adapter is computed in parallel with the original sublayers:
class ParallelAdapterLayer(nn.Module):
    """
    Parallel Adapter: computed alongside the original sublayers, then summed.
    """
    def __init__(self, d_model: int, reduction_factor: int = 16):
        super().__init__()
        self.adapter = Adapter(d_model, reduction_factor)

    def forward(self, x: torch.Tensor,
                attention_output: torch.Tensor,
                ffn_output: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x: residual input
            attention_output: output of the attention sublayer
            ffn_output: output of the FFN sublayer
        """
        # Compute the Adapter branch in parallel
        adapter_out = self.adapter(x)  # includes the residual over x
        # Sum with the original sublayer outputs
        return attention_output + ffn_output + adapter_out
Parallel vs. sequential comparison:
| Property | Sequential Adapter | Parallel Adapter |
|---|---|---|
| Parameter count | Same | Same |
| Compute overhead | Higher (strictly sequential) | Slightly lower (parallelizable) |
| Convergence speed | Slower | Faster |
| Quality | Comparable | Slightly better |
3. CoDA (Conditional Adapter)
Dynamically enables or skips the Adapter depending on the input:
class CoDAAdapter(nn.Module):
    """
    Conditionally activated Adapter.
    """
    def __init__(self, d_model: int, reduction_factor: int = 16):
        super().__init__()
        self.adapter = Adapter(d_model, reduction_factor)
        # Gating mechanism: one scalar gate per sequence
        self.gate = nn.Sequential(
            nn.Linear(d_model, 1),
            nn.Sigmoid()
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Gate value from the mean-pooled sequence: [batch, 1, 1]
        gate_value = self.gate(x.mean(dim=1, keepdim=True))
        # Adapter output (includes the residual over x)
        adapter_out = self.adapter(x)
        # Gated blend: gate -> 1 keeps the Adapter, gate -> 0 skips it
        return gate_value * adapter_out + (1.0 - gate_value) * x
4. AdapterFusion
Multi-task Adapter composition: learns how to combine the knowledge of several Adapters.
class AdapterFusion(nn.Module):
    """
    AdapterFusion: fuses the knowledge of multiple Adapters.
    Used in multi-task learning scenarios.
    """
    def __init__(self, d_model: int, num_adapters: int):
        super().__init__()
        self.num_adapters = num_adapters
        # Fusion attention
        self.fusion_query = nn.Linear(d_model, d_model)
        self.fusion_key = nn.Linear(d_model, d_model)
        self.fusion_value = nn.Linear(d_model, d_model)
        # Output projection
        self.output_proj = nn.Linear(d_model, d_model)

    def forward(self,
                adapter_outputs: List[torch.Tensor]) -> torch.Tensor:
        """
        Args:
            adapter_outputs: per-Adapter outputs, each [batch, seq_len, d_model]
        """
        # Stack: [batch, seq_len, num_adapters, d_model]
        stacked = torch.stack(adapter_outputs, dim=2)
        # Query from the average Adapter output; keys/values per Adapter
        Q = self.fusion_query(stacked.mean(dim=2))  # [batch, seq, d_model]
        K = self.fusion_key(stacked)                # [batch, seq, num_adapters, d_model]
        V = self.fusion_value(stacked)              # [batch, seq, num_adapters, d_model]
        # Attention over the adapter axis, per token
        scores = torch.einsum('btd,btnd->btn', Q, K) / math.sqrt(Q.shape[-1])
        attn_weights = F.softmax(scores, dim=-1)    # [batch, seq, num_adapters]
        # Weighted fusion of the Adapter outputs
        fused = torch.einsum('btn,btnd->btd', attn_weights, V)
        return self.output_proj(fused)
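Because the attention runs over the adapter axis per token, the fused output keeps the input shape. A sketch of how the module would be called, with three hypothetical task Adapters sharing one input:

# Fuse the outputs of three (hypothetical) task Adapters on one input
fusion = AdapterFusion(d_model=768, num_adapters=3)
task_adapters = [Adapter(d_model=768) for _ in range(3)]
x = torch.randn(2, 10, 768)
fused = fusion([a(x) for a in task_adapters])
print(fused.shape)  # torch.Size([2, 10, 768])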
5. AdapterDrop
Randomly drops Adapters to improve training and inference efficiency:
class AdapterDrop(nn.Module):
    """
    Adapter variant that randomly drops the Adapter during training.
    """
    def __init__(self, d_model: int, reduction_factor: int = 16):
        super().__init__()
        self.adapter = Adapter(d_model, reduction_factor)
        self.drop_rate = 0.5

    def forward(self, x: torch.Tensor,
                layer_idx: int) -> torch.Tensor:
        # Adapters in shallow layers are dropped more often
        # (assumes a 12-layer model)
        drop_prob = self.drop_rate * max(0.0, 1.0 - layer_idx / 12)
        if self.training and torch.rand(1).item() < drop_prob:
            # Skip the Adapter entirely
            return x
        else:
            # The Adapter carries its own (scaled) residual over x
            return self.adapter(x)
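Under this schedule the drop probability decays linearly with depth, so early layers are skipped most often while the deepest layers almost always keep their Adapter. Printing the schedule makes that explicit (12 layers assumed, as above):

# Per-layer drop probability under the linear schedule above
drop_rate = 0.5
for layer_idx in range(12):
    print(layer_idx, drop_rate * max(0.0, 1.0 - layer_idx / 12))
# layer 0 is dropped half the time; layer 11 is almost never dropped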
6. Multi-Head Adapter
class MultiHeadAdapter(nn.Module):
    """
    Multi-head Adapter: splits the feature space into several heads,
    each adapted independently.
    """
    def __init__(self, d_model: int, num_heads: int = 4):
        super().__init__()
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads
        # One bottleneck per head, applied in parallel
        self.down_project = nn.ModuleList([
            nn.Linear(self.head_dim, self.head_dim // 4)
            for _ in range(num_heads)
        ])
        self.up_project = nn.ModuleList([
            nn.Linear(self.head_dim // 4, self.head_dim)
            for _ in range(num_heads)
        ])

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        B, T, D = x.shape
        # Reshape for multi-head processing
        heads = x.view(B, T, self.num_heads, self.head_dim)
        # Process each head independently
        outputs = []
        for h in range(self.num_heads):
            h_out = self.down_project[h](heads[:, :, h, :])
            h_out = F.relu(h_out)
            h_out = self.up_project[h](h_out)
            outputs.append(h_out)
        # Concatenate the heads and add the residual
        return x + torch.cat(outputs, dim=-1)
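A quick shape check confirms that the per-head bottlenecks reassemble to the full model width (sizes illustrative):

# Per-head bottlenecks (head_dim 192 -> 48 -> 192) concatenate back to 768
mh_adapter = MultiHeadAdapter(d_model=768, num_heads=4)
x = torch.randn(2, 10, 768)
print(mh_adapter(x).shape)  # torch.Size([2, 10, 768])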
Adapter vs. LoRA Comparison
| Property | Adapter | LoRA |
|---|---|---|
| Structure | Bottleneck network | Low-rank decomposition |
| Inference overhead | Yes (extra layers) | Almost none (weights can be merged) |
| Parameter count | ~1-5% | ~0.1-5% |
| Training speed | Slower | Faster |
| Quality | Good | Comparable or better |
| Multi-task | Natively supported via AdapterFusion | Needs extra machinery |
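The inference-overhead row follows directly from structure: a LoRA update is linear, so it can be folded into the base weight offline, while an Adapter's non-linearity sits between its projections and cannot be merged away. A minimal sketch of the LoRA merge (shapes and rank are illustrative):

# LoRA's update W + B @ A is linear, so it merges into the base weight;
# an Adapter's ReLU/GELU between down- and up-projection prevents this.
d_model, rank = 768, 8
W = torch.randn(d_model, d_model)       # frozen base weight
A = torch.randn(rank, d_model) * 0.01   # LoRA down-projection
B = torch.zeros(d_model, rank)          # LoRA up-projection (zero-init)
W_merged = W + B @ A                    # a single dense matrix at inference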
Configuration in Practice
Standard Configuration
# Uses the AdapterHub adapter-transformers API (pip install adapter-transformers)
from transformers import AutoAdapterModel
from transformers.adapters import AdapterConfig

# Load the model
model = AutoAdapterModel.from_pretrained("bert-base-uncased")
# Configure the Adapter (bottleneck dim = 768 / 16 = 48)
adapter_config = AdapterConfig(
    mh_adapter=False,
    output_adapter=True,
    reduction_factor=16,
    non_linearity="relu",
    residual_before_ln=True,
    adapter_residual_before_ln=False,
    ln_after=False,
    ln_before=False,
    original_ln_after=True,
    original_ln_before=False
)
# Add the Adapter
model.add_adapter("task_adapter", config=adapter_config)
# Activate it for training; this also freezes all other model parameters
model.train_adapter("task_adapter")
Multi-Task Configuration
# Add one Adapter per task
for task_name in ["sentiment", "ner", "qa"]:
    model.add_adapter(task_name, config=adapter_config)
# Switch the active Adapter during training
model.set_active_adapters("sentiment")
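The same library also exposes the AdapterFusion mechanism from variant 4. A sketch, assuming adapter-transformers' Fuse composition block is available in the installed version:

# Sketch: train an AdapterFusion layer over the task Adapters
from transformers.adapters.composition import Fuse

fusion_setup = Fuse("sentiment", "ner", "qa")
model.add_adapter_fusion(fusion_setup)
# Freezes the model and the individual Adapters; trains only the fusion layer
model.train_adapter_fusion(fusion_setup)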
Inserting Adapters into a Transformer
class TransformerWithAdapters(nn.Module):
    # Assumes MultiHeadAttention and FeedForward modules are defined elsewhere
    def __init__(self, d_model: int, n_heads: int, d_ff: int):
        super().__init__()
        self.attention = MultiHeadAttention(d_model, n_heads)
        self.ffn = FeedForward(d_model, d_ff)
        # Adapters
        self.attn_adapter = Adapter(d_model, reduction_factor=16)
        self.ffn_adapter = Adapter(d_model, reduction_factor=16)
        # LayerNorm
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Pre-LN Transformer with Adapters:
        # Norm -> Attention -> Adapter -> Add, then Norm -> FFN -> Adapter -> Add
        x = x + self.attn_adapter(self.attention(self.norm1(x)))
        x = x + self.ffn_adapter(self.ffn(self.norm2(x)))
        return x
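MultiHeadAttention and FeedForward are assumed to exist elsewhere in the codebase; for a self-contained smoke test, minimal stand-ins built from standard PyTorch modules suffice:

# Minimal stand-ins so the layer above can be smoke-tested end to end
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model: int, n_heads: int):
        super().__init__()
        self.attn = nn.MultiheadAttention(d_model, n_heads, batch_first=True)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.attn(x, x, x, need_weights=False)[0]

class FeedForward(nn.Module):
    def __init__(self, d_model: int, d_ff: int):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(d_model, d_ff), nn.GELU(), nn.Linear(d_ff, d_model)
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.net(x)

layer = TransformerWithAdapters(d_model=768, n_heads=12, d_ff=3072)
print(layer(torch.randn(2, 10, 768)).shape)  # torch.Size([2, 10, 768])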