Adapter微调方法详解

Adapter是另一种经典的PEFT方法,通过在Transformer层间插入轻量级适配模块来实现高效微调。

Adapter核心设计

基本结构

Adapter采用瓶颈(Bottleneck)结构

原始Transformer层:
Input → [Self-Attention] → [FFN] → Output

Adapter插入方式:
Input → [Self-Attention] → [Adapter] → [FFN] → [Adapter] → Output

Adapter模块

import torch
import torch.nn as nn
 
class Adapter(nn.Module):
    """
    Standard (Houlsby-style) Adapter module.

    Bottleneck design: down-project -> non-linearity -> up-project,
    with a scaled residual connection around the whole module.

    Args:
        d_model: hidden size of the host Transformer.
        reduction_factor: bottleneck ratio; bottleneck dim = d_model // reduction_factor.
        non_linearity: "relu" or "gelu".

    Raises:
        ValueError: if non_linearity is unsupported, or reduction_factor
            is too large to leave a non-empty bottleneck.
    """
    def __init__(self, 
                 d_model: int, 
                 reduction_factor: int = 16,
                 non_linearity: str = "relu"):
        super().__init__()
        self.d_model = d_model
        self.reduction_factor = reduction_factor
        
        # Bottleneck dimension; guard against a degenerate 0-dim bottleneck.
        bottleneck_dim = d_model // reduction_factor
        if bottleneck_dim < 1:
            raise ValueError(
                f"reduction_factor={reduction_factor} too large for d_model={d_model}"
            )
        
        # Down projection
        self.down_project = nn.Linear(d_model, bottleneck_dim)
        
        # Non-linear activation. The original silently left `self.activation`
        # undefined for unknown names, deferring the failure to forward();
        # fail fast here instead.
        activations = {"relu": nn.ReLU, "gelu": nn.GELU}
        if non_linearity not in activations:
            raise ValueError(f"Unsupported non_linearity: {non_linearity!r}")
        self.activation = activations[non_linearity]()
        
        # Up projection
        self.up_project = nn.Linear(bottleneck_dim, d_model)
        
        # Zero-init the up projection so the module starts as an identity map.
        nn.init.zeros_(self.up_project.weight)
        nn.init.zeros_(self.up_project.bias)
    
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the bottleneck transform and add it (scaled) to the input."""
        h = self.down_project(x)
        h = self.activation(h)
        h = self.up_project(h)
        
        # Residual connection; 0.5 scale limits the perturbation early in training.
        return x + h * 0.5

带LayerNorm的Adapter

class AdapterWithNorm(nn.Module):
    """
    Adapter variant with a leading LayerNorm (pre-norm style).

    Bug fixed: `Adapter.forward` already contains its own residual
    connection, so the original `return x + self.adapter(x)` after
    `x = self.norm(x)` (a) added the normalized input twice and
    (b) dropped the un-normalized skip path. Here the residual carries
    the raw input and only the adapter's *delta* is added.
    """
    def __init__(self, d_model: int, reduction_factor: int = 16):
        super().__init__()
        
        # Pre-norm before the adapter bottleneck
        self.norm = nn.LayerNorm(d_model)
        
        # Bottleneck adapter (contains its own residual on its input)
        self.adapter = Adapter(d_model, reduction_factor)
    
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        normed = self.norm(x)
        # adapter(normed) = normed + delta  =>  add only `delta` to x.
        return x + (self.adapter(normed) - normed)

Adapter变体

1. 串行Adapter(Sequential Adapter)

标准Adapter放置方式:串行插入到各层。

class SequentialAdapterLayer(nn.Module):
    """
    Sequential adapter placement: one adapter in a residual branch after the
    attention sub-layer, one after the FFN sub-layer, each preceded by its
    own LayerNorm.
    """
    def __init__(self, d_model: int, reduction_factor: int = 16):
        super().__init__()
        self.attention_adapter = Adapter(d_model, reduction_factor)
        self.ffn_adapter = Adapter(d_model, reduction_factor)
        
        # One LayerNorm per residual branch
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
    
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Attention-side adapter branch
        attn_branch = self.attention_adapter(self.norm1(x))
        x = x + attn_branch
        
        # FFN-side adapter branch
        ffn_branch = self.ffn_adapter(self.norm2(x))
        return x + ffn_branch

2. 并行Adapter(Parallel Adapter)

Adapter与原模块并行计算:

class ParallelAdapterLayer(nn.Module):
    """
    Parallel adapter: the adapter runs on the residual input alongside the
    original sub-modules, and all three outputs are summed.
    """
    def __init__(self, d_model: int, reduction_factor: int = 16):
        super().__init__()
        self.adapter = Adapter(d_model, reduction_factor)
    
    def forward(self, x: torch.Tensor, 
                attention_output: torch.Tensor,
                ffn_output: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x: residual input fed to the adapter.
            attention_output: output of the attention sub-module.
            ffn_output: output of the FFN sub-module.
        """
        # Sum the frozen sub-module outputs, then add the adapter branch.
        combined = attention_output + ffn_output
        return combined + self.adapter(x)

并行vs串行对比

| 特性 | 串行Adapter | 并行Adapter |
| --- | --- | --- |
| 参数量 | 相同 | 相同 |
| 计算开销 | 较高(层序贯) | 略低(可并行) |
| 收敛速度 | 较慢 | 较快 |
| 效果 | 相当 | 略优 |

3. CoDA (Conditional Adapter)

根据输入动态启用/跳过Adapter:

class CoDAAdapter(nn.Module):
    """
    Conditionally-activated adapter (CoDA-style).

    A lightweight gate, computed from the sequence-averaged input, modulates
    how much of the adapter output is added back.

    Bug fixed: the original gate projected to `d_model // 4` features, which
    cannot broadcast against the `d_model`-wide adapter output in the final
    multiply. The gate now emits a single scalar per example, which
    broadcasts cleanly over sequence and feature dimensions.
    """
    def __init__(self, d_model: int, reduction_factor: int = 16):
        super().__init__()
        self.adapter = Adapter(d_model, reduction_factor)
        
        # Gating network: pooled input -> scalar gate in (0, 1)
        self.gate = nn.Sequential(
            nn.Linear(d_model, 1),
            nn.Sigmoid()
        )
    
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Per-example gate value; assumes x is (batch, seq, d_model) — the
        # mean pools over the sequence axis. Result is (batch, 1, 1).
        gate_value = self.gate(x.mean(dim=1, keepdim=True))
        
        # Adapter output (includes its own internal residual on x)
        adapter_out = self.adapter(x)
        
        # Gate-modulated residual update
        return x + adapter_out * gate_value

4. AdapterFusion

多任务Adapter组合:学习如何组合多个Adapter的知识。

class AdapterFusion(nn.Module):
    """
    AdapterFusion: learn to combine the knowledge of several task adapters.

    Given the outputs of `num_adapters` adapters for the same input, a small
    attention mechanism weights the adapters per token and mixes their values.

    Fixes over the original:
      * the einsum subscripts ('bbd,nbbd->nb') were invalid — repeated labels
        force a diagonal over unrelated dimensions and raise at runtime;
      * `List`, `math` and `F` were referenced but never imported (the `List`
        annotation raised NameError at class-definition time); only `torch`
        is used now;
      * fusion is computed per token, so the output keeps the
        (batch, seq, d_model) shape of the adapter outputs.
    """
    def __init__(self, d_model: int, num_adapters: int):
        super().__init__()
        self.num_adapters = num_adapters
        
        # Fusion attention projections
        self.fusion_query = nn.Linear(d_model, d_model)
        self.fusion_key = nn.Linear(d_model, d_model)
        self.fusion_value = nn.Linear(d_model, d_model)
        
        # Output projection
        self.output_proj = nn.Linear(d_model, d_model)
    
    def forward(self, 
                adapter_outputs: list[torch.Tensor]) -> torch.Tensor:
        """
        Args:
            adapter_outputs: adapter outputs, each (batch, seq, d_model).

        Returns:
            Fused tensor of shape (batch, seq, d_model).
        """
        # Stack: (num_adapters, batch, seq, d_model)
        stacked = torch.stack(adapter_outputs, dim=0)
        
        # Query from the adapter-averaged representation; keys/values per adapter.
        q = self.fusion_query(stacked.mean(dim=0))   # (batch, seq, d_model)
        k = self.fusion_key(stacked)                 # (n, batch, seq, d_model)
        v = self.fusion_value(stacked)               # (n, batch, seq, d_model)
        
        # Scaled per-token attention scores over the adapter axis.
        scale = q.shape[-1] ** 0.5
        scores = torch.einsum('btd,nbtd->nbt', q, k) / scale
        attn_weights = torch.softmax(scores, dim=0)  # softmax over adapters
        
        # Weighted mix of adapter values.
        fused = torch.einsum('nbt,nbtd->btd', attn_weights, v)
        
        return self.output_proj(fused)

5. AdapterDrop

随机丢弃Adapter以提高推理效率:

class AdapterDrop(nn.Module):
    """
    AdapterDrop variant: during training the adapter is skipped with a
    layer-dependent probability to cut compute.
    """
    def __init__(self, d_model: int, reduction_factor: int = 16):
        super().__init__()
        self.adapter = Adapter(d_model, reduction_factor)
        self.drop_rate = 0.5
    
    def forward(self, x: torch.Tensor, 
                layer_idx: int) -> torch.Tensor:
        # Drop probability grows linearly with depth, capped at 0.5 — i.e.
        # *deeper* layers are skipped more often (the original comment had
        # this backwards).
        drop_prob = min(self.drop_rate * (layer_idx / 12), 0.5)
        
        # Sampling happens only in training mode (short-circuit keeps the
        # RNG untouched during eval, matching the original behavior).
        skip = self.training and torch.rand(1).item() < drop_prob
        if skip:
            # Adapter dropped for this step — pure pass-through.
            return x
        # Scaled adapter residual, matching the standard placement.
        return x + self.adapter(x) * 0.5

6. Multi-Head Adapter

class MultiHeadAdapter(nn.Module):
    """
    Multi-head adapter: the feature dimension is split into `num_heads`
    chunks, each adapted by its own small bottleneck (head_dim // 4).

    Fixes over the original:
      * `F.relu` referenced `torch.nn.functional`, which is never imported
        in this file — `torch.relu` is used instead;
      * `d_model` must be divisible by `num_heads`, and each head must be
        wide enough for a non-empty bottleneck — validated up front.
    """
    def __init__(self, d_model: int, num_heads: int = 4):
        super().__init__()
        if d_model % num_heads != 0:
            raise ValueError(
                f"d_model={d_model} is not divisible by num_heads={num_heads}"
            )
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads
        if self.head_dim < 4:
            raise ValueError(
                f"head_dim={self.head_dim} leaves an empty bottleneck; use fewer heads"
            )
        
        # One bottleneck (down/up pair) per head
        self.down_project = nn.ModuleList([
            nn.Linear(self.head_dim, self.head_dim // 4)
            for _ in range(num_heads)
        ])
        self.up_project = nn.ModuleList([
            nn.Linear(self.head_dim // 4, self.head_dim)
            for _ in range(num_heads)
        ])
    
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        B, T, D = x.shape
        
        # Split features into heads: (B, T, num_heads, head_dim)
        heads = x.view(B, T, self.num_heads, self.head_dim)
        
        # Adapt each head independently
        outputs = []
        for h in range(self.num_heads):
            h_out = self.down_project[h](heads[:, :, h, :])
            h_out = torch.relu(h_out)
            h_out = self.up_project[h](h_out)
            outputs.append(h_out)
        
        # Residual connection + concatenated head outputs
        return x + torch.cat(outputs, dim=-1)

Adapter vs LoRA对比

| 特性 | Adapter | LoRA |
| --- | --- | --- |
| 结构 | 瓶颈网络 | 低秩分解 |
| 推理开销 | 有(额外层) | 几乎无(可合并) |
| 参数量 | ~1-5% | ~0.1-5% |
| 训练速度 | 较慢 | 较快 |
| 效果 | 良好 | 相当或更好 |
| 多任务 | AdapterFusion天然支持 | 需要额外设计 |

实践配置

标准配置

from transformers import AutoModel
from adapter_transformers import AdapterConfig, AutoAdapterModel
 
# Load the base model with adapter support
model = AutoAdapterModel.from_pretrained("bert-base-uncased")
 
# Configure the adapter (Houlsby/Pfeiffer-style placement flags)
adapter_config = AdapterConfig(
    hidden_size=768,
    reduction_factor=16,  # bottleneck dim = 768 / 16 = 48
    non_linearity="relu",
    residual_before_ln=True,
    adapter_residual_before_ln=False,
    ln_after=False,
    ln_before=False,
    mh_adapter=False,
    original_ln_after=True,
    original_ln_before=False
)
 
# Register the adapter under a task name
model.add_adapter("task_adapter", config=adapter_config)
 
# Activate the adapter for training
model.train_adapter("task_adapter")
 
# Freeze the remaining (base) parameters
# NOTE(review): `freeze_model(non_trainable_prompt_modules=...)` does not match
# the documented adapter-transformers API — verify against the installed version.
model.freeze_model(non_trainable_prompt_modules=[])

多任务配置

# Register one adapter per task
for task_name in ["sentiment", "ner", "qa"]:
    model.add_adapter(task_name, config=adapter_config)
 
# Switch the active adapter at training/inference time
model.set_active_adapters("sentiment")

在Transformer中插入Adapter

class TransformerWithAdapters(nn.Module):
    """
    Transformer layer with serial adapters after each sub-layer.

    Fixes over the original:
      * the comment claimed Pre-LN, but LayerNorm was applied to each
        sub-layer's *output*; true Pre-LN order
        (Norm -> sub-layer -> Adapter -> Add) is used now;
      * `norm3` / `norm4` were constructed but never used — removed.

    Note: `MultiHeadAttention` and `FeedForward` are assumed to be defined
    elsewhere in the project (not visible in this file).
    """
    def __init__(self, d_model: int, n_heads: int, d_ff: int):
        super().__init__()
        self.attention = MultiHeadAttention(d_model, n_heads)
        self.ffn = FeedForward(d_model, d_ff)
        
        # Serial adapters, one per sub-layer
        self.attn_adapter = Adapter(d_model, reduction_factor=16)
        self.ffn_adapter = Adapter(d_model, reduction_factor=16)
        
        # Pre-LN norms, one per sub-layer
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
    
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Pre-LN: Norm -> Attention -> Adapter -> residual Add
        x = x + self.attn_adapter(self.attention(self.norm1(x)))
        # Pre-LN: Norm -> FFN -> Adapter -> residual Add
        x = x + self.ffn_adapter(self.ffn(self.norm2(x)))
        return x

参考