MC Dropout

MC Dropout(Monte Carlo Dropout)是深度学习中一种简洁而强大的不确定性估计方法,由 Gal & Ghahramani 在 2016 年提出[^1]。核心思想是将训练时使用的 Dropout 正则化解释为变分推断的近似,从而在测试时通过多次采样获取预测的不确定性。

Dropout 的概率解释

标准 Dropout 回顾

传统 Dropout 在训练时随机将神经元的输出置零:

$$\tilde{h}_i = z_i \cdot h_i, \quad z_i \sim \text{Bernoulli}(1 - p)$$

测试时使用完整网络(不 Dropout),并按保留概率对激活进行缩放以保持期望一致。

从变分推断角度看 Dropout

MC Dropout 的核心洞见是:带 Dropout 的网络等价于一个变分近似后验

考虑一个深度高斯过程(Deep Gaussian Process),其边缘似然为:

$$p(\mathbf{y} \mid \mathbf{X}) = \int p(\mathbf{y} \mid \mathbf{f}) \, p(\mathbf{f} \mid \mathbf{X}) \, d\mathbf{f}$$

通过变分推断近似,可以推导出带 Dropout 的网络等价于对该高斯过程进行近似贝叶斯推断。

数学推导

设网络权重 $\omega = \{W_i\}$ 的变分近似后验为:

$$W_i = M_i \cdot \mathrm{diag}(z_i), \quad z_{i,j} \sim \text{Bernoulli}(1 - p_i)$$

其中 $M_i$ 是可学习参数。

关键结论:当 $z_{i,j}$ 以概率 $p_i$ 被置零($p_i$ 为 Dropout 概率)时,最大化变分下界(ELBO)等价于最小化带 Dropout 和权重衰减的交叉熵损失。

这意味着:

  • 训练时使用 Dropout = 训练变分推断
  • 测试时使用 Dropout = 从后验采样

MC Dropout 推断

预测均值

对于回归任务,预测均值通过多次前向传播的平均得到:

$$\mathbb{E}[y^*] \approx \frac{1}{T} \sum_{t=1}^{T} f^{\hat{\omega}_t}(x^*)$$

其中 $\hat{\omega}_t \sim q(\omega)$ 是从 Dropout 网络中采样的权重,$T$ 是采样次数。

预测方差

预测方差包含两部分(数据噪声 + 认知不确定性):

$$\mathrm{Var}[y^*] \approx \tau^{-1} + \frac{1}{T} \sum_{t=1}^{T} f^{\hat{\omega}_t}(x^*)^2 - \left( \frac{1}{T} \sum_{t=1}^{T} f^{\hat{\omega}_t}(x^*) \right)^2$$

其中 $\tau$ 是精度参数(由数据噪声决定)。

def mc_dropout_predict(model, x, n_samples=50, tau=1.0):
    """Run MC Dropout inference for a regression model.

    Performs ``n_samples`` stochastic forward passes with dropout kept
    active, then aggregates them into a predictive mean and variance.

    Args:
        model: Model containing Dropout layers.
        x: Input batch of shape (batch_size, ...).
        n_samples: Number of stochastic forward passes (T).
        tau: Precision parameter; 1 / tau is the aleatoric noise term.

    Returns:
        mean: Predictive mean over the T samples.
        total_var: Epistemic variance plus the constant 1 / tau noise term.
        samples: Raw stacked predictions, shape (T, batch, output).
    """
    model.train()  # crucial: keep dropout enabled at inference time

    with torch.no_grad():
        samples = torch.stack([model(x) for _ in range(n_samples)])  # (T, batch, output)

    mean = samples.mean(dim=0)
    # Epistemic (model) uncertainty + constant aleatoric noise 1 / tau.
    total_var = samples.var(dim=0) + 1.0 / tau

    return mean, total_var, samples

分类任务

对于分类任务,预测分布为:

$$p(y = c \mid x^*) \approx \frac{1}{T} \sum_{t=1}^{T} \text{softmax}\!\left(f^{\hat{\omega}_t}(x^*)\right)_c$$

预测的不确定性可通过熵度量:

$$H[y^* \mid x^*] = -\sum_{c} p(y = c \mid x^*) \log p(y = c \mid x^*)$$

def mc_dropout_classification(model, x, n_samples=50):
    """MC Dropout inference for a classifier that outputs logits.

    Runs ``n_samples`` stochastic forward passes with dropout active and
    decomposes the predictive uncertainty into total (entropy) and
    epistemic (mutual information) parts.

    Args:
        model: Classifier with Dropout layers; ``model(x)`` returns logits.
        x: Input batch.
        n_samples: Number of stochastic forward passes (T).

    Returns:
        predictions: Argmax class of the mean predictive distribution.
        mean_probs: Mean class probabilities over the T samples.
        entropy: Predictive entropy (total uncertainty) per input.
        mutual_info: Mutual information (epistemic uncertainty) per input.
    """
    model.train()  # keep dropout enabled

    all_probs = []
    with torch.no_grad():
        for _ in range(n_samples):
            logits = model(x)
            probs = torch.softmax(logits, dim=-1)
            all_probs.append(probs)

    all_probs = torch.stack(all_probs)  # (T, batch, num_classes)

    # Mean predictive distribution and its argmax class.
    mean_probs = all_probs.mean(dim=0)
    predictions = mean_probs.argmax(dim=-1)

    # Predictive entropy of the mean distribution (total uncertainty).
    entropy = -torch.sum(mean_probs * torch.log(mean_probs + 1e-10), dim=-1)

    # BUG FIX: torch.Tensor has no `.entropy()` method (the original
    # `all_probs.entropy(dim=-1)` raised AttributeError). Compute the
    # per-sample entropy explicitly; MI = H[mean probs] - mean per-sample H.
    sample_entropy = -torch.sum(all_probs * torch.log(all_probs + 1e-10), dim=-1)  # (T, batch)
    mutual_info = entropy - sample_entropy.mean(dim=0)

    return predictions, mean_probs, entropy, mutual_info

完整实现

PyTorch 实现

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical
 
class BayesianMLP(nn.Module):
    """Three-layer MLP intended for MC Dropout inference.

    The essential trick: dropout stays active at both training and test
    time, so repeated forward passes draw samples from the approximate
    posterior over weights.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, dropout_rate=0.5):
        super().__init__()

        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, output_dim)

        self.dropout = nn.Dropout(p=dropout_rate)
        self.dropout_rate = dropout_rate

    def forward(self, x):
        h = self.dropout(F.relu(self.fc1(x)))
        h = self.dropout(F.relu(self.fc2(h)))
        # No dropout on the output layer, as is conventional.
        return self.fc3(h)

    def mc_predict(self, x, n_samples=100):
        """Draw ``n_samples`` stochastic predictions via MC Dropout.

        Returns:
            mean: Predictive mean over samples.
            variance: Sample variance (epistemic uncertainty).
            std: Standard deviation (sqrt of variance, stabilized by 1e-8).
            samples: Stacked raw predictions, shape (T, batch, output).
        """
        self.train()  # crucial: dropout must remain enabled

        with torch.no_grad():
            samples = torch.stack([self(x) for _ in range(n_samples)], dim=0)

        mean = samples.mean(dim=0)
        variance = samples.var(dim=0)
        std = torch.sqrt(variance + 1e-8)

        return mean, variance, std, samples

    def predict_with_uncertainty(self, x, n_samples=100, threshold=0.1):
        """Predict and flag inputs whose uncertainty exceeds ``threshold``.

        Args:
            x: Input batch.
            n_samples: Number of MC Dropout samples.
            threshold: Std-dev level above which a prediction is flagged.

        Returns:
            mean: Predictive mean.
            std: Predictive standard deviation.
            is_uncertain: Boolean mask, True where std > threshold.
        """
        mean, _, std, _ = self.mc_predict(x, n_samples)
        return mean, std, std > threshold

训练与使用示例

# Training (identical to standard training; dropout acts as a regularizer)
model = BayesianMLP(input_dim=784, hidden_dim=256, output_dim=10)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# NOTE(review): `dataloader` is assumed to be defined elsewhere — example code.
for epoch in range(10):
    for batch_x, batch_y in dataloader:
        optimizer.zero_grad()
        output = model(batch_x)  # dropout is active automatically in train mode
        loss = F.cross_entropy(output, batch_y)
        loss.backward()
        optimizer.step()

# Inference (key point: dropout must stay enabled)
model.eval()  # eval mode would DISABLE dropout!
model.train()  # so switch back to train mode before MC sampling

# NOTE(review): `test_x` is assumed to be defined elsewhere — example code.
mean, std, is_uncertain = model.predict_with_uncertainty(test_x, n_samples=100)

print(f"预测: {mean.argmax(dim=-1)}")
print(f"不确定性: {std.mean():.4f}")
print(f"不确定样本: {is_uncertain.sum()} / {len(is_uncertain)}")

回归任务示例

class BayesianRegressor(nn.Module):
    """Heteroscedastic regressor combined with MC Dropout.

    Predicts both a mean and a per-input log-variance, so aleatoric
    (data) noise is learned directly while MC Dropout sampling supplies
    the epistemic (model) uncertainty.
    """

    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)

        # Two output heads: predictive mean and log-variance log(sigma^2).
        self.mean_head = nn.Linear(hidden_dim, 1)
        self.log_var_head = nn.Linear(hidden_dim, 1)

        self.dropout = nn.Dropout(0.1)

    def forward(self, x):
        # Note: dropout is applied before the ReLU here (original op order kept).
        h = F.relu(self.dropout(self.fc1(x)))
        h = F.relu(self.dropout(self.fc2(h)))
        return self.mean_head(h), self.log_var_head(h)

    def mc_predict(self, x, n_samples=50):
        """MC Dropout prediction with uncertainty decomposition.

        Returns:
            predictive_mean: Mean of the sampled means.
            predictive_var: Total variance (epistemic + aleatoric).
            epistemic_var: Variance of the sampled means.
            aleatoric_var: Mean of the sampled exp(log_var).
        """
        self.train()  # keep dropout active while sampling

        mean_draws, log_var_draws = [], []
        with torch.no_grad():
            for _ in range(n_samples):
                mu, lv = self(x)
                mean_draws.append(mu)
                log_var_draws.append(lv)

        mean_draws = torch.stack(mean_draws)        # (T, N, 1)
        log_var_draws = torch.stack(log_var_draws)  # (T, N, 1)

        predictive_mean = mean_draws.mean(dim=0)
        # Spread of the sampled means = epistemic (model) uncertainty.
        epistemic_var = mean_draws.var(dim=0)
        # Average learned data noise = aleatoric uncertainty.
        aleatoric_var = torch.exp(log_var_draws).mean(dim=0)

        predictive_var = epistemic_var + aleatoric_var
        return predictive_mean, predictive_var, epistemic_var, aleatoric_var
 
 
# Usage example: 1-D input regressor
model = BayesianRegressor(input_dim=1, hidden_dim=64)
 
# Training objective for the heteroscedastic head
def heteroscedastic_loss(pred_mean, pred_log_var, target):
    """Gaussian negative log-likelihood with a learned per-sample variance.

    Computes 0.5 * mean( (y - mu)^2 / sigma^2 + log(sigma^2) ): errors are
    weighted by the predicted precision, while the log-variance term stops
    the model from inflating sigma to hide its errors.
    """
    var = torch.exp(pred_log_var)
    squared_error = (target - pred_mean) ** 2
    return 0.5 * torch.mean(squared_error / var + pred_log_var)
 
# Inference (NOTE(review): `x_test` is assumed to be defined elsewhere — example code)
pred_mean, pred_var, epi_var, ale_var = model.mc_predict(x_test, n_samples=100)

print(f"预测均值: {pred_mean.squeeze()}")
print(f"预测标准差: {pred_var.sqrt().squeeze()}")
print(f"  - 认知不确定性: {epi_var.sqrt().squeeze()}")
print(f"  - 偶然不确定性: {ale_var.sqrt().squeeze()}")

理论分析

近似质量

MC Dropout 的近似质量取决于:

| 因素 | 影响 |
| --- | --- |
| 采样次数 $T$ | $T$ 越大,方差估计越准确 |
| Dropout 概率 $p$ | 影响后验近似的紧密程度 |
| 网络宽度 | 更宽的网络近似更准确 |

通常建议采样次数 $T$ 取几十次(例如 30–100)以获得稳定的方差估计。

与其他方法的对比

| 方法 | 精度 | 速度 | 实现难度 |
| --- | --- | --- | --- |
| MC Dropout | 中等 | 快 | 低 |
| 贝叶斯变分推断 | 中等 | 中等 | 中等 |
| MCMC 采样 | 高 | 慢 | 高 |
| 深度集成 | 高 | 中等 | 可调 |

局限性

  1. 后验近似质量有限:Dropout 假设过于简化
  2. 需要多次前向传播:计算成本较高
  3. 方差估计有偏:尤其对于分类任务

Out-of-Distribution 检测

MC Dropout 的一个重要应用是检测分布外(Out-of-Distribution, OOD)样本

方法

分布外样本通常表现为高认知不确定性

def detect_ood(model, x_in_distribution, x_out_of_distribution, n_samples=50):
    """Score in-distribution vs. out-of-distribution inputs via MC Dropout.

    OOD samples typically exhibit higher epistemic uncertainty, so the
    mean epistemic variance is used as the OOD score.

    Args:
        model: Model exposing ``mc_predict`` that returns a 4-tuple whose
            third element is the epistemic variance (e.g. BayesianRegressor).
        x_in_distribution: Batch of in-distribution inputs.
        x_out_of_distribution: Batch of suspected OOD inputs.
        n_samples: Number of MC Dropout samples.

    Returns:
        ood_score_in: Per-sample scores for the in-distribution batch.
        ood_score_out: Per-sample scores for the OOD batch.
    """
    # BUG FIX: mc_predict returns 4 values (mean, total_var, epistemic_var,
    # aleatoric_var); the original 3-element unpacking raised ValueError.
    _, _, epi_in, _ = model.mc_predict(x_in_distribution, n_samples)
    _, _, epi_out, _ = model.mc_predict(x_out_of_distribution, n_samples)

    # Method 1: mean epistemic variance as the OOD score.
    ood_score_in = epi_in.mean(dim=-1)
    ood_score_out = epi_out.mean(dim=-1)

    # Method 2 (classification alternative): entropy of the max probability,
    # ood_score = -max_p * log(max_p).

    return ood_score_in, ood_score_out
 
 
def ood_detection_auroc(model, x_id, x_ood, n_samples=50):
    """Compute the AUROC of epistemic-uncertainty-based OOD detection.

    The mean epistemic variance is the detection score. OOD samples are
    the positive class, since a good detector assigns them HIGHER
    uncertainty.

    Args:
        model: Model exposing ``mc_predict`` returning (mean, total_var,
            epistemic_var, aleatoric_var).
        x_id: In-distribution batch.
        x_ood: Out-of-distribution batch.
        n_samples: Number of MC Dropout samples.

    Returns:
        AUROC in [0, 1]; ~1.0 means uncertainty separates OOD perfectly.
    """
    # BUG FIX: mc_predict returns a 4-tuple; the original unpacked only 3
    # values (ValueError) and would have grabbed the total variance rather
    # than the epistemic component.
    _, _, epi_id, _ = model.mc_predict(x_id, n_samples)
    _, _, epi_ood, _ = model.mc_predict(x_ood, n_samples)

    scores = torch.cat([epi_id.mean(dim=-1), epi_ood.mean(dim=-1)])
    # BUG FIX: higher uncertainty should indicate OOD, so OOD must be the
    # positive class (label 1); the original labeling (ID=1) inverted the
    # AUROC, reporting ~0 for a perfect detector.
    labels = torch.cat([torch.zeros(len(x_id)), torch.ones(len(x_ood))])

    # Rank-based (Mann-Whitney U) AUROC, equivalent to
    # sklearn.metrics.roc_auc_score; average ranks handle ties. Uses scipy
    # instead of the previously undeclared sklearn dependency.
    from scipy.stats import rankdata
    ranks = rankdata(scores.numpy())
    n_ood = len(x_ood)
    n_id = len(x_id)
    pos_rank_sum = ranks[labels.numpy() == 1].sum()
    auroc = (pos_rank_sum - n_ood * (n_ood + 1) / 2) / (n_ood * n_id)

    return auroc

核心公式速查

| 概念 | 公式 |
| --- | --- |
| 预测均值 | $\frac{1}{T} \sum_{t=1}^{T} f^{\hat{\omega}_t}(x^*)$ |
| Epistemic 方差 | $\frac{1}{T} \sum_t f^{\hat{\omega}_t}(x^*)^2 - \left( \frac{1}{T} \sum_t f^{\hat{\omega}_t}(x^*) \right)^2$ |
| 预测方差 | $\tau^{-1} + \text{Epistemic 方差}$ |
| 分类预测 | $\frac{1}{T} \sum_t \text{softmax}\!\left(f^{\hat{\omega}_t}(x^*)\right)$ |

参考

相关文章

Footnotes

  1. Gal, Y., & Ghahramani, Z. (2016). “Dropout as a Bayesian Approximation: Representing Model Uncertainty in Deep Learning”. ICML 2016.