概率对抗鲁棒性

概述

概率对抗鲁棒性(Probabilistic Adversarial Robustness)研究不确定性建模如何影响深度学习模型的对抗脆弱性。传统观点认为贝叶斯神经网络(BNN)的后验分布可以自然地提供鲁棒性,但最新研究表明这一假设需要重新审视。[1]

贝叶斯神经网络与对抗鲁棒性

传统观点 vs 现实

| 观点 | 内容 |
| --- | --- |
| 传统观点 | BNN 的权重不确定性自然提供对抗鲁棒性 |
| 现实发现 | SOTA BNN 方法对对抗攻击仍然脆弱 |

BNN 对抗脆弱性分析

def analyze_bnn_adversarial_vulnerability(bnn_model, x, y, epsilon=8/255,
                                          num_samples=100):
    """
    Compare a stochastic model's repeated votes on clean and attacked inputs.

    For each of ``num_samples`` rounds the model is queried once on ``x`` and
    once on a freshly generated adversarial version of ``x``; the per-round
    argmax labels are then summarised.

    Args:
        bnn_model: Callable mapping inputs to logits with classes on dim 1.
            Assumed to be stochastic at call time (e.g. MC Dropout left
            active) -- this function does not change its train/eval mode.
        x: Input batch.
        y: Labels forwarded to ``pgd_attack``.
        epsilon: Perturbation budget forwarded to ``pgd_attack``.
        num_samples: Number of stochastic rounds (default 100, the value that
            used to be hard-coded).

    Returns:
        dict with keys ``'clean_entropy'`` and ``'adv_entropy'`` (results of
        ``calc_entropy_majority_vote`` on the stacked votes) and
        ``'flip_rate'`` (fraction of inputs whose majority-vote label differs
        between clean and adversarial votes, as a 0-d tensor).

    Raises:
        ValueError: If ``num_samples`` is smaller than 1.
    """
    if num_samples < 1:
        raise ValueError("num_samples must be at least 1")

    dropout_model = bnn_model  # assumed to be an MC Dropout style model
    preds_clean = []
    preds_adv = []

    for _ in range(num_samples):
        # Label extraction needs no autograd graph.
        with torch.no_grad():
            preds_clean.append(dropout_model(x).argmax(dim=1))

        # pgd_attack is defined outside this block and is left outside
        # no_grad because it presumably needs input gradients.
        x_adv = pgd_attack(dropout_model, x, y, epsilon)
        with torch.no_grad():
            preds_adv.append(dropout_model(x_adv).argmax(dim=1))

    # Stack along a new leading "round" dimension.
    preds_clean = torch.stack(preds_clean)
    preds_adv = torch.stack(preds_adv)

    clean_vote = preds_clean.mode(0)[0]
    adv_vote = preds_adv.mode(0)[0]

    return {
        'clean_entropy': calc_entropy_majority_vote(preds_clean),
        'adv_entropy': calc_entropy_majority_vote(preds_adv),
        'flip_rate': (clean_vote != adv_vote).float().mean(),
    }
 
 
def calc_entropy_majority_vote(predictions):
    """
    Return the mean Shannon entropy (in bits) of the per-input vote distribution.

    The previous formula, ``-a * log2(a)`` with ``a`` the global agreement
    rate, is a single entropy term: it is not monotone in disagreement (a
    2-way and a 4-way even split both scored 0.5). This version builds the
    empirical label distribution over the leading (vote) dimension for every
    input and averages the resulting entropies, so unanimous votes give 0 and
    more spread-out votes always give a larger value.

    Args:
        predictions: Integer label tensor whose dim 0 indexes repeated votes,
            e.g. ``[num_votes, batch]``.

    Returns:
        0-d float tensor: entropy in bits, averaged over all non-vote
        positions.
    """
    # Only labels that actually occur can carry probability mass.
    classes = torch.unique(predictions)
    # Fraction of votes per label: [..., num_labels] after reducing dim 0.
    vote_share = (predictions.unsqueeze(-1) == classes).float().mean(dim=0)
    # clamp_min keeps log2 finite; a zero share still contributes exactly 0.
    per_input = -(vote_share * torch.log2(vote_share.clamp_min(1e-10))).sum(dim=-1)
    return per_input.mean()

对抗鲁棒性认证

BNN 认证框架

class BayesianCertifiedRobustness:
    """
    Sampling-based probabilistic check of a stochastic model's prediction.

    The model is queried ``num_samples`` times on the same input; quantiles of
    the resulting class probabilities are used as a per-class interval.
    """

    def __init__(self, model, num_samples=1000):
        """
        Args:
            model: Callable mapping an input batch to logits ``[batch, classes]``.
            num_samples: Number of forward passes drawn per ``certify`` call.
        """
        self.model = model
        self.num_samples = num_samples

    def _model_device(self, x):
        """Return the device of the model's first parameter, else ``x``'s device."""
        parameters = getattr(self.model, 'parameters', None)
        if callable(parameters):
            first = next(iter(parameters()), None)
            if first is not None:
                return first.device
        return x.device

    def certify(self, x, epsilon, alpha=0.001):
        """
        Check whether the predicted class is separated from every other class.

        An input counts as certified when the lower quantile of the predicted
        class's probability exceeds the upper quantile of *every* other class.
        Earlier code compared against one class at a time and kept only the
        last comparison; it also failed for batches larger than one.

        Args:
            x: Input batch. Moved to the model's device (previously an
                undefined global ``device`` was used).
            epsilon: Accepted for interface compatibility; not used by the
                computation below.
            alpha: Two-sided tail mass; quantiles ``alpha/2`` and
                ``1 - alpha/2`` form the interval.

        Returns:
            dict with keys ``'prediction'`` (``[batch]`` labels),
            ``'confidence'`` (``[batch]`` mean probability of the predicted
            class), ``'uncertainty'`` (0-d mean of the per-class standard
            deviation), ``'radius'`` (``[batch]``) and ``'certified'``
            (``[batch]`` bool).
        """
        x = x.to(self._model_device(x))

        # Repeated stochastic forward passes; no autograd graph is needed.
        with torch.no_grad():
            all_probs = torch.stack([
                F.softmax(self.model(x), dim=1)
                for _ in range(self.num_samples)
            ])  # [num_samples, batch, num_classes]

        mean_probs = all_probs.mean(dim=0)
        std_probs = all_probs.std(dim=0)

        lower = torch.quantile(all_probs, alpha / 2, dim=0)
        upper = torch.quantile(all_probs, 1 - alpha / 2, dim=0)

        pred_class = mean_probs.argmax(dim=1)
        pred_index = pred_class.unsqueeze(1)
        # squeeze(1) rather than squeeze() so a batch of one stays 1-D.
        pred_prob = mean_probs.gather(1, pred_index).squeeze(1)
        self_lower = lower.gather(1, pred_index).squeeze(1)

        # Strongest competitor: mask out the predicted class, then take the
        # largest upper bound among the remaining classes.
        rival_upper = upper.scatter(1, pred_index, float('-inf')).max(dim=1).values

        certified_mask = self_lower > rival_upper
        radius = self.compute_certified_radius(self_lower, rival_upper)

        return {
            'prediction': pred_class,
            'confidence': pred_prob,
            'uncertainty': std_probs.mean(),
            'radius': radius,
            'certified': certified_mask
        }

    def compute_certified_radius(self, lower, upper):
        """
        Map a probability margin to a radius (simplified heuristic).

        Returns ``(lower - upper) * 100`` clamped at zero. The factor 100 is a
        placeholder mapping from the original code, not a derived bound.
        """
        margin = lower - upper
        radius = margin * 100  # simplified mapping
        return radius.clamp(min=0)

PAC-Bayes 认证

PAC-Bayes 框架提供 BNN 的泛化保证:[2]

def pac_bayes_certified_radius(kl_bound, expected_risk_prior, delta=0.05, m=10000):
    """
    Turn a PAC-Bayes style complexity term into a radius.

    Computes ``sqrt((kl_bound + ln(2 * sqrt(m) / delta)) / m)``.

    Args:
        kl_bound: Upper bound on KL(q || p).
        expected_risk_prior: Expected risk under the prior. Accepted for
            interface compatibility; not used by the formula below.
        delta: Failure probability of the bound.
        m: Number of samples. Defaults to 10000, the value that used to be
            hard-coded inside the function.

    Returns:
        The square root of the complexity term (NumPy scalar for scalar
        inputs). Treating this value as a perturbation radius is the original
        code's simplification.
    """
    pac_bound = (kl_bound + np.log(2 * np.sqrt(m) / delta)) / m

    # Radius taken as the square root of the generalization term.
    return np.sqrt(pac_bound)
 
 
class PACBayesianRobustness:
    """
    KL-based comparison of a prior model and a posterior model.
    """

    def __init__(self, model, prior_model, posterior_model):
        """Store the base model together with the prior and posterior models."""
        self.posterior = posterior_model
        self.prior = prior_model
        self.model = model

    def compute_kl_bound(self, x, y, epsilon):
        """
        Estimate KL(posterior || prior) between averaged predictive distributions.

        Over 100 rounds the prior is evaluated on ``x`` and the posterior on
        ``x`` plus Gaussian noise scaled by ``epsilon``; the softmax outputs
        of each model are averaged and the KL divergence of the averages is
        returned as a batch mean. ``y`` is not used.
        """
        tiny = 1e-10  # keeps the logarithms finite
        prior_draws = []
        posterior_draws = []

        for _round in range(100):
            # Prior sees the unperturbed input.
            prior_draws.append(F.softmax(self.prior(x), dim=1))

            # Posterior sees a randomly perturbed input.
            noisy_x = x + torch.randn_like(x) * epsilon
            posterior_draws.append(F.softmax(self.posterior(noisy_x), dim=1))

        p = torch.stack(prior_draws).mean(dim=0)
        q = torch.stack(posterior_draws).mean(dim=0)

        # Element-wise q * (log q - log p), summed over classes.
        log_ratio = torch.log(q + tiny) - torch.log(p + tiny)
        per_input_kl = (q * log_ratio).sum(dim=1)
        return per_input_kl.mean()

CVaR 概率鲁棒性

Conditional Value at Risk 框架

CVaR 提供了一种概率鲁棒性度量:[3]

def cvar_robust_optimization(model, x, y, epsilon, alpha=0.05, num_samples=10000):
    """
    Monte Carlo estimate of the per-input CVaR of the loss under random noise.

    ``CVaR_alpha = E[loss | loss >= VaR_alpha]``, where ``VaR_alpha`` is taken
    as the ``(1 - alpha)`` quantile of the sampled losses, i.e. the mean of
    the worst ``alpha`` fraction of draws.

    Each draw adds Gaussian noise scaled by ``epsilon`` to ``x``, clamps the
    result to ``[0, 1]`` and records the per-input cross-entropy. The model
    is now called once per draw (it used to be called twice, once for a
    prediction list that was never used).

    Args:
        model: Callable mapping inputs to logits ``[batch, classes]``.
        x: Input batch.
        y: Target labels, shape ``[batch]``.
        epsilon: Scale of the Gaussian perturbation.
        alpha: Tail fraction.
        num_samples: Number of noise draws.

    Returns:
        Tensor of shape ``[batch]`` with one CVaR value per input.

    Raises:
        ValueError: If ``num_samples`` is smaller than 1.
    """
    if num_samples < 1:
        raise ValueError("num_samples must be at least 1")

    losses = []
    with torch.no_grad():
        for _ in range(num_samples):
            # Random (not optimized) perturbation.
            delta = torch.randn_like(x) * epsilon
            x_adv = (x + delta).clamp(0, 1)
            losses.append(F.cross_entropy(model(x_adv), y, reduction='none'))

    losses = torch.stack(losses)  # [num_samples, batch]

    cvar_losses = []
    # One column per input; quantile is taken per column, as before.
    for sample_losses in losses.unbind(dim=1):
        var = torch.quantile(sample_losses, 1 - alpha)
        cvar_losses.append(sample_losses[sample_losses >= var].mean())

    return torch.stack(cvar_losses)
 
 
def adaptive_cvar_certification(model, x, y, epsilon_range, threshold=0.5):
    """
    Report every epsilon whose batch-mean CVaR stays below a threshold.

    Args:
        model: Model forwarded to ``cvar_robust_optimization``.
        x: Input batch.
        y: Target labels.
        epsilon_range: Iterable of perturbation scales to try, in the order
            they should be reported.
        threshold: Acceptance level for the mean CVaR. Defaults to 0.5, the
            value that used to be hard-coded.

    Returns:
        List of dicts ``{'epsilon', 'cvar', 'certified'}``, one per accepted
        epsilon; rejected values are omitted. The check is an empirical
        estimate from ``cvar_robust_optimization``, so ``'certified'`` marks
        passing the threshold rather than a formal guarantee.
    """
    results = []

    for epsilon in epsilon_range:
        # Evaluate the mean once and reuse it for the test and the report.
        mean_cvar = cvar_robust_optimization(model, x, y, epsilon).mean()

        if mean_cvar < threshold:
            results.append({
                'epsilon': epsilon,
                'cvar': mean_cvar.item(),
                'certified': True
            })

    return results

AT-PR:概率鲁棒性对抗训练

AT-PR 是一种专门为概率鲁棒性设计的对抗训练方法:[4]

def at_pr_loss(model, x, y, epsilon=8/255, alpha=0.05, num_samples=100):
    """
    AT-PR loss: equal mix of the clean loss and a CVaR loss around a PGD point.

    A PGD adversarial example is generated once; ``num_samples`` Gaussian
    jitters (scale ``epsilon / 2``) are added to it, each clamped to
    ``[0, 1]``, and the batch-mean cross-entropy of every jittered input is
    recorded. The CVaR term is the mean of the recorded losses at or above
    their ``(1 - alpha)`` quantile.

    Returns:
        ``0.5 * clean cross-entropy + 0.5 * CVaR term`` as a scalar tensor.
    """
    # Anchor point for the sampling; pgd_attack is defined outside this block.
    attacked = pgd_attack(model, x, y, epsilon)

    draw_losses = []
    for _draw in range(num_samples):
        jitter = torch.randn_like(x) * epsilon / 2
        jittered = (attacked + jitter).clamp(0, 1)
        draw_losses.append(F.cross_entropy(model(jittered), y))
    draw_losses = torch.stack(draw_losses)

    # Tail average: losses at or above the (1 - alpha) quantile.
    tail_start = torch.quantile(draw_losses, 1 - alpha)
    in_tail = draw_losses >= tail_start
    tail_mean = draw_losses[in_tail].mean()

    # Standard cross-entropy on the unperturbed batch.
    natural = F.cross_entropy(model(x), y)

    return 0.5 * natural + 0.5 * tail_mean
 
 
class AT_PRTrainer:
    """
    Trainer that optimizes a model with ``at_pr_loss`` and evaluates it.
    """

    def __init__(self, model, epsilon=8/255, alpha=0.05, lr=1e-4, device=None):
        """
        Args:
            model: Module to train; its parameters are given to Adam.
            epsilon: Perturbation budget used for training and evaluation.
            alpha: CVaR tail fraction.
            lr: Adam learning rate.
            device: Device that evaluation batches are moved to. Defaults to
                the device of the model's first parameter, replacing the
                undefined global ``device`` the evaluation loop relied on.
        """
        self.model = model
        self.epsilon = epsilon
        self.alpha = alpha
        self.optimizer = torch.optim.Adam(model.parameters(), lr=lr)
        if device is None:
            device = next(model.parameters()).device
        self.device = device

    def train_step(self, x, y):
        """Run one optimizer step on ``at_pr_loss`` and return the loss as a float."""
        self.optimizer.zero_grad()

        loss = at_pr_loss(self.model, x, y, self.epsilon, self.alpha)
        loss.backward()
        self.optimizer.step()

        return loss.item()

    def evaluate(self, test_loader, num_samples=10000):
        """
        Evaluate clean accuracy, PGD accuracy and a CVaR pass rate.

        Args:
            test_loader: Iterable of ``(x, y)`` batches.
            num_samples: Noise draws per batch forwarded to
                ``cvar_robust_optimization`` (default matches that function's
                own default, so results are unchanged unless overridden).

        Returns:
            dict with ``'clean_acc'``, ``'robust_acc'`` and
            ``'cvar_certified'`` (fraction of inputs whose CVaR is below 0.5),
            each divided by the number of evaluated inputs.
        """
        total_clean = 0
        total_robust = 0
        total_cvar = 0
        total = 0

        for x, y in test_loader:
            x, y = x.to(self.device), y.to(self.device)

            # Clean accuracy.
            with torch.no_grad():
                clean_pred = self.model(x).argmax(dim=1)
                total_clean += clean_pred.eq(y).sum().item()

            # CVaR criterion under random noise.
            cvar = cvar_robust_optimization(self.model, x, y,
                                           self.epsilon, self.alpha,
                                           num_samples=num_samples)
            total_cvar += (cvar < 0.5).sum().item()

            # Accuracy under PGD; the attack itself runs outside no_grad.
            x_adv = pgd_attack(self.model, x, y, self.epsilon)
            with torch.no_grad():
                robust_pred = self.model(x_adv).argmax(dim=1)
                total_robust += robust_pred.eq(y).sum().item()

            total += x.size(0)

        return {
            'clean_acc': total_clean / total,
            'robust_acc': total_robust / total,
            'cvar_certified': total_cvar / total
        }

不确定性感知攻击

攻击不确定性建模

def uncertainty_aware_attack(model, x, y, num_samples=10, epsilon=8/255):
    """
    Build a signed-gradient perturbation from repeated gradient samples.

    The loss gradient with respect to the input is computed ``num_samples``
    times (the draws differ only if ``model`` is stochastic), averaged, and
    weighted by ``1 +`` the per-element standard deviation across draws.

    Fixes relative to the earlier version:
      * ``x`` is no longer modified (``requires_grad`` was switched on and
        ``x.grad`` filled in on the caller's tensor);
      * every draw gets a fresh gradient (``x.grad`` used to accumulate and
        the same tensor object was appended each round);
      * the uncertainty map has the shape of ``x`` (it used to be the
        standard deviation of the logits over the batch, which does not
        broadcast against an input gradient in general);
      * model parameter gradients are left untouched.

    Args:
        model: Callable mapping inputs to logits ``[batch, classes]``.
        x: Input batch.
        y: Target labels.
        num_samples: Number of gradient draws.
        epsilon: Step size of the returned perturbation.

    Returns:
        Tensor shaped like ``x`` holding ``epsilon * sign(weighted gradient)``;
        this is the perturbation, not the perturbed input.

    Raises:
        ValueError: If ``num_samples`` is smaller than 1.
    """
    if num_samples < 1:
        raise ValueError("num_samples must be at least 1")

    grads = []
    for _ in range(num_samples):
        # Work on a private leaf so the caller's tensor stays untouched.
        x_in = x.detach().clone().requires_grad_(True)
        loss = F.cross_entropy(model(x_in), y)
        (grad,) = torch.autograd.grad(loss, x_in)
        grads.append(grad)

    grads = torch.stack(grads)
    mean_grad = grads.mean(dim=0)
    # Population std so a single draw yields 0 rather than NaN.
    uncertainty_map = grads.std(dim=0, unbiased=False)

    # NOTE: the weight is strictly positive, so it cannot flip the sign taken
    # below; it only matters if sign() is replaced by a magnitude-aware step.
    weighted_grad = mean_grad * (1 + uncertainty_map)

    return epsilon * weighted_grad.sign()

对抗性不确定性估计

def adversarial_uncertainty_estimation(model, x, epsilon=8/255, y=None,
                                       num_variants=20):
    """
    Summarize how consistent and confident a model is across adversarial variants.

    ``num_variants`` adversarial versions of ``x`` are generated with
    ``pgd_attack``; for each one the predicted label and the top softmax
    probability are recorded.

    Args:
        model: Callable mapping inputs to logits ``[batch, classes]``.
        x: Input batch.
        epsilon: Perturbation budget forwarded to ``pgd_attack``.
        y: Labels forwarded to ``pgd_attack``. The earlier code referenced
            ``y`` without defining it; when omitted, the model's own
            predictions on ``x`` are used as labels.
        num_variants: Number of adversarial variants (default 20, the value
            that used to be hard-coded).

    Returns:
        dict of Python floats: ``'consistency'`` (share of predictions equal
        to the per-input majority label), ``'confidence'`` (mean top
        probability) and ``'uncertainty'`` (``1 - confidence``).

    Raises:
        ValueError: If ``num_variants`` is smaller than 1.
    """
    if num_variants < 1:
        raise ValueError("num_variants must be at least 1")

    if y is None:
        with torch.no_grad():
            y = model(x).argmax(dim=1)

    predictions = []
    confidences = []

    for _ in range(num_variants):
        # Variants differ only if pgd_attack (defined outside this block)
        # or the model is randomized.
        x_adv = pgd_attack(model, x, y, epsilon)

        # One forward pass yields both the label and its confidence.
        with torch.no_grad():
            logits = model(x_adv)
            predictions.append(logits.argmax(dim=1))
            confidences.append(F.softmax(logits, dim=1).max(dim=1)[0])

    predictions = torch.stack(predictions)
    confidences = torch.stack(confidences)

    # Agreement with the per-input majority label.
    mode_pred = predictions.mode(0)[0]
    consistency = (predictions == mode_pred).float().mean()

    avg_confidence = confidences.mean().item()

    return {
        'consistency': consistency.item(),
        'confidence': avg_confidence,
        'uncertainty': 1 - avg_confidence
    }

实践指南

何时使用概率鲁棒性

| 场景 | 推荐方法 |
| --- | --- |
| 安全关键应用 | PAC-Bayes 认证 |
| 分布外检测 | MC Dropout + 置信度阈值 |
| 对抗鲁棒性 | AT-PR + CVaR |
| 认证保证 | 随机平滑 + 概率界 |

实现注意事项

class ProbabilisticRobustnessPipeline:
    """
    End-to-end wrapper: AT-PR training, sampling-based certification and a
    PGD attack check for one model.
    """

    def __init__(self, model, device='cuda'):
        """Move ``model`` to ``device`` and remember both."""
        self.model = model.to(device)
        self.device = device

    def train(self, train_loader, epsilon=8/255, epochs=100):
        """
        Train with ``at_pr_loss`` using Adam (lr 1e-3).

        Args:
            train_loader: Iterable of ``(x, y)`` batches.
            epsilon: Perturbation budget passed to ``at_pr_loss``.
            epochs: Number of passes over ``train_loader``.
        """
        optimizer = torch.optim.Adam(self.model.parameters(), lr=1e-3)

        for epoch in range(epochs):
            for x, y in train_loader:
                x, y = x.to(self.device), y.to(self.device)

                optimizer.zero_grad()
                loss = at_pr_loss(self.model, x, y, epsilon)
                loss.backward()
                optimizer.step()

    def certify(self, test_loader, epsilon=8/255):
        """
        Run ``BayesianCertifiedRobustness.certify`` on every batch.

        Returns:
            List with one result dict per batch, in loader order. Labels are
            moved to the device but not otherwise used.
        """
        certifier = BayesianCertifiedRobustness(self.model)

        results = []
        for x, y in test_loader:
            x, y = x.to(self.device), y.to(self.device)
            result = certifier.certify(x, epsilon)
            results.append(result)

        return results

    @staticmethod
    def _to_python(flags):
        """Return a bool for a single-element tensor, else a list of bools."""
        return flags.item() if flags.numel() == 1 else flags.tolist()

    def attack_and_defend(self, x, y, epsilon=8/255):
        """
        Attack ``x`` with PGD and compare clean and adversarial predictions.

        Returns:
            dict with ``'clean_correct'`` and ``'adversarial_correct'`` -- a
            Python bool for a single input (unchanged behavior) or a list of
            bools for a batch, where ``.item()`` previously raised -- and
            ``'attacked'``, a bool tensor marking inputs whose prediction
            changed.
        """
        # Generate the attack outside no_grad.
        x_adv = pgd_attack(self.model, x, y, epsilon)

        with torch.no_grad():
            clean_pred = self.model(x).argmax(dim=1)
            adv_pred = self.model(x_adv).argmax(dim=1)

        return {
            'clean_correct': self._to_python(clean_pred.eq(y)),
            'adversarial_correct': self._to_python(adv_pred.eq(y)),
            'attacked': adv_pred != clean_pred
        }

相关主题


参考文献

Footnotes

  1. Feng, Y., et al. (2024). Attacking Bayes: On the Adversarial Robustness of Bayesian Neural Networks. OpenReview. https://openreview.net/forum?id=C6wj17VBnu

  2. Wicker, M., et al. (2021). Adversarial Robustness Certification for Bayesian Neural Networks. NeurIPS 2021 Workshop. https://proceedings.mlr.press/v130/wicker21a.html

  3. Feng, Y., et al. (2025). CVaR-based Adversarial Training. arXiv:2502.14833. https://arxiv.org/abs/2502.14833

  4. Zhang, H., et al. (2025). AT-PR: Adversarial Training for Probabilistic Robustness. ICCV 2025. https://arxiv.org/abs/2502.14833