# 概率对抗鲁棒性
## 概述
概率对抗鲁棒性(Probabilistic Adversarial Robustness)研究不确定性建模如何影响深度学习模型的对抗脆弱性。传统观点认为贝叶斯神经网络(BNN)的后验分布可以自然地提供鲁棒性,但最新研究表明这一假设需要重新审视。[^1]
## 贝叶斯神经网络与对抗鲁棒性
### 传统观点 vs 现实
| 观点 | 内容 |
|---|---|
| 传统观点 | BNN 的权重不确定性自然提供对抗鲁棒性 |
| 现实发现 | SOTA BNN 方法对对抗攻击仍然脆弱 |
### BNN 对抗脆弱性分析
def analyze_bnn_adversarial_vulnerability(bnn_model, x, y, epsilon=8/255,
                                          num_samples=100):
    """Compare repeated predictions of a stochastic model on clean and PGD inputs.

    The model is queried ``num_samples`` times. On every pass a fresh
    adversarial batch is produced with ``pgd_attack`` and both the clean and
    the adversarial batch are classified. The per-pass class predictions are
    then summarised by their vote entropy and by how often the majority vote
    changes under attack.

    Args:
        bnn_model: Callable mapping a batch to logits of shape
            ``[batch, num_classes]``. Presumably its forward pass is
            stochastic (e.g. MC Dropout left active) - TODO confirm with the
            caller; a deterministic model yields identical clean passes.
        x: Clean input batch.
        y: Labels passed to ``pgd_attack``.
        epsilon: Perturbation budget forwarded to ``pgd_attack``.
        num_samples: Number of stochastic passes (previously fixed at 100).

    Returns:
        dict with tensor values ``'clean_entropy'``, ``'adv_entropy'`` and
        ``'flip_rate'`` (fraction of examples whose majority-vote class
        differs between clean and adversarial inputs).

    Raises:
        ValueError: If ``num_samples`` is smaller than 1.
    """
    if num_samples < 1:
        raise ValueError("num_samples must be at least 1")

    dropout_model = bnn_model  # the original snippet assumes MC Dropout

    preds_clean = []
    preds_adv = []
    for _ in range(num_samples):
        # Only the arg-max class is kept, so no autograd graph is needed.
        with torch.no_grad():
            preds_clean.append(dropout_model(x).argmax(dim=1))

        # pgd_attack is defined outside this snippet; it is called once per
        # pass, exactly as in the original loop.
        x_adv = pgd_attack(dropout_model, x, y, epsilon)
        with torch.no_grad():
            preds_adv.append(dropout_model(x_adv).argmax(dim=1))

    preds_clean = torch.stack(preds_clean)  # [num_samples, batch]
    preds_adv = torch.stack(preds_adv)

    clean_vote = preds_clean.mode(0)[0]
    adv_vote = preds_adv.mode(0)[0]
    return {
        'clean_entropy': calc_entropy_majority_vote(preds_clean),
        'adv_entropy': calc_entropy_majority_vote(preds_adv),
        'flip_rate': (clean_vote != adv_vote).float().mean(),
    }
def calc_entropy_majority_vote(predictions):
    """Return the mean Shannon entropy (in bits) of the per-example votes.

    ``predictions`` holds one class index per stochastic pass along dim 0
    (``[num_samples, batch]`` when built by stacking ``argmax`` outputs).
    For every example the empirical vote distribution over the classes that
    occur is formed and its entropy computed; the result is averaged over
    the remaining dimensions.

    The previous formula, ``-a * log2(a)`` with ``a`` the agreement rate, is a
    single entropy term: it peaks at ``a = 1/e`` and shrinks again for lower
    agreement, so it did not satisfy "low agreement = high entropy".

    Args:
        predictions: Integer tensor of class indices, passes along dim 0.

    Returns:
        0-dim tensor: 0 when all passes agree, ``log2(k)`` when the votes are
        split evenly over ``k`` classes.

    Raises:
        ValueError: If ``predictions`` is empty.
    """
    if predictions.numel() == 0:
        raise ValueError("predictions must not be empty")

    # Share of passes voting for each observed class, per example.
    classes = torch.unique(predictions)
    vote_share = torch.stack(
        [(predictions == c).float().mean(dim=0) for c in classes],
        dim=-1,
    )

    # clamp_min keeps log2 finite; a zero share still contributes 0 * log = 0.
    entropy = -(vote_share * torch.log2(vote_share.clamp_min(1e-10))).sum(dim=-1)
    return entropy.mean()

## 对抗鲁棒性认证
### BNN 认证框架
class BayesianCertifiedRobustness:
    """Sampling-based, heuristic robustness check for a stochastic classifier.

    The model is queried ``num_samples`` times on the same batch; empirical
    quantiles of the resulting class probabilities serve as confidence
    bounds. An example counts as certified when the lower bound of its
    predicted class exceeds the upper bound of every other class.
    """

    def __init__(self, model, num_samples=1000):
        self.model = model
        self.num_samples = num_samples

    def _model_device(self, x):
        """Return the device of the model's first parameter, else ``x.device``."""
        parameters = getattr(self.model, "parameters", None)
        if callable(parameters):
            first = next(iter(parameters()), None)
            if first is not None:
                return first.device
        return x.device

    def certify(self, x, epsilon, alpha=0.001):
        """Compute prediction, confidence bounds and a heuristic radius.

        Args:
            x: Input batch; moved to the model's device.
            epsilon: Accepted for interface compatibility. This simplified
                check does not use it.
            alpha: Two-sided significance level; the bounds are the
                ``alpha/2`` and ``1 - alpha/2`` empirical quantiles over the
                sampled probabilities.

        Returns:
            dict with ``'prediction'`` (``[batch]`` class indices),
            ``'confidence'`` (mean probability of the predicted class),
            ``'uncertainty'`` (scalar mean of the per-class std),
            ``'radius'`` (``[batch]``, see ``compute_certified_radius``) and
            ``'certified'`` (``[batch]`` bool mask).
        """
        # The original referenced an undefined global `device`.
        x = x.to(self._model_device(x))

        # Presumably each forward pass draws new weights / dropout masks -
        # TODO confirm; with a deterministic model all samples coincide.
        with torch.no_grad():
            all_probs = torch.stack([
                F.softmax(self.model(x), dim=1)
                for _ in range(self.num_samples)
            ])  # [num_samples, batch, num_classes]

        mean_probs = all_probs.mean(dim=0)  # posterior predictive estimate
        std_probs = all_probs.std(dim=0)    # spread across samples

        lower = torch.quantile(all_probs, alpha / 2, dim=0)
        upper = torch.quantile(all_probs, 1 - alpha / 2, dim=0)

        pred_class = mean_probs.argmax(dim=1)
        pred_index = pred_class.unsqueeze(1)
        # squeeze(1) rather than squeeze(): keeps the batch axis for batch 1.
        pred_prob = mean_probs.gather(1, pred_index).squeeze(1)
        self_lower = lower.gather(1, pred_index).squeeze(1)

        # Strongest competitor: mask out the predicted class, then take the
        # largest remaining upper bound. The original class loop compared a
        # Python int against a batch tensor and kept only the last class.
        other_upper = (
            upper.scatter(1, pred_index, float('-inf')).max(dim=1).values
        )
        certified_mask = self_lower > other_upper

        radius = self.compute_certified_radius(self_lower, other_upper)
        return {
            'prediction': pred_class,
            'confidence': pred_prob,
            'uncertainty': std_probs.mean(),
            'radius': radius,
            'certified': certified_mask
        }

    def compute_certified_radius(self, lower, upper, scale=100.0):
        """Map a probability margin to a non-negative radius (simplified).

        Args:
            lower: Lower probability bound of the predicted class.
            upper: Upper probability bound of the competing class.
            scale: Linear factor applied to the margin (previously the
                literal 100). This is a placeholder mapping, not a derived
                bound.

        Returns:
            ``(lower - upper) * scale`` clamped at zero.
        """
        margin = lower - upper
        return (margin * scale).clamp(min=0)

### PAC-Bayes 认证
PAC-Bayes 框架提供 BNN 的泛化保证:[^2]
def pac_bayes_certified_radius(kl_bound, expected_risk_prior, delta=0.05,
                               m=10000):
    """Return the square root of a PAC-Bayes style complexity term.

    Computes ``sqrt((kl_bound + ln(2 * sqrt(m) / delta)) / m)``.

    Args:
        kl_bound: Upper bound on KL(q || p).
        expected_risk_prior: Expected risk under the prior. Kept for
            interface compatibility; the formula does not use it.
        delta: Failure probability of the bound, in ``(0, 1)``.
        m: Number of samples. Previously a local constant of 10000 although
            the docstring listed it as an input.

    Returns:
        The value above, which the surrounding text treats as a certified
        epsilon.

    Raises:
        ValueError: If ``m`` is not positive or ``delta`` is outside (0, 1).
    """
    if m <= 0:
        raise ValueError("m must be positive")
    if not 0 < delta < 1:
        raise ValueError("delta must lie in (0, 1)")

    pac_bound = (kl_bound + np.log(2 * np.sqrt(m) / delta)) / m

    # The square root of the complexity term is used directly as the radius.
    certified_epsilon = np.sqrt(pac_bound)
    return certified_epsilon
class PACBayesianRobustness:
    """Estimates a prediction-space KL term between a prior and a posterior model."""

    def __init__(self, model, prior_model, posterior_model):
        self.model = model
        self.prior = prior_model
        self.posterior = posterior_model

    def compute_kl_bound(self, x, y, epsilon, num_samples=100):
        """Estimate KL(posterior predictive || prior predictive) on a batch.

        The prior model is evaluated on ``x`` and the posterior model on
        ``x`` plus Gaussian noise of standard deviation ``epsilon``. Class
        probabilities are averaged over ``num_samples`` passes and the KL
        divergence between the two averaged distributions is returned. This
        is a KL between predictive distributions, not between weight
        distributions.

        Args:
            x: Input batch.
            y: Labels. Kept for interface compatibility; not used here.
            epsilon: Standard deviation of the input noise given to the
                posterior model.
            num_samples: Number of passes per model (previously fixed at 100).

        Returns:
            0-dim tensor: KL summed over classes and averaged over the batch.

        Raises:
            ValueError: If ``num_samples`` is smaller than 1.
        """
        if num_samples < 1:
            raise ValueError("num_samples must be at least 1")

        prior_probs = []
        posterior_probs = []
        # Calls stay interleaved, as in the original loop, so the order of
        # random draws is unchanged.
        for _ in range(num_samples):
            prior_probs.append(F.softmax(self.prior(x), dim=1))

            # Random Gaussian perturbation; this is not an optimised attack.
            x_noisy = x + torch.randn_like(x) * epsilon
            posterior_probs.append(F.softmax(self.posterior(x_noisy), dim=1))

        prior_mean = torch.stack(prior_probs).mean(dim=0)
        posterior_mean = torch.stack(posterior_probs).mean(dim=0)

        # 1e-10 keeps both logarithms finite when a probability is zero.
        kl = posterior_mean * (torch.log(posterior_mean + 1e-10) -
                               torch.log(prior_mean + 1e-10))
        return kl.sum(dim=1).mean()

## CVaR 概率鲁棒性
### Conditional Value at Risk 框架
CVaR 提供了一种概率鲁棒性度量:[^3]
def cvar_robust_optimization(model, x, y, epsilon, alpha=0.05, num_samples=10000):
    """Estimate the per-example CVaR of the loss under random input noise.

    ``CVaR_alpha = E[loss | loss >= VaR_alpha]`` where ``VaR_alpha`` is the
    ``1 - alpha`` quantile of the sampled losses.

    For each of ``num_samples`` draws, Gaussian noise with standard deviation
    ``epsilon`` is added to ``x``, the result is clamped to ``[0, 1]`` and the
    per-example cross-entropy is recorded. The noise is random, not an
    optimised adversarial perturbation, and is not bounded by ``epsilon``.

    Args:
        model: Callable returning logits of shape ``[batch, num_classes]``.
        x: Input batch; values are assumed to live in ``[0, 1]`` because the
            perturbed input is clamped to that range.
        y: Target class indices, shape ``[batch]``.
        epsilon: Standard deviation of the noise.
        alpha: Tail mass, in ``(0, 1]``.
        num_samples: Number of noise draws. The default of 10000 means 10000
            forward passes per call.

    Returns:
        Tensor of shape ``[batch]`` with the tail-mean loss of every example.

    Raises:
        ValueError: If ``alpha`` is outside ``(0, 1]`` or ``num_samples < 1``.
    """
    if not 0 < alpha <= 1:
        raise ValueError("alpha must lie in (0, 1]")
    if num_samples < 1:
        raise ValueError("num_samples must be at least 1")

    losses = []
    with torch.no_grad():
        for _ in range(num_samples):
            delta = torch.randn_like(x) * epsilon
            x_adv = (x + delta).clamp(0, 1)
            # One forward pass per draw; the original ran the model twice and
            # collected arg-max predictions that were never used.
            losses.append(F.cross_entropy(model(x_adv), y, reduction='none'))
    losses = torch.stack(losses)  # [num_samples, batch]

    # VaR per example, then the mean of the losses at or above it.
    var = torch.quantile(losses, 1 - alpha, dim=0)  # [batch]
    in_tail = losses >= var
    tail_sum = torch.where(in_tail, losses, torch.zeros_like(losses)).sum(dim=0)
    return tail_sum / in_tail.sum(dim=0)
def adaptive_cvar_certification(model, x, y, epsilon_range, threshold=0.5):
    """Collect the noise levels whose batch-mean CVaR stays below a threshold.

    Args:
        model: Classifier forwarded to ``cvar_robust_optimization``.
        x: Input batch.
        y: Target class indices.
        epsilon_range: Iterable of noise levels to test, in the given order.
        threshold: Upper limit on the batch-mean CVaR for a level to count as
            certified (previously the literal 0.5).

    Returns:
        A list with one dict per passing level, holding ``'epsilon'``,
        ``'cvar'`` (Python float) and ``'certified'`` (always ``True``).
        Levels that fail are omitted, as in the original.
    """
    results = []
    for epsilon in epsilon_range:
        # Evaluate the mean once; it is used for both the test and the record.
        mean_cvar = cvar_robust_optimization(model, x, y, epsilon).mean()
        if mean_cvar < threshold:
            results.append({
                'epsilon': epsilon,
                'cvar': mean_cvar.item(),
                'certified': True
            })
    return results

## AT-PR:概率鲁棒性对抗训练
AT-PR 是一种专门为概率鲁棒性设计的对抗训练方法:[^4]
def at_pr_loss(model, x, y, epsilon=8/255, alpha=0.05, num_samples=100,
               clean_weight=0.5):
    """AT-PR loss: clean cross-entropy mixed with a CVaR term around a PGD point.

    A PGD adversarial batch is generated once. Around it, ``num_samples``
    Gaussian perturbations with standard deviation ``epsilon / 2`` are drawn;
    each gives one batch-mean cross-entropy value. The CVaR term is the mean
    of those values at or above their ``1 - alpha`` quantile.

    Args:
        model: Classifier returning logits.
        x: Clean input batch, assumed to lie in ``[0, 1]`` since perturbed
            inputs are clamped to that range.
        y: Target class indices.
        epsilon: PGD budget; half of it is the noise standard deviation.
        alpha: Tail mass for the CVaR term.
        num_samples: Number of noise draws (one forward/backward graph each).
        clean_weight: Weight of the clean loss; the CVaR term receives
            ``1 - clean_weight``. The default reproduces the original 0.5/0.5.

    Returns:
        Scalar loss tensor suitable for ``backward()``.
    """
    # pgd_attack is defined outside this snippet. The result is detached so
    # the training gradient does not flow back through the attack itself.
    x_adv = pgd_attack(model, x, y, epsilon).detach()

    all_losses = []
    for _ in range(num_samples):
        delta = torch.randn_like(x) * epsilon / 2
        x_sample = (x_adv + delta).clamp(0, 1)
        all_losses.append(F.cross_entropy(model(x_sample), y))
    all_losses = torch.stack(all_losses)  # [num_samples]

    # CVaR: average of the worst alpha-fraction of sampled losses.
    var = torch.quantile(all_losses, 1 - alpha)
    cvar_loss = all_losses[all_losses >= var].mean()

    clean_loss = F.cross_entropy(model(x), y)
    return clean_weight * clean_loss + (1 - clean_weight) * cvar_loss
class AT_PRTrainer:
    """Training and evaluation loop around ``at_pr_loss``."""

    def __init__(self, model, epsilon=8/255, alpha=0.05, lr=1e-4):
        self.model = model
        self.epsilon = epsilon
        self.alpha = alpha
        self.optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    def train_step(self, x, y):
        """Run one optimisation step on a batch and return the loss as a float."""
        self.optimizer.zero_grad()
        loss = at_pr_loss(self.model, x, y, self.epsilon, self.alpha)
        loss.backward()
        self.optimizer.step()
        return loss.item()

    def evaluate(self, test_loader, cvar_threshold=0.5):
        """Measure clean accuracy, PGD accuracy and the CVaR pass rate.

        Args:
            test_loader: Iterable of ``(x, y)`` batches.
            cvar_threshold: An example counts as CVaR-certified when its CVaR
                loss is below this value (previously the literal 0.5).

        Returns:
            dict with ``'clean_acc'``, ``'robust_acc'`` and
            ``'cvar_certified'``, each a fraction of all evaluated examples.

        Raises:
            ValueError: If ``test_loader`` yields no examples.
        """
        # The original referenced an undefined global `device`; the model's
        # own parameters determine where batches must live.
        device = next(self.model.parameters()).device

        total_clean = 0
        total_robust = 0
        total_cvar = 0
        total = 0
        for x, y in test_loader:
            x, y = x.to(device), y.to(device)

            with torch.no_grad():
                clean_pred = self.model(x).argmax(dim=1)
            total_clean += clean_pred.eq(y).sum().item()

            # NOTE: called with cvar_robust_optimization's default sample
            # count, so this dominates evaluation time.
            cvar = cvar_robust_optimization(self.model, x, y,
                                            self.epsilon, self.alpha)
            total_cvar += (cvar < cvar_threshold).sum().item()

            x_adv = pgd_attack(self.model, x, y, self.epsilon)
            with torch.no_grad():
                robust_pred = self.model(x_adv).argmax(dim=1)
            total_robust += robust_pred.eq(y).sum().item()

            total += x.size(0)

        if total == 0:
            raise ValueError("test_loader produced no examples")

        return {
            'clean_acc': total_clean / total,
            'robust_acc': total_robust / total,
            'cvar_certified': total_cvar / total
        }

## 不确定性感知攻击
### 攻击不确定性建模
def uncertainty_aware_attack(model, x, y, num_samples=10, epsilon=8/255):
    """Build a signed-gradient perturbation from gradients averaged over passes.

    The input gradient of the cross-entropy loss is computed on
    ``num_samples`` forward passes and averaged. A per-example uncertainty
    (standard deviation of the class probabilities across passes, averaged
    over classes) scales the averaged gradient before the sign is taken.

    Fixes relative to the original snippet:
      * the caller's ``x`` is no longer switched to ``requires_grad=True``;
      * gradients come from ``torch.autograd.grad``, so they neither
        accumulate in ``x.grad`` across passes nor touch parameter grads;
      * uncertainty is computed per example and reshaped to broadcast with
        the input; the original per-class vector did not match its shape.

    Args:
        model: Callable returning logits of shape ``[batch, num_classes]``.
        x: Input batch with the batch dimension first.
        y: Target class indices, shape ``[batch]``.
        num_samples: Number of forward/backward passes.
        epsilon: Magnitude of every perturbation entry.

    Returns:
        Tensor shaped like ``x`` with entries in ``{-epsilon, 0, +epsilon}``.

    Raises:
        ValueError: If ``num_samples`` is smaller than 1.
    """
    if num_samples < 1:
        raise ValueError("num_samples must be at least 1")

    # Private leaf copy so the caller's tensor is left untouched.
    x_in = x.clone().detach().requires_grad_(True)

    grads = []
    probs = []
    for _ in range(num_samples):
        output = model(x_in)
        loss = F.cross_entropy(output, y)
        (grad,) = torch.autograd.grad(loss, x_in)
        grads.append(grad)
        probs.append(F.softmax(output.detach(), dim=1))

    mean_grad = torch.stack(grads).mean(dim=0)

    # [num_samples, batch, classes] -> [batch]; unbiased=False keeps the value
    # finite (zero) when only one pass is drawn.
    uncertainty = torch.stack(probs).std(dim=0, unbiased=False).mean(dim=1)
    weight_shape = (-1,) + (1,) * (x.dim() - 1)

    # NOTE: the weight is strictly positive, so it does not change the sign
    # taken below; it is kept to mirror the original weighting step.
    weighted_grad = mean_grad * (1 + uncertainty.view(weight_shape))
    return epsilon * weighted_grad.sign()

### 对抗性不确定性估计
def adversarial_uncertainty_estimation(model, x, epsilon=8/255, y=None,
                                       num_variants=20):
    """Summarise how consistently and confidently a model labels PGD variants.

    ``num_variants`` adversarial batches are generated with ``pgd_attack``
    and classified. The result reports how often predictions match the
    per-example majority vote and the mean top-class probability.

    Args:
        model: Classifier returning logits of shape ``[batch, num_classes]``.
        x: Clean input batch.
        epsilon: Perturbation budget forwarded to ``pgd_attack``.
        y: Labels for the attack. The original snippet used ``y`` without
            defining it; when omitted, the model's own clean predictions are
            used as attack targets.
        num_variants: Number of adversarial batches (previously fixed at 20).

    Returns:
        dict of Python floats: ``'consistency'``, ``'confidence'`` and
        ``'uncertainty'`` (``1 - confidence``).

    Raises:
        ValueError: If ``num_variants`` is smaller than 1.
    """
    if num_variants < 1:
        raise ValueError("num_variants must be at least 1")

    if y is None:
        with torch.no_grad():
            y = model(x).argmax(dim=1)

    predictions = []
    confidences = []
    for _ in range(num_variants):
        # Variants differ only if pgd_attack or the model is randomised -
        # presumably random restarts; verify against pgd_attack.
        x_adv = pgd_attack(model, x, y, epsilon)

        # One forward pass supplies both the predicted class and its
        # probability, so both statistics describe the same draw.
        with torch.no_grad():
            probs = F.softmax(model(x_adv), dim=1)
        conf, pred = probs.max(dim=1)
        predictions.append(pred)
        confidences.append(conf)

    predictions = torch.stack(predictions)  # [num_variants, batch]
    confidences = torch.stack(confidences)

    # Share of (variant, example) predictions equal to the majority vote.
    mode_pred = predictions.mode(0)[0]
    consistency = (predictions == mode_pred).float().mean()

    avg_confidence = confidences.mean()
    return {
        'consistency': consistency.item(),
        'confidence': avg_confidence.item(),
        'uncertainty': 1 - avg_confidence.item()
    }

## 实践指南
### 何时使用概率鲁棒性
| 场景 | 推荐方法 |
|---|---|
| 安全关键应用 | PAC-Bayes 认证 |
| 分布外检测 | MC Dropout + 置信度阈值 |
| 对抗鲁棒性 | AT-PR + CVaR |
| 认证保证 | 随机平滑 + 概率界 |
### 实现注意事项
class ProbabilisticRobustnessPipeline:
    """End-to-end workflow: AT-PR training, sampling-based certification, attack."""

    def __init__(self, model, device='cuda'):
        self.model = model.to(device)
        self.device = device

    @staticmethod
    def _to_python(flags):
        """Return a bool for a single-element tensor, else a list of bools."""
        return flags.item() if flags.numel() == 1 else flags.tolist()

    def train(self, train_loader, epsilon=8/255, epochs=100):
        """Train the model with ``at_pr_loss`` using Adam (lr 1e-3).

        Args:
            train_loader: Iterable of ``(x, y)`` batches.
            epsilon: Perturbation budget passed to ``at_pr_loss``.
            epochs: Number of passes over ``train_loader``.
        """
        optimizer = torch.optim.Adam(self.model.parameters(), lr=1e-3)
        for _ in range(epochs):
            for x, y in train_loader:
                x, y = x.to(self.device), y.to(self.device)
                optimizer.zero_grad()
                loss = at_pr_loss(self.model, x, y, epsilon)
                loss.backward()
                optimizer.step()

    def certify(self, test_loader, epsilon=8/255):
        """Run ``BayesianCertifiedRobustness.certify`` on every batch.

        Returns:
            A list with one result dict per batch, in loader order.
        """
        certifier = BayesianCertifiedRobustness(self.model)
        results = []
        for x, y in test_loader:
            x, y = x.to(self.device), y.to(self.device)
            results.append(certifier.certify(x, epsilon))
        return results

    def attack_and_defend(self, x, y, epsilon=8/255):
        """Attack a batch with PGD and report clean vs. adversarial correctness.

        Args:
            x: Input batch (any batch size).
            y: Target class indices.
            epsilon: Perturbation budget forwarded to ``pgd_attack``.

        Returns:
            dict with ``'clean_correct'`` and ``'adversarial_correct'`` - a
            bool for a single example (as before) or a list of bools for a
            larger batch, where the original ``.item()`` call raised - and
            ``'attacked'``, a bool tensor marking examples whose prediction
            changed under attack.
        """
        # Same device handling as train()/certify(); the original skipped it.
        x, y = x.to(self.device), y.to(self.device)

        x_adv = pgd_attack(self.model, x, y, epsilon)

        with torch.no_grad():
            clean_pred = self.model(x).argmax(dim=1)
            adv_pred = self.model(x_adv).argmax(dim=1)

        return {
            'clean_correct': self._to_python(clean_pred.eq(y)),
            'adversarial_correct': self._to_python(adv_pred.eq(y)),
            'attacked': adv_pred != clean_pred
        }

## 相关主题
- bayesian-neural-networks-uncertainty — 贝叶斯神经网络不确定性
- certified-robustness-theory — 认证鲁棒性理论
- adversarial-training-methods — 对抗训练方法
- mc-dropout — MC Dropout 方法
- variational-inference-advanced — 变分推断进阶
## 参考文献
[^1]: Feng, Y., et al. (2024). Attacking Bayes: On the Adversarial Robustness of Bayesian Neural Networks. OpenReview. https://openreview.net/forum?id=C6wj17VBnu

[^2]: Wicker, M., et al. (2021). Adversarial Robustness Certification for Bayesian Neural Networks. NeurIPS 2021 Workshop. https://proceedings.mlr.press/v130/wicker21a.html

[^3]: Feng, Y., et al. (2025). CVaR-based Adversarial Training. arXiv:2502.14833. https://arxiv.org/abs/2502.14833

[^4]: Zhang, H., et al. (2025). AT-PR: Adversarial Training for Probabilistic Robustness. ICCV 2025. https://arxiv.org/abs/2502.14833