认证鲁棒性理论
概述
认证鲁棒性(Certified Robustness)旨在为深度学习模型提供可证明的鲁棒性保证:对于输入 $x$ 周围的 $\epsilon$-ball(即所有满足 $\|\delta\|_p \le \epsilon$ 的扰动 $\delta$)内的任意扰动,模型的预测都保持不变。与经验鲁棒性不同,认证鲁棒性提供可证明的保证(对随机平滑而言是高置信度的概率保证),不依赖于特定的攻击方法。
认证鲁棒性的数学框架
形式化定义
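对于分类器 $f: \mathbb{R}^d \to \mathcal{Y}$,如果对所有满足 $\|\delta\|_p \le \epsilon$ 的扰动 $\delta$ 都有:
$$f(x + \delta) = f(x)$$
则称 $f$ 在 $x$ 处对 $\ell_p$ 范数扰动具有半径 $\epsilon$ 的认证鲁棒性。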
认证准确率 vs 鲁棒准确率
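$$\text{CertAcc}(\epsilon) = \frac{1}{N}\sum_{i=1}^{N} \mathbb{1}\big[f(x_i) = y_i \ \text{且}\ R(x_i) \ge \epsilon\big], \qquad \text{RobAcc}(\epsilon) = \frac{1}{N}\sum_{i=1}^{N} \mathbb{1}\big[f(x_i + \delta_i^*) = y_i\big]$$
其中 $R(x_i)$ 是认证方法在 $x_i$ 处给出的认证半径,$\delta_i^*$ 是针对样本 $x_i$ 的最优对抗扰动($\|\delta_i^*\|_p \le \epsilon$)。
关键不等式:
$$\text{CertAcc}(\epsilon) \le \text{RobAcc}(\epsilon) \le \text{AdvAcc}_{\text{attack}}(\epsilon)$$
即认证准确率是真实鲁棒准确率的下界,而在任何具体攻击下测得的经验准确率 $\text{AdvAcc}_{\text{attack}}$ 是其上界。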
随机平滑(Randomized Smoothing)
随机平滑是当前最流行的认证方法之一,由 Cohen 等人于 2019 年提出。[1]
核心思想
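将原始分类器 $f$ 转换为平滑分类器 $g$:
$$g(x) = \arg\max_{c \in \mathcal{Y}} \; \mathbb{P}_{\varepsilon \sim \mathcal{N}(0, \sigma^2 I)}\big[f(x + \varepsilon) = c\big]$$
其中 $\varepsilon \sim \mathcal{N}(0, \sigma^2 I)$ 是高斯噪声。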
认证保证
定理(Cohen et al., 2019):
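如果存在类别 $c_A$ 以及 $\underline{p_A}, \overline{p_B} \in [0, 1]$ 使得:
$$\mathbb{P}\big[f(x + \varepsilon) = c_A\big] \ge \underline{p_A} \ge \overline{p_B} \ge \max_{c \ne c_A} \mathbb{P}\big[f(x + \varepsilon) = c\big]$$
那么当 $\underline{p_A} > \overline{p_B}$ 时,$g$ 在 $x$ 处对所有 $\|\delta\|_2 < R$ 的扰动都有认证鲁棒性(即 $g(x + \delta) = c_A$),其中:
$$R = \frac{\sigma}{2}\left(\Phi^{-1}(\underline{p_A}) - \Phi^{-1}(\overline{p_B})\right)$$
其中 $\Phi^{-1}$ 是标准正态分布的逆CDF。在二分类上界 $\overline{p_B} = 1 - \underline{p_A}$ 下,上式化简为 $R = \sigma\,\Phi^{-1}(\underline{p_A})$。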
实现代码
from collections import Counter

import numpy as np
import torch
import torch.nn.functional as F
from scipy.stats import beta, norm
class RandomizedSmoothing:
    """Randomized smoothing for certified L2 robustness (Cohen et al., 2019).

    Wraps a base classifier ``f`` and works with the smoothed classifier
    ``g(x) = argmax_c P(f(x + eps) = c)``, where ``eps ~ N(0, sigma^2 I)``.

    Args:
        base_classifier: Callable mapping a noisy input tensor to class
            scores. It is called on one noisy copy of ``x`` at a time and the
            predicted class is the argmax over the whole returned tensor, so
            ``x`` must represent a single example.
        sigma: Standard deviation of the isotropic Gaussian noise (> 0).
        num_samples: Number of noise draws used by :meth:`predict`.
        alpha: Default failure probability of the certificate.
        device: Optional device the input is moved to before sampling.
            ``None`` (default) leaves the input where it already lives.

    Raises:
        ValueError: If ``sigma`` is not positive or ``alpha`` is outside (0, 1).
    """

    def __init__(self, base_classifier, sigma=0.5, num_samples=1000,
                 alpha=0.001, device=None):
        if sigma <= 0:
            raise ValueError("sigma must be positive")
        if not 0 < alpha < 1:
            raise ValueError("alpha must lie in (0, 1)")
        self.base_classifier = base_classifier
        self.sigma = sigma
        self.num_samples = num_samples
        self.alpha = alpha  # significance level of the certificate
        self.device = device

    def _prepare(self, x):
        """Move ``x`` to the configured device, if one was configured."""
        return x if self.device is None else x.to(self.device)

    def _sample_counts(self, x, num):
        """Classify ``num`` noisy copies of ``x`` and count the predicted classes."""
        if num < 1:
            raise ValueError("at least one noise sample is required")
        counts = Counter()
        # Monte-Carlo voting needs no gradients; skipping the autograd graph
        # keeps memory flat over thousands of forward passes.
        with torch.no_grad():
            for _ in range(num):
                noise = torch.randn_like(x) * self.sigma
                counts[self.base_classifier(x + noise).argmax().item()] += 1
        return counts

    def predict(self, x):
        """Return the majority-vote class over ``num_samples`` noisy copies of ``x``."""
        votes = self._sample_counts(self._prepare(x), self.num_samples)
        return votes.most_common(1)[0][0]

    def certify(self, x, n0=100, n=1000, alpha=None):
        """Certify the smoothed prediction at ``x``.

        Args:
            x: A single input example.
            n0: Number of samples used to select the candidate class ``c_A``.
            n: Number of fresh samples used to lower-bound ``p_A``.
            alpha: Failure probability of the certificate; defaults to
                ``self.alpha`` when ``None``.

        Returns:
            Tuple ``(c_A, radius, n_A)``: the candidate class, the certified
            L2 radius (``0.0`` when nothing can be certified) and the number
            of the ``n`` estimation samples classified as ``c_A``.
        """
        if alpha is None:
            alpha = self.alpha
        x = self._prepare(x)

        # Stage 1: pick the candidate class from an independent sample so the
        # selection does not bias the estimate made in stage 2.
        c_A = self._sample_counts(x, n0).most_common(1)[0][0]

        # Stage 2: one-sided Clopper-Pearson lower bound on p_A, valid with
        # probability >= 1 - alpha. beta.ppf is undefined (NaN) for a
        # zero first shape parameter, so n_A == 0 is handled explicitly.
        n_A = self._sample_counts(x, n)[c_A]
        p_A_lower = beta.ppf(alpha, n_A, n - n_A + 1) if n_A > 0 else 0.0

        if p_A_lower < 0.5:
            return c_A, 0.0, n_A

        # With the bound p_B <= 1 - p_A the radius
        #   R = sigma / 2 * (Phi^-1(p_A) - Phi^-1(p_B))
        # simplifies to sigma * Phi^-1(p_A). Only the single bound on p_A is
        # needed, so no union bound over runner-up classes is required.
        radius = self.sigma * norm.ppf(p_A_lower)
        return c_A, max(0.0, float(radius)), n_A

    def certify_batch(self, dataloader):
        """Certify every example yielded by ``dataloader``.

        Returns:
            A list with one dict per example holding the keys ``'true'``,
            ``'pred'``, ``'radius'`` and ``'n_A'``.
        """
        results = []
        for x, y in dataloader:
            for i in range(x.size(0)):
                # x[i:i+1] keeps the leading batch dimension of size one.
                pred, radius, n_A = self.certify(x[i:i + 1])
                results.append({
                    'true': y[i].item(),
                    'pred': pred,
                    'radius': radius,
                    'n_A': n_A,
                })
        return results


# IBP(Interval Bound Propagation)
IBP 通过逐层传播区间来计算神经网络输出的可靠(sound)上下界;这些边界通常比较宽松,但计算开销很低。[2]
前向传播
class IntervalBounds:
    """Element-wise interval ``[lower, upper]`` used for bound propagation.

    The constructor stores both bounds and derives ``center`` (midpoint) and
    ``radius`` (half-width) from them.
    """

    def __init__(self, lower, upper):
        self.lower = lower
        self.upper = upper
        self.center = (lower + upper) / 2
        self.radius = (upper - lower) / 2

    @classmethod
    def from_epsilon(cls, x, epsilon, clip_min=0.0, clip_max=1.0):
        """Build the interval ``[x - epsilon, x + epsilon]``, clipped to a valid range.

        Args:
            x: Input tensor.
            epsilon: Non-negative perturbation half-width (scalar or tensor
                broadcastable against ``x``).
            clip_min: Lower end of the valid input range, or ``None`` for no
                lower clipping. Defaults to ``0.0``.
            clip_max: Upper end of the valid input range, or ``None`` for no
                upper clipping. Defaults to ``1.0``.

        Returns:
            A new interval of the class this method was called on.

        Raises:
            ValueError: If ``epsilon`` has a negative entry, which would make
                the lower bound exceed the upper bound.
        """
        if bool(torch.any(torch.as_tensor(epsilon) < 0)):
            raise ValueError("epsilon must be non-negative")
        lower = x - epsilon
        upper = x + epsilon
        # torch.clamp rejects min=None together with max=None, so clamping is
        # skipped entirely when the caller disables both ends.
        if clip_min is not None or clip_max is not None:
            lower = torch.clamp(lower, min=clip_min, max=clip_max)
            upper = torch.clamp(upper, min=clip_min, max=clip_max)
        return cls(lower, upper)
class IBPNetwork:
    """Interval bound propagation (IBP) through affine layers.

    The methods use an instance in one of two roles:

    * as a single affine layer holding ``weight`` of shape ``(out, in)`` and
      an optional ``bias`` -- this is what :meth:`forward_interval` reads;
    * as a network holding ``layers``, each exposing ``forward_interval`` --
      this is what :meth:`certify` iterates over.

    Args:
        weight: Weight matrix of the affine layer, or ``None``.
        bias: Bias vector of the affine layer, or ``None``.
        layers: Sequence of layer objects, or ``None`` for an empty network.
    """

    def __init__(self, weight=None, bias=None, layers=None):
        self.weight = weight
        self.bias = bias
        self.layers = layers if layers is not None else []

    def forward_interval(self, x_interval):
        """Propagate an interval through the affine map ``x -> x @ W^T + b``.

        Inputs are batch-first (``(batch, in)``, or a plain ``(in,)`` vector),
        matching how :meth:`certify` indexes the output.

        Args:
            x_interval: Object exposing ``center`` and ``radius`` tensors.

        Returns:
            An interval of the same type as ``x_interval`` bounding the output.
        """
        # The centre moves through the affine map itself; the radius can only
        # grow through |W|, and the bias shifts both bounds equally.
        center = F.linear(x_interval.center, self.weight, self.bias)
        radius = F.linear(x_interval.radius, self.weight.abs())
        return type(x_interval)(center - radius, center + radius)

    @staticmethod
    def certified_margins(lower, upper):
        """Return per-example certification margins from output bounds.

        For each row the candidate class is the argmax of ``lower``; the
        margin is that class's lower bound minus the largest upper bound among
        the *other* classes. A positive margin certifies the candidate class.
        With a single output class there are no competitors and the margin
        is ``+inf``.

        Args:
            lower: Tensor ``(batch, classes)`` of output lower bounds.
            upper: Tensor ``(batch, classes)`` of output upper bounds.

        Returns:
            Tensor ``(batch,)`` of margins.
        """
        pred = lower.argmax(dim=1, keepdim=True)
        # Mask out the candidate class itself: its own upper bound is never
        # below its lower bound, so including it would force margin <= 0.
        others = upper.scatter(1, pred, float("-inf"))
        return lower.gather(1, pred).squeeze(1) - others.max(dim=1).values

    def certify(self, x, epsilon):
        """Certify a batch of inputs against perturbations of size ``epsilon``.

        Args:
            x: Batch of inputs, batch dimension first.
            epsilon: Perturbation half-width passed to
                ``IntervalBounds.from_epsilon``.

        Returns:
            Tuple ``(pred, certified)``: ``pred`` is the per-example class
            with the largest output lower bound, and ``certified`` is a list
            of float margins (positive means certified).
        """
        x_interval = IntervalBounds.from_epsilon(x, epsilon)
        for layer in self.layers:
            x_interval = layer.forward_interval(x_interval)

        lower, upper = x_interval.lower, x_interval.upper
        # Only the class with the largest lower bound can dominate every other
        # class's upper bound, so it is the sole certifiable candidate.
        pred = lower.argmax(dim=1)
        certified = self.certified_margins(lower, upper).tolist()
        return pred, certified


# CROWN-IBP 组合
def crown_ibp_training(model, x, y, epsilon, num_iterations=10, optimizer=None):
    """Run CROWN-IBP style certified training on one batch.

    The first half of the iterations uses IBP bounds (loose but stable), the
    second half uses CROWN bounds (tight but less stable).

    Args:
        model: Network being trained.
        x: Input batch.
        y: Labels for ``x``.
        epsilon: Perturbation radius handed to the bound functions.
        num_iterations: Number of optimisation steps.
        optimizer: Optimizer that updates ``model``. When ``None``, an SGD
            optimizer (lr=0.1, momentum=0.9) over ``model.parameters()`` is
            created for the duration of the call.

    Returns:
        List with the scalar loss of every iteration.
    """
    if optimizer is None:
        optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)

    losses = []
    for i in range(num_iterations):
        # Fraction of the schedule completed so far; selects the bound type.
        progress = i / num_iterations
        bound_fn = ibp_bounds if progress < 0.5 else crown_bounds

        # Clear gradients left by the previous step; otherwise backward()
        # keeps accumulating into them across iterations.
        optimizer.zero_grad()
        bounds = bound_fn(model, x, epsilon)
        loss = robust_loss(bounds, y)
        loss.backward()
        optimizer.step()
        losses.append(loss.item())
    return losses


# 概率鲁棒性
CVaR 认证
CVaR(Conditional Value at Risk,条件风险价值)度量随机扰动下损失分布中最差 $\alpha$ 尾部的平均值,从而提供概率意义上的鲁棒性保证:[3]
def cvar_certification(model, x, y, epsilon, alpha=0.05, num_samples=10000,
                       device=None):
    """Estimate the CVaR of the loss under random Gaussian perturbations.

    Draws ``num_samples`` perturbations ``N(0, epsilon^2 I)`` of ``x``,
    evaluates the cross-entropy loss of each perturbed input and returns the
    mean of the worst (largest) ``alpha`` fraction of those losses. The
    cross-entropy is used for every sample, including misclassified ones, so
    that all losses live on one scale.

    Args:
        model: Callable returning logits for an input tensor.
        x: Input example; it is passed to ``model`` one noisy copy at a time.
        y: Target accepted by ``F.cross_entropy`` for ``model``'s output.
        epsilon: Standard deviation of the Gaussian perturbation.
        alpha: Tail fraction in ``(0, 1]``.
        num_samples: Number of Monte-Carlo samples (>= 1).
        device: Optional device ``x`` and ``y`` are moved to first; ``None``
            leaves them where they are.

    Returns:
        The empirical CVaR as a Python float.

    Raises:
        ValueError: If ``alpha`` is outside ``(0, 1]`` or ``num_samples < 1``.
    """
    if not 0 < alpha <= 1:
        raise ValueError("alpha must lie in (0, 1]")
    if num_samples < 1:
        raise ValueError("num_samples must be at least 1")
    if device is not None:
        x, y = x.to(device), y.to(device)

    losses = np.empty(num_samples)
    with torch.no_grad():
        for k in range(num_samples):
            x_adv = x + torch.randn_like(x) * epsilon
            # Single forward pass per sample.
            losses[k] = F.cross_entropy(model(x_adv), y).item()

    # CVaR is the mean of the *upper* tail. At least one sample is kept so a
    # small alpha * num_samples never averages an empty slice.
    losses.sort()
    tail = max(1, int(alpha * num_samples))
    return float(losses[-tail:].mean())


# UCAN:各向异性随机平滑
UCAN 改进了经典随机平滑,使用各向异性噪声分布:[4]
class UCANSmoothing:
    """Randomized smoothing with anisotropic Gaussian noise ``N(0, cov_matrix)``.

    Args:
        model: Callable returning class scores for an input tensor.
        cov_matrix: Square, positive-definite covariance matrix whose side
            equals the number of elements of one input.
        num_samples: Number of noise draws per :meth:`certify` call.
    """

    def __init__(self, model, cov_matrix, num_samples=1000):
        self.model = model
        self.cov_matrix = cov_matrix  # noise covariance
        self.num_samples = num_samples
        # The mean is created with the covariance's dtype and device so the
        # distribution can be built for non-default dtypes or devices.
        self.dist = torch.distributions.MultivariateNormal(
            torch.zeros(cov_matrix.shape[0], dtype=cov_matrix.dtype,
                        device=cov_matrix.device),
            cov_matrix,
        )

    def certify(self, x, delta=0.1):
        """Vote over anisotropic noise and report Mahalanobis noise norms.

        Args:
            x: A single input whose element count equals the side length of
                ``cov_matrix``.
            delta: Currently unused; kept for interface compatibility.

        Returns:
            Tuple ``(c_A, p_A, mahalanobis)``: the majority class, its
            empirical vote share, and a ``(num_samples,)`` tensor with the
            Mahalanobis norm ``sqrt(n^T cov^-1 n)`` of every noise draw.

        NOTE(review): ``p_A`` is a plain frequency, not a confidence bound,
        so no statistically valid radius is derived here. Presumably the
        intended certificate is a Mahalanobis radius ``Phi^-1(p_A lower
        bound)`` -- confirm against the UCAN paper before relying on it.
        """
        from collections import Counter

        samples = self.dist.sample((self.num_samples,))

        counts = Counter()
        with torch.no_grad():
            for flat_noise in samples:
                noise = flat_noise.view_as(x).to(device=x.device, dtype=x.dtype)
                counts[self.model(x + noise).argmax().item()] += 1

        c_A = max(counts, key=counts.get)
        p_A = counts[c_A] / self.num_samples

        # Per-sample quadratic form n^T cov^-1 n. Solving the linear system
        # avoids an explicit inverse, and only the per-sample (diagonal)
        # terms are formed; cross terms between different samples can be
        # negative and would turn into NaN under the square root.
        solved = torch.linalg.solve(self.cov_matrix, samples.T)
        mahalanobis = (samples * solved.T).sum(dim=1).clamp_min(0).sqrt()

        return c_A, p_A, mahalanobis


# CTBENCH:认证训练基准
CTBENCH 提供了认证训练的系统评估框架:[5]
class CTBENCH:
    """Benchmark helper that measures certified accuracy of a model.

    Args:
        model: Object exposing ``ibp_certify``, ``crown_certify`` and/or
            ``smoothing_certify``; each is called as ``fn(x, eps)`` and must
            return ``(predictions, certified_margins)``.
        device: Optional device every batch is moved to; ``None`` leaves the
            batches where the loader put them.
    """

    # Maps the public method name to the model attribute implementing it.
    _CERTIFIERS = {
        'ibp': 'ibp_certify',
        'crown': 'crown_certify',
        'smoothing': 'smoothing_certify',
    }

    def __init__(self, model, device=None):
        self.model = model
        self.device = device

    def evaluate_certified_accuracy(self, test_loader, epsilons, method='ibp'):
        """Compute certified accuracy for several perturbation radii.

        An example counts as certified-correct only when its certification
        margin is positive *and* the certified prediction equals the label.

        Args:
            test_loader: Re-iterable of ``(x, y)`` batches (it is traversed
                once per epsilon).
            epsilons: Iterable of perturbation radii.
            method: One of ``'ibp'``, ``'crown'`` or ``'smoothing'``.

        Returns:
            Dict mapping each epsilon to its certified accuracy in ``[0, 1]``;
            ``0.0`` when the loader yields no examples.

        Raises:
            ValueError: If ``method`` is not a known certification method.
        """
        if method not in self._CERTIFIERS:
            raise ValueError(
                f"unknown certification method {method!r}; "
                f"expected one of {sorted(self._CERTIFIERS)}"
            )
        certify = getattr(self.model, self._CERTIFIERS[method])

        results = {}
        for eps in epsilons:
            correct = 0
            total = 0
            for x, y in test_loader:
                if self.device is not None:
                    x, y = x.to(self.device), y.to(self.device)
                preds, certified = certify(x, eps)
                for pred, cert, label in zip(preds, certified, y):
                    # A robust but wrong prediction must not count as correct.
                    if cert > 0 and int(pred) == int(label):
                        correct += 1
                    total += 1
            results[eps] = correct / total if total else 0.0
        return results

    def generate_report(self, results):
        """Format a ``{epsilon: accuracy}`` mapping as a plain-text report."""
        lines = ["Certifiable Accuracy Results", "=" * 50]
        lines.extend(f"ε = {eps:.4f}: {acc:.2%}" for eps, acc in results.items())
        return "\n".join(lines) + "\n"


# 认证方法对比
| 方法 | 认证类型 | 紧致度 | 计算开销 | 可扩展性 |
|---|---|---|---|---|
| 随机平滑 | 概率 | 中等 | 中 | 高 |
| IBP | 确定 | 松弛 | 低 | 高 |
| CROWN | 确定 | 紧致 | 高 | 中 |
| CROWN-IBP | 确定 | 紧致 | 中 | 中 |
| SDP | 确定 | 最紧致 | 很高 | 低 |
| MILP | 确定 | 最紧致 | 极高 | 很低 |
认证训练实践
def certified_adversarial_training(model, dataloader, epsilon=8/255,
                                   method='ibp', epochs=200,
                                   test_loader=None, device=None):
    """Train ``model`` with a certified (IBP) or adversarial (PGD) loss.

    Args:
        model: Network to train; optimised with SGD (lr=0.1, momentum=0.9).
        dataloader: Iterable of ``(x, y)`` training batches.
        epsilon: Perturbation radius.
        method: ``'ibp'`` trains on the robust loss of IBP bounds; any other
            value trains on cross-entropy over PGD adversarial examples.
        epochs: Number of passes over ``dataloader``.
        test_loader: Optional evaluation data. When given, certified accuracy
            is evaluated and printed every 10 epochs; when ``None`` only the
            loss is printed.
        device: Optional device every batch is moved to; ``None`` leaves the
            batches where the loader put them.

    Returns:
        List with the mean training loss of every epoch.
    """
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
    history = []

    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        num_batches = 0

        for x, y in dataloader:
            if device is not None:
                x, y = x.to(device), y.to(device)

            if method == 'ibp':
                optimizer.zero_grad()
                bounds = ibp_bounds(model, x, epsilon)
                loss = robust_loss(bounds, y)
            else:
                # Adversarial examples are needed only on this branch, and
                # gradients are cleared *after* the attack so anything it
                # accumulated on the parameters cannot leak into the update.
                x_adv = pgd_attack(model, x, y, epsilon)
                optimizer.zero_grad()
                loss = F.cross_entropy(model(x_adv), y)

            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            num_batches += 1

        # Guard against an empty dataloader when averaging.
        mean_loss = total_loss / max(num_batches, 1)
        history.append(mean_loss)

        # Periodic progress report.
        if epoch % 10 == 0:
            if test_loader is not None:
                cert_acc = evaluate_certified_accuracy(model, test_loader,
                                                       epsilon, method)
                print(f"Epoch {epoch}: Loss={mean_loss:.4f}, "
                      f"Certified Acc={cert_acc:.2%}")
            else:
                print(f"Epoch {epoch}: Loss={mean_loss:.4f}")

    return history


# 相关主题
- adversarial-robustness-fundamentals — 对抗鲁棒性基础
- adversarial-training-methods — 对抗训练方法
- adversarial-attack-methods — 对抗攻击方法
- bayesian-neural-networks-uncertainty — 贝叶斯神经网络的认证
参考文献
Footnotes
1. Cohen, J., et al. (2019). Certified Adversarial Robustness via Randomized Smoothing. ICML 2019. https://arxiv.org/abs/1902.02918
2. Gowal, S., et al. (2018). On the Effectiveness of Interval Bound Propagation for Training Verifiably Robust Models. NeurIPS 2018 Workshop. https://arxiv.org/abs/1810.12715
3. Feng, Y., et al. (2025). CVaR-based Adversarial Training. arXiv:2502.14833. https://arxiv.org/abs/2502.14833
4. Liu, J., et al. (2025). UCAN: Anisotropic Noise Randomized Smoothing for Certified Robustness. arXiv:2510.19977. https://arxiv.org/abs/2510.19977
5. Mao, C., et al. (2025). CTBENCH: A Benchmark for Certifiable Training. arXiv:2406.04848. https://arxiv.org/abs/2406.04848