简介
信息瓶颈(Information Bottleneck, IB)理论是理解深度学习泛化和信息处理的核心框架。在持续学习场景中,IB理论提供了一个优雅的视角来理解任务间的知识压缩与遗忘的权衡。本文档系统建立IB框架下的持续学习理论,揭示任务特异性表示的信息论本质,分析压缩-遗忘权衡的最优策略,并与现有的持续学习方法建立联系。123
1. 信息瓶颈理论回顾
1.1 标准IB框架
信息瓶颈问题1:给定输入 和目标 ,寻找一个压缩表示 ,使得:
等价于优化:
其中 是压缩-预测权衡参数。
1.2 深度学习中的IB
变分信息瓶颈(VIB):使用变分近似优化IB目标:
深度IB:多层IB允许逐层压缩:
import torch
import torch.nn as nn
import torch.nn.functional as F
class VariationalInformationBottleneck(nn.Module):
"""
变分信息瓶颈层
实现: I(Z; X) - β * I(Z; Y) 的变分下界
"""
def __init__(self, input_dim, latent_dim, beta=1e-3, posterior_type='gaussian'):
"""
Args:
input_dim: 输入维度
latent_dim: 潜变量维度
beta: 压缩-预测权衡参数
posterior_type: 后验分布类型 ('gaussian' 或 'bernoulli')
"""
super().__init__()
self.beta = beta
self.posterior_type = posterior_type
# 编码器:输出均值和对数方差
self.encoder = nn.Sequential(
nn.Linear(input_dim, 256),
nn.ReLU(),
nn.Linear(256, 128),
nn.ReLU()
)
self.fc_mean = nn.Linear(128, latent_dim)
self.fc_logvar = nn.Linear(128, latent_dim)
# 解码器
self.decoder = nn.Sequential(
nn.Linear(latent_dim, 128),
nn.ReLU(),
nn.Linear(128, input_dim)
)
def encode(self, x):
"""编码:返回后验分布参数"""
h = self.encoder(x)
mean = self.fc_mean(h)
logvar = self.fc_logvar(h) # 对数方差
return mean, logvar
def reparameterize(self, mean, logvar):
"""重参数化技巧"""
std = torch.exp(0.5 * logvar)
eps = torch.randn_like(std)
return mean + eps * std
def forward(self, x):
"""
前向传播
Returns:
z: 潜变量样本
mean: 均值
logvar: 对数方差
kl: KL散度(压缩项)
recon: 重构
"""
mean, logvar = self.encode(x)
z = self.reparameterize(mean, logvar)
# KL散度项(近似 I(Z; X))
kl = -0.5 * torch.sum(1 + logvar - mean.pow(2) - logvar.exp(), dim=-1)
# 重构(用于计算预测损失)
recon = self.decoder(z)
return z, mean, logvar, kl, recon
def vlb(self, x, y=None, recon=None, kl=None):
"""
计算变分下界
VLB = E[log p(y|z)] - β * KL(q(z|x) || r(z))
"""
if y is not None:
# 如果提供了标签,计算重构损失
if recon is None:
raise ValueError("recon must be provided when y is given")
# 预测损失(负对数似然)
recon_loss = F.mse_loss(recon, x, reduction='none').sum(dim=-1)
# VLB
vlb = -recon_loss - self.beta * kl
else:
# 仅计算压缩项
vlb = -self.beta * kl
return vlb.mean(), recon_loss.mean() if y is not None else None, kl.mean()2. 持续学习的信息论设置
2.1 多任务信息流
设置:考虑 个顺序任务 ,每个任务 定义为:
- 输入分布:
- 标签分布:
- 联合分布:
信息流图:
任务t
↓
┌─────────────┐
│ X_t │ ← 输入
└─────────────┘
↓ I(X_t; Z)
┌─────────────┐
│ Z │ ← 表示
└─────────────┘
↓ I(Z; Y_t)
┌─────────────┐
│ Y_t │ ← 标签
└─────────────┘
2.2 任务重叠与分离
定义1(任务重叠度):
- :任务 和 完全分离
- :任务 完全包含在 中
定义2(共享信息量):
2.3 顺序学习的信息约束
约束:在学习任务 时,只能访问:
- 当前任务数据:
- 历史信息:(通过表示 保留的旧任务信息)
class TaskInformationAnalyzer:
"""
任务信息分析器
"""
def __init__(self, model, device='cuda'):
self.model = model
self.device = device
self.task_information = {}
def estimate_mutual_information(self, task_loader, layer_name='features'):
"""
估计互信息 I(X; Z)
使用基于KL散度的估计
"""
self.model.eval()
# 收集表示
representations = []
inputs = []
with torch.no_grad():
for x, _ in task_loader:
x = x.to(self.device)
z = self.model.get_features(x, layer_name)
representations.append(z.cpu())
inputs.append(x.cpu())
representations = torch.cat(representations, dim=0)
inputs = torch.cat(inputs, dim=0).view(len(inputs), -1)
# 简化的互信息估计
# I(X; Z) ≈ H(Z) - H(Z|X)
# 使用高斯近似
z_mean = representations.mean(dim=0)
z_cov = torch.cov(representations.T)
# 熵估计
d = z_mean.shape[0]
h_z = 0.5 * d * np.log(2 * np.pi * np.e) + 0.5 * np.log(torch.det(z_cov + 1e-8).item())
# 简化:假设 H(Z|X) ≈ 0(确定性编码)
mi_xz = h_z
return mi_xz
def estimate_task_overlap(self, task1_loader, task2_loader, layer_name='features'):
"""
估计两个任务的重叠度
ω_ij = I(X_i; Y_j) / I(X_j; Y_j)
"""
# 简化的重叠度估计:使用表示相似性
self.model.eval()
representations1 = []
labels1 = []
with torch.no_grad():
for x, y in task1_loader:
x = x.to(self.device)
z = self.model.get_features(x, layer_name)
representations1.append(z.cpu())
labels1.append(y)
representations1 = torch.cat(representations1, dim=0)
labels1 = torch.cat(labels1, dim=0)
# 同样的过程对task2
representations2 = []
labels2 = []
with torch.no_grad():
for x, y in task2_loader:
x = x.to(self.device)
z = self.model.get_features(x, layer_name)
representations2.append(z.cpu())
labels2.append(y)
representations2 = torch.cat(representations2, dim=0)
labels2 = torch.cat(labels2, dim=0)
# 计算类别条件均值之间的重叠
overlap_scores = []
for c in torch.unique(labels1):
mask1 = labels1 == c
mask2 = labels2 == c
if mask1.sum() > 0 and mask2.sum() > 0:
mean1 = representations1[mask1].mean(dim=0)
mean2 = representations2[mask2].mean(dim=0)
# 重叠度 = 1 - 距离
distance = torch.norm(mean1 - mean2).item()
overlap = np.exp(-distance)
overlap_scores.append(overlap)
return np.mean(overlap_scores) if overlap_scores else 03. 任务特异性表示的信息论刻画
3.1 任务特异性信息的定义
定义3(任务 的特异性信息):
这度量了表示 中专门为任务 编码的信息量,超出之前任务的部分。
3.2 任务特异性与泛化
定理1(特异性-泛化关系):设 是给定之前表示后任务 的剩余信息。则:
含义:
- :存在正向迁移
- :需要为任务 学习的新信息
3.3 表示分离定理
定理2(信息分离)2:如果存在完美压缩 ,则:
含义:当压缩足够高效时,任务信息可以完全分离。
class TaskSpecificInformationEstimator:
"""
任务特异性信息估计器
"""
def __init__(self, model, device='cuda'):
self.model = model
self.device = device
self.representations_history = {}
def estimate_task_specific_info(self, task_id, task_loader,
prev_task_loaders=None):
"""
估计任务特异性信息
I_t^spec = I(Z; Y_t) - max_{s<t} I(Z; Y_s)
"""
# 获取当前任务的表示和标签
self.model.eval()
z_t, y_t = self._get_representations(task_loader)
# 估计 I(Z; Y_t)
i_zt_yt = self._estimate_mi(z_t, y_t)
# 如果有之前任务,减去最大的I(Z; Y_s)
max_prev_mi = 0
if prev_task_loaders:
for s, loader in enumerate(prev_task_loaders):
if s >= task_id:
continue
z_s, y_s = self._get_representations(loader)
mi_zs_ys = self._estimate_mi(z_s, y_s)
max_prev_mi = max(max_prev_mi, mi_zs_ys)
# 特异性信息
i_specific = i_zt_yt - max_prev_mi
return i_specific, i_zt_yt, max_prev_mi
def estimate_forward_transfer(self, source_task_id, target_task_loader,
source_task_loader):
"""
估计正向迁移 I(Z_{<t}; Y_t)
"""
# 获取源任务表示
z_source, y_source = self._get_representations(source_task_loader)
# 获取目标任务的表示
z_target, y_target = self._get_representations(target_task_loader)
# 估计迁移
# 简化:使用表示相关性
corr = torch.corrcoef(
torch.cat([z_source, z_target], dim=0)
)
# 实际应使用更复杂的信息论估计
return corr.mean().item()
def _get_representations(self, loader):
"""获取数据集的表示"""
representations = []
labels = []
with torch.no_grad():
for x, y in loader:
x = x.to(self.device)
z = self.model.get_features(x)
representations.append(z.cpu())
labels.append(y)
return torch.cat(representations, dim=0), torch.cat(labels, dim=0)
def _estimate_mi(self, z, y):
"""
估计互信息 I(Z; Y)
使用类别条件熵估计
"""
n_classes = len(torch.unique(y))
d_z = z.shape[1]
# 估计 p(y)
class_counts = torch.bincount(y)
p_y = class_counts.float() / len(y)
h_y = -torch.sum(p_y * torch.log(p_y + 1e-8))
# 估计 p(z|y)
h_z_given_y = 0
for c in torch.unique(y):
mask = y == c
if mask.sum() > 0:
z_c = z[mask]
# 熵估计
z_cov = torch.cov(z_c.T)
d = z_c.shape[1]
h_z_given_y += p_y[c] * (
0.5 * d * np.log(2 * np.pi * np.e) +
0.5 * torch.det(z_cov + 1e-8).log()
)
# I(Z; Y) = H(Z) - H(Z|Y) ≈ H(Y) - H(Z|Y)(简化)
mi = (h_y - h_z_given_y).item()
return max(0, mi)4. 压缩-遗忘权衡
4.1 压缩的必要性与代价
问题:为什么压缩会导致遗忘?
定理3(压缩-遗忘关系)3:设 是 的压缩表示, 是学习新任务后 的更新。则:
4.2 最优压缩-遗忘权衡
定理4(IB持续学习目标):持续学习的IB目标为:
其中:
- :压缩强度
- :记忆强度
- :旧任务的表示
4.3 权衡曲线
定义4(Pareto最优权衡):在压缩-遗忘空间中,满足:
定理5(Pareto前沿):Pareto前沿由以下方程描述:
其中 是与任务相关的常数。
class CompressionForgettingAnalyzer:
"""
压缩-遗忘权衡分析器
"""
def __init__(self, model, device='cuda'):
self.model = model
self.device = device
self.task_history = []
def compute_tradeoff_curve(self, task_sequence, beta_values):
"""
计算不同压缩强度下的压缩-遗忘权衡曲线
"""
results = []
for beta in beta_values:
# 训练模型(使用IB目标)
self._train_with_ib(task_sequence, beta)
# 计算压缩量
compression = self._estimate_compression()
# 计算遗忘量
forgetting = self._estimate_forgetting()
results.append({
'beta': beta,
'compression': compression,
'forgetting': forgetting
})
return results
def _train_with_ib(self, task_sequence, beta):
"""
使用IB目标训练
"""
for task_id, (train_loader, _) in enumerate(task_sequence):
optimizer = torch.optim.Adam(self.model.parameters(), lr=1e-3)
for x, y in train_loader:
x, y = x.to(self.device), y.to(self.device)
optimizer.zero_grad()
# 前向传播
z, mean, logvar, kl, _ = self.model(x)
# IB损失
# I(Z; Y) - β * I(Z; X) ≈ -MSE - β * KL
recon = self.model.decoder(z)
pred_loss = F.mse_loss(recon, x)
# IB损失
ib_loss = -pred_loss + beta * kl.mean()
ib_loss.backward()
optimizer.step()
def _estimate_compression(self):
"""
估计压缩量
Compression ≈ KL(q(z|x) || r(z)) 的平均值
"""
# 简化为使用KL散度的平均值
return np.random.random() # 占位
def _estimate_forgetting(self):
"""
估计遗忘量
Forgetting = (1/(t-1)) * Σ [L_i(initial) - L_i(final)]
"""
# 简化为使用性能下降的平均值
return np.random.random() # 占位
def fit_pareto_frontier(self, tradeoff_data):
"""
拟合Pareto前沿
Forgetting = exp(-Compression/α) * C_0
"""
from scipy.optimize import curve_fit
def pareto_func(x, alpha, c0):
return c0 * np.exp(-x / alpha)
compressions = [d['compression'] for d in tradeoff_data]
forgetting = [d['forgetting'] for d in tradeoff_data]
params, _ = curve_fit(pareto_func, compressions, forgetting)
alpha, c0 = params
return alpha, c0, pareto_func5. 任务感知的信息瓶颈
5.1 条件信息瓶颈
定义5(任务条件IB):
其中 是任务权重, 是任务 的表示。
5.2 任务相关与任务无关信息
定理6(信息分解):
且:
5.3 自适应压缩
定理7(自适应压缩策略):最优压缩强度应与任务重叠度负相关:
其中 是与之前任务的平均重叠度。
class TaskAwareInformationBottleneck(nn.Module):
"""
任务感知的信息瓶颈
为每个任务学习自适应的压缩强度
"""
def __init__(self, input_dim, latent_dim, n_tasks, hidden_dim=128):
super().__init__()
self.n_tasks = n_tasks
self.latent_dim = latent_dim
# 每个任务的自适应β
self.task_betas = nn.Parameter(torch.ones(n_tasks) * 1e-3)
# 编码器
self.encoder = nn.Sequential(
nn.Linear(input_dim, hidden_dim),
nn.ReLU(),
nn.Linear(hidden_dim, hidden_dim),
nn.ReLU()
)
self.fc_mean = nn.ModuleList([
nn.Linear(hidden_dim, latent_dim) for _ in range(n_tasks)
])
self.fc_logvar = nn.ModuleList([
nn.Linear(hidden_dim, latent_dim) for _ in range(n_tasks)
])
# 解码器
self.decoder = nn.Sequential(
nn.Linear(latent_dim, hidden_dim),
nn.ReLU(),
nn.Linear(hidden_dim, input_dim)
)
# 任务判别器(用于分解任务相关/共享信息)
self.task_discriminator = nn.Sequential(
nn.Linear(latent_dim, 64),
nn.ReLU(),
nn.Linear(64, n_tasks)
)
def forward(self, x, task_id):
"""
前向传播
Args:
x: 输入
task_id: 当前任务ID
"""
h = self.encoder(x)
# 获取当前任务的分布参数
mean = self.fc_mean[task_id](h)
logvar = self.fc_logvar[task_id](h)
# 重参数化
std = torch.exp(0.5 * logvar)
eps = torch.randn_like(std)
z = mean + eps * std
# 压缩项
kl = -0.5 * torch.sum(1 + logvar - mean.pow(2) - logvar.exp(), dim=-1)
# 重构
recon = self.decoder(z)
# 任务相关/共享分解
task_logits = self.task_discriminator(z)
return z, recon, kl, task_logits, self.task_betas[task_id]
def compute_ib_loss(self, x, y, task_id, recon_target=None):
"""
计算IB损失
L = -I(Z; Y) + β * I(Z; X)
"""
z, recon, kl, task_logits, beta = self.forward(x, task_id)
# 预测损失(近似 -I(Z; Y))
if recon_target is not None:
pred_loss = F.mse_loss(recon, recon_target, reduction='none').sum(dim=-1)
else:
# 使用交叉熵作为预测损失
pred_loss = F.cross_entropy(
self.task_discriminator(z),
torch.full((x.size(0),), task_id, dtype=torch.long, device=x.device)
)
# IB损失
ib_loss = pred_loss + beta * kl
return ib_loss.mean(), z, kl.mean(), beta
def adapt_beta(self, task_id, overlap_score):
"""
根据任务重叠度自适应调整β
β_t* ∝ 1/ω_t,avg
"""
with torch.no_grad():
# 简化的自适应规则
new_beta = 1.0 / (overlap_score + 1e-8)
self.task_betas[task_id] = new_beta.clamp(1e-5, 1e-1)
return self.task_betas[task_id]6. 深度持续信息瓶颈
6.1 多层IB目标
定理8(多层IB分解):设网络有 层,表示为 。则:
其中 是层权重, 是压缩强度。
6.2 表示压缩路径
定理9(压缩路径唯一性):对于给定的压缩-预测权衡,最优压缩路径是唯一的,满足:
6.3 任务切换的动态IB
定理10(动态IB方程):当任务从 切换到 时,表示 的演化满足:
class DeepContinualIB(nn.Module):
"""
深度持续信息瓶颈
为每层学习任务自适应的压缩
"""
def __init__(self, input_dim, hidden_dims, latent_dim, n_tasks):
super().__init__()
self.n_tasks = n_tasks
self.L = len(hidden_dims) + 1 # 层数
# 每层的任务感知压缩强度
self.layer_betas = nn.Parameter(
torch.ones(self.L, n_tasks) * 1e-3
)
# 每层的权重
self.layer_alphas = nn.Parameter(torch.ones(self.L))
# 构建网络
layers = []
prev_dim = input_dim
for hidden_dim in hidden_dims:
layers.extend([
nn.Linear(prev_dim, hidden_dim),
nn.ReLU()
])
prev_dim = hidden_dim
self.encoder = nn.Sequential(*layers)
# 每层的潜变量
self.fc_means = nn.ModuleList([
nn.Linear(prev_dim if i == 0 else hidden_dims[i-1], latent_dim)
for i in range(self.L)
])
self.fc_logvars = nn.ModuleList([
nn.Linear(prev_dim if i == 0 else hidden_dims[i-1], latent_dim)
for i in range(self.L)
])
# 每层的解码器
self.decoders = nn.ModuleList([
nn.Sequential(
nn.Linear(latent_dim, hidden_dims[-1] if i == 0 else hidden_dims[i-1]),
nn.ReLU(),
nn.Linear(hidden_dims[-1] if i == 0 else hidden_dims[i-1], input_dim)
) if i == self.L - 1 else nn.Identity()
for i in range(self.L)
])
def forward(self, x, task_id):
"""
前向传播,返回每层的表示
"""
h = x
representations = []
kls = []
for l in range(self.L):
# 编码
if l == 0:
h_enc = self.encoder(x)
else:
h_enc = h # 使用上一层的输出
mean = self.fc_means[l](h_enc)
logvar = self.fc_logvars[l](h_enc)
# 重参数化
z = mean + torch.randn_like(mean) * torch.exp(0.5 * logvar)
representations.append(z)
# KL散度
kl = -0.5 * torch.sum(1 + logvar - mean.pow(2) - logvar.exp(), dim=-1)
kls.append(kl)
# 更新隐藏状态
h = z
return representations, kls
def compute_deep_ib_loss(self, x, task_id, prev_task_id=None, representations=None, kls=None):
"""
计算深度IB损失
L = Σ_l [α_l * I(Z_l; Y) - β_l * I(Z_l; X)]
"""
if representations is None or kls is None:
representations, kls = self.forward(x, task_id)
total_loss = 0
layer_losses = []
for l in range(self.L):
alpha = torch.softmax(self.layer_alphas, dim=0)[l]
beta = self.layer_betas[l, task_id]
# 预测损失(近似 -I(Z_l; Y))
# 使用重构损失或交叉熵
pred_loss = F.mse_loss(representations[l], x, reduction='none').sum(dim=-1)
# IB损失
layer_loss = alpha * pred_loss + beta * kls[l]
layer_losses.append(layer_loss.mean())
total_loss += layer_loss.mean()
# 如果有之前任务的表示,添加记忆保持损失
if prev_task_id is not None:
# 简化的记忆损失:保持之前任务的表示
memory_loss = 0
# 实际应存储并比较历史表示
total_loss = total_loss + 0.1 * memory_loss
return total_loss, layer_losses7. 与现有方法的联系
7.1 EWC的IB解释
定理11(EWC作为IB近似):EWC的正则化项对应于:
含义:EWC通过惩罚参数变化来近似保持旧任务的信息。
7.2 回放的IB解释
定理12(回放作为信息补充):回放通过向缓冲区添加样本来增加:
含义:缓冲区样本帮助恢复旧任务的信息。
7.3 蒸馏的IB解释
定理13(知识蒸馏作为互信息最大化):
含义:蒸馏损失最大化新旧表示之间的互信息,即保持旧知识。
class IBContinualLearning:
"""
基于信息瓶颈的持续学习方法
"""
def __init__(self, model, beta=1e-3, memory_strength=0.1):
self.model = model
self.beta = beta
self.memory_strength = memory_strength
# 存储历史任务的统计信息
self.task_statistics = {}
def compute_ib_loss(self, x, y, task_id):
"""
计算基于IB的持续学习损失
"""
# 获取当前任务的表示
z, recon, kl, pred_loss, beta_t = self.model.compute_ib_loss(x, y, task_id)
# IB损失
ib_loss = pred_loss + beta_t * kl
# 如果有历史任务,添加记忆保持损失
if task_id > 0 and task_id in self.task_statistics:
memory_loss = self.compute_memory_loss(z, task_id)
ib_loss = ib_loss + self.memory_strength * memory_loss
return ib_loss, z, kl.mean(), beta_t
def compute_memory_loss(self, z, task_id):
"""
计算记忆保持损失
L_memory = KL(p(z|task) || q(z|history))
"""
memory_loss = 0
for prev_task_id in range(task_id):
if prev_task_id in self.task_statistics:
prev_mean = self.task_statistics[prev_task_id]['mean']
prev_logvar = self.task_statistics[prev_task_id]['logvar']
# 简化的KL散度
current_mean = z.mean(dim=0)
current_logvar = torch.log(z.var(dim=0) + 1e-8)
# KL(N(mu1, sigma1) || N(mu2, sigma2))
kl = 0.5 * (
torch.exp(current_logvar - prev_logvar) +
(prev_mean - current_mean).pow(2) / (torch.exp(prev_logvar) + 1e-8) -
z.shape[1] +
prev_logvar - current_logvar
)
memory_loss = memory_loss + kl.sum()
return memory_loss
def update_task_statistics(self, z, task_id):
"""
更新任务统计信息
"""
with torch.no_grad():
self.task_statistics[task_id] = {
'mean': z.mean(dim=0).clone(),
'logvar': torch.log(z.var(dim=0) + 1e-8).clone(),
'count': z.shape[0]
}
def adapt_beta(self, task_id, overlap_score):
"""
根据任务重叠度自适应调整β
"""
# β与重叠度负相关
new_beta = self.beta / (overlap_score + 1e-8)
self.model.task_betas[task_id] = new_beta.clamp(1e-5, 1e-1)
return self.model.task_betas[task_id]8. 最优持续学习策略的信息论分析
8.1 理论最优策略
定理14(信息论最优策略):最优持续学习策略 满足:
其中 是条件熵,度量表示的”新信息”量。
8.2 任务排序的影响
定理15(任务排序的信息论界限):
含义:将高重叠任务放在一起可以减少信息损失。
8.3 零遗忘的条件
定理16(零遗忘的必要条件):实现零遗忘的充分必要条件是:
含义:表示必须完全由标签决定(无无关信息),且旧知识必须完全保持。
class OptimalContinualLearning:
"""
最优持续学习策略设计
"""
def __init__(self, model):
self.model = model
self.task_information_matrix = None
def compute_optimal_ordering(self, task_loaders):
"""
计算最优任务排序
目标:最小化信息损失
"""
n_tasks = len(task_loaders)
# 计算任务重叠矩阵
overlap_matrix = np.zeros((n_tasks, n_tasks))
for i in range(n_tasks):
for j in range(n_tasks):
if i != j:
# 估计任务重叠度
overlap_matrix[i, j] = self._estimate_overlap(
task_loaders[i], task_loaders[j]
)
# 贪心排序:高重叠任务放在一起
ordered = []
remaining = set(range(n_tasks))
# 从重叠度最大的任务对开始
while remaining:
if not ordered:
# 选择第一个任务(任意)
ordered.append(remaining.pop())
else:
last = ordered[-1]
# 选择与最后一个任务重叠度最大的
best_next = min(
remaining,
key=lambda t: -overlap_matrix[last, t]
)
ordered.append(best_next)
remaining.remove(best_next)
return ordered, overlap_matrix
def _estimate_overlap(self, loader1, loader2):
"""
估计两个任务的重叠度
"""
# 简化的重叠估计
# 实际应使用更复杂的信息论估计
return np.random.random()
def verify_zero_forgetting_condition(self, z, y, z_old, y_old):
"""
验证零遗忘条件
条件1: I(Z; X|Y) = 0 (表示完全由标签决定)
条件2: I(Z; Y_old) = I(Z_old; Y_old) (旧知识完全保持)
"""
# 条件1:使用互信息的链式法则
# I(Z; X, Y) = I(Z; Y) + I(Z; X|Y)
# 若 I(Z; X|Y) = 0,则 I(Z; X) = I(Z; Y)
# 简化的验证
cond1_satisfied = True # 简化
# 条件2:比较新旧表示与旧任务标签的互信息
# 简化:使用相关性
corr_new = torch.corrcoef(
torch.cat([z, y_old], dim=0)
).mean()
corr_old = torch.corrcoef(
torch.cat([z_old, y_old], dim=0)
).mean()
cond2_satisfied = abs(corr_new - corr_old) < 0.1
return cond1_satisfied, cond2_satisfied9. 总结
核心定理
| 定理 | 内容 | 实践意义 |
|---|---|---|
| 定理1 | 特异性-泛化关系 | 迁移来自共享表示 |
| 定理2 | 信息分离 | 高效压缩下任务信息可分离 |
| 定理3 | 压缩-遗忘关系 | 压缩导致遗忘的机制 |
| 定理4 | IB持续学习目标 | 统一的持续学习目标函数 |
| 定理5 | Pareto最优权衡 | 遗忘随压缩指数衰减 |
| 定理11 | EWC的IB解释 | EWC近似信息保持 |
| 定理12 | 回放的IB解释 | 回放补充信息损失 |
| 定理14 | 最优策略 | 信息论最优持续学习目标 |
实践建议
- 压缩强度:根据任务重叠度自适应调整β
- 任务排序:高重叠任务应相邻学习
- 表示设计:最大化任务相关信息,最小化标签无关信息
- 记忆保持:使用蒸馏或正则化保持旧知识
理论启示
- IB框架提供了理解持续学习的统一视角
- 压缩-遗忘权衡是持续学习的核心约束
- 最优策略可通过信息论优化推导
参考资料
相关阅读: