# Applications of SVD in Deep Learning

Singular Value Decomposition (SVD) is a core tool of linear algebra. In deep learning it is widely used for model compression, weight analysis, representation learning, and the design of optimization algorithms.[^1]
## SVD Basics

### Definition

Any matrix $A \in \mathbb{R}^{m \times n}$ can be decomposed as

$$A = U \Sigma V^T$$

where:

- $U \in \mathbb{R}^{m \times m}$: matrix of left singular vectors (orthogonal)
- $\Sigma \in \mathbb{R}^{m \times n}$: singular value matrix (diagonal entries are the singular values $\sigma_1 \ge \sigma_2 \ge \cdots \ge 0$)
- $V \in \mathbb{R}^{n \times n}$: matrix of right singular vectors (orthogonal)
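A minimal numerical sketch of the decomposition (the matrix here is arbitrary; only standard NumPy calls are used):

```python
import numpy as np

# Verify A = U Σ V^T numerically on a random matrix
A = np.random.randn(5, 3)
U, s, Vt = np.linalg.svd(A, full_matrices=False)

# Reconstruction should match A up to floating-point error
A_rec = U @ np.diag(s) @ Vt
print(np.allclose(A, A_rec))             # True
print(np.allclose(U.T @ U, np.eye(3)))   # columns of U are orthonormal
print(np.allclose(Vt @ Vt.T, np.eye(3))) # rows of V^T are orthonormal
```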
### Key Properties

| Property | Formula | Meaning |
|---|---|---|
| Spectral norm | $\|A\|_2 = \sigma_1$ | Largest singular value |
| Nuclear norm | $\|A\|_* = \sum_i \sigma_i$ | Sum of singular values |
| Frobenius norm | $\|A\|_F = \sqrt{\sum_i \sigma_i^2}$ | Root of the sum of squared singular values |
| Rank | number of nonzero singular values | Dimension of the column (or row) space |
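These identities are easy to confirm numerically; a small sketch:

```python
import numpy as np

A = np.random.randn(8, 6)
s = np.linalg.svd(A, compute_uv=False)

print(np.isclose(np.linalg.norm(A, 2), s[0]))                        # spectral norm = σ1
print(np.isclose(np.linalg.norm(A, 'nuc'), s.sum()))                 # nuclear norm = Σ σi
print(np.isclose(np.linalg.norm(A, 'fro'), np.sqrt((s**2).sum())))   # Frobenius norm
print(np.linalg.matrix_rank(A) == np.sum(s > 1e-10))                 # rank = # nonzero σi
```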
## SVD-Based Model Compression

### Truncated SVD

Keeping only the $k$ largest singular values gives the best rank-$k$ approximation of the matrix:

$$A_k = \sum_{i=1}^{k} \sigma_i u_i v_i^T$$

with error $\|A - A_k\|_F = \sqrt{\sum_{i>k} \sigma_i^2}$ (Eckart–Young theorem).

Compression effect: the parameter count drops from $mn$ to $k(m + n)$.
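Before the full implementation, a minimal check of this error identity on an arbitrary random matrix:

```python
import numpy as np

A = np.random.randn(50, 30)
U, s, Vt = np.linalg.svd(A, full_matrices=False)

k = 5
A_k = U[:, :k] @ np.diag(s[:k]) @ Vt[:k, :]

err = np.linalg.norm(A - A_k, 'fro')
print(np.isclose(err, np.sqrt(np.sum(s[k:]**2))))  # True (Eckart–Young)
```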
```python
import numpy as np
from scipy.linalg import svd

def truncated_svd(A, k):
    """
    Truncated SVD: keep the top-k singular values.

    Args:
        A: input matrix of shape (m, n)
        k: number of singular values to keep

    Returns:
        U_k, S_k, Vt_k, compression_ratio
    """
    U, s, Vt = svd(A, full_matrices=False)

    # Keep the top-k singular values
    U_k = U[:, :k]
    S_k = s[:k]
    Vt_k = Vt[:k, :]

    # Compression ratio: mn / (k(m + n))
    original_params = A.shape[0] * A.shape[1]
    compressed_params = k * (A.shape[0] + A.shape[1])
    compression_ratio = original_params / compressed_params

    return U_k, S_k, Vt_k, compression_ratio

# Example: compress a fully connected layer.
# Suppose the weight matrix W has shape (4096, 4096); compress to k = 64.
W = np.random.randn(4096, 4096)
U_k, S_k, Vt_k, ratio = truncated_svd(W, k=64)
print(f"Compression ratio: {ratio:.1f}x")  # roughly 32x

# Reconstruct the matrix
W_reconstructed = U_k @ np.diag(S_k) @ Vt_k
reconstruction_error = np.linalg.norm(W - W_reconstructed, 'fro') / np.linalg.norm(W, 'fro')
print(f"Relative reconstruction error: {reconstruction_error:.4f}")
```

### Energy-Retention Criterion
Choose the number of singular values so that enough energy (variance) is retained:

```python
def svd_by_energy(A, energy_threshold=0.99):
    """
    Choose the number of singular values based on an energy threshold,
    retaining at least `energy_threshold` of the total energy.
    """
    U, s, Vt = svd(A, full_matrices=False)

    # Cumulative energy ratio
    total_energy = np.sum(s**2)
    cumulative_energy = np.cumsum(s**2) / total_energy

    # Smallest k that meets the threshold
    k = np.searchsorted(cumulative_energy, energy_threshold) + 1

    return k, cumulative_energy[k-1]

k, retained = svd_by_energy(W, energy_threshold=0.99)
print(f"Retaining 99% of the energy requires k={k} singular values")
```

## SVD and Model Pruning
### An SVD View of Weight Pruning

Applying SVD to a weight matrix achieves several things at once:

- Rank reduction: fewer effective parameters
- Energy retention: the reconstruction error is minimized
- Structured pruning: the weights decompose naturally into a low-rank structure
```python
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

class SVDLinear(nn.Module):
    """
    Linear layer based on an SVD factorization.

    W = U Σ V^T is implemented as a composition of two linear maps:
        y = (U Σ) @ (V^T x) = W1 @ (W2 @ x)
    where W1 ∈ R^{out × k}, W2 ∈ R^{k × in}.
    """
    def __init__(self, in_features, out_features, rank_ratio=0.1):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features

        # Original linear layer
        self.original = nn.Linear(in_features, out_features, bias=False)

        # SVD factorization
        self._apply_svd(rank_ratio)

    def _apply_svd(self, rank_ratio):
        W = self.original.weight.data.numpy()
        U, s, Vt = np.linalg.svd(W, full_matrices=False)

        # Target rank
        max_rank = min(W.shape[0], W.shape[1])
        k = max(1, int(max_rank * rank_ratio))

        # Split into two factors
        self.u = nn.Parameter(torch.tensor(U[:, :k] @ np.diag(s[:k]), dtype=torch.float32))
        self.vt = nn.Parameter(torch.tensor(Vt[:k, :], dtype=torch.float32))

        self.original.weight.requires_grad = False
        self.original.weight.data = torch.zeros_like(self.original.weight.data)

        print(f"SVD: {W.shape} -> ({self.u.shape}, {self.vt.shape})")

    def forward(self, x):
        # Equivalent to W @ x, but via the low-rank factors
        return F.linear(x, self.vt) @ self.u.t()

    def get_original_weights(self):
        """Reconstruct the original weights for comparison."""
        return self.u.data.numpy() @ self.vt.data.numpy()


class PrunedConv2d(nn.Module):
    """SVD-based pruning of a convolutional layer."""
    def __init__(self, conv_layer, rank_ratio=0.5):
        super().__init__()
        self.in_channels = conv_layer.in_channels
        self.out_channels = conv_layer.out_channels
        self.kernel_size = conv_layer.kernel_size

        # Reshape the kernel to (out, in × k × k)
        kernel = conv_layer.weight.data
        C_out, C_in, H, W = kernel.shape
        kernel_reshaped = kernel.reshape(C_out, C_in * H * W)

        # SVD factorization
        U, s, Vt = np.linalg.svd(kernel_reshaped.numpy(), full_matrices=False)
        k = max(1, int(min(C_out, C_in * H * W) * rank_ratio))

        # Split into two linear maps
        self.u = nn.Parameter(torch.tensor(U[:, :k] @ np.diag(s[:k]), dtype=torch.float32))
        self.vt = nn.Parameter(torch.tensor(Vt[:k, :], dtype=torch.float32))

        # Store the kernel geometry for the forward pass
        self.C_in = C_in
        self.H = H
        self.W = W

    def forward(self, x):
        # Unfold the input into sliding patches
        B, C, H, W = x.shape
        x_unfolded = F.unfold(x, (self.H, self.W)).transpose(1, 2)

        # Apply the low-rank factors
        x_proj = torch.matmul(x_unfolded, self.vt.t())  # (B, seq, k)
        out = torch.matmul(x_proj, self.u.t())          # (B, seq, out)

        # Reshape the output (stride 1, no padding)
        out = out.transpose(1, 2)
        H_out = H - self.H + 1
        W_out = W - self.W + 1
        return out.view(B, self.out_channels, H_out, W_out)
```

## LoRA: An SVD View of Low-Rank Adaptation
LoRA (Low-Rank Adaptation) is essentially an explicitly enforced low-rank factorization, closely aligned with the idea behind SVD.

### The LoRA Principle

For a pretrained weight matrix $W_0 \in \mathbb{R}^{d \times k}$, LoRA adds a low-rank adapter:

$$W = W_0 + \Delta W = W_0 + BA$$

where $B \in \mathbb{R}^{d \times r}$, $A \in \mathbb{R}^{r \times k}$, and $r \ll \min(d, k)$.
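A quick sketch of the resulting parameter savings (the dimensions below are illustrative only, not taken from any specific model):

```python
# Hypothetical layer size and LoRA rank, chosen purely for illustration
d, k, r = 4096, 4096, 8

full_update_params = d * k       # fine-tuning ΔW directly
lora_params = d * r + r * k      # B (d×r) plus A (r×k)

print(full_update_params)                 # 16777216
print(lora_params)                        # 65536
print(full_update_params / lora_params)   # 256.0 -> ~256x fewer trainable parameters
```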
### LoRA Analysis from the SVD Perspective

```python
import numpy as np
import torch
import torch.nn.functional as F

class LoRALinear(torch.nn.Module):
    def __init__(self, in_features, out_features, rank=4, alpha=1):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.rank = rank
        self.scale = alpha / rank

        # Original weights (frozen)
        self.weight = torch.nn.Parameter(
            torch.randn(out_features, in_features) * 0.01
        )
        self.weight.requires_grad = False

        # LoRA low-rank matrices
        self.lora_A = torch.nn.Parameter(torch.randn(rank, in_features) * 0.01)
        self.lora_B = torch.nn.Parameter(torch.zeros(out_features, rank))

    def forward(self, x):
        # Base forward pass
        base_output = F.linear(x, self.weight)

        # LoRA increment
        lora_weight = (self.lora_B @ self.lora_A) * self.scale
        lora_output = F.linear(x, lora_weight)

        return base_output + lora_output

    def apply_svd_initialization(self, W_delta):
        """
        Initialize the LoRA matrices from the SVD of a weight change W_delta,
        so that the initial adaptation directions align with the principal
        directions of that change.
        """
        U, s, Vt = np.linalg.svd(W_delta, full_matrices=False)

        # Take the top-`rank` singular vectors
        self.lora_B.data = torch.tensor(
            U[:, :self.rank] @ np.diag(s[:self.rank]), dtype=torch.float32
        )
        self.lora_A.data = torch.tensor(
            Vt[:self.rank, :], dtype=torch.float32
        )

# Example: compare random initialization with SVD initialization
def compare_lora_initializations():
    # A hypothetical weight change
    W_delta = np.random.randn(512, 768) * 0.1

    # SVD analysis
    U, s, Vt = np.linalg.svd(W_delta, full_matrices=False)
    print("SVD analysis of the weight change:")
    print(f"  Singular values: {s[:10]}")  # top-10 singular values
    print(f"  Energy distribution: top 10 singular values account for {100*np.sum(s[:10]**2)/np.sum(s**2):.1f}%")

    # Error of low-rank approximations
    for r in [1, 4, 8, 16, 32]:
        W_approx = U[:, :r] @ np.diag(s[:r]) @ Vt[:r, :]
        error = np.linalg.norm(W_delta - W_approx, 'fro') / np.linalg.norm(W_delta, 'fro')
        print(f"  Relative error of rank-{r} approximation: {error:.4f}")

compare_lora_initializations()
```

## SVD and Representation Learning
### Singular-Value Analysis of Representations

The hidden representations of a neural network can also be analyzed with SVD:
```python
import numpy as np
import torch

def analyze_representation_svd(activations, layer_name="hidden"):
    """
    Analyze the representation of a hidden layer.

    activations: activation matrix of shape (batch_size, hidden_dim)
    """
    # Center the activations
    activations_centered = activations - activations.mean(axis=0)

    # SVD
    U, s, Vt = np.linalg.svd(activations_centered, full_matrices=False)

    # Singular-value spectrum
    explained_variance = s**2 / (activations_centered.shape[0] - 1)
    explained_variance_ratio = explained_variance / explained_variance.sum()

    # Effective rank (entropy-based)
    probs = explained_variance_ratio
    entropy = -np.sum(probs * np.log(probs + 1e-10))
    max_entropy = np.log(len(probs))  # maximum possible entropy, for reference
    effective_rank = np.exp(entropy)

    print(f"\nRepresentation analysis for layer {layer_name}:")
    print(f"  Effective rank: {effective_rank:.2f} / {len(s)}")
    print(f"  Top-5 singular values: {s[:5]}")
    print(f"  Top-5 explained-variance ratios: {explained_variance_ratio[:5]}")
    # Assumes at least 100 components are available
    print(f"  Cumulative explained variance (10, 50, 100 dims): {np.cumsum(explained_variance_ratio)[[9, 49, 99]]}")

    return {
        'singular_values': s,
        'explained_variance_ratio': explained_variance_ratio,
        'effective_rank': effective_rank,
        'U': U,
        'Vt': Vt
    }

# Example: compare the effective rank of representations across layers
def analyze_network_layers(model, data_loader):
    """Analyze the effective rank of each layer's representation."""
    activations = {}
    handles = []

    def hook_fn(name):
        def hook(module, input, output):
            if isinstance(output, tuple):
                activations[name] = output[0].detach().cpu().numpy()
            else:
                activations[name] = output.detach().cpu().numpy()
        return hook

    # Register hooks
    for name, module in model.named_modules():
        if 'attention' in name or 'mlp' in name:
            handles.append(module.register_forward_hook(hook_fn(name)))

    # Forward pass
    with torch.no_grad():
        for batch_x, _ in data_loader:
            _ = model(batch_x)
            break  # only the first batch

    # Analyze each recorded layer
    for name, act in activations.items():
        analyze_representation_svd(act.reshape(len(act), -1), name)

    # Remove hooks
    for h in handles:
        h.remove()
```

## SVD and Optimization
### SVD Analysis of the Weight Space

```python
import numpy as np

def analyze_optimizer_trajectory_svd(weight_history):
    """
    Analyze the SVD structure of the optimizer's trajectory in weight space.

    weight_history: weight sequence of shape (n_steps, d_model)
    """
    # Differences between consecutive weight snapshots
    delta_W = np.diff(weight_history, axis=0)  # (n_steps-1, d_model)

    # SVD of the update directions
    U, s, Vt = np.linalg.svd(delta_W, full_matrices=False)

    print("SVD analysis of the optimization trajectory:")
    print(f"  Leading singular values: {s[:5]}")
    print(f"  Number of effective directions: {np.sum(s > 1e-6)}")

    # Does the trajectory stay in a low-dimensional subspace?
    explained_99 = np.searchsorted(np.cumsum(s**2) / np.sum(s**2), 0.99) + 1
    print(f"  Dimensions needed to explain 99% of the variance: {explained_99} / {len(s)}")

    return U, s, Vt

def spectral_evolution(model, train_loader, optimizer, n_steps=100):
    """Track how the weight spectrum evolves during training."""
    spectral_history = {'sigma_max': [], 'sigma_sum': [], 'condition_number': []}

    for step, (x, y) in enumerate(train_loader):
        if step >= n_steps:
            break

        optimizer.zero_grad()
        loss = model(x, y)  # assumes the model returns the loss directly
        loss.backward()
        optimizer.step()

        # Analyze the weight spectrum
        for name, param in model.named_parameters():
            if 'weight' in name and param.dim() == 2:
                s = np.linalg.svd(param.data.numpy(), compute_uv=False)
                spectral_history['sigma_max'].append(s[0])
                spectral_history['sigma_sum'].append(np.sum(s))
                spectral_history['condition_number'].append(s[0] / s[-1])

    return spectral_history
```

## SVD and Neural Network Theory
### Neural Networks as a Nonlinear SVD

```python
import numpy as np

def nnsvd_forward_pass(x, W_list, activation_fn):
    """
    An SVD view of a neural network's forward pass.
    Each layer can be seen as:
      1. a linear map:           z = W x
      2. a nonlinear activation: a = σ(z)
    """
    h = x
    svd_info = []

    for i, W in enumerate(W_list):
        # SVD of the layer's weight matrix
        U, s, Vt = np.linalg.svd(W, full_matrices=False)

        # Per-layer statistics
        info = {
            'layer': i,
            'rank': np.sum(s > 1e-10),
            'sigma_max': s[0],
            'condition_number': s[0] / (s[-1] if s[-1] > 1e-10 else 1),
            'energy_ratio_top10': np.sum(s[:10]**2) / np.sum(s**2)
        }
        svd_info.append(info)

        # Forward pass through the layer
        h = activation_fn(W @ h)

    return h, svd_info

# Example: analyze the network's effective capacity
def analyze_network_capacity(W_list, input_dim):
    """Analyze the information capacity of the network."""
    capacity_info = []

    # Standard basis of the input space
    input_basis = np.eye(input_dim)

    for i, W in enumerate(W_list):
        # How each layer transforms the input space
        _, s, _ = np.linalg.svd(W, full_matrices=False)

        info = {
            'layer': i,
            'output_rank': np.sum(s > 1e-10),
            'singular_values': s[:10],
            'intrinsic_dimension_99': np.searchsorted(
                np.cumsum(s**2) / np.sum(s**2), 0.99
            ) + 1
        }
        capacity_info.append(info)

    return capacity_info
```

## Practical Recommendations
### 1. When to Use SVD Compression

```python
import numpy as np

def should_use_svd_compression(weight_matrix, compression_ratio=0.1):
    """
    Decide whether SVD compression is worthwhile,
    based on how quickly the singular-value spectrum decays.
    """
    s = np.linalg.svd(weight_matrix, compute_uv=False)
    total_energy = np.sum(s**2)

    # Reconstruction error at different compression ratios
    for r in [0.01, 0.05, 0.1, 0.2]:
        k = max(1, int(len(s) * r))
        reconstruction_error = np.sqrt(
            np.sum(s[k:]**2) / total_energy
        )
        print(f"Compressing to {r*100:.0f}%: reconstruction error = {reconstruction_error:.4f}")

    # If the singular values that capture 90% of the energy make up
    # less than 20% of the spectrum, the matrix is a good candidate
    k_90 = np.searchsorted(np.cumsum(s**2) / total_energy, 0.9) + 1
    return k_90 / len(s) < 0.2
```

### 2. Avoiding Numerical Issues
```python
import numpy as np
from scipy.linalg import svd as scipy_svd

def stable_svd(A, full_matrices=False):
    """
    Numerically more robust SVD.
    """
    # Promote to float64 to better handle ill-conditioned matrices
    A = np.array(A, dtype=np.float64)

    # Use SciPy's SVD (often more stable)
    U, s, Vt = scipy_svd(A, full_matrices=full_matrices)

    # Floor singular values that are effectively zero
    s = np.maximum(s, 1e-12)

    return U, s, Vt
```

### 3. Speeding Up Computation
```python
def fast_truncated_svd(A, k):
    """
    Fast truncated SVD using a randomized algorithm.
    Suitable for large matrices: slightly less accurate but much faster.
    """
    from sklearn.decomposition import TruncatedSVD

    # Randomized SVD
    svd = TruncatedSVD(n_components=k, random_state=42)
    return svd.fit_transform(A), svd.singular_values_, svd.components_
```
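A brief usage sketch (the matrix shape is arbitrary; note that `TruncatedSVD.fit_transform` returns the projected data $U_k \Sigma_k$, while `components_` holds $V_k^T$):

```python
import numpy as np

A = np.random.randn(10000, 512)
X_k, s_k, Vt_k = fast_truncated_svd(A, k=32)

# X_k ≈ U_k Σ_k has shape (10000, 32); Vt_k has shape (32, 512)
A_approx = X_k @ Vt_k
rel_err = np.linalg.norm(A - A_approx) / np.linalg.norm(A)
print(f"Rank-32 relative error: {rel_err:.3f}")
```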
## References

[^1]: Golub, G. H., & Van Loan, C. F. (2013). *Matrix Computations*. JHU Press.