SVD在深度学习中的应用

奇异值分解(Singular Value Decomposition, SVD)是线性代数的核心工具,在深度学习中广泛应用于模型压缩、权重分析、表示学习和优化算法的设计。[^1]


SVD基础回顾

定义

任意矩阵 $A \in \mathbb{R}^{m \times n}$ 可以分解为:

$$A = U \Sigma V^\top$$

其中:

  • $U \in \mathbb{R}^{m \times m}$:左奇异向量矩阵(正交)
  • $\Sigma \in \mathbb{R}^{m \times n}$:奇异值矩阵(对角线元素为奇异值 $\sigma_1 \ge \sigma_2 \ge \cdots \ge 0$)
  • $V \in \mathbb{R}^{n \times n}$:右奇异向量矩阵(正交)
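
下面用一个最小的 numpy 片段验证上述分解(矩阵为随机生成的示意数据,采用 full_matrices=False 的薄SVD):

import numpy as np

A = np.random.randn(5, 3)
U, s, Vt = np.linalg.svd(A, full_matrices=False)

# 重构:A ≈ U @ diag(s) @ Vt
print(np.allclose(A, U @ np.diag(s) @ Vt))  # True
# 薄SVD中 U 列正交、V^T 行正交
print(np.allclose(U.T @ U, np.eye(3)))      # True
print(np.allclose(Vt @ Vt.T, np.eye(3)))    # True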

关键性质

| 性质 | 公式 | 意义 |
| --- | --- | --- |
| 谱范数 | $\|A\|_2 = \sigma_1$ | 最大奇异值,矩阵的最大拉伸因子 |
| 核范数 | $\|A\|_* = \sum_i \sigma_i$ | 奇异值之和,常用作秩的凸松弛 |
| Frobenius范数 | $\|A\|_F = \sqrt{\sum_i \sigma_i^2}$ | 奇异值平方和的平方根 |
| 秩 | $\operatorname{rank}(A)$ | 非零奇异值的个数 |
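
可以用 numpy 内置的矩阵范数直接核对表中的关系(续用上例的随机矩阵 A,属示意性检查):

# 用奇异值核对范数关系
s = np.linalg.svd(A, compute_uv=False)
print(np.isclose(np.linalg.norm(A, 2), s[0]))                        # 谱范数 = 最大奇异值
print(np.isclose(np.linalg.norm(A, 'nuc'), s.sum()))                 # 核范数 = 奇异值之和
print(np.isclose(np.linalg.norm(A, 'fro'), np.sqrt((s**2).sum())))   # Frobenius范数
print(np.linalg.matrix_rank(A) == np.sum(s > 1e-10))                 # 秩 = 非零奇异值个数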

基于SVD的模型压缩

截断SVD(Truncated SVD)

保留前 $k$ 个最大的奇异值,得到矩阵在 Frobenius 范数(以及谱范数)意义下的最佳秩-$k$ 近似(Eckart–Young 定理):

$$A_k = U_k \Sigma_k V_k^\top = \sum_{i=1}^{k} \sigma_i u_i v_i^\top$$

压缩效果:参数量从 $mn$ 减少到 $k(m+n)$。

import numpy as np
from scipy.linalg import svd
 
def truncated_svd(A, k):
    """
    截断SVD:保留前k个奇异值
    
    Args:
        A: 输入矩阵 (m, n)
        k: 保留的奇异值个数
    
    Returns:
        U_k, S_k, Vt_k, compression_ratio
    """
    U, s, Vt = svd(A, full_matrices=False)
    
    # 取前k个奇异值
    U_k = U[:, :k]
    S_k = s[:k]
    Vt_k = Vt[:k, :]
    
    # 计算压缩比
    original_params = A.shape[0] * A.shape[1]
    compressed_params = k * (A.shape[0] + A.shape[1])
    compression_ratio = original_params / compressed_params
    
    return U_k, S_k, Vt_k, compression_ratio
 
# 示例:压缩一个全连接层
# 假设权重矩阵 W: (4096, 4096),压缩到 k=64
W = np.random.randn(4096, 4096)
U_k, S_k, Vt_k, ratio = truncated_svd(W, k=64)
print(f"压缩比: {ratio:.1f}x")  # 约32x压缩
 
# 重构矩阵
W_reconstructed = U_k @ np.diag(S_k) @ Vt_k
reconstruction_error = np.linalg.norm(W - W_reconstructed, 'fro') / np.linalg.norm(W, 'fro')
print(f"重构相对误差: {reconstruction_error:.4f}")

能量保留准则

选择保留足够能量(方差)的奇异值:

def svd_by_energy(A, energy_threshold=0.99):
    """
    基于能量阈值选择奇异值个数
    
    保留至少 energy_threshold 的能量
    """
    U, s, Vt = svd(A, full_matrices=False)
    
    # 计算累积能量比例
    total_energy = np.sum(s**2)
    cumulative_energy = np.cumsum(s**2) / total_energy
    
    # 找到满足阈值的最小k
    k = np.searchsorted(cumulative_energy, energy_threshold) + 1
    
    return k, cumulative_energy[k-1]
 
k, retained = svd_by_energy(W, energy_threshold=0.99)
print(f"保留99%能量需要 k={k} 个奇异值")

SVD与模型剪枝

权重剪枝的SVD视角

将SVD应用于权重矩阵可以同时实现:

  1. 秩减少:减少有效参数量
  2. 能量保留:最小化重构误差
  3. 结构化剪枝:自然分解为低秩结构

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
 
class SVDLinear(nn.Module):
    """
    基于SVD分解的线性层
    
    将 W = U Σ V^T 实现为两个线性层的组合:
    y = (U @ Σ) @ (V^T @ x) = W1 @ (W2 @ x)
    
    其中 W1 ∈ R^{out × k}, W2 ∈ R^{k × in}
    """
    def __init__(self, in_features, out_features, rank_ratio=0.1):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        
        # 原始线性层
        self.original = nn.Linear(in_features, out_features, bias=False)
        
        # SVD分解
        self._apply_svd(rank_ratio)
    
    def _apply_svd(self, rank_ratio):
        W = self.original.weight.data.numpy()
        U, s, Vt = np.linalg.svd(W, full_matrices=False)
        
        # 计算目标秩
        max_rank = min(W.shape[0], W.shape[1])
        k = max(1, int(max_rank * rank_ratio))
        
        # 分解为两个矩阵
        self.u = nn.Parameter(torch.tensor(U[:, :k] @ np.diag(s[:k]), dtype=torch.float32))
        self.vt = nn.Parameter(torch.tensor(Vt[:k, :], dtype=torch.float32))
        
        # 冻结并清零原始权重,之后只训练低秩因子 u 和 vt
        self.original.weight.requires_grad = False
        self.original.weight.data = torch.zeros_like(self.original.weight.data)
        
        print(f"SVD: {W.shape} -> ({self.u.shape}, {self.vt.shape})")
    
    def forward(self, x):
        # 等价于用 W = U_k Σ_k V_k^T 做前向,但拆成两次低秩乘法
        return F.linear(F.linear(x, self.vt), self.u)
    
    def get_original_weights(self):
        """重构原始权重用于比较"""
        return (self.u.data.numpy() @ self.vt.data.numpy())
 
class PrunedConv2d(nn.Module):
    """基于SVD的卷积层剪枝"""
    def __init__(self, conv_layer, rank_ratio=0.5):
        super().__init__()
        self.in_channels = conv_layer.in_channels
        self.out_channels = conv_layer.out_channels
        self.kernel_size = conv_layer.kernel_size
        
        # 将卷积核 reshape 为 (out, in × k × k)
        kernel = conv_layer.weight.data
        C_out, C_in, H, W = kernel.shape
        kernel_reshaped = kernel.reshape(C_out, C_in * H * W)
        
        # SVD分解
        U, s, Vt = np.linalg.svd(kernel_reshaped.numpy(), full_matrices=False)
        
        k = max(1, int(min(C_out, C_in * H * W) * rank_ratio))
        
        # 分解为两个线性层
        self.u = nn.Parameter(torch.tensor(U[:, :k] @ np.diag(s[:k]), dtype=torch.float32))
        self.vt = nn.Parameter(torch.tensor(Vt[:k, :], dtype=torch.float32))
        
        # 存储结构用于前向传播
        self.C_in = C_in
        self.H = H
        self.W = W
    
    def forward(self, x):
        # 展开输入为滑动窗口(假设 stride=1、无 padding)
        B, C, H, W = x.shape
        x_unfolded = F.unfold(x, (self.H, self.W)).transpose(1, 2)  # (B, L, C_in*kH*kW)
        
        # 应用低秩分解
        x_proj = torch.matmul(x_unfolded, self.vt.t())  # (B, seq, k)
        out = torch.matmul(x_proj, self.u.t())  # (B, seq, out)
        
        # 重塑输出
        out = out.transpose(1, 2)
        H_out = H - self.H + 1
        W_out = W - self.W + 1
        return out.view(B, self.out_channels, H_out, W_out)
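
下面是一个示意性的使用片段(形状与 rank_ratio 均为假设值),检查 SVDLinear 的低秩前向与重构权重 u @ vt 的全量前向是否一致:

# 假设性示例:验证低秩前向与重构权重的一致性
layer = SVDLinear(in_features=256, out_features=128, rank_ratio=0.1)
x = torch.randn(4, 256)

with torch.no_grad():
    y_lowrank = layer(x)
    W_rec = torch.tensor(layer.get_original_weights(), dtype=torch.float32)
    y_full = F.linear(x, W_rec)

print(torch.allclose(y_lowrank, y_full, atol=1e-4))  # True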

LoRA: 低秩适配的SVD视角

LoRA(Low-Rank Adaptation)本质上是把权重更新显式约束为低秩分解,与SVD低秩近似的思想高度一致。

LoRA原理

对于预训练权重 $W_0 \in \mathbb{R}^{d \times k}$,LoRA 冻结 $W_0$,只训练一个低秩增量:

$$W = W_0 + \Delta W = W_0 + BA$$

其中 $B \in \mathbb{R}^{d \times r}$、$A \in \mathbb{R}^{r \times k}$,且 $r \ll \min(d, k)$。
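
一个简单的参数量对比可以说明低秩约束的收益(d、k、r 均为假设值):

# 假设 d = k = 4096、r = 8,比较全量微调与 LoRA 的可训练参数量
d, k, r = 4096, 4096, 8
full_params = d * k          # 16,777,216
lora_params = r * (d + k)    # 65,536
print(f"LoRA 参数量约为全量的 {100 * lora_params / full_params:.2f}%")  # 约 0.39%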

SVD视角的LoRA分析

import numpy as np
import torch
import torch.nn.functional as F
 
class LoRALinear(torch.nn.Module):
    def __init__(self, in_features, out_features, rank=4, alpha=1):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.rank = rank
        self.scale = alpha / rank
        
        # 原始权重(冻结)
        self.weight = torch.nn.Parameter(
            torch.randn(out_features, in_features) * 0.01
        )
        self.weight.requires_grad = False
        
        # LoRA低秩矩阵
        self.lora_A = torch.nn.Parameter(torch.randn(rank, in_features) * 0.01)
        self.lora_B = torch.nn.Parameter(torch.zeros(out_features, rank))
    
    def forward(self, x):
        # 原始前向(冻结权重)
        base_output = F.linear(x, self.weight)
        # LoRA增量:ΔW = B @ A,乘以缩放系数 α/r
        delta_W = (self.lora_B @ self.lora_A) * self.scale
        return base_output + F.linear(x, delta_W)
    
    def apply_svd_initialization(self, W_delta):
        """
        使用权重变化 W_delta 的SVD分解来初始化LoRA矩阵
        
        这使得初始适配方向与权重变化的主方向一致
        """
        U, s, Vt = np.linalg.svd(W_delta, full_matrices=False)
        
        # 取前rank个奇异向量
        self.lora_B.data = torch.tensor(
            U[:, :self.rank] @ np.diag(s[:self.rank]), dtype=torch.float32
        )
        self.lora_A.data = torch.tensor(
            Vt[:self.rank, :], dtype=torch.float32
        )
 
# 示例:对比随机初始化与SVD初始化
def compare_lora_initializations():
    # 假设的权重变化
    W_delta = np.random.randn(512, 768) * 0.1
    
    # SVD分析
    U, s, Vt = np.linalg.svd(W_delta, full_matrices=False)
    
    print("权重变化的SVD分析:")
    print(f"  奇异值: {s[:10]}")  # 前10个奇异值
    print(f"  能量分布: 前10个奇异值占 {100*np.sum(s[:10]**2)/np.sum(s**2):.1f}%")
    
    # 低秩近似的误差
    for r in [1, 4, 8, 16, 32]:
        W_approx = U[:, :r] @ np.diag(s[:r]) @ Vt[:r, :]
        error = np.linalg.norm(W_delta - W_approx, 'fro') / np.linalg.norm(W_delta, 'fro')
        print(f"  秩{r}近似相对误差: {error:.4f}")
 
compare_lora_initializations()

SVD与表示学习

表示的奇异值分析

神经网络的隐藏表示也可以通过SVD分析:

def analyze_representation_svd(activations, layer_name="hidden"):
    """
    分析神经网络隐藏层的表示特性
    
    activations: (batch_size, hidden_dim) 的激活矩阵
    """
    # 中心化
    activations_centered = activations - activations.mean(axis=0)
    
    # SVD分解
    U, s, Vt = np.linalg.svd(activations_centered, full_matrices=False)
    
    # 计算各主方向的方差(除以样本数 n-1,而不是奇异值个数)
    n_samples = activations.shape[0]
    explained_variance = s**2 / (n_samples - 1)
    explained_variance_ratio = explained_variance / explained_variance.sum()
    
    # 有效秩:归一化谱的香农熵取指数
    probs = explained_variance_ratio
    entropy = -np.sum(probs * np.log(probs + 1e-10))
    effective_rank = np.exp(entropy)
    
    print(f"\n{layer_name} 层表示分析:")
    print(f"  有效秩: {effective_rank:.2f} / {len(s)}")
    print(f"  前5个奇异值: {s[:5]}")
    print(f"  前5个解释方差比: {explained_variance_ratio[:5]}")
    print(f"  累积解释方差 (10, 50, 100维): {np.cumsum(explained_variance_ratio)[[9,49,99]]}")
    
    return {
        'singular_values': s,
        'explained_variance_ratio': explained_variance_ratio,
        'effective_rank': effective_rank,
        'U': U,
        'Vt': Vt
    }
 
# 示例:分析不同层表示的有效秩
import torch
 
def analyze_network_layers(model, data_loader):
    """分析网络各层表示的有效秩"""
    activations = {}
    handles = []
    
    def hook_fn(name):
        def hook(module, input, output):
            if isinstance(output, tuple):
                activations[name] = output[0].detach().cpu().numpy()
            else:
                activations[name] = output.detach().cpu().numpy()
        return hook
    
    # 注册hook
    for name, module in model.named_modules():
        if 'attention' in name or 'mlp' in name:
            handles.append(module.register_forward_hook(hook_fn(name)))
    
    # 前向传播
    with torch.no_grad():
        for batch_x, _ in data_loader:
            _ = model(batch_x)
            break  # 只看第一个batch
    
    # 分析
    for name, act in activations.items():
        analyze_representation_svd(act.reshape(len(act), -1), name)
    
    # 清理hook
    for h in handles:
        h.remove()
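
一个示意性的调用方式如下(玩具模型与随机数据均为假设;注意模块命名需包含 'mlp' 或 'attention' 才会被 hook 捕获):

# 假设性示例:用玩具模型演示各层有效秩分析
from collections import OrderedDict
from torch.utils.data import DataLoader, TensorDataset

toy_model = torch.nn.Sequential(OrderedDict([
    ('mlp_1', torch.nn.Linear(64, 128)),
    ('act', torch.nn.ReLU()),
    ('mlp_2', torch.nn.Linear(128, 32)),
]))
toy_loader = DataLoader(TensorDataset(torch.randn(256, 64), torch.zeros(256)),
                        batch_size=256)
analyze_network_layers(toy_model, toy_loader)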

SVD与优化

权重空间的SVD分析

def analyze_optimizer_trajectory_svd(weight_history):
    """
    分析优化器在权重空间中轨迹的SVD结构
    
    weight_history: (n_steps, d_model) 的权重序列
    """
    # 计算权重变化的差分
    delta_W = np.diff(weight_history, axis=0)  # (n_steps-1, d_model)
    
    # 对更新方向序列做SVD分解
    U, s, Vt = np.linalg.svd(delta_W, full_matrices=False)
    
    print("优化轨迹的SVD分析:")
    print(f"  主奇异值: {s[:5]}")
    print(f"  有效方向数: {np.sum(s > 1e-6)}")
    
    # 分析是否沿着低维子空间移动
    explained_99 = np.searchsorted(np.cumsum(s**2) / np.sum(s**2), 0.99) + 1
    print(f"  解释99%方差需要的维度: {explained_99} / {len(s)}")
    
    return U, s, Vt
 
def spectral_evolution(model, train_loader, optimizer, n_steps=100):
    """追踪训练过程中权重谱的变化"""
    spectral_history = {'sigma_max': [], 'sigma_sum': [], 'condition_number': []}
    
    for step, (x, y) in enumerate(train_loader):
        if step >= n_steps:
            break
        
        optimizer.zero_grad()
        loss = model(x, y)  # 假设模型前向直接返回损失
        loss.backward()
        optimizer.step()
        
        # 分析权重谱
        for name, param in model.named_parameters():
            if 'weight' in name and param.dim() == 2:
                s = np.linalg.svd(param.detach().cpu().numpy(), compute_uv=False)
                spectral_history['sigma_max'].append(s[0])
                spectral_history['sigma_sum'].append(np.sum(s))
                spectral_history['condition_number'].append(s[0] / s[-1])
    
    return spectral_history
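
可以用一条人工构造的低维轨迹来验证 analyze_optimizer_trajectory_svd 的行为(维度与步数均为假设值):

# 假设性示例:构造一条只在 5 维子空间内移动的合成轨迹
n_steps, d_model, intrinsic_dim = 200, 512, 5
basis = np.linalg.qr(np.random.randn(d_model, intrinsic_dim))[0]  # 正交基
coeffs = np.cumsum(np.random.randn(n_steps, intrinsic_dim), axis=0)
weight_history = coeffs @ basis.T  # (n_steps, d_model)

analyze_optimizer_trajectory_svd(weight_history)  # 有效方向数应接近 5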

SVD与神经网络理论

神经网络作为非线性SVD

def nnsvd_forward_pass(x, W_list, activation_fn):
    """
    模拟神经网络前向传播的SVD视角
    
    每层可视为:
    1. 线性变换: z = Wx
    2. 非线性激活: a = σ(z)
    """
    h = x
    svd_info = []
    
    for i, W in enumerate(W_list):
        # SVD分解
        U, s, Vt = np.linalg.svd(W, full_matrices=False)
        
        # 分析
        info = {
            'layer': i,
            'rank': np.sum(s > 1e-10),
            'sigma_max': s[0],
            'condition_number': s[0] / (s[-1] if s[-1] > 1e-10 else 1),
            'energy_ratio_top10': np.sum(s[:10]**2) / np.sum(s**2)
        }
        svd_info.append(info)
        
        # 前向传播
        h = activation_fn(W @ h)
    
    return h, svd_info
 
# 示例:分析网络的有效容量
def analyze_network_capacity(W_list):
    """分析网络的信息容量"""
    capacity_info = []
    
    for i, W in enumerate(W_list):
        # 分析每层如何变换输入空间
        _, s, _ = np.linalg.svd(W, full_matrices=False)
        
        info = {
            'layer': i,
            'output_rank': np.sum(s > 1e-10),
            'singular_values': s[:10],
            'intrinsic_dimension_99': np.searchsorted(
                np.cumsum(s**2) / np.sum(s**2), 0.99
            ) + 1
        }
        capacity_info.append(info)
    
    return capacity_info
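
一个示意性的调用示例(权重形状为假设值,激活函数用 ReLU):

# 假设性示例:三层随机权重网络的前向与容量分析
relu = lambda z: np.maximum(z, 0)
x = np.random.randn(128)
W_list = [np.random.randn(256, 128),
          np.random.randn(128, 256),
          np.random.randn(64, 128)]

h, svd_info = nnsvd_forward_pass(x, W_list, relu)
print([info['rank'] for info in svd_info])

for info in analyze_network_capacity(W_list):
    print(f"layer {info['layer']}: rank={info['output_rank']}, "
          f"dim99={info['intrinsic_dimension_99']}")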

实践建议

1. 何时使用SVD压缩

def should_use_svd_compression(weight_matrix):
    """
    判断矩阵是否适合SVD压缩
    
    基于奇异值谱的衰减特性:谱衰减越快,低秩近似误差越小
    """
    s = np.linalg.svd(weight_matrix, compute_uv=False)
    total_energy = np.sum(s**2)
    
    # 计算不同压缩比下的重构误差
    for r in [0.01, 0.05, 0.1, 0.2]:
        k = max(1, int(len(s) * r))
        reconstruction_error = np.sqrt(
            np.sum(s[k:]**2) / total_energy
        )
        print(f"压缩到 {r*100:.0f}%: 重构误差 = {reconstruction_error:.4f}")
    
    # 如果前10%奇异值就能解释90%能量,则适合SVD压缩
    k_90 = np.searchsorted(np.cumsum(s**2) / total_energy, 0.9) + 1
    return k_90 / len(s) < 0.2

2. 避免数值问题

def stable_svd(A, full_matrices=False):
    """
    数值稳定的SVD实现
    """
    # 处理病态矩阵
    A = np.array(A, dtype=np.float64)
    
    # 使用scipy的SVD(通常更稳定)
    from scipy.linalg import svd as scipy_svd
    U, s, Vt = scipy_svd(A, full_matrices=full_matrices)
    
    # 将接近零的奇异值钳制到下限,避免后续除法/求逆时的数值问题
    s = np.maximum(s, 1e-12)
    
    return U, s, Vt

3. 加速计算

def fast_truncated_svd(A, k):
    """
    快速截断SVD(使用随机化算法)
    
    适用于大矩阵,精度略有损失但速度快
    """
    from sklearn.decomposition import TruncatedSVD
    
    # 随机化SVD;fit_transform 返回投影后的数据 U_k @ Σ_k
    svd = TruncatedSVD(n_components=k, algorithm='randomized', random_state=42)
    return svd.fit_transform(A), svd.singular_values_, svd.components_
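
示意性用法(矩阵规模为假设值):

# 对 2000×1000 的随机矩阵做 k=32 的快速截断SVD
A_big = np.random.randn(2000, 1000)
US_k, sing_vals, Vt_k = fast_truncated_svd(A_big, k=32)
print(US_k.shape, sing_vals.shape, Vt_k.shape)  # (2000, 32) (32,) (32, 1000)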

参考

[^1]: Golub, G. H., & Van Loan, C. F. (2013). *Matrix Computations*. JHU Press.