Matrix Calculus in Deep Learning
Introduction
At the heart of deep learning is the backpropagation algorithm, and backpropagation is, at its core, an application of matrix calculus. Understanding the matrix form of the chain rule is essential for a deep understanding of how neural networks are trained.
Core challenge: neural networks involve a large number of matrix and vector operations, so the rules of ordinary scalar calculus must be extended to vector and matrix form.
1. Matrix Calculus Basics
1.1 Layout Conventions
There are two main layout conventions in matrix calculus:
| Convention | Shape of ∂y/∂x (scalar y, x ∈ ℝⁿ) | Shape of ∂y/∂x (y ∈ ℝᵐ, x ∈ ℝⁿ) |
|---|---|---|
| Numerator layout | 1 × n row vector | m × n Jacobian |
| Denominator layout | n × 1 column vector | n × m (transposed Jacobian) |
PyTorch follows the denominator-layout convention: the gradient stored in `x.grad` always has the same shape as `x` itself.
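A minimal check of this convention (the function name `layout_convention_demo` is ours; the imports below are shared by the remaining snippets in this note):

import torch
import torch.nn.functional as F  # used by the softmax / LayerNorm snippets below

def layout_convention_demo():
    """The gradient returned by autograd always has the shape of the input tensor."""
    W = torch.randn(3, 4, requires_grad=True)
    x = torch.randn(4, requires_grad=True)
    loss = (W @ x).sum()      # scalar loss
    loss.backward()
    print(W.grad.shape)       # torch.Size([3, 4]) -- same shape as W
    print(x.grad.shape)       # torch.Size([4])    -- same shape as x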
1.2 Basic Differentiation Rules
Differentiating a vector with respect to a vector:
def numerical_jacobian(f, x, eps=1e-5):
"""
数值计算Jacobian矩阵(验证用)
f: 函数 y = f(x)
x: 输入向量
"""
y = f(x)
n = x.shape[0]
m = y.shape[0]
J = torch.zeros(m, n)
for j in range(n):
x_plus = x.clone()
x_minus = x.clone()
x_plus[j] += eps
x_minus[j] -= eps
J[:, j] = (f(x_plus) - f(x_minus)) / (2 * eps)
    return J

2. The Matrix Form of the Chain Rule
2.1 The Scalar Chain Rule, Revisited
For scalars $z = f(y)$ with $y = g(x)$: $\dfrac{dz}{dx} = \dfrac{dz}{dy} \cdot \dfrac{dy}{dx}$
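A tiny, purely illustrative sanity check that autograd reproduces the scalar chain rule:

def scalar_chain_rule_demo():
    """dz/dx for z = f(g(x)) with g(x) = 3x + 1 and f(y) = y^2."""
    x = torch.tensor(2.0, requires_grad=True)
    y = 3 * x + 1      # dy/dx = 3
    z = y ** 2         # dz/dy = 2y = 14 at x = 2
    z.backward()
    print(x.grad)      # dz/dx = (dz/dy) * (dy/dx) = 14 * 3 = 42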
2.2 The Vector Chain Rule
Case 1: a scalar $z = f(\mathbf{y})$ composed with a vector-valued $\mathbf{y} = g(\mathbf{x})$. Then $\nabla_{\mathbf{x}} z = J_g(\mathbf{x})^{\top} \nabla_{\mathbf{y}} z$, where $J_g$ is the Jacobian of $g$.
def chain_rule_example():
"""
示例:链式法则的数值验证
"""
# 定义函数
def g(x):
# y = W @ x + b
W = torch.tensor([[2.0, 1.0], [1.0, 3.0]])
b = torch.tensor([1.0, 2.0])
return W @ x + b
def f(y):
# z = sum(y^2)
return (y ** 2).sum()
    # Input
x = torch.tensor([1.0, 2.0], requires_grad=True)
    # Forward pass
y = g(x)
z = f(y)
    # Backward pass
z.backward()
    # Numerical check via central differences
eps = 1e-5
numerical_grad = torch.zeros(2)
for i in range(2):
x_plus = x.clone()
x_plus[i] += eps
x_minus = x.clone()
x_minus[i] -= eps
numerical_grad[i] = (f(g(x_plus)) - f(g(x_minus))) / (2 * eps)
print(f"Analytical gradient: {x.grad}")
print(f"Numerical gradient: {numerical_grad}")
print(f"Match: {torch.allclose(x.grad, numerical_grad, atol=1e-4)}")2.3 矩阵乘法的Jacobian
Key formulas: for $Y = WX$ with upstream gradient $G = \dfrac{\partial L}{\partial Y}$, the gradients are
$$\frac{\partial L}{\partial W} = G X^{\top}, \qquad \frac{\partial L}{\partial X} = W^{\top} G$$
def matrix_multiplication_gradients():
"""
矩阵乘法的梯度计算
"""
# 设置
W = torch.randn(3, 4, requires_grad=True)
X = torch.randn(4, 5, requires_grad=True)
    # Forward
Y = W @ X
    # Assume an upstream gradient coming from later layers
grad_output = torch.randn(3, 5)
Y.backward(grad_output)
    # Analytical gradients
# grad_W = grad_output @ X^T
expected_grad_W = grad_output @ X.T
# grad_X = W^T @ grad_output
expected_grad_X = W.T @ grad_output
print(f"W grad match: {torch.allclose(W.grad, expected_grad_W)}")
print(f"X grad match: {torch.allclose(X.grad, expected_grad_X)}")3. 逐元素运算的Jacobian
3.1 ReLU
def relu_jacobian(x):
"""
ReLU的Jacobian矩阵是对角矩阵
"""
mask = (x > 0).float()
return torch.diag(mask)
def relu_backward(x, grad_output):
"""
ReLU的反向传播
"""
# Jacobian-向量乘积
mask = (x > 0).float()
    return grad_output * mask
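In practice the diagonal Jacobian is never materialized: the element-wise mask above already implements the Jacobian-vector product. A quick check against autograd (a sketch; the helper name is ours):

def relu_backward_check():
    """Compare the manual ReLU backward pass with autograd."""
    x = torch.randn(5, requires_grad=True)
    grad_output = torch.randn(5)
    torch.relu(x).backward(grad_output)
    manual = relu_backward(x.detach(), grad_output)
    print(torch.allclose(x.grad, manual))  # True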
3.2 Softmax
def softmax_jacobian(x):
    """
    The Jacobian of softmax:
    J[i,j] = softmax(x)_i * (delta[i,j] - softmax(x)_j)
    """
    s = F.softmax(x, dim=-1)
    # Build the full Jacobian via broadcasting:
    # J[i,j] = s[i] * (delta[i,j] - s[j])
return s.unsqueeze(-1) * (torch.eye(x.shape[-1], device=x.device) - s.unsqueeze(-2))
def softmax_backward(x, grad_output):
    """
    Backward pass of softmax (a more efficient implementation).
    """
s = F.softmax(x, dim=-1)
    # Option 1: materialize the full Jacobian (kept as a comment below)
# J = s.unsqueeze(-1) * (torch.eye(len(s)) - s.unsqueeze(-2))
# return grad_output @ J
    # Option 2: simplified computation,
    # exploiting the special structure of the softmax Jacobian
sum_grad = (s * grad_output).sum(dim=-1, keepdim=True)
    return s * (grad_output - sum_grad)
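A minimal consistency check of the simplified backward pass against autograd's vector-Jacobian product (a sketch; the function name is ours):

def softmax_backward_check():
    """Compare softmax_backward with autograd."""
    x = torch.randn(6, requires_grad=True)
    grad_output = torch.randn(6)
    F.softmax(x, dim=-1).backward(grad_output)
    manual = softmax_backward(x.detach(), grad_output)
    print(torch.allclose(x.grad, manual, atol=1e-6))  # True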
3.3 LayerNorm
def layernorm_backward(x, y, gamma, beta, grad_output, eps=1e-5):
    """
    Backward pass of LayerNorm, where the forward pass is
    y = gamma * (x - mean) / sqrt(var + eps) + beta.
    (y and beta are accepted for interface symmetry but are not used below.)
    """
N = x.shape[-1]
    # Recompute intermediate quantities from the forward pass
mean = x.mean(dim=-1, keepdim=True)
var = x.var(dim=-1, keepdim=True, unbiased=False)
std = torch.sqrt(var + eps)
x_norm = (x - mean) / std
    # Element-wise path through gamma
dx_norm = grad_output * gamma
    # Path through the variance
dvar = (dx_norm * (x - mean) * (-0.5) * std**(-3)).sum(dim=-1, keepdim=True)
    # Path through the mean
    dmean = (-dx_norm / std).sum(dim=-1, keepdim=True)
    dmean += dvar * (-2.0 / N) * (x - mean).sum(dim=-1, keepdim=True)  # analytically zero; kept for completeness
    # Path through x itself
dx = dx_norm / std
dx += dvar * 2.0 * (x - mean) / N
dx += dmean / N
    # Gradients for gamma and beta: reduce over all leading (batch) dimensions
    dgamma = (grad_output * x_norm).reshape(-1, N).sum(dim=0)
    dbeta = grad_output.reshape(-1, N).sum(dim=0)
    return dx, dgamma, dbeta
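An end-to-end check against PyTorch's own LayerNorm gradients, under the assumption that x has shape (batch, N); the function name is ours:

def layernorm_backward_check():
    """Compare the manual LayerNorm backward pass with autograd."""
    x = torch.randn(8, 16, requires_grad=True)
    gamma = torch.ones(16, requires_grad=True)
    beta = torch.zeros(16, requires_grad=True)
    out = F.layer_norm(x, (16,), weight=gamma, bias=beta, eps=1e-5)
    grad_output = torch.randn_like(out)
    out.backward(grad_output)
    dx, dgamma, dbeta = layernorm_backward(x.detach(), out.detach(), gamma.detach(),
                                           beta.detach(), grad_output)
    print(torch.allclose(x.grad, dx, atol=1e-4))          # True (up to float32 precision)
    print(torch.allclose(gamma.grad, dgamma, atol=1e-4))  # True
    print(torch.allclose(beta.grad, dbeta, atol=1e-4))    # True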
4. Automatic Differentiation Frameworks
4.1 Reverse-Mode Automatic Differentiation in PyTorch
def pytorch_autograd_demo():
"""
PyTorch自动微分演示
"""
# 创建张量,启用梯度追踪
x = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)
    # Define the computation graph
y = x ** 2 # y = x^2
z = y.sum() # z = sum(y)
    # Backpropagate
z.backward()
# dz/dx = 2x
print(f"Gradient: {x.grad}") # [2., 4., 6.]
    # Verify
assert torch.allclose(x.grad, 2 * x)
class MyFunction(torch.autograd.Function):
"""
自定义可微函数
"""
@staticmethod
def forward(ctx, input):
        # Save tensors needed for the backward pass
ctx.save_for_backward(input)
return input ** 2
@staticmethod
def backward(ctx, grad_output):
        # Retrieve the saved tensors
input, = ctx.saved_tensors
        # Return the gradient with respect to the input
return grad_output * 2 * input
def custom_function_demo():
"""
自定义函数演示
"""
x = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)
    # Apply the custom function
y = MyFunction.apply(x)
z = y.sum()
z.backward()
print(f"Gradient: {x.grad}") # [2., 4., 6.]4.2 Jacobian与梯度
4.2 Jacobian vs. Gradient
def jacobian_vs_gradient():
    """
    Jacobian vs. gradient.
    """
    # Single (scalar) output: a gradient
x = torch.randn(3, requires_grad=True)
    y = (x ** 2).sum()  # scalar
y.backward()
print(f"Gradient (scalar output): {x.grad}") # 向量
    # Multiple outputs: a Jacobian
    x = torch.randn(3, requires_grad=True)
    y = x ** 2  # vector
    # Option 1: row by row
jacobian = torch.zeros(3, 3)
    for i in range(3):
        y[i].backward(retain_graph=True)
        jacobian[i] = x.grad.clone()
        x.grad.zero_()
print(f"Jacobian (vector output):\n{jacobian}") # 对角矩阵
# 方法2:使用 autograd.functional.jacobian
from torch.autograd.functional import jacobian
jacobian2 = jacobian(lambda x: x ** 2, x)
print(f"Jacobian (functional):\n{jacobian2}")5. 梯度流理论
5.1 Continuous-Time Gradient Descent
Discrete gradient descent: $\theta_{t+1} = \theta_t - \eta \nabla L(\theta_t)$
In the continuous limit ($\eta \to 0$), the iterates follow the ODE $\dfrac{d\theta}{dt} = -\nabla L(\theta)$.
This is the definition of gradient flow.
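To make the limit concrete, here is a purely illustrative sketch comparing gradient-descent iterates on L(θ) = ½θ² with the exact gradient-flow solution θ(t) = θ₀·exp(−t):

import math

def gradient_flow_vs_descent(theta0=1.0, lr=0.01, steps=200):
    """Compare discrete gradient descent with the analytic gradient-flow solution."""
    theta = theta0
    for _ in range(steps):
        grad = theta                          # dL/dtheta for L = 0.5 * theta^2
        theta = theta - lr * grad
    flow = theta0 * math.exp(-lr * steps)     # theta(t) evaluated at t = lr * steps
    print(f"gradient descent: {theta:.6f}")
    print(f"gradient flow:    {flow:.6f}")    # nearly identical for small lr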
5.2 Evolution of the Gradient Norm
from collections import defaultdict

def gradient_flow_analysis(model, dataloader, optimizer):
    """
    Track how the gradient norm evolves during training.
    Assumes a compute_loss(model, batch) helper is defined elsewhere.
    """
    history = {
        'gradient_norm': [],
        'loss': [],
        'layer_grad_norms': defaultdict(list)
    }
for batch in dataloader:
        # Forward
loss = compute_loss(model, batch)
        # Backward
loss.backward()
        # Record per-layer and total gradient norms
total_norm = 0
for name, p in model.named_parameters():
if p.grad is not None:
param_norm = p.grad.data.norm(2)
total_norm += param_norm.item() ** 2
history['layer_grad_norms'][name].append(param_norm.item())
total_norm = total_norm ** 0.5
history['gradient_norm'].append(total_norm)
history['loss'].append(loss.item())
        # Optimizer step
optimizer.step()
optimizer.zero_grad()
    return history

5.3 Vanishing and Exploding Gradients
def diagnose_gradient_issues(model):
"""
诊断梯度消失/爆炸问题
"""
issues = {
'vanishing': [],
'exploding': [],
'healthy': []
}
for name, p in model.named_parameters():
if p.grad is None:
continue
grad_norm = p.grad.data.norm(2).item()
if grad_norm < 1e-7:
issues['vanishing'].append({
'layer': name,
'norm': grad_norm
})
elif grad_norm > 100:
issues['exploding'].append({
'layer': name,
'norm': grad_norm
})
else:
issues['healthy'].append({
'layer': name,
'norm': grad_norm
})
    return issues

6. Second Derivatives and Curvature
6.1 Computing the Hessian
def compute_hessian_vector_product(model, dataloader, v):
    """
    Compute a Hessian-vector product: H @ v = grad(grad(L)^T @ v).
    Assumes a compute_loss(model, dataloader) helper is defined elsewhere.
    """
    model.zero_grad()
    # Compute the loss
    loss = compute_loss(model, dataloader)
    # First backward pass: keep the graph so the gradients can be differentiated again
    params = [p for p in model.parameters() if p.requires_grad]
    grads = torch.autograd.grad(loss, params, create_graph=True)
    flat_grads = torch.cat([g.flatten() for g in grads])
    # Compute grad^T @ v
    grad_dot_v = torch.dot(flat_grads, v)
    # Second backward pass: differentiating grad^T @ v yields H @ v
    hvp_grads = torch.autograd.grad(grad_dot_v, params)
    hvp = torch.cat([g.flatten() for g in hvp_grads])
    return hvp
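Because compute_loss above is only a schematic helper, here is a self-contained sanity check of the same double-backward trick on a quadratic whose Hessian is known exactly (all names are ours):

def hvp_sanity_check():
    """For L(x) = 0.5 * x^T A x with symmetric A, the Hessian is A, so H @ v = A @ v."""
    A = torch.randn(4, 4)
    A = (A + A.T) / 2                  # symmetrize so that grad = A @ x
    x = torch.randn(4, requires_grad=True)
    v = torch.randn(4)
    loss = 0.5 * x @ A @ x
    (grad,) = torch.autograd.grad(loss, x, create_graph=True)
    (hvp,) = torch.autograd.grad(grad @ v, x)
    print(torch.allclose(hvp, A @ v, atol=1e-5))  # True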
def power_iteration_hessian(model, dataloader, n_iter=10):
    """
    Estimate the dominant eigenvector/eigenvalue of the Hessian via power iteration.
    """
    # Random initialization
    v = torch.randn(sum(p.numel() for p in model.parameters()))
    v = v / v.norm()
    for _ in range(n_iter):
        # Hessian @ v
        Hv = compute_hessian_vector_product(model, dataloader, v)
        # Normalize; no deflation is needed when only the top eigenpair is wanted
        v = Hv / Hv.norm()
    # The Rayleigh quotient gives the corresponding eigenvalue
    Hv = compute_hessian_vector_product(model, dataloader, v)
    eigenvalue = torch.dot(v, Hv).item()
    return v, eigenvalue

6.2 Curvature and Optimization
def analyze_curvature(model, dataloader, direction):
"""
分析损失在给定方向上的曲率
"""
# 计算Hessian @ direction
Hd = compute_hessian_vector_product(model, dataloader, direction)
# 曲率 = direction^T @ H @ direction
curvature = torch.dot(direction, Hd).item()
return {
'curvature': curvature,
'direction_norm': direction.norm().item(),
'Hd_norm': Hd.norm().item()
    }

7. In Practice: Custom Gradients
7.1 Gradient Clipping
class ClippedReLU(torch.autograd.Function):
"""
带梯度裁剪的ReLU
"""
@staticmethod
def forward(ctx, input, max_val=1.0):
ctx.save_for_backward(input)
ctx.max_val = max_val
return input.clamp(min=0, max=max_val)
@staticmethod
def backward(ctx, grad_output):
input, = ctx.saved_tensors
max_val = ctx.max_val
        # Clip the gradient: zero it outside the active range [0, max_val]
grad_input = grad_output.clone()
grad_input[input < 0] = 0
grad_input[input > max_val] = 0
return grad_input, None
def clipped_relu_demo():
"""
演示带梯度裁剪的ReLU
"""
x = torch.tensor([-1.0, 0.5, 1.5, 2.0], requires_grad=True)
y = ClippedReLU.apply(x, 1.0)
z = y.sum()
z.backward()
print(f"Input: {x}")
print(f"Output: {y}")
print(f"Gradient: {x.grad}") # [0, 0.5, 1.0, 0]7.2 禁止某些路径的梯度
def stop_gradient():
"""
停止梯度的几种方法
"""
# 方法1:detach()
x = torch.randn(3, requires_grad=True)
y = x ** 2
    z = y.detach()  # z no longer tracks gradients
    # Method 2: detach_()
x = torch.randn(3, requires_grad=True)
y = x ** 2
    y_detached = y.detach_()  # in-place detach
    # Method 3: with torch.no_grad()
x = torch.randn(3, requires_grad=True)
with torch.no_grad():
        y = x ** 2  # gradients are not tracked

8. Summary
Key Takeaways
- Matrix calculus extends scalar calculus to derivatives involving vectors and matrices
- The chain rule is the core of backpropagation; the Jacobians must be tracked correctly
- Automatic differentiation frameworks such as PyTorch compute these derivatives automatically
- Gradient flow is the tool for understanding the continuous limit of discrete gradient descent
- Second derivatives (the Hessian) capture the curvature of the loss function
Key Formulas
Chain rule (vector form): for $z = f(\mathbf{y})$, $\mathbf{y} = g(\mathbf{x})$: $\nabla_{\mathbf{x}} z = J_g(\mathbf{x})^{\top} \nabla_{\mathbf{y}} z$
Matrix multiplication gradients: for $Y = WX$: $\dfrac{\partial L}{\partial W} = \dfrac{\partial L}{\partial Y} X^{\top}$ and $\dfrac{\partial L}{\partial X} = W^{\top} \dfrac{\partial L}{\partial Y}$
Gradient flow: $\dfrac{d\theta}{dt} = -\nabla L(\theta)$