# Gaussian Process Fundamentals
A Gaussian process (Gaussian Process, GP) is a stochastic process defined over a continuous domain: the joint distribution of its values at any finite set of points is multivariate Gaussian. A GP places a Bayesian prior over functions and provides a unifying framework for kernel methods and Bayesian inference.[^1]
## Definitions and Basic Concepts
### Definition of a Gaussian Process
A Gaussian process is a stochastic process defined on a continuous input space $\mathcal{X} \subseteq \mathbb{R}^d$ satisfying:

$$f(x) \sim \mathcal{GP}\big(m(x),\, k(x, x')\big)$$

where:

- $m(x) = \mathbb{E}[f(x)]$ is the mean function
- $k(x, x') = \mathbb{E}\big[(f(x) - m(x))(f(x') - m(x'))\big]$ is the covariance function (kernel)

For any finite set of points $X = \{x_1, \dots, x_n\}$, the function values $\mathbf{f} = (f(x_1), \dots, f(x_n))^\top$ have the joint distribution:

$$\mathbf{f} \sim \mathcal{N}(\boldsymbol{\mu}, K)$$

where $\boldsymbol{\mu}_i = m(x_i)$ and $K_{ij} = k(x_i, x_j)$.
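This marginalization property can be checked directly: build the kernel matrix over a handful of points and sample from the corresponding multivariate Gaussian. A minimal sketch (the inline `rbf` helper is a simplified stand-in for the kernels implemented later in this section):

```python
import numpy as np

def rbf(a, b, lengthscale=1.0):
    # Simplified 1-D RBF kernel: k(x, x') = exp(-(x - x')^2 / (2 l^2))
    return np.exp(-0.5 * (a[:, None] - b[None, :])**2 / lengthscale**2)

x = np.linspace(0, 1, 5)                  # five input points
K = rbf(x, x) + 1e-8 * np.eye(5)          # kernel matrix plus jitter
f = np.random.multivariate_normal(np.zeros(5), K, size=3)
print(f.shape)  # (3, 5): each row is one prior draw evaluated at x
```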
The following is a reference implementation of exact GP regression:

```python
import numpy as np


class GaussianProcess:
    """Exact Gaussian process regression."""

    def __init__(self, kernel, noise_std=0.1):
        self.kernel = kernel
        self.noise_std = noise_std
        self.X_train = None
        self.y_train = None
        self.K = None
        self.K_inv = None

    def fit(self, X, y):
        """Fit the GP (factorize the noise-augmented covariance matrix)."""
        self.X_train = np.array(X)
        self.y_train = np.array(y)
        # Covariance matrix K(X, X) plus observation noise on the diagonal
        self.K = self.kernel(self.X_train, self.X_train)
        self.K += self.noise_std**2 * np.eye(len(X))
        # Cholesky factorization for numerical stability;
        # if the matrix is ill-conditioned, add a small jitter and retry
        try:
            self.L = np.linalg.cholesky(self.K)
        except np.linalg.LinAlgError:
            self.K += 1e-6 * np.eye(len(X))
            self.L = np.linalg.cholesky(self.K)
        self.K_inv = np.linalg.inv(self.K)

    def predict(self, X_star):
        """
        Posterior mean and variance at new input points.

        Returns:
            mean: posterior mean
            var: posterior variance
        """
        X_star = np.array(X_star)
        # Cross-covariances between training points and new points
        K_star = self.kernel(self.X_train, X_star)    # (n_train, n_star)
        K_star_star = self.kernel(X_star, X_star)     # (n_star, n_star)
        # Posterior mean: mu_* = K_*^T K^{-1} y
        mean = K_star.T @ self.K_inv @ self.y_train
        # Posterior variance: sigma^2_* = K_** - K_*^T K^{-1} K_*
        var = np.diag(K_star_star - K_star.T @ self.K_inv @ K_star)
        var = np.maximum(var, 0)  # clip tiny negatives from round-off
        return mean, var

    def sample(self, X, n_samples=1):
        """Draw function samples from the GP prior."""
        mean = np.zeros(len(X))
        cov = self.kernel(X, X)
        # Small jitter on the diagonal for numerical stability
        cov += 1e-6 * np.eye(len(X))
        return np.random.multivariate_normal(mean, cov, n_samples)


# A standalone kernel function
def rbf_kernel(X1, X2, lengthscale=1.0, variance=1.0):
    """
    RBF (radial basis function) / Gaussian kernel:
    k(x, x') = sigma^2 exp(-||x - x'||^2 / (2 l^2))
    """
    X1 = np.atleast_2d(X1)
    X2 = np.atleast_2d(X2)
    # Squared Euclidean distances via ||a - b||^2 = ||a||^2 + ||b||^2 - 2 a.b
    dist_sq = (
        np.sum(X1**2, axis=1)[:, np.newaxis]
        + np.sum(X2**2, axis=1)[np.newaxis, :]
        - 2 * X1 @ X2.T
    )
    dist_sq = np.maximum(dist_sq, 0)
    return variance * np.exp(-dist_sq / (2 * lengthscale**2))
```
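A quick usage check of the class above on toy data (the inputs and kernel choice here are illustrative; any kernel with the same signature works):

```python
import numpy as np

X = np.linspace(0, 5, 10)[:, np.newaxis]   # training inputs, shape (10, 1)
y = np.sin(X).ravel()                      # noiseless targets

gp = GaussianProcess(kernel=rbf_kernel, noise_std=0.1)
gp.fit(X, y)
mean, var = gp.predict(np.array([[2.5]]))
print(mean, var)  # posterior mean and variance at x = 2.5
```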
## Kernel Functions (Covariance Functions)

The kernel determines the properties of functions drawn from the GP prior; it is the central design choice in GP modeling.
### Common Kernels
| Kernel | Formula | Properties |
|---|---|---|
| RBF / Squared Exponential | $\sigma^2 \exp\!\left(-\frac{d^2}{2\ell^2}\right)$ | Infinitely smooth |
| Matérn 3/2 | $\sigma^2 \left(1 + \frac{\sqrt{3}\,d}{\ell}\right) \exp\!\left(-\frac{\sqrt{3}\,d}{\ell}\right)$ | Moderately rough |
| Matérn 5/2 | $\sigma^2 \left(1 + \frac{\sqrt{5}\,d}{\ell} + \frac{5d^2}{3\ell^2}\right) \exp\!\left(-\frac{\sqrt{5}\,d}{\ell}\right)$ | Smoother |
| Periodic | $\sigma^2 \exp\!\left(-\frac{2\sin^2(\pi d / p)}{\ell^2}\right)$ | Periodic patterns |
| Linear | $\sigma^2 (x - c)(x' - c)$ | Linear functions |
| Constant | $\sigma^2$ | Constant offset |

Here $d = \|x - x'\|$, $\ell$ is the lengthscale, $\sigma^2$ the variance, and $p$ the period.
```python
class Kernels:
    """A collection of common kernel functions."""

    @staticmethod
    def rbf(X1, X2, lengthscale=1.0, variance=1.0):
        """RBF kernel."""
        X1 = np.atleast_2d(X1)
        X2 = np.atleast_2d(X2)
        # Pairwise squared distances
        dist_sq = np.sum((X1[:, np.newaxis] - X2[np.newaxis, :])**2, axis=-1)
        return variance * np.exp(-0.5 * dist_sq / lengthscale**2)

    @staticmethod
    def matern32(X1, X2, lengthscale=1.0, variance=1.0):
        """Matérn 3/2 kernel."""
        X1 = np.atleast_2d(X1)
        X2 = np.atleast_2d(X2)
        d = np.sqrt(np.sum((X1[:, np.newaxis] - X2[np.newaxis, :])**2, axis=-1))
        sqrt_3 = np.sqrt(3)
        return variance * (1 + sqrt_3 * d / lengthscale) * np.exp(-sqrt_3 * d / lengthscale)

    @staticmethod
    def matern52(X1, X2, lengthscale=1.0, variance=1.0):
        """Matérn 5/2 kernel."""
        X1 = np.atleast_2d(X1)
        X2 = np.atleast_2d(X2)
        d = np.sqrt(np.sum((X1[:, np.newaxis] - X2[np.newaxis, :])**2, axis=-1))
        sqrt_5 = np.sqrt(5)
        return variance * (
            1 + sqrt_5 * d / lengthscale + 5 * d**2 / (3 * lengthscale**2)
        ) * np.exp(-sqrt_5 * d / lengthscale)

    @staticmethod
    def periodic(X1, X2, lengthscale=1.0, variance=1.0, period=1.0):
        """Periodic kernel; sin^2 makes it periodic in d with period p, so no
        explicit wrapping of the distances is needed."""
        X1 = np.atleast_2d(X1)
        X2 = np.atleast_2d(X2)
        d = np.sqrt(np.sum((X1[:, np.newaxis] - X2[np.newaxis, :])**2, axis=-1))
        return variance * np.exp(-2 * np.sin(np.pi * d / period)**2 / lengthscale**2)

    @staticmethod
    def linear(X1, X2, variance=1.0, c=0.0):
        """Linear kernel."""
        X1 = np.atleast_2d(X1)
        X2 = np.atleast_2d(X2)
        return variance * (X1 - c) @ (X2 - c).T

    @staticmethod
    def composite(ker1, ker2, op='add'):
        """Combine two kernels by addition or multiplication."""
        def combined(X1, X2, **kwargs):
            if op == 'add':
                return ker1(X1, X2, **kwargs) + ker2(X1, X2, **kwargs)
            elif op == 'mult':
                return ker1(X1, X2, **kwargs) * ker2(X1, X2, **kwargs)
        return combined
```

### Combining Kernels
Kernels can be combined by addition and multiplication; sums and products of valid kernels are again valid kernels:
```python
def combined_kernel(X1, X2):
    """Composite kernel: RBF + Linear + Periodic."""
    k_rbf = Kernels.rbf(X1, X2, lengthscale=0.5)
    k_lin = Kernels.linear(X1, X2, variance=0.1, c=0)
    k_per = Kernels.periodic(X1, X2, lengthscale=1.0, period=2*np.pi)
    return k_rbf + k_lin + k_per
```

## GP Regression and Inference
### Observation Model
Assume observations $y_i = f(x_i) + \epsilon_i$, where $\epsilon_i \sim \mathcal{N}(0, \sigma_n^2)$.
### Posterior Prediction
Given training data $\mathcal{D} = \{(x_i, y_i)\}_{i=1}^n$, the predictive distribution at a new point $x_*$ is:

$$f_* \mid X, \mathbf{y}, x_* \sim \mathcal{N}\big(\bar{f}_*,\, \mathbb{V}[f_*]\big)$$

where:

$$\bar{f}_* = \mathbf{k}_*^\top (K + \sigma_n^2 I)^{-1} \mathbf{y}, \qquad
\mathbb{V}[f_*] = k(x_*, x_*) - \mathbf{k}_*^\top (K + \sigma_n^2 I)^{-1} \mathbf{k}_*$$
```python
def gp_regression(X_train, y_train, X_test, kernel, noise_std=0.1):
    """
    Gaussian process regression.

    Args:
        X_train: training inputs (n, d)
        y_train: training targets (n,)
        X_test: test inputs (m, d)
        kernel: kernel function
        noise_std: observation-noise standard deviation

    Returns:
        mean: predictive mean (m,)
        var: predictive variance (m,)
        std: predictive standard deviation (m,)
    """
    n = len(X_train)
    # Covariance matrices
    K = kernel(X_train, X_train) + noise_std**2 * np.eye(n)
    K_star = kernel(X_train, X_test)
    K_star_star = kernel(X_test, X_test)
    # Cholesky factorization (more numerically stable than explicit inversion)
    L = np.linalg.cholesky(K)
    # Posterior mean
    alpha = np.linalg.solve(L.T, np.linalg.solve(L, y_train))
    mean = K_star.T @ alpha
    # Posterior variance (diagonal only)
    v = np.linalg.solve(L, K_star)
    var = np.diag(K_star_star - v.T @ v)
    var = np.maximum(var, 1e-10)
    return mean, var, np.sqrt(var)


# Example: fitting a function with GP regression
def demo_gp_regression():
    # Ground-truth function
    def true_function(x):
        return np.sin(x * 3) + 0.5 * np.sin(x * 7)

    # Training data
    np.random.seed(42)
    X_train = np.linspace(0, 2*np.pi, 20)[:, np.newaxis]
    y_train = true_function(X_train).ravel() + np.random.normal(0, 0.1, len(X_train))

    # Test points
    X_test = np.linspace(0, 2*np.pi, 200)[:, np.newaxis]

    # GP regression with a sum kernel
    def k(X1, X2):
        return Kernels.rbf(X1, X2, lengthscale=0.5) + Kernels.matern32(X1, X2, lengthscale=1.0)

    mean, var, std = gp_regression(X_train, y_train, X_test, k, noise_std=0.1)
    print(f"Prediction range: [{mean.min():.3f}, {mean.max():.3f}]")
    print(f"Predictive uncertainty: [{std.min():.3f}, {std.max():.3f}]")
    return X_train, y_train, X_test, mean, std
```

## Computational Complexity of GPs
### The Problem
Exact GP inference requires inverting (or factorizing) the $n \times n$ covariance matrix, which costs $O(n^3)$ time and $O(n^2)$ memory, where $n$ is the number of training samples. This limits the use of GPs on large datasets.
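The cubic scaling is easy to observe empirically; a rough sketch (absolute timings are machine-dependent and only illustrative):

```python
import time
import numpy as np

for n in [500, 1000, 2000]:
    A = np.random.randn(n, n)
    K = A @ A.T + n * np.eye(n)       # a well-conditioned SPD matrix
    t0 = time.perf_counter()
    np.linalg.cholesky(K)
    # Expect roughly an 8x increase per doubling of n
    print(n, f"{time.perf_counter() - t0:.4f} s")
```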
### Approximation Methods
```python
class SparseGP:
    """
    Sparse Gaussian process with inducing points.
    Reduces the cost from O(n^3) to O(n m^2 + m^3), where m is the number
    of inducing points.
    """

    def __init__(self, kernel, n_inducing=50, noise_std=0.1):
        self.kernel = kernel
        self.n_inducing = n_inducing
        self.noise_std = noise_std
        self.inducing_inputs = None
        self.q_mean = None
        self.q_cov = None

    def fit(self, X, y):
        """Select inducing points and compute the variational posterior q(u)."""
        X = np.asarray(X)
        y = np.asarray(y).ravel()
        self.inducing_inputs = self._select_inducing_points(X)
        K_uu = self.kernel(self.inducing_inputs, self.inducing_inputs) \
            + 1e-6 * np.eye(self.n_inducing)
        K_uf = self.kernel(self.inducing_inputs, X)
        # Optimal q(u) = N(m, S) in closed form (Titsias-style sparse GP):
        #   Sigma = (K_uu + sigma^{-2} K_uf K_fu)^{-1}
        #   m = sigma^{-2} K_uu Sigma K_uf y,   S = K_uu Sigma K_uu
        sigma2 = self.noise_std**2
        Sigma = np.linalg.inv(K_uu + K_uf @ K_uf.T / sigma2)
        self.q_mean = K_uu @ Sigma @ K_uf @ y / sigma2
        self.q_cov = K_uu @ Sigma @ K_uu
        self._K_uu_inv = np.linalg.inv(K_uu)

    def predict(self, X_star):
        """Predictive mean at new points: K_*u K_uu^{-1} m."""
        K_su = self.kernel(X_star, self.inducing_inputs)
        return K_su @ self._K_uu_inv @ self.q_mean

    def _select_inducing_points(self, X):
        """Pick inducing points by random subsampling (k-means is a common alternative)."""
        indices = np.random.choice(len(X), self.n_inducing, replace=False)
        return X[indices]
```

### Other Approximation Methods
| Method | Complexity | Idea |
|---|---|---|
| Nyström approximation | $O(nm^2 + m^3)$ | Approximate the kernel matrix from $m$ landmark (inducing) points |
| Random feature expansion | $O(nD^2 + D^3)$ | Recast the kernel as an inner product of $D$ random Fourier features |
| Sparse inducing points | $O(nm^2 + m^3)$ | Variational inference, marginalizing over $m$ inducing variables |
| Distributed GPs | $O((n/p)^3)$ per worker | Shard the data across $p$ workers and compute in parallel |

with $m, D \ll n$.
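As an illustration of the first row, a minimal Nyström sketch (landmarks chosen by random subsampling; `Kernels.rbf` from earlier serves as the kernel):

```python
import numpy as np

def nystrom_approximation(X, kernel, m=50):
    """Rank-m approximation K ~= K_nm K_mm^{-1} K_mn from m landmark points."""
    idx = np.random.choice(len(X), m, replace=False)
    X_m = X[idx]
    K_mm = kernel(X_m, X_m) + 1e-6 * np.eye(m)   # jitter for invertibility
    K_nm = kernel(X, X_m)
    return K_nm @ np.linalg.solve(K_mm, K_nm.T)

X = np.random.randn(500, 2)
K_approx = nystrom_approximation(X, Kernels.rbf, m=50)
print(K_approx.shape)  # (500, 500), but built from O(n m^2) work
```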
```python
class RandomFourierFeatures:
    """
    Random Fourier features: approximate the RBF kernel as the inner
    product of a finite-dimensional random projection.
    """

    def __init__(self, n_features=100, lengthscale=1.0):
        self.n_features = n_features
        self.lengthscale = lengthscale
        self.W = None

    def fit(self, X):
        """Sample random frequencies from the kernel's spectral density."""
        X = np.atleast_2d(X)
        d = X.shape[1]
        # For the RBF kernel the spectral density is N(0, I / l^2)
        self.W = np.random.normal(0, 1 / self.lengthscale, (d, self.n_features // 2))

    def transform(self, X):
        """Map inputs to random features z(x) with z(x).z(x') ~= k(x, x')."""
        X = np.atleast_2d(X)
        projection = X @ self.W
        # Paired cosine/sine features (no random phase offset is needed
        # in this paired construction)
        z = np.concatenate([np.cos(projection), np.sin(projection)], axis=1)
        # Normalize so the inner product is an unbiased kernel estimate
        return np.sqrt(2 / self.n_features) * z
```
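A quick check of the approximation quality against the exact RBF kernel; the Monte Carlo error shrinks as `n_features` grows, at rate $O(1/\sqrt{D})$:

```python
import numpy as np

np.random.seed(0)
X = np.random.randn(50, 3)

rff = RandomFourierFeatures(n_features=2000, lengthscale=1.0)
rff.fit(X)
Z = rff.transform(X)

K_exact = Kernels.rbf(X, X, lengthscale=1.0)
K_approx = Z @ Z.T
print(np.max(np.abs(K_exact - K_approx)))  # small; decreases with more features
```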
## GPs and Neural Networks

### Infinite-Width Neural Networks as GPs

An important result in deep learning theory: a randomly initialized neural network becomes a Gaussian process in the infinite-width limit (the NNGP correspondence), and an infinitely wide network trained by gradient descent behaves like kernel regression with the neural tangent kernel (NTK).
```python
def neural_tangent_kernel_gp():
    """
    Neural tangent kernel (NTK) and the GP correspondence.

    In the infinite-width limit:
    - at random initialization, the network output is a GP (NNGP kernel);
    - under gradient-descent training, the network evolves like kernel
      regression with the NTK.
    """
    def ntk_kernel(x1, x2, depth=2, activation='relu'):
        """
        Highly simplified NTK sketch. The full NTK is built by a layer-wise
        recursion involving the activation and its derivative; only the
        base (input-layer) term is shown here, and `depth` is ignored.
        """
        if activation == 'relu':
            def phi(z):                      # activation
                return np.maximum(z, 0)

            def phi_prime(z):                # its derivative
                return (z > 0).astype(float)

        # Input-layer contribution; deeper layers would recurse on this
        K = x1 @ x2.T
        return K  # simplified version

    return ntk_kernel
```
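To make the NNGP side of the correspondence concrete: for a single hidden layer of ReLU units, the infinite-width covariance has a closed form, the order-1 arc-cosine kernel (Cho & Saul, 2009). A sketch, assuming unit weight variance and no bias terms:

```python
import numpy as np

def relu_nngp_kernel(X1, X2):
    """
    NNGP covariance of a one-hidden-layer ReLU network in the infinite-width
    limit (order-1 arc-cosine kernel):
        k(x, x') = (1 / 2pi) ||x|| ||x'|| (sin t + (pi - t) cos t),
    where t is the angle between x and x'.
    """
    norms1 = np.linalg.norm(X1, axis=1)
    norms2 = np.linalg.norm(X2, axis=1)
    cos_t = (X1 @ X2.T) / np.outer(norms1, norms2)
    cos_t = np.clip(cos_t, -1.0, 1.0)   # numerical safety
    t = np.arccos(cos_t)
    J = np.sin(t) + (np.pi - t) * np.cos(t)
    return np.outer(norms1, norms2) * J / (2 * np.pi)
```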
### GPs as Bayesian Posteriors

A GP can be viewed as the generalization of Bayesian linear regression to an infinite-dimensional function space:
| Component | Finite-dimensional (linear regression) | Infinite-dimensional (GP) |
|---|---|---|
| Parameters | weight vector $w$ | function $f$ |
| Prior | $p(w) = \mathcal{N}(0, \Sigma_p)$ | $f \sim \mathcal{GP}(m, k)$ |
| Likelihood | $p(y \mid x, w) = \mathcal{N}(w^\top x, \sigma_n^2)$ | $p(y \mid f, x) = \mathcal{N}(f(x), \sigma_n^2)$ |
| Posterior | $p(w \mid y) = \mathcal{N}(\mu, \Sigma)$ | $p(f \mid \mathcal{D})$, again a GP |
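The correspondence in the table can be verified numerically: a GP with linear kernel $k(x, x') = x^\top x'$ (prior $\Sigma_p = I$) reproduces the Bayesian linear regression posterior mean. A small sketch with synthetic data:

```python
import numpy as np

np.random.seed(0)
X = np.random.randn(10, 2)
w_true = np.array([1.0, -0.5])
sigma_n = 0.1
y = X @ w_true + sigma_n * np.random.randn(10)

# Bayesian linear regression: E[w | y] = (X^T X + s^2 I)^{-1} X^T y, prior w ~ N(0, I)
w_post = np.linalg.solve(X.T @ X + sigma_n**2 * np.eye(2), X.T @ y)
f_blr = X @ w_post

# GP with linear kernel K = X X^T: posterior mean K (K + s^2 I)^{-1} y
K = X @ X.T
f_gp = K @ np.linalg.solve(K + sigma_n**2 * np.eye(10), y)

print(np.allclose(f_blr, f_gp))  # True: identical posterior means
```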
## Applications of GPs
### 1. Hyperparameter Optimization (Bayesian Optimization)
```python
from scipy.optimize import minimize
from scipy.stats import norm


class BayesianOptimization:
    """GP-based Bayesian optimization (maximization)."""

    def __init__(self, kernel, n_restarts=5):
        self.gp = GaussianProcess(kernel)
        self.n_restarts = n_restarts
        self.X_observed = []
        self.y_observed = []

    def _acquisition_expected_improvement(self, X, xi=0.01):
        """Expected improvement (EI) acquisition function."""
        mean, var = self.gp.predict(X)
        std = np.sqrt(var)
        # Best observation so far
        y_best = np.max(self.y_observed)
        # Expected improvement over y_best (xi is an exploration margin)
        with np.errstate(divide='ignore', invalid='ignore'):
            z = (mean - y_best - xi) / std
            ei = (mean - y_best - xi) * norm.cdf(z) + std * norm.pdf(z)
        ei[std < 1e-10] = 0
        return ei

    def suggest_next_point(self, bounds):
        """Suggest the next point to evaluate by maximizing EI."""
        # Optimize the acquisition function from several random restarts
        best_ei = -np.inf
        best_x = None
        for _ in range(self.n_restarts):
            x0 = np.random.uniform(bounds[:, 0], bounds[:, 1])
            result = minimize(
                lambda x: -self._acquisition_expected_improvement(x.reshape(1, -1))[0],
                x0,
                bounds=bounds,
            )
            if -result.fun > best_ei:
                best_ei = -result.fun
                best_x = result.x
        return best_x

    def observe(self, X, y):
        """Record an observation and refit the GP."""
        self.X_observed.append(X)
        self.y_observed.append(y)
        self.gp.fit(self.X_observed, self.y_observed)
```
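A sketch of the optimization loop on a 1-D toy objective (reusing `rbf_kernel` from earlier; the objective and bounds here are illustrative):

```python
import numpy as np

def objective(x):
    return -(x - 2.0)**2  # maximum at x = 2

bounds = np.array([[0.0, 5.0]])
bo = BayesianOptimization(kernel=rbf_kernel)

# Seed with a few random evaluations, then alternate suggest/observe
for x0 in np.random.uniform(0, 5, size=3):
    bo.observe([x0], objective(x0))
for _ in range(10):
    x_next = bo.suggest_next_point(bounds)
    bo.observe(x_next, objective(x_next[0]))

print(f"best value found: {max(bo.y_observed):.4f}")
```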
### 2. Automated Machine Learning

```python
class GPBasedNAS:
    """GP-based neural architecture search (sketch)."""

    def __init__(self, search_space):
        self.search_space = search_space
        self.gp = GaussianProcess(Kernels.rbf)
        self.observations = []

    def fit(self, architecture, val_accuracy):
        """Record an architecture's validation performance and refit the GP."""
        self.observations.append({
            'architecture': architecture,
            'accuracy': val_accuracy,
        })
        # Refit the GP on all observations so far
        X = np.array([self._encode(o['architecture']) for o in self.observations])
        y = np.array([o['accuracy'] for o in self.observations])
        self.gp.fit(X, y)

    def suggest(self):
        """Suggest the next architecture to train."""
        # Use the EI acquisition to balance exploration and exploitation
        X_candidates = self._generate_candidates(100)
        ei = self._expected_improvement(X_candidates)
        best_idx = np.argmax(ei)
        return self._decode(X_candidates[best_idx])

    # _encode, _decode, _generate_candidates and _expected_improvement are
    # left abstract: they depend on how architectures are represented.
```

### Uncertainty Quantification
A key advantage of GPs is that they provide principled uncertainty estimates:
```python
def uncertainty_analysis(gp, X_test, y_true=None):
    """Decompose and sanity-check a GP's predictive uncertainty."""
    mean, var = gp.predict(X_test)
    std = np.sqrt(var)
    # 1. Confidence interval on the latent function
    confidence_95 = 1.96 * std
    # 2. Prediction interval (includes observation noise)
    prediction_interval = 1.96 * np.sqrt(var + gp.noise_std**2)
    # 3. Sources of uncertainty:
    #    - aleatoric: observation noise (irreducible)
    #    - epistemic: model uncertainty (shrinks with more data)
    epistemic = var
    aleatoric = gp.noise_std**2
    # 4. Calibration: empirical coverage of the 95% prediction interval
    if y_true is not None:
        coverage = np.mean(np.abs(y_true - mean) <= prediction_interval)
        print(f"Coverage of the 95% prediction interval: {coverage:.2%}")
    return {
        'mean': mean,
        'std': std,
        'epistemic': epistemic,
        'aleatoric': aleatoric,
        'confidence_95': confidence_95,
    }
```

## References
[^1]: Rasmussen, C. E., & Williams, C. K. I. (2006). *Gaussian Processes for Machine Learning*. MIT Press.