高斯过程基础

高斯过程(Gaussian Process, GP)是定义在连续域上的随机过程,其任意有限个点的联合分布为多元高斯分布。GP为函数提供了贝叶斯先验,是核方法与贝叶斯推断的统一框架。[^1]


定义与基本概念

高斯过程的定义

高斯过程 $f(x)$ 是定义在连续输入空间 $\mathcal{X}$ 上的随机过程,满足:

$$f(x) \sim \mathcal{GP}\big(m(x),\, k(x, x')\big)$$

其中:

  • $m(x) = \mathbb{E}[f(x)]$:均值函数
  • $k(x, x') = \mathrm{Cov}\big(f(x), f(x')\big)$:协方差函数(核函数)

对于任意有限点集 $\{x_1, \dots, x_n\}$,函数值 $\mathbf{f} = [f(x_1), \dots, f(x_n)]^\top$ 的联合分布为:

$$\mathbf{f} \sim \mathcal{N}(\boldsymbol{\mu}, K)$$

其中 $\mu_i = m(x_i)$,$K_{ij} = k(x_i, x_j)$。

import matplotlib.pyplot as plt
import numpy as np
from scipy.optimize import minimize
from scipy.stats import norm
 
class GaussianProcess:
    """Exact Gaussian process regression.

    Stores the training data together with the noise-augmented kernel
    matrix ``K = k(X, X) + σ_n² I`` and its inverse, then evaluates the
    closed-form GP posterior at new inputs.

    Fixed: on the ``LinAlgError`` fallback path the original never set
    ``self.L``, leaving it stale/undefined; the retry now re-runs the
    Cholesky factorization after adding jitter.
    """

    def __init__(self, kernel, noise_std=0.1):
        """
        Args:
            kernel: callable ``k(X1, X2)`` returning a covariance matrix.
            noise_std: standard deviation of the observation noise.
        """
        self.kernel = kernel
        self.noise_std = noise_std
        self.X_train = None
        self.y_train = None
        self.K = None       # k(X, X) + noise_std² I (after fit)
        self.K_inv = None   # inverse of self.K (after fit)

    def fit(self, X, y):
        """Fit the GP: build the training covariance and precompute its inverse.

        Args:
            X: training inputs, array-like accepted by ``self.kernel``.
            y: training targets.
        """
        self.X_train = np.array(X)
        self.y_train = np.array(y)

        # Covariance of the training inputs, plus observation noise on the
        # diagonal (this also regularizes the matrix).
        self.K = self.kernel(self.X_train, self.X_train)
        self.K += self.noise_std**2 * np.eye(len(X))

        # Factorize first (numerically stable check for positive-definiteness);
        # if ill-conditioned, add a small jitter and retry.
        try:
            self.L = np.linalg.cholesky(self.K)
        except np.linalg.LinAlgError:
            self.K += 1e-6 * np.eye(len(X))
            self.L = np.linalg.cholesky(self.K)
        # Explicit inverse kept because predict() uses it directly.
        self.K_inv = np.linalg.inv(self.K)

    def predict(self, X_star):
        """
        Posterior mean and variance at new inputs.

        Args:
            X_star: query inputs.

        Returns:
            mean: posterior mean at each query point.
            var: posterior (diagonal) variance at each query point.
        """
        X_star = np.array(X_star)

        # Cross-covariance train↔query and query↔query.
        K_star = self.kernel(self.X_train, X_star)  # (n_train, n_star)
        K_star_star = self.kernel(X_star, X_star)   # (n_star, n_star)

        # Posterior mean: μ_* = K_*^T K^{-1} y
        mean = K_star.T @ self.K_inv @ self.y_train

        # Posterior variance: σ²_* = K_{**} - K_*^T K^{-1} K_*
        var = np.diag(K_star_star - K_star.T @ self.K_inv @ K_star)
        # Clamp tiny negatives caused by floating-point round-off.
        var = np.maximum(var, 0)

        return mean, var

    def sample(self, X, n_samples=1):
        """Draw sample functions from the zero-mean GP prior at inputs X.

        Returns an array of shape (n_samples, len(X)).
        """
        mean = np.zeros(len(X))
        cov = self.kernel(X, X)

        # Small diagonal jitter for numerical stability of the sampler
        # (this is NOT the observation noise).
        cov += 1e-6 * np.eye(len(X))

        return np.random.multivariate_normal(mean, cov, n_samples)
 
# 定义核函数
def rbf_kernel(X1, X2, lengthscale=1.0, variance=1.0):
    """
    RBF (radial basis function / squared-exponential / Gaussian) kernel:

        k(x, x') = σ² exp(-‖x - x'‖² / (2ℓ²))

    Bug fixed: the original "expanded square" broadcasting never summed the
    squares over the feature axis and produced incompatible shapes
    ((n,1,d) + (1,m,d) + (n,1,m)), so it raised for multi-dimensional inputs
    and could not return an (n, m) matrix.

    Args:
        X1: array-like of shape (n, d); a 1-D array is treated as n points in 1-D.
        X2: array-like of shape (m, d); a 1-D array is treated as m points in 1-D.
        lengthscale: ℓ, controls smoothness.
        variance: σ², output scale.

    Returns:
        (n, m) kernel matrix.
    """
    X1 = np.asarray(X1, dtype=float)
    X2 = np.asarray(X2, dtype=float)
    # Promote 1-D inputs to column vectors (n points of dimension 1).
    if X1.ndim == 1:
        X1 = X1[:, np.newaxis]
    if X2.ndim == 1:
        X2 = X2[:, np.newaxis]

    # Pairwise squared Euclidean distances, reduced over the feature axis.
    dist_sq = np.sum((X1[:, np.newaxis, :] - X2[np.newaxis, :, :]) ** 2, axis=-1)

    return variance * np.exp(-dist_sq / (2 * lengthscale**2))

核函数(协方差函数)

核函数决定了 GP 先验中函数的性质,是 GP 的核心设计选择。

常用核函数

| 核函数 | 公式 | 特性 |
| --- | --- | --- |
| RBF / Squared Exponential | $\sigma^2 \exp\!\left(-\dfrac{\|x-x'\|^2}{2\ell^2}\right)$ | 无限可微,非常平滑 |
| Matérn 3/2 | $\sigma^2\left(1+\dfrac{\sqrt{3}\,d}{\ell}\right)\exp\!\left(-\dfrac{\sqrt{3}\,d}{\ell}\right)$ | 中等粗糙度 |
| Matérn 5/2 | $\sigma^2\left(1+\dfrac{\sqrt{5}\,d}{\ell}+\dfrac{5d^2}{3\ell^2}\right)\exp\!\left(-\dfrac{\sqrt{5}\,d}{\ell}\right)$ | 更平滑 |
| Periodic | $\sigma^2\exp\!\left(-\dfrac{2\sin^2(\pi d/p)}{\ell^2}\right)$ | 周期性模式 |
| Linear | $\sigma^2\,(x-c)^\top(x'-c)$ | 线性函数 |
| Constant | $\sigma^2$ | 常数偏移 |

(其中 $d = \|x - x'\|$。)
class Kernels:
    """Collection of common covariance (kernel) functions.

    All kernels take arrays of shape (n, d) and (m, d) and return an
    (n, m) kernel matrix.  Note that ``np.atleast_2d`` turns a 1-D input
    into a SINGLE d-dimensional point (shape (1, n)); pass column vectors
    for one-dimensional inputs.
    """

    @staticmethod
    def rbf(X1, X2, lengthscale=1.0, variance=1.0):
        """RBF kernel: σ² exp(-‖x - x'‖² / (2ℓ²))."""
        X1 = np.atleast_2d(X1)
        X2 = np.atleast_2d(X2)

        # Pairwise squared distances, reduced over the feature axis.
        dist_sq = np.sum((X1[:, np.newaxis] - X2[np.newaxis, :]) ** 2, axis=-1)
        return variance * np.exp(-0.5 * dist_sq / lengthscale**2)

    @staticmethod
    def mattern32(X1, X2, lengthscale=1.0, variance=1.0):
        """Matérn 3/2 kernel: σ² (1 + √3 d/ℓ) exp(-√3 d/ℓ), d = ‖x - x'‖."""
        X1 = np.atleast_2d(X1)
        X2 = np.atleast_2d(X2)

        d = np.sqrt(np.sum((X1[:, np.newaxis] - X2[np.newaxis, :]) ** 2, axis=-1))
        sqrt_3 = np.sqrt(3)

        return variance * (1 + sqrt_3 * d / lengthscale) * np.exp(-sqrt_3 * d / lengthscale)

    @staticmethod
    def mattern52(X1, X2, lengthscale=1.0, variance=1.0):
        """Matérn 5/2 kernel: σ² (1 + √5 d/ℓ + 5d²/(3ℓ²)) exp(-√5 d/ℓ)."""
        X1 = np.atleast_2d(X1)
        X2 = np.atleast_2d(X2)

        d = np.sqrt(np.sum((X1[:, np.newaxis] - X2[np.newaxis, :]) ** 2, axis=-1))
        sqrt_5 = np.sqrt(5)

        return variance * (
            1 + sqrt_5 * d / lengthscale + 5 * d**2 / (3 * lengthscale**2)
        ) * np.exp(-sqrt_5 * d / lengthscale)

    @staticmethod
    def periodic(X1, X2, lengthscale=1.0, variance=1.0, period=1.0):
        """Periodic kernel: σ² exp(-2 sin²(π d / p) / ℓ²), d = ‖x - x'‖.

        Bugs fixed: the original never reduced over the feature axis, so it
        returned an (n, m, d)-shaped array instead of an (n, m) matrix; it
        also wrapped distances into [0, 2π] regardless of ``period``, which
        distorts the kernel for any other period (the sine already encodes
        the periodicity).
        """
        X1 = np.atleast_2d(X1)
        X2 = np.atleast_2d(X2)

        # Euclidean distance; sin(π d / p) makes the kernel period-p.
        d = np.sqrt(np.sum((X1[:, np.newaxis] - X2[np.newaxis, :]) ** 2, axis=-1))

        return variance * np.exp(-2 * np.sin(d * np.pi / period) ** 2 / lengthscale**2)

    @staticmethod
    def linear(X1, X2, variance=1.0, c=0.0):
        """Linear (dot-product) kernel: σ² (x - c)ᵀ(x' - c)."""
        X1 = np.atleast_2d(X1)
        X2 = np.atleast_2d(X2)

        return variance * (X1 - c) @ (X2 - c).T

    @staticmethod
    def composite(ker1, ker2, op='add'):
        """Combine two kernels by addition ('add') or multiplication ('mult').

        Note: **kwargs are forwarded to BOTH kernels unchanged.
        Fixed: an unknown ``op`` now raises instead of silently returning None.
        """
        def combined(X1, X2, **kwargs):
            if op == 'add':
                return ker1(X1, X2, **kwargs) + ker2(X1, X2, **kwargs)
            elif op == 'mult':
                return ker1(X1, X2, **kwargs) * ker2(X1, X2, **kwargs)
            raise ValueError(f"unknown op: {op!r} (expected 'add' or 'mult')")
        return combined

核函数的组合

核函数可以通过加法和乘法组合:

def combined_kernel(X1, X2):
    """Composite kernel: sum of RBF, linear and periodic components."""
    parts = (
        Kernels.rbf(X1, X2, lengthscale=0.5),
        Kernels.linear(X1, X2, variance=0.1, c=0),
        Kernels.periodic(X1, X2, lengthscale=1.0, period=2*np.pi),
    )
    return parts[0] + parts[1] + parts[2]

GP 回归与推断

观测模型

设观测 $y_i = f(x_i) + \epsilon_i$,其中 $\epsilon_i \sim \mathcal{N}(0, \sigma_n^2)$ 为独立同分布的高斯观测噪声。

后验预测

给定训练数据 $\mathcal{D} = \{(x_i, y_i)\}_{i=1}^n$,新点 $x_*$ 的预测分布为:

$$f_* \mid \mathcal{D}, x_* \sim \mathcal{N}(\mu_*, \sigma_*^2)$$

其中:

$$\mu_* = k_*^\top (K + \sigma_n^2 I)^{-1} \mathbf{y}, \qquad \sigma_*^2 = k(x_*, x_*) - k_*^\top (K + \sigma_n^2 I)^{-1} k_*$$

def gp_regression(X_train, y_train, X_test, kernel, noise_std=0.1):
    """
    Gaussian process regression via a Cholesky factorization.

    Fixes: the docstring listed two return values although three are
    returned; the unused local ``m`` was removed; a jitter-and-retry
    fallback was added for ill-conditioned kernel matrices.

    Args:
        X_train: training inputs (n, d)
        y_train: training targets (n,)
        X_test: test inputs (m, d)
        kernel: covariance function k(X1, X2)
        noise_std: observation-noise standard deviation

    Returns:
        mean: predictive mean (m,)
        var: predictive variance (m,)
        std: predictive standard deviation (m,)
    """
    n = len(X_train)

    # Training, cross, and test covariance matrices.
    K = kernel(X_train, X_train) + noise_std**2 * np.eye(n)
    K_star = kernel(X_train, X_test)
    K_star_star = kernel(X_test, X_test)

    # Cholesky factorization: more numerically stable than an explicit inverse.
    try:
        L = np.linalg.cholesky(K)
    except np.linalg.LinAlgError:
        # Ill-conditioned K: add a small jitter and retry.
        K = K + 1e-6 * np.eye(n)
        L = np.linalg.cholesky(K)

    # Posterior mean: K_*^T (K + σ² I)^{-1} y via two triangular solves.
    alpha = np.linalg.solve(L.T, np.linalg.solve(L, y_train))
    mean = K_star.T @ alpha

    # Posterior variance (diagonal only): K_** - v^T v with v = L^{-1} K_*.
    v = np.linalg.solve(L, K_star)
    var = np.diag(K_star_star - v.T @ v)
    # Clamp round-off negatives so sqrt below is safe.
    var = np.maximum(var, 1e-10)

    return mean, var, np.sqrt(var)
 
# 示例:使用 GP 回归拟合函数
def demo_gp_regression():
    # 真实函数
    def true_function(x):
        return np.sin(x * 3) + 0.5 * np.sin(x * 7)
    
    # 训练数据
    np.random.seed(42)
    X_train = np.linspace(0, 2*np.pi, 20)[:, np.newaxis]
    y_train = true_function(X_train) + np.random.normal(0, 0.1, X_train.shape)
    
    # 测试点
    X_test = np.linspace(0, 2*np.pi, 200)[:, np.newaxis]
    
    # GP 回归
    mean, var = None, None
    def k(X1, X2):
        return Kernels.rbf(X1, X2, lengthscale=0.5) + Kernels.mattern32(X1, X2, lengthscale=1.0)
    
    mean, var, std = gp_regression(X_train, y_train, X_test, k, noise_std=0.1)
    
    print(f"预测范围: [{mean.min():.3f}, {mean.max():.3f}]")
    print(f"预测不确定性: [{std.min():.3f}, {std.max():.3f}]")
    
    return X_train, y_train, X_test, mean, std

GP 的计算复杂度

问题

直接计算 GP 的协方差矩阵求逆需要 $O(n^3)$ 时间,其中 $n$ 是训练样本数。这限制了 GP 在大规模数据集上的应用。

近似方法

class SparseGP:
    """
    Sparse Gaussian process using inducing points.

    Reduces the cost from O(n^3) to O(m^2 n + m^3), where m is the number
    of inducing points.

    NOTE(review): this class is a skeleton — ``fit`` initializes the
    variational parameters to N(0, I) but never optimizes them, so
    ``predict`` returns zeros until ``q_mean`` is actually learned.
    """

    def __init__(self, kernel, n_inducing=50):
        # kernel: callable k(X1, X2) -> covariance matrix
        # n_inducing: number m of inducing points to select
        self.kernel = kernel
        self.n_inducing = n_inducing
        self.inducing_inputs = None
        self.inducing_mean = None
        self.inducing_cov = None

    def fit(self, X, y):
        """Select inducing points and initialize the variational distribution q(u)."""
        self.inducing_inputs = self._select_inducing_points(X)

        # Covariances involving the inducing points.
        # NOTE(review): K_uu and K_uf are computed but never used below —
        # presumably intended for a variational lower bound / ELBO step that
        # is not implemented here; confirm before relying on fit(). ``y`` is
        # likewise unused.
        K_uu = self.kernel(self.inducing_inputs, self.inducing_inputs)
        K_uf = self.kernel(self.inducing_inputs, X)

        # Variational parameters q(u) = N(q_mean, q_cov), initialized to N(0, I).
        self.q_mean = np.zeros(self.n_inducing)
        self.q_cov = np.eye(self.n_inducing)

    def predict(self, X_star):
        """Predictive mean at X_star: K_su K_uu^{-1} q_mean (no variance returned)."""
        K_uu = self.kernel(self.inducing_inputs, self.inducing_inputs)
        K_su = self.kernel(X_star, self.inducing_inputs)

        # Posterior mean projected through the inducing points.
        # NOTE(review): explicit inverse; a Cholesky solve would be more stable.
        K_uu_inv = np.linalg.inv(K_uu)
        mean = K_su @ K_uu_inv @ self.q_mean

        return mean

    def _select_inducing_points(self, X):
        """Pick inducing points by uniform random subsampling (k-means is an
        alternative, not implemented here)."""
        indices = np.random.choice(len(X), self.n_inducing, replace=False)
        return X[indices]

其他近似方法

| 方法 | 复杂度 | 原理 |
| --- | --- | --- |
| Nyström 近似 | $O(m^2 n)$ | 使用 $m$ 个诱导点近似核矩阵 |
| 随机特征逼近 | $O(Dn)$ | 将核函数转化为 $D$ 维随机傅里叶特征 |
| 稀疏诱导点 | $O(m^2 n + m^3)$ | 变分推断,边缘化诱导变量 |
| 分布式 GP | 取决于分片 | 数据分片,并行计算 |
class RandomFourierFeatures:
    """
    Random Fourier features (RFF) approximation of the RBF kernel.

    Approximates k(x, x') = exp(-‖x - x'‖² / (2ℓ²)) by the inner product of
    finite-dimensional random cosine/sine projections, z(x)ᵀz(x') ≈ k(x, x').

    Bug fixed: ``fit`` referenced the bare name ``lengthscale`` (a NameError
    at call time) instead of ``self.lengthscale``.
    """

    def __init__(self, kernel, n_features=100, lengthscale=1.0):
        """
        Args:
            kernel: kept for API compatibility; not used by the approximation.
            n_features: total number of random features (should be even —
                features come in cos/sin pairs; an odd value yields
                n_features - 1 columns).
            lengthscale: RBF lengthscale ℓ.
        """
        self.kernel = kernel
        self.n_features = n_features
        self.lengthscale = lengthscale
        self.W = None  # random frequencies, shape (d, n_features // 2)
        self.b = None  # random phases, shape (n_features // 2,)

    def fit(self, X):
        """Sample the random projection matching the input dimensionality of X."""
        d = X.shape[1] if len(X.shape) > 1 else 1

        # Frequencies ~ N(0, 1/ℓ²): the spectral density of the RBF kernel.
        self.W = np.random.normal(0, 1 / self.lengthscale, (d, self.n_features // 2))
        self.b = np.random.uniform(0, 2 * np.pi, self.n_features // 2)

    def transform(self, X):
        """Map X of shape (n, d) to random features of shape (n, 2*(n_features//2))."""
        X = np.atleast_2d(X)
        projection = X @ self.W + self.b

        # cos/sin feature pairs; the phase cancels in the inner product.
        z = np.concatenate([np.cos(projection), np.sin(projection)], axis=1)

        # Scale so that z(x)ᵀz(x') is an unbiased estimate of the kernel.
        return np.sqrt(2 / self.n_features) * z

GP 与神经网络的联系

无限宽神经网络 = GP

深度学习理论的一个重要发现:具有随机初始化的无限宽神经网络等价于 GP。

def neural_tangent_kernel_gp():
    """
    Sketch of the Neural Tangent Kernel (NTK) / GP equivalence.

    For infinitely wide, randomly initialized networks the output is a GP
    whose covariance is given by the NTK.

    NOTE(review): this is illustrative only — it defines ``ntk_kernel`` in a
    local scope and implicitly returns None, so the kernel is not reachable
    by callers; presumably a placeholder to be completed.
    """

    # Definition of the NTK kernel (local; never invoked or returned).
    def ntk_kernel(x1, x2, depth=2, activation='relu'):
        """
        Compute a (highly simplified) NTK value for inputs x1, x2.

        NOTE(review): ``depth`` is unused, and ``phi``/``phi_prime`` are
        defined but never called — only the first-layer term below is
        computed; for activation != 'relu' nothing is defined at all.
        """
        if activation == 'relu':
            def phi(z):
                # ReLU activation.
                return np.maximum(z, 0)
            
            def phi_prime(z):
                # ReLU derivative (0/1 indicator).
                return (z > 0).astype(float)
        
        # First-layer contribution: plain inner product of the inputs.
        K = x1 @ x2.T
        
        return K  # simplified version

GP 作为贝叶斯后验

GP 可以看作贝叶斯线性回归在无限维函数空间中的推广:

| 组件 | 有限维(线性回归) | 无限维(GP) |
| --- | --- | --- |
| 参数 | 权重 $w \in \mathbb{R}^d$ | 函数 $f(\cdot)$ |
| 先验 | $p(w) = \mathcal{N}(0, \Sigma_p)$ | $f \sim \mathcal{GP}(m, k)$ |
| 似然 | $p(y \mid x, w) = \mathcal{N}(w^\top x, \sigma_n^2)$ | $p(y \mid f, x) = \mathcal{N}(f(x), \sigma_n^2)$ |
| 后验 | $p(w \mid y) = \mathcal{N}(\mu, \Sigma)$ | GP 后验 $\mathcal{GP}(\mu_*, k_*)$ |

GP 的应用

1. 超参数优化(贝叶斯优化)

class BayesianOptimization:
    """
    GP-based Bayesian optimization (maximization).

    Fits a GaussianProcess surrogate to the observed (X, y) pairs and
    chooses the next evaluation point by maximizing the Expected
    Improvement acquisition from several random restarts.

    Requires ``scipy.stats.norm`` and ``scipy.optimize.minimize`` to be
    imported at module level.

    Bug fixed: the objective handed to ``scipy.optimize.minimize`` must
    return a scalar; the original returned a length-1 array.
    """

    def __init__(self, kernel, n_restarts=5):
        """
        Args:
            kernel: covariance function for the GP surrogate.
            n_restarts: random restarts for the acquisition search.
        """
        self.gp = GaussianProcess(kernel)
        self.n_restarts = n_restarts
        self.X_observed = []
        self.y_observed = []

    def _acquisition_expected_improvement(self, X, xi=0.01):
        """Expected Improvement: EI(x) = E[max(f(x) - y_best - xi, 0)].

        Args:
            X: candidate points (n, d).
            xi: exploration margin.
        """
        mean, var = self.gp.predict(X)
        std = np.sqrt(var)

        # Best value observed so far (we are maximizing).
        y_best = np.max(self.y_observed)

        # Closed-form EI under a Gaussian posterior; suppress warnings from
        # std == 0 (both divide-by-zero and 0/0) and zero EI there explicitly.
        with np.errstate(divide='ignore', invalid='ignore'):
            z = (mean - y_best - xi) / std
            ei = (mean - y_best - xi) * norm.cdf(z) + std * norm.pdf(z)
            ei[std < 1e-10] = 0

        return ei

    def suggest_next_point(self, bounds):
        """Suggest the next evaluation point.

        Args:
            bounds: array of shape (d, 2) with [low, high] per dimension.

        Returns:
            The point (d,) that maximizes the EI acquisition.
        """
        # Optimize the acquisition from several random starting points.
        best_ei = -np.inf
        best_x = None

        for _ in range(self.n_restarts):
            x0 = np.random.uniform(bounds[:, 0], bounds[:, 1])
            result = minimize(
                # Negate EI (minimize == maximize EI); extract a scalar.
                lambda x: -float(
                    self._acquisition_expected_improvement(x.reshape(1, -1))[0]
                ),
                x0,
                bounds=bounds
            )

            if -result.fun > best_ei:
                best_ei = -result.fun
                best_x = result.x

        return best_x

    def observe(self, X, y):
        """Record one observation and refit the GP surrogate."""
        self.X_observed.append(X)
        self.y_observed.append(y)
        self.gp.fit(self.X_observed, self.y_observed)

2. 自动机器学习

class GPBasedNAS:
    """GP-based neural architecture search (NAS) surrogate.

    NOTE(review): relies on ``_encode``, ``_decode``, ``_generate_candidates``
    and ``_expected_improvement``, which are not defined here — presumably
    supplied by a subclass; confirm before instantiating directly.
    """

    def __init__(self, search_space):
        self.search_space = search_space
        self.gp = GaussianProcess(Kernels.rbf)
        # Each entry: {'architecture': ..., 'accuracy': ...}
        self.observations = []

    def fit(self, architecture, val_accuracy):
        """Record one (architecture, validation accuracy) pair and refit the GP.

        Bug fixed: the original encoded the observation dicts themselves
        (``self._encode(a) for a in self.observations``); the stored
        architectures must be encoded instead.
        """
        self.observations.append({
            'architecture': architecture,
            'accuracy': val_accuracy
        })

        # Refit the GP on all encoded architectures seen so far.
        X = np.array([self._encode(o['architecture']) for o in self.observations])
        y = np.array([o['accuracy'] for o in self.observations])
        self.gp.fit(X, y)

    def suggest(self):
        """Suggest the next architecture to evaluate."""
        # EI acquisition balances exploration and exploitation.
        X_candidates = self._generate_candidates(100)
        ei = self._expected_improvement(X_candidates)
        best_idx = np.argmax(ei)

        return self._decode(X_candidates[best_idx])

不确定性量化

GP 的一个关键优势是能够提供有原则的不确定性估计

def uncertainty_analysis(gp, X_test, y_true=None):
    """
    Analyze a fitted GP's predictive uncertainty on a set of test inputs.

    Computes the posterior mean/std, the 95% confidence and prediction
    intervals, separates epistemic from aleatoric uncertainty, and — when
    ground truth is supplied — prints the empirical coverage of the 95%
    prediction interval.

    Returns a dict with keys 'mean', 'std', 'epistemic', 'aleatoric',
    'confidence_95'.
    """
    posterior_mean, posterior_var = gp.predict(X_test)
    posterior_std = np.sqrt(posterior_var)

    # 1. Half-width of the 95% confidence interval (latent function only).
    ci_halfwidth = 1.96 * posterior_std

    # 2. Prediction interval: additionally accounts for observation noise.
    noise_var = gp.noise_std**2
    pi_halfwidth = 1.96 * np.sqrt(posterior_var + noise_var)

    # 3. Sources of uncertainty:
    #    - aleatoric: irreducible observation noise
    #    - epistemic: model uncertainty (shrinks with more data)
    model_uncertainty = posterior_var
    observation_noise = noise_var

    # 4. Calibration: fraction of true values inside the 95% prediction interval.
    if y_true is not None:
        inside = np.abs(y_true - posterior_mean) <= 1.96 * np.sqrt(posterior_var + noise_var)
        coverage = np.mean(inside)
        print(f"95% 预测区间的覆盖率: {coverage:.2%}")

    return {
        'mean': posterior_mean,
        'std': posterior_std,
        'epistemic': model_uncertainty,
        'aleatoric': observation_noise,
        'confidence_95': ci_halfwidth
    }

参考

Footnotes

  1. Rasmussen, C. E., & Williams, C. K. I. (2006). Gaussian Processes for Machine Learning. MIT Press.