RLVR可验证奖励学习
概述
RLVR(Reinforcement Learning with Verifiable Rewards)是2024-2025年在LLM推理能力提升中兴起的重要范式,通过形式化验证器提供可验证的奖励信号来训练模型。[1]
核心思想
传统RLHF使用人类偏好作为奖励,存在:
- 标注成本高
- 主观性强
- 难以处理精确答案问题
RLVR使用可验证奖励(如数学答案、代码执行结果、形式化证明)替代人类偏好:
┌─────────────────────────────────────────────────────────────┐
│ RLVR核心范式 │
├─────────────────────────────────────────────────────────────┤
│ │
│ LLM生成响应 ──→ 验证器检查 ──→ 可验证奖励 │
│ ↑ │
│ └────────── 强化学习策略更新 ←─────────────────────┘ │
│ │
│ 验证器类型: │
│ - 数学: 答案正确性验证 │
│ - 代码: 执行结果验证 │
│ - 证明: Lean/Isabelle形式化验证 │
│ - 科学: 实验结果匹配 │
└─────────────────────────────────────────────────────────────┘
RLVR vs 传统RLHF
| 特性 | RLHF | RLVR |
|---|---|---|
| 奖励来源 | 人类偏好 | 可验证正确答案 |
| 适用任务 | 开放式生成 | 数学/代码/证明 |
| 标注成本 | 高 | 低(自动验证) |
| 训练稳定性 | 中等 | 高 |
| 效果 | 主观质量提升 | 客观能力提升 |
核心架构
1.1 系统组件
from dataclasses import dataclass
from typing import List, Dict, Optional, Callable
import torch
@dataclass
class RLVRConfig:
    """Hyper-parameters shared by the RLVR training components.

    Only ``model_name`` is required; every other field has a default so the
    config can be built from a model name alone.
    """

    # Model identifier handed to the policy, reference and value models.
    model_name: str
    # Which verifier to build: "math", "code" or "proof".
    verifier_type: str = "math"
    batch_size: int = 16
    # Learning rate passed to the policy optimizer.
    learning_rate: float = 1e-6
    # Sampling limits forwarded to the policy when generating responses.
    max_response_length: int = 2048
    temperature: float = 1.0
    top_p: float = 0.95
    # Whether RLVRFramework should also build a value network.
    use_value_network: bool = False
@dataclass
class RLVRFramework:
    """Bundle of the models, verifier and optimizer used for RLVR training.

    The class defines its own ``__init__``, which ``@dataclass`` leaves in
    place, so instances are built from a single ``RLVRConfig``.

    The field annotations are written as strings because some of the named
    classes (for example ``Verifier``) are defined further down the file and
    would not resolve while this class body is being executed.
    """

    # Trainable LLM policy.
    policy_model: "PolicyLM"
    # Frozen reference model (intended for the KL constraint).
    reference_model: "ReferenceLM"
    # Verifier that turns a response into a reward.
    verifier: "Verifier"
    # Optional value network; None when the config does not request one.
    value_network: "Optional[ValueNetwork]"
    # Optimizer over the policy parameters.
    optimizer: torch.optim.Optimizer

    def __init__(self, config: RLVRConfig):
        """Store ``config`` and build every component from it."""
        self.config = config
        self._init_components()

    def _init_components(self):
        """Instantiate the policy, reference model, verifier and optimizer."""
        # Policy model.
        self.policy_model = PolicyLM(self.config.model_name)

        # Reference model: switched to eval mode and excluded from gradients.
        self.reference_model = ReferenceLM(self.config.model_name)
        self.reference_model.eval()
        for param in self.reference_model.parameters():
            param.requires_grad = False

        # Verifier.
        self.verifier = self._create_verifier()

        # Value network. getattr keeps configs that lack the flag working
        # (they simply get no value network).
        if getattr(self.config, "use_value_network", False):
            self.value_network = ValueNetwork(self.config.model_name)
        else:
            self.value_network = None

        # Optimizer; only the policy parameters are registered.
        self.optimizer = torch.optim.Adam(
            self.policy_model.parameters(),
            lr=self.config.learning_rate
        )

    def _create_verifier(self) -> "Verifier":
        """Build the verifier selected by ``config.verifier_type``.

        Raises:
            ValueError: If no verifier is available for the requested type.
        """
        verifier_type = self.config.verifier_type
        if verifier_type == "math":
            return MathVerifier()
        if verifier_type == "code":
            return CodeVerifier()
        # "proof" is a documented option but this file defines no proof
        # verifier, so it is rejected explicitly rather than failing later.
        raise ValueError(f"Unsupported verifier_type: {verifier_type!r}")
class Verifier:
    """Common interface implemented by every answer verifier."""

    def verify(self, question: str, response: str) -> VerificationResult:
        """Check whether ``response`` correctly answers ``question``.

        Returns:
            VerificationResult: The correctness verdict plus any details.

        Raises:
            NotImplementedError: Always; subclasses must override this.
        """
        raise NotImplementedError()
class MathVerifier(Verifier):
    """Verifier for math problems based on extracted final answers."""

    def __init__(self):
        self.solution_parser = SolutionParser()
        self.answer_extractor = AnswerExtractor()

    def verify(self, question: str, response: str) -> VerificationResult:
        """Compare the answer found in ``response`` with the reference.

        Steps:
            1. Extract the reference answer from ``question``.
            2. Extract the model's answer from ``response``.
            3. Compare the two and report the outcome.

        Returns:
            VerificationResult: reward 1.0 on a match, 0.0 otherwise.
        """
        extractor = self.answer_extractor
        expected = extractor.extract(question)
        predicted = extractor.extract(response)

        # Nothing could be extracted from the response.
        if predicted is None:
            return VerificationResult(
                correct=False,
                reward=0.0,
                error="无法提取答案"
            )

        # Mismatch (after allowing for numeric tolerance).
        if not self._numeric_match(predicted, expected):
            return VerificationResult(
                correct=False,
                reward=0.0,
                expected=expected,
                actual=predicted
            )

        return VerificationResult(
            correct=True,
            reward=1.0,
            answer=predicted
        )

    def _numeric_match(self, a, b, tolerance: float = 1e-6) -> bool:
        """Return whether ``a`` and ``b`` match.

        Two int/float values match when they differ by less than
        ``tolerance``; anything else is compared with ``==``.
        """
        both_numeric = all(isinstance(value, (int, float)) for value in (a, b))
        if not both_numeric:
            return a == b
        return abs(a - b) < tolerance
class CodeVerifier(Verifier):
"""
代码验证器:执行代码并验证结果
"""
def __init__(self, execution_timeout: int = 10):
self.timeout = execution_timeout
self.executor = CodeExecutor()
def verify(self, question: str, response: str) -> VerificationResult:
"""
验证代码解答
1. 提取代码
2. 执行代码
3. 比较输出
"""
# 提取代码
code = self.code_extractor.extract(response)
if code is None:
return VerificationResult(correct=False, reward=0.0)
# 提取测试用例
test_cases = self.extract_test_cases(question)
# 执行并验证
all_passed = True
outputs = []
for test in test_cases:
try:
result = self.executor.execute(
code,
test.input,
timeout=self.timeout
)
if result.output != test.expected_output:
all_passed = False
outputs.append({
'input': test.input,
'expected': test.expected_output,
'actual': result.output,
'passed': False
})
else:
outputs.append({
'input': test.input,
'expected': test.expected_output,
'actual': result.output,
'passed': True
})
except ExecutionError as e:
all_passed = False
outputs.append({
'error': str(e),
'passed': False
})
return VerificationResult(
correct=all_passed,
reward=1.0 if all_passed else 0.0,
test_outputs=outputs
)1.2 奖励设计
@dataclass
class RewardShaping:
    """Stateless helpers that turn a verification outcome into a reward."""

    @staticmethod
    def binary_reward(correct: bool) -> float:
        """Return 1.0 for a correct final answer and 0.0 otherwise."""
        return 1.0 if correct else 0.0

    @staticmethod
    def step_reward(correct: bool, n_steps: int,
                    step: int) -> float:
        """Reward a correct answer with a bonus that grows as ``step`` shrinks.

        Args:
            correct: Whether the final answer is correct.
            n_steps: Total step budget used to normalise the bonus.
            step: Number of steps actually used.

        Returns:
            0.0 for a wrong answer; otherwise 1.0 plus a bonus of
            ``0.1 * (n_steps - step) / n_steps``. When ``n_steps`` is not
            positive the bonus is undefined and plain 1.0 is returned.
        """
        if not correct:
            # Wrong answers receive no reward at all.
            return 0.0
        if n_steps <= 0:
            # Avoids dividing by zero when no step budget is given.
            return 1.0
        return 1.0 + 0.1 * (n_steps - step) / n_steps

    @staticmethod
    def outcome_based_reward(
        correct: bool,
        partial_correct: float = 0.0
    ) -> float:
        """Return 1.0 when correct, else the partial-credit score."""
        return 1.0 if correct else partial_correct

    @staticmethod
    def sparse_reward_with_hints(
        correct: bool,
        hint_level: int
    ) -> float:
        """Reward a correct answer, minus 0.1 per hint level used.

        The result is floored at 0.0 so that a correct answer never scores
        below an incorrect one, however many hints were consumed.
        """
        if not correct:
            return 0.0
        return max(0.0, 1.0 - 0.1 * hint_level)
class ProcessRewardCalculator:
    """Scores each reasoning step of a response with a process reward model."""

    def __init__(self, prm: 'ProcessRewardModel'):
        self.prm = prm

    def calculate_step_rewards(
        self,
        question: str,
        response_steps: List[str]
    ) -> List[float]:
        """Return one process reward per entry of ``response_steps``.

        Each step is scored against a context made of the question followed
        by all earlier steps, one per line. The last step is flagged with
        ``is_final=True``.
        """
        last_index = len(response_steps) - 1
        step_scores = []
        running_context = question
        for position, step in enumerate(response_steps):
            score = self.prm.score(
                context=running_context,
                step=step,
                is_final=(position == last_index)
            )
            step_scores.append(score)
            # Extend the context so the next step sees this one.
            running_context = running_context + f"\n{step}"
        return step_scores

强化学习算法
2.1 PPO for RLVR
class RLVRPPO:
"""
RLVR的PPO实现
"""
def __init__(self, config: RLVRConfig):
self.config = config
self.framework = RLVRFramework(config)
# PPO超参数
self.ppo_epochs = 4
self.clip_eps = 0.2
self.gamma = 1.0 # RLVR通常不需要折扣
self.lambda_ = 0.95
self.entropy_coef = 0.01
self.kl_coef = 0.01
def train_step(self, batch: List[PromptResponse]) -> Dict:
"""
单步训练
"""
# 生成响应
responses, log_probs, values = self._generate_responses(batch)
# 计算奖励
rewards = self._compute_rewards(batch, responses)
# 计算优势
advantages, returns = self._compute_gae(rewards, values)
# PPO更新
stats = self._ppo_update(
batch, responses, log_probs, values, advantages, returns
)
return stats
def _generate_responses(
self,
batch: List[PromptResponse]
) -> tuple:
"""
使用当前策略生成响应
"""
all_responses = []
all_log_probs = []
all_values = []
for prompt in batch:
with torch.no_grad():
# 采样生成
response, log_prob = self.framework.policy_model.sample(
prompt.text,
temperature=self.config.temperature,
max_length=self.config.max_response_length
)
# 价值估计(如果使用)
if self.framework.value_network:
value = self.framework.value_network(prompt.text, response)
else:
value = 0.0
all_responses.append(response)
all_log_probs.append(log_prob)
all_values.append(value)
return all_responses, all_log_probs, all_values
def _compute_rewards(
self,
batch: List[PromptResponse],
responses: List[str]
) -> List[float]:
"""
计算验证奖励
"""
rewards = []
for prompt, response in zip(batch, responses):
result = self.framework.verifier.verify(
prompt.text,
response
)
rewards.append(result.reward)
return rewards
def _ppo_update(
self,
batch: List[PromptResponse],
responses: List[str],
old_log_probs: List[torch.Tensor],
values: List[float],
advantages: List[float],
returns: List[float]
) -> Dict:
"""
PPO策略更新
"""
stats = {
'policy_loss': [],
'value_loss': [],
'entropy': [],
'kl_div': []
}
for _ in range(self.ppo_epochs):
for i, (prompt, response) in enumerate(zip(batch, responses)):
# 新策略的log_prob
new_log_prob = self.framework.policy_model.log_prob(
prompt.text,
response
)
# 重要性采样比率
ratio = torch.exp(new_log_prob - old_log_probs[i])
# 裁剪
surr1 = ratio * advantages[i]
surr2 = torch.clamp(
ratio,
1 - self.clip_eps,
1 + self.clip_eps
) * advantages[i]
# 策略损失
policy_loss = -torch.min(surr1, surr2).mean()
# 熵正则化
entropy = self.framework.policy_model.entropy(prompt.text)
# KL散度惩罚
with torch.no_grad():
kl_div = (old_log_probs[i] - new_log_prob).mean()
# 总损失
loss = (
policy_loss
- self.entropy_coef * entropy
+ self.kl_coef * kl_div
)
# 反向传播
self.framework.optimizer.zero_grad()
loss.backward()
torch.nn.utils.clip_grad_norm_(1.0)
self.framework.optimizer.step()
stats['policy_loss'].append(policy_loss.item())
stats['entropy'].append(entropy.item())
stats['kl_div'].append(kl_div.item())
return {k: np.mean(v) for k, v in stats.items()}2.2 GRPO变体
class GRPO:
"""
GRPO (Group Relative Policy Optimization)
用于DeepSeekMath等模型
"""
def __init__(self, config: RLVRConfig):
self.config = config
self.framework = RLVRFramework(config)
def train_step(self, questions: List[str]) -> Dict:
"""
GRPO单步训练
"""
# 为每个问题采样多个响应
responses_per_q = []
log_probs_per_q = []
for q in questions:
responses, log_probs = self._sample_group(q, n=8)
responses_per_q.append(responses)
log_probs_per_q.append(log_probs)
# 计算组内相对优势
advantages = []
for q, responses, old_log_probs in zip(
questions, responses_per_q, log_probs_per_q
):
# 验证奖励
rewards = [self.framework.verifier.verify(q, r).reward
for r in responses]
# 组内标准化
rewards_tensor = torch.tensor(rewards)
mean = rewards_tensor.mean()
std = rewards_tensor.std() + 1e-8
advantages_per_q = ((rewards_tensor - mean) / std).tolist()
advantages.extend(advantages_per_q)
# 策略更新
loss = self._compute_grpo_loss(
questions, responses_per_q, log_probs_per_q, advantages
)
self.framework.optimizer.zero_grad()
loss.backward()
self.framework.optimizer.step()
return {'loss': loss.item()}
def _compute_grpo_loss(
self,
questions: List[str],
responses_per_q: List[List[str]],
old_log_probs_per_q: List[List[torch.Tensor]],
advantages: List[float]
) -> torch.Tensor:
"""
计算GRPO损失
"""
total_loss = 0
adv_idx = 0
for q, responses, old_log_probs in zip(
questions, responses_per_q, old_log_probs_per_q
):
for response, old_log_prob in zip(responses, old_log_probs):
new_log_prob = self.framework.policy_model.log_prob(q, response)
ratio = torch.exp(new_log_prob - old_log_prob)
loss = -advantages[adv_idx] * ratio
total_loss += loss
adv_idx += 1
return total_loss / len(advantages)Process Reward Model (PRM)
3.1 PRM架构
class ProcessRewardModel(torch.nn.Module):
    """Process reward model that rates a single reasoning step.

    ``torch.nn`` is referenced through ``torch`` because the file imports
    ``torch`` but defines no ``nn`` alias.
    """

    def __init__(self, base_model_name: str, hidden_size: int = 768):
        """
        Args:
            base_model_name: Identifier passed to ``AutoModel.from_pretrained``.
            hidden_size: Width of one encoder embedding.
        """
        super().__init__()
        # Shared encoder.
        # NOTE(review): AutoModel is not imported in the visible part of the
        # file -- confirm it is brought into scope elsewhere.
        self.encoder = AutoModel.from_pretrained(base_model_name)
        # Reward head. forward() concatenates the context and step
        # embeddings along the last dimension, so the head input is twice
        # as wide as one embedding.
        # assumes each encoder output has last dimension hidden_size -- TODO confirm
        self.process_head = torch.nn.Sequential(
            torch.nn.Linear(2 * hidden_size, hidden_size // 2),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden_size // 2, 1),
            torch.nn.Sigmoid()  # squashes the output into (0, 1)
        )

    def forward(
        self,
        context: str,
        step: str,
        return_embedding: bool = False
    ) -> Dict:
        """Score one step given its context.

        Returns:
            Dict with key ``'reward'`` (head output in (0, 1)) and, when
            ``return_embedding`` is true, ``'embedding'`` (the concatenated
            context/step representation).
        """
        # Encode the context and the step separately.
        context_emb = self.encoder(context)
        step_emb = self.encoder(step)
        # Combine along the feature dimension.
        combined = torch.cat([context_emb, step_emb], dim=-1)
        # Predict the reward.
        reward = self.process_head(combined)
        if return_embedding:
            return {'reward': reward, 'embedding': combined}
        return {'reward': reward}

    def score(self, context: str, step: str, is_final: bool = False) -> float:
        """Return the step reward as a plain float, without gradients.

        This is the call ``ProcessRewardCalculator`` makes
        (``score(context=..., step=..., is_final=...)``). ``is_final`` is
        accepted for that call signature but does not affect the score.
        """
        with torch.no_grad():
            reward = self.forward(context, step)['reward']
        # assumes the head yields a single value for one step -- TODO confirm
        return float(reward)
class StepLevelDataCollector:
"""
分步数据收集器:收集用于训练PRM的数据
"""
def __init__(self, prm: ProcessRewardModel, verifier: Verifier):
self.prm = prm
self.verifier = verifier
def collect(self, questions: List[str], n_samples: int = 8) -> List[StepData]:
"""
收集分步训练数据
"""
all_step_data = []
for q in questions:
# 生成多个样本
samples = self._generate_samples(q, n_samples)
for sample in samples:
# 分割为步骤
steps = self._split_steps(sample.response)
# 收集每个步骤
for i, step in enumerate(steps):
# 最终奖励
final_reward = self.verifier.verify(q, sample.response).reward
# 中间信号(如果有)
intermediate = self._get_intermediate_signal(q, steps[:i+1])
all_step_data.append({
'question': q,
'steps': steps[:i+1],
'current_step': step,
'final_reward': final_reward,
'intermediate_signal': intermediate,
'is_correct': final_reward > 0.5
})
return all_step_data
def _split_steps(self, response: str) -> List[str]:
"""
将响应分割为逻辑步骤
"""
# 使用思维链分割
lines = response.split('\n')
steps = []
current_step = []
for line in lines:
if line.strip().startswith(('1.', '2.', '3.',
'Step', '首先', '其次', '最后')):
if current_step:
steps.append('\n'.join(current_step))
current_step = [line]
else:
current_step.append(line)
if current_step:
steps.append('\n'.join(current_step))
return steps3.2 PRM训练
class PRMTrainer:
    """Trains a process reward model on step-level records."""

    def __init__(self, prm: ProcessRewardModel, batch_size: int = 16):
        """
        Args:
            prm: Model to train.
            batch_size: Number of records per optimisation step.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")
        self.prm = prm
        self.batch_size = batch_size
        self.optimizer = torch.optim.AdamW(prm.parameters(), lr=1e-5)

    def train(
        self,
        train_data: List[StepData],
        val_data: List[StepData],
        n_epochs: int = 3
    ) -> Dict:
        """Train for ``n_epochs`` epochs, validating after each one.

        Returns:
            ``{'train': stats, 'val': stats}`` from the last epoch; both are
            empty dicts when ``n_epochs`` is 0.
        """
        train_stats = {}
        val_stats = {}
        for epoch in range(n_epochs):
            train_stats = self._train_epoch(train_data)
            val_stats = self._validate(val_data)
            print(f"Epoch {epoch+1}: "
                  f"Train Acc: {train_stats['accuracy']:.3f}, "
                  f"Val Acc: {val_stats['accuracy']:.3f}")
        return {'train': train_stats, 'val': val_stats}

    def _train_epoch(self, data: List[StepData]) -> Dict:
        """Run one optimisation pass over ``data``.

        Returns:
            Dict with the mean per-batch ``loss`` and the ``accuracy``.
        """
        self.prm.train()
        return self._run_epoch(data, update=True)

    def _validate(self, data: List[StepData]) -> Dict:
        """Evaluate on ``data`` without gradients or parameter updates."""
        self.prm.eval()
        with torch.no_grad():
            return self._run_epoch(data, update=False)

    def _batchify(self, data: List[StepData]):
        """Yield consecutive slices of ``data`` of at most ``batch_size``."""
        for start in range(0, len(data), self.batch_size):
            yield data[start:start + self.batch_size]

    def _run_epoch(self, data: List[StepData], update: bool) -> Dict:
        """Shared loop for training and validation.

        Args:
            data: Step-level records.
            update: When true, back-propagate and step the optimizer.
        """
        loss_fn = torch.nn.BCELoss()
        total_loss = 0.0
        correct = 0
        total = 0
        n_batches = 0
        for batch in self._batchify(data):
            # Forward pass: score the current step against the question
            # plus all earlier steps.
            rewards = []
            for item in batch:
                context = item['question'] + '\n' + '\n'.join(item['steps'][:-1])
                result = self.prm(context, item['current_step'])
                # Flatten instead of squeeze so that a batch of one still
                # yields a 1-D prediction matching the label shape.
                # assumes one reward value per record -- TODO confirm
                rewards.append(result['reward'].reshape(-1))
            predictions = torch.cat(rewards)

            # Labels: 1.0 when the sample's final answer was correct.
            labels = torch.tensor(
                [1.0 if d['is_correct'] else 0.0 for d in batch],
                dtype=predictions.dtype,
                device=predictions.device
            )

            loss = loss_fn(predictions, labels)
            if update:
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()

            total_loss += loss.item()
            correct += ((predictions > 0.5) == labels).sum().item()
            total += len(labels)
            n_batches += 1

        # BCELoss already averages inside a batch, so the epoch loss is the
        # mean over batches, not over individual records.
        return {
            'loss': total_loss / n_batches if n_batches else 0.0,
            'accuracy': correct / total if total else 0.0
        }

应用案例
4.1 DeepSeekMath
class DeepSeekMathRLVR:
"""
DeepSeekMath的RLVR实现
"""
def __init__(self):
self.model = load_model("deepseek-math-7b")
self.verifier = MathVerifier()
self.grpo = GRPO(RLVRConfig(model_name="deepseek-math-7b"))
def train(self, math_problems: List[str], n_steps: int = 1000):
"""
RLVR训练
"""
for step in range(n_steps):
# GRPO更新
stats = self.grpo.train_step(math_problems)
# 评估
if step % 100 == 0:
accuracy = self.evaluate(math_problems)
print(f"Step {step}: Accuracy = {accuracy:.3f}")
def evaluate(self, problems: List[str]) -> float:
"""
评估准确率
"""
correct = 0
for p in problems:
response = self.model.generate(p)
result = self.verifier.verify(p, response)
if result.correct:
correct += 1
return correct / len(problems)4.2 代码验证训练
class CodeRLVR:
    """RLVR loop for code generation: sample, verify, fine-tune on passes."""

    def __init__(self):
        self.model = load_model("code-llm")
        self.verifier = CodeVerifier()

    def train(self, coding_problems: List[str], n_iterations: int = 500):
        """Repeat sample / verify / fine-tune for ``n_iterations`` rounds.

        Each round samples code for the problems, verifies every sample and
        fine-tunes the model on the samples that verified as correct. The
        fine-tuning call is skipped when no sample passes.
        """
        for _ in range(n_iterations):
            # Sample code for the problems.
            codes = self.model.sample(coding_problems)

            # Verify each sample against its problem.
            verdicts = []
            for problem, code in zip(coding_problems, codes):
                verdicts.append(self.verifier.verify(problem, code))

            # Keep only the samples that passed.
            passing = [
                code for code, verdict in zip(codes, verdicts)
                if verdict.correct
            ]

            # Fine-tune on the passing samples, if any.
            if passing:
                self.model.fine_tune(passing)

与其他范式的比较
5.1 范式对比
| 范式 | 奖励来源 | 适用场景 | 代表工作 |
|---|---|---|---|
| RLHF | 人类偏好 | 开放式生成 | InstructGPT |
| DPO | 人类偏好 | 开放式生成 | Zephyr |
| RLAIF | AI偏好 | 开放式生成 | Constitutional AI |
| RLVR | 可验证答案 | 数学/代码/证明 | DeepSeekMath |
| PPO-PRM | 过程奖励 | 复杂推理 | OpenMathInstruct |
5.2 优缺点分析
┌─────────────────────────────────────────────────────────────┐
│ RLVR优缺点分析 │
├─────────────────────────────────────────────────────────────┤
│ 优点: │
│ ✓ 自动奖励,无需人工标注 │
│ ✓ 训练稳定,收敛性好 │
│ ✓ 可扩展到大规模数据 │
│ ✓ 适合客观评测任务 │
│ │
│ 缺点: │
│ ✗ 需要可验证答案(不适用于开放任务) │
│ ✗ 奖励稀疏(只有最终奖励) │
│ ✗ 可能过度优化"答案格式"而非真正推理 │
│ ✗ 验证器质量直接影响效果 │
└─────────────────────────────────────────────────────────────┘
未来方向
6.1 当前挑战
| 挑战 | 描述 | 解决方案 |
|---|---|---|
| 奖励稀疏 | 只有最终奖励 | PRM过程奖励 |
| 格式过拟合 | 过度优化格式 | 多样化训练 |
| 验证器局限 | 无法验证所有任务 | 组合验证 |
| 长程推理 | 复杂问题推理链长 | 思维链增强 |
6.2 研究前沿
- 多验证器组合:数学+代码+证明联合
- 自适应PRM:根据问题难度选择奖励粒度
- 自我验证:模型自己验证答案
- 课程RLVR:从简单到复杂逐步训练
总结
RLVR范式代表了LLM推理能力提升的重要方向:
- 可验证奖励:使用形式化验证替代人类偏好
- 自动标注:大幅降低训练成本
- 稳定训练:避免RLHF的不稳定性
- 可扩展:适用于数学、代码、证明等领域
与神经定理证明(AlphaProof、AlphaGeometry)的结合,正在推动AI数学能力向更高水平发展。
参考资料
相关文档
- alphageometry-system-deep-dive — AlphaGeometry几何定理证明
- alphaproof-formal-math-reasoning — AlphaProof形式化推理
- chain-of-thought-reasoning — 思维链推理
- process-reward-model — 过程奖励模型
Footnotes
1. Anthropic (2024). “Scaling Reinforcement Learning with Verifiable Rewards.”