语音合成与生成
1. 概述
语音合成(Text-to-Speech, TTS)是将文本转换为自然语音的技术。现代TTS系统通常采用两阶段架构:文本到声学特征 + 声学特征到波形。
1.1 TTS系统架构
文本 → 文本分析 → 韵律预测 → 声学模型 → 声码器 → 语音波形

各阶段对应的主要内容:
- 文本分析:分词、韵律
- 韵律预测:音素时长、语调
- 声学模型:F0/能量、频谱
- 声码器:波形生成、质量
1.2 发展历程
- 合成范式(从传统到神经网络): 拼接合成 → 参数合成 → 神经网络合成
- 神经声学模型: Tacotron → FastSpeech → VALL-E → 音频LLM
- 神经声码器: WaveNet → WaveRNN/HiFi-GAN
2. 声学特征基础
2.1 梅尔频谱
import librosa
import numpy as np
def text_to_mel(text, sample_rate=22050):
    """Convert text to a mel spectrogram (illustrative sketch only).

    The text is first mapped to phonemes with ``text_to_phonemes`` and the
    phonemes are then turned into a mel spectrogram with
    ``generate_mel_from_phonemes``. Both helpers are expected to be provided
    elsewhere; ``sample_rate`` is accepted but not used by this sketch.
    """
    # Two-stage pipeline: text -> phonemes -> mel.
    return generate_mel_from_phonemes(text_to_phonemes(text))
def compute_mel_spectrogram(waveform, sr=22050, n_fft=1024, hop_length=256, n_mels=80):
    """Compute a log-mel spectrogram (in dB) of an audio signal.

    Args:
        waveform: Audio samples as accepted by ``librosa.stft``.
        sr: Sampling rate used to build the mel filter bank.
        n_fft: FFT window size.
        hop_length: Hop between successive STFT frames, in samples.
        n_mels: Number of mel bands.

    Returns:
        The mel power spectrogram converted to decibels relative to its
        maximum value (so the peak is 0 dB).
    """
    # Short-time Fourier transform (complex valued).
    stft = librosa.stft(waveform, n_fft=n_fft, hop_length=hop_length)
    # Power spectrum. ``power_to_db`` below expects power, not magnitude;
    # passing the raw magnitude would halve every dB value.
    power = np.abs(stft) ** 2
    # Mel filter bank and projection onto the mel scale.
    mel_basis = librosa.filters.mel(sr=sr, n_fft=n_fft, n_mels=n_mels)
    mel_power = np.dot(mel_basis, power)
    # Log compression, referenced to the loudest bin.
    return librosa.power_to_db(mel_power, ref=np.max)


# 2.2 声码器基础
声码器将声学特征转换为波形:
class Vocoder(nn.Module):
    """Abstract base class for vocoders (acoustic features -> waveform)."""

    def forward(self, mel):
        """Turn a mel spectrogram into a waveform.

        Intended contract: mel ``(B, n_mels, T)`` -> waveform
        ``(B, 1, T * hop_length)``. Subclasses must override this method;
        the base implementation always raises ``NotImplementedError``.
        """
        raise NotImplementedError()


# 3. Tacotron系列
3.1 Tacotron 1
Tacotron是Google提出的端到端TTS系统:
class Tacotron(nn.Module):
    """Simplified attention-based autoregressive text-to-mel model.

    Symbol ids are embedded and encoded into a memory sequence. The decoder
    then emits one mel frame per step: it attends over the memory using the
    previous frame as the query, runs two GRU layers on
    ``[context, previous frame]`` and projects the result to a mel frame and
    a stop logit.

    Args:
        vocab_size: Number of input symbols.
        embed_dim: Width of the symbol embedding.
        encoder_dim: Width of the encoder projection. The encoder memory is
            taken to be ``encoder_dim * 2`` wide.
        decoder_dim: Hidden size of both decoder GRUs.
        n_mels: Number of mel bins per output frame.
    """

    def __init__(self, vocab_size, embed_dim=512, encoder_dim=256,
                 decoder_dim=1024, n_mels=80):
        super().__init__()
        self.n_mels = n_mels
        memory_dim = encoder_dim * 2

        # Encoder.
        # NOTE(review): CBHG is defined outside this file; the decoder below
        # assumes the stack outputs (B, T_text, encoder_dim * 2) -- confirm.
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.encoder = nn.Sequential(
            nn.Linear(embed_dim, encoder_dim),
            *[CBHG(encoder_dim, k=16) for _ in range(2)],
        )

        # Attention decoder. The query is the previous mel frame and the keys
        # are the encoder memory, so the dimensions must follow what
        # forward() actually passes in.
        self.attention = LocationSensitiveAttention(n_mels, memory_dim)
        self.decoder_rnn1 = nn.GRU(memory_dim + n_mels, decoder_dim, batch_first=True)
        self.decoder_rnn2 = nn.GRU(decoder_dim, decoder_dim, batch_first=True)

        # Output projections.
        self.linear = nn.Linear(decoder_dim, n_mels)
        self.stop_token = nn.Linear(decoder_dim, 1)

    def forward(self, text, target_mel=None, max_len=200):
        """Generate mel frames for a batch of symbol sequences.

        Args:
            text: Symbol ids, ``(B, T_text)``.
            target_mel: Optional ground-truth frames for teacher forcing;
                assumed to be laid out like the output, ``(B, T_mel, n_mels)``.
                When given, exactly ``T_mel`` steps are run and the stop
                token is ignored.
            max_len: Maximum number of frames to generate at inference.

        Returns:
            Predicted mel frames, ``(B, T_out, n_mels)``.
        """
        # Encode.
        memory = self.encoder(self.embedding(text))

        batch_size = text.size(0)
        # All-zero "go" frame and uniform-zero initial alignment.
        prev_frame = memory.new_zeros(batch_size, self.n_mels)
        prev_attention = memory.new_zeros(batch_size, text.size(1))
        # GRU states must persist across steps, otherwise the decoder has no
        # memory of what it already produced.
        hidden1 = None
        hidden2 = None
        finished = torch.zeros(batch_size, dtype=torch.bool, device=text.device)

        teacher_forcing = target_mel is not None
        n_steps = target_mel.size(1) if teacher_forcing else max_len

        mel_out = []
        for t in range(n_steps):
            # Attend over the memory; remember the alignment for the next step.
            attention_weights, context = self.attention(prev_frame, memory, prev_attention)
            prev_attention = attention_weights

            # Decode one step.
            rnn_input = torch.cat([context, prev_frame], dim=-1).unsqueeze(1)
            rnn1_out, hidden1 = self.decoder_rnn1(rnn_input, hidden1)
            rnn2_out, hidden2 = self.decoder_rnn2(rnn1_out, hidden2)
            state = rnn2_out.squeeze(1)

            mel_pred = self.linear(state)        # (B, n_mels)
            stop_pred = self.stop_token(state)   # (B, 1)
            mel_out.append(mel_pred)

            if teacher_forcing:
                prev_frame = target_mel[:, t]
                continue

            prev_frame = mel_pred
            # Stop only once every sequence in the batch has signalled stop;
            # a bare ``if tensor > 0.5`` is ambiguous for batch sizes > 1.
            finished = finished | (torch.sigmoid(stop_pred).squeeze(-1) > 0.5)
            if bool(finished.all()):
                break

        if not mel_out:
            return memory.new_zeros(batch_size, 0, self.n_mels)
        return torch.stack(mel_out, dim=1)  # (B, T_out, n_mels)


# 3.2 Location-Sensitive Attention
class LocationSensitiveAttention(nn.Module):
    """Additive attention whose keys are augmented with location features.

    The location features are obtained by convolving the previous alignment
    (duplicated into two channels) along the time axis.
    """

    def __init__(self, query_dim, key_dim, att_dim=256):
        super().__init__()
        self.query_proj = nn.Linear(query_dim, att_dim)
        self.key_proj = nn.Linear(key_dim, att_dim)
        self.value_proj = nn.Linear(key_dim, key_dim)
        # Location branch: 2 input channels -> 32 filters of width 31.
        self.location_conv = nn.Conv1d(2, 32, 31, padding=15)
        self.location_proj = nn.Linear(32, att_dim)
        self.v = nn.Linear(att_dim, 1)

    def forward(self, query, keys, prev_attention):
        """Compute attention weights and the context vector.

        Args:
            query: ``(B, query_dim)`` decoder-side query.
            keys: ``(B, T, key_dim)`` encoder memory.
            prev_attention: ``(B, T)`` alignment from the previous step.

        Returns:
            ``(attention_weights, context)`` with shapes ``(B, T)`` and
            ``(B, key_dim)``.
        """
        batch_size, num_steps = prev_attention.shape

        # Location term: (B, 2, T) -> conv -> (B, 32, T) -> (B, T, att_dim).
        history = prev_attention.unsqueeze(1).expand(batch_size, 2, num_steps)
        location_term = self.location_proj(self.location_conv(history).transpose(1, 2))

        # Projected query (B, 1, att_dim) broadcasts against keys (B, T, att_dim).
        query_term = self.query_proj(query.unsqueeze(1))
        key_term = self.key_proj(keys) + location_term
        values = self.value_proj(keys)  # (B, T, key_dim)

        # Additive scoring followed by a softmax over the time axis.
        energies = self.v(torch.tanh(query_term + key_term)).squeeze(-1)
        attention_weights = F.softmax(energies, dim=-1)

        # Weighted sum of the values.
        context = torch.bmm(attention_weights.unsqueeze(1), values).squeeze(1)
        return attention_weights, context


# 4. FastSpeech
4.1 非自回归设计
FastSpeech采用非自回归设计,实现并行生成:
- 并行合成:一次性生成完整梅尔频谱
- 可预测长度:使用时长预测器控制输出长度
- 可控制:通过调整时长实现语速控制
class FastSpeech(nn.Module):
    """Non-autoregressive phoneme-to-mel model.

    Pipeline: phoneme embedding -> encoder -> duration prediction ->
    length regulation -> pitch/energy modulation -> decoder -> mel projection.
    """

    def __init__(self, vocab_size, n_mels=80, d_model=256, n_layers=4):
        super().__init__()
        self.n_mels = n_mels
        # Phoneme encoder.
        self.phoneme_embedding = nn.Embedding(vocab_size, d_model)
        self.encoder = TransformerEncoder(n_layers, d_model)
        # Duration predictor and length regulator.
        self.duration_predictor = DurationPredictor(d_model)
        self.length_regulator = LengthRegulator()
        # Mel decoder and output projection.
        self.decoder = TransformerEncoder(n_layers, d_model)
        self.mel_linear = nn.Linear(d_model, n_mels)
        # Prosody predictors.
        self.pitch_predictor = PitchPredictor(d_model)
        self.energy_predictor = EnergyPredictor(d_model)

    def forward(self, phonemes, target_mel=None, durations=None,
                pitch=None, energy=None):
        """Predict a mel spectrogram for a batch of phoneme sequences.

        ``durations``, ``pitch`` and ``energy`` are predicted by the model
        whenever they are not supplied. ``target_mel`` is accepted but not
        used. Returns the projected decoder output, annotated in the
        original as ``(B, T_mel, n_mels)``.
        """
        # Encode the phoneme sequence.
        hidden = self.encoder(self.phoneme_embedding(phonemes))

        # Expand every phoneme to its number of frames.
        if durations is None:
            durations = self.duration_predictor(hidden)
        frames = self.length_regulator(hidden, durations)

        # Prosody: pitch acts multiplicatively, energy additively.
        pitch = self.pitch_predictor(frames) if pitch is None else pitch
        energy = self.energy_predictor(frames) if energy is None else energy
        frames = frames * (1 + pitch.unsqueeze(-1)) + energy.unsqueeze(-1)

        # Decode and project to mel bins.
        return self.mel_linear(self.decoder(frames))


# 4.2 时长预测器
class _ChannelLayerNorm(nn.LayerNorm):
    """LayerNorm over the channel axis of a channels-first ``(B, C, T)`` tensor."""

    def forward(self, x):
        # nn.LayerNorm normalises the last axis, so move channels there and back.
        return super().forward(x.transpose(1, 2)).transpose(1, 2)


class DurationPredictor(nn.Module):
    """Predict the duration (in frames) of every phoneme.

    Args:
        d_model: Width of the incoming phoneme representations.
    """

    def __init__(self, d_model):
        super().__init__()
        hidden = d_model // 2
        # The stack runs on channels-first tensors. A plain nn.LayerNorm(hidden)
        # here would normalise the time axis and fail whenever T != hidden, so
        # a channel-wise variant is used. It subclasses nn.LayerNorm, which
        # keeps the parameter names in the state dict unchanged.
        self.layers = nn.Sequential(
            nn.Conv1d(d_model, hidden, 3, padding=1),
            nn.ReLU(),
            _ChannelLayerNorm(hidden),
            nn.Dropout(0.1),
            nn.Conv1d(hidden, hidden, 3, padding=1),
            nn.ReLU(),
            _ChannelLayerNorm(hidden),
            nn.Dropout(0.1),
            nn.Conv1d(hidden, 1, 1),
        )

    def forward(self, x):
        """Map ``(B, T, D)`` phoneme features to ``(B, T)`` non-negative durations."""
        x = x.transpose(1, 2)                  # (B, D, T)
        duration = self.layers(x).squeeze(1)   # (B, T)
        # Softplus keeps the predicted durations non-negative.
        return F.softplus(duration)
class LengthRegulator(nn.Module):
    """Expand phoneme-level features to frame level according to durations."""

    def forward(self, x, durations):
        """Repeat every phoneme vector ``durations[b, i]`` times.

        Args:
            x: ``(B, T, D)`` encoder output.
            durations: ``(B, T)`` frames per phoneme. Fractional values are
                truncated towards zero and negative values count as zero.

        Returns:
            ``(B, max_len, D)`` tensor, zero-padded on the right to the
            longest expanded sequence. A sequence whose durations are all
            zero contributes a single frame (its first phoneme) so that it
            never becomes empty.
        """
        repeats = durations.long().clamp(min=0).to(x.device)

        expanded = []
        for seq, reps in zip(x, repeats):
            frames = torch.repeat_interleave(seq, reps, dim=0)  # (sum(reps), D)
            if frames.size(0) == 0:
                # Keep the fallback 2-D so it can be padded and stacked with
                # the other sequences.
                frames = seq[:1]
            expanded.append(frames)

        # Zero-pads with the dtype and device of ``x``.
        return nn.utils.rnn.pad_sequence(expanded, batch_first=True)


# 5. VALL-E
5.1 音频LLM先驱
VALL-E是最早将神经编解码器语言建模(neural codec language modeling)方法应用于零样本语音合成的模型之一:
- 音频Tokenization:使用神经音频编解码器(如EnCodec)将语音量化为离散token
- 条件生成:给定参考音频+文本,生成对应语音
- 自回归生成:像文本生成一样生成音频token
class VALL_E(nn.Module):
    """Codec-language-model style TTS: text + reference audio -> audio tokens.

    Args:
        vocab_size: Size of the decoder's token vocabulary.
        dim: Model width shared by all sub-modules.
        n_heads: Number of attention heads in the decoder.
        n_layers: Number of decoder layers.
    """

    # Special token ids used during generation.
    BOS_TOKEN = 0
    EOS_TOKEN = 1

    def __init__(self, vocab_size, dim=1024, n_heads=16, n_layers=24):
        super().__init__()
        # Audio quantiser (codec).
        self.audio_codec = load_pretrained_codec()
        # Text-side encoder.
        self.semantic_encoder = SemanticEncoder(dim)
        # Autoregressive decoder.
        self.decoder = TransformerDecoder(
            vocab_size=vocab_size,
            d_model=dim,
            nhead=n_heads,
            n_layers=n_layers,
        )
        # Reference (prompt) encoder.
        self.ref_encoder = ReferenceEncoder(dim)

    def forward(self, text, ref_audio, target_audio=None):
        """Run training or synthesis.

        Args:
            text: Text input for the text-side encoder.
            ref_audio: Reference audio that conditions the generation.
            target_audio: Ground-truth audio. When given, the decoder is run
                with teacher forcing and its raw output (logits) is returned
                so a loss can be computed; nothing is decoded to audio.

        Returns:
            Decoder output when ``target_audio`` is given, otherwise the
            waveform decoded from the generated tokens.
        """
        # 1. Encode the reference audio.
        ref_tokens = self.audio_codec.encode(ref_audio)
        ref_emb = self.ref_encoder(ref_tokens)

        # 2. Encode the text. The module never defined ``text_encoder``; the
        #    only text-side encoder it builds is ``semantic_encoder``.
        # NOTE(review): confirm SemanticEncoder accepts the text input directly.
        text_emb = self.semantic_encoder(text)

        # 3a. Training: teacher forcing. Logits are not token ids, so they
        #     must not be passed to the codec decoder.
        if target_audio is not None:
            target_tokens = self.audio_codec.encode(target_audio)
            return self.decoder(text_emb, target_tokens, ref_emb)

        # 3b. Inference: sample tokens, then decode them to a waveform.
        # NOTE(review): the token sequence still contains BOS (and EOS);
        # verify whether the codec expects them to be stripped.
        tokens = self.generate(text_emb, ref_emb)
        return self.audio_codec.decode(tokens)

    @torch.no_grad()
    def generate(self, text_emb, ref_emb, max_len=1000, temperature=1.0):
        """Autoregressively sample audio tokens.

        Args:
            text_emb: Encoded text; its first dimension is the batch size.
            ref_emb: Encoded reference audio.
            max_len: Maximum number of tokens to generate.
            temperature: Sampling temperature. Values <= 0 select greedy
                (argmax) decoding instead of dividing by zero.

        Returns:
            ``(B, 1 + n_generated)`` token ids starting with BOS. Sequences
            that finish early are padded with EOS.
        """
        batch_size = text_emb.size(0)
        device = text_emb.device
        tokens = torch.full((batch_size, 1), self.BOS_TOKEN, dtype=torch.long, device=device)
        finished = torch.zeros(batch_size, 1, dtype=torch.bool, device=device)

        for _ in range(max_len):
            logits = self.decoder(text_emb, tokens, ref_emb)
            step_logits = logits[:, -1]

            if temperature > 0:
                probs = F.softmax(step_logits / temperature, dim=-1)
                next_token = torch.multinomial(probs, 1)
            else:
                next_token = step_logits.argmax(dim=-1, keepdim=True)

            # Track EOS per sequence: requiring every sequence to emit EOS on
            # the very same step would practically never stop a batch early.
            next_token = next_token.masked_fill(finished, self.EOS_TOKEN)
            finished = finished | (next_token == self.EOS_TOKEN)
            tokens = torch.cat([tokens, next_token], dim=1)

            if bool(finished.all()):
                break
        return tokens


# 5.2 与文本LLM的对比
| 组件 | 文本LLM | VALL-E |
|---|---|---|
| Tokenization | BPE/WordPiece | RVQ音频codec |
| 词汇表 | ~50K | ~1024 |
| 上下文 | 文本序列 | 音频+文本序列 |
| 条件 | 无/前缀 | 参考音频+文本 |
6. 声码器
6.1 WaveNet
WaveNet是DeepMind提出的自回归波形生成模型,也是最早的神经声码器之一:
class WaveNet(nn.Module):
    """Mel-conditioned WaveNet-style vocoder with 8-bit mu-law output.

    Args:
        n_mels: Number of mel bins in the conditioning input.
        n_residual_channels: Channels carried through the residual stack.
        n_skip_channels: Channels of the summed skip connections.
    """

    def __init__(self, n_mels, n_residual_channels=512, n_skip_channels=256):
        super().__init__()
        self.conv1 = nn.Conv1d(n_mels, n_residual_channels, 3, padding=1)
        # Residual stack; every block returns (residual_output, skip_output).
        self.residual_layers = nn.ModuleList([
            ResidualBlock(n_residual_channels, n_skip_channels)
            for _ in range(30)
        ])
        # Output head.
        self.conv2 = nn.Conv1d(n_skip_channels, 256, 1)
        self.conv3 = nn.Conv1d(256, 256, 1)
        self.fc = nn.Linear(256, 256)  # 256 classes = 8-bit mu-law

    @staticmethod
    def _mu_law_encode(waveform, mu=255):
        """Quantise a waveform in [-1, 1] to integer mu-law classes in [0, mu]."""
        x = waveform.clamp(-1.0, 1.0)
        log_mu = torch.log1p(x.new_tensor(float(mu)))
        compressed = torch.sign(x) * torch.log1p(mu * x.abs()) / log_mu
        # Map [-1, 1] -> [0, mu] and round to the nearest class.
        return ((compressed + 1) / 2 * mu + 0.5).long().clamp(0, mu)

    def forward(self, mel, waveform=None):
        """Return class logits, or the training loss when a waveform is given.

        Args:
            mel: ``(B, n_mels, T)`` conditioning features.
            waveform: Optional ``(B, 1, T)`` target samples in [-1, 1]; must
                have the same number of time steps as ``mel``.

        Returns:
            ``(B, 256, T)`` logits over mu-law classes, or a scalar
            cross-entropy loss when ``waveform`` is provided.
        """
        # Conditioning input.
        h = self.conv1(mel)

        # Residual stack with accumulated skip connections.
        skip = 0
        for layer in self.residual_layers:
            h, s = layer(h)
            skip = skip + s

        # Output head.
        h = F.relu(skip)
        h = F.relu(self.conv2(h))
        h = self.conv3(h)
        # nn.Linear acts on the last axis, so apply it channels-last and
        # restore the (B, classes, T) layout expected by cross_entropy.
        logits = self.fc(h.transpose(1, 2)).transpose(1, 2)

        if waveform is None:
            return logits

        # Training: cross entropy against the mu-law quantised target (B, T).
        target = self._mu_law_encode(waveform)
        target = target.reshape(target.size(0), -1)
        return F.cross_entropy(logits, target)


# 6.2 HiFi-GAN
HiFi-GAN使用GAN实现高质量快速合成:
class HiFiGAN(nn.Module):
    """HiFi-GAN style generator: mel spectrogram -> waveform.

    Args:
        n_mels: Number of mel bins of the input.
        upsample_rates: Stride of every transposed-convolution stage; their
            product is the total upsampling factor (the hop length).
        upsample_kernel_sizes: Kernel size per stage. Defaults to twice the
            stride of each stage (16, 16, 4, 4 for the default rates). Exact
            ``x stride`` upsampling requires ``kernel - stride`` to be even.

    Raises:
        ValueError: If the two sequences differ in length.
    """

    def __init__(self, n_mels=80, upsample_rates=(8, 8, 2, 2),
                 upsample_kernel_sizes=None):
        super().__init__()
        upsample_rates = list(upsample_rates)
        if upsample_kernel_sizes is None:
            upsample_kernel_sizes = [2 * u for u in upsample_rates]
        upsample_kernel_sizes = list(upsample_kernel_sizes)
        if len(upsample_kernel_sizes) != len(upsample_rates):
            # zip() would otherwise silently drop upsampling stages.
            raise ValueError(
                "upsample_rates and upsample_kernel_sizes must have the same length"
            )

        # Residual block configuration.
        resblock_kernel_sizes = [3, 7, 11]
        resblock_dilation_sizes = [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
        self.num_kernels = len(resblock_kernel_sizes)

        # Input convolution.
        self.conv_pre = nn.Conv1d(n_mels, 512, 7, 1, 3)

        # Upsampling stages; channels halve at every stage.
        self.upsamples = nn.ModuleList()
        self.resblocks = nn.ModuleList()
        ch = 512
        for u, k in zip(upsample_rates, upsample_kernel_sizes):
            # padding=(k - u) // 2 makes the output exactly u times longer.
            self.upsamples.append(
                nn.ConvTranspose1d(ch, ch // 2, k, u, (k - u) // 2)
            )
            ch = ch // 2
            for k_i, d_i in zip(resblock_kernel_sizes, resblock_dilation_sizes):
                self.resblocks.append(ResBlock(ch, k_i, d_i))

        # Output convolution; its input width follows the last stage instead
        # of being hard-coded.
        self.conv_post = nn.Conv1d(ch, 1, 7, 1, 3)

    def forward(self, mel):
        """Synthesize a waveform in [-1, 1] from ``mel`` of shape ``(B, n_mels, T)``.

        Returns a ``(B, 1, T * prod(upsample_rates))`` tensor when every
        stage satisfies the even ``kernel - stride`` condition.
        """
        h = self.conv_pre(mel)
        for i, upsample in enumerate(self.upsamples):
            h = upsample(F.leaky_relu(h, 0.2))
            # Multi-receptive-field fusion: average the parallel residual
            # blocks of this stage.
            first = i * self.num_kernels
            fused = 0
            for j in range(self.num_kernels):
                fused = fused + self.resblocks[first + j](h)
            h = fused / self.num_kernels
        h = F.leaky_relu(h, 0.2)
        h = self.conv_post(h)
        return torch.tanh(h)
class ResBlock(nn.Module):
    """Stack of residual branches, one per dilation rate.

    Every branch is LeakyReLU -> dilated conv -> LeakyReLU -> plain conv,
    padded so that the time length is preserved for odd kernel sizes.
    """

    def __init__(self, channels, kernel_size, dilations):
        super().__init__()
        plain_padding = (kernel_size - 1) // 2
        branches = [
            nn.Sequential(
                nn.LeakyReLU(0.2),
                nn.Conv1d(
                    channels, channels, kernel_size,
                    dilation=rate,
                    padding=rate * (kernel_size - 1) // 2,
                ),
                nn.LeakyReLU(0.2),
                nn.Conv1d(
                    channels, channels, kernel_size,
                    dilation=1,
                    padding=plain_padding,
                ),
            )
            for rate in dilations
        ]
        self.convs = nn.ModuleList(branches)

    def forward(self, x):
        """Apply the branches in order, adding each output to its input."""
        for branch in self.convs:
            x = branch(x) + x
        return x


# 7. 完整TTS系统
7.1 系统集成
class CompleteTTS(nn.Module):
    """End-to-end TTS pipeline: text processor -> acoustic model -> vocoder.

    Args:
        text_processor: Object exposing ``text_to_phonemes(text)``.
        acoustic_model: Model turning phonemes into a mel spectrogram, or a
            ``VALL_E`` instance, which produces a waveform directly.
        vocoder: Module turning a mel spectrogram into a waveform.
    """

    def __init__(self, text_processor, acoustic_model, vocoder):
        super().__init__()
        self.text_processor = text_processor
        self.acoustic_model = acoustic_model
        self.vocoder = vocoder

    def forward(self, text, ref_audio=None, target_mel=None):
        """Synthesize a waveform for ``text``.

        Args:
            text: Input text.
            ref_audio: Reference audio; required when the acoustic model is
                a ``VALL_E``.
            target_mel: Accepted for interface compatibility; not used.

        Returns:
            The synthesized waveform.

        Raises:
            ValueError: If the acoustic model is ``VALL_E`` and no
                ``ref_audio`` was given.
        """
        # 1. Text processing.
        phonemes = self.text_processor.text_to_phonemes(text)

        # 2a. VALL_E already decodes its tokens to a waveform through its own
        #     codec, so its output must not be sent through the vocoder.
        if isinstance(self.acoustic_model, VALL_E):
            if ref_audio is None:
                raise ValueError("ref_audio is required when the acoustic model is VALL_E")
            return self.acoustic_model(phonemes, ref_audio)

        # 2b. Any other acoustic model maps phonemes to a mel spectrogram.
        mel = self.acoustic_model(phonemes)
        if isinstance(self.acoustic_model, (FastSpeech, Tacotron)):
            # These models emit (B, T_mel, n_mels) whereas the vocoders in
            # this file consume (B, n_mels, T).
            mel = mel.transpose(1, 2)

        # 3. Vocoder.
        return self.vocoder(mel)


# 7.2 使用示例
import torch
import scipy.io.wavfile as wav
import datasets
from datasets import load_dataset
from transformers import SpeechT5Processor, SpeechT5HifiGan, SpeechT5ForTextToSpeech

# Load the text processor, the acoustic model and the vocoder.
tts_checkpoint = "microsoft/speecht5_tts"
processor = SpeechT5Processor.from_pretrained(tts_checkpoint)
model = SpeechT5ForTextToSpeech.from_pretrained(tts_checkpoint)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Prepare the model inputs.
inputs = processor(text="Hello world, this is a test.", return_tensors="pt")

# Load one speaker embedding (x-vector) and add a batch dimension.
embeddings_dataset = load_dataset(
    "Matthijs/cmu-arctic-xvectors",
    split="validation",
    trust_remote_code=True,
)
xvector = embeddings_dataset[7306]["xvector"]
speaker_embeddings = torch.tensor(xvector).unsqueeze(0)

# Synthesize: the acoustic model output is passed through the vocoder.
spectrogram = model.generate(inputs["input_ids"], speaker_embeddings=speaker_embeddings)
speech = vocoder(spectrogram)

# Save the result as a 16 kHz WAV file.
wav.write("output.wav", 16000, speech.cpu().detach().numpy())


# 8. 评估指标
8.1 主观评估
| 指标 | 描述 | 范围 |
|---|---|---|
| MOS | 平均意见分 | 1-5 |
| CMOS | 对比MOS | -3 to +3 |
| ABX | 偏好测试 | A/B/X |
8.2 客观评估
def evaluate_tts(pred_wav, gt_wav, sr=22050):
    """Compute objective metrics between a synthesized and a reference waveform.

    Args:
        pred_wav: Synthesized waveform samples.
        gt_wav: Reference waveform samples; must not be empty.
        sr: Sampling rate of both waveforms.

    Returns:
        Dict with:
            ``'MCD'``: value of ``compute_mcd`` on the two mel spectrograms.
            ``'F0_Corr'``: Pearson correlation of the two F0 tracks over
                their common, finite frames (NaN if fewer than two remain).
            ``'Dur_Diff'``: absolute length difference relative to the
                reference length.

    Raises:
        ValueError: If ``gt_wav`` is empty.
    """
    if len(gt_wav) == 0:
        raise ValueError("gt_wav must not be empty")

    metrics = {}

    # 1. Mel-based distance, using the mel extractor defined in this file.
    pred_mel = compute_mel_spectrogram(pred_wav, sr=sr)
    gt_mel = compute_mel_spectrogram(gt_wav, sr=sr)
    metrics['MCD'] = compute_mcd(pred_mel, gt_mel)

    # 2. F0 correlation. np.corrcoef needs equally long inputs, so the tracks
    #    are truncated to their common length and non-finite frames dropped.
    pred_f0 = np.asarray(extract_f0(pred_wav, sr), dtype=float)
    gt_f0 = np.asarray(extract_f0(gt_wav, sr), dtype=float)
    n_frames = min(len(pred_f0), len(gt_f0))
    pred_f0 = pred_f0[:n_frames]
    gt_f0 = gt_f0[:n_frames]
    valid = np.isfinite(pred_f0) & np.isfinite(gt_f0)
    if valid.sum() >= 2:
        metrics['F0_Corr'] = np.corrcoef(pred_f0[valid], gt_f0[valid])[0, 1]
    else:
        metrics['F0_Corr'] = float('nan')

    # 3. Relative duration difference.
    metrics['Dur_Diff'] = abs(len(pred_wav) - len(gt_wav)) / len(gt_wav)
    return metrics
def compute_mcd(pred_mel, gt_mel):
    """Mean per-frame Euclidean distance between two mel spectrograms.

    Both inputs are 2-D arrays with frames on axis 1. They are truncated to
    the shorter number of frames, the Euclidean distance across axis 0 is
    computed for every frame, and the mean over frames is returned.
    """
    # Align the two spectrograms on their common number of frames.
    n_frames = min(pred_mel.shape[1], gt_mel.shape[1])
    delta = pred_mel[:, :n_frames] - gt_mel[:, :n_frames]
    # Euclidean distance per frame, then the average over frames.
    per_frame = np.sqrt(np.square(delta).sum(axis=0))
    return per_frame.mean()


# 9. 总结
核心要点
- Tacotron开创了端到端TTS,但自回归解码限制了速度
- FastSpeech引入非自回归生成,大幅提升速度且支持语速控制
- VALL-E开创音频LLM范式,将语言建模扩展到语音
- HiFi-GAN实现高质量实时合成,是当前主流声码器
发展趋势
- 音频LLM:VALL-E、AudioPaLM等将LLM技术扩展到语音
- 零样本克隆:给定短参考音频即可模仿任意说话人
- 多语言统一:一个模型支持多语言语音合成
- 情感控制:精确控制语音的情感和风格