音频事件检测与理解
1. 概述
音频事件检测(Audio Event Detection, AED)和理解是音频信号处理的核心任务,旨在识别音频中的声音事件类型、时间边界和语义内容。
1.1 任务分类
音频事件检测与理解
├── 音频场景分类
│ ├── 室内/室外
│ ├── 城市/自然
│ └── 嘈杂/安静
├── 声音事件检测 (SED)
│ ├── 音乐
│ ├── 语音
│ ├── 音效
│ └── 环境音
├── 声纹识别
│ ├── 说话人验证
│ └── 说话人识别
└── 音频异常检测
    ├── 机械故障
    ├── 医疗异常
    └── 安全监控
1.2 典型数据集
| 数据集 | 描述 | 规模 | 任务 |
|---|---|---|---|
| AudioSet | YouTube音频片段 | 2M+ | 多标签事件 |
| ESC-50 | 环境声音分类 | 2K | 50类环境声音 |
| UrbanSound8K | 城市声音 | 8.7K | 10类城市声音 |
| VoxCeleb | 说话人识别 | 1M+ | 说话人验证 |
| DCASE | 声学场景/事件 | 挑战赛 | 每年更新 |
2. 音频场景分类
2.1 基于CNN的方法
class AudioSceneClassifier(nn.Module):
    """Small CNN that classifies an acoustic scene from a mel spectrogram.

    Three 3x3 convolutions (1 -> 32 -> 64 -> 128 channels) with two 2x2
    max-pooling steps are followed by global average pooling, so the linear
    head does not depend on the frequency/time extent of the input.

    Args:
        num_classes: Size of the output layer.
    """

    def __init__(self, num_classes=10):
        super().__init__()
        # Convolutional feature extractor
        self.conv1 = nn.Conv2d(1, 32, 3, padding=1)
        self.conv2 = nn.Conv2d(32, 64, 3, padding=1)
        self.conv3 = nn.Conv2d(64, 128, 3, padding=1)
        # Global pooling collapses the remaining (freq, time) grid to 1x1
        self.pool = nn.AdaptiveAvgPool2d((1, 1))
        # Classification head
        self.fc = nn.Sequential(
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(64, num_classes),
        )

    def forward(self, mel_spectrogram):
        """Return class logits for a batch of spectrograms.

        Args:
            mel_spectrogram: Tensor of shape (B, 1, n_mels, T).

        Returns:
            Tensor of shape (B, num_classes) with unnormalised logits.
        """
        x = F.max_pool2d(F.relu(self.conv1(mel_spectrogram)), 2)
        x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        x = F.relu(self.conv3(x))
        x = self.pool(x)  # (B, 128, 1, 1)
        # flatten(1) keeps the batch axis and works for any memory layout
        return self.fc(x.flatten(1))


# 2.2 基于Transformer的方法
class AudioSceneTransformer(nn.Module):
    """Transformer encoder over spectrogram frames with a class token.

    Every time frame is linearly projected to ``d_model``, a learned class
    token is prepended, learned positional embeddings are added, and the
    encoder output at the class-token position is fed to a linear classifier.

    Args:
        n_mels: Number of mel bins per frame (input feature size).
        num_classes: Size of the output layer.
        d_model: Transformer width.
        n_layers: Number of encoder layers.
        max_len: Number of learned positions, counting the class token, so
            inputs may contain at most ``max_len - 1`` frames.
    """

    def __init__(self, n_mels=128, num_classes=10, d_model=256, n_layers=6,
                 max_len=100):
        super().__init__()
        # Input projection
        self.projection = nn.Linear(n_mels, d_model)
        # Class token
        self.cls_token = nn.Parameter(torch.randn(1, 1, d_model))
        # Learned positional embedding
        self.pos_embed = nn.Parameter(torch.randn(1, max_len, d_model))
        # Encoder. batch_first=True is required because forward() feeds
        # (B, T+1, d_model); with the default layout attention would run
        # across the batch axis and mix unrelated samples.
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=d_model,
                nhead=8,
                dim_feedforward=1024,
                dropout=0.1,
                batch_first=True,
            ),
            num_layers=n_layers,
        )
        # Classification head
        self.fc = nn.Linear(d_model, num_classes)

    def forward(self, mel_spectrogram):
        """Return class logits for a batch of spectrograms.

        Args:
            mel_spectrogram: Tensor of shape (B, n_mels, T).

        Returns:
            Tensor of shape (B, num_classes) with unnormalised logits.

        Raises:
            ValueError: If ``T + 1`` exceeds the positional-embedding length.
        """
        batch_size = mel_spectrogram.size(0)
        # (B, n_mels, T) -> (B, T, n_mels) -> (B, T, d_model)
        x = self.projection(mel_spectrogram.transpose(1, 2))
        # Prepend the class token: (B, T+1, d_model)
        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
        x = torch.cat([cls_tokens, x], dim=1)

        seq_len = x.size(1)
        max_len = self.pos_embed.size(1)
        if seq_len > max_len:
            raise ValueError(
                f"sequence of {seq_len - 1} frames exceeds the supported "
                f"maximum of {max_len - 1}; increase max_len"
            )
        x = x + self.pos_embed[:, :seq_len]

        x = self.transformer(x)
        # Classify from the class-token position
        return self.fc(x[:, 0])


# 3. 声音事件检测 (SED)
3.1 多标签分类
SED通常是多标签分类问题:
class SoundEventDetector(nn.Module):
    """Frame-level multi-label sound event detector (CNN + BiLSTM).

    A two-layer convolutional front end averages over the frequency axis,
    a 2-layer bidirectional LSTM models the time axis, and an MLP head emits
    one logit per class for every frame.

    Args:
        n_mels: Number of mel bins. Unused by the layers (the frequency axis
            is pooled adaptively); kept for interface compatibility.
        num_classes: Number of event classes.
    """

    # Logit written into masked frames. sigmoid(-1e4) is exactly 0, and the
    # value is finite so a logits-based BCE on those frames stays finite.
    _MASKED_LOGIT = -1e4

    def __init__(self, n_mels=128, num_classes=527):
        super().__init__()
        # Feature extractor
        self.backbone = nn.Sequential(
            nn.Conv2d(1, 32, 3, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.Conv2d(32, 64, 3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            # Average over frequency, keep the time axis untouched
            nn.AdaptiveAvgPool2d((1, None)),
        )
        # Temporal modelling
        self.rnn = nn.LSTM(
            64, 128,
            num_layers=2,
            batch_first=True,
            bidirectional=True,
        )
        # Classification head (256 = 2 directions x 128 hidden units)
        self.classifier = nn.Sequential(
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, num_classes),
        )

    def forward(self, mel_spectrogram, mask=None):
        """Return per-frame class logits.

        Args:
            mel_spectrogram: Tensor of shape (B, 1, n_mels, T).
            mask: Optional (B, T) tensor; non-zero marks valid frames and
                zero marks padding.

        Returns:
            Tensor of shape (B, T, num_classes) holding logits (apply a
            sigmoid to obtain probabilities). Masked frames carry a large
            negative logit, i.e. probability 0 for every class.
        """
        x = self.backbone(mel_spectrogram)  # (B, 64, 1, T)
        x = x.squeeze(2).transpose(1, 2)    # (B, T, 64)
        x, _ = self.rnn(x)                  # (B, T, 256)
        logits = self.classifier(x)         # (B, T, num_classes)

        if mask is not None:
            # Multiplying logits by 0 would yield probability 0.5 on padded
            # frames; push them to "no event" instead.
            padded = ~mask.bool().unsqueeze(-1)
            logits = logits.masked_fill(padded, self._MASKED_LOGIT)
        return logits


# 3.2 训练损失
def sed_loss(predictions, targets, threshold=0.5):
    """Binary cross-entropy between frame-level logits and multi-hot labels.

    The loss is computed directly from logits, which stays accurate for
    large-magnitude logits where an explicit sigmoid would saturate.

    Args:
        predictions: (B, T, num_classes) logits (pre-sigmoid).
        targets: (B, T, num_classes) binary labels; any numeric dtype.
        threshold: Unused; kept so existing call sites remain valid.

    Returns:
        Scalar tensor with the mean loss over all elements.
    """
    targets = targets.to(predictions.dtype)
    return F.binary_cross_entropy_with_logits(predictions, targets)
def get_sed_metrics(predictions, targets, threshold=0.5):
    """Compute precision, recall and F1 for sound event detection.

    Counts are pooled over every element of the input (all batches, frames
    and classes), so the scores are micro-averaged.

    Args:
        predictions: Logits (pre-sigmoid), same shape as ``targets``.
        targets: Binary labels.
        threshold: Probability above which an event counts as detected.

    Returns:
        Dict with float entries ``'precision'``, ``'recall'`` and ``'f1'``.
    """
    probs = torch.sigmoid(predictions)
    preds = (probs > threshold).float()

    true_positives = (preds * targets).sum()
    precision = true_positives / (preds.sum() + 1e-8)
    recall = true_positives / (targets.sum() + 1e-8)
    f1 = 2 * precision * recall / (precision + recall + 1e-8)

    # Every entry is a plain float (f1 used to leak out as a tensor).
    return {
        'precision': precision.item(),
        'recall': recall.item(),
        'f1': f1.item(),
    }


# 3.3 时序后处理
class TemporalPostProcessor:
    """Turn frame-level probabilities into per-class event segments.

    Each class track is Gaussian-smoothed, thresholded, split into runs of
    consecutive active frames, and runs shorter than ``min_duration`` frames
    are discarded.

    Args:
        sigma: Standard deviation (in frames) of the smoothing kernel.
        threshold: Smoothed probability above which a frame is active.
        min_duration: Minimum segment length in frames.
    """

    def __init__(self, sigma=0.5, threshold=0.5, min_duration=3):
        self.sigma = sigma
        self.threshold = threshold
        self.min_duration = min_duration

    def process(self, predictions):
        """Extract event segments for every class.

        Args:
            predictions: Array of shape (T, num_classes) with per-frame
                probabilities.

        Returns:
            A list with one entry per class; each entry is a list of
            ``(start, end)`` frame indices with ``end`` exclusive.
        """
        _, num_classes = predictions.shape
        events = []
        for class_idx in range(num_classes):
            smoothed = gaussian_filter1d(
                predictions[:, class_idx], sigma=self.sigma
            )
            active = smoothed > self.threshold
            events.append([
                (start, end)
                for start, end in self.extract_segments(active)
                if end - start >= self.min_duration
            ])
        return events

    def extract_segments(self, binary_mask):
        """Return ``(start, end)`` pairs for each run of truthy values.

        ``end`` is exclusive; a run that reaches the end of the mask is
        closed at ``len(binary_mask)``.
        """
        segments = []
        start = None
        for t, is_active in enumerate(binary_mask):
            if is_active and start is None:
                start = t
            elif not is_active and start is not None:
                segments.append((start, t))
                start = None
        if start is not None:
            segments.append((start, len(binary_mask)))
        return segments


# 4. 声纹识别
4.1 说话人验证
class SpeakerVerification(nn.Module):
    """Speaker-embedding network: Conv1d encoder, statistics pooling, linear head.

    Args:
        embedding_dim: Size of the L2-normalised speaker embedding.
        n_mels: Number of input feature channels per frame.
        num_speakers: Output size of the classifier head used for training.
    """

    def __init__(self, embedding_dim=256, n_mels=80, num_speakers=1211):
        super().__init__()
        # Frame-level encoder; padding keeps the time length unchanged
        self.encoder = nn.Sequential(
            nn.Conv1d(n_mels, 512, 5, padding=2),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Conv1d(512, 512, 3, padding=1),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Conv1d(512, 512, 3, padding=1),
        )
        # The pooling layer has no learnable parameters, so it is built
        # without arguments.
        self.pooling = AttentiveStatisticsPooling()
        # Embedding head: input is mean and std concatenated (512 * 2)
        self.fc = nn.Sequential(
            nn.Linear(1024, embedding_dim),
            nn.BatchNorm1d(embedding_dim),
        )
        # Classifier head (optional, for training with speaker-ID labels)
        self.classifier = nn.Linear(embedding_dim, num_speakers)

    def forward(self, mel_spectrogram, return_embedding=True):
        """Embed an utterance, or classify it when training.

        Args:
            mel_spectrogram: Tensor of shape (B, n_mels, T).
            return_embedding: When true, return the embedding; otherwise
                return the classifier logits computed from it.

        Returns:
            (B, embedding_dim) unit-norm embeddings, or (B, num_speakers)
            logits when ``return_embedding`` is false.
        """
        x = self.encoder(mel_spectrogram)  # (B, 512, T)
        x = self.pooling(x)                # (B, 1024)
        embedding = F.normalize(self.fc(x), p=2, dim=1)
        if return_embedding:
            return embedding
        return self.classifier(embedding)
class AttentiveStatisticsPooling(nn.Module):
    """Pool (B, C, T) features into weighted mean and std over time.

    The frame weights are not learned: each frame is weighted by the softmax
    (over time) of the L1 norm of its channel vector.

    Args:
        channels: Optional expected channel count. When given, ``forward``
            checks the input against it. The layer holds no parameters, so
            the value is only used for validation.
        eps: Constant added to the variance before the square root.
    """

    def __init__(self, channels=None, eps=1e-8):
        super().__init__()
        self.channels = channels
        self.eps = eps

    def forward(self, x):
        """Return concatenated weighted mean and std.

        Args:
            x: Tensor of shape (B, C, T).

        Returns:
            Tensor of shape (B, 2C): weighted mean followed by weighted std.

        Raises:
            ValueError: If ``channels`` was given and does not match ``C``.
        """
        if self.channels is not None and x.size(1) != self.channels:
            raise ValueError(
                f"expected {self.channels} channels, got {x.size(1)}"
            )
        # Frame weights, normalised over time: (B, 1, T)
        attention = F.softmax(x.abs().sum(dim=1, keepdim=True), dim=-1)

        mean = (x * attention).sum(dim=-1)                  # (B, C)
        second_moment = (x.pow(2) * attention).sum(dim=-1)  # (B, C)
        # E[x^2] - E[x]^2 can round slightly below zero for near-constant
        # inputs; clamp so the square root never produces NaN.
        variance = (second_moment - mean.pow(2)).clamp(min=0.0)
        std = torch.sqrt(variance + self.eps)

        return torch.cat([mean, std], dim=1)                # (B, 2C)


# 4.2 对比损失训练
class ContrastiveSpeakerLoss(nn.Module):
    """Contrastive loss: pull same-speaker pairs together, push others apart.

    Similarity is the dot product of embeddings, so inputs are expected to
    be L2-normalised for it to act as cosine similarity.

    Args:
        margin: Similarity below which a different-speaker pair costs nothing.
    """

    def __init__(self, margin=0.2):
        super().__init__()
        self.margin = margin

    def forward(self, embeddings, labels):
        """Return the sum of the positive-pair and negative-pair terms.

        Args:
            embeddings: Tensor of shape (B, D).
            labels: Tensor of shape (B,) with speaker IDs.

        Returns:
            Scalar tensor: mean ``1 - sim`` over same-speaker pairs plus
            mean ``relu(sim - margin)`` over different-speaker pairs. A term
            with no pairs contributes 0.
        """
        similarity = torch.matmul(embeddings, embeddings.t())  # (B, B)

        labels = labels.view(-1, 1)
        same_speaker = labels == labels.t()
        # A sample paired with itself is not a training pair; counting the
        # diagonal would only inflate the positive-pair denominator.
        not_self = ~torch.eye(
            labels.size(0), dtype=torch.bool, device=labels.device
        )
        positive_mask = (same_speaker & not_self).to(similarity.dtype)
        negative_mask = (~same_speaker).to(similarity.dtype)

        positive_loss = ((1 - similarity) * positive_mask).sum() / (
            positive_mask.sum() + 1e-8
        )
        negative_loss = (
            F.relu(similarity - self.margin) * negative_mask
        ).sum() / (negative_mask.sum() + 1e-8)

        return positive_loss + negative_loss


# 4.3 说话人验证推理
class SpeakerVerificationSystem:
    """Decide whether two utterances come from the same speaker.

    Args:
        threshold: Cosine similarity above which the speakers are declared
            identical.
        model: Optional embedding model mapping an input batch to (B, D)
            embeddings. Defaults to a freshly initialised (untrained)
            ``SpeakerVerification``; pass a trained model for real use.
    """

    def __init__(self, threshold=0.7, model=None):
        self.model = SpeakerVerification() if model is None else model
        # Inference mode: batch-norm layers must use running statistics,
        # otherwise a batch of one utterance cannot be processed.
        self.model.eval()
        self.threshold = threshold

    def verify(self, enrollment_audio, test_audio):
        """Compare one enrollment utterance against one test utterance.

        Args:
            enrollment_audio: Model input for the enrolled speaker; must
                yield exactly one embedding.
            test_audio: Model input for the utterance under test; must
                yield exactly one embedding.

        Returns:
            Dict with ``'is_same_speaker'`` (bool) and ``'confidence'``
            (the cosine similarity as a float).
        """
        with torch.no_grad():
            enrollment_emb = self.model(enrollment_audio)
            test_emb = self.model(test_audio)

        similarity = F.cosine_similarity(enrollment_emb, test_emb)
        is_same = similarity > self.threshold
        return {
            'is_same_speaker': is_same.item(),
            'confidence': similarity.item()
        }


# 5. 音频异常检测
5.1 基于重构的方法
class AudioAnomalyDetector(nn.Module):
    """Autoencoder-based anomaly detector scoring frames by reconstruction error.

    Args:
        input_dim: Feature size of each frame.

    Attributes:
        threshold_: Default decision threshold used by ``detect``. It is
            ``None`` until set by ``fit_threshold`` or assigned directly.
    """

    def __init__(self, input_dim=80):
        super().__init__()
        # Encoder
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, 64),  # bottleneck
        )
        # Decoder
        self.decoder = nn.Sequential(
            nn.Linear(64, 128),
            nn.ReLU(),
            nn.Linear(128, 256),
            nn.ReLU(),
            nn.Linear(256, input_dim),
        )
        # Plain attribute (not a buffer) so state_dict keys stay unchanged.
        self.threshold_ = None

    def forward(self, x):
        """Return the per-frame reconstruction error.

        Args:
            x: Tensor of shape (B, T, input_dim).

        Returns:
            Tensor of shape (B, T): squared error summed over features.
        """
        batch_size, num_frames, feat_dim = x.shape
        # reshape (not view) so non-contiguous inputs are accepted
        frames = x.reshape(-1, feat_dim)
        reconstruction = self.decoder(self.encoder(frames))
        recon_error = F.mse_loss(
            reconstruction, frames, reduction='none'
        ).sum(dim=-1)
        return recon_error.view(batch_size, num_frames)

    def fit_threshold(self, normal_audio, quantile=0.95):
        """Set ``threshold_`` to a quantile of errors on normal data.

        Args:
            normal_audio: Tensor of shape (B, T, input_dim) of normal frames.
            quantile: Fraction in [0, 1] of normal frames that should fall
                at or below the threshold.

        Returns:
            The threshold as a float.
        """
        with torch.no_grad():
            errors = self(normal_audio)
        self.threshold_ = torch.quantile(errors.flatten(), quantile).item()
        return self.threshold_

    def detect(self, audio, threshold=None):
        """Flag frames whose reconstruction error exceeds the threshold.

        Args:
            audio: Tensor of shape (B, T, input_dim).
            threshold: Decision threshold; defaults to ``self.threshold_``.

        Returns:
            Tuple ``(is_anomaly, anomaly_score)``, both of shape (B, T).

        Raises:
            RuntimeError: If no threshold is given and none was calibrated.
        """
        with torch.no_grad():
            error = self(audio)
        if threshold is None:
            threshold = self.threshold_
        if threshold is None:
            raise RuntimeError(
                "no threshold available: pass threshold= or call "
                "fit_threshold() on normal data first"
            )
        is_anomaly = error > threshold
        anomaly_score = error
        return is_anomaly, anomaly_score


# 5.2 基于分布的方法
class GaussianMixtureAnomalyDetector:
    """Anomaly detector based on Gaussian-mixture log-likelihood.

    Args:
        n_components: Number of mixture components.
        percentile: Percentile of the training log-likelihoods used as the
            decision threshold; samples scoring below it are anomalous.
    """

    def __init__(self, n_components=5, percentile=5.0):
        self.n_components = n_components
        self.percentile = percentile
        self.gmm = GaussianMixture(n_components)
        self.threshold = None

    def fit(self, normal_audio_features):
        """Fit the mixture on normal samples and calibrate the threshold.

        Args:
            normal_audio_features: Feature matrix of normal samples, as
                accepted by the underlying mixture model.

        Returns:
            ``self``, to allow call chaining.
        """
        self.gmm.fit(normal_audio_features)
        log_likelihood = self.gmm.score_samples(normal_audio_features)
        # Threshold is a low percentile of the normal-data likelihoods
        self.threshold = np.percentile(log_likelihood, self.percentile)
        return self

    def predict(self, audio_features):
        """Score samples and flag those below the likelihood threshold.

        Args:
            audio_features: Feature matrix to score.

        Returns:
            Tuple ``(is_anomaly, anomaly_score)`` where the score is the
            negative log-likelihood (larger means more anomalous).

        Raises:
            RuntimeError: If called before ``fit``.
        """
        if self.threshold is None:
            raise RuntimeError("call fit() before predict()")
        log_likelihood = self.gmm.score_samples(audio_features)
        is_anomaly = log_likelihood < self.threshold
        return is_anomaly, -log_likelihood


# 5.3 工业异常检测应用
class IndustrialAudioAnomaly:
    """Template-matching anomaly check for industrial equipment audio.

    A mean log-mel feature vector is stored per equipment ID; a test clip is
    flagged when its mean squared distance to that template exceeds
    ``threshold``.

    Args:
        threshold: Positive distance above which a clip is anomalous. It
            must be set (here or via the ``threshold`` attribute) before
            ``inspect`` can make a decision.

    Raises:
        ValueError: If ``threshold`` is given and is not positive.
    """

    def __init__(self, threshold=None):
        if threshold is not None and threshold <= 0:
            raise ValueError("threshold must be positive")
        self.detector = AudioAnomalyDetector()
        self.threshold = threshold
        # Normal-audio templates keyed by equipment ID
        self.normal_templates = {}

    def enroll(self, equipment_id, normal_audio):
        """Store the mean feature vector of a normal recording as template."""
        with torch.no_grad():
            features = self.extract_features(normal_audio)
            self.normal_templates[equipment_id] = features.mean(0)

    def inspect(self, equipment_id, test_audio):
        """Compare a recording against the enrolled template.

        Returns:
            ``{'status': 'unknown', 'confidence': 0}`` when the equipment
            has no template; otherwise a dict with ``'status'``
            (``'anomaly'`` or ``'normal'``), ``'confidence'`` (distance
            divided by the threshold, capped at 1.0) and ``'distance'``.

        Raises:
            RuntimeError: If a template exists but no threshold is set.
        """
        template = self.normal_templates.get(equipment_id)
        if template is None:
            return {'status': 'unknown', 'confidence': 0}
        if self.threshold is None:
            raise RuntimeError(
                "no decision threshold configured; pass threshold= to the "
                "constructor or set the threshold attribute"
            )

        with torch.no_grad():
            features = self.extract_features(test_audio)
            # Mean squared difference of every frame to the template;
            # written out explicitly so the (T, D) vs (D,) broadcast is
            # intentional rather than a size-mismatch warning.
            distance = ((features - template) ** 2).mean().item()

        is_anomaly = distance > self.threshold
        return {
            'status': 'anomaly' if is_anomaly else 'normal',
            'confidence': min(distance / self.threshold, 1.0),
            'distance': distance
        }

    def extract_features(self, audio):
        """Return log-mel features of a waveform as a (frames, 80) tensor.

        The spectrogram is computed at 16 kHz with 80 mel bins and a hop of
        512 samples, converted to dB and transposed to frame-major order.
        """
        # librosa takes the waveform as the keyword argument ``y``
        mel = librosa.feature.melspectrogram(
            y=audio, sr=16000, n_mels=80, hop_length=512
        )
        features = librosa.power_to_db(mel).T
        return torch.tensor(features).float()


# 6. 实践指南
6.1 使用 librosa 提取特征
import librosa
import numpy as np
def extract_audio_features(audio_path, sr=22050, n_mfcc=13, n_mels=128):
    """Extract a dictionary of summary audio features from a file.

    Args:
        audio_path: Path of the audio file to load.
        sr: Sample rate the audio is resampled to on load.
        n_mfcc: Number of MFCC coefficients.
        n_mels: Number of mel bins for the log-mel spectrogram.

    Returns:
        Dict with keys ``'mfcc_mean'``, ``'mfcc_std'``, ``'chroma_mean'``,
        ``'spectral_centroid_mean'``, ``'zcr_mean'`` and ``'mel_mean'``.
        Vector-valued entries are averaged over time (axis 1); the spectral
        centroid and zero-crossing rate are scalar means.
    """
    y, sr = librosa.load(audio_path, sr=sr)

    # 1. MFCC
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
    # 2. Chroma
    chroma = librosa.feature.chroma_stft(y=y, sr=sr)
    # 3. Spectral centroid
    spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)
    # 4. Zero-crossing rate
    zero_crossing = librosa.feature.zero_crossing_rate(y)
    # 5. Log-mel spectrogram
    mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels)
    log_mel = librosa.power_to_db(mel_spec)

    return {
        'mfcc_mean': mfcc.mean(axis=1),
        'mfcc_std': mfcc.std(axis=1),
        'chroma_mean': chroma.mean(axis=1),
        'spectral_centroid_mean': spectral_centroid.mean(),
        'zcr_mean': zero_crossing.mean(),
        'mel_mean': log_mel.mean(axis=1),
    }


# 6.2 音频事件检测完整流程
from datasets import load_dataset

# Load the dataset.
# NOTE(review): the original id contained a space ("d case/..."), which is
# not a valid repository id; confirm that this id and the column names used
# below ('label', 'labels', 'mel_spectrogram') match the actual dataset.
dataset = load_dataset("dcase/2024_task2", split="train")

# Build the model
model = SoundEventDetector(num_classes=len(dataset.features['label'].names))

# Train
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
model.train()
for epoch in range(30):
    for batch in dataset:
        # Prepare one example as a batch of size 1
        mel = torch.tensor(batch['mel_spectrogram'], dtype=torch.float32)
        if mel.dim() == 2:
            # (n_mels, T) -> (1, n_mels, T): add the channel axis the
            # convolutional front end expects
            mel = mel.unsqueeze(0)
        mel = mel.unsqueeze(0)  # add the batch axis
        # BCE needs floating-point targets
        label = torch.tensor(batch['labels'], dtype=torch.float32).unsqueeze(0)

        optimizer.zero_grad()
        # Forward
        logits = model(mel)
        # Loss
        loss = sed_loss(logits, label)
        # Backward
        loss.backward()
        optimizer.step()


# 7. 评估指标
7.1 场景分类评估
def evaluate_scene_classification(predictions, targets, num_classes=None):
    """Compute accuracy and per-class precision/recall/F1.

    Args:
        predictions: (N, C) tensor of class scores.
        targets: (N,) tensor of integer class labels.
        num_classes: Number of classes; defaults to ``C``, the last
            dimension of ``predictions``.

    Returns:
        Dict with ``'accuracy'`` (float) and ``'per_class'``, mapping each
        class index to a dict of ``'precision'``, ``'recall'`` and ``'f1'``.
    """
    preds = torch.argmax(predictions, dim=-1).reshape(-1)
    targets = targets.reshape(-1).long()
    if num_classes is None:
        num_classes = predictions.size(-1)

    accuracy = (preds == targets).float().mean()

    # Confusion matrix with rows = true class and columns = predicted class
    pair_index = targets * num_classes + preds
    confusion = torch.bincount(
        pair_index, minlength=num_classes * num_classes
    ).reshape(num_classes, num_classes).float()

    per_class_metrics = {}
    for c in range(num_classes):
        tp = confusion[c, c]
        fp = confusion[:, c].sum() - tp  # predicted c, truly another class
        fn = confusion[c, :].sum() - tp  # truly c, predicted another class
        precision = tp / (tp + fp + 1e-8)
        recall = tp / (tp + fn + 1e-8)
        f1 = 2 * precision * recall / (precision + recall + 1e-8)
        per_class_metrics[c] = {
            'precision': precision.item(),
            'recall': recall.item(),
            'f1': f1.item()
        }

    return {
        'accuracy': accuracy.item(),
        'per_class': per_class_metrics
    }


# 7.2 SED评估
def evaluate_sed(predictions, targets, thresholds=(0.3, 0.5, 0.7)):
    """Evaluate sound event detection at several decision thresholds.

    Counts are pooled over every element of the input, so the scores are
    micro-averaged.

    Args:
        predictions: Tensor of scores compared directly against each
            threshold (i.e. probabilities, not logits), same shape as
            ``targets``.
        targets: Tensor of binary labels.
        thresholds: Iterable of thresholds to evaluate.

    Returns:
        Dict mapping ``'thresh_<value>'`` to a dict of float
        ``'precision'``, ``'recall'`` and ``'f1'``.
    """
    results = {}
    for thresh in thresholds:
        # Binarise
        preds = (predictions > thresh).float()

        tp = (preds * targets).sum().item()
        fp = (preds * (1 - targets)).sum().item()
        fn = ((1 - preds) * targets).sum().item()

        precision = tp / (tp + fp + 1e-8)
        recall = tp / (tp + fn + 1e-8)
        f1 = 2 * precision * recall / (precision + recall + 1e-8)

        results[f'thresh_{thresh}'] = {
            'precision': precision,
            'recall': recall,
            'f1': f1
        }
    return results


# 8. 总结
核心要点
- 音频场景分类使用CNN或Transformer提取时频特征
- 声音事件检测是多标签时序分类,需要时序建模和后处理
- 声纹识别使用对比学习或度量学习训练说话人嵌入
- 异常检测基于重构误差或分布偏移
未来趋势
- 自监督学习:利用大规模无标注音频
- 多任务学习:联合学习多个音频理解任务
- 跨域适应:从一个数据集迁移到另一个
- 端到端优化:从原始波形到任务输出