第2088篇:AI应用监控体系——从埋点设计到Grafana看板的完整实践
2026/4/30大约 8 分钟
第2088篇:AI应用监控体系——从埋点设计到Grafana看板的完整实践
适读人群:负责AI应用运维和质量保障的工程师 | 阅读时长:约19分钟 | 核心价值:建立完整的AI应用可观测性体系,包括业务指标、技术指标、成本指标的采集和可视化
大多数工程师在AI应用上线后,监控都做得很简陋——顶多看看服务器的CPU和内存,偶尔看一眼错误日志。
但AI应用有一类独特的故障:服务正常运行,但回答质量悄悄变差。这类问题靠传统监控发现不了,等到用户投诉才知道。这篇文章讲怎么建立一套能发现这类问题的监控体系。
AI应用需要监控什么
埋点设计:统一的事件模型
/**
 * Full-lifecycle tracking event for a single AI request.
 * One request, from entry to completion, produces exactly one event.
 */
@Data
@Builder
public class AiRequestEvent {
    // --- Trace identifiers ---
    private String requestId;
    private String sessionId;
    private String userId;
    private String tenantId;          // multi-tenant deployments
    // --- Feature identifiers ---
    private String featureName;       // feature name (e.g. "customer_support", "code_review")
    private String intentName;        // intent name (conversational systems)
    // --- Model info ---
    private String provider;          // "openai", "anthropic", etc.
    private String modelName;         // "gpt-4o", "claude-3-5-sonnet"
    private boolean isFallback;       // true when the response came after a failover
    // --- Timing metrics ---
    private LocalDateTime startTime;
    private long ttftMs;              // Time To First Token (streaming scenarios)
    private long totalLatencyMs;
    // --- Token usage ---
    private int inputTokens;
    private int outputTokens;
    private int cachedTokens;         // tokens served from the (semantic) cache
    // --- Quality signals ---
    private Boolean userThumbsUp;     // boxed Boolean: null means no feedback given
    private Boolean userThumbsDown;   // boxed Boolean: null means no feedback given
    private boolean answerEmpty;      // answer was empty
    private boolean answerTruncated;  // answer was cut off
    private boolean safetyBlocked;    // blocked by the safety filter
    // --- RAG info ---
    private int retrievedChunks;      // number of retrieved chunks
    private double avgRetrievalScore; // mean retrieval similarity score
    // --- Error info ---
    private String errorType;         // "rate_limit", "timeout", "context_length", etc.
    private String errorCode;
    // --- Cost (estimated) ---
    private double estimatedCostUsd;
}埋点采集器
/**
 * Central collector for AI observability signals.
 * Publishes metrics to Micrometer (Prometheus) and emits structured logs
 * for Loki/ELK search.
 *
 * Fix: record() previously obtained tagged timers via meterRegistry.timer(name, tags),
 * which registers them WITHOUT the percentile configuration that init() applied only
 * to the untagged meters — so P50/P90/P95/P99 were never published for the timers
 * that actually received recordings. Tagged timers are now registered through
 * Timer.builder(...) with the same percentiles (registration is idempotent).
 */
@Component
@RequiredArgsConstructor
@Slf4j
public class AiMonitoringCollector {
    private final MeterRegistry meterRegistry;

    /** Percentiles published for end-to-end request latency. */
    private static final double[] DURATION_PERCENTILES = {0.5, 0.9, 0.95, 0.99};
    /** Percentiles published for time-to-first-token. */
    private static final double[] TTFT_PERCENTILES = {0.5, 0.9, 0.99};

    // Untagged base timers, pre-registered so dashboards referencing the bare
    // meter names keep working. The tagged instances used by record() are
    // registered lazily with the same percentile configuration.
    private Timer requestLatencyTimer;
    private Timer ttftTimer;

    @PostConstruct
    public void init() {
        requestLatencyTimer = Timer.builder("ai.request.duration")
            .description("AI请求端到端延迟")
            .publishPercentiles(DURATION_PERCENTILES)
            .register(meterRegistry);
        ttftTimer = Timer.builder("ai.request.ttft")
            .description("AI流式请求首Token延迟")
            .publishPercentiles(TTFT_PERCENTILES)
            .register(meterRegistry);
    }

    /**
     * Records one complete AI request event.
     * Call exactly once, after the request chain has finished (success or failure).
     *
     * @param event the finished request's lifecycle data; null-able String fields
     *              are mapped to the "unknown" tag value to keep tag cardinality sane
     */
    public void record(AiRequestEvent event) {
        Tags baseTags = Tags.of(
            "feature", event.getFeatureName() != null ? event.getFeatureName() : "unknown",
            "provider", event.getProvider() != null ? event.getProvider() : "unknown",
            "model", event.getModelName() != null ? event.getModelName() : "unknown",
            "is_fallback", String.valueOf(event.isFallback())
        );
        // 1. Latency metrics — Timer.builder ensures the tagged meters carry the
        // percentile configuration (meterRegistry.timer(name, tags) would not).
        if (event.getTotalLatencyMs() > 0) {
            Timer.builder("ai.request.duration")
                .description("AI请求端到端延迟")
                .publishPercentiles(DURATION_PERCENTILES)
                .tags(baseTags)
                .register(meterRegistry)
                .record(event.getTotalLatencyMs(), TimeUnit.MILLISECONDS);
        }
        if (event.getTtftMs() > 0) {
            Timer.builder("ai.request.ttft")
                .description("AI流式请求首Token延迟")
                .publishPercentiles(TTFT_PERCENTILES)
                .tags(baseTags)
                .register(meterRegistry)
                .record(event.getTtftMs(), TimeUnit.MILLISECONDS);
        }
        // 2. Request count, split by success/failure.
        boolean isError = event.getErrorType() != null;
        Tags requestTags = baseTags.and("success", String.valueOf(!isError));
        meterRegistry.counter("ai.requests.total", requestTags).increment();
        // 3. Error count by category.
        if (isError) {
            meterRegistry.counter("ai.errors.total",
                baseTags.and("error_type", event.getErrorType())).increment();
        }
        // 4. Token usage.
        if (event.getInputTokens() > 0) {
            meterRegistry.counter("ai.tokens.input.total", baseTags)
                .increment(event.getInputTokens());
        }
        if (event.getOutputTokens() > 0) {
            meterRegistry.counter("ai.tokens.output.total", baseTags)
                .increment(event.getOutputTokens());
        }
        if (event.getCachedTokens() > 0) {
            meterRegistry.counter("ai.tokens.cached.total", baseTags)
                .increment(event.getCachedTokens());
        }
        // 5. Quality signals.
        if (event.isAnswerEmpty()) {
            meterRegistry.counter("ai.quality.empty_answer", baseTags).increment();
        }
        if (event.isSafetyBlocked()) {
            meterRegistry.counter("ai.quality.safety_blocked", baseTags).increment();
        }
        // 6. User feedback (boxed Boolean — null means no feedback, counted as neither).
        if (Boolean.TRUE.equals(event.getUserThumbsUp())) {
            meterRegistry.counter("ai.feedback.thumbs_up", baseTags).increment();
        }
        if (Boolean.TRUE.equals(event.getUserThumbsDown())) {
            meterRegistry.counter("ai.feedback.thumbs_down", baseTags).increment();
        }
        // 7. Cost.
        if (event.getEstimatedCostUsd() > 0) {
            meterRegistry.counter("ai.cost.usd.total", baseTags)
                .increment(event.getEstimatedCostUsd());
        }
        // 8. Structured logging (for Loki/ELK search). Errors and slow
        // requests (> 5s) are logged at WARN with full detail.
        if (isError || event.getTotalLatencyMs() > 5000) {
            log.warn("AI请求异常: requestId={}, feature={}, provider={}, " +
                    "latency={}ms, error={}, tokens={}/{}",
                event.getRequestId(), event.getFeatureName(), event.getProvider(),
                event.getTotalLatencyMs(), event.getErrorType(),
                event.getInputTokens(), event.getOutputTokens());
        } else {
            log.debug("AI请求: requestId={}, latency={}ms, tokens={}/{}",
                event.getRequestId(), event.getTotalLatencyMs(),
                event.getInputTokens(), event.getOutputTokens());
        }
    }

    /**
     * Records RAG retrieval quality for one retrieval round.
     *
     * @param featureName feature label for the metrics
     * @param chunkCount  number of chunks retrieved (0 triggers an empty-retrieval counter)
     * @param avgScore    mean retrieval similarity; below 0.6 counts as low-score
     * @param query       the user query, logged truncated to 50 chars; may be null
     */
    public void recordRetrievalQuality(String featureName,
            int chunkCount, double avgScore, String query) {
        Tags tags = Tags.of("feature", featureName);
        meterRegistry.summary("ai.rag.retrieved_chunks", tags)
            .record(chunkCount);
        meterRegistry.summary("ai.rag.retrieval_score", tags)
            .record(avgScore);
        if (chunkCount == 0) {
            meterRegistry.counter("ai.rag.empty_retrieval", tags).increment();
            // Guard against null query (previously an NPE on query.substring).
            String queryPreview = query == null
                ? "" : query.substring(0, Math.min(50, query.length()));
            log.warn("RAG检索无结果: feature={}, query={}", featureName, queryPreview);
        }
        if (avgScore < 0.6) {
            meterRegistry.counter("ai.rag.low_score_retrieval", tags).increment();
        }
    }
}
异步埋点:不影响主链路
/**
 * Asynchronous metric reporting, so monitoring never adds latency to the AI
 * request path. Events go into a bounded queue and are drained by one dedicated
 * daemon thread; when the queue is full, events are dropped with a warning
 * (monitoring must never back-pressure the business flow).
 */
@Service
@RequiredArgsConstructor
@Slf4j
public class AsyncAiMonitoringService {
    private final AiMonitoringCollector collector;

    // Dedicated single-threaded executor, isolated from business pools.
    // (Previously a fixed pool of 2 threads of which only one ever received
    // a task — the second thread was permanently idle.)
    private final ExecutorService monitoringExecutor = Executors.newSingleThreadExecutor(
        r -> {
            Thread t = new Thread(r, "ai-monitoring");
            t.setDaemon(true);
            return t;
        }
    );

    // Bounded buffer: a slow collector cannot exhaust memory.
    private final BlockingQueue<AiRequestEvent> eventBuffer = new LinkedBlockingQueue<>(10000);

    /**
     * Enqueues an event without blocking the caller.
     * Drops the event (with a warning) when the buffer is full.
     */
    public void reportAsync(AiRequestEvent event) {
        boolean offered = eventBuffer.offer(event);
        if (!offered) {
            log.warn("监控事件缓冲区已满,丢弃事件: requestId={}", event.getRequestId());
        }
    }

    @PostConstruct
    public void startWorker() {
        monitoringExecutor.submit(this::processEvents);
        // Worker threads are daemons, so buffered events would be silently lost
        // on JVM exit; drain whatever remains during shutdown.
        Runtime.getRuntime().addShutdownHook(
            new Thread(this::drainRemaining, "ai-monitoring-shutdown"));
    }

    /** Stops the worker and flushes any still-buffered events synchronously. */
    private void drainRemaining() {
        monitoringExecutor.shutdownNow();
        AiRequestEvent event;
        while ((event = eventBuffer.poll()) != null) {
            try {
                collector.record(event);
            } catch (Exception e) {
                log.error("监控事件处理失败", e);
            }
        }
    }

    /** Worker loop: poll with a timeout so interruption is noticed promptly. */
    private void processEvents() {
        while (!Thread.currentThread().isInterrupted()) {
            try {
                AiRequestEvent event = eventBuffer.poll(1, TimeUnit.SECONDS);
                if (event != null) {
                    collector.record(event);
                }
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt(); // restore interrupt status
                break;
            } catch (Exception e) {
                // Log the full stack trace (previously only e.getMessage(),
                // which hid the failure's origin); never let the worker die.
                log.error("监控事件处理失败", e);
            }
        }
    }
}
与业务代码集成:AOP拦截
/**
 * Aspect that transparently instruments every method annotated with
 * {@code @AiFeature}, so individual services need no hand-written
 * instrumentation code.
 */
@Aspect
@Component
@RequiredArgsConstructor
@Slf4j
public class AiMonitoringAspect {
    private final AsyncAiMonitoringService monitoringService;
    private final LlmCostCalculator costCalculator;

    /**
     * Wraps the annotated method: builds an AiRequestEvent covering the call,
     * classifies failures by exception type, and always reports the event
     * asynchronously from the finally block — on success and on failure alike.
     */
    @Around("@annotation(aiFeature)")
    public Object monitorAiFeature(ProceedingJoinPoint pjp, AiFeature aiFeature) throws Throwable {
        final String shortId = UUID.randomUUID().toString().substring(0, 8);
        final long startedAt = System.currentTimeMillis();
        final AiRequestEvent.AiRequestEventBuilder event = AiRequestEvent.builder()
            .requestId(shortId)
            .featureName(aiFeature.value())
            .startTime(LocalDateTime.now());
        try {
            final Object outcome = pjp.proceed();
            if (outcome instanceof MonitorableResponse mr) {
                // The target exposed model/token details — copy them onto the
                // event and derive the estimated cost from them.
                final double cost = costCalculator.calculate(
                    mr.getProvider(), mr.getInputTokens(), mr.getOutputTokens());
                event.provider(mr.getProvider())
                     .modelName(mr.getModelName())
                     .inputTokens(mr.getInputTokens())
                     .outputTokens(mr.getOutputTokens())
                     .estimatedCostUsd(cost);
            }
            return outcome;
        } catch (ProviderRateLimitException e) {
            event.errorType("rate_limit").errorCode("429");
            throw e;
        } catch (ProviderException e) {
            event.errorType("provider_error");
            throw e;
        } catch (Exception e) {
            event.errorType("unknown_error");
            throw e;
        } finally {
            event.totalLatencyMs(System.currentTimeMillis() - startedAt);
            monitoringService.reportAsync(event.build());
        }
    }
}
/**
 * Marks a method as an AI feature entry point.
 * Methods carrying this annotation are intercepted for automatic
 * monitoring-data collection (see the @Around advice bound to it).
 */
@Target(ElementType.METHOD)
@Retention(RetentionPolicy.RUNTIME)
public @interface AiFeature {
    String value(); // feature name, used as the "feature" metric tag
}实时告警规则
/**
 * Metric-driven real-time alerting that detects sudden AI quality regressions.
 * Checks key indicators on a fixed schedule against static thresholds.
 *
 * Fixes vs. previous version:
 * - Micrometer's Timer has no percentile(double, TimeUnit) method; percentiles
 *   are read from takeSnapshot().percentileValues().
 * - Counters are registered per tag set, so find(name).counter() returned one
 *   arbitrary tagged instance and undercounted; totals now sum every match.
 */
@Service
@RequiredArgsConstructor
@Slf4j
public class AiQualityAlertService {
    private final MeterRegistry meterRegistry;
    private final AlertNotificationService alertService;

    // Per-metric alert thresholds.
    private static final double MAX_ERROR_RATE = 0.05;            // 5% error rate
    private static final double MAX_EMPTY_ANSWER_RATE = 0.02;     // 2% empty-answer rate
    private static final long MAX_P99_LATENCY_MS = 8000;          // P99 at 8 seconds
    private static final double MIN_FEEDBACK_SATISFACTION = 0.80; // 80% satisfaction

    /** Periodic sweep over the key indicators; fires an alert on each breach. */
    @Scheduled(fixedDelay = 60000) // check every minute
    public void checkAlerts() {
        checkErrorRate();
        checkLatency();
        checkUserSatisfaction();
        checkEmptyAnswerRate();
    }

    private void checkErrorRate() {
        // Intended to be the last-5-minute error rate; see the note on
        // calculateRecentErrorRate — currently a cumulative approximation.
        double errorRate = calculateRecentErrorRate(5);
        if (errorRate > MAX_ERROR_RATE) {
            alertService.sendAlert(Alert.builder()
                .severity(Alert.Severity.WARNING)
                .title("AI服务错误率过高")
                .message(String.format("最近5分钟错误率=%.1f%%,超过阈值%.1f%%",
                    errorRate * 100, MAX_ERROR_RATE * 100))
                .metric("ai.error_rate", errorRate)
                .build());
        }
    }

    private void checkLatency() {
        // Timers are registered per tag set; scan them all and alert on the
        // worst observed P99. Percentile values are only present when the timer
        // was registered with publishPercentiles(..., 0.99).
        double worstP99Ms = 0;
        for (Timer timer : meterRegistry.find("ai.request.duration").timers()) {
            for (io.micrometer.core.instrument.distribution.ValueAtPercentile vap
                    : timer.takeSnapshot().percentileValues()) {
                // Tolerance compare: configured percentile values are doubles.
                if (Math.abs(vap.percentile() - 0.99) < 1e-9) {
                    worstP99Ms = Math.max(worstP99Ms, vap.value(TimeUnit.MILLISECONDS));
                }
            }
        }
        if (worstP99Ms > MAX_P99_LATENCY_MS) {
            alertService.sendAlert(Alert.builder()
                .severity(Alert.Severity.WARNING)
                .title("AI请求P99延迟过高")
                .message(String.format("P99延迟=%.0fms,超过阈值%dms", worstP99Ms, MAX_P99_LATENCY_MS))
                .metric("ai.latency.p99", worstP99Ms)
                .build());
        }
    }

    private void checkUserSatisfaction() {
        double thumbsUp = getCounterValue("ai.feedback.thumbs_up");
        double thumbsDown = getCounterValue("ai.feedback.thumbs_down");
        double total = thumbsUp + thumbsDown;
        if (total < 10) return; // sample too small — don't alert on noise
        double satisfaction = thumbsUp / total;
        if (satisfaction < MIN_FEEDBACK_SATISFACTION) {
            alertService.sendAlert(Alert.builder()
                .severity(Alert.Severity.WARNING)
                .title("AI用户满意度下降")
                .message(String.format("用户满意度=%.1f%%(%d赞/%d踩),低于阈值%.1f%%",
                    satisfaction * 100, (long)thumbsUp, (long)thumbsDown,
                    MIN_FEEDBACK_SATISFACTION * 100))
                .metric("ai.satisfaction", satisfaction)
                .build());
        }
    }

    private void checkEmptyAnswerRate() {
        double emptyAnswers = getCounterValue("ai.quality.empty_answer");
        double totalRequests = getCounterValue("ai.requests.total");
        if (totalRequests < 50) return; // sample too small — don't alert on noise
        double emptyRate = emptyAnswers / totalRequests;
        if (emptyRate > MAX_EMPTY_ANSWER_RATE) {
            alertService.sendAlert(Alert.builder()
                .severity(Alert.Severity.ERROR)
                .title("AI空回答率异常")
                .message(String.format("空回答率=%.1f%%,超过阈值%.1f%%",
                    emptyRate * 100, MAX_EMPTY_ANSWER_RATE * 100))
                .metric("ai.empty_answer_rate", emptyRate)
                .build());
        }
    }

    /**
     * Error rate approximation. NOTE(review): the minutes parameter is unused —
     * a real time-windowed rate needs PromQL or an in-memory sliding window;
     * this computes the cumulative ratio since process start.
     */
    private double calculateRecentErrorRate(int minutes) {
        double errors = getCounterValue("ai.errors.total");
        double total = getCounterValue("ai.requests.total");
        return total > 0 ? errors / total : 0;
    }

    /**
     * Sums every counter registered under the given name across all tag sets.
     * (find(name).counter() returns a single arbitrary tagged counter and
     * would undercount.)
     */
    private double getCounterValue(String counterName) {
        double sum = 0;
        for (Counter counter : meterRegistry.find(counterName).counters()) {
            sum += counter.count();
        }
        return sum;
    }
}
Grafana看板核心面板设计
以下是关键Grafana面板的PromQL查询,可以直接用于配置:
# Grafana看板面板配置
# Fix: the P50/P90/P99 entries used the invalid form `- label: "P50": "<query>"`
# (two values for one key — not parseable YAML); each entry is now a proper
# mapping. The quantile queries also aggregate `by (le)` so histogram_quantile
# yields a single series instead of one per tag combination.
panels:
  - title: "AI请求量(QPM)"
    query: "rate(ai_requests_total[1m]) * 60"
    type: "time_series"
  - title: "请求错误率"
    query: |
      rate(ai_requests_total{success='false'}[5m])
      /
      rate(ai_requests_total[5m])
    type: "stat"
    thresholds: [0.02, 0.05] # 绿/黄/红
  - title: "延迟分布(P50/P90/P99)"
    queries:
      - label: "P50"
        query: "histogram_quantile(0.5, sum(rate(ai_request_duration_bucket[5m])) by (le))"
      - label: "P90"
        query: "histogram_quantile(0.9, sum(rate(ai_request_duration_bucket[5m])) by (le))"
      - label: "P99"
        query: "histogram_quantile(0.99, sum(rate(ai_request_duration_bucket[5m])) by (le))"
    type: "time_series"
  - title: "Token消耗(按提供商)"
    query: "sum(rate(ai_tokens_input_total[5m])) by (provider)"
    type: "bar_chart"
  - title: "用户满意度"
    query: |
      sum(ai_feedback_thumbs_up_total)
      /
      (sum(ai_feedback_thumbs_up_total) + sum(ai_feedback_thumbs_down_total))
    type: "stat"
    unit: "percentunit"
    thresholds: [0.7, 0.85]
  - title: "每日AI成本趋势(美元)"
    query: "sum(increase(ai_cost_usd_total[1d]))"
    type: "time_series"
  - title: "空检索率(RAG质量)"
    query: |
      rate(ai_rag_empty_retrieval_total[5m])
      /
      rate(ai_requests_total{feature=~".*rag.*"}[5m])
    type: "stat"
    thresholds: [0.05, 0.10]
  - title: "提供商分布(故障切换视图)"
    query: "sum(rate(ai_requests_total[5m])) by (provider, is_fallback)"
    type: "pie_chart"
自定义质量分看板
/**
 * Composite real-time quality score (0-100), exposed as a Micrometer Gauge
 * so Grafana can chart a single health number. Blends error rate, empty-answer
 * rate and user satisfaction.
 *
 * Fix: the rate helpers used find(name).counter(), which returns one arbitrary
 * tagged counter instance; since the collector registers these counters per tag
 * set (feature/provider/model/...), the ratios undercounted. Totals now sum
 * every matching counter.
 */
@Component
@RequiredArgsConstructor
public class AiQualityScoreGauge {
    private final MeterRegistry meterRegistry;

    @PostConstruct
    public void registerGauge() {
        // The gauge re-evaluates calculateScore() on every scrape.
        Gauge.builder("ai.quality.composite_score", this,
                AiQualityScoreGauge::calculateScore)
            .description("AI综合质量分(0-100)")
            .register(meterRegistry);
    }

    /** Weighted score: start at 100, subtract penalties, clamp to [0, 100]. */
    private double calculateScore() {
        double errorRate = getSafeRate("ai.errors.total", "ai.requests.total");
        double emptyRate = getSafeRate("ai.quality.empty_answer", "ai.requests.total");
        double satisfactionRate = calculateSatisfactionRate();
        double score = 100;
        score -= errorRate * 500;              // each 1% error rate costs 5 points
        score -= emptyRate * 300;              // each 1% empty-answer rate costs 3 points
        score -= (1 - satisfactionRate) * 200; // each 1% dissatisfaction costs 2 points
        return Math.max(0, Math.min(100, score));
    }

    /** numerator/denominator ratio over ALL tagged instances; 0 when no denominator. */
    private double getSafeRate(String numerator, String denominator) {
        double num = sumCounters(numerator);
        double den = sumCounters(denominator);
        return den > 0 ? num / den : 0;
    }

    /** Thumbs-up share of all feedback; 1.0 (no penalty) when there is no feedback yet. */
    private double calculateSatisfactionRate() {
        double up = sumCounters("ai.feedback.thumbs_up");
        double down = sumCounters("ai.feedback.thumbs_down");
        double total = up + down;
        return total > 0 ? up / total : 1.0;
    }

    /** Sum of every counter registered under the given name, across tag sets. */
    private double sumCounters(String name) {
        double sum = 0;
        for (Counter counter : meterRegistry.find(name).counters()) {
            sum += counter.count();
        }
        return sum;
    }
}
监控体系建立后,最重要的是设置告警阈值后要认真处理每一个告警。很多团队把告警配了但不处理,久了变成"狼来了"——真正的问题出现时也没人关注。
AI应用特有的一个监控窗口期是上线后的前48小时:这时候用户行为最多样,很多edge case会在这段时间集中出现。这个时期要把告警阈值设严格一些,宁可多几个误报,也要确保不漏掉真实问题。
