第2055篇:AI应用可观测性——追踪LLM调用链路的完整方案
2026/4/30 · 大约 5 分钟
第2055篇:AI应用可观测性——追踪LLM调用链路的完整方案
适读人群:负责AI应用运维和质量保障的工程师 | 阅读时长:约19分钟 | 核心价值:建立LLM应用的完整可观测性体系,快速定位问题和优化性能
上线第一周,用户反馈"AI回答很慢"。
我登上去查日志,发现日志里只有"LLM调用成功",没有任何关于哪个阶段慢的信息。根本没办法定位问题。
这是AI应用可观测性不够的典型问题。LLM应用的调用链路比普通接口复杂——RAG检索、Prompt组装、LLM调用、后处理——每个阶段都可能是瓶颈,必须有完整的追踪。
LLM应用的可观测性模型
结构化日志
/**
 * Structured logging for LLM calls.
 * Each call is emitted as a single JSON payload behind a fixed marker token
 * ("LLM_CALL" / "LLM_CALL_FAILED") so log aggregators can parse and analyze it.
 */
@Component
@RequiredArgsConstructor
@Slf4j
public class LlmCallLogger {

    private final ObjectMapper objectMapper;

    /**
     * Logs the complete information of one LLM call.
     * Successful calls go to INFO, failed calls to WARN. JSON serialization
     * failures are reported at ERROR and otherwise swallowed, so logging can
     * never break the business flow.
     *
     * @param record the call record to serialize and log
     */
    public void logLlmCall(LlmCallRecord record) {
        try {
            // Structured logging: MDC context + JSON body for easy aggregation.
            String json = objectMapper.writeValueAsString(record);
            if (record.isSuccess()) {
                log.info("LLM_CALL {}", json);
            } else {
                log.warn("LLM_CALL_FAILED {}", json);
            }
        } catch (JsonProcessingException e) {
            log.error("记录LLM调用日志失败", e);
        }
    }

    /**
     * One LLM call's worth of observability data (Lombok builder/DTO).
     */
    @Data
    @Builder
    public static class LlmCallRecord {
        private String traceId;
        private String spanId;
        private String userId;
        private String sessionId;
        private String modelName;
        // Prompt info (sanitized before logging)
        private String systemPromptHash; // hash of the system prompt (never the raw text, to prevent leaks)
        private String userInputPreview; // first 100 characters of the user input
        private int userInputLength;
        // Token usage
        private int inputTokens;
        private int outputTokens;
        private int totalTokens;
        // Per-stage latency
        private long retrievalDurationMs; // RAG retrieval time
        private long promptAssemblyMs;    // prompt assembly time
        private long llmCallDurationMs;   // LLM call time
        private long totalDurationMs;     // end-to-end time
        // Outcome
        private boolean success;
        private String errorType;
        private String errorMessage;
        // RAG-specific
        private int retrievedDocCount;
        private double topSimilarityScore;
        // Cost estimate
        private double estimatedCost;
        private long timestamp;
    }
}分布式追踪集成
/**
 * OpenTelemetry tracing for LLM calls.
 * Renders the LLM call chain in tracing backends such as Jaeger/Zipkin.
 *
 * Fixes vs. the original:
 * - tracedRagQuery previously wrapped each stage in traceRagQuery (creating an
 *   "llm.retrieval"/"llm.generation" span) AND opened a second inner span
 *   ("rag.retrieval"/"rag.generation") for the very same operation, producing
 *   duplicate nested spans for every stage. Each stage now gets exactly one
 *   child span, keeping the original "rag.*" span names.
 * - truncate() is now null-safe instead of throwing NPE.
 */
@Component
@RequiredArgsConstructor
@Slf4j
public class LlmTracingService {

    private final Tracer tracer; // OpenTelemetry Tracer

    /**
     * Runs {@code operation} inside a span named {@code llm.<operationName>}.
     * The span records the (truncated) query and operation name, captures any
     * exception, and is always ended.
     *
     * @param operationName logical name of the traced operation
     * @param query         user query (truncated to 200 chars as an attribute)
     * @param operation     work to execute inside the span
     * @return the operation's result
     */
    public <T> T traceRagQuery(String operationName,
                               String query,
                               Supplier<T> operation) {
        Span span = tracer.spanBuilder("llm." + operationName)
                .setAttribute("llm.query", truncate(query, 200))
                .setAttribute("llm.operation", operationName)
                .startSpan();
        try (Scope scope = span.makeCurrent()) {
            T result = operation.get();
            span.setStatus(StatusCode.OK);
            return result;
        } catch (Exception e) {
            span.setStatus(StatusCode.ERROR, e.getMessage());
            span.recordException(e);
            throw e;
        } finally {
            span.end();
        }
    }

    /**
     * Full trace of one RAG query: a root span "rag.query" with exactly two
     * child spans, "rag.retrieval" and "rag.generation".
     *
     * @param userId   id of the requesting user (root-span attribute)
     * @param query    the user's query
     * @param executor callbacks performing retrieval and generation
     * @return the generated answer
     */
    public String tracedRagQuery(String userId, String query,
                                 RagQueryExecutor executor) {
        Span rootSpan = tracer.spanBuilder("rag.query")
                .setAttribute("user.id", userId)
                .setAttribute("query.length", query.length())
                .startSpan();
        try (Scope rootScope = rootSpan.makeCurrent()) {
            // Child span: vector retrieval.
            List<String> contexts;
            Span retrievalSpan = tracer.spanBuilder("rag.retrieval").startSpan();
            try (Scope s = retrievalSpan.makeCurrent()) {
                contexts = executor.retrieve(query);
                retrievalSpan.setAttribute("retrieved.count", contexts.size());
            } catch (Exception e) {
                retrievalSpan.setStatus(StatusCode.ERROR, e.getMessage());
                retrievalSpan.recordException(e);
                throw e;
            } finally {
                retrievalSpan.end();
            }

            // Child span: LLM generation over the retrieved contexts.
            String answer;
            Span generationSpan = tracer.spanBuilder("rag.generation")
                    .setAttribute("context.count", contexts.size())
                    .startSpan();
            try (Scope s = generationSpan.makeCurrent()) {
                answer = executor.generate(query, contexts);
                generationSpan.setAttribute("answer.length", answer.length());
            } catch (Exception e) {
                generationSpan.setStatus(StatusCode.ERROR, e.getMessage());
                generationSpan.recordException(e);
                throw e;
            } finally {
                generationSpan.end();
            }

            rootSpan.setStatus(StatusCode.OK);
            return answer;
        } catch (Exception e) {
            rootSpan.setStatus(StatusCode.ERROR, e.getMessage());
            rootSpan.recordException(e);
            throw e;
        } finally {
            rootSpan.end();
        }
    }

    /** Truncates attribute text to maxLength chars; null-safe (returns ""). */
    private String truncate(String text, int maxLength) {
        if (text == null) {
            return "";
        }
        return text.length() > maxLength ? text.substring(0, maxLength) + "..." : text;
    }

    /** Callbacks supplied by the caller to perform the actual RAG work. */
    public interface RagQueryExecutor {
        List<String> retrieve(String query);
        String generate(String query, List<String> contexts);
    }
}
Prometheus指标收集
/**
 * Prometheus metrics for the LLM application (via Micrometer),
 * visualized with Grafana.
 *
 * Fixes vs. the original:
 * - removed @RequiredArgsConstructor: Lombok would generate a second
 *   constructor (for the final meter fields) alongside the explicit
 *   MeterRegistry constructor, making Spring's constructor injection
 *   ambiguous and the code misleading;
 * - replaced Guava's AtomicDouble with the JDK's AtomicInteger — the active
 *   session count is integral, and this drops a third-party dependency.
 */
@Component
public class LlmMetricsCollector {

    // Counters (monotonic totals)
    private final Counter llmCallTotal;
    private final Counter llmCallErrors;
    private final Counter tokenInputTotal;
    private final Counter tokenOutputTotal;

    // Distribution summaries (for percentile latencies)
    private final DistributionSummary llmCallDuration;
    private final DistributionSummary retrievalDuration;

    // Gauge backing value (current state)
    private final AtomicInteger activeSessionCount = new AtomicInteger(0);

    public LlmMetricsCollector(MeterRegistry registry) {
        this.llmCallTotal = Counter.builder("llm_calls_total")
                .description("LLM调用总次数")
                .register(registry);
        this.llmCallErrors = Counter.builder("llm_call_errors_total")
                .description("LLM调用失败次数")
                .register(registry);
        this.tokenInputTotal = Counter.builder("llm_tokens_input_total")
                .description("输入Token总量")
                .register(registry);
        this.tokenOutputTotal = Counter.builder("llm_tokens_output_total")
                .description("输出Token总量")
                .register(registry);
        this.llmCallDuration = DistributionSummary.builder("llm_call_duration_ms")
                .description("LLM调用耗时(毫秒)")
                .percentilePrecision(2)
                .publishPercentiles(0.5, 0.95, 0.99)
                .register(registry);
        this.retrievalDuration = DistributionSummary.builder("rag_retrieval_duration_ms")
                .description("RAG检索耗时(毫秒)")
                .percentilePrecision(2)
                .publishPercentiles(0.5, 0.95, 0.99)
                .register(registry);
        // Gauge samples the backing AtomicInteger on scrape.
        Gauge.builder("llm_active_sessions", activeSessionCount, AtomicInteger::get)
                .description("当前活跃会话数")
                .register(registry);
    }

    /**
     * Records one finished LLM call: totals, error count, token usage and
     * latency distributions. Retrieval duration is only recorded when the
     * call actually performed retrieval (duration > 0).
     */
    public void recordLlmCall(LlmCallRecord record) {
        llmCallTotal.increment();
        if (!record.isSuccess()) {
            llmCallErrors.increment();
        }
        tokenInputTotal.increment(record.getInputTokens());
        tokenOutputTotal.increment(record.getOutputTokens());
        llmCallDuration.record(record.getLlmCallDurationMs());
        if (record.getRetrievalDurationMs() > 0) {
            retrievalDuration.record(record.getRetrievalDurationMs());
        }
    }

    /** Marks one more session as active. */
    public void incrementActiveSessions() {
        activeSessionCount.incrementAndGet();
    }

    /** Marks one session as no longer active. */
    public void decrementActiveSessions() {
        activeSessionCount.decrementAndGet();
    }
}
Grafana监控大盘配置
重要的监控指标及其告警阈值:
# grafana-dashboard-alerts.yaml
# 核心指标告警配置
alerts:
- name: "LLM调用P99延迟过高"
query: "histogram_quantile(0.99, llm_call_duration_ms)"
threshold: 10000 # 超过10秒告警
severity: "critical"
- name: "LLM错误率过高"
query: "rate(llm_call_errors_total[5m]) / rate(llm_calls_total[5m])"
threshold: 0.05 # 超过5%告警
severity: "warning"
- name: "Token消耗速率异常"
query: "rate(llm_tokens_input_total[1h]) > 1000000" # 每小时超100万Token
threshold: 1000000
severity: "warning"
- name: "RAG检索耗时过长"
query: "histogram_quantile(0.95, rag_retrieval_duration_ms)"
threshold: 2000 # P95超过2秒
severity: "warning"完整的可观测性集成示例
/**
* 把日志、指标、追踪集成到一个AOP切面
*/
/**
 * AOP aspect that ties the observability pillars (structured log + metrics)
 * to any method annotated with @ObserveLlmCall.
 *
 * Fix vs. the original: pjp.proceed() is declared to throw Throwable, but
 * only Exception was caught — Errors (OOM, StackOverflow, ...) bypassed
 * logging and metrics entirely. We now catch Throwable, record it, and
 * rethrow. Record finalization is factored into a private helper to avoid
 * the duplicated build/log/record sequence.
 */
@Aspect
@Component
@RequiredArgsConstructor
@Slf4j
public class LlmObservabilityAspect {

    private final LlmCallLogger callLogger;
    private final LlmMetricsCollector metricsCollector;
    // NOTE(review): injected but not used in this aspect — presumably spans are
    // opened by LlmTracingService inside the advised methods themselves;
    // confirm, and remove the field if that is not the plan.
    private final LlmTracingService tracingService;

    /**
     * Wraps the advised call: measures total duration, then emits one
     * structured log line and one metrics sample, for success and failure alike.
     *
     * @param pjp      the intercepted join point
     * @param observed annotation carrying an optional operation name
     * @return the advised method's result
     * @throws Throwable whatever the advised method threw, unchanged
     */
    @Around("@annotation(observed)")
    public Object observe(ProceedingJoinPoint pjp, ObserveLlmCall observed) throws Throwable {
        String operationName = observed.value().isEmpty()
                ? pjp.getSignature().getName() : observed.value();
        long startTime = System.currentTimeMillis();
        String traceId = MDC.get("traceId");
        log.debug("LLM observability: start {}", operationName);
        LlmCallLogger.LlmCallRecord.LlmCallRecordBuilder recordBuilder =
                LlmCallLogger.LlmCallRecord.builder()
                        .traceId(traceId)
                        .timestamp(startTime);
        try {
            Object result = pjp.proceed();
            emit(recordBuilder.success(true), startTime);
            return result;
        } catch (Throwable t) { // Throwable, not Exception: don't lose Errors
            emit(recordBuilder
                    .success(false)
                    .errorType(t.getClass().getSimpleName())
                    .errorMessage(t.getMessage()), startTime);
            throw t;
        }
    }

    /** Completes the record with total duration and fans it out to log + metrics. */
    private void emit(LlmCallLogger.LlmCallRecord.LlmCallRecordBuilder builder, long startTime) {
        LlmCallLogger.LlmCallRecord record = builder
                .totalDurationMs(System.currentTimeMillis() - startTime)
                .build();
        callLogger.logLlmCall(record);
        metricsCollector.recordLlmCall(record);
    }
}
/**
 * Marks a method for LLM observability (structured log + metrics),
 * handled by LlmObservabilityAspect.
 */
@Target(ElementType.METHOD)
@Retention(RetentionPolicy.RUNTIME)
public @interface ObserveLlmCall {
    /** Optional operation name; when empty, the advised method's name is used. */
    String value() default "";
}可观测性是AI应用稳定运行的基础。没有完整的日志、指标和追踪,AI应用就是在"盲飞"——出了问题不知道哪里出的,优化了不知道有没有效果。
