第2055篇:AI应用可观测性——追踪LLM调用链路的完整方案
2026/4/30 · 大约 5 分钟
第2055篇:AI应用可观测性——追踪LLM调用链路的完整方案
适读人群:负责AI应用运维和质量保障的工程师 | 阅读时长:约19分钟 | 核心价值:建立LLM应用的完整可观测性体系,快速定位问题和优化性能
上线第一周,用户反馈"AI回答很慢"。
我登上去查日志,发现日志里只有"LLM调用成功",没有任何关于哪个阶段慢的信息。根本没办法定位问题。
这是AI应用可观测性不够的典型问题。LLM应用的调用链路比普通接口复杂——RAG检索、Prompt组装、LLM调用、后处理——每个阶段都可能是瓶颈,必须有完整的追踪。
LLM应用的可观测性模型
结构化日志
/**
 * Structured logging for LLM calls.
 * Each call is emitted as a single JSON payload behind a fixed marker token
 * ("LLM_CALL" / "LLM_CALL_FAILED") so log aggregators can parse and analyze it.
 */
@Component
@RequiredArgsConstructor
@Slf4j
public class LlmCallLogger {

    private final ObjectMapper objectMapper;

    /**
     * Logs the complete information of one LLM call.
     * Successful calls go to INFO, failed calls to WARN. JSON serialization
     * failures are reported at ERROR and otherwise swallowed, so logging can
     * never break the business flow.
     *
     * @param record the call record to serialize and log
     */
    public void logLlmCall(LlmCallRecord record) {
        try {
            // Structured logging: MDC context + JSON body for easy aggregation.
            String json = objectMapper.writeValueAsString(record);
            if (record.isSuccess()) {
                log.info("LLM_CALL {}", json);
            } else {
                log.warn("LLM_CALL_FAILED {}", json);
            }
        } catch (JsonProcessingException e) {
            log.error("记录LLM调用日志失败", e);
        }
    }

    /**
     * One LLM call's worth of observability data (Lombok builder/DTO).
     */
    @Data
    @Builder
    public static class LlmCallRecord {
        private String traceId;
        private String spanId;
        private String userId;
        private String sessionId;
        private String modelName;
        // Prompt info (sanitized before logging)
        private String systemPromptHash; // hash of the system prompt (never the raw text, to prevent leaks)
        private String userInputPreview; // first 100 characters of the user input
        private int userInputLength;
        // Token usage
        private int inputTokens;
        private int outputTokens;
        private int totalTokens;
        // Per-stage latency
        private long retrievalDurationMs; // RAG retrieval time
        private long promptAssemblyMs;    // prompt assembly time
        private long llmCallDurationMs;   // LLM call time
        private long totalDurationMs;     // end-to-end time
        // Outcome
        private boolean success;
        private String errorType;
        private String errorMessage;
        // RAG-specific
        private int retrievedDocCount;
        private double topSimilarityScore;
        // Cost estimate
        private double estimatedCost;
        private long timestamp;
    }
}分布式追踪集成
/**
 * OpenTelemetry tracing for LLM calls.
 * Renders the LLM call chain in tracing backends such as Jaeger/Zipkin.
 *
 * Fixes vs. the original:
 * - tracedRagQuery previously wrapped each stage in traceRagQuery (creating an
 *   "llm.retrieval"/"llm.generation" span) AND opened a second inner span
 *   ("rag.retrieval"/"rag.generation") for the very same operation, producing
 *   duplicate nested spans for every stage. Each stage now gets exactly one
 *   child span, keeping the original "rag.*" span names.
 * - truncate() is now null-safe instead of throwing NPE.
 */
@Component
@RequiredArgsConstructor
@Slf4j
public class LlmTracingService {

    private final Tracer tracer; // OpenTelemetry Tracer

    /**
     * Runs {@code operation} inside a span named {@code llm.<operationName>}.
     * The span records the (truncated) query and operation name, captures any
     * exception, and is always ended.
     *
     * @param operationName logical name of the traced operation
     * @param query         user query (truncated to 200 chars as an attribute)
     * @param operation     work to execute inside the span
     * @return the operation's result
     */
    public <T> T traceRagQuery(String operationName,
                               String query,
                               Supplier<T> operation) {
        Span span = tracer.spanBuilder("llm." + operationName)
                .setAttribute("llm.query", truncate(query, 200))
                .setAttribute("llm.operation", operationName)
                .startSpan();
        try (Scope scope = span.makeCurrent()) {
            T result = operation.get();
            span.setStatus(StatusCode.OK);
            return result;
        } catch (Exception e) {
            span.setStatus(StatusCode.ERROR, e.getMessage());
            span.recordException(e);
            throw e;
        } finally {
            span.end();
        }
    }

    /**
     * Full trace of one RAG query: a root span "rag.query" with exactly two
     * child spans, "rag.retrieval" and "rag.generation".
     *
     * @param userId   id of the requesting user (root-span attribute)
     * @param query    the user's query
     * @param executor callbacks performing retrieval and generation
     * @return the generated answer
     */
    public String tracedRagQuery(String userId, String query,
                                 RagQueryExecutor executor) {
        Span rootSpan = tracer.spanBuilder("rag.query")
                .setAttribute("user.id", userId)
                .setAttribute("query.length", query.length())
                .startSpan();
        try (Scope rootScope = rootSpan.makeCurrent()) {
            // Child span: vector retrieval.
            List<String> contexts;
            Span retrievalSpan = tracer.spanBuilder("rag.retrieval").startSpan();
            try (Scope s = retrievalSpan.makeCurrent()) {
                contexts = executor.retrieve(query);
                retrievalSpan.setAttribute("retrieved.count", contexts.size());
            } catch (Exception e) {
                retrievalSpan.setStatus(StatusCode.ERROR, e.getMessage());
                retrievalSpan.recordException(e);
                throw e;
            } finally {
                retrievalSpan.end();
            }

            // Child span: LLM generation over the retrieved contexts.
            String answer;
            Span generationSpan = tracer.spanBuilder("rag.generation")
                    .setAttribute("context.count", contexts.size())
                    .startSpan();
            try (Scope s = generationSpan.makeCurrent()) {
                answer = executor.generate(query, contexts);
                generationSpan.setAttribute("answer.length", answer.length());
            } catch (Exception e) {
                generationSpan.setStatus(StatusCode.ERROR, e.getMessage());
                generationSpan.recordException(e);
                throw e;
            } finally {
                generationSpan.end();
            }

            rootSpan.setStatus(StatusCode.OK);
            return answer;
        } catch (Exception e) {
            rootSpan.setStatus(StatusCode.ERROR, e.getMessage());
            rootSpan.recordException(e);
            throw e;
        } finally {
            rootSpan.end();
        }
    }

    /** Truncates attribute text to maxLength chars; null-safe (returns ""). */
    private String truncate(String text, int maxLength) {
        if (text == null) {
            return "";
        }
        return text.length() > maxLength ? text.substring(0, maxLength) + "..." : text;
    }

    /** Callbacks supplied by the caller to perform the actual RAG work. */
    public interface RagQueryExecutor {
        List<String> retrieve(String query);
        String generate(String query, List<String> contexts);
    }
}
Prometheus指标收集
/**
 * Prometheus metrics for the LLM application (via Micrometer),
 * visualized with Grafana.
 *
 * Fixes vs. the original:
 * - removed @RequiredArgsConstructor: Lombok would generate a second
 *   constructor (for the final meter fields) alongside the explicit
 *   MeterRegistry constructor, making Spring's constructor injection
 *   ambiguous and the code misleading;
 * - replaced Guava's AtomicDouble with the JDK's AtomicInteger — the active
 *   session count is integral, and this drops a third-party dependency.
 */
@Component
public class LlmMetricsCollector {

    // Counters (monotonic totals)
    private final Counter llmCallTotal;
    private final Counter llmCallErrors;
    private final Counter tokenInputTotal;
    private final Counter tokenOutputTotal;

    // Distribution summaries (for percentile latencies)
    private final DistributionSummary llmCallDuration;
    private final DistributionSummary retrievalDuration;

    // Gauge backing value (current state)
    private final AtomicInteger activeSessionCount = new AtomicInteger(0);

    public LlmMetricsCollector(MeterRegistry registry) {
        this.llmCallTotal = Counter.builder("llm_calls_total")
                .description("LLM调用总次数")
                .register(registry);
        this.llmCallErrors = Counter.builder("llm_call_errors_total")
                .description("LLM调用失败次数")
                .register(registry);
        this.tokenInputTotal = Counter.builder("llm_tokens_input_total")
                .description("输入Token总量")
                .register(registry);
        this.tokenOutputTotal = Counter.builder("llm_tokens_output_total")
                .description("输出Token总量")
                .register(registry);
        this.llmCallDuration = DistributionSummary.builder("llm_call_duration_ms")
                .description("LLM调用耗时(毫秒)")
                .percentilePrecision(2)
                .publishPercentiles(0.5, 0.95, 0.99)
                .register(registry);
        this.retrievalDuration = DistributionSummary.builder("rag_retrieval_duration_ms")
                .description("RAG检索耗时(毫秒)")
                .percentilePrecision(2)
                .publishPercentiles(0.5, 0.95, 0.99)
                .register(registry);
        // Gauge samples the backing AtomicInteger on scrape.
        Gauge.builder("llm_active_sessions", activeSessionCount, AtomicInteger::get)
                .description("当前活跃会话数")
                .register(registry);
    }

    /**
     * Records one finished LLM call: totals, error count, token usage and
     * latency distributions. Retrieval duration is only recorded when the
     * call actually performed retrieval (duration > 0).
     */
    public void recordLlmCall(LlmCallRecord record) {
        llmCallTotal.increment();
        if (!record.isSuccess()) {
            llmCallErrors.increment();
        }
        tokenInputTotal.increment(record.getInputTokens());
        tokenOutputTotal.increment(record.getOutputTokens());
        llmCallDuration.record(record.getLlmCallDurationMs());
        if (record.getRetrievalDurationMs() > 0) {
            retrievalDuration.record(record.getRetrievalDurationMs());
        }
    }

    /** Marks one more session as active. */
    public void incrementActiveSessions() {
        activeSessionCount.incrementAndGet();
    }

    /** Marks one session as no longer active. */
    public void decrementActiveSessions() {
        activeSessionCount.decrementAndGet();
    }
}
Grafana监控大盘配置
重要的监控指标及其告警阈值:
# grafana-dashboard-alerts.yaml
# 核心指标告警配置
alerts:
- name: "LLM调用P99延迟过高"
query: "histogram_quantile(0.99, llm_call_duration_ms)"
threshold: 10000 # 超过10秒告警
severity: "critical"
- name: "LLM错误率过高"
query: "rate(llm_call_errors_total[5m]) / rate(llm_calls_total[5m])"
threshold: 0.05 # 超过5%告警
severity: "warning"
- name: "Token消耗速率异常"
query: "rate(llm_tokens_input_total[1h]) > 1000000" # 每小时超100万Token
threshold: 1000000
severity: "warning"
- name: "RAG检索耗时过长"
query: "histogram_quantile(0.95, rag_retrieval_duration_ms)"
threshold: 2000 # P95超过2秒
severity: "warning"完整的可观测性集成示例
/**
* 把日志、指标、追踪集成到一个AOP切面
*/
/**
 * AOP aspect that ties the observability pillars (structured log + metrics)
 * to any method annotated with @ObserveLlmCall.
 *
 * Fix vs. the original: pjp.proceed() is declared to throw Throwable, but
 * only Exception was caught — Errors (OOM, StackOverflow, ...) bypassed
 * logging and metrics entirely. We now catch Throwable, record it, and
 * rethrow. Record finalization is factored into a private helper to avoid
 * the duplicated build/log/record sequence.
 */
@Aspect
@Component
@RequiredArgsConstructor
@Slf4j
public class LlmObservabilityAspect {

    private final LlmCallLogger callLogger;
    private final LlmMetricsCollector metricsCollector;
    // NOTE(review): injected but not used in this aspect — presumably spans are
    // opened by LlmTracingService inside the advised methods themselves;
    // confirm, and remove the field if that is not the plan.
    private final LlmTracingService tracingService;

    /**
     * Wraps the advised call: measures total duration, then emits one
     * structured log line and one metrics sample, for success and failure alike.
     *
     * @param pjp      the intercepted join point
     * @param observed annotation carrying an optional operation name
     * @return the advised method's result
     * @throws Throwable whatever the advised method threw, unchanged
     */
    @Around("@annotation(observed)")
    public Object observe(ProceedingJoinPoint pjp, ObserveLlmCall observed) throws Throwable {
        String operationName = observed.value().isEmpty()
                ? pjp.getSignature().getName() : observed.value();
        long startTime = System.currentTimeMillis();
        String traceId = MDC.get("traceId");
        log.debug("LLM observability: start {}", operationName);
        LlmCallLogger.LlmCallRecord.LlmCallRecordBuilder recordBuilder =
                LlmCallLogger.LlmCallRecord.builder()
                        .traceId(traceId)
                        .timestamp(startTime);
        try {
            Object result = pjp.proceed();
            emit(recordBuilder.success(true), startTime);
            return result;
        } catch (Throwable t) { // Throwable, not Exception: don't lose Errors
            emit(recordBuilder
                    .success(false)
                    .errorType(t.getClass().getSimpleName())
                    .errorMessage(t.getMessage()), startTime);
            throw t;
        }
    }

    /** Completes the record with total duration and fans it out to log + metrics. */
    private void emit(LlmCallLogger.LlmCallRecord.LlmCallRecordBuilder builder, long startTime) {
        LlmCallLogger.LlmCallRecord record = builder
                .totalDurationMs(System.currentTimeMillis() - startTime)
                .build();
        callLogger.logLlmCall(record);
        metricsCollector.recordLlmCall(record);
    }
}
/**
 * Marks a method for LLM observability (structured log + metrics),
 * handled by LlmObservabilityAspect.
 */
@Target(ElementType.METHOD)
@Retention(RetentionPolicy.RUNTIME)
public @interface ObserveLlmCall {
    /** Optional operation name; when empty, the advised method's name is used. */
    String value() default "";
}可观测性是AI应用稳定运行的基础。没有完整的日志、指标和追踪,AI应用就是在"盲飞"——出了问题不知道哪里出的,优化了不知道有没有效果。
