用 Spring Boot Actuator 监控 AI 应用——自定义健康检查和指标
2026/4/30 · 大约 9 分钟
用 Spring Boot Actuator 监控 AI 应用——自定义健康检查和指标
凌晨两点,我的手机响了。告警说我们的 AI 服务"不健康"。
我打开看,Spring Boot Actuator 的 /health 接口返回了 DOWN。但进去一查,是 Redis 连接池稍微有点满,触发了默认的健康检查阈值。实际上 AI 服务的所有核心功能完全正常——向量库在线,LLM API 可用,对话功能没有任何问题。
这次误告警让我开始重新审视 AI 应用的监控设计。
问题的根本是:Spring Boot Actuator 的默认健康检查是为通用 Web 应用设计的,它不了解 AI 应用的特殊性:
- 向量库(Qdrant、Pinecone、Chroma)的可用性
- LLM API 的可用性(不只是"能不能 ping 通",而是"能不能真的调用")
- 模型加载状态(如果用了本地模型)
- 语义缓存的命中率(影响性能但不影响可用性)
- Token 配额剩余量(快用完时应该告警但不应该标记为不健康)
今天这篇文章,把 AI 应用专属的 Actuator 扩展方案系统梳理一遍。
Actuator 的扩展点
Spring Boot Actuator 有几个核心扩展点:
- HealthIndicator:自定义健康检查组件,影响 /health 接口的状态
- HealthContributor:组合多个 HealthIndicator
- @Endpoint:自定义 Actuator 端点,暴露任意数据
- MeterRegistry(Micrometer):注册自定义指标,推送到 Prometheus/Grafana
对于 AI 应用,我的建议是:
- 用 HealthIndicator 做核心依赖的健康检查(向量库、LLM API)
- 用 @Endpoint 暴露 AI 专属的运行时信息(当前模型、配额状态、缓存统计)
- 用 MeterRegistry 记录业务指标(token 消耗、延迟分布、错误率)
代码:AI 健康检查实现
LLM API 健康检查
import java.time.Duration;
import java.time.Instant;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.atomic.AtomicReference;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.springframework.ai.anthropic.AnthropicChatOptions;
import org.springframework.ai.chat.model.ChatModel;
import org.springframework.ai.chat.model.ChatResponse;
import org.springframework.ai.chat.prompt.Prompt;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.actuate.health.Health;
import org.springframework.boot.actuate.health.HealthIndicator;
import org.springframework.stereotype.Component;
/**
 * Health check for the LLM API.
 *
 * Note: this indicator does NOT hit the LLM on every /health request
 * (that would be expensive). Strategy:
 * 1. judge by the success rate of recent real calls (sliding window)
 * 2. otherwise send a tiny probe request with a hard maxTokens cap
 * 3. cache the result for a while so repeated /health calls are free
 */
@Component("llmApiHealth")
public class LLMApiHealthIndicator implements HealthIndicator {

    private static final Logger log = LoggerFactory.getLogger(LLMApiHealthIndicator.class);

    /** How long a check result is reused before probing again. */
    private static final Duration CACHE_DURATION = Duration.ofMinutes(5);

    /** Length of the sliding window used for real-call statistics. */
    private static final Duration STATS_WINDOW = Duration.ofMinutes(15);

    /** Chat client used for probe calls (constructor injection instead of field injection). */
    private final ChatModel chatModel;

    // Cached result of the last health check, so /health does not call the API each time.
    private final AtomicReference<CachedHealthResult> cachedResult = new AtomicReference<>();

    // Sliding-window counters of recent real calls, fed by recordSuccess()/recordFailure().
    private final AtomicLong recentSuccessCount = new AtomicLong(0);
    private final AtomicLong recentFailureCount = new AtomicLong(0);
    private final AtomicLong lastWindowReset = new AtomicLong(System.currentTimeMillis());

    public LLMApiHealthIndicator(ChatModel chatModel) {
        this.chatModel = chatModel;
    }

    @Override
    public Health health() {
        // Fast path: serve the cached result while it is still fresh.
        CachedHealthResult cached = cachedResult.get();
        if (cached != null && !cached.isExpired()) {
            return cached.health;
        }
        // Slow path under a lock. Without it, several concurrent /health
        // requests that all see an expired cache would each fire a paid probe.
        synchronized (this) {
            cached = cachedResult.get();
            if (cached != null && !cached.isExpired()) {
                return cached.health;
            }
            Health result = checkLLMHealth();
            cachedResult.set(new CachedHealthResult(result));
            return result;
        }
    }

    /**
     * Runs the actual check: first the recent-call failure rate, then a probe.
     */
    private Health checkLLMHealth() {
        // Strategy 1: failure rate of recent real calls.
        resetWindowIfNeeded();
        long success = recentSuccessCount.get();
        long failure = recentFailureCount.get();
        long total = success + failure;
        if (total >= 10) {
            // Enough samples to judge by the observed failure rate.
            double failureRate = (double) failure / total * 100;
            if (failureRate > 50) {
                return Health.down()
                        .withDetail("reason", "最近调用失败率过高")
                        .withDetail("failureRate", String.format("%.1f%%", failureRate))
                        .withDetail("totalCalls", total)
                        .build();
            }
        }
        // Strategy 2: cheapest possible probe request.
        try {
            long startMs = System.currentTimeMillis();
            // The probe still consumes tokens, so callProbe() caps maxTokens hard.
            callProbe();
            long durationMs = System.currentTimeMillis() - startMs;
            return Health.up()
                    .withDetail("responseTimeMs", durationMs)
                    .withDetail("recentSuccessCount", success)
                    .withDetail("recentFailureCount", failure)
                    .withDetail("probeStatus", "ok")
                    .build();
        } catch (Exception e) {
            log.warn("LLM API 健康探测失败: {}", e.getMessage());
            return Health.down()
                    .withDetail("reason", "探测请求失败")
                    .withDetail("error", e.getMessage())
                    .withDetail("recentSuccessCount", success)
                    .withDetail("recentFailureCount", failure)
                    .build();
        }
    }

    /**
     * Sends a minimal chat request; throws if the provider is unreachable.
     */
    private String callProbe() {
        AnthropicChatOptions probeOptions = AnthropicChatOptions.builder()
                .withMaxTokens(5) // 5 tokens are plenty for a liveness probe; keeps cost negligible
                .build();
        Prompt probePrompt = new Prompt("Hi", probeOptions);
        ChatResponse response = chatModel.call(probePrompt);
        return response.getResult().getOutput().getContent();
    }

    /**
     * Called from the real request path: records a successful LLM call.
     */
    public void recordSuccess() {
        resetWindowIfNeeded();
        recentSuccessCount.incrementAndGet();
    }

    /**
     * Called from the real request path: records a failed LLM call.
     */
    public void recordFailure() {
        resetWindowIfNeeded();
        recentFailureCount.incrementAndGet();
    }

    /**
     * Resets the sliding-window counters once STATS_WINDOW has elapsed.
     * compareAndSet ensures only one concurrent caller performs the reset.
     */
    private void resetWindowIfNeeded() {
        long now = System.currentTimeMillis();
        long lastReset = lastWindowReset.get();
        if (now - lastReset > STATS_WINDOW.toMillis()) {
            if (lastWindowReset.compareAndSet(lastReset, now)) {
                recentSuccessCount.set(0);
                recentFailureCount.set(0);
            }
        }
    }

    /** A Health result paired with its expiry instant. */
    static class CachedHealthResult {
        final Health health;
        final Instant expiresAt;

        CachedHealthResult(Health health) {
            this.health = health;
            this.expiresAt = Instant.now().plus(CACHE_DURATION);
        }

        boolean isExpired() {
            return Instant.now().isAfter(expiresAt);
        }
    }
}
向量库健康检查
/**
 * Health check for the vector store.
 *
 * Supports either the generic Spring AI VectorStore abstraction or a native
 * Qdrant client; both dependencies are optional, so the indicator degrades
 * to UNKNOWN when no vector store is configured at all.
 */
@Component("vectorStoreHealth")
public class VectorStoreHealthIndicator implements HealthIndicator {

    // Field injection with required=false is kept on purpose: either dependency
    // may be absent from the context, and the checks below handle null.
    @Autowired(required = false)
    private VectorStore vectorStore; // Spring AI vector store abstraction

    @Autowired(required = false)
    private QdrantClient qdrantClient; // used when talking to Qdrant directly

    @Override
    public Health health() {
        if (vectorStore == null && qdrantClient == null) {
            return Health.unknown()
                    .withDetail("reason", "向量库未配置")
                    .build();
        }
        try {
            // Run a cheap query to verify connectivity and measure latency.
            long startMs = System.currentTimeMillis();
            boolean alive = checkVectorStoreAlive();
            long durationMs = System.currentTimeMillis() - startMs;
            if (alive) {
                return Health.up()
                        .withDetail("responseTimeMs", durationMs)
                        .withDetail("type", getVectorStoreType())
                        .build();
            } else {
                return Health.down()
                        .withDetail("reason", "向量库连接检查失败")
                        .build();
            }
        } catch (Exception e) {
            return Health.down()
                    .withDetail("reason", "向量库连接异常")
                    .withDetail("error", e.getMessage())
                    .build();
        }
    }

    /**
     * Probes whichever client is configured; throws on connection problems.
     * Prefers Qdrant's dedicated health-check RPC when available.
     */
    private boolean checkVectorStoreAlive() throws Exception {
        if (qdrantClient != null) {
            // Qdrant exposes a dedicated health-check call; bound it to 3s.
            qdrantClient.healthCheckAsync().get(3, TimeUnit.SECONDS);
            return true;
        }
        if (vectorStore != null) {
            // Cheap similarity search; the near-1.0 threshold means it returns
            // (almost) nothing — we only care that the call does not throw.
            vectorStore.similaritySearch(
                    SearchRequest.query("health check probe")
                            .withTopK(1)
                            .withSimilarityThreshold(0.99));
            return true; // no exception == healthy
        }
        return false;
    }

    /** Human-readable store type for the health detail payload. */
    private String getVectorStoreType() {
        if (qdrantClient != null) return "qdrant";
        if (vectorStore != null) return vectorStore.getClass().getSimpleName();
        return "unknown";
    }
}
组合健康检查
@Configuration
public class AIHealthConfiguration {
/**
 * Intended to group the AI-related health indicators so they can be viewed
 * together (e.g. under /health/ai).
 *
 * NOTE(review): returning null from a @Bean method does not register a usable
 * bean — Spring stores a NullBean placeholder, and anything that tries to
 * inject it will fail. The two indicators are already registered as
 * @Components by auto-configuration; prefer the application.yml health groups
 * shown below, or return a real registry here instead of this placeholder.
 */
@Bean
public HealthContributorRegistry aiHealthContributorRegistry(
LLMApiHealthIndicator llmApiHealth,
VectorStoreHealthIndicator vectorStoreHealth) {
// Spring auto-configuration has already registered these indicators.
// Compose them here only if extra grouping control is needed.
return null;
}
}application.yml 中配置健康检查组:
management:
  endpoint:
    health:
      # show-details lives under management.endpoint.health (the original
      # placed it under management.health, which Spring Boot does not bind)
      show-details: when-authorized
      group:
        # Core AI group (drives the overall UP/DOWN decision)
        ai-core:
          include: llmApiHealth, vectorStoreHealth
        # Auxiliary AI group (alerting only; does not affect overall status)
        ai-auxiliary:
          include: aiCacheHealth, aiQuotaHealth
  endpoints:
    web:
      exposure:
        include: health, info, metrics, ai-status, ai-quota
代码:自定义 Actuator Endpoint
AI 综合状态端点
import org.springframework.boot.actuate.endpoint.annotation.*;
import org.springframework.stereotype.Component;
import java.time.LocalDateTime;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;
/**
 * Custom Actuator endpoint exposing the AI runtime status.
 * Reachable at /actuator/ai-status once exposed via
 * management.endpoints.web.exposure.include.
 *
 * NOTE(review): recent Spring Boot versions warn about non-alphanumeric
 * endpoint ids; renaming to "aistatus" would silence that but changes the
 * URL — check dashboards/alerts before doing so.
 */
@Component
@Endpoint(id = "ai-status")
public class AIStatusEndpoint {

    private final AIMetricsCollector metricsCollector;
    private final ModelConfigManager modelConfigManager;

    // Constructor injection instead of field injection: dependencies are
    // explicit and final. The previously injected ChatModel was never used
    // anywhere in this class, so that dependency has been dropped.
    public AIStatusEndpoint(AIMetricsCollector metricsCollector,
                            ModelConfigManager modelConfigManager) {
        this.metricsCollector = metricsCollector;
        this.modelConfigManager = modelConfigManager;
    }

    /**
     * GET /actuator/ai-status
     * Returns the full AI runtime status: models, performance, usage, cache.
     */
    @ReadOperation
    public Map<String, Object> getStatus() {
        Map<String, Object> status = new LinkedHashMap<>();
        status.put("timestamp", LocalDateTime.now().toString());
        status.put("models", getModelStatus());
        status.put("performance", getPerformanceMetrics());
        status.put("usage", getUsageStats());
        status.put("cache", getCacheStats());
        return status;
    }

    /**
     * GET /actuator/ai-status/{component}
     * Returns one section of the status (models/performance/usage/cache).
     */
    @ReadOperation
    public Map<String, Object> getComponentStatus(@Selector String component) {
        return switch (component) {
            case "models" -> getModelStatus();
            case "performance" -> getPerformanceMetrics();
            case "usage" -> getUsageStats();
            case "cache" -> getCacheStats();
            default -> Map.of("error", "未知组件: " + component);
        };
    }

    /** Active model configuration plus per-model availability. */
    private Map<String, Object> getModelStatus() {
        Map<String, Object> models = new LinkedHashMap<>();
        models.put("primary", modelConfigManager.getPrimaryModel());
        models.put("fallback", modelConfigManager.getFallbackModel());
        models.put("embeddingModel", modelConfigManager.getEmbeddingModel());
        models.put("availability", modelConfigManager.getModelAvailability());
        return models;
    }

    /** Latency percentiles, throughput and error rate from the collector. */
    private Map<String, Object> getPerformanceMetrics() {
        Map<String, Object> perf = new LinkedHashMap<>();
        AIMetricsCollector.PerformanceSnapshot snapshot =
                metricsCollector.getPerformanceSnapshot();
        perf.put("avgResponseTimeMs", snapshot.getAvgResponseTimeMs());
        perf.put("p95ResponseTimeMs", snapshot.getP95ResponseTimeMs());
        perf.put("p99ResponseTimeMs", snapshot.getP99ResponseTimeMs());
        perf.put("requestsPerMinute", snapshot.getRequestsPerMinute());
        perf.put("errorRatePercent", snapshot.getErrorRatePercent());
        return perf;
    }

    /** Daily and monthly token consumption with estimated cost. */
    private Map<String, Object> getUsageStats() {
        Map<String, Object> usage = new LinkedHashMap<>();
        AIMetricsCollector.UsageSnapshot usageSnapshot =
                metricsCollector.getUsageSnapshot();
        usage.put("todayInputTokens", usageSnapshot.getTodayInputTokens());
        usage.put("todayOutputTokens", usageSnapshot.getTodayOutputTokens());
        usage.put("todayEstimatedCostUsd", usageSnapshot.getTodayEstimatedCostUsd());
        usage.put("monthlyInputTokens", usageSnapshot.getMonthlyInputTokens());
        usage.put("monthlyOutputTokens", usageSnapshot.getMonthlyOutputTokens());
        usage.put("monthlyEstimatedCostUsd", usageSnapshot.getMonthlyEstimatedCostUsd());
        return usage;
    }

    /** Semantic-cache hit statistics and the estimated cost they saved. */
    private Map<String, Object> getCacheStats() {
        Map<String, Object> cache = new LinkedHashMap<>();
        cache.put("semanticCacheHits", metricsCollector.getSemanticCacheHits());
        cache.put("semanticCacheMisses", metricsCollector.getSemanticCacheMisses());
        cache.put("semanticCacheHitRate",
                metricsCollector.getSemanticCacheHitRate() + "%");
        cache.put("estimatedCostSavedByCache",
                metricsCollector.getEstimatedCostSavedByCache());
        return cache;
    }
}
AI 配额监控端点
/**
 * Custom Actuator endpoint exposing AI quota status.
 * Reachable at /actuator/ai-quota.
 */
@Component
@Endpoint(id = "ai-quota")
public class AIQuotaEndpoint {

    // Services whose quotas are reported. NOTE(review): consider externalizing
    // this list to configuration instead of hard-coding it here.
    private static final List<String> SERVICE_IDS = List.of(
            "product-service", "order-service",
            "customer-service", "recommend-service");

    private final QuotaManager quotaManager;

    // Constructor injection. The previously injected RedisTemplate was never
    // used anywhere in this class, so that dependency has been removed.
    public AIQuotaEndpoint(QuotaManager quotaManager) {
        this.quotaManager = quotaManager;
    }

    /**
     * GET /actuator/ai-quota
     * Per-service quota usage, flagging services above 80% consumption.
     */
    @ReadOperation
    public Map<String, Object> getQuotaStatus() {
        Map<String, Object> result = new LinkedHashMap<>();
        Map<String, Map<String, Object>> serviceQuotas = new LinkedHashMap<>();
        for (String serviceId : SERVICE_IDS) {
            Map<String, Object> quotaInfo = new LinkedHashMap<>();
            QuotaStatus status = quotaManager.getQuotaStatus(serviceId);
            quotaInfo.put("used", status.getUsed());
            quotaInfo.put("limit", status.getLimit());
            quotaInfo.put("usedPercent",
                    String.format("%.1f%%", status.getUsedPercent()));
            quotaInfo.put("resetAt", status.getResetAt());
            // Warn (but stay healthy) once usage crosses 80%.
            if (status.getUsedPercent() > 80) {
                quotaInfo.put("warning", "配额使用已超过80%");
            }
            serviceQuotas.put(serviceId, quotaInfo);
        }
        result.put("services", serviceQuotas);
        result.put("timestamp", LocalDateTime.now().toString());
        return result;
    }

    /**
     * Admin operation: resets one service's quota.
     * A @WriteOperation with a @Selector maps to POST /actuator/ai-quota/{serviceId}
     * (the original comment's ".../reset" path segment does not exist).
     * Lock this down via actuator security in production.
     */
    @WriteOperation
    public Map<String, String> resetServiceQuota(@Selector String serviceId) {
        quotaManager.resetQuota(serviceId);
        return Map.of(
                "status", "success",
                "message", "服务 " + serviceId + " 的配额已重置"
        );
    }
}
Micrometer 自定义指标
/**
 * Registers AI-specific Micrometer meters and records per-call metrics.
 *
 * Counters track token consumption and semantic-cache effectiveness, a
 * distribution summary tracks latency percentiles, and two gauges mirror
 * live values (quota remaining, circuit-breaker state) read from the
 * metrics collector.
 */
@Component
public class AIMetricsRegistrar {

    private final MeterRegistry meterRegistry;
    private final AIMetricsCollector metricsCollector;

    private final Counter tokenInputCounter;
    private final Counter tokenOutputCounter;
    private final Counter cacheHitCounter;
    private final Counter cacheMissCounter;
    private final DistributionSummary responseTimeHistogram;

    public AIMetricsRegistrar(MeterRegistry meterRegistry,
                              AIMetricsCollector metricsCollector) {
        this.meterRegistry = meterRegistry;
        this.metricsCollector = metricsCollector;

        // Token counters, tagged with the owning service.
        this.tokenInputCounter = Counter.builder("ai.tokens.input")
                .description("AI调用消耗的输入Token总数")
                .tag("service", "ai-gateway")
                .register(meterRegistry);
        this.tokenOutputCounter = Counter.builder("ai.tokens.output")
                .description("AI调用消耗的输出Token总数")
                .tag("service", "ai-gateway")
                .register(meterRegistry);

        // Semantic-cache hit/miss counters.
        this.cacheHitCounter = Counter.builder("ai.cache.hits")
                .description("语义缓存命中次数")
                .register(meterRegistry);
        this.cacheMissCounter = Counter.builder("ai.cache.misses")
                .description("语义缓存未命中次数")
                .register(meterRegistry);

        // Latency distribution with p50/p95/p99 published.
        this.responseTimeHistogram = DistributionSummary.builder("ai.response.time")
                .description("AI调用响应时间分布(毫秒)")
                .baseUnit("ms")
                .publishPercentiles(0.5, 0.95, 0.99)
                .publishPercentileHistogram()
                .register(meterRegistry);

        registerGauges();
    }

    /** Registers live-value gauges backed by the metrics collector. */
    private void registerGauges() {
        Gauge.builder("ai.quota.remaining.percent",
                        metricsCollector,
                        AIMetricsCollector::getOverallQuotaRemainingPercent)
                .description("AI配额剩余百分比")
                .register(meterRegistry);
        Gauge.builder("ai.circuit.breaker.state",
                        metricsCollector,
                        AIMetricsCollector::getCircuitBreakerStateValue)
                .description("熔断器状态 (0=CLOSED, 1=OPEN, 2=HALF_OPEN)")
                .register(meterRegistry);
    }

    /**
     * Records the metrics of one AI call: token counts, cache outcome,
     * latency (real calls only) and a tagged total-calls counter.
     */
    public void recordAICall(String model, int inputTokens, int outputTokens,
                             long durationMs, boolean success, boolean fromCache) {
        tokenInputCounter.increment(inputTokens);
        tokenOutputCounter.increment(outputTokens);

        (fromCache ? cacheHitCounter : cacheMissCounter).increment();

        // Cache hits are near-instant; only real calls feed the histogram.
        if (!fromCache) {
            responseTimeHistogram.record(durationMs);
        }

        // Tagged counter so dashboards can slice by model / success / cache.
        // Micrometer returns the existing meter when the same id+tags are
        // re-registered, so this lookup is cheap after the first call.
        Counter.builder("ai.calls.total")
                .tag("model", model)
                .tag("success", String.valueOf(success))
                .tag("from_cache", String.valueOf(fromCache))
                .register(meterRegistry)
                .increment();
    }
}
配置细节
management:
  endpoints:
    web:
      exposure:
        include: health, info, metrics, ai-status, ai-quota, prometheus
      base-path: /actuator
  endpoint:
    health:
      show-details: always
      show-components: always
      # Health groups decide which indicators drive each probe's UP/DOWN
      group:
        liveness:
          include: ping
          additional-path: server:/livez   # K8s liveness probe
        readiness:
          include: llmApiHealth, vectorStoreHealth, db, redis
          additional-path: server:/readyz  # K8s readiness probe
  # Prometheus exposition format (Spring Boot 3.x property location)
  prometheus:
    metrics:
      export:
        enabled: true
  # Common tags attached to every metric
  metrics:
    tags:
      application: ${spring.application.name}
      environment: ${spring.profiles.active:local}
总结
AI 应用的 Actuator 扩展核心思路:
健康检查要反映 AI 核心依赖:向量库、LLM API——这些不健康才应该影响整体状态。Redis 连接池稍微满一点,不应该把 AI 服务标记成不健康。
LLM API 健康检查要缓存:每次 /health 调用都去真正探测 LLM API 既贵又慢,用 5 分钟缓存 + 基于最近调用成功率的双重策略更合理。
自定义端点暴露 AI 专属信息:token 消耗、模型路由、配额状态——这些是 AI 应用特有的运行时信息,标准端点看不到。
Micrometer 指标要覆盖 AI 维度:响应时间分布、缓存命中率、token 消耗趋势——这些指标是发现 AI 性能问题的关键线索。
