用 Spring Boot Actuator 监控 AI 应用——自定义健康检查和指标
2026/4/30 · 大约 9 分钟
用 Spring Boot Actuator 监控 AI 应用——自定义健康检查和指标
凌晨两点,我的手机响了。告警说我们的 AI 服务"不健康"。
我打开看,Spring Boot Actuator 的 /health 接口返回了 DOWN。但进去一查,是 Redis 连接池稍微有点满,触发了默认的健康检查阈值。实际上 AI 服务的所有核心功能完全正常——向量库在线,LLM API 可用,对话功能没有任何问题。
这次误告警让我开始重新审视 AI 应用的监控设计。
问题的根本是:Spring Boot Actuator 的默认健康检查是为通用 Web 应用设计的,它不了解 AI 应用的特殊性:
- 向量库(Qdrant、Pinecone、Chroma)的可用性
- LLM API 的可用性(不只是"能不能 ping 通",而是"能不能真的调用")
- 模型加载状态(如果用了本地模型)
- 语义缓存的命中率(影响性能但不影响可用性)
- Token 配额剩余量(快用完时应该告警但不应该标记为不健康)
今天这篇文章,把 AI 应用专属的 Actuator 扩展方案系统梳理一遍。
Actuator 的扩展点
Spring Boot Actuator 有几个核心扩展点:
- HealthIndicator:自定义健康检查组件,影响 /health 接口的状态
- HealthContributor:组合多个 HealthIndicator
- @Endpoint:自定义 Actuator 端点,暴露任意数据
- MeterRegistry(Micrometer):注册自定义指标,推送到 Prometheus/Grafana
对于 AI 应用,我的建议是:
- 用 HealthIndicator 做核心依赖的健康检查(向量库、LLM API)
- 用 @Endpoint 暴露 AI 专属的运行时信息(当前模型、配额状态、缓存统计)
- 用 MeterRegistry 记录业务指标(token 消耗、延迟分布、错误率)
代码:AI 健康检查实现
LLM API 健康检查
import java.time.Duration;
import java.time.Instant;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.atomic.AtomicReference;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.springframework.ai.anthropic.AnthropicChatOptions;
import org.springframework.ai.chat.model.ChatModel;
import org.springframework.ai.chat.model.ChatResponse;
import org.springframework.ai.chat.prompt.Prompt;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.actuate.health.Health;
import org.springframework.boot.actuate.health.HealthIndicator;
import org.springframework.stereotype.Component;
/**
 * Health check for the LLM API.
 *
 * Note: this indicator does NOT hit the LLM on every /health request
 * (that would be expensive). Strategy:
 * 1. judge by the success rate of recent real calls (sliding window)
 * 2. otherwise send a tiny probe request with a hard maxTokens cap
 * 3. cache the result for a while so repeated /health calls are free
 */
@Component("llmApiHealth")
public class LLMApiHealthIndicator implements HealthIndicator {

    private static final Logger log = LoggerFactory.getLogger(LLMApiHealthIndicator.class);

    /** How long a check result is reused before probing again. */
    private static final Duration CACHE_DURATION = Duration.ofMinutes(5);

    /** Length of the sliding window used for real-call statistics. */
    private static final Duration STATS_WINDOW = Duration.ofMinutes(15);

    /** Chat client used for probe calls (constructor injection instead of field injection). */
    private final ChatModel chatModel;

    // Cached result of the last health check, so /health does not call the API each time.
    private final AtomicReference<CachedHealthResult> cachedResult = new AtomicReference<>();

    // Sliding-window counters of recent real calls, fed by recordSuccess()/recordFailure().
    private final AtomicLong recentSuccessCount = new AtomicLong(0);
    private final AtomicLong recentFailureCount = new AtomicLong(0);
    private final AtomicLong lastWindowReset = new AtomicLong(System.currentTimeMillis());

    public LLMApiHealthIndicator(ChatModel chatModel) {
        this.chatModel = chatModel;
    }

    @Override
    public Health health() {
        // Fast path: serve the cached result while it is still fresh.
        CachedHealthResult cached = cachedResult.get();
        if (cached != null && !cached.isExpired()) {
            return cached.health;
        }
        // Slow path under a lock. Without it, several concurrent /health
        // requests that all see an expired cache would each fire a paid probe.
        synchronized (this) {
            cached = cachedResult.get();
            if (cached != null && !cached.isExpired()) {
                return cached.health;
            }
            Health result = checkLLMHealth();
            cachedResult.set(new CachedHealthResult(result));
            return result;
        }
    }

    /**
     * Runs the actual check: first the recent-call failure rate, then a probe.
     */
    private Health checkLLMHealth() {
        // Strategy 1: failure rate of recent real calls.
        resetWindowIfNeeded();
        long success = recentSuccessCount.get();
        long failure = recentFailureCount.get();
        long total = success + failure;
        if (total >= 10) {
            // Enough samples to judge by the observed failure rate.
            double failureRate = (double) failure / total * 100;
            if (failureRate > 50) {
                return Health.down()
                        .withDetail("reason", "最近调用失败率过高")
                        .withDetail("failureRate", String.format("%.1f%%", failureRate))
                        .withDetail("totalCalls", total)
                        .build();
            }
        }
        // Strategy 2: cheapest possible probe request.
        try {
            long startMs = System.currentTimeMillis();
            // The probe still consumes tokens, so callProbe() caps maxTokens hard.
            callProbe();
            long durationMs = System.currentTimeMillis() - startMs;
            return Health.up()
                    .withDetail("responseTimeMs", durationMs)
                    .withDetail("recentSuccessCount", success)
                    .withDetail("recentFailureCount", failure)
                    .withDetail("probeStatus", "ok")
                    .build();
        } catch (Exception e) {
            log.warn("LLM API 健康探测失败: {}", e.getMessage());
            return Health.down()
                    .withDetail("reason", "探测请求失败")
                    .withDetail("error", e.getMessage())
                    .withDetail("recentSuccessCount", success)
                    .withDetail("recentFailureCount", failure)
                    .build();
        }
    }

    /**
     * Sends a minimal chat request; throws if the provider is unreachable.
     */
    private String callProbe() {
        AnthropicChatOptions probeOptions = AnthropicChatOptions.builder()
                .withMaxTokens(5) // 5 tokens are plenty for a liveness probe; keeps cost negligible
                .build();
        Prompt probePrompt = new Prompt("Hi", probeOptions);
        ChatResponse response = chatModel.call(probePrompt);
        return response.getResult().getOutput().getContent();
    }

    /**
     * Called from the real request path: records a successful LLM call.
     */
    public void recordSuccess() {
        resetWindowIfNeeded();
        recentSuccessCount.incrementAndGet();
    }

    /**
     * Called from the real request path: records a failed LLM call.
     */
    public void recordFailure() {
        resetWindowIfNeeded();
        recentFailureCount.incrementAndGet();
    }

    /**
     * Resets the sliding-window counters once STATS_WINDOW has elapsed.
     * compareAndSet ensures only one concurrent caller performs the reset.
     */
    private void resetWindowIfNeeded() {
        long now = System.currentTimeMillis();
        long lastReset = lastWindowReset.get();
        if (now - lastReset > STATS_WINDOW.toMillis()) {
            if (lastWindowReset.compareAndSet(lastReset, now)) {
                recentSuccessCount.set(0);
                recentFailureCount.set(0);
            }
        }
    }

    /** A Health result paired with its expiry instant. */
    static class CachedHealthResult {
        final Health health;
        final Instant expiresAt;

        CachedHealthResult(Health health) {
            this.health = health;
            this.expiresAt = Instant.now().plus(CACHE_DURATION);
        }

        boolean isExpired() {
            return Instant.now().isAfter(expiresAt);
        }
    }
}
向量库健康检查
/**
 * Health check for the vector store.
 *
 * Supports either the generic Spring AI VectorStore abstraction or a native
 * Qdrant client; both dependencies are optional, so the indicator degrades
 * to UNKNOWN when no vector store is configured at all.
 */
@Component("vectorStoreHealth")
public class VectorStoreHealthIndicator implements HealthIndicator {

    // Field injection with required=false is kept on purpose: either dependency
    // may be absent from the context, and the checks below handle null.
    @Autowired(required = false)
    private VectorStore vectorStore; // Spring AI vector store abstraction

    @Autowired(required = false)
    private QdrantClient qdrantClient; // used when talking to Qdrant directly

    @Override
    public Health health() {
        if (vectorStore == null && qdrantClient == null) {
            return Health.unknown()
                    .withDetail("reason", "向量库未配置")
                    .build();
        }
        try {
            // Run a cheap query to verify connectivity and measure latency.
            long startMs = System.currentTimeMillis();
            boolean alive = checkVectorStoreAlive();
            long durationMs = System.currentTimeMillis() - startMs;
            if (alive) {
                return Health.up()
                        .withDetail("responseTimeMs", durationMs)
                        .withDetail("type", getVectorStoreType())
                        .build();
            } else {
                return Health.down()
                        .withDetail("reason", "向量库连接检查失败")
                        .build();
            }
        } catch (Exception e) {
            return Health.down()
                    .withDetail("reason", "向量库连接异常")
                    .withDetail("error", e.getMessage())
                    .build();
        }
    }

    /**
     * Probes whichever client is configured; throws on connection problems.
     * Prefers Qdrant's dedicated health-check RPC when available.
     */
    private boolean checkVectorStoreAlive() throws Exception {
        if (qdrantClient != null) {
            // Qdrant exposes a dedicated health-check call; bound it to 3s.
            qdrantClient.healthCheckAsync().get(3, TimeUnit.SECONDS);
            return true;
        }
        if (vectorStore != null) {
            // Cheap similarity search; the near-1.0 threshold means it returns
            // (almost) nothing — we only care that the call does not throw.
            vectorStore.similaritySearch(
                    SearchRequest.query("health check probe")
                            .withTopK(1)
                            .withSimilarityThreshold(0.99));
            return true; // no exception == healthy
        }
        return false;
    }

    /** Human-readable store type for the health detail payload. */
    private String getVectorStoreType() {
        if (qdrantClient != null) return "qdrant";
        if (vectorStore != null) return vectorStore.getClass().getSimpleName();
        return "unknown";
    }
}
组合健康检查
@Configuration
public class AIHealthConfiguration {
/**
 * Intended to group the AI-related health indicators so they can be viewed
 * together (e.g. under /health/ai).
 *
 * NOTE(review): returning null from a @Bean method does not register a usable
 * bean — Spring stores a NullBean placeholder, and anything that tries to
 * inject it will fail. The two indicators are already registered as
 * @Components by auto-configuration; prefer the application.yml health groups
 * shown below, or return a real registry here instead of this placeholder.
 */
@Bean
public HealthContributorRegistry aiHealthContributorRegistry(
LLMApiHealthIndicator llmApiHealth,
VectorStoreHealthIndicator vectorStoreHealth) {
// Spring auto-configuration has already registered these indicators.
// Compose them here only if extra grouping control is needed.
return null;
}
}application.yml 中配置健康检查组:
management:
  endpoint:
    health:
      # show-details lives under management.endpoint.health (the original
      # placed it under management.health, which Spring Boot does not bind)
      show-details: when-authorized
      group:
        # Core AI group (drives the overall UP/DOWN decision)
        ai-core:
          include: llmApiHealth, vectorStoreHealth
        # Auxiliary AI group (alerting only; does not affect overall status)
        ai-auxiliary:
          include: aiCacheHealth, aiQuotaHealth
  endpoints:
    web:
      exposure:
        include: health, info, metrics, ai-status, ai-quota
代码:自定义 Actuator Endpoint
AI 综合状态端点
import org.springframework.boot.actuate.endpoint.annotation.*;
import org.springframework.stereotype.Component;
import java.time.LocalDateTime;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;
/**
 * Custom Actuator endpoint exposing the AI runtime status.
 * Reachable at /actuator/ai-status once exposed via
 * management.endpoints.web.exposure.include.
 *
 * NOTE(review): recent Spring Boot versions warn about non-alphanumeric
 * endpoint ids; renaming to "aistatus" would silence that but changes the
 * URL — check dashboards/alerts before doing so.
 */
@Component
@Endpoint(id = "ai-status")
public class AIStatusEndpoint {

    private final AIMetricsCollector metricsCollector;
    private final ModelConfigManager modelConfigManager;

    // Constructor injection instead of field injection: dependencies are
    // explicit and final. The previously injected ChatModel was never used
    // anywhere in this class, so that dependency has been dropped.
    public AIStatusEndpoint(AIMetricsCollector metricsCollector,
                            ModelConfigManager modelConfigManager) {
        this.metricsCollector = metricsCollector;
        this.modelConfigManager = modelConfigManager;
    }

    /**
     * GET /actuator/ai-status
     * Returns the full AI runtime status: models, performance, usage, cache.
     */
    @ReadOperation
    public Map<String, Object> getStatus() {
        Map<String, Object> status = new LinkedHashMap<>();
        status.put("timestamp", LocalDateTime.now().toString());
        status.put("models", getModelStatus());
        status.put("performance", getPerformanceMetrics());
        status.put("usage", getUsageStats());
        status.put("cache", getCacheStats());
        return status;
    }

    /**
     * GET /actuator/ai-status/{component}
     * Returns one section of the status (models/performance/usage/cache).
     */
    @ReadOperation
    public Map<String, Object> getComponentStatus(@Selector String component) {
        return switch (component) {
            case "models" -> getModelStatus();
            case "performance" -> getPerformanceMetrics();
            case "usage" -> getUsageStats();
            case "cache" -> getCacheStats();
            default -> Map.of("error", "未知组件: " + component);
        };
    }

    /** Active model configuration plus per-model availability. */
    private Map<String, Object> getModelStatus() {
        Map<String, Object> models = new LinkedHashMap<>();
        models.put("primary", modelConfigManager.getPrimaryModel());
        models.put("fallback", modelConfigManager.getFallbackModel());
        models.put("embeddingModel", modelConfigManager.getEmbeddingModel());
        models.put("availability", modelConfigManager.getModelAvailability());
        return models;
    }

    /** Latency percentiles, throughput and error rate from the collector. */
    private Map<String, Object> getPerformanceMetrics() {
        Map<String, Object> perf = new LinkedHashMap<>();
        AIMetricsCollector.PerformanceSnapshot snapshot =
                metricsCollector.getPerformanceSnapshot();
        perf.put("avgResponseTimeMs", snapshot.getAvgResponseTimeMs());
        perf.put("p95ResponseTimeMs", snapshot.getP95ResponseTimeMs());
        perf.put("p99ResponseTimeMs", snapshot.getP99ResponseTimeMs());
        perf.put("requestsPerMinute", snapshot.getRequestsPerMinute());
        perf.put("errorRatePercent", snapshot.getErrorRatePercent());
        return perf;
    }

    /** Daily and monthly token consumption with estimated cost. */
    private Map<String, Object> getUsageStats() {
        Map<String, Object> usage = new LinkedHashMap<>();
        AIMetricsCollector.UsageSnapshot usageSnapshot =
                metricsCollector.getUsageSnapshot();
        usage.put("todayInputTokens", usageSnapshot.getTodayInputTokens());
        usage.put("todayOutputTokens", usageSnapshot.getTodayOutputTokens());
        usage.put("todayEstimatedCostUsd", usageSnapshot.getTodayEstimatedCostUsd());
        usage.put("monthlyInputTokens", usageSnapshot.getMonthlyInputTokens());
        usage.put("monthlyOutputTokens", usageSnapshot.getMonthlyOutputTokens());
        usage.put("monthlyEstimatedCostUsd", usageSnapshot.getMonthlyEstimatedCostUsd());
        return usage;
    }

    /** Semantic-cache hit statistics and the estimated cost they saved. */
    private Map<String, Object> getCacheStats() {
        Map<String, Object> cache = new LinkedHashMap<>();
        cache.put("semanticCacheHits", metricsCollector.getSemanticCacheHits());
        cache.put("semanticCacheMisses", metricsCollector.getSemanticCacheMisses());
        cache.put("semanticCacheHitRate",
                metricsCollector.getSemanticCacheHitRate() + "%");
        cache.put("estimatedCostSavedByCache",
                metricsCollector.getEstimatedCostSavedByCache());
        return cache;
    }
}
AI 配额监控端点
/**
 * Custom Actuator endpoint exposing AI quota status.
 * Reachable at /actuator/ai-quota.
 */
@Component
@Endpoint(id = "ai-quota")
public class AIQuotaEndpoint {

    // Services whose quotas are reported. NOTE(review): consider externalizing
    // this list to configuration instead of hard-coding it here.
    private static final List<String> SERVICE_IDS = List.of(
            "product-service", "order-service",
            "customer-service", "recommend-service");

    private final QuotaManager quotaManager;

    // Constructor injection. The previously injected RedisTemplate was never
    // used anywhere in this class, so that dependency has been removed.
    public AIQuotaEndpoint(QuotaManager quotaManager) {
        this.quotaManager = quotaManager;
    }

    /**
     * GET /actuator/ai-quota
     * Per-service quota usage, flagging services above 80% consumption.
     */
    @ReadOperation
    public Map<String, Object> getQuotaStatus() {
        Map<String, Object> result = new LinkedHashMap<>();
        Map<String, Map<String, Object>> serviceQuotas = new LinkedHashMap<>();
        for (String serviceId : SERVICE_IDS) {
            Map<String, Object> quotaInfo = new LinkedHashMap<>();
            QuotaStatus status = quotaManager.getQuotaStatus(serviceId);
            quotaInfo.put("used", status.getUsed());
            quotaInfo.put("limit", status.getLimit());
            quotaInfo.put("usedPercent",
                    String.format("%.1f%%", status.getUsedPercent()));
            quotaInfo.put("resetAt", status.getResetAt());
            // Warn (but stay healthy) once usage crosses 80%.
            if (status.getUsedPercent() > 80) {
                quotaInfo.put("warning", "配额使用已超过80%");
            }
            serviceQuotas.put(serviceId, quotaInfo);
        }
        result.put("services", serviceQuotas);
        result.put("timestamp", LocalDateTime.now().toString());
        return result;
    }

    /**
     * Admin operation: resets one service's quota.
     * A @WriteOperation with a @Selector maps to POST /actuator/ai-quota/{serviceId}
     * (the original comment's ".../reset" path segment does not exist).
     * Lock this down via actuator security in production.
     */
    @WriteOperation
    public Map<String, String> resetServiceQuota(@Selector String serviceId) {
        quotaManager.resetQuota(serviceId);
        return Map.of(
                "status", "success",
                "message", "服务 " + serviceId + " 的配额已重置"
        );
    }
}
Micrometer 自定义指标
/**
 * Registers AI-specific Micrometer meters and records per-call metrics.
 *
 * Counters track token consumption and semantic-cache effectiveness, a
 * distribution summary tracks latency percentiles, and two gauges mirror
 * live values (quota remaining, circuit-breaker state) read from the
 * metrics collector.
 */
@Component
public class AIMetricsRegistrar {

    private final MeterRegistry meterRegistry;
    private final AIMetricsCollector metricsCollector;

    private final Counter tokenInputCounter;
    private final Counter tokenOutputCounter;
    private final Counter cacheHitCounter;
    private final Counter cacheMissCounter;
    private final DistributionSummary responseTimeHistogram;

    public AIMetricsRegistrar(MeterRegistry meterRegistry,
                              AIMetricsCollector metricsCollector) {
        this.meterRegistry = meterRegistry;
        this.metricsCollector = metricsCollector;

        // Token counters, tagged with the owning service.
        this.tokenInputCounter = Counter.builder("ai.tokens.input")
                .description("AI调用消耗的输入Token总数")
                .tag("service", "ai-gateway")
                .register(meterRegistry);
        this.tokenOutputCounter = Counter.builder("ai.tokens.output")
                .description("AI调用消耗的输出Token总数")
                .tag("service", "ai-gateway")
                .register(meterRegistry);

        // Semantic-cache hit/miss counters.
        this.cacheHitCounter = Counter.builder("ai.cache.hits")
                .description("语义缓存命中次数")
                .register(meterRegistry);
        this.cacheMissCounter = Counter.builder("ai.cache.misses")
                .description("语义缓存未命中次数")
                .register(meterRegistry);

        // Latency distribution with p50/p95/p99 published.
        this.responseTimeHistogram = DistributionSummary.builder("ai.response.time")
                .description("AI调用响应时间分布(毫秒)")
                .baseUnit("ms")
                .publishPercentiles(0.5, 0.95, 0.99)
                .publishPercentileHistogram()
                .register(meterRegistry);

        registerGauges();
    }

    /** Registers live-value gauges backed by the metrics collector. */
    private void registerGauges() {
        Gauge.builder("ai.quota.remaining.percent",
                        metricsCollector,
                        AIMetricsCollector::getOverallQuotaRemainingPercent)
                .description("AI配额剩余百分比")
                .register(meterRegistry);
        Gauge.builder("ai.circuit.breaker.state",
                        metricsCollector,
                        AIMetricsCollector::getCircuitBreakerStateValue)
                .description("熔断器状态 (0=CLOSED, 1=OPEN, 2=HALF_OPEN)")
                .register(meterRegistry);
    }

    /**
     * Records the metrics of one AI call: token counts, cache outcome,
     * latency (real calls only) and a tagged total-calls counter.
     */
    public void recordAICall(String model, int inputTokens, int outputTokens,
                             long durationMs, boolean success, boolean fromCache) {
        tokenInputCounter.increment(inputTokens);
        tokenOutputCounter.increment(outputTokens);

        (fromCache ? cacheHitCounter : cacheMissCounter).increment();

        // Cache hits are near-instant; only real calls feed the histogram.
        if (!fromCache) {
            responseTimeHistogram.record(durationMs);
        }

        // Tagged counter so dashboards can slice by model / success / cache.
        // Micrometer returns the existing meter when the same id+tags are
        // re-registered, so this lookup is cheap after the first call.
        Counter.builder("ai.calls.total")
                .tag("model", model)
                .tag("success", String.valueOf(success))
                .tag("from_cache", String.valueOf(fromCache))
                .register(meterRegistry)
                .increment();
    }
}
配置细节
management:
  endpoints:
    web:
      exposure:
        include: health, info, metrics, ai-status, ai-quota, prometheus
      base-path: /actuator
  endpoint:
    health:
      show-details: always
      show-components: always
      # Health groups decide which indicators drive each probe's UP/DOWN
      group:
        liveness:
          include: ping
          additional-path: server:/livez   # K8s liveness probe
        readiness:
          include: llmApiHealth, vectorStoreHealth, db, redis
          additional-path: server:/readyz  # K8s readiness probe
  # Prometheus exposition format (Spring Boot 3.x property location)
  prometheus:
    metrics:
      export:
        enabled: true
  # Common tags attached to every metric
  metrics:
    tags:
      application: ${spring.application.name}
      environment: ${spring.profiles.active:local}
总结
AI 应用的 Actuator 扩展核心思路:
健康检查要反映 AI 核心依赖:向量库、LLM API——这些不健康才应该影响整体状态。Redis 连接池稍微满一点,不应该把 AI 服务标记成不健康。
LLM API 健康检查要缓存:每次 /health 调用都去真正探测 LLM API 既贵又慢,用 5 分钟缓存 + 基于最近调用成功率的双重策略更合理。
自定义端点暴露 AI 专属信息:token 消耗、模型路由、配额状态——这些是 AI 应用特有的运行时信息,标准端点看不到。
Micrometer 指标要覆盖 AI 维度:响应时间分布、缓存命中率、token 消耗趋势——这些指标是发现 AI 性能问题的关键线索。
