第2344篇:Java AI服务的限流与熔断——保护下游LLM服务的工程策略
2026/4/30 · 大约 16 分钟
第2344篇:Java AI服务的限流与熔断——保护下游LLM服务的工程策略
适读人群:负责AI服务稳定性的Java工程师,关注LLM API成本控制和故障隔离的开发者 | 阅读时长:约16分钟 | 核心价值:建立完整的限流熔断体系,保护LLM API同时保证用户体验
这篇文章和第2333篇(防止LLM调用风暴)有些交集,但侧重点不同。
第2333篇讲的是内部保护:防止自己的代码失控,保护LLM API。
这篇讲的是外部防御:当LLM API本身出问题时(速率限制、超时、宕机),如何让你的服务优雅地撑住,不把故障传导给用户。
两者都需要,但思路不同。
Resilience4j:AI服务的最佳伴侣
Spring Boot 3.x内置了Resilience4j的starter,是目前Java生态中功能最全的弹性库。
<!-- Resilience4j starter for Spring Boot 3: auto-configures the rate limiter,
     bulkhead, time limiter and circuit breaker instances from application.yml -->
<dependency>
    <groupId>io.github.resilience4j</groupId>
    <artifactId>resilience4j-spring-boot3</artifactId>
</dependency>
<!-- Reactor integration module for decorating Mono/Flux pipelines -->
<dependency>
    <groupId>io.github.resilience4j</groupId>
    <artifactId>resilience4j-reactor</artifactId>
</dependency>

四层防护体系
配置
# application.yml: full Resilience4j configuration
resilience4j:
  # Rate limiter: caps the request rate sent to the LLM API
  ratelimiter:
    instances:
      llm-api:
        limit-for-period: 10        # 10 requests allowed per refresh period
        limit-refresh-period: 1s    # refresh period of 1 second
        timeout-duration: 3s        # wait at most 3s for a permit
  # Bulkhead: limits concurrency (isolates the blast radius of a failure)
  bulkhead:
    instances:
      llm-api:
        max-concurrent-calls: 20    # at most 20 concurrent calls
        max-wait-duration: 5s       # wait at most 5s to enter the bulkhead
  # Timeout control
  timelimiter:
    instances:
      llm-api:
        timeout-duration: 30s        # wait at most 30s for an LLM call
        cancel-running-future: true  # cancel the running Future on timeout
  # Circuit breaker
  circuitbreaker:
    instances:
      llm-api:
        sliding-window-type: COUNT_BASED
        sliding-window-size: 20            # statistics over the last 20 calls
        minimum-number-of-calls: 10        # at least 10 calls before evaluating
        failure-rate-threshold: 50         # open the circuit at 50% failures
        slow-call-rate-threshold: 80       # 80% slow calls also opens the circuit
        slow-call-duration-threshold: 15s  # calls longer than 15s count as slow
        wait-duration-in-open-state: 30s   # stay open for 30s before probing
        permitted-number-of-calls-in-half-open-state: 3
        automatic-transition-from-open-to-half-open-enabled: true
        # Exceptions that count as failures
        record-exceptions:
          - java.io.IOException
          - java.util.concurrent.TimeoutException
          - com.example.ai.exception.LlmServiceException
        # Exceptions that do NOT count as failures (business exceptions)
        ignore-exceptions:
          - com.example.ai.exception.ContentFilteredException
          - com.example.ai.exception.InvalidInputException

注解方式使用
@Service
@RequiredArgsConstructor
@Slf4j
public class ProtectedLlmService {

    private final ChatClient chatClient;

    // Dedicated pool for blocking LLM I/O. The original supplyAsync used the
    // default ForkJoinPool.commonPool(), which must never be blocked on long
    // network calls — slow LLM responses would starve every other commonPool
    // user in the JVM. Sized to match the bulkhead's max-concurrent-calls (20).
    private final ExecutorService llmExecutor = Executors.newFixedThreadPool(20);

    /**
     * Chat call guarded by four protection layers (rate limiter, bulkhead,
     * time limiter, circuit breaker), each degrading to a friendly message
     * instead of propagating the failure to the caller.
     */
    @RateLimiter(name = "llm-api", fallbackMethod = "rateLimitFallback")
    @Bulkhead(name = "llm-api", fallbackMethod = "bulkheadFallback")
    @TimeLimiter(name = "llm-api", fallbackMethod = "timeoutFallback")
    @CircuitBreaker(name = "llm-api", fallbackMethod = "circuitBreakerFallback")
    public CompletableFuture<String> chat(String userId, String message) {
        return CompletableFuture.supplyAsync(() ->
                chatClient.prompt()
                        .user(message)
                        .call()
                        .content(),
                llmExecutor);
    }

    // Rate-limit degradation
    public CompletableFuture<String> rateLimitFallback(String userId, String message,
            RequestNotPermitted e) {
        log.warn("限流触发:userId={}", userId);
        return CompletableFuture.completedFuture("请求过于频繁,请稍后再试(约1秒后)");
    }

    // Bulkhead-full degradation
    public CompletableFuture<String> bulkheadFallback(String userId, String message,
            BulkheadFullException e) {
        log.warn("舱壁满:userId={}, 当前并发已达上限", userId);
        return CompletableFuture.completedFuture("服务繁忙,请等待约5秒后重试");
    }

    // Timeout degradation. Bug fix: the original declared @TimeLimiter without
    // a fallbackMethod while every other layer had one, so a TimeoutException
    // propagated raw to the caller instead of degrading gracefully.
    public CompletableFuture<String> timeoutFallback(String userId, String message,
            TimeoutException e) {
        log.warn("超时触发:userId={}", userId);
        return CompletableFuture.completedFuture("AI响应超时,请稍后重试");
    }

    // Circuit-breaker degradation
    public CompletableFuture<String> circuitBreakerFallback(String userId, String message,
            CallNotPermittedException e) {
        log.warn("熔断触发:userId={}, 熔断器状态={}", userId, e.getMessage());
        return CompletableFuture.completedFuture(
                "AI服务暂时不可用,我们正在处理中。如需紧急帮助,请联系客服。");
    }
}
编程式使用:更灵活的控制
有时候注解方式不够灵活,编程式用法可以根据运行时状态动态调整:
@Service
@Slf4j
public class AdaptiveLlmService {

    private final CircuitBreaker circuitBreaker;
    private final RateLimiter rateLimiter;
    private final Bulkhead bulkhead;
    private final ChatClient chatClient;
    private final ChatClient fallbackClient; // backup client used while the breaker is open

    public AdaptiveLlmService(
            CircuitBreakerRegistry cbRegistry,
            RateLimiterRegistry rlRegistry,
            BulkheadRegistry bhRegistry,
            ChatClient.Builder builder) {
        this.circuitBreaker = cbRegistry.circuitBreaker("llm-api");
        this.rateLimiter = rlRegistry.rateLimiter("llm-api");
        this.bulkhead = bhRegistry.bulkhead("llm-api");
        // Primary client (high quality)
        this.chatClient = builder.build();
        // Backup client with a terser system prompt.
        // NOTE(review): this reuses the same builder, so it targets the same
        // underlying model — only the system prompt differs. If a genuinely
        // cheaper/more stable model is intended, configure a separate builder.
        this.fallbackClient = builder
                .defaultSystem("你是简洁的助手,用简短的语言回答问题")
                .build();
    }

    /**
     * Sends a chat message through the protection chain. While the circuit
     * breaker is OPEN, the call is routed to the backup client and the breaker
     * decorator is skipped so the backup model is actually reachable.
     */
    public String chat(String userId, String message) {
        // Pick the client based on the breaker state.
        boolean breakerOpen = circuitBreaker.getState() == CircuitBreaker.State.OPEN;
        ChatClient activeClient = breakerOpen ? fallbackClient : chatClient;

        // Compose the protections. Bug fix: the original decorated the backup
        // path with the same OPEN breaker, which throws
        // CallNotPermittedException immediately — so the backup client could
        // never run. The breaker is now applied only on the primary path.
        Decorators.DecorateSupplier<String> decorated = Decorators.ofSupplier(
                () -> activeClient.prompt().user(message).call().content());
        if (!breakerOpen) {
            decorated = decorated.withCircuitBreaker(circuitBreaker);
        }
        Supplier<String> protectedCall = decorated
                .withBulkhead(bulkhead)
                .withRateLimiter(rateLimiter)
                .withFallback(
                        List.of(CallNotPermittedException.class,
                                RequestNotPermitted.class,
                                BulkheadFullException.class),
                        ex -> handleFallback(userId, message, ex)
                )
                .decorate();
        return protectedCall.get();
    }

    // Maps each protection exception to a user-facing degradation message.
    private String handleFallback(String userId, String message, Throwable ex) {
        log.warn("AI服务降级:userId={}, reason={}", userId, ex.getClass().getSimpleName());
        if (ex instanceof CallNotPermittedException) {
            return "AI服务正在恢复中,请在30秒后重试";
        } else if (ex instanceof RequestNotPermitted) {
            return "当前请求量较大,请稍候片刻";
        } else {
            return "AI服务暂时繁忙,请稍后重试";
        }
    }

    // Exposes the live state of each protection component for monitoring.
    public Map<String, Object> getProtectionStatus() {
        return Map.of(
                "circuitBreaker", Map.of(
                        "state", circuitBreaker.getState().name(),
                        "failureRate", circuitBreaker.getMetrics().getFailureRate(),
                        "slowCallRate", circuitBreaker.getMetrics().getSlowCallRate()
                ),
                "rateLimiter", Map.of(
                        "availablePermissions", rateLimiter.getMetrics().getAvailablePermissions(),
                        "numberOfWaitingThreads", rateLimiter.getMetrics().getNumberOfWaitingThreads()
                ),
                "bulkhead", Map.of(
                        "availableConcurrentCalls", bulkhead.getMetrics().getAvailableConcurrentCalls(),
                        "maxAllowedConcurrentCalls", bulkhead.getMetrics().getMaxAllowedConcurrentCalls()
                )
        );
    }
}
用户分级的差异化保护
不同用户等级享受不同的保护策略:
@Service
@RequiredArgsConstructor
public class TieredProtectionService {

    // NOTE(review): this final map is injected via @RequiredArgsConstructor and
    // then mutated in initRateLimiters() — confirm a mutable (ideally
    // concurrent) Map bean is actually provided; an inline
    // new ConcurrentHashMap<>() field would be safer.
    private final Map<String, RateLimiter> tierRateLimiters;
    private final CircuitBreaker circuitBreaker;

    /** Registers one rate limiter per subscription tier at startup. */
    @PostConstruct
    public void initRateLimiters() {
        RateLimiterRegistry registry = RateLimiterRegistry.ofDefaults();
        // Free tier: 1 request per second
        tierRateLimiters.put("free", registry.rateLimiter("free-tier",
                RateLimiterConfig.custom()
                        .limitForPeriod(1)
                        .limitRefreshPeriod(Duration.ofSeconds(1))
                        .build()));
        // Pro tier: 5 requests per second
        tierRateLimiters.put("pro", registry.rateLimiter("pro-tier",
                RateLimiterConfig.custom()
                        .limitForPeriod(5)
                        .limitRefreshPeriod(Duration.ofSeconds(1))
                        .build()));
        // Enterprise tier: 20 requests per second
        tierRateLimiters.put("enterprise", registry.rateLimiter("enterprise-tier",
                RateLimiterConfig.custom()
                        .limitForPeriod(20)
                        .limitRefreshPeriod(Duration.ofSeconds(1))
                        .build()));
    }

    /**
     * Runs a chat call behind the caller's per-tier rate limiter and the
     * shared global circuit breaker. Unknown tiers fall back to "free".
     */
    public String chat(String userId, String userTier, String message) {
        RateLimiter userRateLimiter = tierRateLimiters.getOrDefault(
                userTier, tierRateLimiters.get("free"));
        // Per-user rate limit first, then the global circuit breaker. Bug fix:
        // with Resilience4j Decorators the LAST decorator applied is the
        // OUTERMOST one and executes first, so the rate limiter must be applied
        // AFTER the circuit breaker — the original order ran the breaker check
        // before the user limit, contradicting the stated intent.
        Supplier<String> call = Decorators.ofSupplier(
                () -> callLlm(message))
                .withCircuitBreaker(circuitBreaker)  // global (inner)
                .withRateLimiter(userRateLimiter)    // per-user (outer, runs first)
                .decorate();
        return call.get();
    }
}
限流和熔断不只是技术手段,更是一种承诺:对用户承诺系统会优雅降级而不是直接崩溃,对LLM API提供商承诺不会发送超出配额的请求。
