第2168篇:LLM系统的SLA设计——响应时间、可用性和质量的保障体系
2026/4/30大约 6 分钟
第2168篇:LLM系统的SLA设计——响应时间、可用性和质量的保障体系
适读人群:需要为LLM系统制定SLA的技术负责人和架构师 | 阅读时长:约18分钟 | 核心价值:为LLM系统设计合理的SLA,建立可执行的监控和保障机制
产品经理问:"AI助手的SLA是多少?"
"这个……LLM的响应时间不好保证,因为依赖外部API……"
"那可用性呢?"
"如果OpenAI挂了我们也没办法……"
"质量稳定性呢?"
"这个更难说了……"
这场对话说明了一个问题:我们把LLM系统当作实验性产品来对待,而不是当作生产级服务。真正的生产服务需要SLA,LLM系统也不例外。
但LLM的SLA确实比传统服务复杂,因为它涉及三个维度:响应时间、可用性、质量稳定性。
LLM SLA的三个维度
维度1:延迟(Latency)
- P50延迟(中位数):50%的请求在X秒内完成
- P95延迟:95%的请求在X秒内完成
- P99延迟:99%的请求在X秒内完成
- TTFT(Time to First Token):首个token的延迟(流式场景)
维度2:可用性(Availability)
- 服务可用率:每月可正常响应请求的时间比例
- 错误率:4xx/5xx的比例
维度3:质量稳定性(Quality SLA)
- 这是LLM特有的维度,传统服务没有
- 质量通过率:≥X%的请求达到质量标准
- 关键场景覆盖率:高风险场景的特别保障
SLA监控与告警实现
/**
 * LLM SLA monitoring service.
 *
 * <p>Keeps an in-memory sliding window of recent request metrics, publishes per-request
 * metrics to the meter registry, and once a minute evaluates the window against the
 * active SLA configuration, sending a critical alert when any dimension is violated.
 */
@Service
@RequiredArgsConstructor
@Slf4j
public class LlmSlaMonitoringService {
    /** Length of the sliding window that SLA checks are evaluated over. */
    private static final Duration WINDOW = Duration.ofMinutes(5);

    /** Minimum number of scored requests required before the quality pass rate is judged. */
    private static final int MIN_QUALITY_SAMPLES = 10;

    private final MeterRegistry meterRegistry;
    private final AlertService alertService;
    private final SlaConfigRepository slaConfigRepository;

    // Sliding-window data (records older than WINDOW are pruned).
    private final Deque<RequestMetrics> recentRequests = new ConcurrentLinkedDeque<>();

    // Strongly-held gauge state. Gauges registered against a freshly boxed number are only
    // weakly referenced by the registry and are never re-bound on later calls, so each
    // gauge instead reads from one of these long-lived holders.
    private final GaugeValue latencyP50 = new GaugeValue();
    private final GaugeValue latencyP95 = new GaugeValue();
    private final GaugeValue latencyP99 = new GaugeValue();
    private final GaugeValue errorRateGauge = new GaugeValue();
    private final GaugeValue latencyBudgetGauge = new GaugeValue();

    /**
     * Records the metrics of a single request.
     *
     * <p>Intended to be called right after an LLM request completes. The record is added
     * to the sliding window and published to the meter registry.
     *
     * @param record the completed request's measurements
     */
    public void recordRequest(RequestRecord record) {
        RequestMetrics metrics = RequestMetrics.builder()
                .timestamp(Instant.now())
                .latencyMs(record.getLatencyMs())
                .ttftMs(record.getTtftMs())
                .statusCode(record.getStatusCode())
                .qualityScore(record.getQualityScore())
                .qualityPassed(record.isQualityPassed())
                .inputTokens(record.getInputTokens())
                .outputTokens(record.getOutputTokens())
                .build();
        recentRequests.addLast(metrics);
        cleanOldRecords();

        // Publish the raw latency, tagged by outcome.
        Timer.builder("llm.request.latency")
                .tag("status", record.getStatusCode() < 400 ? "success" : "error")
                .register(meterRegistry)
                .record(record.getLatencyMs(), TimeUnit.MILLISECONDS);

        // Quality scores are optional; only scored requests feed the summary.
        if (record.getQualityScore() != null) {
            meterRegistry.summary("llm.request.quality_score")
                    .record(record.getQualityScore());
        }
    }

    /**
     * Evaluates the current window against the active SLA once a minute.
     *
     * <p>The window is pruned first so that records older than {@link #WINDOW} are not
     * evaluated (and re-alerted on) after traffic has stopped.
     */
    @Scheduled(fixedDelay = 60000)
    public void checkSlaStatus() {
        cleanOldRecords();
        SlaConfig sla = slaConfigRepository.getActiveSla();
        List<RequestMetrics> windowData = new ArrayList<>(recentRequests);
        if (windowData.isEmpty()) {
            return;
        }

        SlaViolation violation = checkViolations(windowData, sla);
        if (violation.hasViolation()) {
            alertService.sendCriticalAlert("SLA违规", violation.buildAlertMessage());
        }

        // Refresh the SLA status gauges.
        updateSlaMetrics(windowData, sla);
    }

    /**
     * Checks P95 latency, error rate and quality pass rate against the SLA.
     *
     * @param windowData non-empty snapshot of the sliding window
     * @param sla the SLA thresholds to compare against
     * @return a violation object holding one message per breached threshold
     */
    private SlaViolation checkViolations(List<RequestMetrics> windowData, SlaConfig sla) {
        SlaViolation violation = new SlaViolation();

        // 1. P95 latency
        long p95Latency = percentile(sortedLatencies(windowData), 95);
        if (p95Latency > sla.getP95LatencyMs()) {
            violation.addViolation(String.format(
                    "P95延迟=%dms 超过SLA=%dms", p95Latency, sla.getP95LatencyMs()
            ));
        }

        // 2. Error rate (any status code >= 400 counts as an error)
        double errorRate = errorRate(windowData);
        if (errorRate > sla.getMaxErrorRate()) {
            violation.addViolation(String.format(
                    "错误率=%.1f%% 超过SLA=%.1f%%",
                    errorRate * 100, sla.getMaxErrorRate() * 100
            ));
        }

        // 3. Quality pass rate (only requests that carry a quality score are counted,
        //    and only once there are enough of them to be meaningful)
        List<RequestMetrics> withQuality = windowData.stream()
                .filter(m -> m.getQualityScore() != null)
                .collect(Collectors.toList());
        if (withQuality.size() >= MIN_QUALITY_SAMPLES) {
            long qualityPassed = withQuality.stream()
                    .filter(RequestMetrics::isQualityPassed).count();
            double qualityPassRate = (double) qualityPassed / withQuality.size();
            if (qualityPassRate < sla.getMinQualityPassRate()) {
                violation.addViolation(String.format(
                        "质量通过率=%.1f%% 低于SLA=%.1f%%",
                        qualityPassRate * 100, sla.getMinQualityPassRate() * 100
                ));
            }
        }
        return violation;
    }

    /**
     * Publishes the window's latency percentiles, error rate and remaining latency
     * budget as gauges.
     *
     * @param windowData non-empty snapshot of the sliding window
     * @param sla the SLA thresholds used to compute the latency budget
     */
    private void updateSlaMetrics(List<RequestMetrics> windowData, SlaConfig sla) {
        long[] latencies = sortedLatencies(windowData);
        long p95 = percentile(latencies, 95);

        publishGauge("llm.sla.latency_p50_ms", latencyP50, percentile(latencies, 50));
        publishGauge("llm.sla.latency_p95_ms", latencyP95, p95);
        publishGauge("llm.sla.latency_p99_ms", latencyP99, percentile(latencies, 99));
        publishGauge("llm.sla.error_rate", errorRateGauge, errorRate(windowData));

        // SLA headroom: fraction of the P95 budget still unused (negative when violated).
        double latencyBudget = 1.0 - (double) p95 / sla.getP95LatencyMs();
        publishGauge("llm.sla.latency_budget", latencyBudgetGauge, latencyBudget);
    }

    /**
     * Stores {@code value} in {@code holder} and makes sure a gauge reading that holder is
     * registered under {@code name}. Passing the same holder on every call keeps the
     * registration bound to a strongly-referenced object whose value can change.
     */
    private void publishGauge(String name, GaugeValue holder, double value) {
        holder.set(value);
        meterRegistry.gauge(name, holder, GaugeValue::get);
    }

    /** Returns the window's latencies in ascending order. */
    private static long[] sortedLatencies(List<RequestMetrics> windowData) {
        return windowData.stream()
                .mapToLong(RequestMetrics::getLatencyMs)
                .sorted()
                .toArray();
    }

    /**
     * Nearest-rank percentile: the smallest value such that at least {@code percent}% of
     * the samples are less than or equal to it. Integer arithmetic avoids floating-point
     * rounding on the rank.
     *
     * @param sortedAscending non-empty, ascending-sorted samples
     * @param percent percentile in the range 1..100
     */
    private static long percentile(long[] sortedAscending, int percent) {
        int rank = (sortedAscending.length * percent + 99) / 100; // ceil(n * percent / 100)
        return sortedAscending[Math.max(rank, 1) - 1];
    }

    /** Fraction of requests in the (non-empty) window whose status code is >= 400. */
    private static double errorRate(List<RequestMetrics> windowData) {
        long errorCount = windowData.stream()
                .filter(m -> m.getStatusCode() >= 400).count();
        return (double) errorCount / windowData.size();
    }

    /**
     * Drops records older than {@link #WINDOW} from the head of the deque.
     *
     * <p>The head is read once per iteration and removed by reference, so a concurrent
     * cleanup can neither trigger a null dereference nor cause a newer record to be
     * removed in place of the expired one.
     */
    private void cleanOldRecords() {
        Instant cutoff = Instant.now().minus(WINDOW);
        RequestMetrics head;
        while ((head = recentRequests.peekFirst()) != null
                && head.getTimestamp().isBefore(cutoff)) {
            recentRequests.remove(head);
        }
    }

    /** Mutable, strongly-referenced value that a registered gauge reads from. */
    private static final class GaugeValue {
        private volatile double value;

        void set(double newValue) {
            this.value = newValue;
        }

        double get() {
            return value;
        }
    }
}
可用性保障:熔断与降级
/**
 * Circuit breaker for LLM calls.
 *
 * <p>While closed, calls pass through and their outcomes are counted. Once at least
 * {@code windowSize} calls have been counted and the failure rate reaches
 * {@code failureThreshold}, the breaker opens and callers receive the fallback. After
 * {@code coolDownSeconds} a single probe request is let through: success closes the
 * breaker, failure restarts the cool-down.
 */
@Component
@RequiredArgsConstructor
@Slf4j
public class LlmCircuitBreaker {
    @Value("${llm.circuit-breaker.failure-threshold:0.5}")
    private double failureThreshold;
    @Value("${llm.circuit-breaker.window-size:20}")
    private int windowSize;
    @Value("${llm.circuit-breaker.cool-down-seconds:60}")
    private int coolDownSeconds;

    private final AtomicInteger failureCount = new AtomicInteger(0);
    private final AtomicInteger totalCount = new AtomicInteger(0);
    private final AtomicBoolean isOpen = new AtomicBoolean(false);
    // Guards the half-open state so that only one probe is in flight at a time.
    private final AtomicBoolean probeInFlight = new AtomicBoolean(false);
    // Time the breaker was opened (or last failed a probe); null while closed.
    private volatile Instant openedAt;

    /**
     * Executes an LLM call under circuit-breaker protection.
     *
     * @param llmCall the protected call
     * @param fallback supplies the degraded answer used while the breaker is open
     * @return the call's result, or the fallback's result when the breaker is open
     *     (including when this call's failure is what opened it)
     * @throws RuntimeException whatever {@code llmCall} threw, when the breaker is still
     *     closed after recording the failure
     */
    public String callWithCircuitBreaker(Supplier<String> llmCall, Supplier<String> fallback) {
        boolean probing = false;
        if (isOpen.get()) {
            // Half-open: after the cool-down, exactly one caller may probe the backend.
            if (shouldAttemptReset() && probeInFlight.compareAndSet(false, true)) {
                probing = true;
                log.info("熔断器半开,尝试探测请求");
            } else {
                log.warn("熔断器开启,使用降级回答");
                return fallback.get();
            }
        }
        try {
            String result = llmCall.get();
            recordSuccess();
            return result;
        } catch (Exception e) {
            recordFailure();
            if (probing) {
                // The probe failed: restart the cool-down instead of letting every
                // subsequent request through to a backend that is still failing.
                openedAt = Instant.now();
            }
            log.error("LLM调用失败,触发熔断记录", e);
            if (isOpen.get()) {
                return fallback.get();
            }
            throw e;
        } finally {
            if (probing) {
                probeInFlight.set(false);
            }
        }
    }

    /** Counts a successful call; a success while the breaker is open closes it. */
    private void recordSuccess() {
        int total = totalCount.incrementAndGet();
        if (isOpen.get()) {
            // Clear openedAt before flipping the flag so a later re-open never sees a
            // stale timestamp and probes immediately.
            openedAt = null;
            failureCount.set(0);
            totalCount.set(0);
            isOpen.set(false);
            log.info("熔断器关闭,服务恢复正常");
            return;
        }
        decayCounters(failureCount.get(), total);
    }

    /** Counts a failed call and opens the breaker when the failure rate hits the threshold. */
    private void recordFailure() {
        int failures = failureCount.incrementAndGet();
        int total = totalCount.incrementAndGet();
        if (total >= windowSize) {
            double failureRate = (double) failures / total;
            if (failureRate >= failureThreshold && isOpen.compareAndSet(false, true)) {
                openedAt = Instant.now();
                // SLF4J only understands "{}" placeholders, so format the numbers first.
                log.error("熔断器开启!失败率={}%(阈值{}%)",
                        String.format("%.1f", failureRate * 100),
                        String.format("%.1f", failureThreshold * 100));
            }
        }
        decayCounters(failures, total);
    }

    /**
     * Approximates a sliding window: once twice the window size has been counted, both
     * counters are halved so that older outcomes weigh less. Applied on successes as well
     * as failures so a long healthy run cannot dilute the failure rate indefinitely.
     */
    private void decayCounters(int failures, int total) {
        if (total >= windowSize * 2) {
            failureCount.set(failures / 2);
            totalCount.set(total / 2);
        }
    }

    /** Returns whether the cool-down since the breaker opened (or last failed probe) has elapsed. */
    private boolean shouldAttemptReset() {
        Instant opened = openedAt;
        return opened != null &&
                Instant.now().isAfter(opened.plusSeconds(coolDownSeconds));
    }
}
质量SLA的技术实现
/**
 * Quality SLA enforcement service.
 *
 * <p>Provides a rule-based quality gate that can be placed on the request path: an
 * output is generated, scored with cheap heuristics, and regenerated a bounded number
 * of times if it falls below the configured minimum score.
 */
@Service
@RequiredArgsConstructor
@Slf4j
public class QualitySlaEnforcer {
    /** Number of regeneration attempts allowed after the first one. */
    private static final int MAX_RETRIES = 2;

    private final LlmEvaluationService evaluationService;
    private final HallucinationMonitoringService hallucinationMonitor;
    @Value("${sla.quality.min-score:0.70}")
    private double minQualityScore;
    @Value("${sla.quality.max-hallucination-rate:0.10}")
    private double maxHallucinationRate;

    /**
     * Quality gate: checks the output right after generation and retries when it is
     * below the minimum score.
     *
     * <p>Note: every retry adds a full generation to the latency, so use this only for
     * high-risk scenarios.
     *
     * @param userInput the user's input, used for the language-consistency check
     * @param context optional retrieval context; may be {@code null} or empty
     * @param generator produces one candidate output per call
     * @return the first output that reaches the minimum score, otherwise the
     *     best-scoring output seen; {@code null} only if every attempt produced
     *     {@code null}
     */
    public String generateWithQualityGate(String userInput, String context,
                                          Supplier<String> generator) {
        String bestOutput = null;
        double bestScore = 0;
        for (int attempt = 0; attempt <= MAX_RETRIES; attempt++) {
            String output = generator.get();
            // Quick rule-based check only (no LLM evaluator call, to avoid doubling latency).
            double quickScore = quickQualityCheck(userInput, output, context);

            // Also accept a candidate when nothing has been kept yet, so an output that
            // scores exactly 0 is still returned instead of null.
            if (bestOutput == null || quickScore > bestScore) {
                bestScore = quickScore;
                bestOutput = output;
            }
            if (quickScore >= minQualityScore) {
                log.debug("质量门控通过,attempt={}, score={}", attempt, quickScore);
                break;
            }
            if (attempt < MAX_RETRIES) {
                log.warn("质量门控未通过(score={}),重试 {}/{}", quickScore, attempt + 1, MAX_RETRIES);
            }
        }
        return bestOutput;
    }

    /**
     * Quick rule-based quality check that needs no LLM call.
     *
     * <p>Starts from 1.0 and subtracts a penalty per failed rule; the result is clamped
     * to be non-negative. A {@code null} output scores 0.
     */
    private double quickQualityCheck(String input, String output, String context) {
        if (output == null) {
            return 0;
        }
        double score = 1.0;

        // Rule 1: output length should be reasonable.
        if (output.length() < 20) {
            score -= 0.4;
        } else if (output.length() > 2000) {
            score -= 0.1;
        }
        // Rule 2: refusal patterns.
        if (isRefusal(output)) {
            score -= 0.3;
        }
        // Rule 3: output language should match the input language.
        if (!isLanguageConsistent(input, output)) {
            score -= 0.2;
        }
        // Rule 4: repeated sentences.
        if (hasRepetition(output)) {
            score -= 0.2;
        }
        // Rule 5: when context was supplied, the output should make use of it.
        if (context != null && !context.isEmpty() && !hasContextReference(output, context)) {
            score -= 0.1;
        }
        return Math.max(0, score);
    }

    /** Returns whether the output contains one of the known refusal phrases. */
    private boolean isRefusal(String output) {
        return output.contains("我无法") || output.contains("我不能") ||
                output.contains("对不起,我") || output.contains("I cannot");
    }

    /** Returns whether input and output agree on being (or not being) mostly Chinese. */
    private boolean isLanguageConsistent(String input, String output) {
        return isMostlyChinese(input) == isMostlyChinese(output);
    }

    /** A text counts as Chinese when more than 30% of its chars are in U+4E00..U+9FA5. */
    private static boolean isMostlyChinese(String text) {
        long hanChars = text.chars().filter(c -> c >= 0x4E00 && c <= 0x9FA5).count();
        return hanChars > text.length() * 0.3;
    }

    /** Returns whether any sentence longer than 10 characters occurs more than once. */
    private boolean hasRepetition(String output) {
        String[] sentences = output.split("[。!?.!?]");
        Set<String> seen = new HashSet<>();
        for (String s : sentences) {
            String trimmed = s.trim();
            if (trimmed.length() > 10 && !seen.add(trimmed)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Simple check for whether the output reuses keywords from the context: at least two
     * distinct context tokens longer than 3 characters must appear in the output.
     */
    private boolean hasContextReference(String output, String context) {
        String[] words = context.split("[\\s,,。!?]+");
        long matchCount = Arrays.stream(words)
                .filter(w -> w.length() > 3 && output.contains(w))
                .distinct() // one keyword repeated in the context must not count twice
                .limit(2)
                .count();
        return matchCount >= 2;
    }
}
制定合理的SLA目标
最后,SLA目标本身的制定也是工程问题:
不要承诺无法履行的SLA。如果你依赖GPT-4 API,而GPT-4的P99延迟是8秒,你的SLA就不应该写P99 < 5秒。
分场景设置不同的SLA。实时聊天场景需要P95 < 3秒;批量报告生成可以接受P95 < 30秒。不要用一个SLA覆盖所有场景。
质量SLA要保守。质量指标受太多因素影响,刚上线时不要承诺过高的质量通过率,先积累基线数据,再基于数据制定合理目标。
