AI应用的灾难恢复:保证AI系统99.99%可用性的设计
一、那次价值百万的教训
2025年11月15日,上午10点32分。
北京某金融科技公司的AI风控系统,突然陷入一片寂静。
负责人赵磊第一时间收到告警:OpenAI API服务不可用。
这不是第一次了,但这次特别糟糕——公司正在进行年度最大的一次促销活动,每分钟有约3000笔贷款申请需要AI风控审核。
10:32 - AI风控系统返回503
10:33 - 运营团队紧急呼叫技术
10:35 - 确认是OpenAI全球性故障
10:40 - 技术团队开始讨论方案
10:55 - 方案未定,系统仍然宕机
11:15 - OpenAI恢复服务
2小时43分钟的宕机,具体损失:
- 积压贷款申请:约4.9万笔(需要人工处理)
- 人工处理加班费:12万元
- 延迟审核导致用户流失:估算损失贷款利息收入 82万元
- 品牌声誉损失:难以估量
- 合计直接经济损失:约94万元
事后赵磊复盘:"我们在AI功能上花了300万,却完全没有考虑当AI供应商宕机时怎么办。"
这次,我要教你设计一套AI应用的灾难恢复体系,让类似的悲剧不再发生。
二、AI应用的可用性目标:4个9意味着什么
2.1 SLA数字的现实含义
| SLA级别 | 年允许停机时间 | 月允许停机时间 | 周允许停机时间 |
|---|---|---|---|
| 99% | 3天15小时 | 7.3小时 | 1.7小时 |
| 99.9% | 8.7小时 | 43.2分钟 | 10.1分钟 |
| 99.99% | 52.6分钟 | 4.3分钟 | 60.5秒 |
| 99.999% | 5.26分钟 | 26秒 | 6秒 |
AI应用的特殊性: AI响应本身就有延迟(通常2-10秒),因此AI系统的可用性不只是"服务不挂",还要考虑P99延迟。
2.2 AI应用可用性的3个层次
2.3 故障树分析(Fault Tree Analysis)
三、多供应商冗余:主备LLM自动切换
3.1 多LLM供应商路由架构
3.2 智能LLM路由器(完整实现)
package com.laozhang.ai.failover;
import io.github.resilience4j.circuitbreaker.CircuitBreaker;
import io.github.resilience4j.circuitbreaker.CircuitBreakerRegistry;
import lombok.extern.slf4j.Slf4j;
import org.springframework.ai.chat.client.ChatClient;
import org.springframework.ai.chat.messages.Message;
import org.springframework.stereotype.Service;
import java.time.Duration;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.function.Supplier;
/**
 * Multi-LLM provider failover router.
 *
 * <p>Tries providers in priority order, skipping any whose circuit breaker is
 * currently OPEN. The first successful call wins; if every provider fails (or
 * is skipped), a failure response is returned for the caller to degrade on.
 */
@Slf4j
@Service
public class LlmFailoverRouter {

    private final List<LlmProvider> providers;
    private final CircuitBreakerRegistry circuitBreakerRegistry;

    // Index of the currently preferred provider; rotation starts here.
    // NOTE(review): this index is never written after construction — sticky
    // failback relies on the primary's breaker re-closing, not on this value.
    private final AtomicInteger activeProviderIndex = new AtomicInteger(0);

    public LlmFailoverRouter(
            ChatClient openAiClient,
            ChatClient anthropicClient,
            ChatClient qwenClient,
            ChatClient doubaoClient,
            CircuitBreakerRegistry circuitBreakerRegistry
    ) {
        this.circuitBreakerRegistry = circuitBreakerRegistry;
        // Provider priority list, highest priority first.
        // NOTE(review): the per-provider timeoutMs is declared but not enforced
        // in this class; timeouts are presumably applied by the resilience4j
        // TimeLimiter configuration — confirm.
        this.providers = List.of(
                new LlmProvider("openai-gpt4o", openAiClient, 1, 60000),
                new LlmProvider("anthropic-claude", anthropicClient, 2, 60000),
                new LlmProvider("qwen-max", qwenClient, 3, 30000),
                new LlmProvider("doubao-pro", doubaoClient, 4, 30000)
        );
    }

    /**
     * Sends a chat request, transparently failing over between providers.
     *
     * @param systemPrompt system prompt for the conversation
     * @param userMessage  user message to send
     * @return a successful response from the first healthy provider, or a
     *         failure response when every provider is unavailable
     */
    public LlmResponse chat(String systemPrompt, String userMessage) {
        List<Exception> errors = new java.util.ArrayList<>();
        // Try every provider once, starting from the active one.
        for (int i = 0; i < providers.size(); i++) {
            int providerIndex = (activeProviderIndex.get() + i) % providers.size();
            LlmProvider provider = providers.get(providerIndex);
            CircuitBreaker circuitBreaker = circuitBreakerRegistry.circuitBreaker(provider.name());
            // Skip providers whose breaker is open; they are known-bad.
            if (circuitBreaker.getState() == CircuitBreaker.State.OPEN) {
                log.warn("供应商断路器开路,跳过: provider={}", provider.name());
                continue;
            }
            try {
                long startTime = System.currentTimeMillis();
                // Route the call through the breaker so failures are recorded.
                String content = CircuitBreaker.decorateSupplier(
                        circuitBreaker,
                        () -> callProvider(provider, systemPrompt, userMessage)
                ).get();
                long duration = System.currentTimeMillis() - startTime;
                if (i > 0) {
                    log.warn("使用备用供应商成功: provider={}, fallbackLevel={}", provider.name(), i);
                    // Deliberately NOT updating activeProviderIndex: the primary
                    // is retried automatically once its breaker half-opens.
                }
                log.info("LLM请求成功: provider={}, duration={}ms", provider.name(), duration);
                return new LlmResponse(
                        true, content, provider.name(), i > 0, duration, null
                );
            } catch (Exception e) {
                errors.add(e);
                // Fix: pass the exception itself so the stack trace is logged;
                // e.getMessage() alone loses the root cause.
                log.error("供应商调用失败: provider={}, error={}", provider.name(), e.getMessage(), e);
            }
        }
        // Every provider failed or was skipped by an open breaker.
        log.error("所有LLM供应商均不可用,共{}个错误", errors.size());
        return new LlmResponse(
                false, null, null, false, 0,
                "所有AI服务提供商当前不可用,请稍后重试"
        );
    }

    /** Performs the actual Spring AI call against a single provider. */
    private String callProvider(LlmProvider provider, String systemPrompt, String userMessage) {
        return provider.client()
                .prompt()
                .system(systemPrompt)
                .user(userMessage)
                .call()
                .content();
    }

    /** One upstream LLM provider: name, client, priority (1 = highest), timeout. */
    public record LlmProvider(
            String name,
            ChatClient client,
            int priority,
            int timeoutMs
    ) {}

    /** Outcome of a routed chat call, including failover metadata. */
    public record LlmResponse(
            boolean success,
            String content,
            String usedProvider,
            boolean isFailover,
            long durationMs,
            String errorMessage
    ) {}
}
3.3 Resilience4j断路器配置
# application.yml
resilience4j:
  circuitbreaker:
    configs:
      # Base config shared by every LLM provider
      llm-default:
        # Open the breaker once the failure rate exceeds 50%
        failure-rate-threshold: 50
        # Also open when the slow-call rate exceeds 50%
        slow-call-rate-threshold: 50
        # Calls longer than 30 s count as slow
        slow-call-duration-threshold: 30s
        # Statistics window: the last 10 calls
        sliding-window-size: 10
        sliding-window-type: count_based
        # After opening, wait 60 s before probing (half-open)
        wait-duration-in-open-state: 60s
        # Allow 5 probe calls while half-open
        permitted-number-of-calls-in-half-open-state: 5
        # Require at least 5 recorded calls before closed -> open can trip
        minimum-number-of-calls: 5
    instances:
      openai-gpt4o:
        base-config: llm-default
        # Primary provider: probe again quickly after recovery
        wait-duration-in-open-state: 30s
      anthropic-claude:
        base-config: llm-default
      qwen-max:
        base-config: llm-default
        # Backup providers tolerate a higher failure rate
        failure-rate-threshold: 60
      doubao-pro:
        base-config: llm-default
        failure-rate-threshold: 60
  # Timeouts (keep slow requests from exhausting the thread pool)
  timelimiter:
    configs:
      llm-timeout:
        timeout-duration: 45s
        cancel-running-future: true
    instances:
      openai-gpt4o:
        base-config: llm-timeout
      anthropic-claude:
        timeout-duration: 50s
四、降级策略:LLM不可用时的优雅降级
4.1 降级策略层次
4.2 智能降级服务实现
package com.laozhang.ai.failover.degradation;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.data.redis.core.RedisTemplate;
import org.springframework.stereotype.Service;
import java.time.Duration;
import java.util.Optional;
/**
 * Degradation strategy manager for the AI service.
 *
 * <p>When the LLM is unavailable, fallbacks are tried in priority order:
 * semantic cache → rule engine → FAQ match → canned fallback message.
 */
@Slf4j
@Service
@RequiredArgsConstructor
public class AiDegradationService {

    // NOTE(review): redisTemplate is injected but unused in this class; kept
    // to preserve the constructor signature — confirm before removing.
    private final RedisTemplate<String, String> redisTemplate;
    private final RuleEngineService ruleEngineService;
    private final FaqRepository faqRepository;
    private final AiResponseCache responseCache;

    /**
     * Runs the degradation chain for a single request.
     *
     * @param query   user query (null is tolerated and treated as empty)
     * @param context business context forwarded to the rule engine
     * @param scene   business scene (risk-control / customer-service / ...)
     * @return the best available degraded response; never null
     */
    public DegradedResponse degrade(String query, String context, String scene) {
        // Robustness fix: a null query previously threw NPE on substring().
        String safeQuery = query == null ? "" : query;
        log.warn("LLM不可用,启动降级策略: scene={}, query={}",
                scene, safeQuery.substring(0, Math.min(50, safeQuery.length())));
        // Strategy 1: semantic cache — cheapest, highest-fidelity answer.
        Optional<String> cachedResponse = responseCache.findSimilar(safeQuery, 0.92f);
        if (cachedResponse.isPresent()) {
            log.info("降级成功(语义缓存命中): scene={}", scene);
            return DegradedResponse.fromCache(cachedResponse.get());
        }
        // Strategy 2: rule engine — fits structured scenes such as risk control.
        if (ruleEngineService.isApplicable(scene, safeQuery)) {
            try {
                String ruleResult = ruleEngineService.evaluate(safeQuery, context);
                log.info("降级成功(规则引擎): scene={}", scene);
                return DegradedResponse.fromRule(ruleResult);
            } catch (Exception e) {
                // Rule failure is non-fatal: continue down the chain.
                log.warn("规则引擎降级失败: scene={}, error={}", scene, e.getMessage());
            }
        }
        // Strategy 3: FAQ match — customer-service scene only.
        if ("customer-service".equals(scene)) {
            Optional<String> faqAnswer = faqRepository.findBestMatch(safeQuery, 0.85f);
            if (faqAnswer.isPresent()) {
                log.info("降级成功(FAQ匹配): scene={}", scene);
                return DegradedResponse.fromFaq(faqAnswer.get());
            }
        }
        // Strategy 4: canned per-scene fallback message.
        String fallbackMessage = getFallbackMessage(scene);
        log.info("使用兜底响应: scene={}", scene);
        return DegradedResponse.fallback(fallbackMessage);
    }

    /** Scene-specific fallback copy shown when no smarter fallback applies. */
    private String getFallbackMessage(String scene) {
        return switch (scene) {
            case "risk-control" -> "当前AI风控服务繁忙,您的申请已进入人工审核队列," +
                    "预计30分钟内处理。感谢您的耐心等待。";
            case "customer-service" -> "非常抱歉,AI助手暂时不可用。" +
                    "您可以通过以下方式联系我们:\n" +
                    "• 人工客服热线:400-xxx-xxxx(9:00-18:00)\n" +
                    "• 在线留言,我们会在2小时内回复";
            case "content-generation" -> "内容生成服务暂时维护中,预计15分钟后恢复。" +
                    "您可以先保存草稿,稍后再试。";
            default -> "AI服务暂时不可用,请稍后重试或联系客服。";
        };
    }

    /**
     * A degraded response plus metadata describing how it was produced.
     *
     * @param content             response text
     * @param type                which fallback produced it
     * @param isFullFunctionality whether the answer is equivalent to a live LLM answer
     * @param notice              user-facing note explaining the degradation
     */
    public record DegradedResponse(
            String content,
            DegradationType type,
            boolean isFullFunctionality,
            String notice
    ) {
        public enum DegradationType {
            SEMANTIC_CACHE, RULE_ENGINE, FAQ, FALLBACK
        }

        public static DegradedResponse fromCache(String content) {
            return new DegradedResponse(content, DegradationType.SEMANTIC_CACHE,
                    true, "(来自缓存,可能不是最新)");
        }

        public static DegradedResponse fromRule(String content) {
            return new DegradedResponse(content, DegradationType.RULE_ENGINE,
                    false, "(当前为简化版响应,AI服务恢复后将提供完整服务)");
        }

        public static DegradedResponse fromFaq(String content) {
            return new DegradedResponse(content, DegradationType.FAQ,
                    false, "(来自FAQ知识库,如需个性化解答请联系人工)");
        }

        public static DegradedResponse fallback(String content) {
            return new DegradedResponse(content, DegradationType.FALLBACK,
                    false, "");
        }
    }
}
4.3 语义响应缓存
package com.laozhang.ai.failover.cache;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.ai.embedding.EmbeddingModel;
import org.springframework.data.redis.core.RedisTemplate;
import org.springframework.stereotype.Component;
import java.time.Duration;
import java.util.Optional;
import java.util.Set;
/**
 * Semantic cache for AI responses.
 *
 * <p>Looks up prior responses by embedding-vector similarity, on the premise
 * that semantically similar questions share an answer.
 * Example: "北京明天天气" ≈ "明天北京天气如何" → same cache entry.
 */
@Slf4j
@Component
@RequiredArgsConstructor
public class AiResponseCache {

    private final EmbeddingModel embeddingModel;
    private final RedisTemplate<String, Object> redisTemplate;

    private static final String CACHE_KEY_PREFIX = "ai:response:cache:";
    private static final String VECTOR_INDEX_KEY = "ai:response:vectors";
    private static final Duration CACHE_TTL = Duration.ofHours(24);
    private static final int MAX_CACHE_SIZE = 10000;

    /**
     * Finds a cached response whose query embedding is at least
     * {@code similarityThreshold}-similar to the incoming query.
     *
     * <p>NOTE: this is an O(n) scan with one Redis round trip per entry —
     * production should use RediSearch vector queries or an external vector DB.
     */
    public Optional<String> findSimilar(String query, float similarityThreshold) {
        try {
            float[] queryVector = embeddingModel.embed(query);
            Set<Object> cachedKeys = redisTemplate.opsForZSet().range(VECTOR_INDEX_KEY, 0, -1);
            if (cachedKeys == null || cachedKeys.isEmpty()) {
                return Optional.empty();
            }
            String bestKey = null;
            float bestSimilarity = 0f;
            for (Object keyObj : cachedKeys) {
                String key = keyObj.toString();
                Object vectorObj = redisTemplate.opsForHash().get(CACHE_KEY_PREFIX + key, "vector");
                if (vectorObj == null) continue;
                float[] cachedVector = (float[]) vectorObj;
                float similarity = cosineSimilarity(queryVector, cachedVector);
                if (similarity > bestSimilarity && similarity >= similarityThreshold) {
                    bestSimilarity = similarity;
                    bestKey = key;
                }
            }
            if (bestKey != null) {
                String cachedResponse = (String) redisTemplate.opsForHash().get(
                        CACHE_KEY_PREFIX + bestKey, "response"
                );
                log.debug("语义缓存命中: similarity={}, key={}", bestSimilarity, bestKey);
                return Optional.ofNullable(cachedResponse);
            }
            return Optional.empty();
        } catch (Exception e) {
            // Cache lookup failures must never break the request path.
            log.error("语义缓存查询失败: error={}", e.getMessage());
            return Optional.empty();
        }
    }

    /**
     * Caches a successful LLM response together with the query embedding.
     * Failures are logged and swallowed: caching is best-effort.
     */
    public void cacheResponse(String query, String response) {
        try {
            float[] queryVector = embeddingModel.embed(query);
            String cacheKey = generateCacheKey(query);
            // Store vector, response, and original query under one hash.
            redisTemplate.opsForHash().put(CACHE_KEY_PREFIX + cacheKey, "vector", queryVector);
            redisTemplate.opsForHash().put(CACHE_KEY_PREFIX + cacheKey, "response", response);
            redisTemplate.opsForHash().put(CACHE_KEY_PREFIX + cacheKey, "query", query);
            redisTemplate.expire(CACHE_KEY_PREFIX + cacheKey, CACHE_TTL);
            // Maintain the key index in a ZSet scored by insertion time.
            redisTemplate.opsForZSet().add(VECTOR_INDEX_KEY, cacheKey, System.currentTimeMillis());
            // Bound the cache: evict the oldest entry when over capacity.
            Long size = redisTemplate.opsForZSet().size(VECTOR_INDEX_KEY);
            if (size != null && size > MAX_CACHE_SIZE) {
                redisTemplate.opsForZSet().removeRange(VECTOR_INDEX_KEY, 0, 0);
            }
            log.debug("AI响应已缓存: query={}", query.substring(0, Math.min(50, query.length())));
        } catch (Exception e) {
            log.error("AI响应缓存失败: error={}", e.getMessage());
            // Cache failure must not affect the main flow.
        }
    }

    /** Derives a stable, non-negative cache key from the query text. */
    private String generateCacheKey(String query) {
        // Fix: widen to long before abs() — Math.abs(Integer.MIN_VALUE) is
        // still negative, which previously produced keys like "-2147483648".
        return String.valueOf(Math.abs((long) query.hashCode()));
    }

    /** Cosine similarity of two equal-length vectors; 0 when undefined. */
    private float cosineSimilarity(float[] a, float[] b) {
        if (a.length != b.length) return 0f;
        double dot = 0, normA = 0, normB = 0;
        for (int i = 0; i < a.length; i++) {
            dot += a[i] * b[i];
            normA += a[i] * a[i];
            normB += b[i] * b[i];
        }
        double denom = Math.sqrt(normA) * Math.sqrt(normB);
        // Fix: a zero vector previously caused division by zero (NaN result);
        // report 0 similarity instead.
        if (denom == 0.0) return 0f;
        return (float) (dot / denom);
    }
}
五、向量数据库高可用
5.1 向量库主备配置(以Qdrant为例)
# Docker Compose high-availability deployment
version: '3.8'
services:
  qdrant-primary:
    image: qdrant/qdrant:v1.9.0
    ports:
      - "6333:6333"
    volumes:
      - qdrant-primary-data:/qdrant/storage
    environment:
      QDRANT__CLUSTER__ENABLED: "true"
      QDRANT__CLUSTER__P2P__PORT: "6335"
    networks:
      - qdrant-cluster
  qdrant-replica-1:
    image: qdrant/qdrant:v1.9.0
    volumes:
      - qdrant-replica1-data:/qdrant/storage
    environment:
      QDRANT__CLUSTER__ENABLED: "true"
      QDRANT__CLUSTER__BOOTSTRAP__PEER_URLS: "http://qdrant-primary:6335"
    networks:
      - qdrant-cluster
  qdrant-replica-2:
    image: qdrant/qdrant:v1.9.0
    volumes:
      - qdrant-replica2-data:/qdrant/storage
    environment:
      QDRANT__CLUSTER__ENABLED: "true"
      QDRANT__CLUSTER__BOOTSTRAP__PEER_URLS: "http://qdrant-primary:6335"
    networks:
      - qdrant-cluster
  # Load balancer in front of the cluster
  nginx:
    image: nginx:latest
    ports:
      - "6300:80"
    volumes:
      - ./nginx-qdrant.conf:/etc/nginx/nginx.conf
    depends_on:
      - qdrant-primary
      - qdrant-replica-1
      - qdrant-replica-2
    networks:
      - qdrant-cluster
volumes:
  qdrant-primary-data:
  qdrant-replica1-data:
  qdrant-replica2-data:
networks:
  qdrant-cluster:
5.2 向量库故障切换
package com.laozhang.ai.failover.vectorstore;
import io.github.resilience4j.circuitbreaker.annotation.CircuitBreaker;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.ai.document.Document;
import org.springframework.ai.vectorstore.SearchRequest;
import org.springframework.ai.vectorstore.VectorStore;
import org.springframework.stereotype.Service;
import java.util.List;
/**
 * Vector-store failover service.
 *
 * <p>Routes similarity searches to the primary store (Qdrant cluster) and
 * falls back to the secondary store (PGVector) when the primary's circuit
 * breaker trips or the call throws.
 */
@Slf4j
@Service
@RequiredArgsConstructor
public class VectorStoreFailoverService {

    private final VectorStore primaryVectorStore;  // primary: Qdrant cluster
    private final VectorStore fallbackVectorStore; // backup: PGVector

    /**
     * Similarity search with automatic failover.
     *
     * @param query free-text query
     * @param topK  maximum number of documents to return
     */
    @CircuitBreaker(name = "qdrant-primary", fallbackMethod = "similaritySearchFallback")
    public List<Document> similaritySearch(String query, int topK) {
        // Fix: topK was previously ignored (the plain similaritySearch(query)
        // overload always used the store default); pass it through explicitly.
        List<Document> results = primaryVectorStore.similaritySearch(
                SearchRequest.builder().query(query).topK(topK).build());
        log.debug("主向量库搜索成功: query={}, results={}", query, results.size());
        return results;
    }

    /**
     * Fallback invoked by resilience4j when the primary store is unavailable.
     * The signature must mirror the protected method plus a trailing Exception.
     */
    public List<Document> similaritySearchFallback(String query, int topK, Exception ex) {
        log.warn("主向量库不可用,切换到备用向量库: error={}", ex.getMessage());
        try {
            return fallbackVectorStore.similaritySearch(
                    SearchRequest.builder().query(query).topK(topK).build());
        } catch (Exception e) {
            log.error("备用向量库也不可用: error={}", e.getMessage(), e);
            return List.of(); // empty result; callers degrade further
        }
    }

    /**
     * Writes documents to both stores so the backup stays in sync.
     * The primary write must succeed; backup write failures are logged and
     * should be retried via a queue in a production setup.
     */
    public void addDocuments(List<Document> documents) {
        primaryVectorStore.add(documents);
        try {
            fallbackVectorStore.add(documents);
        } catch (Exception e) {
            log.error("备用向量库写入失败,加入同步队列: error={}", e.getMessage(), e);
            // TODO: enqueue the failed batch for periodic re-sync.
        }
    }
}
六、跨区域部署架构
6.1 多AZ / 多Region AI服务部署
七、RTO/RPO指标设计
7.1 各故障场景的RTO/RPO目标
/**
 * Recovery objective definitions for each failure scenario.
 * RTO: Recovery Time Objective — maximum tolerated time to restore service.
 * RPO: Recovery Point Objective — maximum tolerated window of data loss.
 */
public class DisasterRecoveryObjectives {
    public static final Map<FailureScenario, RecoveryObjective> OBJECTIVES = Map.of(
        FailureScenario.PRIMARY_LLM_DOWN, new RecoveryObjective(
            30,  // RTO: 30 s (circuit breaker fails over automatically)
            0,   // RPO: 0 (stateless, no data to lose)
            "自动切换到备用LLM供应商",
            RecoveryObjective.Level.AUTOMATIC
        ),
        FailureScenario.ALL_CLOUD_LLM_DOWN, new RecoveryObjective(
            300, // RTO: 5 min (switch to a local model)
            0,
            "切换到本地Ollama模型(能力降级)",
            RecoveryObjective.Level.AUTOMATIC
        ),
        FailureScenario.VECTOR_DB_DOWN, new RecoveryObjective(
            60,  // RTO: 1 min (switch to the backup vector store)
            300, // RPO: 5 min (writes from the last 5 minutes may be lost)
            "切换到PGVector备用向量库",
            RecoveryObjective.Level.AUTOMATIC
        ),
        FailureScenario.SINGLE_AZ_DOWN, new RecoveryObjective(
            30,  // RTO: 30 s (K8s reschedules Pods automatically)
            0,
            "Kubernetes自动将Pod调度到其他AZ",
            RecoveryObjective.Level.AUTOMATIC
        ),
        FailureScenario.ENTIRE_REGION_DOWN, new RecoveryObjective(
            900,  // RTO: 15 min (manual switch to the DR region)
            3600, // RPO: 1 h (async replication lag)
            "手动切换DNS到灾备Region",
            RecoveryObjective.Level.MANUAL
        )
    );

    /** Failure scenarios covered by the DR plan. */
    public enum FailureScenario {
        PRIMARY_LLM_DOWN, ALL_CLOUD_LLM_DOWN, VECTOR_DB_DOWN,
        SINGLE_AZ_DOWN, ENTIRE_REGION_DOWN
    }

    /** One recovery objective: time/data targets, strategy, automation level. */
    public record RecoveryObjective(
        int rtoSeconds,
        int rpoSeconds,
        String recoveryStrategy,
        Level level
    ) {
        public enum Level { AUTOMATIC, SEMI_AUTOMATIC, MANUAL }
    }
}八、故障演练:混沌工程配置
8.1 Chaos Monkey配置(模拟LLM故障)
package com.laozhang.ai.chaos;
import lombok.extern.slf4j.Slf4j;
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import java.util.Random;
/**
 * Chaos engineering for the AI service.
 * Periodically simulates failure scenarios to exercise the failover machinery.
 *
 * WARNING: enable only in non-production environments or during drill windows!
 */
@Slf4j
@Component
@ConditionalOnProperty(name = "chaos.engineering.enabled", havingValue = "true")
public class AiChaosTesting {
    private final LlmFailoverRouter router;
    private final Random random = new Random();

    public AiChaosTesting(LlmFailoverRouter router) {
        this.router = router;
        log.warn("混沌工程已启用!这只应在测试环境运行!");
    }

    /**
     * Randomly simulates one LLM failure at 02:00 daily (test envs only).
     */
    @Scheduled(cron = "0 0 2 * * ?")
    public void simulateRandomLlmFailure() {
        String[] scenarios = {
            "primary-llm-timeout",
            "primary-llm-5xx",
            "primary-llm-down-30s",
            "all-llm-down-60s"
        };
        String scenario = scenarios[random.nextInt(scenarios.length)];
        log.warn("混沌测试启动: scenario={}", scenario);
        switch (scenario) {
            case "primary-llm-timeout" -> simulateTimeout("openai-gpt4o", 45);
            case "primary-llm-5xx" -> simulateHttpError("openai-gpt4o", 503, 20);
            case "primary-llm-down-30s" -> simulateFullOutage("openai-gpt4o", 30);
            case "all-llm-down-60s" -> simulateAllProvidersDown(60);
        }
    }

    // NOTE(review): the simulate* methods below are log-only stubs — the
    // actual fault injection (delays, forced errors) is not implemented here.
    private void simulateTimeout(String provider, int timeoutSeconds) {
        log.warn("模拟 {} 超时 {} 秒", provider, timeoutSeconds);
        // Intended implementation: inject delayed responses.
    }
    private void simulateHttpError(String provider, int statusCode, int durationSeconds) {
        log.warn("模拟 {} 返回 {} 错误 {} 秒", provider, statusCode, durationSeconds);
    }
    private void simulateFullOutage(String provider, int durationSeconds) {
        log.warn("模拟 {} 完全宕机 {} 秒", provider, durationSeconds);
    }
    private void simulateAllProvidersDown(int durationSeconds) {
        log.warn("模拟所有LLM供应商宕机 {} 秒", durationSeconds);
    }
}8.2 混沌工程配置(application-chaos.yml)
# application-chaos.yml (only effective while the "chaos" profile is active)
chaos:
  engineering:
    enabled: true
    # Fault-injection experiments
    experiments:
      llm-failure:
        enabled: true
        # 1% of requests simulate an LLM timeout
        probability: 0.01
        failure-type: TIMEOUT
        timeout-duration: 60s
      high-latency:
        enabled: true
        probability: 0.02
        failure-type: LATENCY
        added-latency: 5s
      vector-db-failure:
        enabled: true
        probability: 0.005
        failure-type: EXCEPTION
# Never enable this profile in production!
# NOTE(review): declaring spring.profiles.active inside a profile-specific
# config file does not activate the profile in recent Spring Boot versions —
# activate it via an environment variable instead; confirm for your version.
spring:
  profiles:
    active: chaos # controlled via environment variable
九、灾难恢复演练Checklist
9.1 季度DR演练标准流程
/**
 * Disaster-recovery drill checklist.
 * Executed once per quarter to keep DR capability verified and current.
 */
public class DisasterRecoveryDrillChecklist {
    public static final List<DrillItem> QUARTERLY_DRILL = List.of(
        // === Preparation phase (one week before the drill) ===
        DrillItem.prepare("DR-PREP-1", "通知所有相关团队演练时间和范围"),
        DrillItem.prepare("DR-PREP-2", "准备演练环境,确保备用系统最新数据同步"),
        DrillItem.prepare("DR-PREP-3", "准备回滚方案,确保演练失败时能快速恢复"),
        DrillItem.prepare("DR-PREP-4", "更新联系人列表和升级路径"),
        // === Execution phase ===
        // Scenario 1: primary LLM provider failover
        DrillItem.execute("DR-EXEC-1", "断开主LLM供应商(OpenAI)网络连接"),
        DrillItem.execute("DR-EXEC-2", "验证断路器在5次失败内开启(<1分钟)"),
        DrillItem.execute("DR-EXEC-3", "验证业务请求自动路由到备用供应商(Anthropic)"),
        DrillItem.execute("DR-EXEC-4", "验证用户端响应正常,降级提示正确"),
        DrillItem.execute("DR-EXEC-5", "记录切换耗时(目标:<30秒)"),
        // Scenario 2: all cloud LLMs unavailable
        DrillItem.execute("DR-EXEC-6", "同时断开所有云端LLM供应商"),
        DrillItem.execute("DR-EXEC-7", "验证降级到本地Ollama模型"),
        DrillItem.execute("DR-EXEC-8", "验证缓存响应正确返回"),
        DrillItem.execute("DR-EXEC-9", "验证无法处理的请求正确引导人工"),
        // Scenario 3: vector database outage
        DrillItem.execute("DR-EXEC-10", "停止主Qdrant集群"),
        DrillItem.execute("DR-EXEC-11", "验证RAG功能自动切换到PGVector"),
        DrillItem.execute("DR-EXEC-12", "验证搜索结果的完整性(RPO验证)"),
        // Scenario 4: entire availability zone down
        DrillItem.execute("DR-EXEC-13", "删除主AZ所有Pod"),
        DrillItem.execute("DR-EXEC-14", "验证K8s在其他AZ重新调度所有Pod(<2分钟)"),
        DrillItem.execute("DR-EXEC-15", "验证所有端点恢复正常"),
        // === Verification phase ===
        DrillItem.verify("DR-VERIFY-1", "运行完整的功能测试套件(>95%通过率)"),
        DrillItem.verify("DR-VERIFY-2", "检查监控大盘,确认无异常告警"),
        DrillItem.verify("DR-VERIFY-3", "验证日志完整性,无丢失"),
        DrillItem.verify("DR-VERIFY-4", "确认数据一致性(对比主备库数据)"),
        // === Recovery phase ===
        DrillItem.recover("DR-RECOVER-1", "恢复主LLM供应商连接"),
        DrillItem.recover("DR-RECOVER-2", "等待断路器半开状态探测成功,自动切回"),
        DrillItem.recover("DR-RECOVER-3", "验证所有指标恢复正常水平"),
        // === Review phase ===
        DrillItem.review("DR-REVIEW-1", "记录所有阶段耗时,与目标对比"),
        DrillItem.review("DR-REVIEW-2", "识别改进点,提交工单跟进"),
        DrillItem.review("DR-REVIEW-3", "更新DR文档和Runbook"),
        DrillItem.review("DR-REVIEW-4", "将演练结果发送给管理层")
    );

    /** One checklist entry; `completed`/`notes` are filled in during the drill. */
    public record DrillItem(
        String id,
        String description,
        Phase phase,
        boolean completed,
        String notes
    ) {
        public enum Phase { PREPARE, EXECUTE, VERIFY, RECOVER, REVIEW }

        // Factory methods: one per drill phase, starting as not-completed.
        public static DrillItem prepare(String id, String desc) {
            return new DrillItem(id, desc, Phase.PREPARE, false, "");
        }
        public static DrillItem execute(String id, String desc) {
            return new DrillItem(id, desc, Phase.EXECUTE, false, "");
        }
        public static DrillItem verify(String id, String desc) {
            return new DrillItem(id, desc, Phase.VERIFY, false, "");
        }
        public static DrillItem recover(String id, String desc) {
            return new DrillItem(id, desc, Phase.RECOVER, false, "");
        }
        public static DrillItem review(String id, String desc) {
            return new DrillItem(id, desc, Phase.REVIEW, false, "");
        }
    }
}十、性能数据:故障切换实测数据
在生产环境连续运行3个月,共发生 17次 LLM供应商故障(包括演练):
| 故障类型 | 发生次数 | 平均检测时间 | 平均切换时间 | 业务中断时间 |
|---|---|---|---|---|
| 主LLM超时 | 8次 | 1.2秒 | 2.1秒 | 0秒(无感知) |
| 主LLM 5xx错误 | 5次 | 0.8秒 | 1.5秒 | 0秒(无感知) |
| 主LLM完全宕机 | 3次 | 3.5秒 | 8.2秒 | 平均2.1秒 |
| 所有云端LLM宕机 | 1次 | 45秒 | 63秒 | 63秒 |
SLA达成情况:
- 目标:99.99%(每月允许停机 4.3分钟)
- 实际:99.9972%(每月实际停机 约0.8分钟)
- 超额完成目标!
降级效果:
- 语义缓存命中率:23.4%(降级期间)
- 规则引擎覆盖率:61.2%(风控场景)
- 最终引导人工:15.4%
十一、FAQ
Q1:多供应商方案成本会不会增加很多? A:主要成本在备用供应商的维护(API Key + 少量测试流量)。实际上,备用供应商只在故障时消耗,正常流量100%走主供应商。额外成本约为总API费用的2-5%,但保障了99.99%的可用性。
Q2:本地Ollama模型效果比云端差很多,降级后用户体验会大幅下降吗? A:这取决于业务场景。对于简单的客服问答,Llama-3-8B已经够用。对于复杂的分析任务,确实有明显差距。建议:(1) 降级时告知用户;(2) 将无法由小模型处理的复杂请求排队,等待主模型恢复。
Q3:断路器的阈值如何确定? A:根据你的LLM供应商的历史SLA数据来设置。一般来说:失败率阈值50%(超过一半失败才开路),这样能在真实故障时快速切换,又不会因为偶发超时误触发。
Q4:语义缓存如何保证数据安全(不同用户的响应不混用)? A:语义缓存只适用于非个性化、非隐私的响应(如FAQ、通用说明)。对于包含用户特定数据的响应,应该使用精确匹配缓存(按userId隔离)或者不缓存。
Q5:DR演练会影响线上用户吗? A:建议DR演练在业务低峰期(凌晨2-4点)进行,并在流量最小的节点上演练。也可以先在预发环境演练,验证方案可行后再在生产环境执行。
十二、完整的DR监控大盘设计
12.1 关键监控指标(Prometheus + Grafana)
package com.laozhang.ai.monitoring;
import io.micrometer.core.instrument.Counter;
import io.micrometer.core.instrument.Gauge;
import io.micrometer.core.instrument.MeterRegistry;
import io.micrometer.core.instrument.Timer;
import lombok.RequiredArgsConstructor;
import org.springframework.stereotype.Component;
import java.util.concurrent.atomic.AtomicInteger;
/**
 * Disaster-recovery metrics for the AI service (Micrometer → Prometheus).
 * Feeds the Grafana DR dashboard.
 */
@Component
@RequiredArgsConstructor
public class AiDrMetrics {

    private final MeterRegistry meterRegistry;

    // Active provider level (0 = primary, 1 = first backup, ...).
    // NOTE(review): not wired to any gauge and never updated in this class.
    private final AtomicInteger currentProviderLevel = new AtomicInteger(0);

    // Percentage of requests currently served in degraded mode.
    // NOTE(review): not wired to any gauge and never updated in this class.
    private final AtomicInteger degradationPercentage = new AtomicInteger(0);

    // One mutable health flag per provider. Micrometer gauges sample a state
    // object, and re-registering the same name+tags returns the existing
    // meter — the old per-call Gauge.builder(...) capturing a boolean froze
    // the first reported value forever. Register once, mutate the holder.
    private final java.util.concurrent.ConcurrentHashMap<String, AtomicInteger> providerHealth =
            new java.util.concurrent.ConcurrentHashMap<>();

    // Latest availability ratio, stored as raw double bits for atomic updates.
    private final java.util.concurrent.atomic.AtomicLong availabilityBits =
            new java.util.concurrent.atomic.AtomicLong(Double.doubleToLongBits(1.0));
    // 0 = availability gauge not yet registered, 1 = registered.
    private final AtomicInteger availabilityGaugeRegistered = new AtomicInteger(0);

    /** Counts one failover switch between providers. */
    public void recordProviderSwitch(String fromProvider, String toProvider, String reason) {
        Counter.builder("ai.provider.switch")
                .tag("from", fromProvider)
                .tag("to", toProvider)
                .tag("reason", reason)
                .register(meterRegistry)
                .increment();
    }

    /** Records how long one failover took for the given provider. */
    public void recordFailoverLatency(String provider, long latencyMs) {
        Timer.builder("ai.failover.latency")
                .tag("provider", provider)
                .register(meterRegistry)
                .record(java.time.Duration.ofMillis(latencyMs));
    }

    /**
     * Publishes provider health (1 = healthy, 0 = down).
     * The gauge is registered once per provider, then updated in place
     * (fixes the frozen-gauge bug described above).
     */
    public void updateProviderHealth(String provider, boolean healthy) {
        providerHealth.computeIfAbsent(provider, p -> {
            AtomicInteger holder = new AtomicInteger();
            Gauge.builder("ai.provider.health", holder, AtomicInteger::get)
                    .tag("provider", p)
                    .register(meterRegistry);
            return holder;
        }).set(healthy ? 1 : 0);
    }

    /** Counts one degradation event, by scene and degradation type. */
    public void recordDegradationEvent(String scene, String degradationType) {
        Counter.builder("ai.degradation.events")
                .tag("scene", scene)
                .tag("type", degradationType)
                .register(meterRegistry)
                .increment();
    }

    /**
     * Updates the availability gauge used for SLA reporting.
     * The gauge is registered lazily exactly once; subsequent calls only
     * update the stored value (the old code re-registered per call, which
     * silently kept the first value).
     */
    public void updateAvailabilityMetrics(
            long totalRequestsInPeriod,
            long failedRequestsInPeriod
    ) {
        double availability = totalRequestsInPeriod > 0
                ? (double) (totalRequestsInPeriod - failedRequestsInPeriod) / totalRequestsInPeriod
                : 1.0;
        availabilityBits.set(Double.doubleToLongBits(availability));
        if (availabilityGaugeRegistered.compareAndSet(0, 1)) {
            Gauge.builder("ai.service.availability",
                            () -> Double.longBitsToDouble(availabilityBits.get()))
                    .description("AI服务可用性(0-1)")
                    .register(meterRegistry);
        }
    }
}
12.2 自动SLA报告生成
package com.laozhang.ai.monitoring;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Service;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
/**
 * Monthly SLA report generator.
 * Builds an availability report for the previous calendar month, mails it to
 * management, and fires a critical alert when the 99.99% target is missed.
 */
@Slf4j
@Service
@RequiredArgsConstructor
public class SlaReportService {

    private final AiRequestMetricsRepository metricsRepository;
    private final AlertNotificationService alertService;

    /**
     * Runs at 01:00 on the 1st of each month, covering the previous month.
     */
    @Scheduled(cron = "0 0 1 1 * ?")
    public void generateMonthlySlaReport() {
        // Capture "now" once so both period boundaries derive from the same
        // instant (two now() calls could straddle midnight on a slow start).
        LocalDateTime now = LocalDateTime.now();
        LocalDateTime lastMonthStart = now.minusMonths(1).withDayOfMonth(1)
                .withHour(0).withMinute(0).withSecond(0);
        LocalDateTime lastMonthEnd = now.withDayOfMonth(1)
                .withHour(0).withMinute(0).withSecond(0);
        long totalRequests = metricsRepository.countInPeriod(lastMonthStart, lastMonthEnd);
        long failedRequests = metricsRepository.countFailedInPeriod(lastMonthStart, lastMonthEnd);
        long degradedRequests = metricsRepository.countDegradedInPeriod(lastMonthStart, lastMonthEnd);
        double availability = totalRequests > 0
                ? (double) (totalRequests - failedRequests) / totalRequests
                : 1.0;
        // "Number of nines" for display. Fix: at 100% availability the old
        // expression log10(1/(1-availability)) is +Infinity and the report
        // printed the literal string "Infinity".
        String ninesText = availability >= 1.0
                ? "∞"
                : String.format("%.1f",
                        Math.floor(Math.log10(1.0 / (1.0 - availability)) * 10) / 10);
        // Total downtime in minutes for the period.
        long downtimeMinutes = metricsRepository.sumDowntimeMinutes(lastMonthStart, lastMonthEnd);
        String report = """
                📊 AI服务月度SLA报告
                时间范围:%s 至 %s
                ══════════════════════════════
                总请求量:%,d 次
                成功请求:%,d 次
                失败请求:%,d 次
                降级请求:%,d 次(功能受限但有响应)
                ──────────────────────────────
                可用性:%.4f%% (%s个9)
                总停机时间:%d 分钟
                SLA目标(99.99%%):%s
                ══════════════════════════════
                故障切换统计:
                - 主→备供应商切换:%d 次
                - 语义缓存降级:%d 次
                - 规则引擎降级:%d 次
                - 人工处理兜底:%d 次
                """.formatted(
                lastMonthStart.format(DateTimeFormatter.ofPattern("yyyy-MM-dd")),
                lastMonthEnd.format(DateTimeFormatter.ofPattern("yyyy-MM-dd")),
                totalRequests, totalRequests - failedRequests, failedRequests, degradedRequests,
                availability * 100,
                ninesText,
                downtimeMinutes,
                availability >= 0.9999 ? "✅ 达标" : "❌ 未达标(需要复盘)",
                metricsRepository.countProviderSwitches(lastMonthStart, lastMonthEnd),
                metricsRepository.countCacheDegradations(lastMonthStart, lastMonthEnd),
                metricsRepository.countRuleDegradations(lastMonthStart, lastMonthEnd),
                metricsRepository.countManualFallbacks(lastMonthStart, lastMonthEnd)
        );
        log.info("月度SLA报告生成完成:\n{}", report);
        // Mail the report to management.
        alertService.sendEmailToManagement("AI服务月度SLA报告", report);
        // Escalate when the monthly target was missed.
        if (availability < 0.9999) {
            alertService.sendCriticalAlert("⚠️ 本月AI服务可用性未达到99.99%目标,请安排复盘会议!");
        }
    }
}
12.3 Grafana大盘关键面板配置
{
"dashboard": {
"title": "AI服务灾难恢复监控大盘",
"refresh": "30s",
"panels": [
{
"title": "当前供应商状态",
"type": "stat",
"targets": [
{"expr": "ai_provider_health{provider='openai-gpt4o'}"},
{"expr": "ai_provider_health{provider='anthropic-claude'}"},
{"expr": "ai_provider_health{provider='qwen-max'}"}
],
"thresholds": {"mode": "absolute", "steps": [{"color": "red", "value": 0}, {"color": "green", "value": 1}]}
},
{
"title": "实时可用性(SLA监控)",
"type": "gauge",
"targets": [{"expr": "ai_service_availability * 100"}],
"thresholds": {"steps": [
{"color": "red", "value": 0},
{"color": "yellow", "value": 99.9},
{"color": "green", "value": 99.99}
]},
"min": 99,
"max": 100
},
{
"title": "供应商切换事件(30分钟滚动)",
"type": "timeseries",
"targets": [{"expr": "sum by (from, to) (increase(ai_provider_switch_total[30m]))"}]
},
{
"title": "降级请求占比",
"type": "piechart",
"targets": [
{"expr": "sum(ai_degradation_events_total{type='SEMANTIC_CACHE'})", "legendFormat": "语义缓存"},
{"expr": "sum(ai_degradation_events_total{type='RULE_ENGINE'})", "legendFormat": "规则引擎"},
{"expr": "sum(ai_degradation_events_total{type='FAQ'})", "legendFormat": "FAQ降级"},
{"expr": "sum(ai_degradation_events_total{type='FALLBACK'})", "legendFormat": "兜底降级"}
]
},
{
"title": "故障切换延迟分布(P50/P95/P99)",
"type": "timeseries",
"targets": [
{"expr": "histogram_quantile(0.50, rate(ai_failover_latency_seconds_bucket[5m]))", "legendFormat": "P50"},
{"expr": "histogram_quantile(0.95, rate(ai_failover_latency_seconds_bucket[5m]))", "legendFormat": "P95"},
{"expr": "histogram_quantile(0.99, rate(ai_failover_latency_seconds_bucket[5m]))", "legendFormat": "P99"}
]
}
]
}
}
结语
赵磊的团队在那次故障后,花了2个月时间重建了整套AI灾难恢复体系。
他后来说:"以前我觉得我们的系统很稳定,直到它宕了。现在我觉得我们的系统能抗任何故障,因为我们每个月都在主动制造故障来验证。"
这就是混沌工程的精髓:与其被动等待故障发生,不如主动制造可控的故障,在造成真正损失之前修复一切隐患。
AI应用的高可用性,不是买一个贵的供应商就能解决的。它需要你在每一层——供应商、缓存、降级、监控——都做好准备。
