第1958篇：AI应用的告警策略设计——区分模型问题和工程问题的告警分类

老张 · 2026/4/30 · 阅读时间大约 10 分钟

第1958篇：AI应用的告警策略设计——区分模型问题和工程问题的告警分类

告警太多是比没有告警更麻烦的问题。

某个团队配置了200多条告警规则，值班人员每天收到300-400条通知，大多数是"LLM响应时间超过5000ms"。刚开始大家还会去看，两个月后就开始完全忽视了，那个收件箱成了垃圾箱。

然后有一天，一条真正重要的告警——"所有用户的回答都返回了空字符串"——就淹没在噪音里，整整两个小时没有人处理。

告警疲劳是运维领域老生常谈的问题，但在AI系统里，它有一个额外的维度：告警往往没有区分"模型问题"和"工程问题"，所有异常都混在一起，导致即使看了告警也不知道该怎么处理。

今天讲告警策略设计，核心是这个分类框架。

问题分类：模型问题 vs 工程问题

先建立分类体系。这两类问题的处理方式完全不同，不能混为一谈。

模型问题：系统工程上运行正常，但AI给出的回答在质量或内容上有问题。处理这类问题通常需要Prompt工程师或者算法工程师来分析，不是运维直接能修的。

工程问题：系统本身出了问题，跟AI模型关系不大。这类问题是传统运维擅长处理的，重启、扩容、切换等操作就能解决。

混淆这两类的后果：运维拿到一个"LLM回答质量下降"的告警，不知道该干什么；算法工程师拿到一个"向量数据库连接池耗尽"的告警，也不知道该干什么。

告警优先级矩阵

/**
 * Severity levels for alerts, declared from most urgent (P0) to least urgent (P3).
 * The trailing comment on each constant states the expected response time and blast radius.
 */
public enum AlertPriority {
    P0_CRITICAL,    // Handle immediately; minute-level response; affects all users
    P1_HIGH,        // Handle within 1 hour; affects some users or a core feature
    P2_MEDIUM,      // Handle during working hours; degrades user experience but not functionality
    P3_LOW          // Record it; address it in the next iteration
}

/**
 * Lookup table that assigns a default {@link AlertPriority} to each {@link AlertType}.
 *
 * <p>Entries are grouped by problem category (engineering vs. model) and, within a
 * category, by priority. The table is immutable.
 */
public class AlertPriorityMatrix {

    private static final Map<AlertType, AlertPriority> PRIORITY_MAP = Map.ofEntries(
        // Engineering problems - high priority
        Map.entry(AlertType.LLM_API_COMPLETELY_DOWN, AlertPriority.P0_CRITICAL),
        Map.entry(AlertType.ERROR_RATE_OVER_50PCT, AlertPriority.P0_CRITICAL),
        Map.entry(AlertType.ALL_RESPONSES_EMPTY, AlertPriority.P0_CRITICAL),
        Map.entry(AlertType.TOKEN_QUOTA_EXHAUSTED, AlertPriority.P1_HIGH),
        Map.entry(AlertType.VECTOR_DB_UNAVAILABLE, AlertPriority.P1_HIGH),
        Map.entry(AlertType.P99_LATENCY_OVER_30S, AlertPriority.P1_HIGH),
        Map.entry(AlertType.ERROR_RATE_OVER_10PCT, AlertPriority.P1_HIGH),
        Map.entry(AlertType.CIRCUIT_BREAKER_OPENED, AlertPriority.P1_HIGH),

        // Engineering problems - medium priority
        Map.entry(AlertType.P95_LATENCY_DEGRADED, AlertPriority.P2_MEDIUM),
        Map.entry(AlertType.RATE_LIMIT_FREQUENTLY_HIT, AlertPriority.P2_MEDIUM),
        Map.entry(AlertType.QUEUE_BACKLOG_GROWING, AlertPriority.P2_MEDIUM),

        // Model problems - high priority
        Map.entry(AlertType.HARMFUL_CONTENT_DETECTED, AlertPriority.P0_CRITICAL),
        Map.entry(AlertType.MASS_HALLUCINATION_DETECTED, AlertPriority.P1_HIGH),
        Map.entry(AlertType.REFUSAL_RATE_SPIKE, AlertPriority.P1_HIGH),

        // Model problems - medium priority
        Map.entry(AlertType.QUALITY_SCORE_SIGNIFICANT_DROP, AlertPriority.P2_MEDIUM),
        Map.entry(AlertType.FORMAT_COMPLIANCE_DROP, AlertPriority.P2_MEDIUM),
        Map.entry(AlertType.USER_SATISFACTION_DECLINE, AlertPriority.P2_MEDIUM),

        // Model problems - low priority (trend-type problems)
        Map.entry(AlertType.GRADUAL_QUALITY_DEGRADATION, AlertPriority.P3_LOW),
        Map.entry(AlertType.RETRIEVAL_QUALITY_BELOW_BASELINE, AlertPriority.P3_LOW)
    );

    /**
     * Looks up the priority configured for an alert type.
     *
     * @param type     the alert type to look up; may be {@code null}
     * @param fallback the value to return when {@code type} is {@code null} or has no entry
     * @return the configured priority, or {@code fallback} if there is none
     */
    public static AlertPriority priorityOf(AlertType type, AlertPriority fallback) {
        // Maps created by Map.ofEntries throw NullPointerException on a null key lookup,
        // so null is screened out before touching the table.
        if (type == null) {
            return fallback;
        }
        return PRIORITY_MAP.getOrDefault(type, fallback);
    }
}

告警规则的精细化配置

/**
 * Scheduled rule evaluator that turns metric readings into alerts.
 *
 * <p>Engineering checks run every 30 seconds on short windows. Model-quality checks run
 * every 5 minutes on 1-2 hour windows because those metrics are noisy. Every alert goes
 * through {@code fireAlert}, which applies priority-dependent de-duplication before
 * notifying.
 *
 * <p>NOTE(review): {@code getLatencyBaseline}, {@code getCurrentPromptVersion},
 * {@code extractGroupingKey} and {@code getRunbookUrl} are called here but are not part
 * of the class as shown; they are assumed to be defined elsewhere.
 */
@Service
public class AlertRuleEngine {

    private final MetricsRepository metricsRepo;
    private final AlertNotifier notifier;
    private final AlertDeduplicator deduplicator;

    /**
     * Engineering alerts: threshold checks on time-series data.
     * Runs the availability, latency and token-quota rules in that order.
     */
    @Scheduled(fixedRate = 30_000) // every 30 seconds
    public void checkEngineeringAlerts() {
        checkLlmAvailability();
        checkLatencyByScenario();
        checkTokenQuota();
    }

    /** Fires a P0 alert when the LLM success rate over the last 5 minutes is below 50%. */
    private void checkLlmAvailability() {
        double llmSuccessRate = metricsRepo.getLLMSuccessRate(Duration.ofMinutes(5));
        if (llmSuccessRate < 0.50) {
            fireAlert(AlertType.LLM_API_COMPLETELY_DOWN,
                orderedContext("success_rate", llmSuccessRate,
                               "window", "5分钟"),
                AlertPriority.P0_CRITICAL);
        }
    }

    /** Fires a P2 alert per scenario whose 10-minute P99 latency has degraded. */
    private void checkLatencyByScenario() {
        Map<String, Long> p99ByScenario =
            metricsRepo.getP99LatencyByScenario(Duration.ofMinutes(10));
        for (Map.Entry<String, Long> entry : p99ByScenario.entrySet()) {
            String scenario = entry.getKey();
            long p99 = entry.getValue();
            long baseline = getLatencyBaseline(scenario);

            // Alert only when latency is more than twice the baseline AND above an
            // absolute floor of 10 s; the floor avoids false positives in low-traffic periods.
            // NOTE(review): the priority matrix earlier in this file lists
            // P95_LATENCY_DEGRADED and P99_LATENCY_OVER_30S but not P99_LATENCY_DEGRADED —
            // confirm this constant exists in AlertType.
            if (p99 > baseline * 2 && p99 > 10_000) {
                fireAlert(AlertType.P99_LATENCY_DEGRADED,
                    orderedContext("scenario", scenario,
                                   "p99_ms", p99,
                                   "baseline_ms", baseline),
                    AlertPriority.P2_MEDIUM);
            }
        }
    }

    /** Early warning on token quota: P1 below 10% remaining, P2 below 25% remaining. */
    private void checkTokenQuota() {
        TokenQuotaStatus quotaStatus = metricsRepo.getTokenQuotaStatus();
        var remaining = quotaStatus.getRemainingPercent();
        if (remaining < 0.10) {
            fireAlert(AlertType.TOKEN_QUOTA_EXHAUSTED,
                orderedContext("remaining_percent", remaining,
                               "estimated_exhaust_time", quotaStatus.getEstimatedExhaustTime()),
                AlertPriority.P1_HIGH);
        } else if (remaining < 0.25) {
            fireAlert(AlertType.TOKEN_QUOTA_LOW,
                orderedContext("remaining_percent", remaining),
                AlertPriority.P2_MEDIUM);
        }
    }

    /**
     * Model-quality alerts: statistical checks on AI metrics.
     * These metrics are noisy, so larger windows are used to make them reliable.
     * Runs the hallucination, refusal, quality-score and format-compliance rules in that order.
     */
    @Scheduled(fixedRate = 300_000) // every 5 minutes, with larger windows
    public void checkModelQualityAlerts() {
        checkHallucinationRate();
        checkRefusalRate();
        checkQualityScore();
        checkFormatCompliance();
    }

    /** Fires P1 when the 1-hour hallucination rate is over twice the baseline and above 15%. */
    private void checkHallucinationRate() {
        double currentHallucinationRate = metricsRepo.getHallucinationRate(Duration.ofHours(1));
        double baselineHallucinationRate = metricsRepo.getBaselineHallucinationRate();

        boolean doubled = currentHallucinationRate > baselineHallucinationRate * 2;
        boolean aboveFloor = currentHallucinationRate > 0.15; // at least 15% before alerting
        if (doubled && aboveFloor) {
            // A zero baseline makes increase_ratio Infinity (double division); it does not throw.
            fireAlert(AlertType.MASS_HALLUCINATION_DETECTED,
                orderedContext("current_rate", currentHallucinationRate,
                               "baseline_rate", baselineHallucinationRate,
                               "increase_ratio", currentHallucinationRate / baselineHallucinationRate),
                AlertPriority.P1_HIGH);
        }
    }

    /**
     * Fires P1 when the 1-hour refusal rate exceeds three times the baseline.
     * A spike may indicate a safety-filter anomaly or prompt injection.
     */
    private void checkRefusalRate() {
        double refusalRate = metricsRepo.getRefusalRate(Duration.ofHours(1));
        double baselineRefusalRate = metricsRepo.getBaselineRefusalRate();

        if (refusalRate > baselineRefusalRate * 3) {
            fireAlert(AlertType.REFUSAL_RATE_SPIKE,
                orderedContext("current_rate", refusalRate,
                               "baseline_rate", baselineRefusalRate),
                AlertPriority.P1_HIGH);
        }
    }

    /** Fires P2 when the 2-hour average quality score is more than 15% below the baseline. */
    private void checkQualityScore() {
        double avgQualityScore = metricsRepo.getAvgQualityScore(Duration.ofHours(2));
        double qualityBaseline = metricsRepo.getQualityBaseline();

        if (avgQualityScore < qualityBaseline * 0.85) {
            fireAlert(AlertType.QUALITY_SCORE_SIGNIFICANT_DROP,
                orderedContext("current_score", avgQualityScore,
                               "baseline_score", qualityBaseline,
                               "drop_percent", (1 - avgQualityScore / qualityBaseline) * 100),
                AlertPriority.P2_MEDIUM);
        }
    }

    /**
     * Fires P2 when the 1-hour format-compliance rate is below 85%.
     * The current prompt version is attached because a prompt change is the usual cause.
     */
    private void checkFormatCompliance() {
        double formatComplianceRate = metricsRepo.getFormatComplianceRate(Duration.ofHours(1));
        if (formatComplianceRate < 0.85) {
            String currentPromptVersion = getCurrentPromptVersion();

            fireAlert(AlertType.FORMAT_COMPLIANCE_DROP,
                orderedContext("compliance_rate", formatComplianceRate,
                               "current_prompt_version", currentPromptVersion,
                               "suggestion", "检查最近的Prompt版本变更"),
                AlertPriority.P2_MEDIUM);
        }
    }

    /**
     * Builds an alert context that iterates in the order the pairs are given.
     *
     * <p>{@code Map.of} is not used because its iteration order is unspecified, which would
     * make the field order in rendered notifications arbitrary, and because it rejects
     * {@code null} values with a {@link NullPointerException}.
     *
     * @param keyValues alternating {@code String} key and value arguments
     * @return an unmodifiable, insertion-ordered map
     * @throws IllegalArgumentException if an odd number of arguments is supplied
     * @throws ClassCastException if a key is not a {@code String}
     */
    private static Map<String, Object> orderedContext(Object... keyValues) {
        if (keyValues.length % 2 != 0) {
            throw new IllegalArgumentException(
                "orderedContext expects key/value pairs, got " + keyValues.length + " arguments");
        }
        Map<String, Object> context = new java.util.LinkedHashMap<>();
        for (int i = 0; i < keyValues.length; i += 2) {
            context.put((String) keyValues[i], keyValues[i + 1]);
        }
        return java.util.Collections.unmodifiableMap(context);
    }

    /**
     * Sends an alert unless an equivalent one was sent within the de-duplication window.
     *
     * <p>Alerts are considered equivalent when they share the same type and grouping key
     * (derived from the context). The window depends on priority: P0 = 5 minutes,
     * P1 = 30 minutes, P2 = 2 hours, P3 = 24 hours.
     *
     * @param type     the alert type
     * @param context  metric values and hints to show to the responder
     * @param priority the severity used for both the alert and the de-duplication window
     */
    private void fireAlert(AlertType type, Map<String, Object> context,
                            AlertPriority priority) {
        AlertKey key = AlertKey.of(type, extractGroupingKey(context));

        Duration deduplicationWindow = switch (priority) {
            case P0_CRITICAL -> Duration.ofMinutes(5);
            case P1_HIGH -> Duration.ofMinutes(30);
            case P2_MEDIUM -> Duration.ofHours(2);
            case P3_LOW -> Duration.ofHours(24);
        };

        if (!deduplicator.shouldFire(key, deduplicationWindow)) {
            return;
        }

        Alert alert = Alert.builder()
            .type(type)
            .category(type.getCategory()) // MODEL or ENGINEERING
            .priority(priority)
            .context(context)
            .firedAt(Instant.now())
            .runbook(getRunbookUrl(type)) // attach the runbook link
            .build();

        // The key is recorded only after the send call returns, so an alert whose send
        // throws is not marked as delivered.
        notifier.send(alert);
        deduplicator.record(key);
    }
}

告警内容的设计

一个好的告警通知，应该让接收者在15秒内理解：发生了什么、有多严重、应该做什么。

/**
 * Renders an {@link Alert} as a human-readable notification message.
 *
 * <p>The message is laid out so a responder can see what happened, how severe it is and
 * what to do next: severity and category header, current metric values, estimated impact
 * (when available), recommended actions, runbook link and quick-action placeholders.
 *
 * <p>NOTE(review): {@code formatKey} and {@code formatValue} are called here but are not
 * part of the class as shown; they are assumed to be defined elsewhere.
 */
@Service
public class AlertMessageFormatter {

    /**
     * Builds the notification text for one alert.
     *
     * <p>NOTE(review): the text uses {@code **bold**} and {@code [text](url)} markup.
     * Slack's classic mrkdwn uses {@code *bold*} and {@code <url|text>} instead — confirm
     * that the target surface accepts standard Markdown.
     *
     * @param alert the alert to render; its priority, category, type and context are read
     * @return the formatted message
     */
    public String formatForSlack(Alert alert) {
        StringBuilder sb = new StringBuilder();

        // Severity marker
        String emoji = switch (alert.getPriority()) {
            case P0_CRITICAL -> ":rotating_light:";
            case P1_HIGH -> ":warning:";
            case P2_MEDIUM -> ":information_source:";
            case P3_LOW -> ":memo:";
        };

        // Category tag: anything that is not MODEL is labelled as an engineering problem
        String categoryTag = alert.getCategory() == AlertCategory.MODEL ?
            "[模型问题]" : "[工程问题]";

        sb.append(emoji).append(" **").append(categoryTag).append("** ")
          .append(alert.getType().getDisplayName()).append("\n\n");

        // Key metrics, one bullet per context entry
        sb.append("**当前状态：**\n");
        for (Map.Entry<String, Object> entry : alert.getContext().entrySet()) {
            sb.append("• ").append(formatKey(entry.getKey()))
              .append("：").append(formatValue(entry.getValue())).append("\n");
        }

        // Impact section, only when an estimate is available
        if (alert.getEstimatedImpact() != null) {
            sb.append("\n**影响范围：**\n");
            sb.append("• 受影响用户：").append(alert.getEstimatedImpact().getAffectedUsers())
              .append("\n");
            sb.append("• 受影响请求/分钟：").append(alert.getEstimatedImpact().getRequestsPerMin())
              .append("\n");
        }

        // Recommended actions
        sb.append("\n**处理建议：**\n");
        sb.append(getRecommendedActions(alert)).append("\n");

        // Runbook link, only when one is set
        if (alert.getRunbook() != null) {
            sb.append("\n📖 [处理手册](").append(alert.getRunbook()).append(")");
        }

        // Quick-action buttons (simplified placeholder for Slack Block Kit)
        sb.append("\n\n[查看监控面板] [查看相关日志] [静默此告警(1小时)]");

        return sb.toString();
    }

    /**
     * Returns the step-by-step handling advice for an alert.
     *
     * <p>Four alert types have dedicated checklists. Any other type points to the runbook,
     * or to the on-call owner when no runbook is set, so the message never shows the
     * literal text "null".
     *
     * @param alert the alert whose type selects the advice
     * @return the advice text, with steps separated by newlines
     */
    private String getRecommendedActions(Alert alert) {
        return switch (alert.getType()) {
            case LLM_API_COMPLETELY_DOWN ->
                "1. 检查LLM API状态页\n" +
                "2. 确认是否触发限流（查看429错误率）\n" +
                "3. 如果是服务商问题，切换到备用LLM端点\n" +
                "4. 打开熔断器，防止雪崩";

            case TOKEN_QUOTA_EXHAUSTED ->
                "1. 立即申请临时配额提升\n" +
                "2. 检查最近是否有Token用量异常的业务请求\n" +
                "3. 考虑临时降低非核心业务的并发上限\n" +
                "4. 评估是否需要触发降级模式";

            case MASS_HALLUCINATION_DETECTED ->
                "1. 检查最近是否有Prompt版本变更\n" +
                "2. 如果有，立即回滚到上一个Prompt版本\n" +
                "3. 检查向量检索质量是否下降\n" +
                "4. 通知产品团队，人工审核最近1小时的回答";

            case REFUSAL_RATE_SPIKE ->
                "1. 检查是否有Prompt注入攻击\n" +
                "2. 查看被拒绝的请求样本，分析拒绝原因\n" +
                "3. 检查模型安全过滤器配置是否有变更\n" +
                "4. 如果是系统性问题，考虑临时降低安全过滤敏感度";

            default -> alert.getRunbook() != null
                ? "查看运维手册：" + alert.getRunbook()
                : "暂无预设处理建议，请联系当前值班负责人";
        };
    }
}

告警静默和升级机制

/**
 * Escalates high-priority alerts that nobody acknowledged and decides which alerts to
 * silence during the off-peak window.
 *
 * <p>NOTE(review): {@code log} is used below but is not declared in the class as shown;
 * presumably it is supplied by a logging annotation or a field outside this excerpt —
 * confirm.
 */
@Service
public class AlertEscalationService {

    /** How long a P1 alert may stay unacknowledged before it is escalated. */
    private static final Duration ESCALATION_THRESHOLD = Duration.ofMinutes(30);

    /** Off-peak window bounds in the JVM's default time zone; both bounds are exclusive. */
    private static final LocalTime OFF_PEAK_START = LocalTime.of(2, 0);
    private static final LocalTime OFF_PEAK_END = LocalTime.of(7, 0);

    /** Look-back window used to count active users. */
    private static final Duration ACTIVE_USER_WINDOW = Duration.ofMinutes(15);

    /** Below this many active users, off-peak P1 alerts are silenced. */
    private static final long MIN_ACTIVE_USERS_FOR_P1 = 10;

    private final AlertRepository alertRepo;
    private final OnCallSchedule onCallSchedule;
    // Both collaborators below are used by this class but were not declared in the original.
    private final AlertNotifier notifier;
    private final MetricsRepository metricsRepo;

    /**
     * Escalation: every minute, finds P1 alerts that have been unacknowledged for
     * 30 minutes, notifies the senior on-call engineer, and marks each alert as escalated.
     *
     * <p>NOTE(review): the article describes this as "upgrade to P0"; what
     * {@code alert.escalate()} actually changes is not visible here.
     */
    @Scheduled(fixedRate = 60_000)
    public void checkEscalations() {

        // High-priority alerts that were sent but never acknowledged
        List<Alert> unacknowledged = alertRepo.findUnacknowledged(
            AlertPriority.P1_HIGH,
            ESCALATION_THRESHOLD
        );

        for (Alert alert : unacknowledged) {
            log.warn("告警升级: alertId={}, type={}, duration={}min",
                alert.getId(), alert.getType(),
                alert.getAge().toMinutes());

            // Escalation notice goes to a more senior engineer
            OnCallEngineer senior = onCallSchedule.getSeniorOnCall();
            notifier.sendEscalation(alert, senior,
                "告警已超过30分钟未被处理，请紧急介入");

            alert.escalate();
            alertRepo.save(alert);
        }
    }

    /**
     * Smart silencing: lowers alert sensitivity during the off-peak window so occasional
     * latency blips do not wake anyone up.
     *
     * <p>Outside the window nothing is silenced. Inside it, P2 and P3 alerts are always
     * silenced, P1 alerts are silenced only when fewer than 10 users were active in the
     * last 15 minutes, and P0 alerts are never silenced.
     *
     * <p>NOTE(review): the code's window is 02:00-07:00, while the original comment gave
     * "2-6 o'clock" as an example; the code's bounds are kept — confirm which is intended.
     *
     * @param alert the alert about to be sent
     * @return {@code true} if the alert should be suppressed
     */
    public boolean shouldSilence(Alert alert) {
        LocalTime now = LocalTime.now();
        boolean isOffPeak = now.isAfter(OFF_PEAK_START) && now.isBefore(OFF_PEAK_END);
        if (!isOffPeak) {
            return false;
        }

        // Explicit cases instead of ordinal() arithmetic, so reordering or extending the
        // enum cannot silently change which priorities are silenced.
        return switch (alert.getPriority()) {
            case P0_CRITICAL -> false;
            case P1_HIGH ->
                metricsRepo.getActiveUserCount(ACTIVE_USER_WINDOW) < MIN_ACTIVE_USERS_FOR_P1;
            case P2_MEDIUM, P3_LOW -> true;
        };
    }
}

告警回顾机制

告警体系本身也需要被监控和优化。

/**
 * Produces a weekly report on the alerting system itself: volume, false-positive rate,
 * acknowledgement time and the noisiest alert types, plus tuning recommendations.
 *
 * <p>NOTE(review): {@code identifyNoisyAlerts} is called here but is not part of the
 * class as shown; it is assumed to be defined elsewhere.
 */
@Service
public class AlertQualityReview {

    // Both collaborators are used by this class but were not declared in the original.
    private final AlertRepository alertRepo;
    private final AlertNotifier notifier;

    /**
     * Generates and sends the alert-quality report for the preceding seven days.
     * Scheduled for 09:00 every Monday; this is a key tool against alert fatigue.
     */
    @Scheduled(cron = "0 0 9 * * MON") // every Monday at 09:00
    public void generateWeeklyReview() {
        // Sample the clock once so the window is exactly one week long.
        LocalDateTime weekEnd = LocalDateTime.now();
        LocalDateTime weekStart = weekEnd.minusWeeks(1);

        List<Alert> weekAlerts = alertRepo.findByTimeRange(weekStart, weekEnd);

        // 1. Alert volume per type
        Map<AlertType, Long> countByType = weekAlerts.stream()
            .collect(Collectors.groupingBy(Alert::getType, Collectors.counting()));

        // 2. False positives: alerts whose resolution was marked FALSE_POSITIVE
        long falsePositiveCount = weekAlerts.stream()
            .filter(a -> a.getResolution() == AlertResolution.FALSE_POSITIVE)
            .count();

        // A week with no alerts would otherwise compute 0/0 = NaN and put NaN in the report.
        double falsePositiveRate = weekAlerts.isEmpty()
            ? 0.0
            : (double) falsePositiveCount / weekAlerts.size();

        // 3. Mean time to acknowledge, in whole minutes per alert; 0 if none was acknowledged
        double avgAckMinutes = weekAlerts.stream()
            .filter(a -> a.getAcknowledgedAt() != null)
            .mapToLong(a -> Duration.between(a.getFiredAt(),
                a.getAcknowledgedAt()).toMinutes())
            .average()
            .orElse(0);

        // 4. Noisiest alert types (high frequency, low impact)
        List<AlertType> noisyAlerts = identifyNoisyAlerts(weekAlerts);

        WeeklyAlertReport report = WeeklyAlertReport.builder()
            .period(weekStart + " ~ " + weekEnd)
            .totalAlerts(weekAlerts.size())
            .falsePositiveRate(falsePositiveRate)
            .avgAckTimeMinutes(avgAckMinutes)
            .countByType(countByType)
            .noisyAlerts(noisyAlerts)
            .recommendations(generateRecommendations(
                falsePositiveRate, noisyAlerts, avgAckMinutes))
            .build();

        // Send to the team
        notifier.sendWeeklyReport(report);
    }

    /**
     * Turns the weekly statistics into tuning recommendations.
     *
     * @param fpRate     false-positive rate as a fraction in [0, 1]
     * @param noisy      alert types identified as noisy; may be empty
     * @param avgAckTime mean acknowledgement time in minutes
     * @return zero or more recommendation sentences: one if {@code fpRate} exceeds 30%,
     *         one if {@code noisy} is non-empty, one if {@code avgAckTime} exceeds 60 minutes
     */
    private List<String> generateRecommendations(double fpRate,
                                                   List<AlertType> noisy,
                                                   double avgAckTime) {
        List<String> recs = new ArrayList<>();

        if (fpRate > 0.30) {
            recs.add(String.format("误报率过高(%.0f%%)，建议审查阈值设置，" +
                "考虑提高告警触发条件", fpRate * 100));
        }

        if (!noisy.isEmpty()) {
            recs.add("以下告警频率高但被频繁静默，建议调整阈值或降低优先级：" +
                noisy.stream().map(AlertType::getDisplayName)
                    .collect(Collectors.joining(", ")));
        }

        if (avgAckTime > 60) {
            recs.add(String.format("平均响应时间过长(%.0f分钟)，" +
                "建议优化值班流程或通知渠道", avgAckTime));
        }

        return recs;
    }
}

我的几条经验

经验1：告警数量要控制

我的经验是，一个系统的告警数量每天不应该超过10条。超过这个数字，值班人员就会开始选择性忽视。如果你的系统每天100条告警，说明你的告警策略需要根本性的重新设计，而不是往里加更多规则。

经验2：区分"同比"和"绝对值"

纯绝对值告警容易产生误报。"延迟超过5000ms"这个告警，在凌晨低流量时可能完全正常，在下午高峰期才是问题。改用相对同期基线的变化量作为判断依据（必要时再叠加一个绝对值下限，就像前面延迟告警"超过基线2倍且超过10秒"的写法），误报率会大幅下降。

经验3：模型问题告警要附带诊断信息

工程问题告警，看到错误码就知道怎么处理。但模型问题告警，需要附带样本案例——"幻觉率上升"这条告警，附带上5个典型的幻觉案例，算法工程师才能快速定位是什么类型的幻觉在增加。

经验4：建立告警的"黄金case库"

某些特定类型的告警，应该附带历史上类似情况的处理记录。"这个告警上次是因为Prompt版本升级触发的，最后的处理方式是xxx"——这种历史知识积累，能大幅降低处理时间。

告警策略设计的终极目标，是让每一条告警都能在接收者的脑子里触发一个明确的"接下来应该做什么"。

如果你的告警不能让处理人员在15秒内判断出下一步行动，那这条告警需要重新设计。