第2338篇：Java AI的错误处理架构——优雅处理LLM调用异常的工程设计

老张 | 2026/4/30 | 大约 6 分钟

第2338篇：Java AI的错误处理架构——优雅处理LLM调用异常的工程设计

适读人群：开发Java AI服务的工程师，希望提升AI应用健壮性和用户体验的开发者 | 阅读时长：约17分钟 | 核心价值：建立分层的AI异常处理架构，实现智能降级和用户友好的错误反馈

LLM调用的异常比普通HTTP调用复杂得多。不只是简单的网络错误，还有：

内容安全过滤（模型拒绝回答）
Token限制超出
上下文窗口溢出
模型幻觉返回了格式错误的JSON
工具调用死循环
API配额耗尽

每种错误对用户的影响和处理方式都不一样，不能统一用一个500错误了事。

异常分类体系

先把AI应用的异常分类清楚，才能对症下药：

// Top-level base class for AI exceptions
/**
 * Base type for every AI-related failure raised by the application.
 *
 * <p>Each instance carries an {@link AiErrorCode} plus two flags that
 * downstream code (retry loop, fallback service, global exception handler)
 * reads through the accessors below.
 */
public abstract class AiException extends RuntimeException {
    private final AiErrorCode code;
    private final boolean retryable;    // whether the failed call may be retried
    private final boolean userVisible;  // whether the message may be shown to the end user

    /**
     * @param code        error code classifying the failure; must not be {@code null}
     * @param message     detail message
     * @param retryable   whether the failed call may be retried
     * @param userVisible whether {@code message} may be shown to the end user
     * @throws NullPointerException if {@code code} is {@code null}
     */
    protected AiException(AiErrorCode code, String message, boolean retryable, boolean userVisible) {
        super(message);
        // Fail fast: callers switch on the code, so a null would only surface later as an NPE.
        this.code = java.util.Objects.requireNonNull(code, "code");
        this.retryable = retryable;
        this.userVisible = userVisible;
    }

    /**
     * @param code        error code classifying the failure; must not be {@code null}
     * @param message     detail message
     * @param cause       underlying exception, preserved for logging
     * @param retryable   whether the failed call may be retried
     * @param userVisible whether {@code message} may be shown to the end user
     * @throws NullPointerException if {@code code} is {@code null}
     */
    protected AiException(AiErrorCode code, String message, Throwable cause,
                          boolean retryable, boolean userVisible) {
        super(message, cause);
        this.code = java.util.Objects.requireNonNull(code, "code");
        this.retryable = retryable;
        this.userVisible = userVisible;
    }

    /** @return the error code classifying this failure */
    public AiErrorCode getCode() {
        return code;
    }

    /** @return {@code true} if the failed call may be retried */
    public boolean isRetryable() {
        return retryable;
    }

    /** @return {@code true} if the message may be shown to the end user */
    public boolean isUserVisible() {
        return userVisible;
    }
}

// Error code enumeration
/**
 * Error codes for AI failures. Every constant pairs a stable string
 * identifier with a default human-readable description.
 */
public enum AiErrorCode {
    // Retryable errors
    RATE_LIMITED("AI_001", "请求频率超限"),
    SERVICE_TIMEOUT("AI_002", "AI服务响应超时"),
    SERVICE_UNAVAILABLE("AI_003", "AI服务暂时不可用"),

    // Non-retryable errors (problems with the request itself)
    CONTEXT_TOO_LONG("AI_004", "输入内容过长"),
    CONTENT_FILTERED("AI_005", "内容被安全过滤"),
    INVALID_REQUEST("AI_006", "无效的请求参数"),

    // System errors (need human intervention)
    AUTHENTICATION_FAILED("AI_007", "AI服务认证失败"),
    OUTPUT_PARSE_FAILED("AI_008", "AI输出解析失败"),
    TOOL_EXECUTION_FAILED("AI_009", "工具调用执行失败"),
    QUOTA_EXCEEDED("AI_010", "API配额已耗尽");

    /** String identifier of this error, e.g. {@code "AI_001"}. */
    public final String code;

    /** Default description associated with this error. */
    public final String defaultMessage;

    /**
     * @param identifier  string identifier stored in {@link #code}
     * @param description default description stored in {@link #defaultMessage}
     */
    AiErrorCode(String identifier, String description) {
        code = identifier;
        defaultMessage = description;
    }
}

// Concrete exception classes
/**
 * Signals that the request rate limit was exceeded. Constructed as
 * retryable and user-visible, and remembers how many seconds to wait
 * before retrying.
 */
public class RateLimitedException extends AiException {
    private final int retryAfterSeconds;

    /**
     * @param retryAfterSeconds seconds to wait before retrying; also embedded in the message
     */
    public RateLimitedException(int retryAfterSeconds) {
        super(
                AiErrorCode.RATE_LIMITED,
                new StringBuilder("请求频率超限，请在")
                        .append(retryAfterSeconds)
                        .append("秒后重试")
                        .toString(),
                true,   // retryable
                true);  // userVisible
        this.retryAfterSeconds = retryAfterSeconds;
    }

    /** @return the wait time, in seconds, passed to the constructor */
    public int getRetryAfterSeconds() {
        return this.retryAfterSeconds;
    }
}

/**
 * Signals that content was rejected by a safety filter. Constructed as
 * non-retryable and user-visible.
 */
public class ContentFilteredException extends AiException {
    private final String filterReason;

    /**
     * @param filterReason reason the content was filtered; appended to the exception message
     */
    public ContentFilteredException(String filterReason) {
        super(AiErrorCode.CONTENT_FILTERED,
              "内容不符合安全要求：" + filterReason,
              false, true);
        this.filterReason = filterReason;
    }

    /** @return the filter reason passed to the constructor */
    public String getFilterReason() {
        return filterReason;
    }
}

public class ContextTooLongException extends AiException {
    private final int currentTokens;
    private final int maxTokens;
    
    public ContextTooLongException(int currentTokens, int maxTokens) {
        super(AiErrorCode.CONTEXT_TOO_LONG,
              String.format("输入内容过长：%d tokens，最大允许%d tokens", currentTokens, maxTokens),
              false, true);
        this.currentTokens = currentTokens;
        this.maxTokens = maxTokens;
    }
}

/**
 * Signals that the AI output could not be parsed. Keeps the raw output so
 * it can be inspected later; constructed as non-retryable and not
 * user-visible.
 */
public class OutputParseException extends AiException {
    private final String rawOutput;

    /**
     * @param rawOutput the unparsed output text
     * @param cause     the underlying parse failure
     */
    public OutputParseException(String rawOutput, Throwable cause) {
        // userVisible = false: technical details are not shown to users
        super(AiErrorCode.OUTPUT_PARSE_FAILED, "AI输出解析失败", cause, false, false);
        this.rawOutput = rawOutput;
    }

    /** @return the raw output passed to the constructor */
    public String getRawOutput() {
        return this.rawOutput;
    }
}

异常转换层：把底层异常转成业务异常

@Component
@Slf4j
public class LlmExceptionTranslator {
    
    /**
     * 把Spring AI/底层HTTP客户端的异常转换为业务异常
     */
    public AiException translate(Exception e) {
        String message = e.getMessage();
        if (message == null) message = e.getClass().getSimpleName();
        
        // 429 Too Many Requests
        if (message.contains("429") || message.contains("rate limit") || 
                message.contains("Rate limit")) {
            int retryAfter = extractRetryAfter(message);
            return new RateLimitedException(retryAfter);
        }
        
        // 401/403 认证失败
        if (message.contains("401") || message.contains("403") ||
                message.contains("Unauthorized") || message.contains("API key")) {
            log.error("LLM API认证失败，请检查API Key配置");
            return new AiException(AiErrorCode.AUTHENTICATION_FAILED,
                    "AI服务认证配置有误，请联系管理员", false, false) {};
        }
        
        // 内容过滤
        if (message.contains("content_filter") || message.contains("safety") ||
                message.contains("违规") || message.contains("inappropriate")) {
            return new ContentFilteredException("内容包含不适当信息");
        }
        
        // Context window exceeded
        if (message.contains("context_length_exceeded") || 
                message.contains("maximum context length") ||
                message.contains("too long")) {
            return new ContextTooLongException(0, 0);  // 无法精确获取token数时用0
        }
        
        // 超时
        if (e instanceof java.util.concurrent.TimeoutException ||
                message.contains("timeout") || message.contains("timed out")) {
            return new AiException(AiErrorCode.SERVICE_TIMEOUT, 
                    "AI服务响应超时，请重试", true, true) {};
        }
        
        // 503 Service Unavailable
        if (message.contains("503") || message.contains("Service Unavailable") ||
                message.contains("overloaded")) {
            return new AiException(AiErrorCode.SERVICE_UNAVAILABLE,
                    "AI服务繁忙，请稍后重试", true, true) {};
        }
        
        // 未知错误，记录日志后包装
        log.error("未知的LLM异常，原始错误：{}", message, e);
        return new AiException(AiErrorCode.SERVICE_UNAVAILABLE,
                "AI服务出现异常，请稍后重试", false, false) {};
    }
    
    private int extractRetryAfter(String message) {
        // 尝试从错误消息中提取重试等待时间
        try {
            if (message.contains("retry after")) {
                String[] parts = message.split("retry after");
                if (parts.length > 1) {
                    String seconds = parts[1].trim().split("[^0-9]")[0];
                    return Integer.parseInt(seconds);
                }
            }
        } catch (Exception ignored) {}
        return 60;  // 默认60秒
    }
}

重试策略：智能重试而不是盲目重试

@Service
@RequiredArgsConstructor
@Slf4j
public class RetryableLlmService {
    
    private final ChatClient chatClient;
    private final LlmExceptionTranslator exceptionTranslator;
    
    // 重试配置
    private static final int MAX_RETRIES = 3;
    private static final long INITIAL_DELAY_MS = 1000;
    private static final double BACKOFF_MULTIPLIER = 2.0;
    
    public String chatWithRetry(String message) {
        int attempt = 0;
        long delayMs = INITIAL_DELAY_MS;
        
        while (attempt < MAX_RETRIES) {
            try {
                return chatClient.prompt()
                        .user(message)
                        .call()
                        .content();
                        
            } catch (Exception e) {
                AiException aiException = exceptionTranslator.translate(e);
                attempt++;
                
                // 不可重试的错误：直接抛出
                if (!aiException.isRetryable()) {
                    log.warn("不可重试的AI错误：{}，放弃重试", aiException.getMessage());
                    throw aiException;
                }
                
                // 达到最大重试次数
                if (attempt >= MAX_RETRIES) {
                    log.error("重试{}次后仍然失败", MAX_RETRIES);
                    throw aiException;
                }
                
                // 限流错误：等待更长时间
                if (aiException instanceof RateLimitedException rateLimited) {
                    delayMs = rateLimited.getRetryAfterSeconds() * 1000L;
                }
                
                log.warn("AI调用失败（第{}次），{}ms后重试：{}",
                        attempt, delayMs, aiException.getMessage());
                
                try {
                    Thread.sleep(delayMs);
                } catch (InterruptedException ie) {
                    Thread.currentThread().interrupt();
                    throw new RuntimeException("重试被中断", ie);
                }
                
                // 指数退避
                delayMs = (long) (delayMs * BACKOFF_MULTIPLIER);
            }
        }
        
        throw new RuntimeException("不应该到达这里");
    }
}

全局异常处理器：统一的用户响应

@RestControllerAdvice
@Slf4j
public class AiGlobalExceptionHandler {
    
    @ExceptionHandler(AiException.class)
    public ResponseEntity<ErrorResponse> handleAiException(AiException e, HttpServletRequest request) {
        
        // 记录日志（内部错误记ERROR，用户操作错误记WARN）
        if (e.isUserVisible()) {
            log.warn("AI业务异常：code={}, message={}", e.getCode(), e.getMessage());
        } else {
            log.error("AI系统异常：code={}", e.getCode(), e);
        }
        
        // 构建用户响应
        String userMessage;
        int httpStatus;
        
        switch (e.getCode()) {
            case RATE_LIMITED -> {
                userMessage = e.getMessage();
                httpStatus = 429;
            }
            case CONTENT_FILTERED -> {
                userMessage = "您的输入包含不适当内容，请修改后重试";
                httpStatus = 400;
            }
            case CONTEXT_TOO_LONG -> {
                userMessage = "输入内容过长，请缩短后重试（建议控制在2000字以内）";
                httpStatus = 400;
            }
            case SERVICE_TIMEOUT, SERVICE_UNAVAILABLE -> {
                userMessage = "AI服务繁忙，请稍后重试";
                httpStatus = 503;
            }
            case AUTHENTICATION_FAILED -> {
                // 认证失败不对用户暴露，显示通用错误
                userMessage = "系统异常，请联系客服";
                httpStatus = 500;
            }
            default -> {
                userMessage = e.isUserVisible() ? e.getMessage() : "AI服务出现异常，请稍后重试";
                httpStatus = 500;
            }
        }
        
        ErrorResponse errorResponse = new ErrorResponse(
                e.getCode().code,
                userMessage,
                request.getRequestURI(),
                System.currentTimeMillis()
        );
        
        // 限流时在响应头加上重试时间
        if (e instanceof RateLimitedException rateLimited) {
            return ResponseEntity.status(httpStatus)
                    .header("Retry-After", String.valueOf(rateLimited.getRetryAfterSeconds()))
                    .body(errorResponse);
        }
        
        return ResponseEntity.status(httpStatus).body(errorResponse);
    }
    
    @ExceptionHandler(OutputParseException.class)
    public ResponseEntity<ErrorResponse> handleOutputParseException(
            OutputParseException e, HttpServletRequest request) {
        
        log.error("AI输出解析失败，原始输出：{}", 
                e.getRawOutput().substring(0, Math.min(200, e.getRawOutput().length())), e);
        
        // 输出解析失败时，返回原始文本作为降级响应
        return ResponseEntity.ok(new ErrorResponse(
                "AI_DEGRADED",
                "AI返回了非结构化内容，已作为文本处理",
                request.getRequestURI(),
                System.currentTimeMillis()
        ));
    }
    
    public record ErrorResponse(String code, String message, String path, long timestamp) {}
}

降级策略：出错时仍然给用户有价值的回应

/**
 * Chat service that tries a primary model first and falls back to a second
 * model when the primary fails with a timeout or unavailability error.
 */
@Service
@RequiredArgsConstructor
@Slf4j
public class AiServiceWithFallback {

    // NOTE(review): two ChatClient dependencies of the same type are injected
    // here; confirm the bean wiring tells them apart (e.g. by bean name or qualifier).
    private final ChatClient primaryClient;   // main LLM (e.g. GPT-4o)
    private final ChatClient fallbackClient;  // fallback LLM (e.g. GPT-3.5-turbo, cheap and stable)
    private final LlmExceptionTranslator translator;

    /**
     * Sends {@code message} to the primary model, falling back to the backup
     * model when the translated failure code is {@code SERVICE_TIMEOUT} or
     * {@code SERVICE_UNAVAILABLE}.
     *
     * @param message the user message to send
     * @return the primary model's response, or the fallback model's response
     *         prefixed with a notice that the backup model was used
     * @throws AiException the translated primary failure, if no fallback
     *                     applies or the fallback fails too; in the latter
     *                     case the fallback failure is attached as a
     *                     suppressed exception
     */
    public String chatWithFallback(String message) {
        // Try the primary model first
        try {
            return primaryClient.prompt().user(message).call().content();
        } catch (Exception e) {
            AiException aiException = translator.translate(e);
            log.warn("主模型调用失败，尝试降级：{}", aiException.getMessage());

            // Fall back only when the primary model timed out or is unavailable
            if (aiException.getCode() == AiErrorCode.SERVICE_TIMEOUT ||
                    aiException.getCode() == AiErrorCode.SERVICE_UNAVAILABLE) {
                try {
                    String fallbackResponse = fallbackClient.prompt()
                            .user(message)
                            .call()
                            .content();

                    // Tell the user that the answer comes from the fallback model
                    return "（当前使用备用模型，回答质量可能略有下降）\n\n" + fallbackResponse;

                } catch (Exception fallbackException) {
                    log.error("主备模型均不可用", fallbackException);
                    // Rethrow the primary failure, keeping the fallback failure attached
                    aiException.addSuppressed(fallbackException);
                    throw aiException;
                }
            }

            throw aiException;
        }
    }
}

错误处理不是锦上添花，而是AI应用健壮性的基石。用户遇到错误时的体验，往往比功能本身更影响他们对产品的评价。