第2289篇:AI网关的设计与实现——统一的LLM请求代理层的工程方案
第2289篇:AI网关的设计与实现——统一的LLM请求代理层的工程方案
适读人群:负责AI平台建设的架构师和工程师 | 阅读时长:约17分钟 | 核心价值:掌握AI网关的核心功能设计,解决多模型管理、成本控制、安全审计等企业级问题
公司里的AI用量越来越大,有一天财务部门把账单发过来,我看了一下差点没坐稳:三个月,十几个业务系统合计花了七十万的AI API费用,但没有人能说清楚钱花在哪里了——因为各个系统自己持有不同的API Key,各自调用,各自付费,没有统一的视图。
更麻烦的是,当时有个业务系统用的是GPT-4o的Key,另一个系统用的是Claude的Key,第三个系统为了省钱自己接了一个国产模型。三套认证逻辑,三套错误处理代码,没有任何统一性。
从那时候起我开始认真设计AI网关。
AI网关要解决的核心问题
API网关的概念在微服务时代已经很成熟了,但AI网关有几个独特的需求:
成本分配与预算控制:每个业务系统/用户消耗了多少token,花了多少钱,要能精确统计。当某个系统的消耗超出预算,要能自动限流甚至断流。
多模型路由:不同请求路由到不同的AI提供商(Claude、OpenAI、国产模型),基于成本、能力、可用性动态选择。
安全内容过滤:在请求到达AI之前,过滤敏感信息(身份证号、手机号等PII数据);在响应返回之前,过滤不当内容。
请求/响应日志:所有AI交互的完整日志,用于审计、调试、计费。普通API网关通常只记录请求头,但AI网关需要记录完整的prompt和response(同时注意隐私合规,例如对内容加密存储、先脱敏再落库)。
缓存:相同的prompt多次调用可以缓存结果,直接节省费用。
整体架构:
网关的核心数据模型
/**
 * Tenant configuration; each business system is one tenant.
 *
 * <p>Holds the tenant's quota limits, the models it may use, its default routing strategy
 * and its content-safety settings.
 *
 * <p>NOTE(review): no {@code @Id} field and no accessors are visible in this excerpt —
 * presumably declared or generated elsewhere; confirm before using this as a JPA entity.
 */
@Entity
public class TenantConfig {
private String tenantId;
private String tenantName;
// Quota limits
private long monthlyTokenBudget; // monthly token budget
private int requestsPerMinuteLimit; // requests-per-minute limit
private int maxTokensPerRequest; // maximum tokens for a single request
// Models this tenant is allowed to use
private List<String> allowedModels;
// Default model routing strategy
private RoutingStrategy routingStrategy;
// Safety configuration
private boolean piiFilterEnabled;
private List<String> blockedKeywords;
}
/**
 * Request tracking record.
 *
 * <p>Carries the identifiers of a request (request, tenant, user, model, provider), its token
 * counts and cost, the prompt/response payloads, performance figures and content-safety flags.
 *
 * <p>NOTE(review): no {@code @Id} field and no accessors are visible in this excerpt —
 * presumably declared or generated elsewhere; confirm.
 */
@Entity
public class AiRequestLog {
private String requestId;
private String tenantId;
private String userId;
private String model;
private String provider;
// Token statistics
private int promptTokens;
private int completionTokens;
private int totalTokens;
// Cost calculation
private BigDecimal cost;
// Request/response (stored encrypted)
private String encryptedPrompt;
private String encryptedResponse;
// Performance metrics
private long latencyMs;
private boolean cached;
// Content safety
private boolean piiDetected;
private boolean contentFiltered;
private Instant createdAt;
}
核心过滤链实现
用责任链模式实现网关的处理流水线:
/**
 * One stage of the gateway's request-processing pipeline (chain of responsibility).
 */
public interface GatewayFilter {
/**
 * Processes {@code request}. The implementations shown alongside this interface either
 * return a response themselves or hand the request on with {@code chain.proceed(request)}.
 *
 * @param request the gateway request being processed
 * @param chain the filter chain used to continue processing
 * @return the response for this request
 */
GatewayResponse filter(GatewayRequest request, FilterChain chain);
}
/**
 * Filter with {@code @Order(1)}: identifies the calling tenant from the {@code X-API-Key}
 * request header and attaches it to the request for the filters that follow.
 */
@Component
@Order(1)
public class AuthenticationFilter implements GatewayFilter {
    private final TenantRepository tenantRepository;
    // Not referenced by filter() in this excerpt.
    private final JwtValidator jwtValidator;

    /**
     * Rejects the request with an "unauthorized" response when the header is absent or does
     * not resolve to a tenant; otherwise stores the tenant under the {@code "tenant"}
     * request attribute and continues the chain.
     *
     * @param request the incoming gateway request
     * @param chain the filter chain used to continue processing
     * @return an unauthorized response, or whatever the rest of the chain returns
     */
    @Override
    public GatewayResponse filter(GatewayRequest request, FilterChain chain) {
        final String presentedKey = request.getHeader("X-API-Key");
        if (presentedKey == null) {
            return GatewayResponse.unauthorized("Missing API key");
        }

        final TenantConfig resolvedTenant =
                tenantRepository.findByApiKey(presentedKey).orElse(null);
        if (resolvedTenant != null) {
            request.setAttribute("tenant", resolvedTenant);
            return chain.proceed(request);
        }
        return GatewayResponse.unauthorized("Invalid API key");
    }
}
/**
 * Filter with {@code @Order(2)}: enforces the tenant's requests-per-minute limit and monthly
 * token budget using counters kept in Redis.
 *
 * <p>{@code alertService}, {@code currentMinute()} and {@code currentMonth()} are used below
 * but not declared in this excerpt; they are assumed to be provided elsewhere in the class.
 */
@Component
@Order(2)
public class QuotaFilter implements GatewayFilter {
    private final RedisTemplate<String, Long> redis;

    /**
     * Counts this request against the per-minute limit, then checks the month's token usage
     * against the tenant budget before letting the request continue.
     *
     * @param request the gateway request; must carry the {@code "tenant"} attribute
     * @param chain the filter chain used to continue processing
     * @return a "too many requests" response when the per-minute limit is exceeded, a
     *     "payment required" response when the monthly budget is used up, otherwise whatever
     *     the rest of the chain returns
     */
    @Override
    public GatewayResponse filter(GatewayRequest request, FilterChain chain) {
        TenantConfig tenant = (TenantConfig) request.getAttribute("tenant");

        // Per-minute request limit: one counter per tenant per minute bucket.
        String rpmKey = "rpm:" + tenant.getTenantId() + ":" + currentMinute();
        Long rpmCounter = redis.opsForValue().increment(rpmKey);
        redis.expire(rpmKey, Duration.ofMinutes(2));
        // increment() is documented to return null inside a pipeline/transaction. Unboxing it
        // directly would throw, so an unavailable counter is treated as "no requests counted".
        long rpm = rpmCounter != null ? rpmCounter : 0L;
        if (rpm > tenant.getRequestsPerMinuteLimit()) {
            return GatewayResponse.tooManyRequests(
                    "Rate limit exceeded: " + rpm + "/" + tenant.getRequestsPerMinuteLimit() + " rpm");
        }

        // Monthly token budget. Read the counter exactly once: a second read could observe the
        // key after it expired and return null, which the original ternary would then unbox.
        String monthKey = "tokens:" + tenant.getTenantId() + ":" + currentMonth();
        Long storedTokens = redis.opsForValue().get(monthKey);
        long usedTokens = storedTokens != null ? storedTokens : 0L;
        if (usedTokens >= tenant.getMonthlyTokenBudget()) {
            // Budget exhausted: raise an alert and reject the request.
            // NOTE(review): this alerts on every rejected request; consider de-duplicating.
            alertService.sendBudgetAlert(tenant.getTenantId(), usedTokens, tenant.getMonthlyTokenBudget());
            return GatewayResponse.paymentRequired("Monthly token budget exceeded");
        }

        return chain.proceed(request);
    }
}
/**
 * Filter with {@code @Order(3)}: when the tenant has PII filtering enabled, replaces phone
 * numbers, ID-card numbers and bank-card numbers in the prompt with placeholders before the
 * request continues down the chain.
 *
 * <p>{@code log} is used below but not declared in this excerpt; it is assumed to be provided
 * elsewhere in the class.
 */
@Component
@Order(3)
public class PiiFilter implements GatewayFilter {
    // Common mainland-China PII patterns.
    private static final Pattern PHONE_PATTERN =
            Pattern.compile("1[3-9]\\d{9}");
    // Exactly 17 digits plus a check character (digit or X/x), not embedded in a longer digit
    // run, so a 19-digit card number is not split into "ID + leftover digit".
    private static final Pattern ID_CARD_PATTERN =
            Pattern.compile("(?<!\\d)\\d{17}[\\dXx](?!\\d)");
    private static final Pattern BANK_CARD_PATTERN =
            Pattern.compile("\\d{16,19}");

    /**
     * Sanitizes the prompt if the tenant enabled PII filtering, then continues the chain.
     *
     * @param request the gateway request; must carry the {@code "tenant"} attribute
     * @param chain the filter chain used to continue processing
     * @return whatever the rest of the chain returns
     */
    @Override
    public GatewayResponse filter(GatewayRequest request, FilterChain chain) {
        TenantConfig tenant = (TenantConfig) request.getAttribute("tenant");
        if (!tenant.isPiiFilterEnabled()) {
            return chain.proceed(request);
        }

        String originalPrompt = request.getPrompt();
        if (originalPrompt == null) {
            // Nothing to scan; avoid a NullPointerException in the matcher.
            return chain.proceed(request);
        }

        PiiSanitizationResult sanitized = sanitizePii(originalPrompt);
        if (sanitized.isModified()) {
            log.info("检测到PII数据,已脱敏: tenantId={}, types={}",
                    tenant.getTenantId(), sanitized.getDetectedTypes());
            request.setPrompt(sanitized.getSanitizedText());
            request.setAttribute("piiDetected", true);
        }
        return chain.proceed(request);
    }

    /**
     * Redacts PII from {@code text} and reports which kinds were found.
     *
     * <p>Longer identifiers are redacted first. An 11-digit phone match can occur inside an
     * 18-digit ID number or a card number; running the phone pattern first would replace only
     * the middle of such a number, leaving the remaining digits in the prompt and reporting
     * the wrong type.
     *
     * @param text the prompt text; must not be null
     * @return the sanitized text, the detected types in the order they were redacted
     *     (ID_CARD, BANK_CARD, PHONE), and whether anything was changed
     */
    private PiiSanitizationResult sanitizePii(String text) {
        List<String> detected = new ArrayList<>();
        String sanitized = text;
        sanitized = redact(ID_CARD_PATTERN, sanitized, "[ID_REDACTED]", "ID_CARD", detected);
        sanitized = redact(BANK_CARD_PATTERN, sanitized, "[BANK_CARD_REDACTED]", "BANK_CARD", detected);
        sanitized = redact(PHONE_PATTERN, sanitized, "[PHONE_REDACTED]", "PHONE", detected);
        return new PiiSanitizationResult(sanitized, detected, !detected.isEmpty());
    }

    /** Replaces every match of {@code pattern} with {@code placeholder}, recording {@code type} if any match exists. */
    private static String redact(Pattern pattern, String text, String placeholder,
                                 String type, List<String> detected) {
        Matcher matcher = pattern.matcher(text);
        if (!matcher.find()) {
            return text;
        }
        detected.add(type);
        return matcher.replaceAll(placeholder);
    }
}
/**
 * Filter with {@code @Order(4)}: serves a cached response when the vector store holds an entry
 * for a sufficiently similar prompt, and stores successful responses for later reuse.
 *
 * <p>{@code log} and {@code determineCacheTtl(...)} are used below but not declared in this
 * excerpt; they are assumed to be provided elsewhere in the class.
 *
 * <p>NOTE(review): the lookup and the store are keyed by prompt text only, with no tenant
 * identifier, so one tenant's response may be returned to another — confirm this is intended.
 */
@Component
@Order(4)
public class SemanticCacheFilter implements GatewayFilter {
    /** Similarity threshold passed to the vector-store lookup. */
    private static final double SIMILARITY_THRESHOLD = 0.95;

    private final VectorStore vectorStore;

    /**
     * Returns a cached response for cacheable requests when one is found; otherwise runs the
     * rest of the chain and caches the response if it succeeded.
     *
     * @param request the gateway request
     * @param chain the filter chain used to continue processing
     * @return the cached response on a hit, otherwise whatever the rest of the chain returns
     */
    @Override
    public GatewayResponse filter(GatewayRequest request, FilterChain chain) {
        // Only requests marked cacheable take part (the author excludes conversational requests).
        if (!request.isCacheable()) {
            return chain.proceed(request);
        }

        // Look for a semantically similar cached entry.
        Optional<CachedResponse> cached = vectorStore.findSimilar(
                request.getPrompt(), SIMILARITY_THRESHOLD
        );
        if (cached.isPresent()) {
            log.debug("语义缓存命中: similarity={}", cached.get().getSimilarity());
            return GatewayResponse.fromCache(cached.get().getResponse());
        }

        // Cache miss: perform the real request.
        GatewayResponse response = chain.proceed(request);

        // Store successful responses; the TTL is chosen per request by determineCacheTtl.
        if (response.isSuccess()) {
            vectorStore.store(request.getPrompt(), response.getContent(),
                    determineCacheTtl(request));
        }
        return response;
    }
}
模型路由器:智能选择AI提供商
/**
 * Chooses the provider client for a request: either the explicitly requested model (if the
 * tenant may use it) or one picked by the tenant's routing strategy.
 *
 * <p>{@code tokenEstimator}, {@code selectFastestAvailable}, {@code selectHighestQuality} and
 * {@code selectRoundRobin} are used below but not declared in this excerpt; they are assumed
 * to be provided elsewhere in the class.
 */
@Component
public class ModelRouter {
    /** Price per one million tokens, keyed by model name; used to rank models by estimated cost. */
    private static final Map<String, Double> PRICE_PER_MILLION_TOKENS = Map.of(
            "claude-haiku-3", 0.25,
            "claude-sonnet-3-5", 3.0,
            "gpt-4o-mini", 0.15,
            "gpt-4o", 5.0
    );
    /** Price assumed for a model that is missing from the table. */
    private static final double DEFAULT_PRICE_PER_MILLION_TOKENS = 10.0;

    private final Map<String, AiProviderClient> providers;
    private final ProviderHealthTracker healthTracker;

    /**
     * Selects the provider client for {@code request}.
     *
     * @param request the gateway request; must carry the {@code "tenant"} attribute
     * @return the client to call; never null
     * @throws ModelNotAllowedException if a specific model is requested that the tenant may not use
     * @throws NoAvailableProviderException if no client is registered for the requested model,
     *     or the strategy finds no usable model
     */
    public AiProviderClient selectProvider(GatewayRequest request) {
        TenantConfig tenant = (TenantConfig) request.getAttribute("tenant");
        String requestedModel = request.getModel();

        // 1. A specific model was requested: check the tenant's permission, then route directly.
        if (requestedModel != null) {
            if (!tenant.getAllowedModels().contains(requestedModel)) {
                throw new ModelNotAllowedException(requestedModel, tenant.getTenantId());
            }
            AiProviderClient client = providers.get(requestedModel);
            if (client == null) {
                // Allowed but not registered: fail here instead of handing null to the caller.
                throw new NoAvailableProviderException();
            }
            return client;
        }

        // 2. Otherwise choose according to the tenant's routing strategy.
        return switch (tenant.getRoutingStrategy()) {
            case CHEAPEST -> selectCheapestAvailable(tenant, request);
            case FASTEST -> selectFastestAvailable(tenant, request);
            case BEST_QUALITY -> selectHighestQuality(tenant, request);
            case ROUND_ROBIN -> selectRoundRobin(tenant);
        };
    }

    /**
     * Picks the cheapest model among the tenant's allowed models that is healthy and has a
     * registered client.
     */
    private AiProviderClient selectCheapestAvailable(TenantConfig tenant, GatewayRequest req) {
        // Estimate the token count of the prompt.
        int estimatedTokens = tokenEstimator.estimate(req.getPrompt());

        return tenant.getAllowedModels().stream()
                .filter(model -> healthTracker.isHealthy(model))
                // Skip models without a client so a cheaper-but-unregistered model cannot
                // mask a usable one.
                .filter(providers::containsKey)
                .min(Comparator.comparingDouble(model ->
                        calculateCost(model, estimatedTokens)))
                .map(providers::get)
                .orElseThrow(() -> new NoAvailableProviderException());
    }

    /** Estimated cost of {@code tokens} tokens on {@code model}, from the price table. */
    private double calculateCost(String model, int tokens) {
        return PRICE_PER_MILLION_TOKENS.getOrDefault(model, DEFAULT_PRICE_PER_MILLION_TOKENS)
                * tokens / 1_000_000.0;
    }
}
计费和成本统计
/**
 * Records per-request usage and produces monthly usage reports.
 *
 * <p>{@code currentMonth()}, {@code endOfMonth()}, {@code groupByModel(...)} and
 * {@code calculateCacheHitRate(...)} are used below but not declared in this excerpt; they are
 * assumed to be provided elsewhere in the class.
 */
@Service
public class BillingService {
    private final AiRequestLogRepository logRepository;
    private final RedisTemplate<String, Long> redis;

    /**
     * Records the actual consumption of one request: persists the log entry and updates the
     * tenant's monthly counters in Redis.
     *
     * <p>Annotated {@code @Async} so the caller is not blocked by the write. The counters are
     * updated even when persisting the log fails, because the token counter is what the
     * real-time budget check reads.
     *
     * @param log the completed request's log entry
     */
    @Async
    public void recordUsage(AiRequestLog log) {
        try {
            logRepository.save(log);
        } finally {
            updateMonthlyCounters(log);
        }
    }

    /** Adds the request's tokens and cost to the tenant's counters for the current month. */
    private void updateMonthlyCounters(AiRequestLog log) {
        // Token counter (used for real-time limiting).
        String tokenKey = "tokens:" + log.getTenantId() + ":" + currentMonth();
        redis.opsForValue().increment(tokenKey, log.getTotalTokens());
        redis.expireAt(tokenKey, endOfMonth());

        BigDecimal cost = log.getCost();
        if (cost == null) {
            // No cost recorded for this request; nothing to add.
            return;
        }

        // Cost counter, stored as the cost multiplied by 100 and truncated to a whole number.
        // NOTE(review): any request costing less than 0.01 contributes 0 here, so this counter
        // can under-report heavily; a finer unit would change the stored format — confirm
        // with the readers of this key.
        String costKey = "cost:" + log.getTenantId() + ":" + currentMonth();
        redis.opsForValue().increment(costKey, cost.multiply(BigDecimal.valueOf(100)).longValue());
        redis.expireAt(costKey, endOfMonth());
    }

    /**
     * Builds the usage report of a tenant for the given month from the stored request logs.
     *
     * <p>NOTE(review): all log rows of the month are loaded into memory; for busy tenants an
     * aggregate query may be preferable.
     *
     * @param tenantId the tenant to report on
     * @param month the month to report on
     * @return request count, token and cost totals, per-model breakdown, cache hit rate and
     *     average latency (0 when there are no logs)
     */
    public UsageReport getMonthlyReport(String tenantId, YearMonth month) {
        List<AiRequestLog> logs = logRepository.findByTenantIdAndMonth(tenantId, month);
        return UsageReport.builder()
                .tenantId(tenantId)
                .month(month)
                .totalRequests(logs.size())
                .totalTokens(logs.stream().mapToLong(AiRequestLog::getTotalTokens).sum())
                .totalCost(logs.stream().map(AiRequestLog::getCost).reduce(BigDecimal.ZERO, BigDecimal::add))
                .byModel(groupByModel(logs))
                .cacheHitRate(calculateCacheHitRate(logs))
                .averageLatencyMs(logs.stream().mapToLong(AiRequestLog::getLatencyMs).average().orElse(0))
                .build();
    }
}
AI网关是AI平台化的基础设施。有了它,才能真正做到全局的成本可见、安全可控、能力可共享。它不是一蹴而就的,可以先从最基础的认证+日志+限流开始,逐步加入语义缓存、PII过滤、智能路由等高级功能。
