Post #2067: LLM Cost Optimization in Practice: Six Ways We Cut Our API Bill by 60%
2026/4/30 · about 9 minutes
Audience: engineers and tech leads who care about AI application costs | Reading time: about 18 minutes | Core takeaway: use caching, model routing, token compression, and similar techniques to cut LLM call costs substantially without hurting quality
Last quarter, the OpenAI bill for our AI product grew to the point where it made the boss frown. So we ran a systematic cost-optimization effort and ended up cutting API spend by 62%, with essentially no degradation in user experience.
This post breaks down each of the techniques we used.
Start with cost monitoring
Before optimizing anything, you need to know where the money goes:
/**
 * LLM cost tracker.
 * Aggregates cost per feature, user, and model.
 */
@Service
@RequiredArgsConstructor
@Slf4j
public class LlmCostTracker {

    // Prices in USD per 1K tokens: {input, output}
    private static final Map<String, double[]> MODEL_PRICES = Map.of(
        "gpt-4o", new double[]{0.005, 0.015},
        "gpt-4o-mini", new double[]{0.00015, 0.0006},
        "gpt-3.5-turbo", new double[]{0.0005, 0.0015},
        "claude-3-5-haiku", new double[]{0.0008, 0.004}
    );

    private final MeterRegistry meterRegistry;
    private final RedisTemplate<String, String> redis;

    /**
     * Record the cost of a single LLM call.
     */
    public void recordCost(
            String featureName,
            String userId,
            String model,
            int inputTokens,
            int outputTokens) {
        double cost = calculateCost(model, inputTokens, outputTokens);

        // Prometheus metric
        Counter.builder("llm.cost.usd")
                .tag("feature", featureName)
                .tag("model", model)
                .register(meterRegistry)
                .increment(cost);

        // Redis accumulators for daily/monthly reports, stored as micro-dollars
        String dayKey = "llm:cost:" + LocalDate.now();
        String featureKey = "llm:cost:feature:" + featureName + ":" + LocalDate.now();
        redis.opsForValue().increment(dayKey + ":total", (long) (cost * 1_000_000));
        redis.opsForValue().increment(featureKey, (long) (cost * 1_000_000));

        // Overspend alert: log any single call above $0.10
        if (cost > 0.1) {
            log.warn("Expensive LLM call: feature={}, model={}, cost=${}, tokens={}/{}",
                    featureName, model, String.format("%.4f", cost), inputTokens, outputTokens);
        }
    }

    public double calculateCost(String model, int inputTokens, int outputTokens) {
        double[] prices = MODEL_PRICES.getOrDefault(model, new double[]{0.005, 0.015});
        return (inputTokens * prices[0] + outputTokens * prices[1]) / 1000;
    }

    /**
     * Today's cost report.
     */
    public CostReport getDailyCostReport() {
        String dayKey = "llm:cost:" + LocalDate.now();
        String totalStr = redis.opsForValue().get(dayKey + ":total");
        double totalUsd = totalStr != null ? Long.parseLong(totalStr) / 1_000_000.0 : 0;
        return new CostReport(LocalDate.now(), totalUsd);
    }

    public record CostReport(LocalDate date, double totalUsd) {}
}
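Where does recordCost hook in? A minimal sketch, assuming a langchain4j-style ChatLanguageModel that returns token usage on its Response; the TrackedChatService name and the hard-coded model id are illustrative, not part of the tracker itself:

@Service
@RequiredArgsConstructor
public class TrackedChatService {

    private final ChatLanguageModel model;   // e.g. the gpt-4o-mini client
    private final LlmCostTracker costTracker;

    public String chat(String feature, String userId, String prompt) {
        // The message-based API reports token usage alongside the answer
        Response<AiMessage> response = model.generate(UserMessage.from(prompt));
        TokenUsage usage = response.tokenUsage();
        costTracker.recordCost(feature, userId, "gpt-4o-mini",
                usage.inputTokenCount(), usage.outputTokenCount());
        return response.content().text();
    }
}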
Optimization 1: Semantic caching (the biggest win)
User questions are highly repetitive:
/**
 * Semantic cache: reuse answers for similar questions.
 * Questions don't need to match exactly; a semantically close question is a hit.
 */
@Service
@RequiredArgsConstructor
@Slf4j
public class SmartSemanticCache {

    private final EmbeddingModel embeddingModel;
    private final RedisTemplate<String, String> redis;
    private final ObjectMapper objectMapper;

    private static final String CACHE_KEY_PREFIX = "semantic_cache:";
    private static final double SIMILARITY_THRESHOLD = 0.92; // similarity threshold
    private static final Duration CACHE_TTL = Duration.ofHours(24);

    @Data
    @Builder
    @NoArgsConstructor
    @AllArgsConstructor // needed so Jackson can deserialize the builder-style entry
    private static class CacheEntry {
        private String question;
        private float[] embedding;
        private String answer;
        private long hitCount;
        private LocalDateTime createdAt;
    }

    /**
     * Look for a semantically similar cached entry.
     */
    public Optional<String> findSimilarCached(String question) {
        float[] queryEmb = embeddingModel.embed(question);

        // Scan all cache entries and pick the most similar one
        // (a KEYS scan is O(n); acceptable while the cache stays small)
        Set<String> keys = redis.keys(CACHE_KEY_PREFIX + "*");
        if (keys == null || keys.isEmpty()) return Optional.empty();

        double bestSimilarity = 0;
        String bestAnswer = null;
        String bestKey = null;
        for (String key : keys) {
            String json = redis.opsForValue().get(key);
            if (json == null) continue;
            try {
                CacheEntry entry = objectMapper.readValue(json, CacheEntry.class);
                double similarity = cosineSimilarity(queryEmb, entry.getEmbedding());
                if (similarity > bestSimilarity) {
                    bestSimilarity = similarity;
                    bestAnswer = entry.getAnswer();
                    bestKey = key;
                }
            } catch (Exception e) {
                // skip entries that fail to deserialize
            }
        }

        if (bestSimilarity >= SIMILARITY_THRESHOLD) {
            log.debug("Semantic cache hit: similarity={}, key={}",
                    String.format("%.3f", bestSimilarity), bestKey);
            // Update the hit count asynchronously so it never blocks the request
            updateHitCount(bestKey);
            return Optional.of(bestAnswer);
        }
        return Optional.empty();
    }

    /**
     * Write an answer into the cache.
     */
    public void put(String question, String answer) {
        float[] embedding = embeddingModel.embed(question);
        CacheEntry entry = CacheEntry.builder()
                .question(question)
                .embedding(embedding)
                .answer(answer)
                .hitCount(0)
                .createdAt(LocalDateTime.now())
                .build();
        String key = CACHE_KEY_PREFIX + UUID.randomUUID();
        try {
            String json = objectMapper.writeValueAsString(entry);
            redis.opsForValue().set(key, json, CACHE_TTL);
        } catch (JsonProcessingException e) {
            log.warn("Cache write failed: {}", e.getMessage());
        }
    }

    private double cosineSimilarity(float[] a, float[] b) {
        if (a.length != b.length) return 0;
        double dot = 0, normA = 0, normB = 0;
        for (int i = 0; i < a.length; i++) {
            dot += a[i] * b[i];
            normA += a[i] * a[i];
            normB += b[i] * b[i];
        }
        return dot / (Math.sqrt(normA) * Math.sqrt(normB));
    }

    private void updateHitCount(String key) {
        CompletableFuture.runAsync(() -> {
            try {
                String json = redis.opsForValue().get(key);
                if (json != null) {
                    CacheEntry entry = objectMapper.readValue(json, CacheEntry.class);
                    entry.setHitCount(entry.getHitCount() + 1);
                    redis.opsForValue().set(key, objectMapper.writeValueAsString(entry), CACHE_TTL);
                }
            } catch (Exception ignored) {}
        });
    }
}
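In the request path this is a plain read-through pattern: check the cache, call the LLM only on a miss, then store the fresh answer. A small sketch; CachedChatService is an illustrative name, not part of the cache itself:

@Service
@RequiredArgsConstructor
public class CachedChatService {

    private final SmartSemanticCache cache;
    private final ChatLanguageModel llm;

    public String answer(String question) {
        // A cache hit costs nothing; a miss pays for exactly one LLM call
        return cache.findSimilarCached(question)
                .orElseGet(() -> {
                    String answer = llm.generate(question);
                    cache.put(question, answer);
                    return answer;
                });
    }
}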
Optimization 2: Tiered model routing
Not every question needs the most expensive model:
/**
 * Cost-aware routing based on question complexity.
 * Simple questions go to cheap models; only complex ones get the expensive model.
 */
@Service
@RequiredArgsConstructor
@Slf4j
public class CostAwareModelRouter {

    private final ChatLanguageModel gpt4o;      // $0.005/$0.015 per 1K tokens
    private final ChatLanguageModel gpt4oMini;  // $0.00015/$0.0006 per 1K tokens
    private final ChatLanguageModel localModel; // free (self-hosted)

    /**
     * Route each question to the most suitable model.
     * Goal: minimize cost while still meeting the quality bar.
     */
    public String route(String question, RouteHint hint) {
        ModelChoice choice = selectModel(question, hint);
        log.debug("Model routing: {} → {}",
                question.substring(0, Math.min(30, question.length())), choice.name());
        return switch (choice) {
            case LOCAL -> localModel.generate(question);
            case MINI -> gpt4oMini.generate(question);
            case FULL -> gpt4o.generate(question);
        };
    }

    private ModelChoice selectModel(String question, RouteHint hint) {
        // 1. Explicit override
        if (hint == RouteHint.FORCE_PREMIUM) return ModelChoice.FULL;
        if (hint == RouteHint.FORCE_CHEAP) return ModelChoice.LOCAL;
        // 2. Lightweight intent detection (rule-based, no LLM call)
        if (isSimpleFactualQuestion(question)) return ModelChoice.LOCAL;
        if (isComplexAnalysisQuestion(question)) return ModelChoice.FULL;
        // 3. Default to the mid-tier model
        return ModelChoice.MINI;
    }

    private boolean isSimpleFactualQuestion(String question) {
        // Patterns typical of simple factual queries ("what is…", "is there…", "how many…")
        String[] simplePatterns = {
            ".*是什么.*", ".*有没有.*", ".*多少.*",
            ".*是否.*", ".*能不能.*", ".*可以.*吗"
        };
        for (String pattern : simplePatterns) {
            if (question.matches(pattern)) return true;
        }
        return question.length() < 30; // very short questions are usually simple
    }

    private boolean isComplexAnalysisQuestion(String question) {
        // Keywords typical of complex analysis (analyze, compare, evaluate, design, code review, optimization strategy)
        String[] complexKeywords = {"分析", "比较", "评估", "设计方案", "代码审查", "优化策略"};
        for (String keyword : complexKeywords) {
            if (question.contains(keyword)) return true;
        }
        return question.length() > 500; // long contexts tend to need the strong model
    }

    enum ModelChoice { LOCAL, MINI, FULL }

    enum RouteHint { AUTO, FORCE_PREMIUM, FORCE_CHEAP }
}
Optimization 3: Token compression
System prompts are often long and can be compressed:
/**
 * System prompt compression.
 * Strips redundant text to cut the token cost of every call.
 */
@Service
@RequiredArgsConstructor
public class PromptTokenOptimizer {

    private final Tokenizer tokenizer;

    /**
     * Compress a prompt by removing redundancy the model doesn't need.
     */
    public OptimizedPrompt compress(String originalPrompt) {
        int originalTokens = tokenizer.countTokens(originalPrompt);
        String compressed = originalPrompt;
        // 1. Collapse runs of blank lines into one
        compressed = compressed.replaceAll("\n{3,}", "\n\n");
        // 2. Strip leading whitespace from each line
        compressed = compressed.replaceAll("(?m)^[ \\t]+", "");
        // 3. Collapse repeated punctuation
        compressed = compressed.replaceAll("[。!?]{2,}", "。");
        // 4. Drop polite filler that has no effect on model behavior
        compressed = removeRedundantPolitePhrase(compressed);

        int compressedTokens = tokenizer.countTokens(compressed);
        double compressionRatio = 1.0 - (double) compressedTokens / originalTokens;
        return new OptimizedPrompt(compressed, originalTokens, compressedTokens, compressionRatio);
    }

    private String removeRedundantPolitePhrase(String prompt) {
        // Filler phrases ("please think carefully", "as a professional AI assistant", …)
        // that make no measurable difference to the answer
        String[] redundantPhrases = {
            "请你认真思考,",
            "作为一个专业的AI助手,",
            "我希望你能够",
            "请务必"
        };
        for (String phrase : redundantPhrases) {
            prompt = prompt.replace(phrase, "");
        }
        return prompt;
    }

    /**
     * Truncate an over-long context while keeping the most relevant parts.
     */
    public String truncateContext(String context, int maxTokens) {
        int currentTokens = tokenizer.countTokens(context);
        if (currentTokens <= maxTokens) return context;

        // Strategy: keep the head and the tail, cut the middle
        // (the head usually holds key instructions, the tail the most recent turns)
        String[] lines = context.split("\n");
        int targetLines = (int) (lines.length * (double) maxTokens / currentTokens);
        int headLines = targetLines * 2 / 3;
        int tailLines = targetLines - headLines;

        StringBuilder result = new StringBuilder();
        for (int i = 0; i < Math.min(headLines, lines.length); i++) {
            result.append(lines[i]).append("\n");
        }
        result.append("\n... [部分内容已省略] ...\n\n");
        for (int i = Math.max(0, lines.length - tailLines); i < lines.length; i++) {
            result.append(lines[i]).append("\n");
        }
        return result.toString();
    }

    public record OptimizedPrompt(
            String content,
            int originalTokens,
            int optimizedTokens,
            double compressionRatio
    ) {}
}
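Since the system prompt is static, it only needs to be compressed once. A minimal usage sketch; CompressedPromptHolder and RAW_SYSTEM_PROMPT are placeholders standing in for however you load your real prompt:

@Component
@RequiredArgsConstructor
@Slf4j
public class CompressedPromptHolder {

    private final PromptTokenOptimizer optimizer;
    private String systemPrompt; // compressed once at startup, reused on every call

    @PostConstruct
    void init() {
        PromptTokenOptimizer.OptimizedPrompt result = optimizer.compress(RAW_SYSTEM_PROMPT);
        this.systemPrompt = result.content();
        log.info("System prompt compressed: {} -> {} tokens ({}% saved)",
                result.originalTokens(), result.optimizedTokens(),
                Math.round(result.compressionRatio() * 100));
    }

    public String systemPrompt() { return systemPrompt; }

    // Stand-in for the real prompt text
    private static final String RAW_SYSTEM_PROMPT = """
            作为一个专业的AI助手,请你认真思考,帮助用户解决技术问题。
            """;
}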
Optimization 4: Batched requests
Aggregate many short requests into a single call:
/**
 * Request batching.
 * Aggregates many small requests into one bulk call.
 * Good fit for bulk translation, classification, summarization, and similar workloads.
 */
@Service
@RequiredArgsConstructor
@Slf4j
public class BatchRequestProcessor {

    private final ChatLanguageModel llm;

    /**
     * Bulk text classification.
     * Roughly 80% cheaper than classifying one text per call.
     */
    public List<ClassificationResult> batchClassify(
            List<String> texts,
            List<String> categories) {
        if (texts.isEmpty()) return List.of();

        // Build a single prompt covering every text
        StringBuilder prompt = new StringBuilder();
        prompt.append("请对以下").append(texts.size()).append("段文本进行分类。\n");
        prompt.append("分类选项:").append(String.join("/", categories)).append("\n\n");
        prompt.append("按JSON数组格式输出,每个元素包含index和category:\n\n");
        for (int i = 0; i < texts.size(); i++) {
            prompt.append("文本").append(i).append(": ").append(texts.get(i)).append("\n");
        }
        prompt.append("\n输出格式:[{\"index\":0,\"category\":\"分类名\"}, ...]");

        String response = llm.generate(prompt.toString());
        return parseBatchResults(response, texts.size());
    }

    private List<ClassificationResult> parseBatchResults(String response, int expectedCount) {
        try {
            String json = extractJsonArray(response);
            ObjectMapper mapper = new ObjectMapper();
            List<Map<String, Object>> raw = mapper.readValue(
                    json, new TypeReference<List<Map<String, Object>>>() {});
            return raw.stream()
                    .map(item -> new ClassificationResult(
                            ((Number) item.get("index")).intValue(),
                            (String) item.get("category")
                    ))
                    .toList();
        } catch (Exception e) {
            log.warn("Failed to parse batched classification result: {}", e.getMessage());
            // On parse failure, fall back to a default category for every text
            List<ClassificationResult> defaults = new ArrayList<>();
            for (int i = 0; i < expectedCount; i++) {
                defaults.add(new ClassificationResult(i, "未知"));
            }
            return defaults;
        }
    }

    private String extractJsonArray(String text) {
        int start = text.indexOf('[');
        int end = text.lastIndexOf(']');
        return start >= 0 && end > start ? text.substring(start, end + 1) : "[]";
    }

    public record ClassificationResult(int index, String category) {}
}
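The service above assumes the caller already holds a list of texts. When requests arrive one at a time, a small collector can buffer them for a short window and flush them in one batched call. A sketch only, assuming Spring scheduling is enabled; BatchingClassifier, the 200 ms window, and the category list are illustrative:

@Service
@RequiredArgsConstructor
public class BatchingClassifier {

    private final BatchRequestProcessor processor;
    private static final List<String> CATEGORIES = List.of("投诉", "咨询", "其他");

    private record Pending(String text, CompletableFuture<String> future) {}
    private final List<Pending> queue = Collections.synchronizedList(new ArrayList<>());

    /** Callers submit one text at a time and get a future for its category. */
    public CompletableFuture<String> classify(String text) {
        CompletableFuture<String> future = new CompletableFuture<>();
        queue.add(new Pending(text, future));
        return future;
    }

    /** Every 200 ms, flush whatever has queued up in a single batched LLM call. */
    @Scheduled(fixedDelay = 200)
    public void flush() {
        List<Pending> batch;
        synchronized (queue) {
            if (queue.isEmpty()) return;
            batch = new ArrayList<>(queue);
            queue.clear();
        }
        List<String> texts = batch.stream().map(Pending::text).toList();
        List<BatchRequestProcessor.ClassificationResult> results =
                processor.batchClassify(texts, CATEGORIES);
        for (BatchRequestProcessor.ClassificationResult r : results) {
            if (r.index() >= 0 && r.index() < batch.size()) {
                batch.get(r.index()).future().complete(r.category());
            }
        }
        // Anything the model skipped falls back to the default category
        batch.stream().filter(p -> !p.future().isDone())
                .forEach(p -> p.future().complete("未知"));
    }
}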
Optimization 5: Output length control
Model output is verbose by default; constraining its length saves money directly:
/**
 * Output length control.
 * Uses prompt instructions to bound how long the model's answer can be.
 */
public class OutputLengthController {

    /**
     * Append a length-constraint instruction to the prompt.
     */
    public static String addLengthConstraint(String originalPrompt, OutputMode mode) {
        String constraint = switch (mode) {
            case CONCISE -> "\n\n注意:回答要简洁,控制在100字以内,只给出核心答案。";
            case NORMAL -> "\n\n注意:回答控制在300字以内,重点突出。";
            case DETAILED -> "\n\n请详细说明,但避免重复,控制在800字以内。";
            case ULTRA_SHORT -> "\n\n用一句话回答(30字以内)。";
        };
        return originalPrompt + constraint;
    }

    /**
     * Pick an output mode automatically from the request context.
     */
    public static OutputMode inferOutputMode(String question, String feature) {
        // Chat rarely needs long answers
        if ("chat".equals(feature)) return OutputMode.CONCISE;
        // Yes/no style confirmations need only one sentence
        if (question.matches(".*是不是.*|.*对不对.*|.*可以吗.*")) {
            return OutputMode.ULTRA_SHORT;
        }
        // Analysis and explanation questions deserve detail
        if (question.contains("分析") || question.contains("解释")) {
            return OutputMode.DETAILED;
        }
        return OutputMode.NORMAL;
    }

    public enum OutputMode { ULTRA_SHORT, CONCISE, NORMAL, DETAILED }
}
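A quick usage sketch; the sample question, the "qa" feature name, and the injected llm reference are placeholders:

// The yes/no pattern matches here, so the mode resolves to ULTRA_SHORT (one-sentence answer)
String question = "这个接口是不是幂等的?";
OutputLengthController.OutputMode mode =
        OutputLengthController.inferOutputMode(question, "qa");
String constrained = OutputLengthController.addLengthConstraint(question, mode);
String answer = llm.generate(constrained); // llm: an injected ChatLanguageModel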
Optimization 6: Local-model fallback
Route low-risk requests to a free, locally hosted model:
/**
 * Hybrid local + cloud model strategy.
 * Low-complexity requests go to the free local model first.
 */
@Service
@RequiredArgsConstructor
@Slf4j
public class HybridModelService {

    private final ChatLanguageModel cloudModel; // OpenAI/Claude
    private final ChatLanguageModel localModel; // local Ollama model

    /**
     * Hybrid call: try the local model first and escalate to the cloud
     * only when the local answer misses the quality bar.
     */
    public String generateWithFallup(String prompt, double qualityThreshold) {
        // Local model first (free)
        String localResponse = localModel.generate(prompt);

        // Quality check (fast heuristics, no extra API call)
        if (isQualitySufficient(localResponse, prompt, qualityThreshold)) {
            log.debug("Local answer meets the bar; cloud call avoided");
            return localResponse;
        }

        // Not good enough locally, escalate to the cloud model
        log.debug("Local answer below the bar; escalating to the cloud model");
        return cloudModel.generate(prompt);
    }

    private boolean isQualitySufficient(String response, String prompt, double threshold) {
        // Heuristic quality checks (no extra LLM call needed)
        // 1. The answer must not be too short
        if (response.length() < 20) return false;
        // 2. It must not be a canned refusal ("作为AI" / "我无法" = "as an AI" / "I cannot")
        if (response.contains("作为AI") || response.contains("我无法")) return false;
        // 3. It should be relevant to the question (simple word-overlap check)
        String[] questionWords = prompt.split("[\\s,。?!]+");
        int overlap = 0;
        for (String word : questionWords) {
            if (word.length() > 1 && response.contains(word)) overlap++;
        }
        double relevance = (double) overlap / questionWords.length;
        return relevance >= threshold;
    }
}

The results
| Technique | Savings | Implementation effort | Quality impact |
|---|---|---|---|
| Semantic caching | 30-40% | Medium | None |
| Model routing | 40-60% | Medium | Minimal |
| Token compression | 10-15% | Low | None |
| Batching | 50-70% | Medium (requires business-logic changes) | None |
| Output length control | 20-30% | Low | Depends on the scenario |
| Local-model fallback | 30-50% | High (requires deployment) | None for low-risk scenarios |
Used together, these six techniques got us to a 62% cost reduction. Semantic caching and model routing offer the best return on effort: neither is hard to implement, and they deliver the largest savings.
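For reference, a simplified sketch of how the pieces line up in a single request path. OptimizedLlmGateway is an illustrative name, it assumes RouteHint and OutputMode are accessible from this class, and the real gateway also records every call through LlmCostTracker:

@Service
@RequiredArgsConstructor
public class OptimizedLlmGateway {

    private final SmartSemanticCache cache;
    private final PromptTokenOptimizer optimizer;
    private final CostAwareModelRouter router;

    public String ask(String question, String feature) {
        // 1. Semantic cache: a hit costs nothing
        Optional<String> cached = cache.findSimilarCached(question);
        if (cached.isPresent()) return cached.get();

        // 2. Shrink the prompt and constrain the answer length
        String prompt = optimizer.compress(question).content();
        OutputLengthController.OutputMode mode =
                OutputLengthController.inferOutputMode(question, feature);
        prompt = OutputLengthController.addLengthConstraint(prompt, mode);

        // 3. Route to the cheapest model that can handle the request
        String answer = router.route(prompt, CostAwareModelRouter.RouteHint.AUTO);

        // 4. Populate the cache so the next similar question is free
        cache.put(question, answer);
        return answer;
    }
}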
