Post #2067: LLM Cost Optimization in Practice: Six Ways We Cut Our API Bill by 60%
2026/4/30 · about 9 minutes
Audience: engineers and tech leads who care about AI application costs | Reading time: about 18 minutes | Core takeaway: use caching, model routing, token compression, and similar techniques to cut LLM call costs substantially without hurting quality
Last quarter, the OpenAI bill for our AI product grew to the point where it made the boss frown. So we ran a systematic cost-optimization effort and ended up cutting API spend by 62%, with essentially no degradation in user experience.
This post breaks down each of the techniques we used.
Start with cost monitoring
Before optimizing anything, you need to know where the money goes:
/**
 * LLM cost tracker.
 * Aggregates cost per feature, user, and model.
 */
@Service
@RequiredArgsConstructor
@Slf4j
public class LlmCostTracker {

    // Prices in USD per 1K tokens: {input, output}
    private static final Map<String, double[]> MODEL_PRICES = Map.of(
        "gpt-4o", new double[]{0.005, 0.015},
        "gpt-4o-mini", new double[]{0.00015, 0.0006},
        "gpt-3.5-turbo", new double[]{0.0005, 0.0015},
        "claude-3-5-haiku", new double[]{0.0008, 0.004}
    );

    private final MeterRegistry meterRegistry;
    private final RedisTemplate<String, String> redis;

    /**
     * Record the cost of a single LLM call.
     */
    public void recordCost(
            String featureName,
            String userId,
            String model,
            int inputTokens,
            int outputTokens) {
        double cost = calculateCost(model, inputTokens, outputTokens);

        // Prometheus metric
        Counter.builder("llm.cost.usd")
                .tag("feature", featureName)
                .tag("model", model)
                .register(meterRegistry)
                .increment(cost);

        // Redis accumulators for daily/monthly reports, stored as micro-dollars
        String dayKey = "llm:cost:" + LocalDate.now();
        String featureKey = "llm:cost:feature:" + featureName + ":" + LocalDate.now();
        redis.opsForValue().increment(dayKey + ":total", (long) (cost * 1_000_000));
        redis.opsForValue().increment(featureKey, (long) (cost * 1_000_000));

        // Overspend alert: log any single call above $0.10
        if (cost > 0.1) {
            log.warn("Expensive LLM call: feature={}, model={}, cost=${}, tokens={}/{}",
                    featureName, model, String.format("%.4f", cost), inputTokens, outputTokens);
        }
    }

    public double calculateCost(String model, int inputTokens, int outputTokens) {
        double[] prices = MODEL_PRICES.getOrDefault(model, new double[]{0.005, 0.015});
        return (inputTokens * prices[0] + outputTokens * prices[1]) / 1000;
    }

    /**
     * Today's cost report.
     */
    public CostReport getDailyCostReport() {
        String dayKey = "llm:cost:" + LocalDate.now();
        String totalStr = redis.opsForValue().get(dayKey + ":total");
        double totalUsd = totalStr != null ? Long.parseLong(totalStr) / 1_000_000.0 : 0;
        return new CostReport(LocalDate.now(), totalUsd);
    }

    public record CostReport(LocalDate date, double totalUsd) {}
}
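Where does recordCost hook in? A minimal sketch, assuming a langchain4j-style ChatLanguageModel that returns token usage on its Response; the TrackedChatService name and the hard-coded model id are illustrative, not part of the tracker itself:

@Service
@RequiredArgsConstructor
public class TrackedChatService {

    private final ChatLanguageModel model;   // e.g. the gpt-4o-mini client
    private final LlmCostTracker costTracker;

    public String chat(String feature, String userId, String prompt) {
        // The message-based API reports token usage alongside the answer
        Response<AiMessage> response = model.generate(UserMessage.from(prompt));
        TokenUsage usage = response.tokenUsage();
        costTracker.recordCost(feature, userId, "gpt-4o-mini",
                usage.inputTokenCount(), usage.outputTokenCount());
        return response.content().text();
    }
}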
Optimization 1: Semantic caching (the biggest win)
User questions are highly repetitive:
/**
 * Semantic cache: reuse answers for similar questions.
 * Questions don't need to match exactly; a semantically close question is a hit.
 */
@Service
@RequiredArgsConstructor
@Slf4j
public class SmartSemanticCache {

    private final EmbeddingModel embeddingModel;
    private final RedisTemplate<String, String> redis;
    private final ObjectMapper objectMapper;

    private static final String CACHE_KEY_PREFIX = "semantic_cache:";
    private static final double SIMILARITY_THRESHOLD = 0.92; // similarity threshold
    private static final Duration CACHE_TTL = Duration.ofHours(24);

    @Data
    @Builder
    @NoArgsConstructor
    @AllArgsConstructor // needed so Jackson can deserialize the builder-style entry
    private static class CacheEntry {
        private String question;
        private float[] embedding;
        private String answer;
        private long hitCount;
        private LocalDateTime createdAt;
    }

    /**
     * Look for a semantically similar cached entry.
     */
    public Optional<String> findSimilarCached(String question) {
        float[] queryEmb = embeddingModel.embed(question);

        // Scan all cache entries and pick the most similar one
        // (a KEYS scan is O(n); acceptable while the cache stays small)
        Set<String> keys = redis.keys(CACHE_KEY_PREFIX + "*");
        if (keys == null || keys.isEmpty()) return Optional.empty();

        double bestSimilarity = 0;
        String bestAnswer = null;
        String bestKey = null;
        for (String key : keys) {
            String json = redis.opsForValue().get(key);
            if (json == null) continue;
            try {
                CacheEntry entry = objectMapper.readValue(json, CacheEntry.class);
                double similarity = cosineSimilarity(queryEmb, entry.getEmbedding());
                if (similarity > bestSimilarity) {
                    bestSimilarity = similarity;
                    bestAnswer = entry.getAnswer();
                    bestKey = key;
                }
            } catch (Exception e) {
                // skip entries that fail to deserialize
            }
        }

        if (bestSimilarity >= SIMILARITY_THRESHOLD) {
            log.debug("Semantic cache hit: similarity={}, key={}",
                    String.format("%.3f", bestSimilarity), bestKey);
            // Update the hit count asynchronously so it never blocks the request
            updateHitCount(bestKey);
            return Optional.of(bestAnswer);
        }
        return Optional.empty();
    }

    /**
     * Write an answer into the cache.
     */
    public void put(String question, String answer) {
        float[] embedding = embeddingModel.embed(question);
        CacheEntry entry = CacheEntry.builder()
                .question(question)
                .embedding(embedding)
                .answer(answer)
                .hitCount(0)
                .createdAt(LocalDateTime.now())
                .build();
        String key = CACHE_KEY_PREFIX + UUID.randomUUID();
        try {
            String json = objectMapper.writeValueAsString(entry);
            redis.opsForValue().set(key, json, CACHE_TTL);
        } catch (JsonProcessingException e) {
            log.warn("Cache write failed: {}", e.getMessage());
        }
    }

    private double cosineSimilarity(float[] a, float[] b) {
        if (a.length != b.length) return 0;
        double dot = 0, normA = 0, normB = 0;
        for (int i = 0; i < a.length; i++) {
            dot += a[i] * b[i];
            normA += a[i] * a[i];
            normB += b[i] * b[i];
        }
        return dot / (Math.sqrt(normA) * Math.sqrt(normB));
    }

    private void updateHitCount(String key) {
        CompletableFuture.runAsync(() -> {
            try {
                String json = redis.opsForValue().get(key);
                if (json != null) {
                    CacheEntry entry = objectMapper.readValue(json, CacheEntry.class);
                    entry.setHitCount(entry.getHitCount() + 1);
                    redis.opsForValue().set(key, objectMapper.writeValueAsString(entry), CACHE_TTL);
                }
            } catch (Exception ignored) {}
        });
    }
}
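In the request path this is a plain read-through pattern: check the cache, call the LLM only on a miss, then store the fresh answer. A small sketch; CachedChatService is an illustrative name, not part of the cache itself:

@Service
@RequiredArgsConstructor
public class CachedChatService {

    private final SmartSemanticCache cache;
    private final ChatLanguageModel llm;

    public String answer(String question) {
        // A cache hit costs nothing; a miss pays for exactly one LLM call
        return cache.findSimilarCached(question)
                .orElseGet(() -> {
                    String answer = llm.generate(question);
                    cache.put(question, answer);
                    return answer;
                });
    }
}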
Optimization 2: Tiered model routing
Not every question needs the most expensive model:
/**
 * Cost-aware routing based on question complexity.
 * Simple questions go to cheap models; only complex ones get the expensive model.
 */
@Service
@RequiredArgsConstructor
@Slf4j
public class CostAwareModelRouter {

    private final ChatLanguageModel gpt4o;      // $0.005/$0.015 per 1K tokens
    private final ChatLanguageModel gpt4oMini;  // $0.00015/$0.0006 per 1K tokens
    private final ChatLanguageModel localModel; // free (self-hosted)

    /**
     * Route each question to the most suitable model.
     * Goal: minimize cost while still meeting the quality bar.
     */
    public String route(String question, RouteHint hint) {
        ModelChoice choice = selectModel(question, hint);
        log.debug("Model routing: {} → {}",
                question.substring(0, Math.min(30, question.length())), choice.name());
        return switch (choice) {
            case LOCAL -> localModel.generate(question);
            case MINI -> gpt4oMini.generate(question);
            case FULL -> gpt4o.generate(question);
        };
    }

    private ModelChoice selectModel(String question, RouteHint hint) {
        // 1. Explicit override
        if (hint == RouteHint.FORCE_PREMIUM) return ModelChoice.FULL;
        if (hint == RouteHint.FORCE_CHEAP) return ModelChoice.LOCAL;
        // 2. Lightweight intent detection (rule-based, no LLM call)
        if (isSimpleFactualQuestion(question)) return ModelChoice.LOCAL;
        if (isComplexAnalysisQuestion(question)) return ModelChoice.FULL;
        // 3. Default to the mid-tier model
        return ModelChoice.MINI;
    }

    private boolean isSimpleFactualQuestion(String question) {
        // Patterns typical of simple factual queries ("what is…", "is there…", "how many…")
        String[] simplePatterns = {
            ".*是什么.*", ".*有没有.*", ".*多少.*",
            ".*是否.*", ".*能不能.*", ".*可以.*吗"
        };
        for (String pattern : simplePatterns) {
            if (question.matches(pattern)) return true;
        }
        return question.length() < 30; // very short questions are usually simple
    }

    private boolean isComplexAnalysisQuestion(String question) {
        // Keywords typical of complex analysis (analyze, compare, evaluate, design, code review, optimization strategy)
        String[] complexKeywords = {"分析", "比较", "评估", "设计方案", "代码审查", "优化策略"};
        for (String keyword : complexKeywords) {
            if (question.contains(keyword)) return true;
        }
        return question.length() > 500; // long contexts tend to need the strong model
    }

    enum ModelChoice { LOCAL, MINI, FULL }

    enum RouteHint { AUTO, FORCE_PREMIUM, FORCE_CHEAP }
}
Optimization 3: Token compression
System prompts are often long and can be compressed:
/**
 * System prompt compression.
 * Strips redundant text to cut the token cost of every call.
 */
@Service
@RequiredArgsConstructor
public class PromptTokenOptimizer {

    private final Tokenizer tokenizer;

    /**
     * Compress a prompt by removing redundancy the model doesn't need.
     */
    public OptimizedPrompt compress(String originalPrompt) {
        int originalTokens = tokenizer.countTokens(originalPrompt);
        String compressed = originalPrompt;
        // 1. Collapse runs of blank lines into one
        compressed = compressed.replaceAll("\n{3,}", "\n\n");
        // 2. Strip leading whitespace from each line
        compressed = compressed.replaceAll("(?m)^[ \\t]+", "");
        // 3. Collapse repeated punctuation
        compressed = compressed.replaceAll("[。!?]{2,}", "。");
        // 4. Drop polite filler that has no effect on model behavior
        compressed = removeRedundantPolitePhrase(compressed);

        int compressedTokens = tokenizer.countTokens(compressed);
        double compressionRatio = 1.0 - (double) compressedTokens / originalTokens;
        return new OptimizedPrompt(compressed, originalTokens, compressedTokens, compressionRatio);
    }

    private String removeRedundantPolitePhrase(String prompt) {
        // Filler phrases ("please think carefully", "as a professional AI assistant", …)
        // that make no measurable difference to the answer
        String[] redundantPhrases = {
            "请你认真思考,",
            "作为一个专业的AI助手,",
            "我希望你能够",
            "请务必"
        };
        for (String phrase : redundantPhrases) {
            prompt = prompt.replace(phrase, "");
        }
        return prompt;
    }

    /**
     * Truncate an over-long context while keeping the most relevant parts.
     */
    public String truncateContext(String context, int maxTokens) {
        int currentTokens = tokenizer.countTokens(context);
        if (currentTokens <= maxTokens) return context;

        // Strategy: keep the head and the tail, cut the middle
        // (the head usually holds key instructions, the tail the most recent turns)
        String[] lines = context.split("\n");
        int targetLines = (int) (lines.length * (double) maxTokens / currentTokens);
        int headLines = targetLines * 2 / 3;
        int tailLines = targetLines - headLines;

        StringBuilder result = new StringBuilder();
        for (int i = 0; i < Math.min(headLines, lines.length); i++) {
            result.append(lines[i]).append("\n");
        }
        result.append("\n... [部分内容已省略] ...\n\n");
        for (int i = Math.max(0, lines.length - tailLines); i < lines.length; i++) {
            result.append(lines[i]).append("\n");
        }
        return result.toString();
    }

    public record OptimizedPrompt(
            String content,
            int originalTokens,
            int optimizedTokens,
            double compressionRatio
    ) {}
}
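Since the system prompt is static, it only needs to be compressed once. A minimal usage sketch; CompressedPromptHolder and RAW_SYSTEM_PROMPT are placeholders standing in for however you load your real prompt:

@Component
@RequiredArgsConstructor
@Slf4j
public class CompressedPromptHolder {

    private final PromptTokenOptimizer optimizer;
    private String systemPrompt; // compressed once at startup, reused on every call

    @PostConstruct
    void init() {
        PromptTokenOptimizer.OptimizedPrompt result = optimizer.compress(RAW_SYSTEM_PROMPT);
        this.systemPrompt = result.content();
        log.info("System prompt compressed: {} -> {} tokens ({}% saved)",
                result.originalTokens(), result.optimizedTokens(),
                Math.round(result.compressionRatio() * 100));
    }

    public String systemPrompt() { return systemPrompt; }

    // Stand-in for the real prompt text
    private static final String RAW_SYSTEM_PROMPT = """
            作为一个专业的AI助手,请你认真思考,帮助用户解决技术问题。
            """;
}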
Optimization 4: Batched requests
Aggregate many short requests into a single call:
/**
 * Request batching.
 * Aggregates many small requests into one bulk call.
 * Good fit for bulk translation, classification, summarization, and similar workloads.
 */
@Service
@RequiredArgsConstructor
@Slf4j
public class BatchRequestProcessor {

    private final ChatLanguageModel llm;

    /**
     * Bulk text classification.
     * Roughly 80% cheaper than classifying one text per call.
     */
    public List<ClassificationResult> batchClassify(
            List<String> texts,
            List<String> categories) {
        if (texts.isEmpty()) return List.of();

        // Build a single prompt covering every text
        StringBuilder prompt = new StringBuilder();
        prompt.append("请对以下").append(texts.size()).append("段文本进行分类。\n");
        prompt.append("分类选项:").append(String.join("/", categories)).append("\n\n");
        prompt.append("按JSON数组格式输出,每个元素包含index和category:\n\n");
        for (int i = 0; i < texts.size(); i++) {
            prompt.append("文本").append(i).append(": ").append(texts.get(i)).append("\n");
        }
        prompt.append("\n输出格式:[{\"index\":0,\"category\":\"分类名\"}, ...]");

        String response = llm.generate(prompt.toString());
        return parseBatchResults(response, texts.size());
    }

    private List<ClassificationResult> parseBatchResults(String response, int expectedCount) {
        try {
            String json = extractJsonArray(response);
            ObjectMapper mapper = new ObjectMapper();
            List<Map<String, Object>> raw = mapper.readValue(
                    json, new TypeReference<List<Map<String, Object>>>() {});
            return raw.stream()
                    .map(item -> new ClassificationResult(
                            ((Number) item.get("index")).intValue(),
                            (String) item.get("category")
                    ))
                    .toList();
        } catch (Exception e) {
            log.warn("Failed to parse batched classification result: {}", e.getMessage());
            // On parse failure, fall back to a default category for every text
            List<ClassificationResult> defaults = new ArrayList<>();
            for (int i = 0; i < expectedCount; i++) {
                defaults.add(new ClassificationResult(i, "未知"));
            }
            return defaults;
        }
    }

    private String extractJsonArray(String text) {
        int start = text.indexOf('[');
        int end = text.lastIndexOf(']');
        return start >= 0 && end > start ? text.substring(start, end + 1) : "[]";
    }

    public record ClassificationResult(int index, String category) {}
}
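The service above assumes the caller already holds a list of texts. When requests arrive one at a time, a small collector can buffer them for a short window and flush them in one batched call. A sketch only, assuming Spring scheduling is enabled; BatchingClassifier, the 200 ms window, and the category list are illustrative:

@Service
@RequiredArgsConstructor
public class BatchingClassifier {

    private final BatchRequestProcessor processor;
    private static final List<String> CATEGORIES = List.of("投诉", "咨询", "其他");

    private record Pending(String text, CompletableFuture<String> future) {}
    private final List<Pending> queue = Collections.synchronizedList(new ArrayList<>());

    /** Callers submit one text at a time and get a future for its category. */
    public CompletableFuture<String> classify(String text) {
        CompletableFuture<String> future = new CompletableFuture<>();
        queue.add(new Pending(text, future));
        return future;
    }

    /** Every 200 ms, flush whatever has queued up in a single batched LLM call. */
    @Scheduled(fixedDelay = 200)
    public void flush() {
        List<Pending> batch;
        synchronized (queue) {
            if (queue.isEmpty()) return;
            batch = new ArrayList<>(queue);
            queue.clear();
        }
        List<String> texts = batch.stream().map(Pending::text).toList();
        List<BatchRequestProcessor.ClassificationResult> results =
                processor.batchClassify(texts, CATEGORIES);
        for (BatchRequestProcessor.ClassificationResult r : results) {
            if (r.index() >= 0 && r.index() < batch.size()) {
                batch.get(r.index()).future().complete(r.category());
            }
        }
        // Anything the model skipped falls back to the default category
        batch.stream().filter(p -> !p.future().isDone())
                .forEach(p -> p.future().complete("未知"));
    }
}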
Optimization 5: Output length control
Model output is verbose by default; constraining its length saves money directly:
/**
 * Output length control.
 * Uses prompt instructions to bound how long the model's answer can be.
 */
public class OutputLengthController {

    /**
     * Append a length-constraint instruction to the prompt.
     */
    public static String addLengthConstraint(String originalPrompt, OutputMode mode) {
        String constraint = switch (mode) {
            case CONCISE -> "\n\n注意:回答要简洁,控制在100字以内,只给出核心答案。";
            case NORMAL -> "\n\n注意:回答控制在300字以内,重点突出。";
            case DETAILED -> "\n\n请详细说明,但避免重复,控制在800字以内。";
            case ULTRA_SHORT -> "\n\n用一句话回答(30字以内)。";
        };
        return originalPrompt + constraint;
    }

    /**
     * Pick an output mode automatically from the request context.
     */
    public static OutputMode inferOutputMode(String question, String feature) {
        // Chat rarely needs long answers
        if ("chat".equals(feature)) return OutputMode.CONCISE;
        // Yes/no style confirmations need only one sentence
        if (question.matches(".*是不是.*|.*对不对.*|.*可以吗.*")) {
            return OutputMode.ULTRA_SHORT;
        }
        // Analysis and explanation questions deserve detail
        if (question.contains("分析") || question.contains("解释")) {
            return OutputMode.DETAILED;
        }
        return OutputMode.NORMAL;
    }

    public enum OutputMode { ULTRA_SHORT, CONCISE, NORMAL, DETAILED }
}
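A quick usage sketch; the sample question, the "qa" feature name, and the injected llm reference are placeholders:

// The yes/no pattern matches here, so the mode resolves to ULTRA_SHORT (one-sentence answer)
String question = "这个接口是不是幂等的?";
OutputLengthController.OutputMode mode =
        OutputLengthController.inferOutputMode(question, "qa");
String constrained = OutputLengthController.addLengthConstraint(question, mode);
String answer = llm.generate(constrained); // llm: an injected ChatLanguageModel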
Optimization 6: Local-model fallback
Route low-risk requests to a free, locally hosted model:
/**
 * Hybrid local + cloud model strategy.
 * Low-complexity requests go to the free local model first.
 */
@Service
@RequiredArgsConstructor
@Slf4j
public class HybridModelService {

    private final ChatLanguageModel cloudModel; // OpenAI/Claude
    private final ChatLanguageModel localModel; // local Ollama model

    /**
     * Hybrid call: try the local model first and escalate to the cloud
     * only when the local answer misses the quality bar.
     */
    public String generateWithFallup(String prompt, double qualityThreshold) {
        // Local model first (free)
        String localResponse = localModel.generate(prompt);

        // Quality check (fast heuristics, no extra API call)
        if (isQualitySufficient(localResponse, prompt, qualityThreshold)) {
            log.debug("Local answer meets the bar; cloud call avoided");
            return localResponse;
        }

        // Not good enough locally, escalate to the cloud model
        log.debug("Local answer below the bar; escalating to the cloud model");
        return cloudModel.generate(prompt);
    }

    private boolean isQualitySufficient(String response, String prompt, double threshold) {
        // Heuristic quality checks (no extra LLM call needed)
        // 1. The answer must not be too short
        if (response.length() < 20) return false;
        // 2. It must not be a canned refusal ("作为AI" / "我无法" = "as an AI" / "I cannot")
        if (response.contains("作为AI") || response.contains("我无法")) return false;
        // 3. It should be relevant to the question (simple word-overlap check)
        String[] questionWords = prompt.split("[\\s,。?!]+");
        int overlap = 0;
        for (String word : questionWords) {
            if (word.length() > 1 && response.contains(word)) overlap++;
        }
        double relevance = (double) overlap / questionWords.length;
        return relevance >= threshold;
    }
}

The results
| Technique | Savings | Implementation effort | Quality impact |
|---|---|---|---|
| Semantic caching | 30-40% | Medium | None |
| Model routing | 40-60% | Medium | Minimal |
| Token compression | 10-15% | Low | None |
| Batching | 50-70% | Medium (requires business-logic changes) | None |
| Output length control | 20-30% | Low | Depends on the scenario |
| Local-model fallback | 30-50% | High (requires deployment) | None for low-risk scenarios |
Used together, these six techniques got us to a 62% cost reduction. Semantic caching and model routing offer the best return on effort: neither is hard to implement, and they deliver the largest savings.
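For reference, a simplified sketch of how the pieces line up in a single request path. OptimizedLlmGateway is an illustrative name, it assumes RouteHint and OutputMode are accessible from this class, and the real gateway also records every call through LlmCostTracker:

@Service
@RequiredArgsConstructor
public class OptimizedLlmGateway {

    private final SmartSemanticCache cache;
    private final PromptTokenOptimizer optimizer;
    private final CostAwareModelRouter router;

    public String ask(String question, String feature) {
        // 1. Semantic cache: a hit costs nothing
        Optional<String> cached = cache.findSimilarCached(question);
        if (cached.isPresent()) return cached.get();

        // 2. Shrink the prompt and constrain the answer length
        String prompt = optimizer.compress(question).content();
        OutputLengthController.OutputMode mode =
                OutputLengthController.inferOutputMode(question, feature);
        prompt = OutputLengthController.addLengthConstraint(prompt, mode);

        // 3. Route to the cheapest model that can handle the request
        String answer = router.route(prompt, CostAwareModelRouter.RouteHint.AUTO);

        // 4. Populate the cache so the next similar question is free
        cache.put(question, answer);
        return answer;
    }
}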
