第2390篇：RAG系统的延迟优化进阶——从秒级到毫秒级的工程路径

老张大约 6 分钟

第2390篇：RAG系统的延迟优化进阶——从秒级到毫秒级的工程路径

适读人群：需要将RAG延迟压缩到毫秒级的工程师 | 阅读时长：约20分钟 | 核心价值：系统性掌握RAG全链路延迟优化，包括预计算、缓存层次设计和硬件加速方案

我们有个B端产品，要求RAG系统必须在1秒内给出回答。

乍一听很难，但最终我们做到了，只是过程中走了不少弯路。最开始我们到处加缓存，结果发现缓存命中率只有20%左右（企业用户的问题各式各样，重复率很低），整体延迟几乎没有改善。

真正有效的优化，是先建立性能拆分视图，弄清楚每个环节耗时多少，再有针对性地优化。

性能拆分：找到真正的瓶颈

@Service
public class RAGPerformanceProfiler {

    /**
     * Runs the RAG pipeline once for {@code question} and records the duration of every stage.
     *
     * <p>The stages run strictly in sequence, each bracketed by a {@code startStep}/{@code endStep}
     * pair on a fresh {@code RAGTrace}: {@code query_embedding}, {@code vector_search},
     * {@code reranking}, {@code prompt_building} and {@code llm_generation}.
     *
     * @param question the user question to push through the pipeline
     * @return the result built by the trace from the recorded steps and the generated answer
     */
    public RAGTraceResult traceExecution(String question) {
        final RAGTrace trace = new RAGTrace(question);

        // Stage 1: query embedding. The vector is not consumed further down in this method;
        // the call is made so that its cost shows up in the trace.
        // NOTE(review): the search below is issued with the question text, not this vector.
        // If the vector store embeds the text itself, "vector_search" would include a second
        // embedding pass - confirm against the store implementation.
        final float[] queryVector =
            timed(trace, "query_embedding", () -> embeddingModel.embed(question));

        // Stage 2: vector retrieval (top 5 hits).
        final List<Document> retrieved = timed(
            trace,
            "vector_search",
            () -> vectorStore.similaritySearch(SearchRequest.query(question).withTopK(5)));

        // Stage 3: reranking, when enabled.
        final List<Document> ranked =
            timed(trace, "reranking", () -> rerankIfEnabled(retrieved, question));

        // Stage 4: prompt assembly.
        final String prompt =
            timed(trace, "prompt_building", () -> promptBuilder.build(question, ranked));

        // Stage 5: LLM generation.
        final String answer =
            timed(trace, "llm_generation", () -> chatClient.prompt(prompt).call().content());

        return trace.buildResult(answer);
    }

    /**
     * Brackets {@code work} with {@code startStep}/{@code endStep} for {@code step} and returns
     * whatever {@code work} produced. If {@code work} throws, the step is left open and the
     * exception propagates.
     */
    private <T> T timed(RAGTrace trace, String step, java.util.function.Supplier<T> work) {
        trace.startStep(step);
        T outcome = work.get();
        trace.endStep(step);
        return outcome;
    }
}

/**
 * 典型的延迟分布（毫秒）：
 * 
 * query_embedding:  50-200ms  (可优化：模型选择、硬件)
 * vector_search:   100-500ms  (可优化：索引类型、缓存)
 * reranking:       200-800ms  (可优化：是否真的需要)
 * prompt_building:  1-10ms   (基本不需要优化)
 * llm_generation: 500-3000ms  (可优化：模型、流式、缓存)
 *
 * 总计：851-4510ms
 * 优化目标：<1000ms
 */

优化层次一：Embedding缓存

@Service
public class CachedEmbeddingService {

    /** Lifetime of an embedding entry in Redis, in hours. */
    private static final long REDIS_TTL_HOURS = 24;

    /** Number of hot queries requested from analytics for the warm-up. */
    private static final int HOT_QUERY_LIMIT = 500;

    // In-process cache for embeddings of hot queries, keyed by the raw query text.
    private final Cache<String, float[]> embeddingCache;

    // Redis cache (intended to be shared across processes), keyed by "emb:" + hashText(text).
    private final RedisTemplate<String, float[]> redisCache;

    /**
     * Returns the embedding of {@code text}, consulting two cache levels before computing it.
     *
     * <ol>
     *   <li>Level 1 - in-process cache, looked up by the raw text.</li>
     *   <li>Level 2 - Redis, looked up by {@code "emb:" + hashText(text)}; a hit is copied back
     *       into level 1.</li>
     *   <li>Fallback - {@code embeddingModel.embed(text)}; the result is written to both cache
     *       levels (Redis with a 24 hour TTL).</li>
     * </ol>
     *
     * <p>The array is returned as-is, without a defensive copy, so callers should treat it as
     * read-only.
     *
     * @param text the text to embed
     * @return the embedding vector for {@code text}
     */
    public float[] getEmbedding(String text) {
        // Level 1: in-process cache.
        float[] local = embeddingCache.getIfPresent(text);
        if (local != null) {
            metrics.recordCacheHit("l1_embedding");
            return local;
        }

        // Level 2: Redis.
        String cacheKey = redisKey(text);
        float[] remote = redisCache.opsForValue().get(cacheKey);
        if (remote != null) {
            embeddingCache.put(text, remote);  // back-fill level 1
            metrics.recordCacheHit("l2_embedding");
            return remote;
        }

        // Both levels missed: compute, then populate both levels.
        float[] embedding = embeddingModel.embed(text);
        embeddingCache.put(text, embedding);
        redisCache.opsForValue().set(cacheKey, embedding, REDIS_TTL_HOURS, TimeUnit.HOURS);

        metrics.recordCacheMiss("embedding");
        return embedding;
    }

    /**
     * Pre-computes embeddings for last week's hot queries in a single batch and stores them in
     * Redis with a 24 hour TTL. Only Redis is written; the in-process cache is filled lazily by
     * {@link #getEmbedding(String)}.
     *
     * <p>Intended to run at system start-up. NOTE(review): nothing in this class triggers it;
     * confirm that a start-up hook elsewhere calls it.
     *
     * @throws IllegalStateException if the batch call does not return exactly one embedding per
     *     query, because queries and vectors could then no longer be paired reliably
     */
    public void precomputeHotQueryEmbeddings() {
        List<String> hotQueries = analyticsService.getHotQueriesLastWeek(HOT_QUERY_LIMIT);
        if (hotQueries == null || hotQueries.isEmpty()) {
            // Nothing to warm up; skip the batch call entirely.
            log.info("No hot queries found, skipping embedding pre-computation");
            return;
        }

        log.info("Pre-computing embeddings for {} hot queries", hotQueries.size());

        // One batch request; when the embedding service supports batching this is faster than
        // one request per query.
        List<float[]> embeddings = embeddingModel.embedBatch(hotQueries);

        // Validate before writing anything: a misaligned result would otherwise either fail
        // half-way through the loop or store vectors under the wrong query for 24 hours.
        if (embeddings == null || embeddings.size() != hotQueries.size()) {
            throw new IllegalStateException(
                "embedBatch returned " + (embeddings == null ? "null" : embeddings.size())
                    + " embeddings for " + hotQueries.size() + " queries");
        }

        for (int i = 0; i < hotQueries.size(); i++) {
            redisCache.opsForValue()
                .set(redisKey(hotQueries.get(i)), embeddings.get(i), REDIS_TTL_HOURS, TimeUnit.HOURS);
        }

        log.info("Pre-computed {} embeddings", hotQueries.size());
    }

    /** Builds the Redis key for {@code text}; shared by the lookup and the warm-up paths. */
    private String redisKey(String text) {
        return "emb:" + hashText(text);
    }
}

优化层次二：向量检索加速

@Configuration
public class VectorSearchOptimizationConfig {

    /** HNSW graph connectivity: larger values are more accurate but slower. */
    private static final int HNSW_M = 16;

    /** Search breadth while building the index: larger values are more accurate. */
    private static final int HNSW_EF_CONSTRUCTION = 200;

    /** Search breadth at query time. */
    private static final int HNSW_EF_SEARCH = 50;

    /**
     * Declares the vector index configuration; the index type has a very large effect on
     * search performance.
     *
     * <ul>
     *   <li>FLAT (exact): accurate but slow, not suitable for large collections.</li>
     *   <li>IVF (inverted file): fast, slightly less accurate, suits collections in the
     *       millions.</li>
     *   <li>HNSW (hierarchical navigable small world): very fast with high accuracy, suits
     *       most scenarios.</li>
     * </ul>
     *
     * <p>For a sub-second latency budget an approximate index such as HNSW is required.
     *
     * @return an HNSW index configuration with M=16, efConstruction=200 and ef=50
     */
    @Bean
    public VectorStoreIndexConfig indexConfig() {
        var hnsw = VectorStoreIndexConfig.builder().indexType(IndexType.HNSW);
        var tuned = hnsw
            .hnswM(HNSW_M)
            .hnswEfConstruction(HNSW_EF_CONSTRUCTION)
            .hnswEf(HNSW_EF_SEARCH);
        return tuned.build();
    }
}

@Service
public class AdaptiveSearchService {

    /** Top-K of the cheap first pass. */
    private static final int QUICK_TOP_K = 3;

    /** Top-K of the widened second pass. */
    private static final int WIDE_TOP_K = 8;

    /** A first-pass top score above this value is treated as good enough. */
    private static final float CONFIDENT_TOP_SCORE = 0.85f;

    /**
     * Retrieves documents with an adaptive search depth.
     *
     * <p>Key idea: not every query needs a large top-K. A first pass with top-K = 3 is run;
     * if its best hit scores above 0.85 the three hits are returned immediately. Otherwise
     * the search is repeated with top-K = 8 and that result is returned.
     *
     * <p>An empty first pass is never scored: it goes straight to the widened search. This
     * mirrors the empty-list guard in {@code SelectiveRerankingService} and avoids calling
     * {@code getTopScore} on a list with no elements.
     *
     * @param question the user question to search for
     * @return the first-pass hits when the best one is confident enough, otherwise the
     *     widened result
     */
    public List<Document> adaptiveSearch(String question) {
        // Cheap first pass with a small K.
        List<Document> quickResults = vectorStore.similaritySearch(
            SearchRequest.query(question).withTopK(QUICK_TOP_K)
        );

        // A confident best hit means more documents are unlikely to help; return early.
        if (!quickResults.isEmpty() && getTopScore(quickResults) > CONFIDENT_TOP_SCORE) {
            return quickResults;
        }

        // No hits, or none confident enough: widen the search.
        return vectorStore.similaritySearch(
            SearchRequest.query(question).withTopK(WIDE_TOP_K)
        );
    }
}

优化层次三：Reranking的取舍

@Service
public class SelectiveRerankingService {

    /**
     * Reranks {@code docs} only when the retrieval scores suggest it is worthwhile.
     *
     * <p>Reranking is expensive (the author quotes 200-800 ms) for a limited gain, so it is
     * skipped when:
     * <ul>
     *   <li>the list is empty;</li>
     *   <li>the top score is above 0.85 (retrieval is already good); or</li>
     *   <li>top and lowest score differ by less than 0.1 (all documents look about equally
     *       relevant, so reordering helps little).</li>
     * </ul>
     * In the remaining case - middling scores with uncertain quality - the documents are
     * passed to the rerank service.
     *
     * @param question the user question the documents were retrieved for
     * @param docs the retrieved documents
     * @return {@code docs} unchanged when reranking is skipped, otherwise the reranked list
     */
    public List<Document> rerankIfNecessary(String question, List<Document> docs) {
        if (docs.isEmpty()) {
            return docs;
        }

        final float best = getTopScore(docs);
        final float worst = getLowestScore(docs);

        // Decide whether, and why, reranking can be skipped.
        String skipReason = null;
        if (best > 0.85f) {
            // The best hit is already highly similar.
            skipReason = "high_top_score";
        } else if (best - worst < 0.1f) {
            // Scores are nearly flat across the list.
            skipReason = "small_score_gap";
        }

        if (skipReason != null) {
            metrics.recordRerankSkipped(skipReason);
            return docs;
        }

        // Middling similarity and uncertain quality: rerank.
        metrics.recordRerankExecuted();
        return rerankService.rerank(question, docs);
    }
}

优化层次四：LLM生成的加速

@Service
public class LLMGenerationOptimizer {

    /**
     * Answer cache, so the same question is not generated twice.
     *
     * <p>Caching RAG answers is harder than caching plain chat: the same question should get
     * a different answer once the knowledge base changes. The key is therefore produced by
     * {@code buildCacheKey(question, docs)}, which is meant to combine the question with the
     * IDs of the retrieved documents (IDs rather than content, which would be too large).
     * NOTE(review): {@code buildCacheKey} is not shown here - confirm it hashes document IDs.
     */
    private final Cache<String, String> answerCache;

    /**
     * Generates an answer with the default model, reusing a cached answer when the same
     * question was already answered for the same documents.
     *
     * @param question the user question
     * @param docs the documents retrieved for the question
     * @return the cached or freshly generated answer
     */
    public String generateWithCache(String question, List<Document> docs) {
        String cacheKey = buildCacheKey(question, docs);

        String cached = lookupCachedAnswer(cacheKey);
        if (cached != null) {
            return cached;
        }

        String prompt = promptBuilder.build(question, docs);
        String answer = chatClient.prompt(prompt).call().content();

        rememberAnswer(cacheKey, answer);
        return answer;
    }

    /**
     * Generates an answer with a model chosen by question complexity, reusing a cached answer
     * when one exists for the same question and documents.
     *
     * <p>Large models are accurate but slow, small models slightly weaker but fast, so simple
     * and medium questions go to the small model and only complex ones to larger models. The
     * latencies noted beside each case are the author's estimates.
     *
     * <p>The cache is consulted before the question is classified, so a hit costs neither a
     * classification nor a model call. The cache is shared with
     * {@link #generateWithCache(String, List)}: an answer stored by either method is served
     * by both.
     *
     * @param question the user question
     * @param docs the documents retrieved for the question
     * @return the cached or freshly generated answer
     */
    public String generateWithAdaptiveModel(String question, List<Document> docs) {
        String cacheKey = buildCacheKey(question, docs);

        String cached = lookupCachedAnswer(cacheKey);
        if (cached != null) {
            return cached;
        }

        QuestionComplexity complexity = complexityClassifier.classify(question);

        String modelId = switch (complexity) {
            case SIMPLE, MEDIUM -> "gpt-3.5-turbo";  // 200-500ms
            case COMPLEX -> "gpt-4o-mini";           // 500-1000ms
            case VERY_COMPLEX -> "gpt-4o";           // 1000-3000ms
        };

        String prompt = promptBuilder.build(question, docs);
        String answer = chatClient.model(modelId).prompt(prompt).call().content();

        rememberAnswer(cacheKey, answer);
        return answer;
    }

    /** Returns the cached answer for {@code cacheKey}, recording a hit, or null on a miss. */
    private String lookupCachedAnswer(String cacheKey) {
        String cached = answerCache.getIfPresent(cacheKey);
        if (cached != null) {
            metrics.recordCacheHit("answer");
        }
        return cached;
    }

    /** Stores {@code answer} under {@code cacheKey}; a null answer is not cached. */
    private void rememberAnswer(String cacheKey, String answer) {
        // NOTE(review): common Cache implementations reject null values; skipping the put also
        // keeps a missing answer from being replayed on later requests.
        if (answer != null) {
            answerCache.put(cacheKey, answer);
        }
    }
}

实现1秒以内的端到端优化

@Service
public class SubSecondRAGService {

    /**
     * RAG entry point aimed at sub-second responses.
     *
     * <p>It combines the following optimisations (figures are the author's targets):
     * <ol>
     *   <li>Embedding cache (saves 50-200 ms).</li>
     *   <li>HNSW index (vector search under 100 ms).</li>
     *   <li>Adaptive top-K (avoids unnecessary retrieval).</li>
     *   <li>On-demand reranking (skipped for most queries).</li>
     *   <li>Small-model routing, optionally combined with answer caching inside the generation
     *       service (LLM generation under 500 ms).</li>
     * </ol>
     *
     * <p>Per-stage durations are logged at debug level and the total at info level. Elapsed
     * time is taken from {@link System#nanoTime()}, which is meant for measuring intervals,
     * rather than from the wall clock, which can jump when the system time is adjusted.
     *
     * @param question the user question
     * @return the answer, the documents it was based on, and the total latency in milliseconds
     */
    public RAGResult answerFast(String question) {
        final long start = System.nanoTime();

        // 1. Embedding through the cache (expected ~5 ms for hot queries, ~50 ms cold).
        // NOTE(review): queryVector is not passed to the search below, which receives the
        // question text; the call still populates the embedding cache. Confirm whether the
        // search should consume this vector.
        float[] queryVector = cachedEmbeddingService.getEmbedding(question);
        log.debug("Embedding: {}ms", elapsedMs(start));

        // 2. Retrieval (HNSW index, expected ~50-100 ms).
        final long searchStart = System.nanoTime();
        List<Document> docs = adaptiveSearchService.adaptiveSearch(question);
        log.debug("Search: {}ms", elapsedMs(searchStart));

        // 3. On-demand reranking (skipped for most queries).
        final long rerankStart = System.nanoTime();
        docs = selectiveRerankingService.rerankIfNecessary(question, docs);
        log.debug("Rerank: {}ms", elapsedMs(rerankStart));

        // 4. Answer generation with complexity-based model selection.
        final long genStart = System.nanoTime();
        String answer = llmGenerationOptimizer.generateWithAdaptiveModel(question, docs);
        log.debug("Generation: {}ms", elapsedMs(genStart));

        final long totalMs = elapsedMs(start);
        log.info("Total RAG latency: {}ms", totalMs);

        return RAGResult.of(answer, docs, totalMs);
    }

    /** Whole milliseconds elapsed since {@code sinceNanos}, a {@link System#nanoTime()} value. */
    private static long elapsedMs(long sinceNanos) {
        return (System.nanoTime() - sinceNanos) / 1_000_000L;
    }
}

延迟优化的预期收益

| 优化措施 | 节省时间 | 命中率要求 |
| --- | --- | --- |
| Embedding缓存 | 50-200ms | 命中率>30%有效 |
| HNSW索引 | 100-400ms | 构建一次持续受益 |
| 跳过Reranking | 200-800ms | 约70%查询可跳过 |
| 小模型路由 | 500-2000ms | 约50%查询用小模型 |
| 答案缓存 | 全量节省 | 命中率>20%有效 |

合理组合这些优化，P90延迟从4-5秒降到1秒以内是可以做到的。关键是要先做性能拆分，知道每个环节的耗时，再决定优先优化哪里。