/**
 * Collects individual chat requests into small batches before dispatching them to the LLM.
 *
 * <p>Callers submit one message at a time through {@link #chat(String)}. The
 * {@code BatchCollector} built in {@link #initBatchCollector()} hands each accumulated
 * batch to {@code processBatch}.
 */
@Service
public class BatchLlmService {

    /** Upper bound on the number of requests handed to the collector's batch processor. */
    private static final int MAX_BATCH_SIZE = 8;

    /** Longest time the collector is configured to wait for a batch to fill. */
    private static final Duration MAX_WAIT_TIME = Duration.ofMillis(50);

    private final ChatClient chatClient;

    // Deliberately not final: it is assigned by the @PostConstruct callback, and Java
    // only permits final fields to be assigned in a constructor or initializer.
    private BatchCollector<ChatRequest, String> batchCollector;

    /**
     * Creates the service.
     *
     * @param chatClient client used to send each prompt to the LLM
     */
    public BatchLlmService(ChatClient chatClient) {
        this.chatClient = chatClient;
    }

    /**
     * Builds the batch collector once the bean has been constructed.
     *
     * <p>The collector is configured with a maximum batch size of {@value #MAX_BATCH_SIZE}
     * and a maximum wait of 50 ms. Per the original author's note, a batch is dispatched
     * when either limit is reached (behavior of {@code BatchCollector} itself is not
     * visible here; confirm against its implementation).
     */
    @PostConstruct
    public void initBatchCollector() {
        this.batchCollector = BatchCollector.<ChatRequest, String>builder()
            .maxBatchSize(MAX_BATCH_SIZE)
            .maxWaitTime(MAX_WAIT_TIME)
            .batchProcessor(this::processBatch)
            .build();
    }

    /**
     * Submits a single chat message to the batch collector.
     *
     * @param message user message to send to the LLM
     * @return a {@code Mono} supplied by the collector for this request's answer
     */
    public Mono<String> chat(String message) {
        return batchCollector.submit(new ChatRequest(message));
    }

    /**
     * Resolves every request in a batch to its answer.
     *
     * <p>Each request is sent as its own {@code chatClient} call; the prompts are NOT
     * merged into one LLM invocation. The calls are fanned out with a parallel stream,
     * and collecting an ordered parallel stream into a list preserves encounter order,
     * so the i-th answer corresponds to the i-th request.
     *
     * <p>NOTE(review): a parallel stream runs on the common ForkJoinPool by default.
     * If {@code call()} blocks on network I/O (presumably it does), consider a dedicated
     * executor so other users of the common pool are not starved.
     *
     * @param batch requests accumulated by the collector
     * @return answers in the same order as {@code batch}
     */
    private List<String> processBatch(List<ChatRequest> batch) {
        return batch.parallelStream()
            .map(req -> chatClient.prompt().user(req.getMessage()).call().content())
            .collect(Collectors.toList());
    }
}

Semantic Cache：请求级别的缓存优化

对于相同或语义高度相似的问题，直接返回缓存中的答案，完全跳过LLM调用。

@Service
@RequiredArgsConstructor
public class SemanticCacheService {
    
    private final EmbeddingModel embeddingModel;
    private final VectorStore cacheStore;
    private final ChatClient chatClient;
    
    private static final double CACHE_HIT_THRESHOLD = 0.95; // 95%相似度视为命中
    
    public String chat(String question) {
        // 1. 查找语义相似的历史请求
        List<Document> cached = cacheStore.similaritySearch(
            SearchRequest.query(question)
                .withTopK(1)
                .withSimilarityThreshold(CACHE_HIT_THRESHOLD)
        );
        
        if (!cached.isEmpty()) {
            String cachedAnswer = (String) cached.get(0).getMetadata().get("answer");
            log.info("Semantic Cache命中，节省LLM调用");
            return cachedAnswer;
        }
        
        // 2. Cache miss：调用LLM
        String answer = chatClient.prompt().user(question).call().content();
        
        // 3. 存入Cache
        Document cacheDoc = new Document(question, Map.of("answer", answer));
        cacheStore.add(List.of(cacheDoc));
        
        return answer;
    }
}

综合优化效果

优化手段	延迟降低	成本降低	实施难度
Semantic Cache	60-80%	30-50%	★★
动态批处理	20-40%	40-60%	★★★
INT8量化	30-50%	30-40%	★★★★
流式输出（感知优化）	-（提升体验）	-	★
请求并发优化	10-20%	-	★★