/**
 * Indexes the Java sources of a repository into searchable {@code CodeChunk}s.
 *
 * <p>Each source file is parsed with JavaParser. One chunk is stored per class or interface
 * declaration and one per method declared directly in it. Every chunk is embedded through
 * {@code EmbeddingService} and persisted through {@code CodeChunkRepository}.
 *
 * <p>NOTE(review): {@code buildClassChunk} and {@code extractUsedClasses} are called below but
 * are not defined in this listing; they must be supplied for the class to compile.
 */
@Service
@Slf4j
public class JavaCodebaseIndexer {

    /** Maximum number of printed body lines copied into a method chunk. */
    private static final int MAX_BODY_LINES = 30;

    private final JavaParser javaParser;
    private final CodeChunkRepository codeChunkRepository;
    private final EmbeddingService embeddingService;

    /**
     * Creates the indexer with its collaborators (constructor injection).
     *
     * @param javaParser parser used to turn source files into ASTs
     * @param codeChunkRepository store that receives the produced chunks
     * @param embeddingService service that turns chunk text into an embedding vector
     */
    public JavaCodebaseIndexer(JavaParser javaParser,
            CodeChunkRepository codeChunkRepository,
            EmbeddingService embeddingService) {
        this.javaParser = javaParser;
        this.codeChunkRepository = codeChunkRepository;
        this.embeddingService = embeddingService;
    }

    /**
     * Walks the repository and indexes every eligible {@code .java} file.
     *
     * <p>A failure on one file is logged and does not stop the run.
     *
     * @param repositoryRoot root directory of the repository to index
     * @throws IOException if the directory tree cannot be walked
     */
    public void indexRepository(Path repositoryRoot) throws IOException {
        log.info("开始索引代码库: {}", repositoryRoot);

        List<Path> sourceFiles;
        // Files.walk holds directory handles; close it before the (slow) indexing starts.
        try (Stream<Path> paths = Files.walk(repositoryRoot)) {
            sourceFiles = paths
                .filter(Files::isRegularFile)
                .filter(p -> isIndexableSource(repositoryRoot.relativize(p)))
                .collect(Collectors.toList());
        }

        log.info("发现Java文件: {}个", sourceFiles.size());

        int failed = 0;
        for (Path file : sourceFiles) {
            try {
                indexFile(file, repositoryRoot);
            } catch (Exception e) {
                // Best effort: one broken file must not abort the whole index run.
                failed++;
                log.warn("索引文件失败: {}", file, e);
            }
        }

        if (failed > 0) {
            log.warn("共有{}个文件索引失败", failed);
        }
        log.info("代码库索引完成");
    }

    /**
     * Decides whether a repository-relative path should be indexed.
     *
     * <p>The check runs on the relative path so that the location of the repository itself
     * (for example a checkout under a directory named {@code test}) cannot exclude every file,
     * and on path elements rather than a {@code "/test/"} substring so that it does not depend
     * on the platform's separator.
     *
     * @param relativePath path of the file relative to the repository root
     * @return {@code true} for {@code .java} files that are neither under a {@code test}
     *     directory nor on a path containing {@code generated}
     */
    private static boolean isIndexableSource(Path relativePath) {
        Path fileName = relativePath.getFileName();
        if (fileName == null || !fileName.toString().endsWith(".java")) {
            return false;
        }
        Path parent = relativePath.getParent();
        if (parent != null) {
            for (Path directory : parent) {
                if ("test".equals(directory.toString())) {
                    return false;
                }
            }
        }
        return !relativePath.toString().contains("generated");
    }

    /**
     * Parses one source file and indexes every class or interface declared in it.
     *
     * <p>Public because the incremental indexer re-indexes individual files through it.
     *
     * @param filePath location of the source file; must be relativizable against {@code repoRoot}
     * @param repoRoot repository root used to derive the stored relative path
     * @throws FileNotFoundException if {@code filePath} does not exist
     */
    public void indexFile(Path filePath, Path repoRoot) throws FileNotFoundException {
        ParseResult<CompilationUnit> parseResult = javaParser.parse(filePath.toFile());

        if (!parseResult.isSuccessful()) {
            // Surface unparsable files instead of dropping them silently.
            log.warn("解析文件失败，已跳过: {}，问题: {}", filePath, parseResult.getProblems());
            return;
        }

        CompilationUnit cu = parseResult.getResult().orElseThrow();
        String packageName = cu.getPackageDeclaration()
            .map(pkg -> pkg.getNameAsString())
            .orElse("");

        // Store a '/'-separated path so the key is the same on every platform.
        Path relative = repoRoot.relativize(filePath);
        String relativePath = relative.toString()
            .replace(relative.getFileSystem().getSeparator(), "/");

        cu.findAll(ClassOrInterfaceDeclaration.class)
            .forEach(classDecl -> indexClass(classDecl, packageName, relativePath));
    }

    /**
     * Stores one METHOD chunk per directly declared method, then one CLASS chunk.
     *
     * @param classDecl the class or interface declaration to index
     * @param packageName package of the enclosing compilation unit, or {@code ""}
     * @param filePath repository-relative path recorded on every chunk
     */
    private void indexClass(ClassOrInterfaceDeclaration classDecl,
            String packageName, String filePath) {

        String className = classDecl.getNameAsString();

        // Simple names only, so a fully qualified annotation is still recognised as a role.
        List<String> annotations = classDecl.getAnnotations().stream()
            .map(ann -> ann.getName().getIdentifier())
            .collect(Collectors.toList());

        String classRole = determineClassRole(annotations);

        for (MethodDeclaration method : classDecl.getMethods()) {
            indexMethod(method, className, classRole, packageName, filePath);
        }

        // The class itself also gets a chunk carrying its overall description.
        CodeChunk classChunk = CodeChunk.builder()
            .chunkType("CLASS")
            .className(className)
            .packageName(packageName)
            .filePath(filePath)
            .content(buildClassChunk(classDecl, packageName))
            .annotations(annotations)
            .classRole(classRole)
            .build();

        saveAndEmbed(classChunk);
    }

    /**
     * Builds and stores the METHOD chunk for a single method.
     *
     * @param method the method declaration to index
     * @param className simple name of the declaring class
     * @param classRole role label derived from the class annotations
     * @param packageName package of the declaring class
     * @param filePath repository-relative path of the source file
     */
    private void indexMethod(MethodDeclaration method, String className,
            String classRole, String packageName, String filePath) {

        String javadocText = method.getJavadoc()
            .map(doc -> doc.toText())
            .orElse("");

        CodeChunk methodChunk = CodeChunk.builder()
            .chunkType("METHOD")
            .className(className)
            .methodName(method.getNameAsString())
            .packageName(packageName)
            .filePath(filePath)
            .content(buildMethodChunk(method, javadocText, className, classRole))
            .methodCalls(extractMethodCalls(method))
            .usedClasses(extractUsedClasses(method))
            .returnType(method.getTypeAsString())
            .classRole(classRole)
            .build();

        saveAndEmbed(methodChunk);
    }

    /**
     * Renders the text that is embedded for a method: a context header, the Javadoc (if any),
     * the signature and at most {@link #MAX_BODY_LINES} lines of the body.
     *
     * @param method the method to render
     * @param javadoc Javadoc text of the method, or {@code ""} when there is none
     * @param className simple name of the declaring class
     * @param classRole role label of the declaring class
     * @return the chunk text
     */
    private String buildMethodChunk(MethodDeclaration method, String javadoc,
            String className, String classRole) {
        StringBuilder sb = new StringBuilder();

        // Context header: class and role give the embedding something to anchor on.
        sb.append("// 所在类: ").append(className)
          .append(" (").append(classRole).append(")\n");

        if (!javadoc.isEmpty()) {
            sb.append("// 功能说明: ").append(javadoc).append("\n");
        }

        sb.append(method.getDeclarationAsString(true, true, true));

        String printedBody = method.getBody().map(Object::toString).orElse(null);
        if (printedBody == null) {
            // No body (interface or abstract method): emit a placeholder block.
            sb.append(" {\n// 接口方法，无实现\n}");
            return sb.toString();
        }

        // The printed block statement already carries its own braces, so none are added here.
        // Split on any line terminator so no '\r' is left behind on Windows.
        String[] bodyLines = printedBody.split("\\R");
        sb.append(' ');
        if (bodyLines.length <= MAX_BODY_LINES) {
            sb.append(String.join("\n", bodyLines));
        } else {
            for (int i = 0; i < MAX_BODY_LINES; i++) {
                sb.append(bodyLines[i]).append("\n");
            }
            // The truncation dropped the closing brace; restore it after the marker.
            sb.append("// ... 更多代码\n}");
        }

        return sb.toString();
    }

    /**
     * Collects the simple names of all methods invoked inside {@code method}.
     *
     * @param method the method whose body is scanned
     * @return called method names in AST traversal order, duplicates included
     */
    private List<String> extractMethodCalls(MethodDeclaration method) {
        return method.findAll(MethodCallExpr.class).stream()
            .map(call -> call.getNameAsString())
            .collect(Collectors.toList());
    }

    /**
     * Maps class-level annotations to a role label; the first matching rule wins.
     *
     * @param annotations simple annotation names found on the class
     * @return the role label, or {@code "Java类"} when no known annotation is present
     */
    private String determineClassRole(List<String> annotations) {
        if (annotations.contains("RestController") || annotations.contains("Controller")) {
            return "Web控制器";
        }
        if (annotations.contains("Service")) {
            return "业务服务";
        }
        if (annotations.contains("Repository")) {
            return "数据访问层";
        }
        if (annotations.contains("Component")) {
            return "Spring组件";
        }
        if (annotations.contains("Configuration")) {
            return "配置类";
        }
        if (annotations.contains("Entity")) {
            return "数据实体";
        }
        return "Java类";
    }

    /**
     * Computes the embedding of the chunk content, attaches it and persists the chunk.
     *
     * @param chunk the chunk to embed and save
     */
    private void saveAndEmbed(CodeChunk chunk) {
        float[] embedding = embeddingService.embed(chunk.getContent());
        chunk.setEmbedding(embedding);
        codeChunkRepository.save(chunk);
    }
}

问答引擎实现

/**
 * Answers natural-language questions about the indexed codebase.
 *
 * <p>Pipeline: analyse the question with the LLM, retrieve chunks over three search paths,
 * optionally add directly called methods, then ask the LLM to answer from those chunks.
 *
 * <p>NOTE(review): {@code parseQuestionAnalysis} and {@code buildReferences} are called below but
 * are not defined in this listing. {@code callGraphService} is injected but not used by the
 * code shown here.
 */
@Service
@Slf4j
public class CodebaseQAEngine {

    /** Number of nearest neighbours requested from the vector search. */
    private static final int SEMANTIC_TOP_K = 8;
    /** Maximum full-text hits kept per search keyword. */
    private static final int KEYWORD_HITS_PER_TERM = 3;
    /** Maximum number of chunks returned by the combined search. */
    private static final int MAX_SEARCH_RESULTS = 12;
    /** Maximum number of callee chunks added per retrieved method. */
    private static final int MAX_CALLEES_PER_METHOD = 3;

    private final CodeChunkRepository codeChunkRepository;
    private final AnthropicClient anthropicClient;
    private final EmbeddingService embeddingService;
    private final CallGraphService callGraphService;

    /**
     * Creates the engine with its collaborators (constructor injection).
     *
     * @param codeChunkRepository store queried for code chunks
     * @param anthropicClient LLM client used for question analysis and answer generation
     * @param embeddingService service that embeds the search query
     * @param callGraphService call-graph service (currently unused in this class)
     */
    public CodebaseQAEngine(CodeChunkRepository codeChunkRepository,
            AnthropicClient anthropicClient,
            EmbeddingService embeddingService,
            CallGraphService callGraphService) {
        this.codeChunkRepository = codeChunkRepository;
        this.anthropicClient = anthropicClient;
        this.embeddingService = embeddingService;
        this.callGraphService = callGraphService;
    }

    /**
     * Answers a question about the codebase.
     *
     * @param question the user's question
     * @return the generated answer together with the code references it was built from
     */
    public CodebaseAnswer answer(String question) {
        log.info("处理代码库问题: {}", question);

        // Step 1: understand the question and extract key entities.
        QuestionAnalysis analysis = analyzeQuestion(question);

        // Step 2: retrieve candidate chunks over several search paths.
        List<CodeChunk> relevantChunks = multiPathSearch(analysis);

        // Step 3: add call-chain context where the question type calls for it.
        List<CodeChunk> enrichedChunks = enrichWithCallGraph(relevantChunks, analysis);

        // Step 4: assemble the context and generate the answer.
        String answer = generateAnswer(question, enrichedChunks);

        // Step 5: build references from the same list, in the same order as the prompt numbering.
        List<CodeReference> references = buildReferences(enrichedChunks);

        return CodebaseAnswer.builder()
            .question(question)
            .answer(answer)
            .references(references)
            .build();
    }

    /**
     * Asks the LLM to classify the question and extract entities and search keywords.
     *
     * @param question the user's question
     * @return the parsed analysis
     */
    private QuestionAnalysis analyzeQuestion(String question) {
        String analysisPrompt = String.format("""
            分析以下关于代码库的问题，提取关键信息。

            问题：%s

            输出JSON：
            {
              "question_type": "FLOW_UNDERSTANDING|IMPLEMENTATION_DETAIL|ARCHITECTURE|DEBUGGING|HOW_TO",
              "key_entities": ["类名", "方法名", "业务概念"],
              "search_keywords": ["搜索关键词1", "搜索关键词2"],
              "expected_answer_depth": "HIGH|MEDIUM|LOW"
            }
            """, question);

        String response = anthropicClient.complete(analysisPrompt);
        return parseQuestionAnalysis(response);
    }

    /**
     * Retrieves candidate chunks via vector similarity, name matching and full-text search.
     *
     * <p>Results are merged in that order and capped at {@link #MAX_SEARCH_RESULTS}.
     * NOTE(review): de-duplication relies on {@code CodeChunk} implementing
     * {@code equals}/{@code hashCode} — confirm.
     *
     * @param analysis the analysed question
     * @return at most {@link #MAX_SEARCH_RESULTS} chunks, earlier search paths first
     */
    private List<CodeChunk> multiPathSearch(QuestionAnalysis analysis) {
        // The analysis comes from parsed LLM output, so lists may be missing and terms may be
        // blank; a blank term would turn the "containing" queries into match-everything.
        List<String> keyEntities = nonBlankTerms(analysis.getKeyEntities());
        List<String> searchKeywords = nonBlankTerms(analysis.getSearchKeywords());

        Set<CodeChunk> allChunks = new LinkedHashSet<>();

        // Path 1: semantic vector search.
        String queryText = String.join(" ", keyEntities) + " " + String.join(" ", searchKeywords);
        if (!queryText.isBlank()) {
            float[] queryEmbedding = embeddingService.embed(queryText);
            allChunks.addAll(
                codeChunkRepository.findByEmbeddingSimilarity(queryEmbedding, SEMANTIC_TOP_K));
        }

        // Path 2: class-name / method-name matching.
        for (String entity : keyEntities) {
            allChunks.addAll(codeChunkRepository.findByClassNameOrMethodNameContaining(entity));
        }

        // Path 3: keyword full-text search, a few hits per keyword.
        for (String keyword : searchKeywords) {
            codeChunkRepository.findByContentContaining(keyword).stream()
                .limit(KEYWORD_HITS_PER_TERM)
                .forEach(allChunks::add);
        }

        return allChunks.stream()
            .limit(MAX_SEARCH_RESULTS)
            .collect(Collectors.toList());
    }

    /**
     * Returns the non-blank entries of {@code terms}, or an empty list when it is {@code null}.
     *
     * @param terms terms extracted from the question analysis, possibly {@code null}
     * @return a list without {@code null} or blank entries
     */
    private static List<String> nonBlankTerms(List<String> terms) {
        if (terms == null) {
            return List.of();
        }
        return terms.stream()
            .filter(term -> term != null && !term.isBlank())
            .collect(Collectors.toList());
    }

    /**
     * Adds the chunks of directly called methods to the retrieved chunks.
     *
     * <p>Example: when {@code PaymentService.pay()} was retrieved, chunks for the methods it
     * calls are appended. Only one level of callees is added, at most
     * {@link #MAX_CALLEES_PER_METHOD} per retrieved method, and only for flow-understanding
     * questions.
     * NOTE(review): callees are looked up by method name only, so same-named methods of
     * unrelated classes may be included — confirm this is acceptable.
     *
     * @param chunks the retrieved chunks
     * @param analysis the analysed question
     * @return the input chunks followed by any added callee chunks
     */
    private List<CodeChunk> enrichWithCallGraph(List<CodeChunk> chunks,
            QuestionAnalysis analysis) {

        if (analysis.getQuestionType() != QuestionType.FLOW_UNDERSTANDING) {
            return chunks;
        }

        Set<CodeChunk> enriched = new LinkedHashSet<>(chunks);

        for (CodeChunk chunk : chunks) {
            // Constant-first comparison tolerates a chunk without a type.
            if (!"METHOD".equals(chunk.getChunkType())) {
                continue;
            }
            List<String> calledNames = chunk.getMethodCalls();
            if (calledNames == null || calledNames.isEmpty()) {
                // Nothing to look up; also avoids issuing an IN query with an empty list.
                continue;
            }
            codeChunkRepository.findByMethodNameIn(calledNames).stream()
                .limit(MAX_CALLEES_PER_METHOD)
                .forEach(enriched::add);
        }

        return new ArrayList<>(enriched);
    }

    /**
     * Builds the numbered code context and asks the LLM to answer the question from it.
     *
     * @param question the user's question
     * @param chunks chunks to present as numbered code references
     * @return the LLM's answer text
     */
    private String generateAnswer(String question, List<CodeChunk> chunks) {
        StringBuilder contextBuilder = new StringBuilder();

        for (int i = 0; i < chunks.size(); i++) {
            CodeChunk chunk = chunks.get(i);
            // Class-level chunks have no method name; avoid a dangling "ClassName." label.
            String label = chunk.getMethodName() != null
                ? chunk.getClassName() + "." + chunk.getMethodName()
                : chunk.getClassName();
            contextBuilder.append(String.format("[代码引用%d] %s (%s)\n",
                i + 1, label, chunk.getFilePath()));
            contextBuilder.append("```java\n");
            contextBuilder.append(chunk.getContent());
            contextBuilder.append("\n```\n\n");
        }

        String prompt = String.format("""
            你是一个深入了解以下Java项目代码库的技术顾问。

            请根据提供的代码片段，回答新成员的问题。

            回答要求：
            1. 用清晰的中文解释，不要照搬代码
            2. 从业务逻辑角度解释，不只是技术细节
            3. 引用具体的代码时，使用[代码引用N]格式标注来源
            4. 如果涉及流程，用步骤编号描述流程
            5. 如果代码片段信息不足以回答，明确说明

            代码片段：
            %s

            问题：%s
            """, contextBuilder, question);

        return anthropicClient.complete(prompt);
    }
}

实战效果：问答示例

问题："我们的订单超时取消是怎么实现的？"

系统返回：

订单超时取消通过定时任务实现，主要流程如下：
1. 触发机制：OrderTimeoutJob（[代码引用1]）使用Spring的@Scheduled注解，每5分钟执行一次扫描
2. 扫描逻辑：在OrderTimeoutChecker.checkExpiredOrders()（[代码引用2]）中，查询状态为CREATED且创建时间超过30分钟的订单
3. 取消处理：调用OrderService.cancelOrder()（[代码引用3]）进行取消，内部会：
   - 更新订单状态为CANCELLED
   - 发送订单取消MQ消息（OrderCancelledEvent）
   - 触发库存归还（通过监听MQ消息的InventoryEventListener处理）

注意事项：取消操作使用了分布式锁（[代码引用4]），防止并发情况下重复取消

踩坑：代码理解的"幻觉"问题

有个问题让我头疼了好一阵：系统有时候会"编造"代码里没有的逻辑。

比如用户问"订单支付失败会不会自动重试"，系统结合了几段相关代码，生成了一个听起来很合理的回答说"系统会在5分钟后自动重试3次"。但实际上代码里根本没有自动重试逻辑。

这是因为LLM对代码的"合理推断"有时候超出了实际代码的范围。

解决方法：在Prompt里加一个强约束——

"如果代码片段中没有明确体现某个行为，必须说'代码中未找到相关实现，建议确认'，不要根据经验推断。"

这个约束加上去之后，幻觉问题大幅减少了，但代价是系统有时候会更保守地说"信息不足"。这个权衡是值得的——宁可让用户再问一句，也不要给错误信息。

持续更新：增量索引

代码库在持续变化，索引也要跟着更新：

/**
 * Keeps the code index in sync with the repository by re-indexing only the Java files
 * reported as changed by a push event.
 */
@Service
@Slf4j
public class IncrementalIndexService {

    private final CodeChunkRepository codeChunkRepository;
    private final JavaCodebaseIndexer javaCodebaseIndexer;

    /**
     * Creates the service with its collaborators (constructor injection).
     *
     * @param codeChunkRepository store from which stale chunks are removed
     * @param javaCodebaseIndexer indexer used to rebuild the chunks of a file
     */
    public IncrementalIndexService(CodeChunkRepository codeChunkRepository,
            JavaCodebaseIndexer javaCodebaseIndexer) {
        this.codeChunkRepository = codeChunkRepository;
        this.javaCodebaseIndexer = javaCodebaseIndexer;
    }

    /**
     * Re-indexes the Java files changed by a push.
     *
     * <p>For every changed {@code .java} file the existing chunks are deleted. The file is then
     * indexed again unless it no longer exists or is a test/generated source. A failure on one
     * file is logged and does not affect the others.
     * NOTE(review): assumes {@code getChangedFiles()} returns '/'-separated paths relative to
     * the repository root (the form the stored chunk paths are expected to have) — confirm.
     *
     * @param event the push event carrying the repository root and the changed file paths
     */
    @EventListener
    public void onGitPush(GitPushEvent event) {
        List<String> changedFiles = event.getChangedFiles().stream()
            .filter(f -> f.endsWith(".java"))
            .collect(Collectors.toList());

        if (changedFiles.isEmpty()) {
            return;
        }

        log.info("检测到{}个Java文件变更，开始增量索引", changedFiles.size());

        Path repoRoot = Paths.get(event.getRepoRoot());

        for (String filePath : changedFiles) {
            try {
                // Drop the stale chunks first; this also covers deleted and excluded files.
                codeChunkRepository.deleteByFilePath(filePath);

                // Resolve against the repository root: a repo-relative path would otherwise be
                // read from the working directory and cannot be relativized against an
                // absolute root. An already absolute path is returned unchanged by resolve().
                Path sourceFile = repoRoot.resolve(filePath);

                if (!Files.exists(sourceFile)) {
                    log.info("文件已删除，仅清理索引: {}", filePath);
                    continue;
                }
                if (isExcludedFromIndex(filePath)) {
                    continue;
                }

                javaCodebaseIndexer.indexFile(sourceFile, repoRoot);
            } catch (Exception e) {
                // Best effort: keep processing the remaining files.
                log.error("增量索引失败: {}", filePath, e);
            }
        }
    }

    /**
     * Tells whether a changed file is a test or generated source, which is not indexed.
     *
     * @param filePath changed file path as reported by the push event
     * @return {@code true} if the path lies under a {@code test} directory or contains
     *     {@code generated}
     */
    private static boolean isExcludedFromIndex(String filePath) {
        String normalized = filePath.replace('\\', '/');
        boolean underTestDirectory =
            normalized.startsWith("test/") || normalized.contains("/test/");
        return underTestDirectory || normalized.contains("generated");
    }
}

总结

代码库问答系统的核心价值，不只是帮新人快速上手，更是把那些只存在于老员工脑子里的隐性知识，通过代码本身显性化出来。

代码是最真实的文档，只是之前没有工具让你方便地"问"它。RAG+LLM的组合，让这件事第一次变得可行。