第2093篇：领域适配的工程路径——何时微调、何时RAG、何时Prompt工程

作者：老张 | 阅读约 10 分钟

第2093篇：领域适配的工程路径——何时微调、何时RAG、何时Prompt工程

适读人群：面临AI能力落地选择的工程师 | 阅读时长：约19分钟 | 核心价值：建立清晰的领域适配决策框架，理解微调（Fine-tuning）、RAG、Prompt工程三种方案的适用场景和实施路径

"我们的领域很特殊，通用模型效果不好，是不是要微调？"

这个问题我被问过很多次。大多数时候，答案是"不需要"——但这不是因为微调不好，而是因为很多团队在还没弄清楚问题根源之前就想跳到微调方案，结果花了几个月和大量算力，效果还不如一个好的Prompt。

这篇文章给出一个实用的决策框架。

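三种方案的本质区别

一句话概括：Prompt工程改变的是"怎么向模型提问"，RAG改变的是"模型回答时能看到哪些资料"，微调改变的是"模型自身的参数"。三者的投入成本和实施周期依次升高，文末推荐的渐进式路径也是按这个顺序展开的。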

决策框架

/**
 * Decision helper for choosing a domain-adaptation approach.
 * Lets an engineer quickly narrow down which approach fits a given requirement.
 */
public class DomainAdaptationDecisionFramework {

    /** Priority labels attached to a recommendation. */
    public enum Priority { FIRST_CHOICE, TRY_PROMPT_FIRST, VIABLE, LAST_RESORT }

    /**
     * Produces a recommendation from the characteristics of the requirement.
     * <p>
     * Scenarios are checked in a fixed order (knowledge gap, then style/format,
     * then specialized reasoning) and the first one that applies decides the result.
     * If none applies, plain prompt engineering is recommended.
     *
     * @param requirement the requirement characteristics to inspect
     * @return the recommendation for the first matching scenario, or the default one
     */
    public static AdaptationRecommendation recommend(DomainRequirement requirement) {
        if (requirement.isKnowledgeGap()) {
            return forKnowledgeGap(requirement);
        }
        if (requirement.isStyleOrFormatIssue()) {
            return forStyleOrFormat(requirement);
        }
        if (requirement.isSpecializedReasoning()) {
            return forSpecializedReasoning(requirement);
        }
        // Fallback when no scenario matched.
        return AdaptationRecommendation.of(
            "Prompt工程",
            "先优化Prompt，再决定是否需要更重量级的方案。",
            Priority.FIRST_CHOICE
        );
    }

    /** Scenario 1: the model is mainly missing domain knowledge. */
    private static AdaptationRecommendation forKnowledgeGap(DomainRequirement requirement) {
        if (!requirement.isKnowledgeDynamic()) {
            // Fairly stable knowledge: RAG still comes first, fine-tuning is a supplement.
            return AdaptationRecommendation.of(
                "RAG（优先）或 Knowledge微调（可选）",
                """
                模型缺乏专业知识，知识相对稳定。
                推荐先用RAG，如果RAG效果不佳再考虑在语料上继续预训练。
                """,
                Priority.FIRST_CHOICE
            );
        }
        // Knowledge that keeps changing has to go through RAG: fine-tuned knowledge goes stale.
        return AdaptationRecommendation.of(
            "RAG（检索增强生成）",
            """
            模型缺乏领域知识，且知识会定期更新。
            RAG方案：构建领域知识库，检索后注入Prompt。
            优势：知识可实时更新，无需重新训练。
            """,
            Priority.FIRST_CHOICE
        );
    }

    /** Scenario 2: the problem is mainly about output format or style. */
    private static AdaptationRecommendation forStyleOrFormat(DomainRequirement requirement) {
        if (!requirement.hasEnoughExamples()) {
            return AdaptationRecommendation.of(
                "Few-shot Prompt工程",
                """
                需要特定格式，样本不足以支撑微调。
                用Few-shot示例定义期望输出格式，配合清晰的格式说明。
                """,
                Priority.FIRST_CHOICE
            );
        }
        // Enough examples exist: few-shot first, instruction fine-tuning as the follow-up.
        return AdaptationRecommendation.of(
            "Few-shot Prompt工程（优先）或 Instruction微调",
            """
            需要特定的输出格式或风格。
            首先尝试Few-shot示例（在Prompt中给3-5个示例），通常已足够。
            如果Few-shot效果不稳定，再考虑Instruction Fine-tuning。
            """,
            Priority.TRY_PROMPT_FIRST
        );
    }

    /** Scenario 3: the task needs specialized reasoning ability. */
    private static AdaptationRecommendation forSpecializedReasoning(DomainRequirement requirement) {
        boolean tooFewSamples = requirement.getTrainingSampleCount() < 1000;
        if (tooFewSamples) {
            return AdaptationRecommendation.of(
                "Chain-of-thought Prompt工程",
                """
                需要专业推理，但样本不足以微调。
                使用CoT Prompt，展示详细的推理步骤，引导模型模仿。
                同时积累高质量样本，等数量够了再考虑微调。
                """,
                Priority.FIRST_CHOICE
            );
        }
        return AdaptationRecommendation.of(
            "指令微调（Instruction Fine-tuning）",
            """
            需要专业领域的推理能力，且有足够高质量样本（1000+）。
            微调可以让模型学习该领域的思维方式。
            建议使用LoRA降低成本，基础模型选较小的7B/13B模型。
            """,
            Priority.VIABLE
        );
    }
}

Prompt工程：正确的优先顺序

/**
 * Systematic prompt optimization.
 * Most problems that seem to "need fine-tuning" are really prompts that were not written well.
 */
@Service
@RequiredArgsConstructor
@Slf4j
public class PromptOptimizationService {
    
    private final ChatLanguageModel llm;
    
    /**
     * The four levels of progressively richer prompts.
     * Each constant carries a short description of what the level contains and when it applies.
     */
    public enum PromptLevel {
        
        BASIC("""
            最基础的Prompt
            - 只描述任务
            - 不给示例，不指定格式
            适用：通用任务
            """),
        
        STRUCTURED("""
            结构化Prompt
            - 明确说明角色（你是一个...）
            - 具体的输出格式要求
            - 明确的约束条件
            适用：大多数业务场景
            """),
        
        FEW_SHOT("""
            Few-shot示例
            - 在Prompt中包含3-5个输入→输出示例
            - 示例覆盖不同情况
            适用：需要特定格式、风格一致
            """),
        
        CHAIN_OF_THOUGHT("""
            Chain-of-Thought
            - 要求模型逐步推理
            - 示例中展示推理过程
            适用：复杂推理、数学计算
            """);
        
        private final String description;
        
        // The constants above pass a description string, so the enum needs a matching
        // constructor; without it the declaration does not compile.
        PromptLevel(String description) {
            this.description = description;
        }
        
        /** Returns the description text attached to this level. */
        public String getDescription() {
            return description;
        }
    }
    
    /**
     * Runs the same task through each prompt level built by {@code buildPrompts}
     * and scores every response against the expected output.
     *
     * @param task           the task description placed into every prompt
     * @param testInput      the input appended to every prompt
     * @param expectedOutput the reference answer the responses are scored against
     * @return the raw responses, their scores, and the best-scoring level; on equal scores
     *         the level that was evaluated first wins, and {@code BASIC} is returned
     *         when nothing was evaluated
     */
    public PromptComparisonResult compare(
            String task,
            String testInput,
            String expectedOutput) {
        
        Map<PromptLevel, String> prompts = buildPrompts(task, testInput);
        Map<PromptLevel, String> results = new LinkedHashMap<>();
        
        for (Map.Entry<PromptLevel, String> entry : prompts.entrySet()) {
            String response = llm.generate(entry.getValue()).trim();
            results.put(entry.getKey(), response);
        }
        
        // Score each response (simplified similarity). A LinkedHashMap keeps the scores in
        // evaluation order so that reporting and tie-breaking are deterministic.
        Map<PromptLevel, Double> scores = results.entrySet().stream()
            .collect(Collectors.toMap(
                Map.Entry::getKey,
                e -> calculateSimilarity(e.getValue(), expectedOutput),
                (first, second) -> first,
                LinkedHashMap::new
            ));
        
        // Explicit scan with a strict comparison: an equally good but heavier prompt level
        // never displaces the simpler one that was evaluated before it.
        PromptLevel bestLevel = PromptLevel.BASIC;
        double bestScore = Double.NEGATIVE_INFINITY;
        for (Map.Entry<PromptLevel, Double> entry : scores.entrySet()) {
            if (entry.getValue() > bestScore) {
                bestScore = entry.getValue();
                bestLevel = entry.getKey();
            }
        }
        
        return new PromptComparisonResult(results, scores, bestLevel);
    }
    
    /**
     * Builds one prompt per level for the given task and input (simplified example).
     * Only BASIC, STRUCTURED and FEW_SHOT are registered; no CHAIN_OF_THOUGHT prompt
     * is built here, so that level does not take part in the comparison.
     */
    private Map<PromptLevel, String> buildPrompts(String task, String input) {
        Map<PromptLevel, String> prompts = new LinkedHashMap<>();
        
        prompts.put(PromptLevel.BASIC, 
            task + "\n\n输入：" + input);
        
        prompts.put(PromptLevel.STRUCTURED, 
            "你是一个专业助手。" + task + "\n\n请以结构化方式回答。\n\n输入：" + input);
        
        // Few-shot and CoT need concrete examples; this is only a placeholder.
        prompts.put(PromptLevel.FEW_SHOT,
            "[Few-shot示例]\n\n" + task + "\n\n输入：" + input);
        
        return prompts;
    }
    
    /**
     * Keyword-overlap ratio: the fraction of whitespace-separated tokens of
     * {@code expected} that also occur in {@code output}, compared case-insensitively.
     * Text without whitespace is treated as a single token, so the measure is coarse
     * for such input.
     *
     * @return a value in [0, 1]; 0 when either argument is null or {@code expected} is blank
     */
    private double calculateSimilarity(String output, String expected) {
        if (output == null || expected == null || expected.isBlank()) {
            return 0;
        }
        // strip() first so that leading whitespace does not produce an empty token that
        // would inflate the denominator. Locale.ROOT keeps lower-casing locale-independent.
        String[] expectedWords = expected.strip()
            .toLowerCase(java.util.Locale.ROOT).split("\\s+");
        // Collect into a regular Set: Set.of(array) throws IllegalArgumentException
        // as soon as the output contains the same word twice.
        Set<String> outputSet = Arrays.stream(
                output.strip().toLowerCase(java.util.Locale.ROOT).split("\\s+"))
            .collect(Collectors.toSet());
        long overlap = Arrays.stream(expectedWords)
            .filter(outputSet::contains).count();
        return (double) overlap / expectedWords.length;
    }
    
    /** Result of a comparison: raw outputs and scores per level, plus the best level. */
    public record PromptComparisonResult(
        Map<PromptLevel, String> outputs,
        Map<PromptLevel, Double> scores,
        PromptLevel bestLevel
    ) {}
}

RAG的适用边界

/**
 * Applicability assessment for a RAG solution.
 * Helps judge whether a scenario suits RAG and what to expect from it.
 */
@Service
@RequiredArgsConstructor
@Slf4j
public class RagApplicabilityAssessor {
    
    /**
     * Assesses how well RAG fits the given scenario.
     *
     * @param scenario the scenario characteristics to inspect
     * @return a report with the signals for and against RAG, the prerequisites,
     *         a suitability score in [0, 1] and a textual recommendation
     */
    public RagApplicabilityReport assess(RagScenario scenario) {
        List<String> pros = collectPros(scenario);
        List<String> cons = collectCons(scenario);
        List<String> prerequisites = collectPrerequisites(scenario);
        
        double suitabilityScore = calculateSuitabilityScore(pros, cons);
        String recommendation = suitabilityScore >= 0.7
            ? "强烈推荐RAG"
            : suitabilityScore >= 0.4
                ? "RAG可行，但需要注意上述局限"
                : "RAG不是最佳选择，考虑Prompt工程或微调";
        
        return new RagApplicabilityReport(pros, cons, prerequisites, 
            suitabilityScore, recommendation);
    }
    
    /** Gathers the signals that speak in favour of RAG. */
    private List<String> collectPros(RagScenario scenario) {
        List<String> signals = new ArrayList<>();
        if (scenario.isKnowledgeFrequentlyUpdated()) {
            signals.add("知识频繁更新 → RAG比微调更能适应变化");
        }
        if (scenario.isKnowledgeSourceDocumentary()) {
            signals.add("知识来源于文档 → 可以直接作为RAG的知识库");
        }
        if (scenario.getKnowledgeVolumeWords() > 100_000) {
            signals.add("知识量大（>10万字）→ 微调成本高，RAG更经济");
        }
        if (scenario.isAnswerRequiresCitation()) {
            signals.add("答案需要引用来源 → RAG天然支持溯源");
        }
        return signals;
    }
    
    /** Gathers the cases where RAG is a poor fit or needs extra care. */
    private List<String> collectCons(RagScenario scenario) {
        List<String> caveats = new ArrayList<>();
        if (scenario.isQueryRequiresComplexReasoning()) {
            caveats.add("复杂推理场景：RAG检索的片段可能不足以支持深度推理，" +
                        "需要配合CoT或考虑微调");
        }
        if (scenario.getKnowledgeVolumeWords() < 5000) {
            caveats.add("知识量小（<5000字）→ 可以直接放进System Prompt，" +
                        "不需要RAG的复杂性");
        }
        if (scenario.isQueryImplicit()) {
            caveats.add("查询隐含意图：用户问法模糊时，检索可能召回无关内容，" +
                        "需要查询改写");
        }
        return caveats;
    }
    
    /** Lists what has to be in place before building the RAG pipeline. */
    private List<String> collectPrerequisites(RagScenario scenario) {
        List<String> required = new ArrayList<>();
        // Infrastructure choices are only listed above 10,000 words of knowledge.
        if (scenario.getKnowledgeVolumeWords() > 10_000) {
            required.add("需要选择向量数据库（pgvector/Qdrant）");
            required.add("需要选择Embedding模型（推荐bge-m3或text-embedding-3）");
        }
        // These two apply to every scenario.
        required.add("需要准备评估数据集（至少50个问答对）");
        required.add("需要设置检索质量基线（Context Recall和Precision）");
        return required;
    }
    
    /**
     * Turns the pro/con counts into a score clamped to [0, 1]:
     * each pro counts double, each con counts once, divided by (2 * pros + 1).
     */
    private double calculateSuitabilityScore(List<String> pros, List<String> cons) {
        int weightedPros = pros.size() * 2;
        double raw = (double)(weightedPros - cons.size()) / (weightedPros + 1);
        double capped = Math.min(1.0, raw);
        return Math.max(0, capped);
    }
    
    /** Outcome of an assessment. */
    public record RagApplicabilityReport(
        List<String> pros, List<String> cons, List<String> prerequisites,
        double suitabilityScore, String recommendation
    ) {}
}

微调的成本与收益分析

/**
 * Fine-tuning cost estimation and ROI analysis.
 * Helps decision makers judge whether fine-tuning is worth it.
 */
@Service
@Slf4j
public class FineTuningCostAnalysis {
    
    /** Horizon, in months, over which self-hosting is compared with API usage. */
    private static final int AMORTIZATION_MONTHS = 6;
    
    /**
     * API price per 1000 tokens (described in the original code as a GPT-4o price estimate).
     * NOTE(review): every other figure in this class is commented as yuan; confirm this
     * rate is expressed in the same currency.
     */
    private static final double API_PRICE_PER_1K_TOKENS = 0.005;
    
    /**
     * Estimates the cost of a LoRA fine-tuning project and compares it with
     * paying for API calls over the same {@value #AMORTIZATION_MONTHS}-month horizon.
     *
     * @param config sample count, model size, epochs and expected request volume
     * @return the individual cost items, both totals, whether self-hosting is cheaper
     *         within the horizon, the break-even month (-1 when it is not cheaper)
     *         and a textual recommendation
     */
    public FineTuningCostEstimate estimate(FineTuningConfig config) {
        
        // Data preparation cost (usually underestimated)
        double dataPreparationCost = estimateDataCost(config.getSampleCount());
        
        // Training compute cost
        double trainingCost = estimateTrainingCost(
            config.getBaseModelParams(), 
            config.getSampleCount(),
            config.getEpochs()
        );
        
        // Evaluation cost (usually takes several iterations): about 30% of training
        double evaluationCost = trainingCost * 0.3;
        
        // Deployment cost (the inference service has to be self-maintained)
        double monthlyDeploymentCost = estimateDeploymentCost(config.getBaseModelParams());
        
        // Total cost over the amortization horizon
        double totalOneTimeCost = dataPreparationCost + trainingCost + evaluationCost;
        double totalCost6Months = totalOneTimeCost + monthlyDeploymentCost * AMORTIZATION_MONTHS;
        
        // Compared against the cost of API calls
        double apiCostMonthly = estimateMonthlyApiCost(config);
        double apiCost6Months = apiCostMonthly * AMORTIZATION_MONTHS;
        
        boolean isWorthIt = totalCost6Months < apiCost6Months;
        
        // Round up: break-even is only reached once the month in which the cumulative
        // saving covers the one-time cost has finished; truncation would understate it.
        int breakEvenMonths = isWorthIt
            ? (int) Math.ceil(totalOneTimeCost / Math.max(1, apiCostMonthly - monthlyDeploymentCost))
            : -1;
        
        return FineTuningCostEstimate.builder()
            .dataPreparationCost(dataPreparationCost)
            .trainingCost(trainingCost)
            .evaluationCost(evaluationCost)
            .monthlyDeploymentCost(monthlyDeploymentCost)
            .totalOneTimeCost(totalOneTimeCost)
            .totalCost6Months(totalCost6Months)
            .apiCost6Months(apiCost6Months)
            .isEconomicallyViable(isWorthIt)
            .breakEvenMonths(breakEvenMonths)
            .recommendation(buildRecommendation(isWorthIt, apiCost6Months))
            .build();
    }
    
    /**
     * Monthly API cost for the configured request volume.
     * The product is computed in floating point: with integer arithmetic the division
     * by 1000 truncates first, which turns small volumes into a cost of 0.
     */
    private double estimateMonthlyApiCost(FineTuningConfig config) {
        double tokensPerMonth = (double) config.getMonthlyRequestCount() 
            * config.getAvgTokensPerRequest();
        return tokensPerMonth / 1000.0 * API_PRICE_PER_1K_TOKENS;
    }
    
    /** Labelling cost: one hour of manual annotation per sample at 200 yuan per hour. */
    private double estimateDataCost(int sampleCount) {
        // A high-quality sample takes 0.5-2 hours of manual annotation; 1 hour is used here.
        double hoursPerSample = 1.0;
        return sampleCount * hoursPerSample * 200;
    }
    
    /**
     * Compute cost of LoRA training, scaled linearly from the baseline
     * "7B model, 1000 samples, 1 epoch is about 1 GPU hour".
     */
    private double estimateTrainingCost(long modelParams, int sampleCount, int epochs) {
        // A100 cloud GPUs cost roughly 10-20 yuan per hour; 15 is used here.
        double baseHours = (modelParams / 7_000_000_000.0) * (sampleCount / 1000.0) * epochs;
        double gpuHourCost = 15;
        return baseHours * gpuHourCost * 3;  // several attempts are usually needed
    }
    
    /** Monthly self-hosting cost, tiered by model size. */
    private double estimateDeploymentCost(long modelParams) {
        // A 7B model needs about 16GB of GPU memory; an A10G (24GB) is about 2500 yuan/month.
        if (modelParams <= 7_000_000_000L) return 2500;
        if (modelParams <= 13_000_000_000L) return 4000;
        return 8000;  // 70B class
    }
    
    /**
     * Builds the recommendation text. Receives the already computed API cost so the
     * figure in the message is the same one stored in the estimate.
     */
    private String buildRecommendation(boolean isViable, double apiCost6Months) {
        if (!isViable) {
            return "在" + AMORTIZATION_MONTHS + "个月内，使用API调用（" + 
                String.format("%.0f元", apiCost6Months) +
                "）比自微调部署更经济。建议先优化Prompt或使用RAG。";
        }
        return "微调后自部署经济上更合算，但需要投入团队能力建设（GPU集群管理、模型训练经验）。";
    }
    
    /** Result of a cost estimate. */
    @Builder
    public record FineTuningCostEstimate(
        double dataPreparationCost, double trainingCost, double evaluationCost,
        double monthlyDeploymentCost, double totalOneTimeCost, double totalCost6Months,
        double apiCost6Months, boolean isEconomicallyViable, int breakEvenMonths,
        String recommendation
    ) {}
    
    /** Input parameters of a cost estimate. */
    @Builder
    public static class FineTuningConfig {
        private int sampleCount;
        private long baseModelParams;  // 7B → 7_000_000_000L
        private int epochs;
        private long monthlyRequestCount;
        private int avgTokensPerRequest;
        
        // Explicit accessors: estimate() reads the configuration through getXxx(),
        // and the builder annotation alone does not provide them.
        public int getSampleCount() {
            return sampleCount;
        }
        
        public long getBaseModelParams() {
            return baseModelParams;
        }
        
        public int getEpochs() {
            return epochs;
        }
        
        public long getMonthlyRequestCount() {
            return monthlyRequestCount;
        }
        
        public int getAvgTokensPerRequest() {
            return avgTokensPerRequest;
        }
    }
}

微调的实施路径（LoRA方案）

/**
 * End-to-end LoRA fine-tuning flow (Java orchestrating a Python training script).
 * 
 * Note: the training itself runs in Python (PyTorch/transformers ecosystem);
 * the Java side does the engineering orchestration: data preparation,
 * job submission and result evaluation.
 */
@Service
@RequiredArgsConstructor
@Slf4j
public class LoraFineTuningOrchestrator {
    
    /** Shared JSON mapper; one instance is reused instead of creating one per call. */
    private static final ObjectMapper JSON = new ObjectMapper();
    
    /**
     * Converts business data into the instruction format used for fine-tuning.
     *
     * @param rawData the raw question/answer pairs
     * @return one instruction sample per pair, in the same order
     */
    public List<InstructionSample> prepareTrainingData(
            List<QaPair> rawData) {
        
        return rawData.stream()
            .map(qa -> InstructionSample.builder()
                .instruction(qa.getInstruction())  // task description
                .input(qa.getInput())               // user input
                .output(qa.getExpectedOutput())     // expected output
                .build())
            .toList();
    }
    
    /**
     * Shuffles the samples with a fixed seed, splits them, and writes
     * {@code train.jsonl} and {@code test.jsonl} (one JSON object per line)
     * into {@code outputPath}. The directory is created when it does not exist.
     *
     * @param samples        the samples to write
     * @param outputPath     the target directory
     * @param trainTestSplit fraction of samples that goes into the training set, within [0, 1]
     * @throws IllegalArgumentException if {@code trainTestSplit} is outside [0, 1] or NaN
     * @throws IOException              if the directory or files cannot be written
     */
    public void writeTrainingDataset(
            List<InstructionSample> samples, 
            Path outputPath,
            double trainTestSplit) throws IOException {
        
        // Written as a negated range check so that NaN is rejected as well. Without this,
        // an out-of-range ratio only surfaces as an opaque subList exception.
        if (!(trainTestSplit >= 0.0 && trainTestSplit <= 1.0)) {
            throw new IllegalArgumentException(
                "trainTestSplit must be within [0, 1] but was " + trainTestSplit);
        }
        
        // Shuffle with a fixed seed so that the split is reproducible.
        List<InstructionSample> shuffled = new ArrayList<>(samples);
        Collections.shuffle(shuffled, new Random(42));
        
        int trainSize = (int)(shuffled.size() * trainTestSplit);
        List<InstructionSample> trainSet = shuffled.subList(0, trainSize);
        List<InstructionSample> testSet = shuffled.subList(trainSize, shuffled.size());
        
        // The writers below fail when the target directory is missing.
        if (!Files.isDirectory(outputPath)) {
            Files.createDirectories(outputPath);
        }
        
        writeJsonl(trainSet, outputPath.resolve("train.jsonl"));
        writeJsonl(testSet, outputPath.resolve("test.jsonl"));
        
        log.info("训练数据已写入: 训练集={}, 测试集={}", trainSet.size(), testSet.size());
    }
    
    /** Writes each sample as one JSON line to the given file. */
    private void writeJsonl(List<InstructionSample> samples, Path file) throws IOException {
        try (BufferedWriter writer = Files.newBufferedWriter(file)) {
            for (InstructionSample sample : samples) {
                writer.write(JSON.writeValueAsString(sample));
                writer.newLine();
            }
        }
    }
    
    /**
     * Writes {@code train_config.json} into the job's output directory and starts
     * {@code python3 train_lora.py --config <file>} (the actual LoRA training runs in Python).
     * The script name is relative, so it is resolved against the working directory
     * of this process.
     * <p>
     * The child's stderr is merged into stdout and left piped: the caller must read
     * {@code process().getInputStream()} of the returned job, otherwise the child can
     * block once the pipe buffer fills up.
     *
     * @param config base model, data path, output path and epoch count
     * @return a handle with a generated job id, the started process and the output path
     * @throws IOException if the config file cannot be written or the process cannot start
     */
    public FineTuningJob submitTrainingJob(FineTuningJobConfig config) throws IOException {
        Path outputDir = config.getOutputPath();
        
        // Build the training configuration. Paths are passed as plain strings because
        // Jackson would otherwise serialize a java.nio.file.Path value as a file: URI.
        String configJson = JSON.writeValueAsString(Map.of(
            "base_model", config.getBaseModel(),  // "Qwen/Qwen2.5-7B-Instruct"
            "data_path", config.getDataPath().toString(),
            "output_path", outputDir.toString(),
            "lora_r", 8,        // LoRA rank (larger is better but slower)
            "lora_alpha", 16,   // LoRA alpha
            "num_epochs", config.getEpochs(),
            "batch_size", 4,
            "learning_rate", 2e-4,
            "max_seq_length", 2048
        ));
        
        // The output directory may not exist yet on a first run.
        if (!Files.isDirectory(outputDir)) {
            Files.createDirectories(outputDir);
        }
        Path configPath = outputDir.resolve("train_config.json");
        Files.writeString(configPath, configJson);
        
        // Submit to the training cluster (a local command line stands in for it here).
        ProcessBuilder pb = new ProcessBuilder(
            "python3", "train_lora.py",
            "--config", configPath.toString()
        );
        pb.redirectErrorStream(true);
        
        Process process = pb.start();
        String jobId = "job-" + System.currentTimeMillis();
        
        log.info("LoRA训练任务已提交: jobId={}", jobId);
        
        return new FineTuningJob(jobId, process, outputDir);
    }
    
    /**
     * Evaluates the fine-tuned model. This is a framework sketch only: the returned
     * report contains the size of the evaluation set and fixed placeholder values;
     * {@code modelPath} and {@code baseModel} are not used yet.
     * <p>
     * The intended flow is to answer the evaluation set with the fine-tuned model,
     * compare against the base model's answers and compute the improvement on the
     * key metrics. A real implementation has to load the fine-tuned model, which
     * usually means calling Python again.
     */
    public FineTuningEvaluationReport evaluate(
            Path modelPath,
            List<QaPair> evaluationSet,
            ChatLanguageModel baseModel) {
        
        return new FineTuningEvaluationReport(
            evaluationSet.size(),
            0.85,  // assumed 85% accuracy (placeholder)
            "微调后在目标任务上表现提升明显"
        );
    }
    
    /** One training sample in instruction format. */
    @Builder
    public record InstructionSample(String instruction, String input, String output) {}
    
    /** Handle of a submitted training job. */
    public record FineTuningJob(String jobId, Process process, Path outputPath) {}
    
    /** Summary of an evaluation run. */
    public record FineTuningEvaluationReport(
        int evaluationSize, double accuracy, String summary
    ) {}
}

实践建议

不要急于微调的三个场景：

1. 问题根源在数据质量：如果RAG的知识库质量差（文档过时、格式杂乱），微调也解决不了根本问题，先清洗数据。
2. 样本量不足：LoRA微调通常需要500-2000个高质量样本，更复杂的任务需要更多。少于200个样本的微调效果很不稳定。
3. 团队没有ML工程能力：微调不是"调个API"，需要GPU资源、训练基础设施、模型评估能力。评估自己的团队能力再做决定。

推荐的渐进式路径：

第一周：Prompt工程（零成本，快速验证方向）
  ↓ 不够好
第二周：RAG（构建知识库，通常能解决大多数知识问题）
  ↓ 不够好
一到三个月：积累高质量样本，考虑微调
  ↓
选择合适的基础模型（7B-13B的开源模型），用LoRA微调

90%的领域适配需求，用好Prompt + RAG就能解决。剩下的10%才需要微调。