第2156篇:MLflow在LLM项目中的实战——模型版本管理与实验追踪
2026/4/30 · 大约 6 分钟
第2156篇:MLflow在LLM项目中的实战——模型版本管理与实验追踪
适读人群:需要管理LLM实验和版本的ML工程师 | 阅读时长:约18分钟 | 核心价值:用MLflow解决LLM项目中实验乱、版本乱、结果不可复现的工程问题
我们团队有一段时间处于"实验地狱"状态:
- 小明改了Prompt,测试效果不错,但没记录用的哪个模型版本
- 小李在自己电脑上跑了一组实验,结果存在本地Excel,换人就找不到
- 上周哪个版本效果最好?不知道,得重新跑
这些问题在传统ML项目里也有,MLflow是解决这类问题的标准工具。但LLM项目有自己的特殊性:Prompt是超参数,输入输出是非结构化文本,模型本身往往是外部API(不是你训练的)。
这篇文章讲怎么把MLflow应用到LLM项目,解决上面这些具体问题。
MLflow核心概念在LLM场景下的映射
传统ML → LLM场景映射
--- ---
实验(Experiment) → 一个评估任务(如"客服RAG效果测试")
运行(Run) → 一次具体的测试(特定Prompt版本+模型+数据集)
参数(Parameters) → Prompt内容、模型名称、Temperature等配置
指标(Metrics) → 准确率、Faithfulness、通过率等评估分数
制品(Artifacts) → 测试集、Prompt文件、评估报告
模型(Model) → LLM应用的完整配置(Prompt+模型+后处理)

Java集成MLflow
MLflow主要是Python生态,但它提供了REST API,Java可以通过HTTP调用,也可以用官方Java客户端。
/**
 * Thin MLflow client wrapper.
 *
 * Talks to an MLflow Tracking Server over its REST API (v2.0). MLflow is
 * primarily a Python ecosystem, so from Java we integrate via plain HTTP
 * calls through {@link RestTemplate}.
 */
@Component
@RequiredArgsConstructor
@Slf4j
public class MlflowClient {
    @Value("${mlflow.tracking-uri:http://localhost:5000}")
    private String trackingUri;
    private final RestTemplate restTemplate;
    private final ObjectMapper objectMapper;

    /**
     * Exposes the configured tracking URI so collaborating services
     * (e.g. run-search/comparison services) can target the same server.
     */
    public String getTrackingUri() {
        return trackingUri;
    }

    /**
     * Returns the id of the experiment with the given name, creating the
     * experiment if it does not exist yet.
     *
     * @param experimentName unique experiment name
     * @return the MLflow experiment id
     * @throws RuntimeException if the experiment can neither be fetched nor created
     */
    public String getOrCreateExperiment(String experimentName) {
        // Try to fetch first. MLflow answers with a non-2xx status when the
        // experiment does not exist, which surfaces here as an exception.
        try {
            String response = restTemplate.getForObject(
                    trackingUri + "/api/2.0/mlflow/experiments/get-by-name?experiment_name={name}",
                    String.class, experimentName
            );
            JsonNode node = objectMapper.readTree(response);
            return node.path("experiment").path("experiment_id").asText();
        } catch (Exception getFailure) {
            // Fall back to creating it. Keep the original lookup failure
            // attached so that a server outage is not misreported as a
            // "create" problem.
            try {
                Map<String, Object> body = Map.of("name", experimentName);
                String response = restTemplate.postForObject(
                        trackingUri + "/api/2.0/mlflow/experiments/create",
                        body, String.class
                );
                JsonNode node = objectMapper.readTree(response);
                return node.path("experiment_id").asText();
            } catch (Exception ex) {
                ex.addSuppressed(getFailure);
                throw new RuntimeException("创建MLflow实验失败", ex);
            }
        }
    }

    /**
     * Starts a new run in the given experiment.
     *
     * @param experimentId experiment the run belongs to
     * @param runName      human-readable run name (stored as the
     *                     reserved tag {@code mlflow.runName})
     * @param tags         additional free-form tags
     * @return the new run id
     * @throws RuntimeException if the run cannot be created
     */
    public String startRun(String experimentId, String runName, Map<String, String> tags) {
        Map<String, Object> body = new HashMap<>();
        body.put("experiment_id", experimentId);
        // The run name is passed as the reserved "mlflow.runName" tag.
        List<Map<String, String>> tagList = new ArrayList<>();
        tagList.add(Map.of("key", "mlflow.runName", "value", runName));
        tags.forEach((k, v) -> tagList.add(Map.of("key", k, "value", v)));
        body.put("tags", tagList);
        try {
            String response = restTemplate.postForObject(
                    trackingUri + "/api/2.0/mlflow/runs/create",
                    body, String.class
            );
            JsonNode node = objectMapper.readTree(response);
            return node.path("run").path("info").path("run_id").asText();
        } catch (Exception e) {
            throw new RuntimeException("创建MLflow Run失败", e);
        }
    }

    /**
     * Logs a single parameter on a run. Parameters are write-once in MLflow.
     */
    public void logParam(String runId, String key, String value) {
        Map<String, Object> body = Map.of("run_id", runId, "key", key, "value", value);
        restTemplate.postForObject(
                trackingUri + "/api/2.0/mlflow/runs/log-parameter",
                body, String.class
        );
    }

    /**
     * Logs many parameters in one request via the log-batch endpoint
     * (one round-trip instead of one per parameter).
     */
    public void logParams(String runId, Map<String, String> params) {
        List<Map<String, String>> paramList = params.entrySet().stream()
                .map(e -> Map.of("key", e.getKey(), "value", e.getValue()))
                .collect(Collectors.toList());
        Map<String, Object> body = Map.of("run_id", runId, "params", paramList);
        restTemplate.postForObject(
                trackingUri + "/api/2.0/mlflow/runs/log-batch",
                body, String.class
        );
    }

    /**
     * Logs a single metric value, timestamped now, at step 0.
     * Repeated calls with the same key build a timestamped history.
     */
    public void logMetric(String runId, String key, double value) {
        Map<String, Object> body = Map.of(
                "run_id", runId,
                "key", key,
                "value", value,
                "timestamp", System.currentTimeMillis(),
                "step", 0
        );
        restTemplate.postForObject(
                trackingUri + "/api/2.0/mlflow/runs/log-metric",
                body, String.class
        );
    }

    /**
     * Logs many metrics in one request via the log-batch endpoint.
     * All values share the same timestamp and step 0.
     */
    public void logMetrics(String runId, Map<String, Double> metrics) {
        long timestamp = System.currentTimeMillis();
        List<Map<String, Object>> metricList = metrics.entrySet().stream()
                .map(e -> Map.of("key", e.getKey(), "value", e.getValue(),
                        "timestamp", timestamp, "step", 0))
                .collect(Collectors.toList());
        Map<String, Object> body = Map.of("run_id", runId, "metrics", metricList);
        restTemplate.postForObject(
                trackingUri + "/api/2.0/mlflow/runs/log-batch",
                body, String.class
        );
    }

    /**
     * Marks a run as finished or failed.
     *
     * @param success true → FINISHED, false → FAILED
     */
    public void endRun(String runId, boolean success) {
        Map<String, Object> body = Map.of(
                "run_id", runId,
                "status", success ? "FINISHED" : "FAILED"
        );
        restTemplate.postForObject(
                trackingUri + "/api/2.0/mlflow/runs/update",
                body, String.class
        );
    }

    /**
     * Uploads a file artifact to the run.
     *
     * NOTE(review): artifact upload goes through a different endpoint and its
     * mechanics depend on the configured artifact store (local/S3/GCS/...).
     * This is a logging stub; the real upload still needs to be implemented
     * per store type.
     */
    public void logArtifact(String runId, String localPath, String artifactPath) {
        log.info("上传制品: runId={}, localPath={}, artifactPath={}", runId, localPath, artifactPath);
    }
}
LLM实验追踪的完整示例
/**
 * End-to-end tracking of one LLM evaluation experiment in MLflow:
 * hyper-parameters, prompt artifact, per-case evaluation, aggregate metrics
 * and the full report are all recorded against a single run.
 */
@Service
@RequiredArgsConstructor
@Slf4j
public class LlmExperimentTracker {
    // Jackson mappers are thread-safe and expensive to create; share one.
    private static final ObjectMapper REPORT_MAPPER = new ObjectMapper();
    private final MlflowClient mlflow;
    private final LlmEvaluationService evaluationService;
    // NOTE(review): not referenced in this class yet — verify it is still needed.
    private final PromptVersionManager promptManager;

    /**
     * Runs one evaluation experiment and records everything in MLflow.
     *
     * @param config      experiment configuration (model, prompt, dataset, ...)
     * @param testDataset test cases to evaluate against
     * @return run id plus the computed metrics and per-case reports
     * @throws RuntimeException if the evaluation fails; the run is marked FAILED first
     */
    public ExperimentResult runTrackedExperiment(ExperimentConfig config,
                                                 List<TestCase> testDataset) {
        String experimentId = mlflow.getOrCreateExperiment(config.getExperimentName());
        String runId = mlflow.startRun(experimentId, config.getRunName(), Map.of(
                "team", config.getTeam(),
                "purpose", config.getPurpose()
        ));
        try {
            // 1. Log every hyper-parameter so the run can be reproduced later.
            //    The prompt hash lets us trace back exactly which prompt ran.
            Map<String, String> params = new HashMap<>();
            params.put("model_name", config.getModelName());
            params.put("model_version", config.getModelVersion());
            params.put("temperature", String.valueOf(config.getTemperature()));
            params.put("max_tokens", String.valueOf(config.getMaxTokens()));
            params.put("prompt_version", config.getPromptVersion());
            params.put("prompt_hash", computePromptHash(config.getPromptContent()));
            params.put("dataset_size", String.valueOf(testDataset.size()));
            params.put("dataset_name", config.getDatasetName());
            mlflow.logParams(runId, params);
            // 2. Store the full prompt text as an artifact.
            String promptFile = savePromptToTempFile(config.getPromptContent());
            mlflow.logArtifact(runId, promptFile, "prompts/prompt.txt");
            // 3. Run the evaluation over the whole dataset.
            List<EvaluationReport> reports = runEvaluations(config, testDataset);
            // 4. Compute and log aggregate metrics.
            Map<String, Double> metrics = computeAggregateMetrics(reports);
            mlflow.logMetrics(runId, metrics);
            // 5. Log a rolling average every 10 cases so the UI can plot a curve.
            //    Fix: the original built the metric payload but never sent it.
            //    MlflowClient.logMetric records step=0, but successive calls
            //    still form a time series because MLflow keeps the metric
            //    history by timestamp.
            for (int i = 9; i < reports.size(); i += 10) {
                double rollingAvg = reports.subList(i - 9, i + 1).stream()
                        .mapToDouble(EvaluationReport::getOverallScore)
                        .average().orElse(0);
                mlflow.logMetric(runId, "rolling_avg_score", rollingAvg);
            }
            // 6. Upload the detailed per-case report.
            String reportFile = saveReportsToFile(reports);
            mlflow.logArtifact(runId, reportFile, "reports/evaluation_results.json");
            // 7. Mark the run finished.
            mlflow.endRun(runId, true);
            // SLF4J placeholders are plain {}; "{:.3f}" would print literally,
            // so format the score explicitly.
            log.info("实验追踪完成: runId={}, 综合分={}", runId,
                    String.format("%.3f", metrics.get("overall_score")));
            return ExperimentResult.builder()
                    .runId(runId)
                    .metrics(metrics)
                    .reports(reports)
                    .build();
        } catch (Exception e) {
            mlflow.endRun(runId, false);
            throw new RuntimeException("实验执行失败", e);
        }
    }

    /** Calls the LLM for each test case and evaluates the output. */
    private List<EvaluationReport> runEvaluations(ExperimentConfig config,
                                                  List<TestCase> testDataset) {
        List<EvaluationReport> reports = new ArrayList<>();
        for (TestCase tc : testDataset) {
            String output = callLlm(config, tc.getQuestion());
            EvaluationReport report = evaluationService.evaluate(
                    EvaluationRequest.builder()
                            .userInput(tc.getQuestion())
                            .llmOutput(output)
                            .context(tc.getContext())
                            .build()
            );
            reports.add(report);
        }
        return reports;
    }

    /**
     * Aggregates per-case reports into run-level metrics: overall mean,
     * pass rate, per-dimension means and P25/P50/P75 quantiles.
     */
    private Map<String, Double> computeAggregateMetrics(List<EvaluationReport> reports) {
        Map<String, Double> metrics = new HashMap<>();
        metrics.put("overall_score", reports.stream()
                .mapToDouble(EvaluationReport::getOverallScore).average().orElse(0));
        metrics.put("pass_rate", reports.stream()
                .mapToDouble(r -> r.isPassed() ? 1.0 : 0.0).average().orElse(0));
        // Per-dimension averages; assumes every report shares the first
        // report's dimension keys, missing entries are skipped.
        if (!reports.isEmpty() && reports.get(0).getDimensionScores() != null) {
            for (String dim : reports.get(0).getDimensionScores().keySet()) {
                double avg = reports.stream()
                        .map(r -> r.getDimensionScores().get(dim))
                        .filter(Objects::nonNull)
                        .mapToDouble(DimensionScore::getScore)
                        .average().orElse(0);
                metrics.put("dim_" + dim, avg);
            }
        }
        // Quantiles via sorted-index lookup (nearest-rank approximation).
        List<Double> scores = reports.stream()
                .map(EvaluationReport::getOverallScore)
                .sorted()
                .collect(Collectors.toList());
        if (!scores.isEmpty()) {
            metrics.put("score_p25", scores.get((int) (scores.size() * 0.25)));
            metrics.put("score_p50", scores.get((int) (scores.size() * 0.50)));
            metrics.put("score_p75", scores.get((int) (scores.size() * 0.75)));
        }
        return metrics;
    }

    /**
     * Short, stable fingerprint of the prompt text: first 12 chars of the
     * Base64-encoded SHA-256 digest. Used to trace which prompt a run used.
     */
    private String computePromptHash(String prompt) {
        try {
            MessageDigest md = MessageDigest.getInstance("SHA-256");
            byte[] hash = md.digest(prompt.getBytes(StandardCharsets.UTF_8));
            return Base64.getEncoder().encodeToString(hash).substring(0, 12);
        } catch (Exception e) {
            // SHA-256 is guaranteed on every JVM; this is a defensive fallback.
            return "unknown";
        }
    }

    /** Writes the prompt to a temp file (UTF-8) and returns its path. */
    private String savePromptToTempFile(String promptContent) throws IOException {
        Path tempFile = Files.createTempFile("prompt_", ".txt");
        Files.writeString(tempFile, promptContent);
        return tempFile.toString();
    }

    /** Serializes the reports to a temp JSON file and returns its path. */
    private String saveReportsToFile(List<EvaluationReport> reports) throws IOException {
        Path tempFile = Files.createTempFile("eval_reports_", ".json");
        Files.writeString(tempFile, REPORT_MAPPER.writeValueAsString(reports));
        return tempFile.toString();
    }

    /** Actual LLM invocation — omitted in the article. */
    private String callLlm(ExperimentConfig config, String question) {
        return ""; // placeholder
    }
}
实验对比和最优版本选择
/**
 * Experiment comparison service.
 *
 * Queries MLflow for the runs of one experiment and ranks them by a metric,
 * so the best prompt/model configuration can be identified and promoted.
 */
@Service
@RequiredArgsConstructor
public class ExperimentComparisonService {
    private final MlflowClient mlflow;

    /**
     * Finds the top-K finished runs of an experiment, ordered by a metric.
     *
     * @param experimentName experiment to search in
     * @param primaryMetric  metric to rank by (e.g. "overall_score")
     * @param topK           number of runs to return
     * @return run summaries (runId, params, metrics), best first
     */
    public List<RunSummary> findBestRuns(String experimentName,
                                         String primaryMetric,
                                         int topK) {
        String experimentId = mlflow.getOrCreateExperiment(experimentName);
        // Request body for POST {trackingUri}/api/2.0/mlflow/runs/search.
        // Only FINISHED runs are considered — failed runs have no valid metrics.
        Map<String, Object> body = Map.of(
                "experiment_ids", List.of(experimentId),
                "filter", "attributes.status = 'FINISHED'",
                "order_by", List.of("metrics." + primaryMetric + " DESC"),
                "max_results", topK
        );
        // TODO: issue the HTTP call (omitted in the article) and map the
        // response into RunSummary objects.
        return new ArrayList<>(); // placeholder
    }
}
工程经验
MLflow在LLM项目的几个最佳实践:
- 始终记录Prompt内容的哈希,方便后续追溯"这个Run用的哪个Prompt版本"
- 数据集也要版本化,用数据集名称+版本号作为参数记录,避免同名数据集不同内容
- 定期清理实验:MLflow的存储会增长,设置保留策略(比如只保留最近30天的Run)
- 用Run的Tag标注实验类型:
is_production_eval=true的Run用于生产决策,其他为探索性实验
