第2228篇:多模态AI在教育领域的落地——图文题目的自动批改
2026/4/30大约 9 分钟
第2228篇:多模态AI在教育领域的落地——图文题目的自动批改
适读人群:做教育科技、智能批改系统的工程师 | 阅读时长:约16分钟 | 核心价值:构建支持图文题目的AI自动批改系统,处理数学公式、几何图形、手写答案等复杂场景
教育行业有个很有意思的特点:这里有最多的"手写图片 + 复杂结构化内容"。
一道数学几何题的答案,可能包含:手写的推导步骤、画出来的辅助线、几何证明、数值计算——每一个部分都需要理解,而且互相关联。
我们给一家K12教育公司做智能批改系统,初期只做了文字题目的批改,几何大题直接跳过让老师批。后来用户反馈,跳过的大题恰恰是最耗时的,老师每天批改几十份试卷,最累的就是那些有图的综合题。
这推动我们做了多模态批改,这篇文章把核心工程实现讲透。
教育场景多模态批改的挑战
批改系统架构
/**
* 智能批改系统架构核心
* 支持图文混合题目的多模态批改
*/
@Service
@Slf4j
public class AutoGradingOrchestrator {
// Collaborators below are supplied through Spring field injection.

// Called first in gradeAnswer to turn the raw answer image bytes into a ProcessedAnswer.
@Autowired
private AnswerImagePreprocessor imagePreprocessor;
// Not referenced by the methods visible in this excerpt; presumably used by the
// per-type graders that are not shown here — TODO confirm.
@Autowired
private HandwritingRecognizer handwritingRecognizer;
// Extracts structured math content (steps, final answer) from the processed image
// in gradeMathCalculation.
@Autowired
private MathFormulaProcessor mathFormulaProcessor;
// Not referenced by the methods visible in this excerpt — TODO confirm where it is used.
@Autowired
private GeometryAnswerAnalyzer geometryAnalyzer;
// Multimodal engine: grades geometry proofs and is the fallback for unlisted question types.
@Autowired
private MultimodalGradingEngine gradingEngine;
// Produces the feedback text attached to every GradingResult in gradeAnswer.
@Autowired
private FeedbackGenerator feedbackGenerator;
/**
 * Grades one student's answer to a single question.
 *
 * <p>The answer image is preprocessed, the question is handed to the grader that matches
 * its type, and the generated feedback is stored on the result before it is returned.
 *
 * @param request grading request carrying the question, the answer image bytes, the
 *                optional typed answer text and the student profile
 * @return the grading result with its feedback field set
 */
public GradingResult gradeAnswer(GradingRequest request) {
    log.info("开始批改: questionId={}, studentId={}",
            request.getQuestionId(), request.getStudentId());

    final Question question = request.getQuestion();
    final byte[] rawImage = request.getAnswerImageBytes();
    // May be empty when the answer was handed in as an image only.
    final String typedAnswer = request.getAnswerText();

    // Step 1: image preprocessing.
    final ProcessedAnswer prepared = imagePreprocessor.process(rawImage);

    // Step 2: type-specific grading.
    final GradingResult graded = routeByQuestionType(question, prepared, typedAnswer);

    // Step 3: personalised feedback.
    graded.setFeedback(
            feedbackGenerator.generateFeedback(graded, question, request.getStudentProfile()));

    log.info("批改完成: questionId={}, score={}/{}",
            request.getQuestionId(), graded.getScore(), graded.getMaxScore());
    return graded;
}

/**
 * Dispatches to the grader for the question's type; any type without a dedicated grader
 * is handled by the generic multimodal engine.
 */
private GradingResult routeByQuestionType(Question question, ProcessedAnswer prepared,
                                          String typedAnswer) {
    switch (question.getQuestionType()) {
        case MULTIPLE_CHOICE:
            return gradeMultipleChoice(prepared, question);
        case FILL_IN_BLANK:
            return gradeFillInBlank(prepared, typedAnswer, question);
        case SHORT_ANSWER:
            return gradeShortAnswer(prepared, typedAnswer, question);
        case MATH_CALCULATION:
            return gradeMathCalculation(prepared, question);
        case GEOMETRY_PROOF:
            return gradeGeometryProof(prepared, question);
        case ESSAY:
            return gradeEssay(prepared, typedAnswer, question);
        default:
            return gradingEngine.gradeWithMultimodal(prepared, question);
    }
}
/**
 * Grades a math calculation answer step by step.
 *
 * <p>The math content is extracted from the processed answer image, the student's steps are
 * compared with the question's reference steps, and the score is derived from that
 * comparison together with the question's score breakdown.
 *
 * @param processedAnswer preprocessed answer image
 * @param question        question supplying reference steps, score breakdown and max score
 * @return result carrying the score, per-step results and step counts
 */
private GradingResult gradeMathCalculation(ProcessedAnswer processedAnswer,
                                           Question question) {
    // Recognise the handwritten math in the answer image.
    final byte[] answerImage = processedAnswer.getProcessedImageBytes();
    final MathContent recognized = mathFormulaProcessor.extractMathContent(answerImage);

    // Steps as written by the student.
    final List<SolutionStep> writtenSteps = recognized.getSolutionSteps();

    // Compare against the reference solution.
    final StepByStepGradingResult comparison =
            gradeStepByStep(writtenSteps, question.getSolutionSteps());

    // Convert the comparison into points.
    final int awarded = calculateStepScore(comparison, question.getScoreBreakdown());

    return GradingResult.builder()
            .score(awarded)
            .maxScore(question.getMaxScore())
            .stepResults(comparison.getStepResults())
            .correctSteps(comparison.getCorrectStepCount())
            .totalSteps(writtenSteps.size())
            .build();
}
/**
 * Grades a geometry proof by delegating the processed answer image and the question to the
 * multimodal grading engine.
 *
 * @param processedAnswer preprocessed answer image
 * @param question        the geometry question being graded
 * @return the result produced by the multimodal engine
 */
private GradingResult gradeGeometryProof(ProcessedAnswer processedAnswer,
                                         Question question) {
    final byte[] answerImage = processedAnswer.getProcessedImageBytes();
    return gradingEngine.gradeGeometryWithMultimodal(answerImage, question);
}
}
数学公式识别与处理
/**
* 数学公式和步骤识别处理器
* 将手写数学内容转化为可计算的结构化表示
*/
@Service
@Slf4j
public class MathFormulaProcessor {
// LLM client used for the multimodal extraction call and the text-only equivalence check.
@Autowired
private OpenAiClient openAiClient;
/**
 * Prompt sent together with the answer image in extractMathContent.
 * It asks the model to return JSON with a "steps" array, a "finalAnswer" object and a
 * "confidence" number; extractMathContent reads exactly those keys, so the key names here
 * and in the parser must stay in sync.
 */
private static final String MATH_EXTRACTION_PROMPT = """
请识别并提取这张图片中的数学解题过程。
要求:
1. 识别每一个解题步骤(标注步骤编号)
2. 每步包含:
- 操作类型(等式变换/代入/化简/得出结论等)
- 数学表达式(用LaTeX格式)
- 结果或中间值
3. 识别最终答案
4. 如果有手写文字说明,也一并提取
输出JSON格式:
{
"steps": [
{
"stepNumber": 1,
"operationType": "设定变量",
"expression": "设 x = ...",
"latex": "x = ...",
"result": null,
"notes": "解题说明"
}
],
"finalAnswer": {
"value": "最终答案",
"latex": "LaTeX表示",
"unit": "单位(如有)"
},
"confidence": 0.9
}
""";
/**
 * Extracts the handwritten solution steps and the final answer from an answer image.
 *
 * <p>The image is sent to the multimodal model with {@code MATH_EXTRACTION_PROMPT}; the JSON
 * reply is parsed leniently: a field the model left out falls back to its default instead
 * of discarding the whole extraction.
 *
 * @param imageBytes answer image bytes (sent to the model as {@code image/jpeg})
 * @return the parsed math content, or {@code MathContent.empty()} when the image is missing
 *         or the reply cannot be parsed
 */
public MathContent extractMathContent(byte[] imageBytes) {
    // Nothing to recognise: skip the model call instead of failing inside the encoder.
    if (imageBytes == null || imageBytes.length == 0) {
        log.warn("数学内容提取跳过: 答案图片为空");
        return MathContent.empty();
    }

    String base64 = Base64.getEncoder().encodeToString(imageBytes);
    String response = openAiClient.chatMultimodal(
            MATH_EXTRACTION_PROMPT, base64, "image/jpeg",
            ChatOptions.builder().temperature(0.0).maxTokens(2000).build());
    try {
        // Strip Markdown code fences the model may wrap around the JSON.
        String cleaned = response.replaceAll("```json\\s*", "").replaceAll("```\\s*", "").trim();
        JsonNode node = new ObjectMapper().readTree(cleaned);

        // path() yields a "missing" node instead of null, so the asXxx(default) fallbacks
        // below actually apply when the model omits a key.
        List<SolutionStep> steps = new ArrayList<>();
        for (JsonNode stepNode : node.path("steps")) {
            steps.add(SolutionStep.builder()
                    // Fall back to positional numbering when the model omits the number.
                    .stepNumber(stepNode.path("stepNumber").asInt(steps.size() + 1))
                    .operationType(stepNode.path("operationType").asText(""))
                    .expression(stepNode.path("expression").asText(""))
                    .latex(stepNode.path("latex").asText(""))
                    .notes(stepNode.path("notes").asText(null))
                    .build());
        }

        String finalAnswerStr = node.path("finalAnswer").path("value").asText("");

        return MathContent.builder()
                .solutionSteps(steps)
                .finalAnswer(finalAnswerStr)
                .confidence(node.path("confidence").asDouble(0.8))
                .build();
    } catch (Exception e) {
        // Boundary catch: a malformed or null model reply must not abort grading.
        log.error("数学内容提取失败", e);
        return MathContent.empty();
    }
}
/**
 * Decides whether a student's answer is equivalent to the reference answer.
 *
 * <p>Checks run from cheapest to most expensive:
 * <ol>
 *   <li>exact match after normalisation (whitespace removed, lower-cased);</li>
 *   <li>numeric match within the context's tolerance (default {@code 1e-6}), ignoring
 *       {@code ,} thousands separators;</li>
 *   <li>LLM-judged algebraic equivalence, only when the context allows it.</li>
 * </ol>
 * Forms such as {@code "x=3"} versus {@code "3"} are not plain numbers, so they can only
 * match through step 3.
 *
 * @param studentAnswer answer written by the student, may be {@code null}
 * @param correctAnswer reference answer, may be {@code null}
 * @param context       grading context; {@code null} means default tolerance and no
 *                      algebraic check
 * @return {@code true} if any of the checks considers the answers equivalent
 */
public boolean areAnswersEquivalent(String studentAnswer, String correctAnswer,
                                    AnswerContext context) {
    // 1. Direct comparison of the normalised strings.
    String normalizedStudent = normalizeAnswer(studentAnswer);
    String normalizedCorrect = normalizeAnswer(correctAnswer);
    if (normalizedStudent.equals(normalizedCorrect)) {
        return true;
    }

    // 2. Numeric comparison, tolerant of different precision ("3" vs "3.0").
    try {
        double studentVal = Double.parseDouble(normalizedStudent.replace(",", ""));
        double correctVal = Double.parseDouble(normalizedCorrect.replace(",", ""));
        double tolerance = (context != null && context.getTolerance() != null)
                ? context.getTolerance() : 1e-6;
        // Inclusive bound: a configured tolerance of 0 must still accept equal values.
        if (Math.abs(studentVal - correctVal) <= tolerance) {
            return true;
        }
    } catch (NumberFormatException ignored) {
        // At least one answer is not a plain number; fall through to the algebraic check.
    }

    // 3. Algebraic equivalence judged by the LLM, if the context opts in.
    if (context != null && context.isAllowAlgebraicEquivalence()) {
        return checkAlgebraicEquivalence(studentAnswer, correctAnswer, context);
    }
    return false;
}
/**
 * Asks the LLM whether two expressions are mathematically equivalent,
 * e.g. {@code 2(x+3)}, {@code 2x+6} and {@code 6+2x}.
 *
 * <p>The prompt requests a leading verdict ("等价" or "不等价") followed by an optional
 * one-sentence reason. The leading verdict decides the outcome, so a reason that happens
 * to mention "不等价" cannot overturn an "等价" verdict. Replies that do not start with a
 * verdict fall back to a substring check.
 *
 * @param studentAnswer the student's expression as written
 * @param correctAnswer the reference expression
 * @param context       supplies the subject used to frame the question
 * @return {@code true} only if the model judged the expressions equivalent; an empty or
 *         missing reply counts as not equivalent
 */
private boolean checkAlgebraicEquivalence(String studentAnswer, String correctAnswer,
                                          AnswerContext context) {
    String prompt = String.format("""
            在%s的上下文中,判断以下两个数学表达式是否等价:
            学生答案:%s
            正确答案:%s
            要求:
            - 不考虑形式上的差异,只判断数学意义上是否等价
            - 简单回答:等价 或 不等价
            - 如果等价,说明原因(一句话)
            """, context.getSubject(), studentAnswer, correctAnswer);
    String result = openAiClient.chat(prompt,
            ChatOptions.builder().temperature(0.0).maxTokens(100).build());

    // No reply means no evidence of equivalence.
    String verdict = result == null ? "" : result.strip();
    if (verdict.startsWith("不等价")) {
        return false;
    }
    if (verdict.startsWith("等价")) {
        return true;
    }
    // Reply did not lead with the verdict; "不等价" contains "等价", so exclude it explicitly.
    return verdict.contains("等价") && !verdict.contains("不等价");
}
/**
 * Canonicalises an answer string for comparison: removes all whitespace, lower-cases it and
 * maps full-width parentheses (U+FF08 / U+FF09, common with Chinese input methods) to their
 * ASCII counterparts.
 *
 * @param answer raw answer text, may be {@code null}
 * @return the normalised text; the empty string for {@code null}
 */
private String normalizeAnswer(String answer) {
    if (answer == null) {
        return "";
    }
    return answer.trim()
            .replaceAll("\\s+", "")
            // Locale.ROOT keeps the result independent of the server's default locale.
            .toLowerCase(java.util.Locale.ROOT)
            // Written as escapes so the full-width characters survive any text processing
            // that folds them to ASCII (which would turn these calls into no-ops).
            .replace('\uFF08', '(')
            .replace('\uFF09', ')');
}
}
多模态批改引擎:几何题处理
/**
* 多模态批改引擎
* 对无法结构化处理的题目(几何证明、综合题)使用多模态模型直接批改
*/
@Service
@Slf4j
public class MultimodalGradingEngine {
// LLM client used for every multimodal chat call this engine makes (single- and multi-image).
@Autowired
private OpenAiClient openAiClient;
/**
 * Grades a geometry answer with the multimodal model.
 *
 * <p>When the question has a figure, the model receives the question image followed by the
 * answer image; otherwise only the answer image is sent. The reply is parsed by
 * {@code parseGradingResponse}.
 *
 * @param studentAnswerBytes the student's (preprocessed) answer image
 * @param question           question supplying subject, text, max score, scoring criteria
 *                           and the optional question image
 * @return the parsed grading result
 */
public GradingResult gradeGeometryWithMultimodal(byte[] studentAnswerBytes,
                                                 Question question) {
    String base64Answer = Base64.getEncoder().encodeToString(studentAnswerBytes);

    // Build the scoring rubric text.
    String scoringCriteria = buildScoringCriteria(question);
    String gradingPrompt = String.format("""
            你是一位专业的%s老师,正在批改学生的解题答案。
            题目:%s
            分值:%d分
            评分细则:
            %s
            请按以下要求批改:
            1. 识别学生的每个解题步骤
            2. 按评分细则逐项打分
            3. 指出错误所在(位置和原因)
            4. 给出总分和简要评语
            输出JSON格式:
            {
            "totalScore": 分数,
            "maxScore": %d,
            "stepGrades": [
            {
            "step": "步骤描述",
            "scoreAwarded": 分数,
            "maxScore": 分数,
            "isCorrect": true/false,
            "errorDescription": "错误说明(如有)"
            }
            ],
            "overallComment": "总体评语",
            "mainErrors": ["主要错误1", "主要错误2"]
            }
            """,
            question.getSubject(),
            question.getText(),
            question.getMaxScore(),
            scoringCriteria,
            question.getMaxScore());

    var options = ChatOptions.builder().temperature(0.0).maxTokens(1500).build();

    // Encode the question figure only after confirming it exists: encoding a null array
    // throws, which would make the answer-only path below unreachable.
    byte[] questionImageBytes = question.getQuestionImageBytes();
    String response;
    if (questionImageBytes != null && questionImageBytes.length > 0) {
        String base64Question = Base64.getEncoder().encodeToString(questionImageBytes);
        // Question figure first, then the student's answer.
        response = openAiClient.chatMultipleImages(gradingPrompt,
                Arrays.asList(
                        MultimodalImage.ofBase64(base64Question, "image/jpeg"),
                        MultimodalImage.ofBase64(base64Answer, "image/jpeg")
                ),
                options);
    } else {
        response = openAiClient.chatMultimodal(gradingPrompt, base64Answer, "image/jpeg",
                options);
    }
    return parseGradingResponse(response, question);
}
/**
 * Generic multimodal grading for question types without a dedicated grader.
 *
 * <p>Sends the processed answer image together with a prompt containing the question text,
 * the max score and the model answer, then parses the model's JSON reply.
 *
 * @param processedAnswer preprocessed answer image
 * @param question        question supplying text, max score and model answer
 * @return the parsed grading result
 */
public GradingResult gradeWithMultimodal(ProcessedAnswer processedAnswer,
                                         Question question) {
    final byte[] answerImage = processedAnswer.getProcessedImageBytes();
    final String encodedAnswer = Base64.getEncoder().encodeToString(answerImage);

    final String gradingPrompt = """
            请批改以下学生答案。
            题目:%s
            满分:%d分
            参考答案:%s
            请评分并解释理由,输出JSON:
            {
            "score": 得分,
            "maxScore": %d,
            "isCorrect": true/false,
            "feedback": "批改意见",
            "keyPoints": {
            "hit": ["答对的要点"],
            "missed": ["漏答的要点"]
            }
            }
            """.formatted(
            question.getText(),
            question.getMaxScore(),
            question.getModelAnswer(),
            question.getMaxScore());

    final String modelReply = openAiClient.chatMultimodal(
            gradingPrompt,
            encodedAnswer,
            "image/jpeg",
            ChatOptions.builder().temperature(0.0).maxTokens(800).build());

    return parseGradingResponse(modelReply, question);
}
/**
 * Parses the model's JSON grading reply into a {@link GradingResult}.
 *
 * <p>Handles both reply shapes used in this class: the geometry prompt
 * ({@code totalScore}, {@code stepGrades}, {@code overallComment}) and the generic prompt
 * ({@code score}, {@code feedback}). Model output is untrusted, so absent fields fall back
 * to defaults and the total score is clamped to {@code [0, maxScore]}.
 *
 * @param response raw model reply, possibly wrapped in Markdown code fences; may be null
 * @param question the graded question, supplies the max score
 * @return the parsed result, or {@code GradingResult.error(...)} when the reply is empty,
 *         has no score field or is not valid JSON
 */
private GradingResult parseGradingResponse(String response, Question question) {
    if (response == null || response.isBlank()) {
        log.error("批改结果解析失败: 模型返回为空");
        return GradingResult.error(question.getMaxScore(), "批改结果解析失败");
    }
    // Truncated copy for logging; computed once so the catch block cannot fail itself.
    String preview = response.substring(0, Math.min(200, response.length()));
    try {
        String cleaned = response.replaceAll("```json\\s*", "").replaceAll("```\\s*", "").trim();
        JsonNode node = new ObjectMapper().readTree(cleaned);

        JsonNode scoreNode = node.has("totalScore") ? node.get("totalScore") : node.get("score");
        if (scoreNode == null) {
            // Without any score the reply is unusable; report it rather than awarding 0.
            log.error("批改结果解析失败: 缺少得分字段, response={}", preview);
            return GradingResult.error(question.getMaxScore(), "批改结果解析失败");
        }
        int maxScore = question.getMaxScore();
        // Keep a hallucinated score from going below 0 or above the question's maximum.
        int score = Math.max(0, Math.min(scoreNode.asInt(0), maxScore));

        List<StepGrade> stepGrades = new ArrayList<>();
        // path() returns a "missing" node for absent keys, so one incomplete step entry
        // no longer discards the whole grading result.
        for (JsonNode sg : node.path("stepGrades")) {
            stepGrades.add(StepGrade.builder()
                    .step(sg.path("step").asText(""))
                    .scoreAwarded(sg.path("scoreAwarded").asInt(0))
                    .maxScore(sg.path("maxScore").asInt(0))
                    .isCorrect(sg.path("isCorrect").asBoolean(false))
                    .errorDescription(sg.path("errorDescription").asText(null))
                    .build());
        }

        // The generic prompt returns its comment under "feedback" instead.
        String overallComment = node.path("overallComment").asText("");
        if (overallComment.isEmpty()) {
            overallComment = node.path("feedback").asText("");
        }

        return GradingResult.builder()
                .score(score)
                .maxScore(maxScore)
                .stepGrades(stepGrades)
                .overallComment(overallComment)
                .build();
    } catch (Exception e) {
        // Boundary catch: malformed model output must not abort the grading request.
        log.error("批改结果解析失败: response={}", preview, e);
        return GradingResult.error(question.getMaxScore(), "批改结果解析失败");
    }
}
/**
 * Renders the question's scoring criteria as a bulleted list for the grading prompt.
 *
 * @param question question whose criteria are rendered
 * @return one "- description(points分)" line per criterion, or a generic instruction when
 *         the question defines no criteria (null or empty)
 */
private String buildScoringCriteria(Question question) {
    // An empty list would otherwise leave the rubric section of the prompt blank.
    if (question.getScoringCriteria() == null || question.getScoringCriteria().isEmpty()) {
        return "根据题目要求,答案正确给满分,部分正确酌情给分";
    }
    return question.getScoringCriteria().stream()
            .map(c -> "- " + c.getDescription() + "(" + c.getPoints() + "分)")
            .collect(Collectors.joining("\n"));
}
}
个性化反馈生成
/**
* 个性化批改反馈生成器
* 基于批改结果和学生画像生成有针对性的反馈
*/
@Service
@Slf4j
public class FeedbackGenerator {
// LLM client used to write the feedback text for answers that are not fully correct.
@Autowired
private OpenAiClient openAiClient;
/**
 * Generates feedback tailored to the student's mistakes and level.
 *
 * <p>A full-score result with no recorded errors gets a canned positive comment without an
 * LLM call. Otherwise the error descriptions of the incorrect steps (or, if there are
 * none, the overall comment) are sent to the LLM with instructions on tone and length.
 *
 * @param result         grading result; its step grades may be absent (the math
 *                       calculation path in the orchestrator does not set them)
 * @param question       the graded question (grade and question type feed the prompt)
 * @param studentProfile student profile, may be {@code null} (level defaults to "中等水平")
 * @return the feedback text
 */
public String generateFeedback(GradingResult result, Question question,
                               StudentProfile studentProfile) {
    // Not every grading path fills in step grades; treat a missing list as "no steps".
    List<StepGrade> stepGrades =
            result.getStepGrades() != null ? result.getStepGrades() : List.of();

    // Collect the error descriptions of the incorrect steps.
    List<String> errors = stepGrades.stream()
            .filter(sg -> !sg.isCorrect() && sg.getErrorDescription() != null)
            .map(StepGrade::getErrorDescription)
            .collect(Collectors.toList());

    if (errors.isEmpty() && result.getScore() == result.getMaxScore()) {
        return generatePositiveFeedback(question, studentProfile);
    }

    String studentLevel = studentProfile != null ?
            studentProfile.getLevel() : "中等水平";

    String mainErrors;
    if (!errors.isEmpty()) {
        mainErrors = String.join("\n- ", errors);
    } else if (result.getOverallComment() != null && !result.getOverallComment().isBlank()) {
        mainErrors = result.getOverallComment();
    } else {
        // Avoid rendering a literal "null" or an empty bullet into the prompt.
        mainErrors = "(未提供具体错误信息)";
    }

    String feedbackPrompt = String.format("""
            请为一位%s的%s学生写批改反馈。
            题目类型:%s
            得分:%d/%d
            主要错误:
            - %s
            要求:
            1. 用鼓励性语气,先肯定做对的部分
            2. 清楚指出错误所在(避免说"错了",而是说"这里可以这样理解...")
            3. 给出明确的改进方向(下次怎么做)
            4. 适合%s学生的语言难度
            5. 100-150字,简洁但有用
            """,
            studentLevel,
            question.getGrade(),
            question.getQuestionType().getDisplayName(),
            result.getScore(),
            result.getMaxScore(),
            mainErrors,
            studentLevel);
    return openAiClient.chat(feedbackPrompt,
            ChatOptions.builder().temperature(0.5).maxTokens(300).build());
}
/**
 * Picks one of a few canned praise messages at random for a fully correct answer.
 *
 * @param question the graded question (currently unused)
 * @param profile  the student profile (currently unused)
 * @return a randomly selected positive comment
 */
private String generatePositiveFeedback(Question question, StudentProfile profile) {
    List<String> positiveComments = List.of(
            "很好!解题过程清晰完整,逻辑严谨。",
            "答题规范,思路正确,全部得分!",
            "这道题回答得非常好,解题方法灵活准确。"
    );
    // ThreadLocalRandom avoids allocating and seeding a new Random on every call.
    int pick = java.util.concurrent.ThreadLocalRandom.current()
            .nextInt(positiveComments.size());
    return positiveComments.get(pick);
}
}
批改准确率的评估与提升
/**
* 批改系统评估框架
* 持续监控批改准确率,对比人工批改基准
*/
@Service
@Slf4j
public class GradingAccuracyEvaluator {
// The grading pipeline under evaluation; evaluate() re-grades every human-graded answer with it.
@Autowired
private AutoGradingOrchestrator autograder;
/**
 * Measures how closely automatic grading agrees with human grading.
 *
 * <p>Each human-graded answer is re-graded by the auto-grader. The report contains the
 * share of exact score matches, the share of scores within one point, and the mean
 * absolute score difference.
 *
 * @param humanGraded answers with their human-assigned scores; may be null or empty
 * @return the evaluation report; all rates are 0 when there are no samples
 */
public GradingEvaluationReport evaluate(List<HumanGradedAnswer> humanGraded) {
    int totalAnswers = humanGraded == null ? 0 : humanGraded.size();
    if (totalAnswers == 0) {
        // Dividing by zero below would put NaN rates into the report.
        log.warn("批改准确率评估: 没有可用的人工批改样本");
        return GradingEvaluationReport.builder()
                .totalAnswers(0)
                .exactMatchRate(0.0)
                .closeMatchRate(0.0)
                .averageScoreDifference(0.0)
                .build();
    }

    int exactMatchCount = 0; // AI score == human score
    int closeMatchCount = 0; // |AI score - human score| <= 1
    double totalScoreDiff = 0;
    for (HumanGradedAnswer hga : humanGraded) {
        GradingRequest request = buildGradingRequest(hga);
        GradingResult aiResult = autograder.gradeAnswer(request);
        int scoreDiff = Math.abs(aiResult.getScore() - hga.getHumanScore());
        totalScoreDiff += scoreDiff;
        if (scoreDiff == 0) {
            exactMatchCount++;
        }
        if (scoreDiff <= 1) {
            closeMatchCount++;
        }
    }

    double exactMatchRate = (double) exactMatchCount / totalAnswers;
    double closeMatchRate = (double) closeMatchCount / totalAnswers;
    double avgScoreDiff = totalScoreDiff / totalAnswers;

    // SLF4J only substitutes plain "{}" placeholders, so percentages are formatted first.
    log.info("批改准确率评估: exactMatch={}, closeMatch={}, avgDiff={}",
            String.format(java.util.Locale.ROOT, "%.1f%%", exactMatchRate * 100),
            String.format(java.util.Locale.ROOT, "%.1f%%", closeMatchRate * 100),
            avgScoreDiff);

    return GradingEvaluationReport.builder()
            .totalAnswers(totalAnswers)
            .exactMatchRate(exactMatchRate)
            .closeMatchRate(closeMatchRate)
            .averageScoreDifference(avgScoreDiff)
            .build();
}
/**
 * Builds the grading request for one human-graded sample by copying its question id,
 * student id, question and answer image. No answer text or student profile is set.
 *
 * @param hga the human-graded answer to re-grade
 * @return a request the auto-grader can process
 */
private GradingRequest buildGradingRequest(HumanGradedAnswer hga) {
    final GradingRequest regradeRequest = GradingRequest.builder()
            .questionId(hga.getQuestionId())
            .studentId(hga.getStudentId())
            .question(hga.getQuestion())
            .answerImageBytes(hga.getAnswerImageBytes())
            .build();
    return regradeRequest;
}
}
实践数据
我们系统上线后的核心数据:
- 选择填空题:AI准确率 98%,接近完美(这类题型规则明确)
- 数学计算题:AI准确率 87%(步骤分准确率),最终答案分准确率 94%
- 几何证明题:AI与人工评分的"相差1分内"吻合率 82%
- 作文题:与人工评分的皮尔逊相关系数 0.78(中等相关,这类题型主观性强)
关键发现:
- 数学题的主要错误在于手写数字识别混淆(如"7"被识别为"1"),预处理质量直接影响准确率。
- 几何题让模型同时看题目图和答案图,比只看答案图准确率高约15%。
- 对于复杂主观题,建议设计"AI初批 + 人工复核"流程,而非全自动批改。
- 学生年级越低(小学),手写越难识别,需要更多预处理。
