第2208篇:视频内容的AI分析——关键帧提取和内容摘要的工程实践
2026/4/30 · 大约 6 分钟
第2208篇:视频内容的AI分析——关键帧提取和内容摘要的工程实践
适读人群:需要处理视频内容的Java工程师 | 阅读时长:约15分钟 | 核心价值:视频内容AI分析的完整工程方案,从关键帧提取到视频内容摘要生成
视频内容分析比图片分析多了一个维度:时间。
一个10分钟的视频,直接喂给VLM肯定不行——不可能把每帧都发过去。工程上需要解决的核心问题是:怎么从视频里提取最有代表性的帧,然后让AI理解这些帧组合起来的语义。
我做过几个视频分析项目:短视频的内容标签提取、教育视频的自动章节划分、监控视频的事件检测。这几个场景的共性技术是关键帧提取,差异在后续的分析逻辑。
这篇文章重点讲关键帧提取和内容摘要这两个最通用的能力。
一、关键帧提取的几种策略
策略1:固定时间间隔采样
最简单,每隔N秒取一帧。适合内容均匀的视频(如演讲录像),不适合内容变化快的视频。
策略2:基于场景变化检测
计算相邻帧的像素差异,差异超过阈值时认为场景发生了变化,在变化点提取关键帧。适合场景切换明显的视频(如新闻、剪辑视频)。
策略3:基于运动检测
用光流算法检测运动量,在运动变化点提取关键帧。适合监控视频、体育视频等运动相关场景。
策略4:基于内容多样性(最实用)
提取候选帧后,用图像相似度算法(感知哈希或CLIP向量)去除重复帧,保留内容多样的帧集合。
@Component
public class KeyFrameExtractor {

    private static final Logger log = LoggerFactory.getLogger(KeyFrameExtractor.class);

    /** Total number of bits in the 8x8 perceptual hash (was a magic 64 in the dedup math). */
    private static final int HASH_BITS = 64;

    /**
     * 从视频中提取关键帧
     * 使用场景变化检测 + 内容多样性过滤
     *
     * @param videoPath 视频文件路径
     * @param config    采样间隔、场景变化阈值、最大帧数与相似度阈值
     * @return 按时间顺序排列、已去除近似重复内容的关键帧列表
     * @throws VideoProcessingException 视频无法打开或元数据无效时抛出
     */
    public List<KeyFrame> extractKeyFrames(String videoPath, ExtractionConfig config) {
        List<KeyFrame> keyFrames = new ArrayList<>();
        VideoCapture capture = new VideoCapture(videoPath);
        if (!capture.isOpened()) {
            throw new VideoProcessingException("无法打开视频文件: " + videoPath);
        }
        try {
            double fps = capture.get(Videoio.CAP_PROP_FPS);
            // BUGFIX: some containers report FPS=0; without this guard the duration/timestamp
            // computations below divide by zero (Infinity/NaN timestamps).
            if (fps <= 0) {
                throw new VideoProcessingException("无法打开视频文件: " + videoPath);
            }
            int totalFrames = (int) capture.get(Videoio.CAP_PROP_FRAME_COUNT);
            double durationSeconds = totalFrames / fps;
            log.info("视频信息: FPS={}, 总帧数={}, 时长={}秒", fps, totalFrames, durationSeconds);

            Mat previousFrame = null;
            Mat currentFrame = new Mat();
            int frameIndex = 0;
            // BUGFIX: for low-FPS video or a sub-frame sampling interval, (int)(fps * interval)
            // truncates to 0 and "frameIndex % sampleInterval" throws ArithmeticException.
            int sampleInterval = Math.max(1, (int) (fps * config.sampleIntervalSeconds()));
            try {
                while (capture.read(currentFrame)) {
                    if (frameIndex % sampleInterval == 0) {
                        double timestamp = frameIndex / fps;
                        // 检测场景变化(第一帧总是保留)
                        if (previousFrame == null ||
                                isSceneChange(previousFrame, currentFrame, config.sceneChangeThreshold())) {
                            // 转为字节数组
                            MatOfByte mob = new MatOfByte();
                            Imgcodecs.imencode(".jpg", currentFrame, mob,
                                    new MatOfInt(Imgcodecs.IMWRITE_JPEG_QUALITY, 85));
                            keyFrames.add(new KeyFrame(
                                    frameIndex, timestamp, mob.toArray(),
                                    calculatePerceptualHash(currentFrame)));
                            mob.release();
                            // BUGFIX: release the superseded clone — OpenCV Mats wrap native
                            // memory that the JVM GC does not reclaim promptly.
                            if (previousFrame != null) {
                                previousFrame.release();
                            }
                            previousFrame = currentFrame.clone();
                        }
                    }
                    frameIndex++;
                    // 最多提取config.maxFrames帧
                    if (keyFrames.size() >= config.maxFrames()) break;
                }
            } finally {
                if (previousFrame != null) {
                    previousFrame.release();
                }
                currentFrame.release();
            }
            // 内容多样性过滤:去除相似度过高的帧
            return deduplicateByContent(keyFrames, config.similarityThreshold());
        } finally {
            capture.release();
        }
    }

    /**
     * 检测场景变化(基于帧差异)
     * 将两帧转灰度后做绝对差,用差值图的均值与阈值比较。
     */
    private boolean isSceneChange(Mat frame1, Mat frame2, double threshold) {
        Mat gray1 = new Mat();
        Mat gray2 = new Mat();
        Mat diff = new Mat();
        MatOfDouble mean = new MatOfDouble();
        MatOfDouble stddev = new MatOfDouble();
        try {
            Imgproc.cvtColor(frame1, gray1, Imgproc.COLOR_BGR2GRAY);
            Imgproc.cvtColor(frame2, gray2, Imgproc.COLOR_BGR2GRAY);
            // 计算帧差
            Core.absdiff(gray1, gray2, diff);
            // 计算均方差
            Core.meanStdDev(diff, mean, stddev);
            double meanDiff = mean.toArray()[0];
            return meanDiff > threshold; // threshold通常设为15-25
        } finally {
            // BUGFIX: release intermediate Mats to avoid native-memory leaks
            // (this method runs once per sampled frame).
            gray1.release();
            gray2.release();
            diff.release();
            mean.release();
            stddev.release();
        }
    }

    /**
     * 计算感知哈希(用于内容相似性判断)
     * 缩放为8x8灰度图,逐像素与均值比较生成64位的"0/1"字符串。
     */
    private String calculatePerceptualHash(Mat frame) {
        Mat small = new Mat();
        Mat gray = new Mat();
        try {
            // 缩放为8x8
            Imgproc.resize(frame, small, new Size(8, 8));
            // 转灰度
            Imgproc.cvtColor(small, gray, Imgproc.COLOR_BGR2GRAY);
            // 计算平均值
            Scalar mean = Core.mean(gray);
            double avgValue = mean.val[0];
            // 生成哈希字符串(每个像素大于平均值为1,否则为0)
            StringBuilder hash = new StringBuilder(HASH_BITS);
            for (int r = 0; r < gray.rows(); r++) {
                for (int c = 0; c < gray.cols(); c++) {
                    hash.append(gray.get(r, c)[0] >= avgValue ? "1" : "0");
                }
            }
            return hash.toString();
        } finally {
            // BUGFIX: release temporary Mats (native memory).
            small.release();
            gray.release();
        }
    }

    /**
     * 计算两个感知哈希的汉明距离
     * 按位比较,长度不同时只比较公共前缀(本类生成的哈希固定为64位)。
     */
    private int hammingDistance(String hash1, String hash2) {
        int distance = 0;
        for (int i = 0; i < Math.min(hash1.length(), hash2.length()); i++) {
            if (hash1.charAt(i) != hash2.charAt(i)) distance++;
        }
        return distance;
    }

    /**
     * 去除相似帧,保留内容多样的关键帧集合
     * 贪心策略:按时间顺序遍历,与任一已保留帧的汉明距离过小的帧被丢弃。
     */
    private List<KeyFrame> deduplicateByContent(List<KeyFrame> frames, double threshold) {
        List<KeyFrame> unique = new ArrayList<>();
        for (KeyFrame frame : frames) {
            boolean isSimilarToExisting = unique.stream()
                    .anyMatch(existing -> {
                        int dist = hammingDistance(frame.hash(), existing.hash());
                        // threshold=0.9 时允许的最大差异位数约为 64*0.1=6
                        return dist < (int) (HASH_BITS * (1 - threshold));
                    });
            if (!isSimilarToExisting) {
                unique.add(frame);
            }
        }
        return unique;
    }

    /**
     * 一个关键帧:帧序号、时间戳(秒)、JPEG字节与感知哈希。
     */
    public record KeyFrame(int frameIndex, double timestamp, byte[] imageBytes, String hash) {
        /** 返回 "MM:SS" 形式的时间戳字符串。 */
        public String getTimestampString() {
            int minutes = (int) (timestamp / 60);
            int seconds = (int) (timestamp % 60);
            return String.format("%02d:%02d", minutes, seconds);
        }
    }

    /**
     * 关键帧提取配置。
     */
    public record ExtractionConfig(
            double sampleIntervalSeconds, // 采样间隔(秒)
            double sceneChangeThreshold,  // 场景变化阈值
            int maxFrames,                // 最大帧数
            double similarityThreshold    // 相似度阈值(0-1)
    ) {
        /** 通用配置:每3秒采样,最多50帧。 */
        public static ExtractionConfig standard() {
            return new ExtractionConfig(3.0, 20.0, 50, 0.9);
        }

        /** 精细配置:每1秒采样,最多100帧,阈值更敏感。 */
        public static ExtractionConfig detailed() {
            return new ExtractionConfig(1.0, 15.0, 100, 0.85);
        }
    }
}
二、视频内容摘要生成
有了关键帧,用VLM做内容摘要:
@Service
public class VideoContentSummarizer {

    // BUGFIX: the original used "log" in summarize() without declaring a Logger — did not compile.
    private static final Logger log = LoggerFactory.getLogger(VideoContentSummarizer.class);

    // Jackson's ObjectMapper is thread-safe and expensive to construct;
    // share one instance instead of creating one per parse call.
    private static final ObjectMapper MAPPER = new ObjectMapper();

    private final VisionService visionService;
    private final KeyFrameExtractor keyFrameExtractor;

    // BUGFIX: the final fields were never initialized (no constructor and no Lombok
    // annotation in sight) — did not compile. Spring injects both dependencies here.
    public VideoContentSummarizer(VisionService visionService, KeyFrameExtractor keyFrameExtractor) {
        this.visionService = visionService;
        this.keyFrameExtractor = keyFrameExtractor;
    }

    /**
     * 生成视频内容摘要
     * 流程:提取关键帧 -> 超量时均匀采样 -> 构建多图VLM请求 -> 解析JSON响应。
     *
     * @param videoPath 视频文件路径
     * @param config    VLM单次可接受的最大帧数
     * @return 结构化的视频摘要;VLM响应解析失败时返回降级的占位摘要
     */
    public VideoSummary summarize(String videoPath, SummaryConfig config) {
        // 1. 提取关键帧
        List<KeyFrameExtractor.KeyFrame> keyFrames = keyFrameExtractor.extractKeyFrames(
                videoPath, KeyFrameExtractor.ExtractionConfig.standard());
        log.info("提取关键帧完成,共{}帧", keyFrames.size());

        // 2. 如果帧数过多,按时间均匀采样减少帧数
        if (keyFrames.size() > config.maxFramesForVLM()) {
            keyFrames = uniformSample(keyFrames, config.maxFramesForVLM());
        }

        // 3. 构建多图VLM请求
        List<ImageInput> images = keyFrames.stream()
                .map(f -> ImageInput.fromBytes(f.imageBytes(), "image/jpeg"))
                .collect(Collectors.toList());

        // 构建时间戳提示,让模型能把每张图对应到时间轴
        String timestampList = keyFrames.stream()
                .map(f -> "[" + f.getTimestampString() + "]")
                .collect(Collectors.joining(" "));

        String prompt = String.format("""
                以下%d张图片是一个视频的关键帧,对应时间点:%s
                请根据这些关键帧,生成视频内容分析报告(JSON格式):
                {
                "title": "视频标题猜测",
                "duration": "视频时长估计",
                "mainTopics": ["主题1", "主题2"],
                "summary": "100-200字的内容摘要",
                "keyMoments": [
                {
                "timestamp": "时间点",
                "description": "该时间点发生的内容"
                }
                ],
                "tags": ["内容标签列表"],
                "category": "视频类别(教程/新闻/娱乐/体育/教育等)",
                "targetAudience": "目标观众群体"
                }
                只返回JSON。
                """, keyFrames.size(), timestampList);

        VisionRequest request = VisionRequest.builder()
                .images(images)
                .prompt(prompt)
                .build();
        String response = visionService.analyzeImage(request).getContent();
        return parseVideoSummary(response, keyFrames);
    }

    /**
     * 按时间均匀采样,把帧数压缩到 targetCount。
     * 保持原有时间顺序;帧数不超过目标时原样返回。
     */
    private List<KeyFrameExtractor.KeyFrame> uniformSample(
            List<KeyFrameExtractor.KeyFrame> frames, int targetCount) {
        if (frames.size() <= targetCount) return frames;
        List<KeyFrameExtractor.KeyFrame> sampled = new ArrayList<>();
        double step = (double) frames.size() / targetCount;
        for (int i = 0; i < targetCount; i++) {
            int index = (int) (i * step);
            sampled.add(frames.get(Math.min(index, frames.size() - 1)));
        }
        return sampled;
    }

    /**
     * 解析VLM返回的JSON摘要。
     * 先剥掉可能的Markdown代码围栏,再按字段逐个提取;解析失败时降级为占位摘要。
     */
    private VideoSummary parseVideoSummary(String response,
                                           List<KeyFrameExtractor.KeyFrame> keyFrames) {
        try {
            String cleanJson = response.replaceAll("```json\\s*", "")
                    .replaceAll("```\\s*", "").trim();
            JsonNode root = MAPPER.readTree(cleanJson);
            List<String> topics = new ArrayList<>();
            if (root.has("mainTopics")) root.get("mainTopics").forEach(t -> topics.add(t.asText()));
            List<String> tags = new ArrayList<>();
            if (root.has("tags")) root.get("tags").forEach(t -> tags.add(t.asText()));
            return new VideoSummary(
                    root.has("title") ? root.get("title").asText() : "未知",
                    root.has("summary") ? root.get("summary").asText() : "",
                    topics, tags,
                    root.has("category") ? root.get("category").asText() : "未知",
                    keyFrames.size());
        } catch (Exception e) {
            // BUGFIX: was a silent swallow — keep the graceful degradation,
            // but record why the model response could not be parsed.
            log.warn("视频摘要解析失败", e);
            return new VideoSummary("解析失败", "视频内容摘要生成失败",
                    List.of(), List.of(), "未知", keyFrames.size());
        }
    }

    /** 视频摘要结果。 */
    public record VideoSummary(String title, String summary, List<String> mainTopics,
                               List<String> tags, String category, int keyFrameCount) {}

    /** 摘要生成配置:单次VLM请求允许的最大帧数。 */
    public record SummaryConfig(int maxFramesForVLM) {
        public static SummaryConfig standard() { return new SummaryConfig(12); }
    }
}
三、视频审核的特殊挑战
视频审核比图片审核复杂,因为违规内容可能只出现在视频的某几秒:
@Service
public class VideoContentModerator {

    private final KeyFrameExtractor extractor;
    private final ImageContentClassifier imageClassifier;

    // BUGFIX: the final fields were never initialized (no constructor and no Lombok
    // annotation in sight) — did not compile. Spring injects both dependencies here.
    public VideoContentModerator(KeyFrameExtractor extractor, ImageContentClassifier imageClassifier) {
        this.extractor = extractor;
        this.imageClassifier = imageClassifier;
    }

    /**
     * 视频内容审核:检测是否有违规帧
     * 逐帧分类并记录超过风险阈值的片段,按全片最高风险分给出结论:
     * REJECT(>0.8)/ MANUAL_REVIEW(>0.4)/ PASS。
     *
     * @param videoPath 视频文件路径
     * @return 审核结论、最高风险分、违规片段列表与实际分析帧数
     */
    public VideoModerationResult moderateVideo(String videoPath) {
        // 用更密集的采样(每1秒一帧),确保不遗漏短暂出现的违规内容
        List<KeyFrameExtractor.KeyFrame> frames = extractor.extractKeyFrames(videoPath,
                new KeyFrameExtractor.ExtractionConfig(1.0, 5.0, 300, 0.95));

        List<ViolationSegment> violations = new ArrayList<>();
        double maxRiskScore = 0;
        for (KeyFrameExtractor.KeyFrame frame : frames) {
            ImageContentClassifier.ImageClassificationResult result =
                    imageClassifier.classify(frame.imageBytes());
            // 取三个风险维度中的最大值作为该帧的风险分
            double frameRiskScore = Math.max(result.pornScore(),
                    Math.max(result.violenceScore(), result.terrorismScore()));
            maxRiskScore = Math.max(maxRiskScore, frameRiskScore);
            if (frameRiskScore > 0.5) {
                violations.add(new ViolationSegment(
                        frame.getTimestampString(),
                        result.getHighestRiskCategory(),
                        frameRiskScore));
            }
        }

        String verdict = maxRiskScore > 0.8 ? "REJECT"
                : maxRiskScore > 0.4 ? "MANUAL_REVIEW" : "PASS";
        return new VideoModerationResult(verdict, maxRiskScore, violations, frames.size());
    }

    /** 单个违规片段:出现时间、违规类型与置信度。 */
    public record ViolationSegment(String timestamp, String violationType, double confidence) {}

    /** 整段视频的审核结果。 */
    public record VideoModerationResult(String verdict, double maxRiskScore,
                                        List<ViolationSegment> violations, int framesAnalyzed) {}
}
四、工程部署注意事项
视频处理的资源消耗
视频处理是计算密集型任务:
- 一个1小时视频(1080p, 30fps),帧提取约需30-60秒(CPU)
- 关键帧提取后通常得到50-200帧
- VLM分析200帧,按每次10帧计算,需要20次API调用
建议视频处理任务放在独立的Worker服务,使用消息队列削峰,不要和在线API服务混部。
大文件处理
对于几GB的大视频,不要把整个文件加载到内存。OpenCV的VideoCapture支持流式读取,每次只加载当前帧,内存占用可以控制在合理范围(通常<200MB)。
