第2060篇:知识图谱增强RAG——让AI理解实体关系
大约 6 分钟
第2060篇:知识图谱增强RAG——让AI理解实体关系
适读人群:需要在RAG中处理复杂实体关系的工程师 | 阅读时长:约19分钟 | 核心价值:理解知识图谱如何弥补向量RAG的关系推理缺陷,掌握Graph RAG的实现思路
纯向量RAG有一个先天短板:关系推理。
比如用户问:"哪些产品和我们A客户签约的张总有关联?"
这个问题需要:知道张总是A客户的CEO,张总签过哪些合同,合同对应哪些产品。这是一系列的实体-关系推理,不是语义相似度能解决的。
知识图谱RAG(Graph RAG)就是专门解决这类问题的。
向量RAG vs 知识图谱RAG
知识图谱的数据结构
/**
 * Basic in-memory data structure for a knowledge graph: entities keyed by id
 * plus a flat list of directed, typed relations between them.
 *
 * NOTE: @Builder was removed from the outer class on purpose. Lombok's builder
 * ignores field initializers unless @Builder.Default is used, so builder-built
 * instances would carry null entity/relation collections; the @Data+@Builder
 * combination also suppresses the no-arg constructor that callers
 * (e.g. "new KnowledgeGraph()" in the extractor) rely on.
 */
@Data
public class KnowledgeGraph {

    /** Entity id -> entity; ids must be unique across the whole graph. */
    private final Map<String, Entity> entities = new HashMap<>();
    /** All relations; endpoints reference keys of the entity map. */
    private final List<Relation> relations = new ArrayList<>();

    @Data
    @Builder
    public static class Entity {
        private String id;
        private String name;
        private String type; // PERSON/COMPANY/PRODUCT/CONTRACT/etc
        private Map<String, String> properties;
    }

    @Data
    @Builder
    public static class Relation {
        private String fromEntityId;
        private String toEntityId;
        private String relationType; // BELONGS_TO/SIGNED_BY/ASSOCIATED_WITH/etc
        private Map<String, String> properties;
        private double confidence; // extraction confidence, nominally in [0,1]
    }

    /**
     * Returns every entity reachable from {@code entityId} within {@code hops}
     * relation steps (relations are treated as undirected), including the
     * start entity itself when it exists in the graph.
     *
     * Implemented as a level-by-level BFS. The previous DFS had two bugs: it
     * consumed one hop on the start entity (hops=2 only reached 1-hop
     * neighbours), and its shared visited-set could prune entities that were
     * still within budget via a shorter alternative path.
     */
    public List<Entity> findRelatedEntities(String entityId, int hops) {
        List<Entity> result = new ArrayList<>();
        Set<String> visited = new HashSet<>();
        visited.add(entityId);
        List<String> frontier = new ArrayList<>();
        frontier.add(entityId);
        for (int depth = 0; depth <= hops && !frontier.isEmpty(); depth++) {
            List<String> nextFrontier = new ArrayList<>();
            for (String currentId : frontier) {
                Entity entity = entities.get(currentId);
                if (entity != null) {
                    result.add(entity);
                }
                if (depth == hops) {
                    continue; // hop budget spent: collect but do not expand further
                }
                for (Relation r : relations) {
                    String nextId;
                    if (r.getFromEntityId().equals(currentId)) {
                        nextId = r.getToEntityId();
                    } else if (r.getToEntityId().equals(currentId)) {
                        nextId = r.getFromEntityId();
                    } else {
                        continue;
                    }
                    if (visited.add(nextId)) {
                        nextFrontier.add(nextId);
                    }
                }
            }
            frontier = nextFrontier;
        }
        return result;
    }

    /**
     * Returns the target entities of all outgoing relations of the given type
     * from {@code entityId}. Relations whose target id is not present in the
     * entity map are skipped.
     */
    public List<Entity> findByRelationType(String entityId, String relationType) {
        return relations.stream()
                .filter(r -> r.getFromEntityId().equals(entityId)
                        && r.getRelationType().equals(relationType))
                .map(r -> entities.get(r.getToEntityId()))
                .filter(Objects::nonNull)
                .collect(Collectors.toList());
    }
}
自动提取知识图谱
/**
 * Extracts a knowledge graph (entities + relations) from free text using an
 * LLM, and merges per-document extraction results into one global graph.
 */
@Service
@RequiredArgsConstructor
@Slf4j
public class KnowledgeGraphExtractor {

    private final ChatLanguageModel llm;
    private final ObjectMapper objectMapper;

    /**
     * Extracts entities and relations from a single document. The LLM is asked
     * for a fixed JSON schema; note that the ids it assigns ("e1", "e2", ...)
     * are only unique WITHIN this one document.
     */
    public ExtractionResult extractFromDocument(String documentText) {
        String extractionPrompt = String.format("""
                从以下文本中提取实体和它们之间的关系。
                实体类型:PERSON(人物)、COMPANY(公司)、PRODUCT(产品)、
                CONTRACT(合同)、POSITION(职位)
                关系类型:WORKS_FOR(在职)、SIGNED_BY(签署人)、BELONGS_TO(属于)、
                ASSOCIATED_WITH(关联)、HAS_POSITION(担任职位)
                文本:
                %s
                输出JSON格式:
                {
                "entities": [
                {"id": "e1", "name": "张三", "type": "PERSON", "properties": {"department": "销售部"}}
                ],
                "relations": [
                {"from": "e1", "to": "e2", "type": "WORKS_FOR", "confidence": 0.95}
                ]
                }
                """, documentText);
        String response = llm.generate(extractionPrompt);
        return parseExtractionResult(response);
    }

    /**
     * Processes documents one by one and merges every extraction result into a
     * single graph. Failures on individual documents are logged and skipped so
     * one bad document cannot abort the whole build.
     */
    public KnowledgeGraph buildGraph(List<String> documents) {
        KnowledgeGraph graph = new KnowledgeGraph();
        for (int i = 0; i < documents.size(); i++) {
            log.info("处理文档 {}/{}", i + 1, documents.size());
            try {
                mergeIntoGraph(graph, extractFromDocument(documents.get(i)));
            } catch (Exception e) {
                log.warn("文档{}提取失败: {}", i, e.getMessage());
            }
        }
        log.info("知识图谱构建完成: {}个实体, {}条关系",
                graph.getEntities().size(), graph.getRelations().size());
        return graph;
    }

    /**
     * Merges one document's extraction result into the graph.
     *
     * The LLM assigns document-local ids that restart for every document. The
     * previous implementation stored those ids directly, so entity "e1" from
     * document 2 overwrote entity "e1" from document 1 in the entity map, and
     * relations were added verbatim pointing at colliding or discarded ids.
     * Here every local id is remapped to a graph-wide id first, and relation
     * endpoints are rewritten through the same mapping; relations referencing
     * ids not present in this document's entity list are skipped.
     */
    private void mergeIntoGraph(KnowledgeGraph graph, ExtractionResult result) {
        Map<String, String> localToGlobalId = new HashMap<>();
        for (KnowledgeGraph.Entity entity : result.entities()) {
            localToGlobalId.put(entity.getId(), mergeEntity(graph, entity));
        }
        for (KnowledgeGraph.Relation relation : result.relations()) {
            String fromId = localToGlobalId.get(relation.getFromEntityId());
            String toId = localToGlobalId.get(relation.getToEntityId());
            if (fromId == null || toId == null) {
                log.warn("关系引用了未知实体, 已跳过: {} -> {}",
                        relation.getFromEntityId(), relation.getToEntityId());
                continue;
            }
            graph.getRelations().add(KnowledgeGraph.Relation.builder()
                    .fromEntityId(fromId)
                    .toEntityId(toId)
                    .relationType(relation.getRelationType())
                    .properties(relation.getProperties())
                    .confidence(relation.getConfidence())
                    .build());
        }
    }

    /**
     * Returns the graph-wide id under which {@code newEntity} is stored:
     * either the id of an existing entity with the same name and type (its
     * properties are merged into that entity), or a freshly assigned id under
     * which the new entity was inserted.
     */
    private String mergeEntity(KnowledgeGraph graph, KnowledgeGraph.Entity newEntity) {
        KnowledgeGraph.Entity existing = graph.getEntities().values().stream()
                .filter(e -> e.getName().equals(newEntity.getName())
                        && e.getType().equals(newEntity.getType()))
                .findFirst()
                .orElse(null);
        if (existing != null) {
            if (newEntity.getProperties() != null) {
                existing.getProperties().putAll(newEntity.getProperties());
            }
            return existing.getId();
        }
        // Entities are never removed, so size+1 always yields a fresh id.
        String globalId = "g" + (graph.getEntities().size() + 1);
        newEntity.setId(globalId);
        graph.getEntities().put(globalId, newEntity);
        return globalId;
    }

    /**
     * Parses the LLM's JSON answer into entities and relations. Malformed
     * output degrades to an empty result instead of failing the document.
     */
    private ExtractionResult parseExtractionResult(String json) {
        try {
            JsonNode root = objectMapper.readTree(extractJson(json));
            List<KnowledgeGraph.Entity> entities = new ArrayList<>();
            List<KnowledgeGraph.Relation> relations = new ArrayList<>();
            // path() returns a missing node (empty iteration) when the key is
            // absent, so an answer without "entities"/"relations" cannot NPE.
            for (JsonNode node : root.path("entities")) {
                Map<String, String> props = new HashMap<>();
                JsonNode propsNode = node.get("properties");
                if (propsNode != null) {
                    propsNode.fields().forEachRemaining(e ->
                            props.put(e.getKey(), e.getValue().asText()));
                }
                entities.add(KnowledgeGraph.Entity.builder()
                        .id(node.get("id").asText())
                        .name(node.get("name").asText())
                        .type(node.get("type").asText())
                        .properties(props)
                        .build());
            }
            for (JsonNode node : root.path("relations")) {
                relations.add(KnowledgeGraph.Relation.builder()
                        .fromEntityId(node.get("from").asText())
                        .toEntityId(node.get("to").asText())
                        .relationType(node.get("type").asText())
                        // default confidence when the model omits the field
                        .confidence(node.has("confidence") ? node.get("confidence").asDouble() : 0.8)
                        .build());
            }
            return new ExtractionResult(entities, relations);
        } catch (Exception e) {
            log.warn("解析提取结果失败: {}", e.getMessage());
            return new ExtractionResult(List.of(), List.of());
        }
    }

    /**
     * Cuts the outermost {...} span out of the answer, tolerating prose or
     * markdown fences the model may wrap around the JSON payload.
     */
    private String extractJson(String text) {
        int start = text.indexOf('{');
        int end = text.lastIndexOf('}');
        if (start >= 0 && end > start) {
            return text.substring(start, end + 1);
        }
        return text;
    }

    /** Entities and relations extracted from a single document. */
    public record ExtractionResult(
            List<KnowledgeGraph.Entity> entities,
            List<KnowledgeGraph.Relation> relations
    ) {}
}
知识图谱检索
/**
 * Knowledge-graph query service: maps a question onto graph entities, expands
 * a local subgraph around them, renders that subgraph as natural language and
 * lets the LLM answer on top of it.
 */
@Service
@RequiredArgsConstructor
@Slf4j
public class GraphQueryService {

    private final ChatLanguageModel llm;
    private final KnowledgeGraph graph; // pre-built knowledge graph

    /** Answers a question using the knowledge graph as the only context. */
    public String answerWithGraph(String question) {
        // 1. Pull the key entity names out of the question via the LLM.
        List<String> entityNames = extractEntitiesFromQuestion(question);
        log.info("问题中提及的实体: {}", entityNames);

        // 2. Match names against graph entities (substring match both ways).
        List<KnowledgeGraph.Entity> matched = new ArrayList<>();
        for (String name : entityNames) {
            for (KnowledgeGraph.Entity candidate : graph.getEntities().values()) {
                if (candidate.getName().contains(name) || name.contains(candidate.getName())) {
                    matched.add(candidate);
                }
            }
        }
        if (matched.isEmpty()) {
            return "未找到相关实体信息,请提供更多细节";
        }

        // 3. Expand each matched entity to its neighbourhood within 2 hops.
        Set<String> subgraphIds = new HashSet<>();
        for (KnowledgeGraph.Entity entity : matched) {
            for (KnowledgeGraph.Entity related : graph.findRelatedEntities(entity.getId(), 2)) {
                subgraphIds.add(related.getId());
            }
        }

        // 4. Keep only relations whose both endpoints fall inside the subgraph.
        List<KnowledgeGraph.Relation> subgraphRelations = graph.getRelations().stream()
                .filter(r -> subgraphIds.contains(r.getFromEntityId())
                        && subgraphIds.contains(r.getToEntityId()))
                .collect(Collectors.toList());

        // 5+6. Render the subgraph as text and ask the LLM to answer over it.
        String graphContext = buildGraphContext(subgraphIds, subgraphRelations);
        return llm.generate(String.format("""
                基于以下实体关系信息回答问题。
                实体关系信息:
                %s
                问题:%s
                """, graphContext, question));
    }

    /** Asks the LLM for one entity name per line and returns the trimmed, non-empty lines. */
    private List<String> extractEntitiesFromQuestion(String question) {
        String prompt = String.format("""
                从以下问题中提取关键实体名称(人名、公司名、产品名等),每行一个:
                问题:%s
                实体列表:
                """, question);
        String response = llm.generate(prompt);
        List<String> names = new ArrayList<>();
        for (String line : response.split("\n")) {
            String trimmed = line.trim();
            if (!trimmed.isEmpty()) {
                names.add(trimmed);
            }
        }
        return names;
    }

    /** Renders the subgraph (entity descriptions + relation triples) as plain text. */
    private String buildGraphContext(
            Set<String> entityIds,
            List<KnowledgeGraph.Relation> relations) {
        String entitiesDesc = entityIds.stream()
                .map(graph.getEntities()::get)
                .filter(Objects::nonNull)
                .map(this::describeEntity)
                .collect(Collectors.joining("\n"));
        String relationsDesc = relations.stream()
                .map(this::describeRelation)
                .filter(Objects::nonNull)
                .collect(Collectors.joining("\n"));
        return "实体:\n" + entitiesDesc + "\n\n关系:\n" + relationsDesc;
    }

    /** One line per entity: name(type): key=value, key=value. */
    private String describeEntity(KnowledgeGraph.Entity entity) {
        String props = entity.getProperties().entrySet().stream()
                .map(p -> p.getKey() + "=" + p.getValue())
                .collect(Collectors.joining(", "));
        return String.format("%s(%s): %s", entity.getName(), entity.getType(), props);
    }

    /** One triple per relation, or null when either endpoint is missing from the map. */
    private String describeRelation(KnowledgeGraph.Relation relation) {
        KnowledgeGraph.Entity from = graph.getEntities().get(relation.getFromEntityId());
        KnowledgeGraph.Entity to = graph.getEntities().get(relation.getToEntityId());
        if (from == null || to == null) {
            return null;
        }
        return String.format("%s -[%s]-> %s",
                from.getName(), relation.getRelationType(), to.getName());
    }
}
混合检索:图谱 + 向量
最强大的方案是把知识图谱和向量检索结合:
/**
 * Hybrid knowledge retrieval: knowledge graph + vector RAG, run concurrently,
 * with both results combined in one prompt for the final LLM answer.
 */
@Service
@RequiredArgsConstructor
public class HybridKnowledgeRetriever {

    private final GraphQueryService graphService;
    private final EmbeddingModel embeddingModel;
    private final EmbeddingStore<TextSegment> vectorStore;
    private final ChatLanguageModel llm;

    /**
     * Answers a question from both retrieval paths. Either path failing
     * degrades to a placeholder string instead of failing the whole answer.
     */
    public String answer(String question) {
        // Run both retrievals concurrently.
        // NOTE(review): supplyAsync without an executor runs on
        // ForkJoinPool.commonPool; both tasks are blocking I/O (LLM call,
        // vector search), which can starve the shared pool under load.
        // Consider injecting a dedicated Executor — confirm expected
        // concurrency before changing.
        CompletableFuture<String> graphFuture =
                CompletableFuture.supplyAsync(() -> getGraphContext(question));
        CompletableFuture<String> vectorFuture =
                CompletableFuture.supplyAsync(() -> getVectorContext(question));
        String graphContext = graphFuture.join();
        String vectorContext = vectorFuture.join();
        // Combine both information sources in a single prompt.
        return llm.generate(String.format("""
                请综合以下两种信息来源回答问题。
                实体关系信息(来自知识图谱):
                %s
                相关文档内容(来自知识库):
                %s
                问题:%s
                """, graphContext, vectorContext, question));
    }

    /** Graph retrieval; degrades to a fallback string on any failure. */
    private String getGraphContext(String question) {
        try {
            return graphService.answerWithGraph(question);
        } catch (Exception e) {
            return "知识图谱检索失败";
        }
    }

    /**
     * Vector retrieval: top 3 segments with similarity >= 0.6.
     *
     * The previous implementation did not catch exceptions here, so a vector
     * store failure surfaced as a CompletionException from join() and failed
     * the entire answer even when the graph path had succeeded; this now
     * degrades symmetrically with the graph path.
     */
    private String getVectorContext(String question) {
        try {
            float[] queryEmb = embeddingModel.embed(question);
            EmbeddingSearchRequest request = EmbeddingSearchRequest.builder()
                    .queryEmbedding(Embedding.from(queryEmb))
                    .maxResults(3)
                    .minScore(0.6)
                    .build();
            return vectorStore.search(request).matches().stream()
                    .map(m -> m.embedded().text())
                    .collect(Collectors.joining("\n\n"));
        } catch (Exception e) {
            return "向量检索失败";
        }
    }
}
知识图谱RAG的主要成本在于图谱的构建和维护。如果你的业务场景有大量实体关系(人员组织、供应链、产品关联),这个投入是值得的。如果主要是文档问答,纯向量RAG就够了。
