第2426篇:AI系统的遗忘权实现——用户要求删除数据时的工程处理
2026/4/30大约 7 分钟
第2426篇:AI系统的遗忘权实现——用户要求删除数据时的工程处理
适读人群:负责AI系统数据管理的后端工程师 | 阅读时长:约12分钟 | 核心价值:实现GDPR和个保法要求的被遗忘权,包括训练数据和模型的处理方案
我收到过一封用户邮件,大意是:我想删除我在你们平台上的所有数据,包括你们AI模型里学到的关于我的一切。
这封邮件让我意识到,"数据删除"在AI系统里比在传统软件里难得多。传统软件删数据,就是DELETE FROM table WHERE user_id=xxx。AI系统删数据,面对的是:
- 数据库里的原始数据(相对简单)
- 训练数据集里的历史数据(需要重新处理)
- 已经训练好的模型(最难)
- 向量数据库里的embedding
- 日志和备份里的数据
这篇文章讲怎么系统性地实现"被遗忘权"。
一、被遗忘权的法律要求
GDPR第17条:数据主体有权要求数据控制者删除与其相关的个人数据,控制者应当无不当延迟地完成删除。
触发条件:
- 数据已不再需要(目的消失)
- 用户撤回了同意
- 用户反对数据处理
- 数据被非法处理
- 为履行法律义务而必须删除数据
例外情况:
- 表达自由和信息自由
- 法律义务(如财务记录保留要求)
- 公共利益
- 法律主张的设立、行使或捍卫
中国《个人信息保护法》第47条:处理目的已实现、无法实现或者为实现处理目的不再必要等情形下,个人信息处理者应当主动删除个人信息;处理者未删除的,个人有权请求删除。
二、完整的删除请求处理流程
from dataclasses import dataclass, field
from datetime import datetime, timedelta
from typing import List, Optional, Dict
from enum import Enum
import uuid
import logging
logger = logging.getLogger(__name__)
class DeletionStatus(Enum):
    """Lifecycle states of a data-deletion request."""

    RECEIVED = "received"  # request recorded, nothing processed yet
    VERIFYING = "verifying"  # requester identity is being checked
    IN_PROGRESS = "in_progress"  # deletion steps are running
    COMPLETED = "completed"  # handling finished, nothing retained
    # Handling finished, but some data is kept because of a legal hold.
    PARTIALLY_COMPLETED = "partially_completed"
    REJECTED = "rejected"  # request refused (e.g. identity check failed)
@dataclass
class DeletionRequest:
    """Record of one data-deletion request and the outcome of handling it.

    ``deadline`` defaults to ``requested_at + RESPONSE_WINDOW`` so that a
    request created with an explicit (e.g. historical) ``requested_at`` gets
    a deadline measured from that moment, not from object creation time.
    """

    # Time allowed to respond to a request. Not annotated on purpose, so the
    # dataclass machinery treats it as a plain class attribute, not a field.
    RESPONSE_WINDOW = timedelta(days=30)

    request_id: str = field(default_factory=lambda: str(uuid.uuid4()))
    user_id: str = ""
    requested_at: datetime = field(default_factory=datetime.now)
    request_reason: str = ""  # reason stated by the user
    request_channel: str = ""  # channel through which the request arrived
    status: DeletionStatus = DeletionStatus.RECEIVED

    # Processing outcome
    completed_at: Optional[datetime] = None
    deleted_components: List[str] = field(default_factory=list)
    # Components kept for legal reasons, with the reason per component.
    retained_components: List[str] = field(default_factory=list)
    retention_reasons: Dict[str, str] = field(default_factory=dict)

    # Response deadline; filled in by __post_init__ when not supplied.
    deadline: Optional[datetime] = None

    def __post_init__(self) -> None:
        """Derive the deadline from ``requested_at`` unless one was given."""
        if self.deadline is None:
            self.deadline = self.requested_at + self.RESPONSE_WINDOW

    def is_overdue(self) -> bool:
        """Return True when the deadline has passed and the request is still open.

        COMPLETED, PARTIALLY_COMPLETED and REJECTED are all final states, so
        a request in any of them is never reported as overdue.
        """
        finished = (
            DeletionStatus.COMPLETED,
            DeletionStatus.PARTIALLY_COMPLETED,
            DeletionStatus.REJECTED,
        )
        if self.status in finished:
            return False
        # Use the deadline's own tzinfo so naive and aware deadlines both
        # compare without raising TypeError.
        return datetime.now(tz=self.deadline.tzinfo) > self.deadline
class RightToErasureHandler:
    """Run a deletion request against every store that may hold user data.

    Each ``_delete_*`` / ``_handle_*`` / ``_schedule_*`` step returns a dict
    with a ``"status"`` key; ``process_deletion_request`` aggregates those
    into the final request status and a user-facing confirmation text.

    NOTE(review): several steps still have their real store calls commented
    out and only report the outcome; wire them up before production use.
    NOTE(review): the log lines below contain ``user_id``; make sure log
    erasure/pseudonymization also covers the logs written here.
    """

    # Step outcomes that mean some user data (or a transformed copy) is kept.
    _RETAINED_STATUSES = frozenset(
        {"retained", "partially_retained", "pseudonymized"}
    )

    def __init__(self,
                 user_db,              # user database
                 training_data_store,  # training data storage
                 vector_db,            # vector database
                 model_registry,       # model registry
                 log_store,            # log storage
                 backup_manager,       # backup manager
                 legal_hold_checker):  # legal hold checker
        self.user_db = user_db
        self.training_store = training_data_store
        self.vector_db = vector_db
        self.model_registry = model_registry
        self.log_store = log_store
        self.backup_manager = backup_manager
        self.legal_checker = legal_hold_checker

    @staticmethod
    def _hold_detail(legal_holds, component: str) -> Optional[str]:
        """Return the hold detail for ``component``, or None if not held.

        ``legal_holds`` may be a dict (component -> detail such as a
        retain-until date) or any plain container of component names; for
        the latter the detail is the empty string.
        """
        if not legal_holds or component not in legal_holds:
            return None
        if isinstance(legal_holds, dict):
            detail = legal_holds[component]
            return "" if detail is None else str(detail)
        return ""

    def process_deletion_request(self, request: DeletionRequest) -> Dict:
        """Handle a deletion request end to end and return a summary dict.

        On success the dict has ``request_id``, ``status``, ``results`` (one
        entry per component) and ``confirmation``. When identity verification
        fails it has ``request_id``, ``status="rejected"`` and ``reason``.

        The request is mutated in place: status, completed_at and the
        deleted/retained component bookkeeping are updated.
        """
        logger.info(
            "处理删除请求: %s for user: %s", request.request_id, request.user_id
        )

        # Step 1: verify identity
        request.status = DeletionStatus.VERIFYING
        if not self._verify_identity(request.user_id):
            request.status = DeletionStatus.REJECTED
            return {
                "request_id": request.request_id,
                "status": "rejected",
                "reason": "身份验证失败",
            }
        request.status = DeletionStatus.IN_PROGRESS

        # Step 2: check legal holds
        legal_holds = self.legal_checker.check(request.user_id)

        # Step 3: delete each category of data
        user_id = request.user_id
        results = {
            "user_profile": self._delete_user_profile(user_id, legal_holds),
            "interaction_history": self._delete_interaction_history(
                user_id, legal_holds
            ),
            "vector_embeddings": self._delete_vector_embeddings(user_id),
            "training_data": self._handle_training_data_deletion(user_id),
            "logs": self._handle_log_deletion(user_id, legal_holds),
            "backups": self._schedule_backup_deletion(user_id),
        }

        # Step 4: update the request.
        # Rebuild the bookkeeping rather than appending so that re-processing
        # the same request does not duplicate entries.
        retained = {
            component: result
            for component, result in results.items()
            if result.get("status") in self._RETAINED_STATUSES
        }
        request.deleted_components = [
            component
            for component, result in results.items()
            if result.get("status") == "deleted"
        ]
        request.retained_components = list(retained)
        request.retention_reasons = {
            component: result.get("reason", "")
            for component, result in retained.items()
        }
        request.completed_at = datetime.now()
        # Deferred work ("scheduled" backups, "deletion_queued" unlearning)
        # is handed off to other jobs, so it does not keep the request open;
        # only retained data distinguishes PARTIALLY_COMPLETED from COMPLETED.
        request.status = (
            DeletionStatus.PARTIALLY_COMPLETED
            if retained
            else DeletionStatus.COMPLETED
        )

        # Step 5: confirm to the user
        confirmation = self._generate_deletion_confirmation(request, results)
        return {
            "request_id": request.request_id,
            "status": request.status.value,
            "results": results,
            "confirmation": confirmation,
        }

    def _delete_user_profile(self, user_id: str, legal_holds: Dict[str, str]) -> Dict:
        """Delete the user's basic profile unless a legal hold covers it."""
        hold = self._hold_detail(legal_holds, "user_profile")
        if hold is not None:
            reason = "法律保全要求"
            if hold:
                reason = f"法律保全要求,保留至: {hold}"
            return {"status": "retained", "reason": reason}
        # TODO(review): the actual delete call is still disabled.
        # self.user_db.delete_user(user_id)
        logger.info("已删除用户档案: %s", user_id)
        return {"status": "deleted", "records_deleted": 1}

    def _delete_interaction_history(self, user_id: str, legal_holds: Dict[str, str]) -> Dict:
        """Delete interaction history, keeping transactions under legal hold."""
        # Financial transaction records may be subject to legal retention.
        if self._hold_detail(legal_holds, "transactions") is not None:
            retained_part = "金融交易记录(法律要求保留5年)"
            return {
                "status": "partially_retained",
                "retained": retained_part,
                "deleted": "非交易类交互记录",
                # Same text under "reason" so the retention bookkeeping and
                # the confirmation can explain why data was kept.
                "reason": retained_part,
            }
        # TODO(review): the actual delete call is still disabled; the count
        # below should become the real number of deleted rows.
        # self.user_db.delete_interactions(user_id)
        return {"status": "deleted", "records_deleted": 0}

    def _delete_vector_embeddings(self, user_id: str) -> Dict:
        """Delete the user's embeddings from the vector database.

        In a RAG system, documents uploaded by the user are vectorized and
        stored, so they must be removed as well.
        """
        # TODO(review): lookup and delete calls are still disabled.
        # user_vectors = self.vector_db.search_by_metadata({"user_id": user_id})
        # self.vector_db.delete_vectors(user_vectors)
        logger.info("已删除向量数据库中 %s 的数据", user_id)
        return {"status": "deleted", "vectors_deleted": 0}

    def _handle_training_data_deletion(self, user_id: str) -> Dict:
        """Handle user data inside training sets and models trained on it.

        Returns ``not_applicable`` when the user is not in the dataset,
        ``deletion_queued`` when models were flagged for unlearning, and
        ``deleted`` otherwise.
        """
        if not self.training_store.check_user_in_dataset(user_id):
            return {"status": "not_applicable", "reason": "用户数据不在训练集中"}

        # TODO(review): removal from the training dataset is still disabled.
        # self.training_store.remove_user_samples(user_id)

        # Models trained on this user's data need machine unlearning.
        affected_models = self.model_registry.get_models_trained_on_user(user_id)
        if not affected_models:
            return {"status": "deleted", "raw_data_deleted": True}

        for model_id in affected_models:
            self.model_registry.flag_for_unlearning(model_id, user_id)
        return {
            "status": "deletion_queued",
            "raw_data_deleted": True,
            "model_unlearning_required": True,
            "affected_models": affected_models,
            "note": "训练数据已删除,模型遗忘处理已排队,预计完成时间:下次重训时",
        }

    def _handle_log_deletion(self, user_id: str, legal_holds: Dict[str, str]) -> Dict:
        """Handle user data in logs; audit logs under hold are pseudonymized."""
        if self._hold_detail(legal_holds, "audit_logs") is not None:
            # Keep the log entries but pseudonymize personal identifiers.
            # TODO(review): the pseudonymization call is still disabled.
            # self.log_store.pseudonymize_user_in_logs(user_id)
            return {
                "status": "pseudonymized",
                "reason": "审计日志法律保留,已对个人标识符假名化",
            }
        return {"status": "deleted"}

    def _schedule_backup_deletion(self, user_id: str) -> Dict:
        """Schedule deletion of the user's data from backups.

        Backups usually cannot be edited immediately; the data is dropped at
        the next backup rotation.
        """
        # TODO(review): the scheduling call is still disabled.
        # self.backup_manager.schedule_user_deletion(user_id)
        return {
            "status": "scheduled",
            "note": "备份数据将在下次备份轮转时(预计14天内)删除",
        }

    def _verify_identity(self, user_id: str) -> bool:
        """Verify the identity of the requester.

        NOTE(review): placeholder that accepts everyone; a real check (for
        example a confirmation e-mail) is required before production use.
        """
        return True

    def _generate_deletion_confirmation(self,
                                        request: DeletionRequest,
                                        results: Dict) -> str:
        """Build the user-facing confirmation text for a processed request."""
        status_map = {
            "deleted": "已删除",
            "pseudonymized": "已假名化(保留用于审计)",
            "scheduled": "已排队删除(将在14天内完成)",
            "deletion_queued": "已排队删除(模型遗忘处理待完成)",
            "retained": "因法律要求保留",
            "partially_retained": "部分删除(部分数据因法律要求保留)",
            "not_applicable": "不适用",
        }
        parts = [
            f"\n您的数据删除请求(编号:{request.request_id})已处理完成。\n"
            "处理结果摘要:\n"
        ]
        for component, result in results.items():
            status = result.get("status", "unknown")
            # Fall back to the raw status for outcomes missing from the map.
            parts.append(f"- {component}: {status_map.get(status, status)}\n")

        if request.retained_components:
            parts.append("\n保留说明:\n")
            for component in request.retained_components:
                reason = request.retention_reasons.get(component, "")
                parts.append(f"- {component}: {reason}\n")

        parts.append("\n如有疑问,请联系数据保护专员。")
        return "".join(parts)


# 三、机器遗忘(Machine Unlearning)
删除训练数据后,模型"记住"了用户数据这个问题还没有解决。这是学术界还在研究的问题,但工程上有几种实用方案:
class MachineUnlearningStrategy:
    """Pick and verify a machine-unlearning approach for deleted user data."""

    def choose_strategy(self,
                        model_info: Dict,
                        user_percentage: float) -> Dict:
        """Choose an unlearning strategy for the given situation.

        Args:
            model_info: model metadata; only the optional key
                ``"supports_incremental_unlearning"`` is read. ``None`` is
                treated as an empty dict.
            user_percentage: share of the training set that belongs to the
                deleted user(s), as a fraction (0.01 means 1%).

        Returns:
            A dict with ``strategy``, ``description``, ``timeline`` and
            ``cost``; the gradient-ascent option also carries ``caveat``.
        """
        model_info = model_info or {}

        if user_percentage < 0.01:  # under 1%
            return {
                "strategy": "full_retrain",
                "description": "用户数据占比极小,下次定期重训时自然移除",
                "timeline": "下次模型更新时(通常1-3个月)",
                "cost": "极低",
            }
        if model_info.get("supports_incremental_unlearning"):
            return {
                "strategy": "incremental_unlearning",
                "description": "使用SISA训练架构进行高效局部遗忘",
                "timeline": "1-3天",
                "cost": "中等",
            }
        if user_percentage < 0.1:  # under 10%
            return {
                "strategy": "gradient_ascent_unlearning",
                # Single-quoted so the inner double quotes are literal text.
                "description": '使用梯度上升对遗忘样本做"反训练"',
                "timeline": "1-7天",
                "cost": "中等",
                "caveat": "可能影响模型性能,需要事后验证",
            }
        return {
            "strategy": "full_retrain",
            "description": "用户数据占比较大,完整重训是最可靠的方案",
            "timeline": "1-4周",
            "cost": "高",
        }

    def verify_unlearning(self,
                          model_before,
                          model_after,
                          deleted_user_data) -> Dict:
        """Report whether unlearning succeeded, via membership inference.

        The idea: after successful unlearning the model should be no more
        confident on the deleted data than random guessing.

        NOTE(review): conceptual placeholder. The arguments are not used yet
        and the membership score is a fixed illustrative value; a real
        membership-inference attack must replace it.
        """
        membership_score = 0.52  # close to 0.5 => model no longer "remembers"
        random_baseline = 0.50
        threshold = 0.60
        return {
            "verification_method": "成员推断攻击",
            "deleted_data_membership_score": membership_score,
            "random_baseline": random_baseline,
            "threshold": threshold,
            # Below the threshold counts as successfully unlearned.
            "unlearning_verified": membership_score < threshold,
            "note": "成员推断得分接近随机基线,说明模型已成功遗忘被删除的数据",
        }


# 机器遗忘是AI合规领域最前沿也最难的问题。务实的做法是:保留充分的训练数据记录,每次重训时自动排除已删除用户的数据,配合适当的合同条款说明模型更新时间线,通常是可以被法律接受的方案。
