/**
 * 统一数据处理接口
 */
public interface DataProcessor<I, O> {

    /**
     * 数据源类型标识
     */
    DataSourceType getSourceType();

    /**
     * 处理单条数据
     */
    ProcessResult<O> process(I input, ProcessContext context);

    /**
     * 批量处理（默认实现：逐条调用 process）
     * 子类可以覆盖实现更高效的批量逻辑
     */
    default List<ProcessResult<O>> processBatch(List<I> inputs,
                                                  ProcessContext context) {
        return inputs.stream()
            .map(input -> process(input, context))
            .collect(Collectors.toList());
    }

    /**
     * 验证输入是否有效
     */
    default ValidationResult validate(I input) {
        return ValidationResult.valid();
    }
}

/**
 * 统一的特征记录（所有数据源处理后的共同输出格式）
 */
@Data
@Builder
public class UnifiedFeatureRecord {

    private String entityId;      // 实体 ID（如用户 ID、商品 ID）
    private String entityType;    // 实体类型
    private String dataSourceId;  // 来源数据源

    // 数值特征
    private Map<String, Double> numericalFeatures;

    // 类别特征（已经过编码）
    private Map<String, String> categoricalFeatures;

    // 文本特征（原始文本，待 Embedding）
    private Map<String, String> textFeatures;

    // 向量特征（已经过 Embedding）
    private Map<String, float[]> vectorFeatures;

    // 元数据
    private LocalDateTime dataTimestamp;  // 数据的业务时间
    private LocalDateTime processedAt;    // 处理时间
    private String processorVersion;      // 处理逻辑版本

    // 数据质量标记
    private Double qualityScore;
    private List<String> qualityWarnings;
}

2.2 结构化数据处理器

@Component
public class MySQLStructuredProcessor
    implements DataProcessor<Map<String, Object>, UnifiedFeatureRecord> {

    @Override
    public DataSourceType getSourceType() {
        return DataSourceType.MYSQL;
    }

    @Override
    public ProcessResult<UnifiedFeatureRecord> process(
            Map<String, Object> rawRow, ProcessContext context) {

        UnifiedFeatureRecord.Builder builder = UnifiedFeatureRecord.builder()
            .dataSourceId(context.getDataSourceId())
            .processedAt(LocalDateTime.now())
            .processorVersion(getVersion());

        Map<String, Double> numerical = new HashMap<>();
        Map<String, String> categorical = new HashMap<>();
        List<String> warnings = new ArrayList<>();

        // 字段标准化映射（解决跨系统字段命名不一致问题）
        FieldMapping mapping = context.getFieldMapping();

        for (Map.Entry<String, Object> entry : rawRow.entrySet()) {
            String rawField = entry.getKey();
            Object value = entry.getValue();

            // 应用字段映射
            String standardField = mapping.mapField(rawField);
            if (standardField == null) continue;  // 不在映射里的字段忽略

            if (value == null) {
                // 空值处理：用上下文指定的填充策略
                NullHandlingStrategy strategy = context.getNullStrategy(standardField);
                value = strategy.handle(standardField, value);
                if (value == null) {
                    warnings.add("字段为空: " + standardField);
                    continue;
                }
            }

            // 根据字段类型分发
            FieldTypeConfig typeConfig = context.getFieldType(standardField);
            if (typeConfig.isNumerical()) {
                try {
                    double numVal = parseNumeric(value, typeConfig);

                    // 异常值检测
                    if (typeConfig.hasRange()) {
                        if (numVal < typeConfig.getMin() || numVal > typeConfig.getMax()) {
                            warnings.add(String.format("字段 %s 值 %f 超出范围 [%f, %f]",
                                standardField, numVal,
                                typeConfig.getMin(), typeConfig.getMax()));
                            numVal = typeConfig.clamp(numVal);  // 截断到合法范围
                        }
                    }

                    numerical.put(standardField, numVal);
                } catch (NumberFormatException e) {
                    warnings.add("数值解析失败: " + standardField + "=" + value);
                }
            } else if (typeConfig.isCategorical()) {
                categorical.put(standardField, value.toString());
            } else if (typeConfig.isTimestamp()) {
                // 时间统一转换为 epoch milliseconds
                long epochMs = parseTimestamp(value, typeConfig.getTimeZone());
                numerical.put(standardField + "_epoch_ms", (double) epochMs);
            }
        }

        double qualityScore = 1.0 - (warnings.size() * 0.1);

        UnifiedFeatureRecord record = builder
            .numericalFeatures(numerical)
            .categoricalFeatures(categorical)
            .qualityScore(Math.max(0, qualityScore))
            .qualityWarnings(warnings)
            .build();

        return ProcessResult.success(record);
    }

    private double parseNumeric(Object value, FieldTypeConfig config) {
        if (value instanceof Number) return ((Number) value).doubleValue();
        return Double.parseDouble(value.toString()
            .replaceAll("[,¥$%]", "").trim());
    }
}

2.3 半结构化 JSON 数据处理器

@Component
public class JsonSemiStructuredProcessor
    implements DataProcessor<String, UnifiedFeatureRecord> {

    @Autowired
    private ObjectMapper objectMapper;

    @Override
    public DataSourceType getSourceType() {
        return DataSourceType.JSON;
    }

    @Override
    public ProcessResult<UnifiedFeatureRecord> process(
            String jsonString, ProcessContext context) {

        JsonNode root;
        try {
            root = objectMapper.readTree(jsonString);
        } catch (JsonProcessingException e) {
            return ProcessResult.failure("JSON 解析失败: " + e.getMessage());
        }

        Map<String, Double> numerical = new HashMap<>();
        Map<String, String> categorical = new HashMap<>();
        Map<String, String> textFeatures = new HashMap<>();

        // 展平嵌套 JSON（点号分隔路径）
        flattenJson(root, "", numerical, categorical, textFeatures, context);

        UnifiedFeatureRecord record = UnifiedFeatureRecord.builder()
            .numericalFeatures(numerical)
            .categoricalFeatures(categorical)
            .textFeatures(textFeatures)
            .dataSourceId(context.getDataSourceId())
            .processedAt(LocalDateTime.now())
            .build();

        return ProcessResult.success(record);
    }

    /**
     * 递归展平 JSON，路径用点号连接
     * {"user": {"age": 25, "name": "张三"}}
     * -> user.age: 25.0, user.name: "张三"
     */
    private void flattenJson(JsonNode node, String prefix,
                              Map<String, Double> numerical,
                              Map<String, String> categorical,
                              Map<String, String> textFeatures,
                              ProcessContext context) {
        if (node.isObject()) {
            node.fields().forEachRemaining(entry -> {
                String path = prefix.isEmpty() ?
                    entry.getKey() : prefix + "." + entry.getKey();
                flattenJson(entry.getValue(), path, numerical,
                    categorical, textFeatures, context);
            });
        } else if (node.isArray()) {
            // 数组：计算长度和一些统计特征
            numerical.put(prefix + ".array_length", (double) node.size());

            // 如果是数值数组，计算统计量
            if (node.size() > 0 && node.get(0).isNumber()) {
                List<Double> values = new ArrayList<>();
                node.forEach(n -> values.add(n.doubleValue()));
                numerical.put(prefix + ".sum",
                    values.stream().mapToDouble(Double::doubleValue).sum());
                numerical.put(prefix + ".avg",
                    values.stream().mapToDouble(Double::doubleValue).average().orElse(0));
                numerical.put(prefix + ".max",
                    values.stream().mapToDouble(Double::doubleValue).max().orElse(0));
            }
        } else if (node.isNumber()) {
            numerical.put(prefix, node.doubleValue());
        } else if (node.isBoolean()) {
            numerical.put(prefix, node.booleanValue() ? 1.0 : 0.0);
        } else if (node.isTextual()) {
            String text = node.asText();
            // 根据配置决定是作为类别特征还是文本特征
            if (context.isTextField(prefix)) {
                textFeatures.put(prefix, text);
            } else {
                categorical.put(prefix, text);
            }
        }
    }
}

2.4 非结构化文本处理器

@Component
public class TextUnstructuredProcessor
    implements DataProcessor<TextDocument, UnifiedFeatureRecord> {

    @Autowired
    private NLPPipeline nlpPipeline;

    @Autowired
    private EmbeddingService embeddingService;

    @Override
    public DataSourceType getSourceType() {
        return DataSourceType.TEXT;
    }

    @Override
    public ProcessResult<UnifiedFeatureRecord> process(
            TextDocument doc, ProcessContext context) {

        String content = doc.getContent();
        if (content == null || content.trim().isEmpty()) {
            return ProcessResult.failure("文档内容为空");
        }

        Map<String, Double> numerical = new HashMap<>();
        Map<String, String> textFeatures = new HashMap<>();
        Map<String, float[]> vectorFeatures = new HashMap<>();

        // Step 1: 基础文本统计特征
        numerical.put("text_length", (double) content.length());
        numerical.put("word_count", (double) countWords(content));
        numerical.put("sentence_count", (double) countSentences(content));
        numerical.put("avg_sentence_length",
            numerical.get("text_length") / numerical.get("sentence_count"));
        numerical.put("chinese_ratio", calculateChineseRatio(content));

        // Step 2: NLP 特征提取
        NLPResult nlpResult = nlpPipeline.analyze(content);

        // 情感倾向
        numerical.put("sentiment_score", nlpResult.getSentimentScore());
        numerical.put("sentiment_positive", nlpResult.getPositiveScore());
        numerical.put("sentiment_negative", nlpResult.getNegativeScore());

        // 关键词
        List<String> keywords = nlpResult.getKeywords();
        textFeatures.put("keywords",
            String.join(",", keywords.subList(0, Math.min(10, keywords.size()))));

        // 命名实体
        if (!nlpResult.getPersons().isEmpty()) {
            numerical.put("person_entity_count",
                (double) nlpResult.getPersons().size());
        }
        if (!nlpResult.getOrganizations().isEmpty()) {
            numerical.put("org_entity_count",
                (double) nlpResult.getOrganizations().size());
        }

        // Step 3: 生成 Embedding 向量（异步，避免阻塞）
        if (context.isEmbeddingEnabled()) {
            float[] embedding = embeddingService.embed(content);
            vectorFeatures.put("content_embedding", embedding);

            // 也存原始文本，供后续重新 Embedding
            textFeatures.put("raw_content", content);
        }

        UnifiedFeatureRecord record = UnifiedFeatureRecord.builder()
            .entityId(doc.getDocId())
            .entityType("DOCUMENT")
            .numericalFeatures(numerical)
            .textFeatures(textFeatures)
            .vectorFeatures(vectorFeatures)
            .dataTimestamp(doc.getCreatedAt())
            .processedAt(LocalDateTime.now())
            .build();

        return ProcessResult.success(record);
    }
}

三、多源数据的融合拼接

3.1 实体对齐问题

多个数据源处理后，需要把同一个实体（同一个用户、同一个商品）的特征合并起来。关键是实体 ID 的对齐。

不同系统对同一个实体可能有不同的 ID，需要一个 ID 映射表：

@Service
public class EntityAlignmentService {

    @Autowired
    private EntityIdMappingRepository mappingRepository;

    /**
     * 将不同系统的 ID 统一到全局 ID
     */
    public String resolveGlobalId(String systemId, String sourceSystem) {
        return mappingRepository
            .findGlobalId(sourceSystem, systemId)
            .orElseGet(() -> {
                // 新 ID，注册并分配全局 ID
                String globalId = UUID.randomUUID().toString();
                EntityIdMapping mapping = new EntityIdMapping();
                mapping.setGlobalId(globalId);
                mapping.setSourceSystem(sourceSystem);
                mapping.setSourceId(systemId);
                mapping.setCreatedAt(LocalDateTime.now());
                mappingRepository.save(mapping);
                return globalId;
            });
    }

    /**
     * 合并来自多个数据源的特征记录
     */
    public UnifiedFeatureRecord mergeRecords(List<UnifiedFeatureRecord> records) {
        if (records.isEmpty()) return null;
        if (records.size() == 1) return records.get(0);

        String entityId = records.get(0).getEntityId();

        Map<String, Double> mergedNumerical = new HashMap<>();
        Map<String, String> mergedCategorical = new HashMap<>();
        Map<String, String> mergedText = new HashMap<>();
        Map<String, float[]> mergedVectors = new HashMap<>();
        List<String> allWarnings = new ArrayList<>();

        for (UnifiedFeatureRecord record : records) {
            // 数值特征：用数据源前缀区分（防止同名字段冲突）
            String prefix = record.getDataSourceId() + ".";
            if (record.getNumericalFeatures() != null) {
                record.getNumericalFeatures().forEach(
                    (k, v) -> mergedNumerical.put(prefix + k, v));
            }

            if (record.getCategoricalFeatures() != null) {
                record.getCategoricalFeatures().forEach(
                    (k, v) -> mergedCategorical.put(prefix + k, v));
            }

            if (record.getTextFeatures() != null) {
                record.getTextFeatures().forEach(
                    (k, v) -> mergedText.put(prefix + k, v));
            }

            if (record.getVectorFeatures() != null) {
                record.getVectorFeatures().forEach(
                    (k, v) -> mergedVectors.put(prefix + k, v));
            }

            if (record.getQualityWarnings() != null) {
                allWarnings.addAll(record.getQualityWarnings());
            }
        }

        // 计算综合质量分（取最低值，木桶效应）
        double minQuality = records.stream()
            .mapToDouble(r -> r.getQualityScore() != null ? r.getQualityScore() : 1.0)
            .min()
            .orElse(1.0);

        return UnifiedFeatureRecord.builder()
            .entityId(entityId)
            .numericalFeatures(mergedNumerical)
            .categoricalFeatures(mergedCategorical)
            .textFeatures(mergedText)
            .vectorFeatures(mergedVectors)
            .qualityScore(minQuality)
            .qualityWarnings(allWarnings)
            .processedAt(LocalDateTime.now())
            .build();
    }
}

3.2 时间对齐问题

不同数据源的数据不是同一时刻的——订单数据是实时的，用户画像是昨天更新的，知识库向量是上周建的。在融合时需要处理时间不一致的问题。

@Service
public class TemporalAlignmentService {

    /**
     * 时间对齐：获取指定时间点的特征快照
     * 用于保证训练样本的时间一致性（防止 Data Leakage）
     */
    public UnifiedFeatureRecord getPointInTimeSnapshot(
            String entityId, LocalDateTime targetTime) {

        List<UnifiedFeatureRecord> records = new ArrayList<>();

        // 从各数据源获取 targetTime 之前最新的特征
        for (DataSource dataSource : dataSourceRegistry.getAll()) {
            Optional<UnifiedFeatureRecord> record = dataSource
                .getLatestBefore(entityId, targetTime);
            record.ifPresent(records::add);
        }

        // 过滤掉时间差超过阈值的特征（数据太旧则不可信）
        records = records.stream()
            .filter(r -> r.getDataTimestamp() != null &&
                Duration.between(r.getDataTimestamp(), targetTime)
                    .toDays() <= MAX_FEATURE_AGE_DAYS)
            .collect(Collectors.toList());

        return entityAlignmentService.mergeRecords(records);
    }
}

四、大文件的流式处理

大型 CSV 文件、大 JSON 日志文件，不能全部加载到内存里处理，需要流式处理：

@Component
public class LargeFileStreamProcessor {

    @Autowired
    private StructuredDataProcessor structuredProcessor;

    /**
     * 流式处理大 CSV 文件（内存占用恒定）
     */
    public ProcessSummary processLargeCsv(Path filePath,
                                           ProcessContext context,
                                           FeatureWriter writer) {
        ProcessSummary summary = new ProcessSummary();
        int batchSize = 1000;
        List<Map<String, Object>> batch = new ArrayList<>();

        try (BufferedReader reader = Files.newBufferedReader(filePath,
                StandardCharsets.UTF_8)) {

            // 读取表头
            String headerLine = reader.readLine();
            String[] headers = headerLine.split(",");

            String line;
            while ((line = reader.readLine()) != null) {
                try {
                    Map<String, Object> row = parseCsvRow(line, headers);
                    batch.add(row);
                    summary.incrementTotal();

                    if (batch.size() >= batchSize) {
                        processBatchAndWrite(batch, context, writer, summary);
                        batch.clear();
                    }
                } catch (Exception e) {
                    summary.incrementParseError();
                    log.warn("CSV 行解析失败: {}", line);
                }
            }

            // 处理最后一批
            if (!batch.isEmpty()) {
                processBatchAndWrite(batch, context, writer, summary);
            }

        } catch (IOException e) {
            throw new DataProcessingException("文件读取失败: " + filePath, e);
        }

        return summary;
    }

    private void processBatchAndWrite(List<Map<String, Object>> batch,
                                       ProcessContext context,
                                       FeatureWriter writer,
                                       ProcessSummary summary) {
        List<ProcessResult<UnifiedFeatureRecord>> results =
            structuredProcessor.processBatch(batch, context);

        for (ProcessResult<UnifiedFeatureRecord> result : results) {
            if (result.isSuccess()) {
                writer.write(result.getData());
                summary.incrementSuccess();
            } else {
                summary.incrementFailure();
                log.warn("记录处理失败: {}", result.getErrorMessage());
            }
        }
    }
}

五、踩坑经验

坑一：JSON Schema 频繁变化导致下游崩溃

上游系统偷偷加了一个字段、改了一个字段名，下游的 JSON 解析直接报错。解决方案：用宽松解析策略（只取已知字段，未知字段忽略），同时建立 Schema 变更监控：对比相邻两批数据的字段集合，有新字段或消失字段时告警。

// Schema 漂移检测
public void detectSchemaDrift(String dataSourceId,
                               Set<String> currentFields) {
    Set<String> expectedFields = schemaRegistry.getExpectedFields(dataSourceId);
    Set<String> newFields = Sets.difference(currentFields, expectedFields);
    Set<String> missingFields = Sets.difference(expectedFields, currentFields);

    if (!newFields.isEmpty()) {
        alertService.sendAlert(AlertLevel.INFO,
            "数据源 " + dataSourceId + " 发现新字段: " + newFields);
    }
    if (!missingFields.isEmpty()) {
        alertService.sendAlert(AlertLevel.WARNING,
            "数据源 " + dataSourceId + " 字段消失: " + missingFields);
    }
}

坑二：实体对齐的 ID 映射表成为性能瓶颈

高并发下每次写特征都要查 ID 映射表，这张表成了热点。解决：本地缓存 + 异步写入映射表。查不到时先返回临时 ID，后台异步注册。

坑三：非结构化处理的 NLP 服务成本

每条文本都调用 NLP 服务（情感分析、关键词提取），在百万量级数据时成本极高。优化策略：

对于短文本（< 50 字），用规则替代模型
结果缓存（相同内容的文本只处理一次）
批量调用（每次发送 100 条，减少 API 调用次数）
对低价值字段（如用户名、商品编号）跳过 NLP 处理

坑四：时区问题导致特征错位

数据库存的是 UTC 时间，业务系统按北京时间展示，日志文件有的带时区有的不带。我们有一次特征里的"小时"特征完全错了，早高峰变成了凌晨峰。后来强制要求所有时间入库前统一转成 UTC epoch，展示层再做转换。

六、小结

多源异构数据融合的核心挑战不是技术难度，而是细节多、容错复杂。每个数据源都有自己的"脾气"：不规则的格式、偶发的空值、Schema 的无声变化。

处理好这些，靠的是：

统一的接口抽象（让上层不关心数据来自哪里）
充分的容错设计（单个数据源失败不影响整体）
完整的监控告警（数据格式变了第一时间发现）
清晰的时间语义（所有时间统一标准）

做到这四点，多源数据融合就算站稳了。