/**
 * Contract for a component that computes one feature value from an input.
 *
 * @param <T> type of the input the feature is computed from
 */
public interface FeatureExtractor<T> {

    /**
     * Returns the feature's name, which is meant to be globally unique.
     */
    String getFeatureName();

    /**
     * Returns the type of this feature.
     */
    FeatureType getFeatureType();

    /**
     * Computes the feature value for the given input.
     */
    Double extract(T input);

    /**
     * Tells whether a feature value is usable (used for missing-value handling).
     * A value is valid when it is non-null and finite, i.e. neither NaN nor infinite.
     */
    default boolean isValid(Double value) {
        if (value == null) {
            return false;
        }
        return Double.isFinite(value);
    }
}

/**
 * Kinds of feature a {@code FeatureExtractor} can report via {@code getFeatureType()}.
 */
public enum FeatureType {
    NUMERICAL,      // continuous numeric value
    CATEGORICAL,    // categorical value
    BINARY,         // two-valued
    EMBEDDING,      // vector (generated by an LLM)
    SEQUENCE        // time series
}

3.2 数值特征：统计聚合

最朴素但最有效的数值特征是统计聚合。以用户行为数据为例：

@Component
public class UserBehaviorFeatureExtractor {

    /**
     * Computes statistical aggregate features over the most recent {@code windowDays} days.
     *
     * <p>Only events whose time is strictly after {@code now - windowDays} are counted. Every
     * feature name ends with {@code _<windowDays>d}, so the maps of several windows can be
     * merged without key clashes. The {@code amount_*} features are emitted only when at least
     * one in-window event has a non-null amount, and one {@code event_type_<type>_<N>d} share
     * is emitted per event type that actually occurs in the window.
     *
     * @param userId     id of the user the events belong to (not used by the computation)
     * @param windowDays length of the look-back window in days; must be positive
     * @param events     the user's events; filtered down to the window here
     * @return map from feature name to feature value
     * @throws IllegalArgumentException if {@code windowDays} is not positive
     */
    public Map<String, Double> extractTimeWindowFeatures(
            String userId, int windowDays, List<UserEvent> events) {

        // A zero window turns activity_ratio into 0/0 or n/0 (NaN / Infinity) and a negative
        // one moves the cutoff into the future, so reject both before computing anything.
        if (windowDays <= 0) {
            throw new IllegalArgumentException(
                "windowDays must be positive, got " + windowDays);
        }

        String suffix = "_" + windowDays + "d";
        LocalDateTime cutoff = LocalDateTime.now().minusDays(windowDays);
        List<UserEvent> windowEvents = events.stream()
            .filter(e -> e.getEventTime().isAfter(cutoff))
            .collect(Collectors.toList());

        Map<String, Double> features = new HashMap<>();

        // Basic volume.
        features.put("event_count" + suffix, (double) windowEvents.size());

        // Number of distinct calendar days with at least one event.
        long activeDays = windowEvents.stream()
            .map(e -> e.getEventTime().toLocalDate())
            .distinct()
            .count();
        features.put("active_days" + suffix, (double) activeDays);

        // Activity ratio: active days / window length.
        features.put("activity_ratio" + suffix, (double) activeDays / windowDays);

        // Spend statistics, only over events that carry an amount.
        DoubleSummaryStatistics amountStats = windowEvents.stream()
            .filter(e -> e.getAmount() != null)
            .mapToDouble(UserEvent::getAmount)
            .summaryStatistics();

        if (amountStats.getCount() > 0) {
            features.put("amount_sum" + suffix, amountStats.getSum());
            features.put("amount_avg" + suffix, amountStats.getAverage());
            features.put("amount_max" + suffix, amountStats.getMax());
            features.put("amount_std" + suffix,
                computeStd(windowEvents, amountStats.getAverage()));
        }

        // Share of each event type among the in-window events.
        // NOTE(review): Collectors.groupingBy appears to reject null keys, so an event whose
        // type is null would fail here - confirm getEventType() is never null.
        Map<String, Long> eventTypeCounts = windowEvents.stream()
            .collect(Collectors.groupingBy(UserEvent::getEventType, Collectors.counting()));
        int total = windowEvents.size();
        eventTypeCounts.forEach((type, count) ->
            features.put("event_type_" + type + suffix, (double) count / total));

        return features;
    }

    /**
     * Population standard deviation (divides by the count, not count - 1) of the non-null
     * amounts in {@code events} around the supplied {@code mean}.
     */
    private double computeStd(List<UserEvent> events, double mean) {
        if (events.size() < 2) {
            return 0.0;
        }
        double variance = events.stream()
            .filter(e -> e.getAmount() != null)
            .mapToDouble(e -> {
                double deviation = e.getAmount() - mean;
                return deviation * deviation;
            })
            .average()
            .orElse(0);
        return Math.sqrt(variance);
    }
}

3.3 时序特征：捕捉行为模式

时序特征是传统 ML 的强项，LLM 很难从原始数值序列中学到这类模式：

@Component
public class TimeSeriesFeatureExtractor {

    /**
     * Extracts time-series features: trend, recent level versus history, latest change,
     * volatility and sparsity.
     *
     * <p>Emitted keys:
     * <ul>
     *   <li>{@code ts_slope} - least-squares slope of the values over the indices 0..n-1</li>
     *   <li>{@code ts_recent_vs_hist_ratio} - mean of the second half divided by the mean of
     *       the first half; omitted when the first-half mean is 0</li>
     *   <li>{@code ts_last_change_rate} - (last - previous) / previous, with the divisor
     *       replaced by 1 when previous is 0</li>
     *   <li>{@code ts_cv} - coefficient of variation (population standard deviation / mean);
     *       omitted when the mean is 0</li>
     *   <li>{@code ts_zero_ratio} - fraction of entries equal to 0</li>
     * </ul>
     *
     * @param timeSeries the series; the tail is treated as the most recent data
     * @return feature map; empty when the series is {@code null} or has fewer than 3 points
     */
    public Map<String, Double> extractTsFeatures(List<Double> timeSeries) {
        Map<String, Double> features = new HashMap<>();

        // Nothing meaningful can be said about a trend with fewer than three points.
        if (timeSeries == null || timeSeries.size() < 3) {
            return features;
        }
        int n = timeSeries.size();

        // Linear trend (least-squares slope).
        features.put("ts_slope", computeSlope(timeSeries));

        // Recent mean relative to historical mean: how unusual the recent level is.
        double histMean = timeSeries.subList(0, n / 2).stream()
            .mapToDouble(Double::doubleValue).average().orElse(0);
        double recentMean = timeSeries.subList(n / 2, n).stream()
            .mapToDouble(Double::doubleValue).average().orElse(0);

        if (histMean != 0) {
            features.put("ts_recent_vs_hist_ratio", recentMean / histMean);
        }

        // Change rate of the latest period relative to the one before it.
        double previous = timeSeries.get(n - 2);
        double lastChange = (timeSeries.get(n - 1) - previous)
            / (previous != 0 ? previous : 1);
        features.put("ts_last_change_rate", lastChange);

        // Coefficient of variation (CV) as a measure of volatility.
        DoubleSummaryStatistics stats = timeSeries.stream()
            .mapToDouble(Double::doubleValue).summaryStatistics();
        if (stats.getAverage() != 0) {
            double std = Math.sqrt(timeSeries.stream()
                .mapToDouble(v -> Math.pow(v - stats.getAverage(), 2))
                .average().orElse(0));
            features.put("ts_cv", std / stats.getAverage());
        }

        // Share of zero entries (e.g. days without any spend).
        long zeroCount = timeSeries.stream().filter(v -> v == 0).count();
        features.put("ts_zero_ratio", (double) zeroCount / n);

        return features;
    }

    /**
     * Ordinary least-squares slope of {@code series} against its indices 0..n-1.
     * Returns 0 when the denominator is 0.
     */
    private double computeSlope(List<Double> series) {
        int n = series.size();
        double sumX = 0, sumY = 0, sumXY = 0, sumX2 = 0;
        int index = 0;
        // Iterate instead of calling get(i) so non-random-access lists stay linear.
        for (double y : series) {
            // Widen the index to double BEFORE squaring: an int product i * i overflows
            // once i exceeds 46340 and silently corrupts the slope of long series.
            double x = index++;
            sumX += x;
            sumY += y;
            sumXY += x * y;
            sumX2 += x * x;
        }
        double denom = n * sumX2 - sumX * sumX;
        if (denom == 0) {
            return 0;
        }
        return (n * sumXY - sumX * sumY) / denom;
    }
}

3.4 LLM Embedding 特征的接入

现在说说怎么把 LLM 产生的 Embedding 整合进来：

@Service
public class EmbeddingFeatureService {

    @Autowired
    private EmbeddingApiClient embeddingClient;

    @Autowired
    private RedisTemplate<String, double[]> redisTemplate;

    private static final String EMBEDDING_CACHE_PREFIX = "embedding:";
    private static final Duration CACHE_TTL = Duration.ofHours(24);

    /**
     * Returns the embedding of {@code text}, going through the cache first.
     *
     * <p>The cache key is {@code "embedding:"} followed by {@code DigestUtils.md5Hex(text)}.
     * On a miss the embedding client is called and the result is stored with a 24-hour TTL.
     *
     * @param text text to embed
     * @return the cached or freshly computed embedding
     */
    public double[] getEmbedding(String text) {
        String cacheKey = EMBEDDING_CACHE_PREFIX + DigestUtils.md5Hex(text);

        // Cache lookup first.
        double[] cached = redisTemplate.opsForValue().get(cacheKey);
        if (cached != null) {
            return cached;
        }

        // Call the embedding API (e.g. text-embedding-3-small).
        double[] embedding = embeddingClient.embed(text);

        // Populate the cache for later calls.
        redisTemplate.opsForValue().set(cacheKey, embedding, CACHE_TTL);

        return embedding;
    }

    /**
     * Reduces a high-dimensional embedding to a low-dimensional feature vector by a linear
     * projection: {@code reduced[i]} is the dot product of {@code pcaMatrix[i]} with the
     * embedding. The matrix is expected to be a pre-trained PCA projection.
     *
     * @param embedding the embedding to project
     * @param pcaMatrix projection matrix with one row per target dimension; every row must
     *                  have exactly {@code embedding.length} entries
     * @return vector of length {@code pcaMatrix.length}
     * @throws IllegalArgumentException if a row's length differs from the embedding's length
     */
    public double[] reduceDimension(double[] embedding, double[][] pcaMatrix) {
        int targetDim = pcaMatrix.length;
        double[] reduced = new double[targetDim];
        for (int i = 0; i < targetDim; i++) {
            double[] component = pcaMatrix[i];
            // A shorter row used to fail with ArrayIndexOutOfBoundsException and a longer
            // one was silently truncated; both mean the matrix does not fit this embedding.
            if (component.length != embedding.length) {
                throw new IllegalArgumentException(
                    "PCA row " + i + " has length " + component.length
                        + " but the embedding has length " + embedding.length);
            }
            double projection = 0;
            for (int j = 0; j < embedding.length; j++) {
                projection += component[j] * embedding[j];
            }
            reduced[i] = projection;
        }
        return reduced;
    }

    /**
     * Semantic-similarity feature: cosine similarity between the embeddings of two texts
     * (e.g. a user query and a product description).
     *
     * @throws IllegalArgumentException if the two embeddings have different lengths
     */
    public double computeSemanticSimilarity(String text1, String text2) {
        double[] emb1 = getEmbedding(text1);
        double[] emb2 = getEmbedding(text2);
        return cosineSimilarity(emb1, emb2);
    }

    /**
     * Cosine similarity of two equally sized vectors; 0 when either vector has zero norm.
     */
    private double cosineSimilarity(double[] a, double[] b) {
        // Without this check a longer `a` overruns `b`, and a longer `b` is silently
        // truncated, which yields a number that is not a cosine similarity at all.
        if (a.length != b.length) {
            throw new IllegalArgumentException(
                "Embedding dimensions differ: " + a.length + " vs " + b.length);
        }
        double dotProduct = 0, normA = 0, normB = 0;
        for (int i = 0; i < a.length; i++) {
            dotProduct += a[i] * b[i];
            normA += a[i] * a[i];
            normB += b[i] * b[i];
        }
        double denom = Math.sqrt(normA) * Math.sqrt(normB);
        return denom == 0 ? 0 : dotProduct / denom;
    }
}

3.5 特征融合：把所有特征汇总

@Service
public class FeatureFusionService {

    /** Look-back windows, in days, for the behaviour statistics. */
    private static final int[] BEHAVIOR_WINDOWS_DAYS = {7, 14, 30, 90};

    /** Maximum number of embedding components copied into the feature vector. */
    private static final int EMBEDDING_FEATURE_DIMS = 32;

    /** Value written in place of a missing or non-finite feature. */
    private static final double MISSING_VALUE_FILL = 0.0;

    @Autowired
    private UserBehaviorFeatureExtractor behaviorExtractor;

    @Autowired
    private TimeSeriesFeatureExtractor tsExtractor;

    @Autowired
    private EmbeddingFeatureService embeddingService;

    /**
     * Builds the complete feature vector: traditional features plus LLM semantic features.
     *
     * <p>Features are added in a fixed order (behaviour windows, time-series features,
     * embedding components, template similarities) into an insertion-ordered map, and any
     * missing or non-finite value is replaced before the vector is returned.
     *
     * @param context per-user input: events, amount series, free text and template texts
     * @return the feature vector for {@code context.getUserId()}
     */
    public FeatureVector buildFeatureVector(UserContext context) {
        Map<String, Double> allFeatures = new LinkedHashMap<>();

        // 1. Behaviour statistics over several time windows.
        for (int window : BEHAVIOR_WINDOWS_DAYS) {
            Map<String, Double> windowFeatures = behaviorExtractor
                .extractTimeWindowFeatures(
                    context.getUserId(), window, context.getEvents());
            allFeatures.putAll(windowFeatures);
        }

        // 2. Time-series features over the spend-amount series.
        if (context.getAmountSeries() != null) {
            Map<String, Double> tsFeatures = tsExtractor
                .extractTsFeatures(context.getAmountSeries());
            allFeatures.putAll(tsFeatures);
        }

        // 3. LLM semantic features, only when there is text to embed.
        String userText = context.getUserText();
        if (userText != null && !userText.isEmpty()) {
            double[] embedding = embeddingService.getEmbedding(userText);
            // NOTE: this keeps the first components of the raw embedding as a stand-in for
            // real dimensionality reduction; production code should project with a
            // pre-trained PCA matrix (see EmbeddingFeatureService.reduceDimension).
            int dims = Math.min(EMBEDDING_FEATURE_DIMS, embedding.length);
            for (int i = 0; i < dims; i++) {
                allFeatures.put("emb_dim_" + i, embedding[i]);
            }

            // Semantic similarity against the standard business descriptions.
            // A context without templates simply contributes no similarity features.
            Map<String, String> templateTexts = context.getTemplateTexts();
            if (templateTexts != null) {
                for (Map.Entry<String, String> template : templateTexts.entrySet()) {
                    double similarity = embeddingService.computeSemanticSimilarity(
                        userText, template.getValue());
                    allFeatures.put("semantic_sim_" + template.getKey(), similarity);
                }
            }
        }

        // 4. Missing-value fill. This writes a constant 0.0, NOT a median; production code
        //    should use per-feature statistics computed offline. Infinite values are
        //    replaced too, matching FeatureExtractor.isValid, which treats null, NaN and
        //    infinite values as invalid.
        allFeatures.replaceAll((name, value) ->
            (value == null || !Double.isFinite(value)) ? MISSING_VALUE_FILL : value);

        return new FeatureVector(context.getUserId(), allFeatures);
    }
}

四、特征重要性分析与筛选

特征做了一堆，哪些真正有用？这一步非常关键，不做特征筛选会导致维度爆炸、过拟合、推理变慢。

@Service
public class FeatureImportanceAnalyzer {

    /**
     * 用 Permutation Importance 评估特征重要性
     * （模型无关的特征重要性评估方法）
     */
    public Map<String, Double> computePermutationImportance(
            List<FeatureVector> testSet,
            List<Double> trueLabels,
            Predictor model) {

        // 基准性能
        double baselineScore = evaluateAUC(testSet, trueLabels, model);

        Map<String, Double> importanceMap = new HashMap<>();
        List<String> featureNames = new ArrayList<>(testSet.get(0).getFeatures().keySet());

        for (String featureName : featureNames) {
            // 随机打乱这个特征的值
            List<FeatureVector> permutedSet = permuteFeature(testSet, featureName);
            double permutedScore = evaluateAUC(permutedSet, trueLabels, model);

            // 重要性 = 基准性能下降幅度
            importanceMap.put(featureName, baselineScore - permutedScore);
        }

        // 按重要性排序
        return importanceMap.entrySet().stream()
            .sorted(Map.Entry.<String, Double>comparingByValue().reversed())
            .collect(Collectors.toMap(
                Map.Entry::getKey,
                Map.Entry::getValue,
                (e1, e2) -> e1,
                LinkedHashMap::new));
    }

    private List<FeatureVector> permuteFeature(
            List<FeatureVector> data, String featureName) {
        List<Double> values = data.stream()
            .map(fv -> fv.getFeatures().get(featureName))
            .collect(Collectors.toList());
        Collections.shuffle(values);

        List<FeatureVector> permuted = new ArrayList<>();
        for (int i = 0; i < data.size(); i++) {
            Map<String, Double> newFeatures = new HashMap<>(data.get(i).getFeatures());
            newFeatures.put(featureName, values.get(i));
            permuted.add(new FeatureVector(data.get(i).getUserId(), newFeatures));
        }
        return permuted;
    }

    private double evaluateAUC(List<FeatureVector> testSet,
                                List<Double> trueLabels,
                                Predictor model) {
        // 简化实现，实际用 Apache Commons Math 的 ROC 计算
        List<Double> scores = testSet.stream()
            .map(model::predict)
            .collect(Collectors.toList());
        return computeAUC(scores, trueLabels);
    }

    private double computeAUC(List<Double> scores, List<Double> labels) {
        // 标准 AUC 计算（梯形积分法），这里略去细节
        return 0.0; // placeholder
    }
}

五、工程上的几点踩坑经验

坑一：训练时的特征分布和预测时不一致

最典型的问题：离线训练时用了"未来数据"构造特征（Data Leakage）。比如用全量数据（包含测试集）来计算归一化参数；正确做法是只在训练集上计算归一化参数，再把同一套参数原样应用到测试集和线上数据。解决方案是严格按时间切分，特征的统计量只能用训练集里的历史数据计算。

坑二：特征缓存失效导致线上线下不一致

在线服务的特征是从缓存里取的，离线训练时是重新计算的。如果缓存 TTL 设置不合理，线上的特征值可能是几天前的，和模型期望的完全不同。我们后来加了特征新鲜度监控，超过一定时间未更新的特征会触发告警。

坑三：LLM Embedding 维度太高直接拼接

早期我们把 1536 维的 Embedding 直接拼接到 50 维的结构化特征里，结果 XGBoost 几乎完全忽视了结构化特征，因为 Embedding 的维度太多导致"稀释"了结构化信号。后来统一做降维到 32 维再拼接，两类特征才能被模型公平对待。

坑四：特征版本管理

这个前面文章也提到过，特征版本是个容易被忽视的问题。线上跑的特征提取逻辑升级了，但旧模型依然在用旧特征空间。我们后来把特征 Schema（特征名称列表+类型）和模型绑定存储，每次特征 Schema 发生变化就必须重新训练模型。

六、小结

传统 ML 的特征工程在 LLM 时代没有过时，而是找到了更合适的位置——处理那些 LLM 真正不擅长的东西：精细的数值统计、时序模式、低延迟在线特征，以及业务专家知识的显式表达。

把两者的优势结合起来，才是企业级 AI 系统真正的工程之道。