第2028篇:LLM服务的多租户资源隔离——一台GPU服务多个业务线
2026/4/30大约 6 分钟
第2028篇:LLM服务的多租户资源隔离——一台GPU服务多个业务线
适读人群:需要在共享GPU资源上为多个业务线提供LLM服务的工程师 | 阅读时长:约6分钟 | 核心价值:设计公平、可控的多租户LLM服务,避免业务线之间互相影响
采购了一台A100 80GB,按说跑一个7B模型绰绰有余。但公司里有三个业务线——客服系统、营销系统、内部知识库——都要用LLM。
最简单的方案是共享:三个业务线都调用同一个LLM服务端点。
上线一个月后,营销系统搞了一个批量生成活动文案的功能,一次提交1000条任务,把GPU打满了,客服系统的实时对话全部超时。运营来投诉,说客服AI挂了。
这就是多租户资源隔离的必要性。
多租户的资源隔离层次
解决这个问题有几个层次,从简单到复杂:层次1是应用层限流(按租户做限速与并发控制);层次2是优先级队列(关键业务优先调度);层次3是GPU硬件级隔离(如MIG切分或独立部署)。
大多数场景下,层次2(优先级队列)是最好的平衡点:成本不增加,同时保证关键业务的服务质量。
应用层限流实现
先实现最基础的每租户限流:
/**
 * Multi-tenant LLM request gateway.
 *
 * <p>Every tenant (business line) carries its own quota configuration; a
 * request is rejected before reaching the model when the tenant's rate
 * limit or concurrency cap is exceeded. Usage is recorded asynchronously
 * for monthly statistics and billing.
 */
@Service
@RequiredArgsConstructor
@Slf4j
public class TenantAwareLlmGateway {

    private final ChatClient llmClient;
    private final StringRedisTemplate redis;
    private final TenantConfigRepository tenantConfigRepo;
    // FIX: recordUsage() referenced this repository but the field was never declared.
    private final LlmUsageRepository usageRepository;

    /**
     * Calls the LLM on behalf of a tenant with quota enforcement.
     *
     * @param tenantId tenant (business line) identifier
     * @param request  chat request; maxTokens is clamped to the tenant cap
     * @return the successful LLM response
     * @throws IllegalStateException      when no config exists for the tenant
     * @throws TenantRateLimitException   requests-per-minute quota exceeded
     * @throws TenantConcurrencyException concurrent-request quota exceeded
     */
    public LlmResponse call(String tenantId, ChatRequest request) {
        TenantConfig config = tenantConfigRepo.findByTenantId(tenantId);
        // FIX: an unknown tenantId previously surfaced as an opaque NPE below.
        if (config == null) {
            throw new IllegalStateException("未找到租户配置: " + tenantId);
        }
        // 1. Rate-limit check (requests per minute).
        if (!checkRateLimit(tenantId, config)) {
            throw new TenantRateLimitException(
                    String.format("租户[%s]请求超限,当前配额: %d req/min,请降低请求频率",
                            tenantId, config.getMaxRequestsPerMinute()));
        }
        // 2. Concurrency control.
        if (!acquireConcurrentSlot(tenantId, config)) {
            throw new TenantConcurrencyException(
                    String.format("租户[%s]并发超限,当前最大并发: %d",
                            tenantId, config.getMaxConcurrentRequests()));
        }
        long start = System.currentTimeMillis();
        try {
            String content = llmClient.prompt()
                    .system(request.getSystemPrompt())
                    .user(request.getUserMessage())
                    .options(ChatOptions.builder()
                            // Clamp to the tenant's per-request token cap.
                            .maxTokens(Math.min(request.getMaxTokens(), config.getMaxTokensPerRequest()))
                            .build())
                    .call()
                    .content();
            // 3. Record usage (monthly statistics and billing).
            recordUsage(tenantId, request, content, System.currentTimeMillis() - start);
            return LlmResponse.success(content);
        } finally {
            releaseConcurrentSlot(tenantId);
        }
    }

    /**
     * Redis fixed-window rate limiter (one counter per calendar minute).
     *
     * <p>NOTE: despite the original comment this is a FIXED window, not a
     * sliding one — up to 2x the quota can pass across a window boundary.
     * Acceptable here; use a Lua-scripted sliding window if that matters.
     */
    private boolean checkRateLimit(String tenantId, TenantConfig config) {
        String key = "llm:ratelimit:" + tenantId + ":" +
                System.currentTimeMillis() / 60000; // key of the current minute
        Long current = redis.opsForValue().increment(key);
        if (current == 1L) {
            // First hit in this window created the key; expire after the window.
            redis.expire(key, 90, TimeUnit.SECONDS);
        }
        return current <= config.getMaxRequestsPerMinute();
    }

    /**
     * Redis-based concurrency gate (counter used as a distributed semaphore).
     * The 300s TTL is a leak guard in case release is never reached
     * (e.g. the process dies between acquire and the finally block).
     */
    private boolean acquireConcurrentSlot(String tenantId, TenantConfig config) {
        String key = "llm:concurrent:" + tenantId;
        Long current = redis.opsForValue().increment(key);
        if (current == 1L) {
            redis.expire(key, 300, TimeUnit.SECONDS);
        }
        if (current > config.getMaxConcurrentRequests()) {
            redis.opsForValue().decrement(key); // roll back the over-limit acquire
            return false;
        }
        return true;
    }

    /** Releases one concurrency slot for the tenant. */
    private void releaseConcurrentSlot(String tenantId) {
        redis.opsForValue().decrement("llm:concurrent:" + tenantId);
    }

    /**
     * Persists a usage record asynchronously so it never blocks the response.
     * FIX: failures are now logged instead of being silently swallowed by the
     * detached CompletableFuture.
     */
    private void recordUsage(String tenantId, ChatRequest request,
                             String response, long latencyMs) {
        LlmUsageRecord record = LlmUsageRecord.builder()
                .tenantId(tenantId)
                .inputTokens(estimateTokens(request.getUserMessage()))
                .outputTokens(estimateTokens(response))
                .latencyMs(latencyMs)
                .timestamp(LocalDateTime.now())
                .build();
        CompletableFuture.runAsync(() -> usageRepository.save(record))
                .exceptionally(ex -> {
                    log.warn("用量记录写入失败: tenant={}", tenantId, ex);
                    return null;
                });
    }

    /**
     * Rough token estimate: ~2 chars per token (between the Chinese
     * ~1.5 chars/token and English ~4 chars/token rules of thumb).
     * FIX: tolerates a null text (e.g. an empty model response).
     */
    private int estimateTokens(String text) {
        return text == null ? 0 : text.length() / 2;
    }
}
优先级队列实现
仅靠限流还不够,营销系统的批量任务即使被限速,还是可能占用所有的并发槽位。需要优先级队列来保证高优先级业务不排队:
/**
 * Priority queue in front of the LLM backend.
 *
 * Priority levels:
 * - CRITICAL(3): real-time customer-service chat, must respond immediately
 * - HIGH(2):     user-triggered interactive requests
 * - NORMAL(1):   background tasks that may wait
 * - LOW(0):      offline batch jobs, processed only when idle
 */
@Service
@Slf4j
public class PriorityLlmQueue {

    // FIX: the capacity was duplicated as a magic number 1000 in both the
    // constructor and the submit() overload check.
    private static final int MAX_QUEUE_SIZE = 1000;

    // Higher priority first; FIFO (enqueue time) within the same priority.
    private final PriorityBlockingQueue<PrioritizedRequest> requestQueue =
            new PriorityBlockingQueue<>(MAX_QUEUE_SIZE,
                    Comparator.comparingInt(PrioritizedRequest::getPriority).reversed()
                            .thenComparingLong(PrioritizedRequest::getEnqueueTime));

    // Worker pool that performs the actual (blocking) LLM calls.
    private final ExecutorService workerPool = Executors.newFixedThreadPool(20);
    private final ChatClient llmClient;
    // FIX: keep a handle on the scheduler thread so shutdown() can stop it.
    private final Thread scheduler;

    public PriorityLlmQueue(ChatClient llmClient) {
        this.llmClient = llmClient;
        this.scheduler = startQueueProcessor();
    }

    /**
     * Submits an LLM request; the returned future completes with the model
     * output (or exceptionally on failure / load shedding).
     *
     * <p>Note: PriorityBlockingQueue is unbounded — the size check below is
     * load shedding, not a hard cap: only LOW-priority traffic is rejected
     * once the backlog reaches MAX_QUEUE_SIZE.
     */
    public CompletableFuture<String> submit(
            String tenantId,
            ChatRequest request,
            RequestPriority priority) {
        CompletableFuture<String> future = new CompletableFuture<>();
        PrioritizedRequest pRequest = PrioritizedRequest.builder()
                .tenantId(tenantId)
                .request(request)
                .priority(priority.getValue())
                .enqueueTime(System.currentTimeMillis())
                .resultFuture(future)
                .build();
        // Shed LOW-priority work when the backlog is full (memory guard).
        if (requestQueue.size() >= MAX_QUEUE_SIZE && priority == RequestPriority.LOW) {
            future.completeExceptionally(
                    new ServiceOverloadException("服务繁忙,请稍后重试"));
            return future;
        }
        requestQueue.offer(pRequest);
        log.debug("请求入队: tenant={}, priority={}, queueSize={}",
                tenantId, priority, requestQueue.size());
        return future;
    }

    /** Starts the daemon thread that drains the queue into the worker pool. */
    private Thread startQueueProcessor() {
        Thread t = new Thread(() -> {
            while (!Thread.currentThread().isInterrupted()) {
                try {
                    PrioritizedRequest request = requestQueue.poll(
                            100, TimeUnit.MILLISECONDS);
                    if (request != null) {
                        long waitTime = System.currentTimeMillis() - request.getEnqueueTime();
                        // Alert signal: high-priority work should never wait this long.
                        if (waitTime > 30000 && request.getPriority() >= RequestPriority.HIGH.getValue()) {
                            log.warn("高优先级请求等待超过30秒: tenant={}", request.getTenantId());
                        }
                        workerPool.submit(() -> processRequest(request));
                    }
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                    break;
                }
            }
        }, "llm-queue-scheduler");
        t.setDaemon(true);
        t.start();
        return t;
    }

    /** Executes one request and routes the result/exception into its future. */
    private void processRequest(PrioritizedRequest pRequest) {
        try {
            String result = llmClient.prompt()
                    .system(pRequest.getRequest().getSystemPrompt())
                    .user(pRequest.getRequest().getUserMessage())
                    .call()
                    .content();
            pRequest.getResultFuture().complete(result);
        } catch (Exception e) {
            pRequest.getResultFuture().completeExceptionally(e);
        }
    }

    /**
     * FIX: the fixed worker pool's non-daemon threads used to leak and keep
     * the JVM alive on context close. Wire this to the bean lifecycle
     * (e.g. @PreDestroy or a destroy-method).
     */
    public void shutdown() {
        scheduler.interrupt();
        workerPool.shutdown();
    }

    /**
     * Queue snapshot for monitoring dashboards.
     * FIX: uses RequestPriority values instead of the magic ints 3/2/1/0.
     */
    public QueueStatus getStatus() {
        Map<Integer, Long> countByPriority = requestQueue.stream()
                .collect(Collectors.groupingBy(
                        PrioritizedRequest::getPriority, Collectors.counting()));
        return QueueStatus.builder()
                .totalPending(requestQueue.size())
                .criticalPending(countByPriority.getOrDefault(RequestPriority.CRITICAL.getValue(), 0L))
                .highPending(countByPriority.getOrDefault(RequestPriority.HIGH.getValue(), 0L))
                .normalPending(countByPriority.getOrDefault(RequestPriority.NORMAL.getValue(), 0L))
                .lowPending(countByPriority.getOrDefault(RequestPriority.LOW.getValue(), 0L))
                .build();
    }
}
租户配置管理
/**
 * Per-tenant resource quota configuration (one row per business line).
 *
 * <p>FIX: {@code @Builder} replaces the implicit default constructor with a
 * package-private all-args one, but JPA requires a no-arg constructor to
 * hydrate the entity — {@code @NoArgsConstructor} and
 * {@code @AllArgsConstructor} restore both so the builder keeps working
 * AND the entity can be loaded.
 */
@Entity
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class TenantConfig {

    @Id
    private String tenantId;
    private String tenantName;

    // Throttling configuration
    private int maxRequestsPerMinute;  // max requests per minute
    private int maxConcurrentRequests; // max in-flight requests
    private int maxTokensPerRequest;   // per-call token cap
    private int maxTokensPerDay;       // daily token budget (cost control)

    // Default priority applied to this tenant's requests
    private RequestPriority defaultPriority;

    /** Example: real-time customer service — highest priority, generous rate. */
    public static TenantConfig customerService() {
        return TenantConfig.builder()
                .tenantId("customer-service")
                .tenantName("客服系统")
                .maxRequestsPerMinute(200)
                .maxConcurrentRequests(30)
                .maxTokensPerRequest(2048)
                .maxTokensPerDay(2_000_000)
                .defaultPriority(RequestPriority.CRITICAL) // real-time service, top priority
                .build();
    }

    /** Example: marketing batch jobs — big token budget, low concurrency and priority. */
    public static TenantConfig marketingBatch() {
        return TenantConfig.builder()
                .tenantId("marketing-batch")
                .tenantName("营销批量任务")
                .maxRequestsPerMinute(60)
                .maxConcurrentRequests(5)      // low concurrency: must not crowd out others
                .maxTokensPerRequest(1024)
                .maxTokensPerDay(5_000_000)    // large budget, but concurrency-bounded
                .defaultPriority(RequestPriority.LOW) // processed only when idle
                .build();
    }
}
监控与告警
多租户系统必须监控每个租户的使用情况:
/**
 * Per-tenant usage monitoring: hourly budget alerts plus daily reports.
 */
@Service
@RequiredArgsConstructor
public class TenantUsageMonitor {

    /** Alert when a tenant has consumed this fraction of its daily budget. */
    private static final double USAGE_ALERT_THRESHOLD = 0.8;

    private final LlmUsageRepository usageRepository;
    private final AlertService alertService;
    private final TenantConfigRepository tenantConfigRepo;

    /**
     * Hourly sweep over all tenants; warns when a tenant's daily token
     * consumption crosses the alert threshold.
     */
    @Scheduled(cron = "0 0 * * * *")
    public void checkDailyUsage() {
        LocalDate today = LocalDate.now();
        for (TenantConfig config : tenantConfigRepo.findAll()) {
            // FIX: a zero/negative budget used to divide by zero (Infinity)
            // and fire a bogus alert; treat it as "no budget configured".
            if (config.getMaxTokensPerDay() <= 0) {
                continue;
            }
            long todayTokens = usageRepository.sumTokensByTenantAndDate(
                    config.getTenantId(), today);
            double usageRate = (double) todayTokens / config.getMaxTokensPerDay();
            if (usageRate > USAGE_ALERT_THRESHOLD) {
                alertService.warn(String.format(
                        "租户[%s]今日token使用量已达%.1f%%(%d/%d),注意控制用量",
                        config.getTenantName(), usageRate * 100,
                        todayTokens, config.getMaxTokensPerDay()));
            }
        }
    }

    /**
     * Builds the daily usage report for one tenant.
     * Returns an all-zero report when the tenant has no records that day.
     */
    public TenantUsageReport getDailyReport(String tenantId, LocalDate date) {
        List<LlmUsageRecord> records = usageRepository.findByTenantAndDate(tenantId, date);
        return TenantUsageReport.builder()
                .tenantId(tenantId)
                .date(date)
                .totalRequests(records.size())
                .totalInputTokens(records.stream().mapToLong(LlmUsageRecord::getInputTokens).sum())
                .totalOutputTokens(records.stream().mapToLong(LlmUsageRecord::getOutputTokens).sum())
                .avgLatencyMs(records.stream().mapToLong(LlmUsageRecord::getLatencyMs).average().orElse(0))
                .p99LatencyMs(p99Latency(records))
                .build();
    }

    /**
     * Nearest-rank p99 latency.
     * FIX: the old skip((long)(n * 0.99)) truncated DOWN and then read the
     * NEXT element — for n=100 that returned the max (p100), not p99.
     * Nearest-rank is ceil(0.99 * n) - 1 (0-based), clamped to >= 0.
     */
    private long p99Latency(List<LlmUsageRecord> records) {
        long skip = Math.max(0, (long) Math.ceil(records.size() * 0.99) - 1);
        return records.stream().mapToLong(LlmUsageRecord::getLatencyMs)
                .sorted().skip(skip).findFirst().orElse(0);
    }
}
多租户资源隔离的核心是公平性和预期可控。每个业务线知道自己的资源上限,关键业务有优先保障,批量任务不会影响实时服务。这个设计比"所有人共享、先来先得"更复杂,但也更稳定。
