第2028篇:LLM服务的多租户资源隔离——一台GPU服务多个业务线
2026/4/30大约 6 分钟
第2028篇:LLM服务的多租户资源隔离——一台GPU服务多个业务线
适读人群:需要在共享GPU资源上为多个业务线提供LLM服务的工程师 | 阅读时长:约6分钟 | 核心价值:设计公平、可控的多租户LLM服务,避免业务线之间互相影响
采购了一台A100 80GB,按说跑一个7B模型绰绰有余。但公司里有三个业务线——客服系统、营销系统、内部知识库——都要用LLM。
最简单的方案是共享:三个业务线都调用同一个LLM服务端点。
上线一个月后,营销系统搞了一个批量生成活动文案的功能,一次提交1000条任务,把GPU打满了,客服系统的实时对话全部超时。运营来投诉,说客服AI挂了。
这就是多租户资源隔离的必要性。
多租户的资源隔离层次
解决这个问题有几个层次,从简单到复杂:层次1是应用层限流(按租户做限速与并发控制);层次2是优先级队列(关键业务优先调度);层次3是GPU硬件级隔离(如MIG切分或独立部署)。
大多数场景下,层次2(优先级队列)是最好的平衡点:成本不增加,同时保证关键业务的服务质量。
应用层限流实现
先实现最基础的每租户限流:
/**
 * Multi-tenant LLM request gateway.
 *
 * <p>Every tenant (business line) carries its own quota configuration; a
 * request is rejected before reaching the model when the tenant's rate
 * limit or concurrency cap is exceeded. Usage is recorded asynchronously
 * for monthly statistics and billing.
 */
@Service
@RequiredArgsConstructor
@Slf4j
public class TenantAwareLlmGateway {

    private final ChatClient llmClient;
    private final StringRedisTemplate redis;
    private final TenantConfigRepository tenantConfigRepo;
    // FIX: recordUsage() referenced this repository but the field was never declared.
    private final LlmUsageRepository usageRepository;

    /**
     * Calls the LLM on behalf of a tenant with quota enforcement.
     *
     * @param tenantId tenant (business line) identifier
     * @param request  chat request; maxTokens is clamped to the tenant cap
     * @return the successful LLM response
     * @throws IllegalStateException      when no config exists for the tenant
     * @throws TenantRateLimitException   requests-per-minute quota exceeded
     * @throws TenantConcurrencyException concurrent-request quota exceeded
     */
    public LlmResponse call(String tenantId, ChatRequest request) {
        TenantConfig config = tenantConfigRepo.findByTenantId(tenantId);
        // FIX: an unknown tenantId previously surfaced as an opaque NPE below.
        if (config == null) {
            throw new IllegalStateException("未找到租户配置: " + tenantId);
        }
        // 1. Rate-limit check (requests per minute).
        if (!checkRateLimit(tenantId, config)) {
            throw new TenantRateLimitException(
                    String.format("租户[%s]请求超限,当前配额: %d req/min,请降低请求频率",
                            tenantId, config.getMaxRequestsPerMinute()));
        }
        // 2. Concurrency control.
        if (!acquireConcurrentSlot(tenantId, config)) {
            throw new TenantConcurrencyException(
                    String.format("租户[%s]并发超限,当前最大并发: %d",
                            tenantId, config.getMaxConcurrentRequests()));
        }
        long start = System.currentTimeMillis();
        try {
            String content = llmClient.prompt()
                    .system(request.getSystemPrompt())
                    .user(request.getUserMessage())
                    .options(ChatOptions.builder()
                            // Clamp to the tenant's per-request token cap.
                            .maxTokens(Math.min(request.getMaxTokens(), config.getMaxTokensPerRequest()))
                            .build())
                    .call()
                    .content();
            // 3. Record usage (monthly statistics and billing).
            recordUsage(tenantId, request, content, System.currentTimeMillis() - start);
            return LlmResponse.success(content);
        } finally {
            releaseConcurrentSlot(tenantId);
        }
    }

    /**
     * Redis fixed-window rate limiter (one counter per calendar minute).
     *
     * <p>NOTE: despite the original comment this is a FIXED window, not a
     * sliding one — up to 2x the quota can pass across a window boundary.
     * Acceptable here; use a Lua-scripted sliding window if that matters.
     */
    private boolean checkRateLimit(String tenantId, TenantConfig config) {
        String key = "llm:ratelimit:" + tenantId + ":" +
                System.currentTimeMillis() / 60000; // key of the current minute
        Long current = redis.opsForValue().increment(key);
        if (current == 1L) {
            // First hit in this window created the key; expire after the window.
            redis.expire(key, 90, TimeUnit.SECONDS);
        }
        return current <= config.getMaxRequestsPerMinute();
    }

    /**
     * Redis-based concurrency gate (counter used as a distributed semaphore).
     * The 300s TTL is a leak guard in case release is never reached
     * (e.g. the process dies between acquire and the finally block).
     */
    private boolean acquireConcurrentSlot(String tenantId, TenantConfig config) {
        String key = "llm:concurrent:" + tenantId;
        Long current = redis.opsForValue().increment(key);
        if (current == 1L) {
            redis.expire(key, 300, TimeUnit.SECONDS);
        }
        if (current > config.getMaxConcurrentRequests()) {
            redis.opsForValue().decrement(key); // roll back the over-limit acquire
            return false;
        }
        return true;
    }

    /** Releases one concurrency slot for the tenant. */
    private void releaseConcurrentSlot(String tenantId) {
        redis.opsForValue().decrement("llm:concurrent:" + tenantId);
    }

    /**
     * Persists a usage record asynchronously so it never blocks the response.
     * FIX: failures are now logged instead of being silently swallowed by the
     * detached CompletableFuture.
     */
    private void recordUsage(String tenantId, ChatRequest request,
                             String response, long latencyMs) {
        LlmUsageRecord record = LlmUsageRecord.builder()
                .tenantId(tenantId)
                .inputTokens(estimateTokens(request.getUserMessage()))
                .outputTokens(estimateTokens(response))
                .latencyMs(latencyMs)
                .timestamp(LocalDateTime.now())
                .build();
        CompletableFuture.runAsync(() -> usageRepository.save(record))
                .exceptionally(ex -> {
                    log.warn("用量记录写入失败: tenant={}", tenantId, ex);
                    return null;
                });
    }

    /**
     * Rough token estimate: ~2 chars per token (between the Chinese
     * ~1.5 chars/token and English ~4 chars/token rules of thumb).
     * FIX: tolerates a null text (e.g. an empty model response).
     */
    private int estimateTokens(String text) {
        return text == null ? 0 : text.length() / 2;
    }
}
优先级队列实现
仅靠限流还不够,营销系统的批量任务即使被限速,还是可能占用所有的并发槽位。需要优先级队列来保证高优先级业务不排队:
/**
 * Priority queue in front of the LLM backend.
 *
 * Priority levels:
 * - CRITICAL(3): real-time customer-service chat, must respond immediately
 * - HIGH(2):     user-triggered interactive requests
 * - NORMAL(1):   background tasks that may wait
 * - LOW(0):      offline batch jobs, processed only when idle
 */
@Service
@Slf4j
public class PriorityLlmQueue {

    // FIX: the capacity was duplicated as a magic number 1000 in both the
    // constructor and the submit() overload check.
    private static final int MAX_QUEUE_SIZE = 1000;

    // Higher priority first; FIFO (enqueue time) within the same priority.
    private final PriorityBlockingQueue<PrioritizedRequest> requestQueue =
            new PriorityBlockingQueue<>(MAX_QUEUE_SIZE,
                    Comparator.comparingInt(PrioritizedRequest::getPriority).reversed()
                            .thenComparingLong(PrioritizedRequest::getEnqueueTime));

    // Worker pool that performs the actual (blocking) LLM calls.
    private final ExecutorService workerPool = Executors.newFixedThreadPool(20);
    private final ChatClient llmClient;
    // FIX: keep a handle on the scheduler thread so shutdown() can stop it.
    private final Thread scheduler;

    public PriorityLlmQueue(ChatClient llmClient) {
        this.llmClient = llmClient;
        this.scheduler = startQueueProcessor();
    }

    /**
     * Submits an LLM request; the returned future completes with the model
     * output (or exceptionally on failure / load shedding).
     *
     * <p>Note: PriorityBlockingQueue is unbounded — the size check below is
     * load shedding, not a hard cap: only LOW-priority traffic is rejected
     * once the backlog reaches MAX_QUEUE_SIZE.
     */
    public CompletableFuture<String> submit(
            String tenantId,
            ChatRequest request,
            RequestPriority priority) {
        CompletableFuture<String> future = new CompletableFuture<>();
        PrioritizedRequest pRequest = PrioritizedRequest.builder()
                .tenantId(tenantId)
                .request(request)
                .priority(priority.getValue())
                .enqueueTime(System.currentTimeMillis())
                .resultFuture(future)
                .build();
        // Shed LOW-priority work when the backlog is full (memory guard).
        if (requestQueue.size() >= MAX_QUEUE_SIZE && priority == RequestPriority.LOW) {
            future.completeExceptionally(
                    new ServiceOverloadException("服务繁忙,请稍后重试"));
            return future;
        }
        requestQueue.offer(pRequest);
        log.debug("请求入队: tenant={}, priority={}, queueSize={}",
                tenantId, priority, requestQueue.size());
        return future;
    }

    /** Starts the daemon thread that drains the queue into the worker pool. */
    private Thread startQueueProcessor() {
        Thread t = new Thread(() -> {
            while (!Thread.currentThread().isInterrupted()) {
                try {
                    PrioritizedRequest request = requestQueue.poll(
                            100, TimeUnit.MILLISECONDS);
                    if (request != null) {
                        long waitTime = System.currentTimeMillis() - request.getEnqueueTime();
                        // Alert signal: high-priority work should never wait this long.
                        if (waitTime > 30000 && request.getPriority() >= RequestPriority.HIGH.getValue()) {
                            log.warn("高优先级请求等待超过30秒: tenant={}", request.getTenantId());
                        }
                        workerPool.submit(() -> processRequest(request));
                    }
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                    break;
                }
            }
        }, "llm-queue-scheduler");
        t.setDaemon(true);
        t.start();
        return t;
    }

    /** Executes one request and routes the result/exception into its future. */
    private void processRequest(PrioritizedRequest pRequest) {
        try {
            String result = llmClient.prompt()
                    .system(pRequest.getRequest().getSystemPrompt())
                    .user(pRequest.getRequest().getUserMessage())
                    .call()
                    .content();
            pRequest.getResultFuture().complete(result);
        } catch (Exception e) {
            pRequest.getResultFuture().completeExceptionally(e);
        }
    }

    /**
     * FIX: the fixed worker pool's non-daemon threads used to leak and keep
     * the JVM alive on context close. Wire this to the bean lifecycle
     * (e.g. @PreDestroy or a destroy-method).
     */
    public void shutdown() {
        scheduler.interrupt();
        workerPool.shutdown();
    }

    /**
     * Queue snapshot for monitoring dashboards.
     * FIX: uses RequestPriority values instead of the magic ints 3/2/1/0.
     */
    public QueueStatus getStatus() {
        Map<Integer, Long> countByPriority = requestQueue.stream()
                .collect(Collectors.groupingBy(
                        PrioritizedRequest::getPriority, Collectors.counting()));
        return QueueStatus.builder()
                .totalPending(requestQueue.size())
                .criticalPending(countByPriority.getOrDefault(RequestPriority.CRITICAL.getValue(), 0L))
                .highPending(countByPriority.getOrDefault(RequestPriority.HIGH.getValue(), 0L))
                .normalPending(countByPriority.getOrDefault(RequestPriority.NORMAL.getValue(), 0L))
                .lowPending(countByPriority.getOrDefault(RequestPriority.LOW.getValue(), 0L))
                .build();
    }
}
租户配置管理
/**
 * Per-tenant resource quota configuration (one row per business line).
 *
 * <p>FIX: {@code @Builder} replaces the implicit default constructor with a
 * package-private all-args one, but JPA requires a no-arg constructor to
 * hydrate the entity — {@code @NoArgsConstructor} and
 * {@code @AllArgsConstructor} restore both so the builder keeps working
 * AND the entity can be loaded.
 */
@Entity
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class TenantConfig {

    @Id
    private String tenantId;
    private String tenantName;

    // Throttling configuration
    private int maxRequestsPerMinute;  // max requests per minute
    private int maxConcurrentRequests; // max in-flight requests
    private int maxTokensPerRequest;   // per-call token cap
    private int maxTokensPerDay;       // daily token budget (cost control)

    // Default priority applied to this tenant's requests
    private RequestPriority defaultPriority;

    /** Example: real-time customer service — highest priority, generous rate. */
    public static TenantConfig customerService() {
        return TenantConfig.builder()
                .tenantId("customer-service")
                .tenantName("客服系统")
                .maxRequestsPerMinute(200)
                .maxConcurrentRequests(30)
                .maxTokensPerRequest(2048)
                .maxTokensPerDay(2_000_000)
                .defaultPriority(RequestPriority.CRITICAL) // real-time service, top priority
                .build();
    }

    /** Example: marketing batch jobs — big token budget, low concurrency and priority. */
    public static TenantConfig marketingBatch() {
        return TenantConfig.builder()
                .tenantId("marketing-batch")
                .tenantName("营销批量任务")
                .maxRequestsPerMinute(60)
                .maxConcurrentRequests(5)      // low concurrency: must not crowd out others
                .maxTokensPerRequest(1024)
                .maxTokensPerDay(5_000_000)    // large budget, but concurrency-bounded
                .defaultPriority(RequestPriority.LOW) // processed only when idle
                .build();
    }
}
监控与告警
多租户系统必须监控每个租户的使用情况:
/**
 * Per-tenant usage monitoring: hourly budget alerts plus daily reports.
 */
@Service
@RequiredArgsConstructor
public class TenantUsageMonitor {

    /** Alert when a tenant has consumed this fraction of its daily budget. */
    private static final double USAGE_ALERT_THRESHOLD = 0.8;

    private final LlmUsageRepository usageRepository;
    private final AlertService alertService;
    private final TenantConfigRepository tenantConfigRepo;

    /**
     * Hourly sweep over all tenants; warns when a tenant's daily token
     * consumption crosses the alert threshold.
     */
    @Scheduled(cron = "0 0 * * * *")
    public void checkDailyUsage() {
        LocalDate today = LocalDate.now();
        for (TenantConfig config : tenantConfigRepo.findAll()) {
            // FIX: a zero/negative budget used to divide by zero (Infinity)
            // and fire a bogus alert; treat it as "no budget configured".
            if (config.getMaxTokensPerDay() <= 0) {
                continue;
            }
            long todayTokens = usageRepository.sumTokensByTenantAndDate(
                    config.getTenantId(), today);
            double usageRate = (double) todayTokens / config.getMaxTokensPerDay();
            if (usageRate > USAGE_ALERT_THRESHOLD) {
                alertService.warn(String.format(
                        "租户[%s]今日token使用量已达%.1f%%(%d/%d),注意控制用量",
                        config.getTenantName(), usageRate * 100,
                        todayTokens, config.getMaxTokensPerDay()));
            }
        }
    }

    /**
     * Builds the daily usage report for one tenant.
     * Returns an all-zero report when the tenant has no records that day.
     */
    public TenantUsageReport getDailyReport(String tenantId, LocalDate date) {
        List<LlmUsageRecord> records = usageRepository.findByTenantAndDate(tenantId, date);
        return TenantUsageReport.builder()
                .tenantId(tenantId)
                .date(date)
                .totalRequests(records.size())
                .totalInputTokens(records.stream().mapToLong(LlmUsageRecord::getInputTokens).sum())
                .totalOutputTokens(records.stream().mapToLong(LlmUsageRecord::getOutputTokens).sum())
                .avgLatencyMs(records.stream().mapToLong(LlmUsageRecord::getLatencyMs).average().orElse(0))
                .p99LatencyMs(p99Latency(records))
                .build();
    }

    /**
     * Nearest-rank p99 latency.
     * FIX: the old skip((long)(n * 0.99)) truncated DOWN and then read the
     * NEXT element — for n=100 that returned the max (p100), not p99.
     * Nearest-rank is ceil(0.99 * n) - 1 (0-based), clamped to >= 0.
     */
    private long p99Latency(List<LlmUsageRecord> records) {
        long skip = Math.max(0, (long) Math.ceil(records.size() * 0.99) - 1);
        return records.stream().mapToLong(LlmUsageRecord::getLatencyMs)
                .sorted().skip(skip).findFirst().orElse(0);
    }
}
多租户资源隔离的核心是公平性和预期可控。每个业务线知道自己的资源上限,关键业务有优先保障,批量任务不会影响实时服务。这个设计比"所有人共享、先来先得"更复杂,但也更稳定。
