[Feature] Support KV Cache Storage (#5571)
* Support Mooncake Store
* up
* up
* add op
* fix conflict
* fix error
* up for comments
* avoid thread lock
* up
* fix unittest
* fix unittest
* remove debug info
* consider tp_size > 1
* add default rdma_nics
* add utils
* up
* fix error

---------

Co-authored-by: YuBaoku <49938469+EmmonsCurse@users.noreply.github.com>
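For context, the scheduling-side integration can be summarized as the sketch below. It is a minimal illustration, not FastDeploy code: `CacheConfig`, `Request`, and `StorageBackend` are simplified stand-ins invented for the example, and the lookup is reduced to a stub. Only the call pattern (allocate GPU blocks, ask the storage backend which whole blocks it can fill, then shrink the number of tokens that still need prefill) follows the diff that follows.

```python
# Minimal sketch of how a storage-backed KV cache plugs into prefill scheduling.
# All classes below are simplified stand-ins invented for this example, not
# FastDeploy APIs; only the call pattern mirrors the diff that follows.
from dataclasses import dataclass, field


@dataclass
class CacheConfig:
    block_size: int = 64
    enable_prefix_caching: bool = True
    kvcache_storage_backend: str = "mooncake"  # assumed config knob


@dataclass
class Request:
    prompt_token_ids: list
    block_tables: list = field(default_factory=list)
    num_computed_tokens: int = 0


class StorageBackend:
    """Hypothetical remote KV-cache store (e.g. a Mooncake-like service)."""

    def match_blocks(self, request: Request, gpu_block_ids: list) -> list:
        # A real backend would hash block-sized token chunks, look them up,
        # and prefetch the hits into the freshly allocated GPU blocks.
        return []  # nothing matched in this stub


def schedule_prefill(request: Request, cfg: CacheConfig, backend: StorageBackend,
                     new_gpu_block_ids: list) -> int:
    """Return how many prompt tokens still need to be prefilled on the GPU."""
    num_new_tokens = len(request.prompt_token_ids) - request.num_computed_tokens
    request.block_tables.extend(new_gpu_block_ids)
    if cfg.enable_prefix_caching and cfg.kvcache_storage_backend and num_new_tokens >= cfg.block_size:
        matched = backend.match_blocks(request, new_gpu_block_ids)
        # Every matched block is block_size tokens the GPU no longer has to compute.
        num_new_tokens -= len(matched) * cfg.block_size
        request.num_computed_tokens += len(matched) * cfg.block_size
    return num_new_tokens
```

In the actual change the equivalent bookkeeping lives in `get_storage_cached_blocks`, which also updates the request metrics and Prometheus counters.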
@@ -670,7 +670,8 @@ class ResourceManagerV1(ResourceManager):
                         request, self.config.cache_config.block_size, request.num_computed_tokens
                     )
                 req_index += 1
-            # schedule the WAITING requests.
+
+            # Second, schedule the WAITING requests.
             if not preempted_reqs:
                 skip_requests: list[Request] = []
                 while self.waiting and token_budget > 0:
@@ -699,10 +700,7 @@ class ResourceManagerV1(ResourceManager):
                         self._update_mm_hashes(request)
                         # Enable prefix caching
                         if self.config.cache_config.enable_prefix_caching:
-                            if (
-                                self.config.cache_config.enable_hierarchical_cache
-                                and self.cache_manager.num_cpu_blocks > 0
-                            ):
+                            if self.cache_manager.num_cpu_blocks > 0:
                                 if not self.cache_manager.can_allocate_gpu_blocks(
                                     (request.need_prefill_tokens + self.config.cache_config.block_size - 1)
                                     // self.config.cache_config.block_size
@@ -713,12 +711,21 @@ class ResourceManagerV1(ResourceManager):
                                     self._free_blocks(request)
                                     break
 
+                        # Allocate blocks for the tokens that does not hit cache
                         num_new_tokens = self._get_num_new_tokens(request, token_budget)
                         num_new_block = self.get_new_block_nums(request, num_new_tokens)
-                        # Allocate blocks to prefill
                         if self.cache_manager.can_allocate_gpu_blocks(num_new_block):
                             if not request.get("skip_allocate", False):
-                                request.block_tables.extend(self.cache_manager.allocate_gpu_blocks(num_new_block))
+                                extra_gpu_block_ids = self.cache_manager.allocate_gpu_blocks(num_new_block)
+                                request.block_tables.extend(extra_gpu_block_ids)
+                                if (
+                                    self.config.cache_config.enable_prefix_caching
+                                    and self.config.cache_config.kvcache_storage_backend
+                                    and num_new_tokens >= self.config.cache_config.block_size
+                                ):
+                                    matched_block_ids = self.get_storage_cached_blocks(request, extra_gpu_block_ids)
+                                    num_new_tokens -= len(matched_block_ids) * self.config.cache_config.block_size
+
                             self.waiting.popleft()
                             self.running.append(request)
                             scheduled_reqs.append(self._prepare_prefill_task(request, num_new_tokens))
@@ -744,10 +751,7 @@ class ResourceManagerV1(ResourceManager):
                             request.num_total_tokens
                         )  # Before preempted task rescheduled, preempted task has been sent to engine, no more tokens are output, here num_total_tokens should be static and correct
                         if self.config.cache_config.enable_prefix_caching:
-                            if (
-                                self.config.cache_config.enable_hierarchical_cache
-                                and self.cache_manager.num_cpu_blocks > 0
-                            ):
+                            if self.cache_manager.num_cpu_blocks > 0:
                                 if not self.cache_manager.can_allocate_gpu_blocks(
                                     (request.need_prefill_tokens + self.config.cache_config.block_size - 1)
                                     // self.config.cache_config.block_size
@@ -757,12 +761,22 @@ class ResourceManagerV1(ResourceManager):
                                 if not success:
                                     self._free_blocks(request)
                                     break
+
+                        # Allocate blocks for the tokens that does not hit cache
                         num_new_tokens = self._get_num_new_tokens(request, token_budget)
                         num_new_block = self.get_new_block_nums(request, num_new_tokens)
-                        # Allocate blocks to prefill
                         if self.cache_manager.can_allocate_gpu_blocks(num_new_block):
                             if not request.get("skip_allocate", False):
-                                request.block_tables.extend(self.cache_manager.allocate_gpu_blocks(num_new_block))
+                                extra_gpu_block_ids = self.cache_manager.allocate_gpu_blocks(num_new_block)
+                                request.block_tables.extend(extra_gpu_block_ids)
+                                if (
+                                    self.config.cache_config.enable_prefix_caching
+                                    and self.config.cache_config.kvcache_storage_backend
+                                    and num_new_tokens >= self.config.cache_config.block_size
+                                ):
+                                    matched_block_ids = self.get_storage_cached_blocks(request, extra_gpu_block_ids)
+                                    num_new_tokens -= len(matched_block_ids) * self.config.cache_config.block_size
+
                             self.waiting.popleft()
                             self.running.append(request)
                             scheduled_reqs.append(self._prepare_prefill_task(request, num_new_tokens))
@@ -916,28 +930,61 @@ class ResourceManagerV1(ResourceManager):
             )
 
             request.num_cached_tokens = matched_token_num
-            request.gpu_cache_token_num = hit_info["gpu_match_token_num"]
-            request.cpu_cache_token_num = hit_info["cpu_match_token_num"]
-            request.cache_info = (matched_block_num, no_cache_block_num)
+            request.metrics.gpu_cache_token_num = hit_info["gpu_match_token_num"]
+            request.metrics.cpu_cache_token_num = hit_info["cpu_match_token_num"]
+            request.cache_info = [matched_block_num, no_cache_block_num]
             request.block_tables = common_block_ids
             request.skip_allocate = False
 
             # Report the number of cached tokens to Prometheus metrics
             main_process_metrics.prefix_cache_token_num.inc(matched_token_num)
-            main_process_metrics.prefix_gpu_cache_token_num.inc(request.gpu_cache_token_num)
-            main_process_metrics.prefix_cpu_cache_token_num.inc(request.cpu_cache_token_num)
+            main_process_metrics.prefix_gpu_cache_token_num.inc(request.metrics.gpu_cache_token_num)
+            main_process_metrics.prefix_cpu_cache_token_num.inc(request.metrics.gpu_cache_token_num)
 
             if matched_token_num == request.need_prefill_tokens:
                 request.num_computed_tokens = matched_token_num - self.config.cache_config.block_size
                 request.skip_allocate = True
             else:
                 request.num_computed_tokens = matched_token_num
-            request.cache_prepare_time = time.time() - cache_prepare_time
+            request.metrics.gpu_cpu_cache_prepare_time = time.time() - cache_prepare_time
             return True
         except Exception as e:
             llm_logger.error(f"prefix match blocks error: {e}, {str(traceback.format_exc())} waiting reschedule...")
             return False
 
+    def get_storage_cached_blocks(self, request: Request, extra_gpu_block_ids: list = []):
+        """
+        Match and prefetch the cached blocks from the storage backend.
+        TODO: merge this function into get_prefix_cached_blocks
+        """
+        try:
+            tic = time.time()
+            req_id = request.request_id
+            llm_logger.debug(f"get_storage_cached_blocks start process req {req_id}")
+            matched_block_ids = self.cache_manager.request_match_storage_blocks(request, extra_gpu_block_ids)
+            llm_logger.debug(
+                f"matched {len(matched_block_ids)} blocks from storage for req_id:{req_id}, "
+                f"cost_time: {time.time() - tic:.6f}s"
+            )
+
+            matched_token_num = len(matched_block_ids) * self.config.cache_config.block_size
+            request.metrics.storage_cache_token_num = matched_token_num
+            request.num_computed_tokens += matched_token_num
+            if request.num_computed_tokens == request.need_prefill_tokens:
+                request.num_computed_tokens = request.num_computed_tokens - self.config.cache_config.block_size
+            request.metrics.storage_cache_prepare_time = time.time() - tic
+            request.cache_info[0] += len(matched_block_ids)  # matched_block_num
+            request.cache_info[1] -= len(matched_block_ids)  # no_cache_block_num
+
+            main_process_metrics.prefix_cache_token_num.inc(matched_token_num)
+            # TODO: main_process_metrics.prefix_storage_cache_token_num.inc(matched_token_num)
+            return matched_block_ids
+        except Exception as e:
+            llm_logger.error(
+                f"get_storage_cached_blocks process req {req_id}, error: {e}, {str(traceback.format_exc())} "
+            )
+            return []
+
     def add_request(self, request: Request) -> None:
         with self.lock:
             self.apply_async_preprocess(request)
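One detail worth calling out in `get_storage_cached_blocks`: when storage hits plus the prefix cache cover the entire prompt, `num_computed_tokens` is backed off by one block, mirroring the existing prefix-cache path above, presumably so at least one block is still recomputed and the prefill step produces output for the final prompt token. A small worked example with made-up numbers:

```python
# Worked example of the bookkeeping in get_storage_cached_blocks (numbers made up).
block_size = 64
need_prefill_tokens = 256          # 4 full blocks of prompt
num_computed_tokens = 128          # 2 blocks already hit the GPU prefix cache
matched_storage_blocks = 2         # the remaining 2 blocks are found in storage

num_computed_tokens += matched_storage_blocks * block_size   # -> 256, whole prompt cached
if num_computed_tokens == need_prefill_tokens:
    # Back off one block so prefill still runs over the last block of the prompt.
    num_computed_tokens -= block_size                         # -> 192
print(need_prefill_tokens - num_computed_tokens)              # 64 tokens left to prefill
```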
@@ -980,7 +1027,7 @@ class ResourceManagerV1(ResourceManager):
            ) // self.config.cache_config.block_size + self.config.cache_config.enc_dec_block_num  # consider for mtp, plus enc_dec_block_num
            if self.config.cache_config.enable_prefix_caching:
                # Enable prefix caching
-               if self.config.cache_config.enable_hierarchical_cache and self.cache_manager.num_cpu_blocks > 0:
+               if self.cache_manager.num_cpu_blocks > 0:
                    if not self.cache_manager.can_allocate_gpu_blocks(
                        need_prealloc_prefill_blocks
                    ):  # to prevent block allocation for matching in hierarchical cache and cause dead lock
@@ -992,7 +1039,10 @@ class ResourceManagerV1(ResourceManager):
 
            need_extra_prefill_blocks = need_prealloc_prefill_blocks - request.cache_info[0]
            if self.cache_manager.can_allocate_gpu_blocks(need_extra_prefill_blocks):
-               request.block_tables.extend(self.cache_manager.allocate_gpu_blocks(need_extra_prefill_blocks))
+               extra_gpu_block_ids = self.cache_manager.allocate_gpu_blocks(need_extra_prefill_blocks)
+               if self.config.cache_config.enable_prefix_caching:
+                   self.get_storage_cached_blocks(request, extra_gpu_block_ids)
+               request.block_tables.extend(extra_gpu_block_ids)
                allocated_position = self.get_available_position()
                request.idx = allocated_position
                self.tasks_list[request.idx] = request
@@ -1113,39 +1163,45 @@ class ResourceManagerV1(ResourceManager):
     def finish_requests(self, request_ids: Union[str, Iterable[str]]):
         llm_logger.info(f"recycle resources for requests: {request_ids}")
         try:
+            if isinstance(request_ids, str):
+                request_ids = (request_ids,)
+            else:
+                request_ids = set(request_ids)
+
+            need_postprocess_reqs = []
             with self.lock:
-                if isinstance(request_ids, str):
-                    request_ids = (request_ids,)
-                else:
-                    request_ids = set(request_ids)
                 for req_id in request_ids:
                     request = self.requests.get(req_id)
                     if request is None:
                         # Invalid request ID.
                         continue
-                    if request in self.running:  # normally run and finished
+                    if request in self.waiting:
+                        llm_logger.error(f"request {request.request_id} scheduled into waiting list, after finished")
+                        continue
+                    if request in self.running:
                         self.running.remove(request)
                         request.status = RequestStatus.FINISHED
-                        try:
-                            self._free_blocks(request)
-                        except Exception as e:
-                            llm_logger.warning(f"release block failed {req_id}: {e}")
-                    if (
-                        request.request_id in self.to_be_rescheduled_request_id_set
-                    ):  # finished after preempted, blocks have been recycled.
-                        self.to_be_rescheduled_request_id_set.remove(
-                            request.request_id
-                        )  # just remove from to_be_rescheduled_request_id_set
-                    if (
-                        request in self.waiting
-                    ):  # after finished, this request still scheduled from preempted to waiting, unexpected error, should not be here
-                        raise RuntimeError(f"request {request.request_id} scheduled into waiting list, after finished")
+                    need_postprocess_reqs.append(request)
+                    if request.request_id in self.to_be_rescheduled_request_id_set:
+                        # finished after preempted, blocks have been recycled.
+                        self.to_be_rescheduled_request_id_set.remove(request.request_id)
+
                     self.tasks_list[request.idx] = None
                     self.stop_flags[request.idx] = True
                     del self.requests[req_id]
                     if req_id in self.req_dict:
                         del self.req_dict[req_id]
-
+            # Do not block the main thread here
+            for req in need_postprocess_reqs:
+                self.cache_manager.write_cache_to_storage(req)
+
+            with self.lock:
+                for req in need_postprocess_reqs:
+                    try:
+                        self._free_blocks(req)
+                    except Exception as e:
+                        llm_logger.warning(f"release block failed {req.request_id}: {e}")
+
         except Exception as e:
             llm_logger.error(f"finish_request err: {e}, {str(traceback.format_exc())}")
         finally:
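The `finish_requests` change defers both the storage write-back and the block release out of the per-request bookkeeping pass. The sketch below shows that pattern in isolation; the function signature, the `requests` dict, and `CacheManager` are stand-ins invented for this example, while `write_cache_to_storage` and the lock/defer/free ordering mirror the diff above.

```python
# Sketch of the deferred write-back pattern used in finish_requests (simplified;
# CacheManager and the surrounding types are stand-ins, not FastDeploy classes).
import threading


class CacheManager:
    def write_cache_to_storage(self, req) -> None:
        pass  # potentially slow I/O to the storage backend

    def free_blocks(self, req) -> None:
        pass  # return GPU blocks to the pool


def finish_requests(req_ids, requests, lock: threading.Lock, cache: CacheManager):
    finished = []
    with lock:  # only bookkeeping under the lock
        for rid in req_ids:
            req = requests.pop(rid, None)
            if req is not None:
                finished.append(req)

    # Slow storage I/O happens outside the lock so scheduling is not blocked.
    for req in finished:
        cache.write_cache_to_storage(req)

    with lock:  # blocks are freed only after their contents were persisted
        for req in finished:
            cache.free_blocks(req)
```

Writing to storage before freeing keeps the GPU blocks valid while their KV data is persisted, and the slow I/O never runs under the scheduler lock.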
||||