Mirror of https://github.com/PaddlePaddle/FastDeploy.git, synced 2026-04-23 00:17:25 +08:00

Split enable_mm (#7183)

Co-authored-by: liuruian <liuruian@MacBook-Pro.local>

@@ -294,6 +294,7 @@ class AsyncLLM(EngineServiceClient):
             cfg.limit_mm_per_prompt,
             cfg.mm_processor_kwargs,
             cfg.tool_parser,
+            enable_mm_runtime=cfg.enable_mm_runtime,
         )
         # Create data processor
         self.data_processor = self.input_processor.create_processor()
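
The hunk above (and the matching one in EngineService at @@ -330) threads a new
enable_mm_runtime flag into the input-processor factory, separating "the model
is multimodal" (model_config.enable_mm) from "the multimodal runtime path is
active". A minimal sketch of that split, assuming hypothetical Config and
ModelConfig dataclasses (only enable_mm and enable_mm_runtime mirror
identifiers from the diff):

    from dataclasses import dataclass, field

    @dataclass
    class ModelConfig:
        # Static capability: the model itself accepts multimodal inputs.
        enable_mm: bool = False

    @dataclass
    class Config:
        model_config: ModelConfig = field(default_factory=ModelConfig)
        # Deployment-time switch: whether the multimodal runtime path runs.
        # None means "follow the model capability" (an assumption here).
        enable_mm_runtime: bool | None = None

        def __post_init__(self):
            if self.enable_mm_runtime is None:
                self.enable_mm_runtime = self.model_config.enable_mm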

@@ -446,7 +447,7 @@ class AsyncLLM(EngineServiceClient):
             )
         if envs.ZMQ_SEND_BATCH_DATA and self.connection_manager is not None:
             request["zmq_worker_pid"] = self.connection_manager.worker_pid
-        if self.cfg.model_config.enable_mm:
+        if self.cfg.enable_mm_runtime:
             self.request_client.send_pyobj(request)
         else:
             self.request_client.send_json(request)
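
Why the serialization switch matters: with the multimodal runtime on, a request
can carry image tensors or other objects that JSON cannot encode, so it is sent
over ZeroMQ as a pickled Python object; text-only requests keep the cheaper
JSON path. A sketch with pyzmq (endpoint and request shape are assumptions;
send_pyobj and send_json are real pyzmq socket methods):

    import zmq

    def send_request(sock: zmq.Socket, request: dict, enable_mm_runtime: bool) -> None:
        if enable_mm_runtime:
            sock.send_pyobj(request)  # pickle round-trip, handles arbitrary objects
        else:
            sock.send_json(request)   # plain JSON, text-only payloads

    # Hypothetical usage:
    # ctx = zmq.Context.instance()
    # sock = ctx.socket(zmq.PUSH)
    # sock.connect("ipc:///tmp/fd_request.ipc")
    # send_request(sock, {"request_id": "r1", "prompt": "hi"}, enable_mm_runtime=False)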

@@ -330,6 +330,7 @@ class EngineService:
             self.cfg.limit_mm_per_prompt,
             self.cfg.mm_processor_kwargs,
             self.cfg.tool_parser,
+            enable_mm_runtime=self.cfg.enable_mm_runtime,
         )
         self.data_processor = self.input_processor.create_processor()
         self.mm_max_tokens_per_item = self.data_processor.get_mm_max_tokens_per_item(

@@ -601,7 +602,7 @@ class EngineService:
                 LoggingEventName.RESCHEDULED_INFERENCE_START, task.request_id, getattr(task, "user", "")
             )
         if not is_prefill:
-            if not self.cfg.model_config.enable_mm:
+            if not self.cfg.enable_mm_runtime:
                 self.update_requests_chunk_size(tasks)
             else:
                 self.update_mm_requests_chunk_size(tasks)

@@ -1217,7 +1218,7 @@ class EngineService:
         while self.running:
             try:
                 block = True if len(added_requests) == 0 else False
-                if not self.cfg.model_config.enable_mm:
+                if not self.cfg.enable_mm_runtime:
                     err, data = self.recv_request_server.receive_json_once(block)
                 else:
                     err, data = self.recv_request_server.receive_pyobj_once(block)
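
The receive loop blocks only when no requests are already queued
(block = len(added_requests) == 0), so the scheduler keeps draining work it
has while still sleeping when idle. A sketch of that once-style receive,
assuming the server wrappers map onto pyzmq flags (receive_json_once and
receive_pyobj_once themselves are not shown in the diff):

    import zmq

    def receive_once(sock: zmq.Socket, block: bool, as_pyobj: bool):
        flags = 0 if block else zmq.NOBLOCK
        try:
            data = sock.recv_pyobj(flags) if as_pyobj else sock.recv_json(flags)
            return None, data
        except zmq.Again:
            return None, None   # non-blocking call found nothing queued
        except zmq.ZMQError as e:
            return e, None      # surface transport errors to the caller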

@@ -1275,6 +1276,7 @@ class EngineService:
         err_msg = None
         try:
             request = Request.from_dict(data)

+            request.metrics.scheduler_recv_req_time = time.time()
             main_process_metrics.requests_number.inc()
             trace_carrier = data.get("trace_carrier")
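
The added line stamps the request the moment the scheduler deserializes it, so
later stages can compute queueing delay. A sketch with a hypothetical
RequestMetrics stand-in (only scheduler_recv_req_time comes from the diff):

    import time
    from dataclasses import dataclass

    @dataclass
    class RequestMetrics:
        scheduler_recv_req_time: float = 0.0
        inference_start_time: float = 0.0

        def queue_delay(self) -> float:
            # Seconds between scheduler receipt and inference start.
            return self.inference_start_time - self.scheduler_recv_req_time

    m = RequestMetrics(scheduler_recv_req_time=time.time())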

@@ -2377,7 +2379,7 @@ class EngineService:
         if self.cfg.scheduler_config.splitwise_role == "prefill":
             variables["FLAGS_fmt_write_cache_completed_signal"] = 1

-        if self.cfg.model_config.enable_mm:
+        if self.cfg.enable_mm_runtime:
             variables["FLAGS_max_partition_size"] = 1024

         command_prefix = ""
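
Here worker-process flags are collected in a dict before launch; enabling the
multimodal runtime sets FLAGS_max_partition_size to 1024. A sketch of
assembling such an environment (the launch helper is an assumption; the two
FLAGS_* names and their gating come from the diff):

    import os

    def build_worker_env(splitwise_role: str, enable_mm_runtime: bool) -> dict:
        variables = {}
        if splitwise_role == "prefill":
            variables["FLAGS_fmt_write_cache_completed_signal"] = 1
        if enable_mm_runtime:
            variables["FLAGS_max_partition_size"] = 1024
        # Child-process environments only accept string values.
        return {**os.environ, **{k: str(v) for k, v in variables.items()}}

    # Hypothetical usage:
    # subprocess.Popen(["python", "worker.py"], env=build_worker_env("prefill", True))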

@@ -205,11 +205,11 @@ class ResourceManagerV1(ResourceManager):
         self.need_block_num_map = dict()

         self.encoder_cache = None
-        if config.model_config.enable_mm and config.cache_config.max_encoder_cache > 0:
+        if config.enable_mm_runtime and config.cache_config.max_encoder_cache > 0:
             self.encoder_cache = EncoderCacheManager(config.cache_config.max_encoder_cache)

         self.processor_cache = None
-        if config.model_config.enable_mm and config.cache_config.max_processor_cache > 0:
+        if config.enable_mm_runtime and config.cache_config.max_processor_cache > 0:
             max_processor_cache_in_bytes = int(config.cache_config.max_processor_cache * 1024 * 1024 * 1024)
             self.processor_cache = ProcessorCacheManager(max_processor_cache_in_bytes)
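
Both caches are now gated on the runtime flag rather than the model
capability, and the processor-cache budget is configured in GiB then converted
to bytes. A sketch of that size gating (the cache manager classes are replaced
by hypothetical dict stand-ins):

    GIB = 1024 * 1024 * 1024

    def init_caches(enable_mm_runtime: bool, max_encoder_cache: int,
                    max_processor_cache: float):
        encoder_cache = processor_cache = None
        if enable_mm_runtime and max_encoder_cache > 0:
            encoder_cache = {"capacity": max_encoder_cache}             # stand-in
        if enable_mm_runtime and max_processor_cache > 0:
            processor_cache = {"capacity_bytes": int(max_processor_cache * GIB)}
        return encoder_cache, processor_cache

    # 0.5 GiB of processor cache -> 536870912 bytes
    assert init_caches(True, 0, 0.5)[1]["capacity_bytes"] == 536_870_912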

@@ -550,7 +550,7 @@ class ResourceManagerV1(ResourceManager):
         num_new_tokens = token_budget // self.config.cache_config.block_size * self.config.cache_config.block_size
         request.with_image = False

-        if not self.config.model_config.enable_mm:
+        if not self.config.enable_mm_runtime:
             return num_new_tokens

         inputs = request.multimodal_inputs
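
The first line of this hunk floors the token budget to a whole number of
KV-cache blocks, so the scheduler never allocates a partial block; the early
return then skips the multimodal accounting when the runtime path is off. A
worked example of the rounding:

    def floor_to_block(token_budget: int, block_size: int) -> int:
        # Integer division truncates, so the result is the largest
        # multiple of block_size that fits within the budget.
        return token_budget // block_size * block_size

    assert floor_to_block(1000, 64) == 960  # 15 full blocks of 64
    assert floor_to_block(64, 64) == 64
    assert floor_to_block(63, 64) == 0      # less than one block -> no tokens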