[Optimization] xgrammar async compile, multi thread, speed up (#4835)

* xgrammar async compile, multi thread, speed up

* fix test_sampler.py and pre-commit errors

* add Redis version check and fix request.llm_engine_recv_req_timestamp

* xgrammar prefill & decode & v0

* fix test_gpu_prompt_logprobs.py

* add test_guided_decoding.py

* Update fastdeploy/scheduler/splitwise_scheduler.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* Update fastdeploy/model_executor/guided_decoding/xgrammar_backend.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* Update fastdeploy/model_executor/guided_decoding/xgrammar_backend.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* Apply suggestions from code review

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* fix torch xgrammar unittest env

---------

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
This commit is contained in:
Daci
2025-11-14 18:05:26 +08:00
committed by GitHub
parent b925533051
commit 5fc12eddfe
11 changed files with 810 additions and 373 deletions
@@ -706,10 +706,30 @@ class InferScheduler:
self.reqs_queue = deque()
self.writers = []
def check_redis_version(self):
    """
    Verify that the connected Redis server is version 6.2 or newer.

    Redis 6.2 introduced the ``count`` argument to RPOP, which the result
    writers rely on for batch pops. The check raises instead of using
    ``assert`` so it cannot be silently stripped by ``python -O``.

    Raises:
        RuntimeError: if the server reports a version older than 6.2
            (or an unparseable version string, treated as too old).
    """
    # Get Redis version information from the server.
    redis_info = self.client.info()
    redis_version = redis_info.get("redis_version", "")
    # Parse only the leading numeric components. A naive
    # `[int(x) for x in version.split(".")]` crashes on versions such as
    # "6" (missing minor) or "6.2.1-rc1" (non-numeric suffix).
    numeric_parts = []
    for part in redis_version.split("."):
        if not part.isdigit():
            break
        numeric_parts.append(int(part))
    # Pad to (major, minor) so a bare "6" compares as (6, 0).
    major_minor = tuple(numeric_parts[:2]) + (0,) * (2 - min(len(numeric_parts), 2))
    # Redis 6.2 and above versions support RPOP with a count parameter.
    if major_minor < (6, 2):
        raise RuntimeError(
            f"Redis version {redis_version} too low. "
            "Please upgrade to Redis 6.2+ to support batch RPOP operations."
        )
    logger.info(f"Redis version {redis_version} detected. Using native batch RPOP.")
def start(self, role, host, disaggregated):
"""
start backup threads
"""
# Check Redis version first
self.check_redis_version()
for i in range(self.writer_parallel):
writer = ResultWriter(self.client, i, self.writer_batch_size, self.ttl)
writer.start()