Optimize the performance of the thinking-length limit using custom operators (#4279)

* delete impl
* delete min_length&max_length
* support limit thinking content strategy
* fix
* fix
* fix
* update
* fix set_value_by_flags_and_idx
* fix
* fix
* fix
* fix
* update
* fix
* fix
* fix typo
* fix ci
* fix
* fix
* support mtp
* fix
* fix
* update
* update
2026-04-23 00:17:25 +08:00 · 2025-10-20 21:09:13 +08:00
parent 36af88ff3f
commit cef3164c3b
31 changed files with 747 additions and 1032 deletions
@@ -12,7 +12,7 @@
 import importlib
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
-from enum import Enum
+from enum import IntFlag, auto
 from functools import lru_cache
 from typing import Dict, List, Optional, Tuple, Type, Union

@@ -26,20 +26,15 @@ from fastdeploy.config import (
    iter_architecture_defaults,
    try_match_architecture_defaults,
 )
-from fastdeploy.model_executor.models.interfaces_base import (
-    determine_model_category,
-    get_default_pooling_type,
-    is_multimodal_model,
-    is_pooling_model,
-    is_text_generation_model,
-)
+from fastdeploy.model_executor.models.interfaces_base import get_default_pooling_type


-class ModelCategory(Enum):
-    TEXT_GENERATION = "text_generation"
-    MULTIMODAL = "multimodal"
-    EMBEDDING = "embedding"
-    REWARD = "reward"
+class ModelCategory(IntFlag):
+    TEXT_GENERATION = auto()
+    MULTIMODAL = auto()
+    EMBEDDING = auto()
+    REASONING = auto()
+    REWARD = auto()


@dataclass(frozen=True)
@@ -48,18 +43,22 @@ class ModelInfo:
    category: ModelCategory
    is_text_generation: bool
    is_multimodal: bool
+    is_reasoning: bool
    is_pooling: bool
    module_path: str
    default_pooling_type: str

    @staticmethod
-    def from_model_cls(model_cls: Type[nn.Layer], module_path: str = "") -> "ModelInfo":
+    def from_model_cls(
+        model_cls: Type[nn.Layer], module_path: str = "", category: ModelCategory = None
+    ) -> "ModelInfo":
        return ModelInfo(
            architecture=model_cls.__name__,
-            category=determine_model_category(model_cls.__name__),
-            is_text_generation=is_text_generation_model(model_cls),
-            is_multimodal=is_multimodal_model(model_cls.__name__),
-            is_pooling=is_pooling_model(model_cls),
+            category=category,
+            is_text_generation=ModelCategory.TEXT_GENERATION in category,
+            is_multimodal=ModelCategory.MULTIMODAL in category,
+            is_reasoning=ModelCategory.REASONING in category,
+            is_pooling=ModelCategory.EMBEDDING in category,
            default_pooling_type=get_default_pooling_type(model_cls),
            module_path=module_path,
        )
@@ -84,6 +83,7 @@ class LazyRegisteredModel(BaseRegisteredModel):
    module_name: str
    module_path: str
    class_name: str
+    category: ModelCategory

    def load_model_cls(self) -> Type[nn.Layer]:
        try:
@@ -95,7 +95,7 @@ class LazyRegisteredModel(BaseRegisteredModel):

    def inspect_model_cls(self) -> ModelInfo:
        model_cls = self.load_model_cls()
-        return ModelInfo.from_model_cls(model_cls, self.module_name)
+        return ModelInfo.from_model_cls(model_cls, self.module_name, self.category)


@lru_cache(maxsize=128)
@@ -127,6 +127,7 @@ class ModelRegistry:
                module_name=model_info["module_name"],
                module_path=model_info["module_path"],
                class_name=model_info["class_name"],
+                category=model_info["category"],
            )
            self.models[arch] = model
            self._registered_models[arch] = model
@@ -317,6 +318,17 @@ class ModelRegistry:
                return model_info.is_multimodal
        return False

+    def is_reasoning_model(self, architectures: Union[str, List[str]], model_config: ModelConfig = None) -> bool:
+        """Check if it's a reasoning model"""
+        if isinstance(architectures, str):
+            architectures = [architectures]
+
+        for arch in architectures:
+            model_info = self._try_inspect_model_cls(arch)
+            if model_info is not None:
+                return model_info.is_reasoning
+        return False
+
    def is_text_generation_model(self, architectures: Union[str, List[str]], model_config: ModelConfig = None) -> bool:
        """Check if it's a text generation model"""
        if isinstance(architectures, str):