[Speculative Decoding] Unify Spec and non-spec branch (#6685)

* optimize spec-inference architecture
* delete debug log
* optimize spec_method usage && fix unit_test
* add claude unit-test skill
* fix some ugly bug
* enhance robustness and bounds check
* unify method & spec_method to method to avoid bug
* activate CI
* fix unit test
* Unify logprobs computation for naive and speculative decoding, fix CUDA kernel
* fix logprob bug && optimize verify kernel
* fix exist_decode() judge
2026-04-23 00:17:25 +08:00 · 2026-03-11 14:58:44 +08:00
parent b6190de557
commit cf7934a4b2
41 changed files with 3428 additions and 392 deletions
@@ -14,13 +14,17 @@
 # limitations under the License.
 """

+from typing import TYPE_CHECKING
+
 import numpy as np

-from fastdeploy.config import FDConfig
 from fastdeploy.utils import spec_logger

 from .base import Proposer

+if TYPE_CHECKING:
+    from fastdeploy.config import FDConfig
+
 try:
    from arctic_inference.suffix_decoding import SuffixDecodingCache
 except ImportError:
@@ -34,7 +38,7 @@ class SuffixProposer(Proposer):
    Uses SuffixDecodingCache to generate draft tokens based on suffix tree matching.
    """

-    def __init__(self, fd_config: FDConfig):
+    def __init__(self, fd_config: "FDConfig"):
        super().__init__(fd_config)

        if SuffixDecodingCache is None: