co63oc
2025-09-01 17:50:17 +08:00
committed by GitHub
parent 0513a78ecc
commit d6369b4d51
67 changed files with 85 additions and 85 deletions
@@ -57,7 +57,7 @@ def parse_args():
"--protocol",
type=str,
default="ipc",
help="cache transfer protocol, only surport ipc now",
help="cache transfer protocol, only support ipc now",
)
parser.add_argument("--enable_splitwise", type=int, default=0, help="enable splitwise ")
parser.add_argument("--cache_queue_port", type=int, default=9923, help="cache queue port")
+2 -2
@@ -257,7 +257,7 @@ class ParallelConfig:
self.sequence_parallel = False # Whether to enable sequence parallelism.
self.use_ep = False # Whether to enable Expert Parallelism
self.moe_phase = MoEPhase("prefill") # Generation phase
- self.msg_queue_id = 1 # mesage queue id
+ self.msg_queue_id = 1 # message queue id
self.tensor_parallel_rank = 0 # TP rank ID
self.tensor_parallel_size = 1 # TP degree
@@ -549,7 +549,7 @@ class GraphOptimizationConfig:
It requires that all input buffers have fixed addresses, and all
splitting ops write their outputs to input buffers.
- With dyncmic graph backend: ...
- - With static grpah backend: WIP
+ - With static graph backend: WIP
"""
self.sot_warmup_sizes: list[int] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 16, 32, 64, 128]
""" Number of warmup runs for SOT warmup. """
+2 -2
@@ -531,7 +531,7 @@ class EngineArgs:
"--quantization",
type=str,
default=EngineArgs.quantization,
help="Quantization name for the model, currentlly support "
help="Quantization name for the model, currently support "
"'wint8', 'wint4',"
"default is None. The priority of this configuration "
"is lower than that of the config file. "
@@ -829,7 +829,7 @@ class EngineArgs:
scheduler_group.add_argument(
"--scheduler-topic",
default=EngineArgs.scheduler_topic,
help=f"Topic of scheduler. Defaule is {EngineArgs.scheduler_topic}. (global)",
help=f"Topic of scheduler. Default is {EngineArgs.scheduler_topic}. (global)",
)
scheduler_group.add_argument(
"--scheduler-min-load-score",
+2 -2
@@ -644,13 +644,13 @@ class EngineSevice:
self.zmq_server.send_multipart(request_id, [error_result])
except Exception as e:
llm_logger.error(
f"Error happend while receving new request from zmq, details={e}, "
f"Error happend while receiving new request from zmq, details={e}, "
f"traceback={traceback.format_exc()}"
)
def _zmq_send_generated_tokens(self):
"""
- Recieve output for zmq
+ Receive output for zmq
"""
while self.running:
try:
@@ -458,7 +458,7 @@ class ResourceManagerV1(ResourceManager):
def _free_blocks(self, request: Request):
if self.config.cache_config.enable_prefix_caching:
- # TODO(chengyanfu): support cache ouput blocks for prefix caching
+ # TODO(chengyanfu): support cache output blocks for prefix caching
if request.get("prefill_block_num", None) is None:
leaf_node = self.cache_manager.req_leaf_map[request.request_id]
self.cache_manager.decrease_request_share_count(request.request_id)
+1 -1
@@ -112,7 +112,7 @@ class LLM:
def _receive_output(self):
"""
- Recieve output from token processor and store them in cache
+ Receive output from token processor and store them in cache
"""
while True:
try:
@@ -40,7 +40,7 @@ class ConcreteSizeEntry:
# Has runtime-bs been captured before
captured: bool = False
- # Need to be captured callable objectdynamic graph or static grpah backend
+ # Need to be captured callable objectdynamic graph or static graph backend
runnable: Callable = None # type: ignore
# Number of completed warmups
num_finished_warmup: int = 0
@@ -117,9 +117,9 @@ class GraphOptBackend:
self.max_captre_batch = fd_config.graph_opt_config.cudagraph_capture_sizes[0]
if self.fd_config.graph_opt_config.graph_opt_level > 0:
- # 1. Prepare cuda grpah input buffers (contain output of subgraphs)
+ # 1. Prepare cuda graph input buffers (contain output of subgraphs)
- # 2. Convert dynamic grpah to static graph
+ # 2. Convert dynamic graph to static graph
backend = (
ToStaticBackend.CINN if self.fd_config.graph_opt_config.graph_opt_level > 1 else ToStaticBackend.PHI
@@ -193,7 +193,7 @@ class AppendAttentionBackend(AttentionBackend):
kv_cache_quant_type: str = None,
):
"""
- Caculate kv cache shape
+ Calculate kv cache shape
"""
if kv_cache_quant_type is not None and kv_cache_quant_type == "int4_zp":
return (
@@ -114,7 +114,7 @@ class BlockAttentionBackend(AttentionBackend):
kv_cache_quant_type: str = None,
):
"""
- Caculate kv cache shape
+ Calculate kv cache shape
"""
if kv_cache_quant_type is not None and kv_cache_quant_type == "int4_zp":
return (
@@ -176,7 +176,7 @@ class FlashAttentionBackend(AttentionBackend):
kv_cache_quant_type: str = None,
):
"""
- Caculate kv cache shape
+ Calculate kv cache shape
"""
if kv_cache_quant_type is not None and kv_cache_quant_type == "int4_zp":
return (
@@ -210,7 +210,7 @@ class IluvatarAttnBackend(AttentionBackend):
kv_cache_quant_type: str = None,
):
"""
- Caculate kv cache shape
+ Calculate kv cache shape
"""
return (
max_num_blocks,
@@ -130,7 +130,7 @@ class XPUAttentionBackend(AttentionBackend):
kv_cache_quant_type: str = None,
) -> Tuple[int, int, int, int]:
"""
- Caculate kv cache shape
+ Calculate kv cache shape
"""
return (
max_num_blocks,
@@ -170,7 +170,7 @@ class GCUFlashAttnBackend(AttentionBackend):
cache_len = 0
elif self.seq_lens_decoder_list[seq_idx][0] != 0: # decode
cache_len = self.seq_lens_decoder_list[seq_idx][0]
- # else: doesnot have req in this seq_idx
+ # else: doesn't have req in this seq_idx
if cache_len is not None:
lens_this_time = self.seq_lens_this_time_list[seq_idx]
@@ -212,7 +212,7 @@ class GCUFlashAttnBackend(AttentionBackend):
kv_cache_quant_type: str = None,
):
"""
- Caculate kv cache shape
+ Calculate kv cache shape
"""
# [total_tokens, kv_num_heads, head_dim]
return (
@@ -171,7 +171,7 @@ class GCUMemEfficientAttnBackend(AttentionBackend):
cache_len = 0
elif self.seq_lens_decoder_list[seq_idx][0] != 0: # decode
cache_len = self.seq_lens_decoder_list[seq_idx][0]
- # else: doesnot have req in this seq_idx
+ # else: doesn't have req in this seq_idx
if cache_len is not None:
lens_this_time = self.seq_lens_this_time_list[seq_idx]
@@ -224,7 +224,7 @@ class GCUMemEfficientAttnBackend(AttentionBackend):
kv_cache_quant_type: str = None,
):
"""
- Caculate kv cache shape
+ Calculate kv cache shape
"""
# [total_tokens, kv_num_heads, head_dim]
return (
@@ -137,7 +137,7 @@ class FlashAttentionBackend(AttentionBackend):
kv_cache_quant_type: str = None,
):
"""
- Caculate kv cache shape
+ Calculate kv cache shape
"""
if kv_cache_quant_type is not None and kv_cache_quant_type == "int4_zp":
return (
+1 -1
@@ -114,7 +114,7 @@ class DeepEPEngine:
low_latency_mode=True,
num_qps_per_rank=24,
)
- # In disaggregated mode on mutiple nodes, we either use
+ # In disaggregated mode on multiple nodes, we either use
# high throughput mode or low latency mode.
else:
if moe_phase.phase == "decode":
@@ -35,7 +35,7 @@ class EarlyStopper:
@abstractmethod
def process(self, probs: paddle.Tensor, next_tokens: paddle.Tensor, stop_flags: paddle.Tensor):
"""
- processs the stopper and set the stop_flags corresponding to the batch that triggers early stop to True
+ process the stopper and set the stop_flags corresponding to the batch that triggers early stop to True
args:
- probs: [batch_size, vocab_size], the probs of every sample
- next_tokens: [batch_size, 1], the token index of every chosen sample
+1 -1
@@ -267,7 +267,7 @@ class TokenProcessor:
spec_logger.info(
f"Speculate global accept ratio(Accept draft_tokens/Generated tokens): {accept_ratio}"
f" total step: {self.total_step}. total output token num: {self.number_of_output_tokens}"
f" avarage accept len: {self.number_of_output_tokens / self.total_step}"
f" average accept len: {self.number_of_output_tokens / self.total_step}"
)
if self.cfg.speculative_config.method in ["mtp"]:
+1 -1
@@ -72,7 +72,7 @@ class Proposer(ABC):
@abstractmethod
def _run_impl(self, *args, **kwargs) -> Any:
"""
- Implemention for different method
+ Implementation for different method
"""
raise NotImplementedError
+1 -1
@@ -14,7 +14,7 @@
# limitations under the License.
"""
"""redundant expert manger."""
"""redundant expert manager."""
from typing import Optional, Tuple
import numpy as np
+2 -2
@@ -49,7 +49,7 @@ class GcuWorker(WorkerBase):
def init_device(self):
"""Initialize device and Construct model runner"""
if paddle.is_compiled_with_custom_device("gcu"):
- # Set evironment variable
+ # Set environment variable
self.device_ids = self.parallel_config.device_ids.split(",")
self.device = f"gcu:{self.local_rank}"
paddle.device.set_device(self.device)
@@ -127,7 +127,7 @@ class GcuWorker(WorkerBase):
# NOTE(gongshaotian): may be not need warm_up at this place
if self.model_runner.graph_opt_level >= 1:
self.model_runner.sot_warmup()
- # 2. Triger cuda grpah capture
+ # 2. Trigger cuda graph capture
self.model_runner.capture_model()
set_random_seed(self.fd_config.model_config.seed)
+3 -3
@@ -60,7 +60,7 @@ class GpuWorker(WorkerBase):
"""
self.max_chips_per_node = 16 if current_platform.is_iluvatar() else 8
if self.device_config.device_type == "cuda" and paddle.device.is_compiled_with_cuda():
- # Set evironment variable
+ # Set environment variable
self.device_ids = self.parallel_config.device_ids.split(",")
self.device = f"gpu:{self.local_rank % self.max_chips_per_node}"
paddle.device.set_device(self.device)
@@ -169,7 +169,7 @@ class GpuWorker(WorkerBase):
)
)
- return available_kv_cache_memory # return to caculate the block num in this device
+ return available_kv_cache_memory # return to calculate the block num in this device
def load_model(self) -> None:
"""Load model"""
@@ -209,7 +209,7 @@ class GpuWorker(WorkerBase):
"""
if self.model_runner.graph_opt_level >= 1:
self.model_runner.sot_warmup()
- # Triger cuda grpah capture
+ # Trigger cuda graph capture
self.model_runner.capture_model()
def check_health(self) -> bool:
+1 -1
@@ -51,7 +51,7 @@ class IluvatarWorker(GpuWorker):
Initialize device and construct model runner
"""
if paddle.is_compiled_with_custom_device("iluvatar_gpu"):
- # Set evironment variable
+ # Set environment variable
self.device = f"iluvatar_gpu:{self.local_rank}"
paddle.device.set_device(self.device)
paddle.set_default_dtype(self.parallel_config.dtype)
+2 -2
@@ -54,7 +54,7 @@ class MetaxWorker(WorkerBase):
"""
self.max_chips_per_node = 8
if paddle.is_compiled_with_custom_device("metax_gpu"):
- # Set evironment variable
+ # Set environment variable
self.device_ids = self.parallel_config.device_ids.split(",")
self.device = f"metax_gpu:{self.local_rank % self.max_chips_per_node}"
paddle.device.set_device(self.device)
@@ -202,7 +202,7 @@ class MetaxWorker(WorkerBase):
"""
if self.model_runner.graph_opt_level >= 1:
self.model_runner.sot_warmup()
- # Todo Triger cuda grpah capture.
+ # Todo Trigger cuda graph capture.
def check_health(self) -> bool:
""" """
+1 -1
@@ -21,7 +21,7 @@ import traceback
def check_safetensors_model(model_dir: str):
"""
model_dir : the directory of the model
- Check whther the model is safetensors format
+ Check whether the model is safetensors format
"""
model_files = list()
all_files = os.listdir(model_dir)
+2 -2
@@ -27,7 +27,7 @@ from fastdeploy.worker.output import ModelRunnerOutput
class WorkerBase(ABC):
"""
Engine -> (WIP)Executor -> Worker -> ModelRunner -> Model
- Worker interface that allows inference framwork to cleanly separate implementations for different harware.
+ Worker interface that allows inference framework to cleanly separate implementations for different hardware.
"""
def __init__(
@@ -89,7 +89,7 @@ class WorkerBase(ABC):
@abstractmethod
def graph_optimize_and_warm_up_model(self) -> None:
"""Prepare model for execution through grpah optimizaiton(CudaGrpah/CINN) or warmup."""
"""Prepare model for execution through graph optimizaiton(CudaGrpah/CINN) or warmup."""
raise NotImplementedError
@abstractmethod
+4 -4
@@ -249,7 +249,7 @@ class PaddleDisWorkerProc:
)
def event_loop_normal(self) -> None:
"""Main event loop for Paddle Distrubuted Workers.
"""Main event loop for Paddle Distributed Workers.
TODO(gongshaotian): support remote calling of functions that control worker.
"""
# Currently, only support single node
@@ -493,7 +493,7 @@ def parse_args():
"--speculative_config",
type=json.loads,
default=None,
help="Configation of SpeculativeConfig.",
help="Configuration of SpeculativeConfig.",
)
parser.add_argument(
"--max_num_batched_tokens",
@@ -542,7 +542,7 @@ def parse_args():
"--quantization",
type=str,
default="None",
help="Quantization name for the model, currentlly support "
help="Quantization name for the model, currently support "
"'wint4', 'wint8',"
"default is None. The priority of this configuration "
"is lower than that of the config file. "
@@ -552,7 +552,7 @@ def parse_args():
"--graph_optimization_config",
type=json.loads,
default=None,
help="Configation of Graph optimization backend.",
help="Configuration of Graph optimization backend.",
)
parser.add_argument(
"--moba_attention_config",
+1 -1
@@ -50,7 +50,7 @@ class XpuWorker(WorkerBase):
def init_device(self):
"""Initialize device and Construct model runner"""
if paddle.is_compiled_with_xpu():
- # Set evironment variable
+ # Set environment variable
self.device = f"xpu:{self.local_rank}"
paddle.device.set_device(self.device)
paddle.set_default_dtype(self.parallel_config.dtype)