mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-23 00:17:25 +08:00
3f84d8d893
* merge mm processor
190 lines
7.8 KiB
Python
190 lines
7.8 KiB
Python
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
"""Abstract base class for multimodal encoding strategies.
|
|
|
|
Each encoding strategy handles model-family-specific logic such as
|
|
position ID computation, image/video preprocessing, and token counting.
|
|
New model families should subclass ``BaseEncoding`` and implement all
|
|
abstract methods.
|
|
"""
|
|
|
|
from abc import ABC, abstractmethod
|
|
from typing import Any, Dict, Optional, Tuple
|
|
|
|
|
|
class BaseEncoding(ABC):
    """Interface every multimodal encoding strategy implements.

    The abstract methods define the mandatory encoding pipeline (image/video
    ingestion, position IDs, token counting, packing).  The optional hooks
    (``init_extra``, ``get_mm_max_tokens_per_item``) ship with no-op defaults
    so subclasses override them only when a model family needs to.
    """

    def __init__(self, processor, processor_kwargs=None):
        kwargs = processor_kwargs or {}
        config = processor.cfg

        # Objects owned by the processor but shared with the encoding.
        self.cfg = config
        self.image_processor = processor.image_processor
        self.tokenizer = processor.tokenizer

        # Spatial/temporal conv sizes: either caller-supplied kwargs or read
        # straight off the image processor, depending on the model config.
        if config.conv_params_from_kwargs:
            self.spatial_conv_size = kwargs.get("spatial_conv_size", 2)
            self.temporal_conv_size = kwargs.get("temporal_conv_size", 2)
        else:
            self.spatial_conv_size = self.image_processor.merge_size
            self.temporal_conv_size = self.image_processor.temporal_patch_size

        # IDs of the placeholder tokens marking image/video positions.
        self.image_token_id = self.tokenizer.convert_tokens_to_ids(config.image_token_str)
        self.video_token_id = self.tokenizer.convert_tokens_to_ids(config.video_token_str)

        # Temporal token rate: some families expose it on the vision config;
        # everything (including a missing config) falls back to 2.
        if config.has_tokens_per_second:
            vision_config = getattr(getattr(processor, "config", None), "vision_config", None)
            self.tokens_per_second = getattr(vision_config, "tokens_per_second", 2)
        else:
            self.tokens_per_second = 2

        # Frame-sampling parameters for video inputs.
        self.fps = kwargs.get("video_fps", config.default_fps)
        self.min_frames = kwargs.get("video_min_frames", config.default_min_frames)
        self.max_frames = kwargs.get("video_max_frames", config.default_max_frames)
        self.target_frames = kwargs.get("video_target_frames", config.default_target_frames)

        # Let subclasses perform model-specific setup.
        self.init_extra(kwargs)

    # ------------------------------------------------------------------
    # Image
    # ------------------------------------------------------------------
    @abstractmethod
    def add_image(self, img, outputs: dict, uuid, token_len=None):
        """Run image preprocessing on *img* and accumulate results into *outputs*."""

    @abstractmethod
    def add_processed_image(self, img_cache, outputs: dict, uuid, token_len=None):
        """Accumulate an already-preprocessed (cached) image into *outputs*."""

    # ------------------------------------------------------------------
    # Video
    # ------------------------------------------------------------------
    @abstractmethod
    def add_video(self, frames, outputs: dict, uuid, token_len=None, meta: Optional[dict] = None):
        """Run video preprocessing on *frames* and accumulate results into *outputs*.

        Parameters
        ----------
        frames : array-like
            Decoded video frames.
        outputs : dict
            Mutable accumulator for input_ids, position_ids, etc.
        uuid : str | None
            Unique identifier for cache lookup.
        token_len : int | None
            Expected token count (for validation against pre-tokenised prompts).
        meta : dict | None
            Video metadata (fps, duration, ...). Strategies that need it
            (e.g. Qwen) read from this dict; strategies that don't
            (e.g. Ernie) simply ignore it.
        """

    @abstractmethod
    def add_processed_video(self, frames_cache, outputs: dict, uuid, token_len=None):
        """Accumulate an already-preprocessed (cached) video into *outputs*."""

    @abstractmethod
    def load_video(self, url, item: dict) -> Tuple[Any, dict]:
        """Decode the video at *url* and return a ``(frames, meta)`` pair.

        Implementations must always return a 2-tuple so the caller
        (``MultiModalProcessor.text2ids``) can unpack uniformly.
        """

    # ------------------------------------------------------------------
    # Text / position helpers
    # ------------------------------------------------------------------
    @abstractmethod
    def add_text_positions(self, outputs: dict, num_tokens: int):
        """Accumulate position IDs for *num_tokens* text tokens into *outputs*."""

    @abstractmethod
    def append_completion_tokens(self, multimodal_inputs: dict, completion_token_ids):
        """Accumulate completion token IDs (and their positions) into *multimodal_inputs*."""

    # ------------------------------------------------------------------
    # Prompt-token-ids path (optional — only models with
    # supports_prompt_token_ids=True need to implement this)
    # ------------------------------------------------------------------
    def prompt_token_ids2outputs(self, prompt_token_ids, mm_items=None) -> dict:
        """Build the outputs dict from pre-tokenised ``prompt_token_ids``.

        Parameters
        ----------
        prompt_token_ids : list[int]
            Pre-tokenised token IDs.
        mm_items : list[dict] | None
            Already-extracted multimodal items (each has 'type', 'data', 'uuid').
            ``None`` means text-only.

        Raises
        ------
        NotImplementedError
            Always, unless a subclass opts in by overriding this method.
        """
        raise NotImplementedError(f"{type(self).__name__} does not support prompt_token_ids path")

    # ------------------------------------------------------------------
    # Token counting & packing
    # ------------------------------------------------------------------
    @staticmethod
    @abstractmethod
    def mm_num_tokens(grid_thw):
        """Return how many multimodal tokens a given grid_thw occupies."""

    @abstractmethod
    def pack_position_ids(self, outputs: dict):
        """Convert the intermediate position ID lists into the final packed format."""

    # ------------------------------------------------------------------
    # Outputs initialisation
    # ------------------------------------------------------------------
    def _make_outputs(self) -> dict:
        """Create the mutable accumulator dict used throughout encoding.

        Subclasses override to add model-specific fields (e.g. fps, vit fields).
        """
        # List-valued accumulators first, then counters, then the trailing
        # multimodal bookkeeping lists — insertion order matches the contract.
        outputs = {
            key: []
            for key in (
                "input_ids",
                "token_type_ids",
                "position_ids",
                "images",
                "grid_thw",
                "image_type_ids",
                "labels",
            )
        }
        outputs.update(
            cur_position=0,
            video_cnt=0,
            num_input_image_tokens=0,
            num_input_video_tokens=0,
        )
        outputs["mm_positions"] = []
        outputs["mm_hashes"] = []
        return outputs

    # ------------------------------------------------------------------
    # Optional hooks — subclasses override only when needed
    # ------------------------------------------------------------------
    def init_extra(self, processor_kwargs: dict):
        """Model-specific extra initialisation hook (invoked once from ``__init__``)."""

    def get_mm_max_tokens_per_item(self, seq_len: int) -> Optional[Dict[str, int]]:
        """Per-modality max token counts for the scheduler; ``None`` = not applicable."""
        return None
|