diff --git a/.github/workflows/_logprob_test_linux.yml b/.github/workflows/_logprob_test_linux.yml index ca2c2ba117..ec4f7fbe6f 100644 --- a/.github/workflows/_logprob_test_linux.yml +++ b/.github/workflows/_logprob_test_linux.yml @@ -185,7 +185,7 @@ jobs: -d "{\"messages\": [{\"role\": \"user\", \"content\": \"1+1=?\"}], \"logprobs\": true}" set +e rm -rf ./baseline_output - cp -r baseline/ERNIE-4.5-0.3B-Paddle ./baseline_output + cp -r baseline_dev/ERNIE-4.5-0.3B-Paddle ./baseline_output LOGPROB_EXIT_CODE=0 python3.10 lanucher.py --request_template TOKEN_LOGPROB --url http://localhost:${FD_API_PORT}/v1/chat/completions --case ./cases/demo.yaml --concurrency 1 --name demo --exe logprob || LOGPROB_EXIT_CODE=$? echo "LOGPROB_EXIT_CODE=${LOGPROB_EXIT_CODE}" > /workspace/exit_code.env diff --git a/custom_ops/gpu_ops/cpp_extensions.cc b/custom_ops/gpu_ops/cpp_extensions.cc index 08af9a0d7a..7321647efe 100644 --- a/custom_ops/gpu_ops/cpp_extensions.cc +++ b/custom_ops/gpu_ops/cpp_extensions.cc @@ -1268,18 +1268,6 @@ PYBIND11_MODULE(fastdeploy_ops, m) { py::arg("routed_scaling_factor"), "ep moe export combine function"); - m.def("per_token_quant", - &PerTokenQuant, - py::arg("input"), - py::arg("block_size"), - "per token per block quant"); - - m.def("per_token_quant_padding", - &PerTokenQuantPadding, - py::arg("input"), - py::arg("block_size"), - "per token per block quant and padding transpose scale"); - m.def("masked_per_token_quant", &MaskedPerTokenQuant, py::arg("input"), diff --git a/custom_ops/gpu_ops/per_token_quant_fp8.cu b/custom_ops/gpu_ops/per_token_quant_fp8.cu index 255265e567..7059d17b09 100644 --- a/custom_ops/gpu_ops/per_token_quant_fp8.cu +++ b/custom_ops/gpu_ops/per_token_quant_fp8.cu @@ -16,313 +16,6 @@ constexpr float epsilon = 1e-10; -template -__global__ void quant_per_token_per_block( - const T *input, - phi::dtype::float8_e4m3fn *quanted_res, - float *quanted_scale, - const int token_num, - const int hidden_size, - const int hidden_size_scale, - const bool use_finegrained_range) { - const int bid = blockIdx.x; - const int tid = threadIdx.x; - const int warp_id = tid / 32; - const int lane_id = tid % 32; - const int num_warp = blockDim.x / 32; - static constexpr int NUM_PER_THREADS = 128 / 32; // 4 - static constexpr float MAX_VALUE = 448.f; - // Note(ZKK) use ceil_div!! - const int end_iter = (hidden_size + 127) / 128; // warp_iter_num - AlignedVector load_vec; - AlignedVector load_vec_float; - AlignedVector res_vec; - for (int token_idx = bid; token_idx < token_num; token_idx += gridDim.x) { - const T *input_now = input + token_idx * hidden_size; - phi::dtype::float8_e4m3fn *quanted_res_now = - quanted_res + token_idx * hidden_size; - float *quanted_scale_now = quanted_scale + token_idx * hidden_size_scale; - // deal a block per warp - for (int iter = warp_id; iter < end_iter; iter += num_warp) { - const int start_offset = iter * 128; - - const bool is_valid_data = - start_offset + lane_id * NUM_PER_THREADS < hidden_size; - - if (is_valid_data) { - Load( - input_now + start_offset + lane_id * NUM_PER_THREADS, &load_vec); - } else { -#pragma unroll - for (int vid = 0; vid < NUM_PER_THREADS; vid++) load_vec[vid] = T(0.f); - } - // get max value per thread - float max_value_thread = -5e4; -#pragma unroll - for (int vid = 0; vid < NUM_PER_THREADS; vid++) { - load_vec_float[vid] = static_cast(load_vec[vid]); - max_value_thread = max(abs(load_vec_float[vid]), max_value_thread); - } - // get max value per warp - max_value_thread = max(__shfl_down_sync(0xffffffff, max_value_thread, 16), - max_value_thread); - max_value_thread = max(__shfl_down_sync(0xffffffff, max_value_thread, 8), - max_value_thread); - max_value_thread = max(__shfl_down_sync(0xffffffff, max_value_thread, 4), - max_value_thread); - max_value_thread = max(__shfl_down_sync(0xffffffff, max_value_thread, 2), - max_value_thread); - max_value_thread = max(__shfl_down_sync(0xffffffff, max_value_thread, 1), - max_value_thread); - // broadcast max_value - max_value_thread = __shfl_sync(0xFFFFFFFF, max_value_thread, 0); - max_value_thread = max(max_value_thread, epsilon); - - if (use_finegrained_range) { - max_value_thread *= 7.0f; - } - - float scale_to_store = max_value_thread / MAX_VALUE; - // quant -#pragma unroll - for (int vid = 0; vid < NUM_PER_THREADS; vid++) { - res_vec[vid] = static_cast( - load_vec_float[vid] * MAX_VALUE / max_value_thread); - } - // store - if (is_valid_data) - Store( - res_vec, - quanted_res_now + start_offset + lane_id * NUM_PER_THREADS); - if (lane_id == 0) { - quanted_scale_now[iter] = scale_to_store; - } - } - } -} - -std::vector PerTokenQuant(paddle::Tensor &input, - const int block_size) { - auto input_dim = input.dims(); - const int token_num = input_dim[0]; - const int hidden_size = input_dim[1]; - // Note(ZKK) here we use ceil_dive to support 4.5T runing on 8 GPUS - // where moe_intermediate_size is 448, can not be divided by 128. - const int hidden_size_scale = (hidden_size + block_size - 1) / block_size; - - auto quanted_x = GetEmptyTensor( - {token_num, hidden_size}, paddle::DataType::FLOAT8_E4M3FN, input.place()); - auto quanted_scale = GetEmptyTensor( - {token_num, hidden_size_scale}, paddle::DataType::FLOAT32, input.place()); - const int gridx = min(132 * 8, token_num); - const int blockx = min(1024, hidden_size / 128 * 32); - - bool use_finegrained_range = false; - char *env_var = getenv("PER_TOKEN_QUANT_FP8_USE_FINEGRAINED_RANGE"); - if (env_var) { - use_finegrained_range = static_cast(std::stoi(env_var)); - } - - switch (input.dtype()) { - case paddle::DataType::BFLOAT16: - quant_per_token_per_block<<>>( - input.data(), - quanted_x.data(), - quanted_scale.data(), - token_num, - hidden_size, - hidden_size_scale, - use_finegrained_range); - break; - case paddle::DataType::FLOAT16: - quant_per_token_per_block<<>>( - input.data(), - quanted_x.data(), - quanted_scale.data(), - token_num, - hidden_size, - hidden_size_scale, - use_finegrained_range); - break; - default: - PD_THROW("Unsupported data type for PerTokenQuant"); - } - return {quanted_x, quanted_scale}; -} - -std::vector> PerTokenQuantInferShape( - std::vector input_shape, const int block_size) { - const int token_num = input_shape[0]; - const int hidden_size = input_shape[1]; - const int hidden_size_scale = (hidden_size + block_size - 1) / block_size; - return {{token_num, hidden_size}, {token_num, hidden_size_scale}}; -} - -std::vector PerTokenQuantInferDtype( - paddle::DataType input_dtype, const int block_size) { - return {paddle::DataType::FLOAT8_E4M3FN, paddle::DataType::FLOAT32}; -} - -template -__global__ void quant_per_token_per_block_padding( - const T *input, - phi::dtype::float8_e4m3fn *quanted_res, - float *quanted_scale, - const int token_num, - const int padded_token_num, - const int hidden_size, - const int hidden_size_scale, - const bool use_finegrained_range) { - const int bid = blockIdx.x; - const int tid = threadIdx.x; - const int warp_id = tid / 32; - const int lane_id = tid % 32; - const int num_warp = blockDim.x / 32; - static constexpr int NUM_PER_THREADS = 128 / 32; // 4 - static constexpr float MAX_VALUE = 448.f; - const int end_iter = hidden_size / 128; // warp_iter_num - AlignedVector load_vec; - AlignedVector load_vec_float; - AlignedVector res_vec; - for (int token_idx = bid; token_idx < token_num; token_idx += gridDim.x) { - const T *input_now = input + token_idx * hidden_size; - phi::dtype::float8_e4m3fn *quanted_res_now = - quanted_res + token_idx * hidden_size; - // deal a block per warp - for (int iter = warp_id; iter < end_iter; iter += num_warp) { - float *quanted_scale_now = - quanted_scale + iter * padded_token_num + token_idx; - const int start_offset = iter * 128; - Load( - input_now + start_offset + lane_id * NUM_PER_THREADS, &load_vec); - // get max value per thread - float max_value_thread = -5e4; -#pragma unroll - for (int vid = 0; vid < NUM_PER_THREADS; vid++) { - load_vec_float[vid] = static_cast(load_vec[vid]); - max_value_thread = max(abs(load_vec_float[vid]), max_value_thread); - } - // get max value per warp - max_value_thread = max(__shfl_down_sync(0xffffffff, max_value_thread, 16), - max_value_thread); - max_value_thread = max(__shfl_down_sync(0xffffffff, max_value_thread, 8), - max_value_thread); - max_value_thread = max(__shfl_down_sync(0xffffffff, max_value_thread, 4), - max_value_thread); - max_value_thread = max(__shfl_down_sync(0xffffffff, max_value_thread, 2), - max_value_thread); - max_value_thread = max(__shfl_down_sync(0xffffffff, max_value_thread, 1), - max_value_thread); - // broadcast max_value - max_value_thread = __shfl_sync(0xFFFFFFFF, max_value_thread, 0); - max_value_thread = max(max_value_thread, epsilon); - - if (use_finegrained_range) { - max_value_thread *= 7.0f; - } - - float scale_to_store = max_value_thread / MAX_VALUE; - // quant -#pragma unroll - for (int vid = 0; vid < NUM_PER_THREADS; vid++) { - res_vec[vid] = static_cast( - load_vec_float[vid] * MAX_VALUE / max_value_thread); - } - // store - Store( - res_vec, quanted_res_now + start_offset + lane_id * NUM_PER_THREADS); - if (lane_id == 0) { - *quanted_scale_now = scale_to_store; - } - } - } -} - -std::vector PerTokenQuantPadding(paddle::Tensor &input, - const int block_size) { - using ScaleDtype = float; - - auto input_dim = input.dims(); - const int token_num = input_dim[0]; - const int hidden_size = input_dim[1]; - - PADDLE_ENFORCE(block_size == 128, "now only support block_size = 128"); - PADDLE_ENFORCE(hidden_size % 128 == 0, - "hidden_size must be divisible by 128"); - - const int hidden_size_scale = hidden_size / block_size; - auto quanted_x = GetEmptyTensor( - {token_num, hidden_size}, paddle::DataType::FLOAT8_E4M3FN, input.place()); - - const int tma_alignment_bytes = 16; - const int tma_alignment_elements = tma_alignment_bytes / sizeof(ScaleDtype); - const int padded_token_num = - ((token_num + tma_alignment_elements - 1) / tma_alignment_elements) * - tma_alignment_elements; - auto quanted_scale = GetEmptyTensor({padded_token_num, hidden_size_scale}, - {1, padded_token_num}, - paddle::DataType::FLOAT32, - input.place()); - const int gridx = min(132 * 8, token_num); - const int blockx = min(1024, hidden_size / 128 * 32); - - bool use_finegrained_range = false; - char *env_var = getenv("PER_TOKEN_QUANT_FP8_USE_FINEGRAINED_RANGE"); - if (env_var) { - use_finegrained_range = static_cast(std::stoi(env_var)); - } - - switch (input.dtype()) { - case paddle::DataType::BFLOAT16: - quant_per_token_per_block_padding<<>>( - input.data(), - quanted_x.data(), - quanted_scale.data(), - token_num, - padded_token_num, - hidden_size, - hidden_size_scale, - use_finegrained_range); - break; - case paddle::DataType::FLOAT16: - quant_per_token_per_block_padding<<>>( - input.data(), - quanted_x.data(), - quanted_scale.data(), - token_num, - padded_token_num, - hidden_size, - hidden_size_scale, - use_finegrained_range); - break; - default: - PD_THROW("Unsupported data type for PerTokenQuant"); - } - return {quanted_x, quanted_scale}; -} - -std::vector> PerTokenQuantPaddingInferShape( - std::vector input_shape, const int block_size) { - using ScaleDtype = float; - - const int token_num = input_shape[0]; - const int hidden_size = input_shape[1]; - const int hidden_size_scale = hidden_size / block_size; - - const int tma_alignment_bytes = 16; - const int tma_alignment_elements = tma_alignment_bytes / sizeof(ScaleDtype); - const int padded_token_num = - ((token_num + tma_alignment_elements - 1) / tma_alignment_elements) * - tma_alignment_elements; - - return {{token_num, hidden_size}, {padded_token_num, hidden_size_scale}}; -} - -std::vector PerTokenQuantPaddingInferDtype( - paddle::DataType input_dtype) { - return {paddle::DataType::FLOAT8_E4M3FN, paddle::DataType::FLOAT32}; -} - template __global__ void masked_quant_per_token_per_block( const T *input, @@ -472,22 +165,6 @@ std::vector MaskedPerTokenQuant( return {quanted_x, quanted_scale}; } -PD_BUILD_STATIC_OP(per_token_quant) - .Inputs({"input"}) - .Outputs({"output", "output_scale"}) - .Attrs({"block_size: int"}) - .SetKernelFn(PD_KERNEL(PerTokenQuant)) - .SetInferShapeFn(PD_INFER_SHAPE(PerTokenQuantInferShape)) - .SetInferDtypeFn(PD_INFER_DTYPE(PerTokenQuantInferDtype)); - -PD_BUILD_STATIC_OP(per_token_quant_padding) - .Inputs({"input"}) - .Outputs({"output", "output_scale"}) - .Attrs({"block_size: int"}) - .SetKernelFn(PD_KERNEL(PerTokenQuantPadding)) - .SetInferShapeFn(PD_INFER_SHAPE(PerTokenQuantPaddingInferShape)) - .SetInferDtypeFn(PD_INFER_DTYPE(PerTokenQuantPaddingInferDtype)); - PD_BUILD_STATIC_OP(masked_per_token_quant) .Inputs({"input", "recv_expert_count"}) .Outputs({"output", "output_scale"}) diff --git a/fastdeploy/model_executor/layers/activation.py b/fastdeploy/model_executor/layers/activation.py index 35aa40b77e..9b038bae62 100644 --- a/fastdeploy/model_executor/layers/activation.py +++ b/fastdeploy/model_executor/layers/activation.py @@ -120,6 +120,8 @@ class SiluAndMul(nn.Layer): Returns: Tensor: Output tensor. """ + if self.bias is None and self.quant_scale == -1: + return paddle.nn.functional.swiglu(x) return fused_bias_act( x, bias=self.bias, diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py index 637e6409b8..fb2a991f78 100644 --- a/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py +++ b/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py @@ -92,12 +92,10 @@ def m_grouped_gemm_fp8_fp8_bf16_nt_contiguous_custom_python_op( ffn_out = paddle.incubate.nn.functional.swiglu(ffn_out) # down_proj - ffn_in_x, ffn_in_x_scale_tensor = fastdeploy.model_executor.ops.gpu.per_token_quant( - ffn_out, quant_config_weight_block_size_0 + ffn_in_x, ffn_in_x_scale_tensor = paddle.incubate.nn.functional.fp8_quant_blockwise( + ffn_out, using_pow2_scale=False ) - - ffn_in_x_scale_tensor = ffn_in_x_scale_tensor.transpose([1, 0]).contiguous() - ffn_in_x_scale_tensor = ffn_in_x_scale_tensor.transpose([1, 0]) + ffn_in_x_scale_tensor = ffn_in_x_scale_tensor.T[: ffn_in_x.shape[0]] ffn_out = paddle.empty( (permute_input.shape[0], layer_added_weight_attrs_1.shape[1]), @@ -239,9 +237,10 @@ class DeepGemmFusedMoeMethod(MoEMethodBase): topk_ids_hookfunc(topk_ids=topk_idx) # 2. Dynamic compute blockwise quantization scales - x, x_scale_tensor = fastdeploy.model_executor.ops.gpu.per_token_quant( - x, self.quant_config.weight_block_size[0] + x, x_scale_tensor = paddle.incubate.nn.functional.fp8_quant_blockwise( + x, using_pow2_scale=False, output_scale_transpose=False ) + x_scale_tensor = x_scale_tensor[: x.shape[0]] event = deep_ep.Buffer.capture() let_another_thread_run() @@ -317,10 +316,10 @@ class DeepGemmFusedMoeMethod(MoEMethodBase): ffn_out = paddle.incubate.nn.functional.swiglu(ffn_out, None) # down_proj - ffn_in_x, ffn_in_x_scale_tensor = fastdeploy.model_executor.ops.gpu.per_token_quant( - ffn_out, self.quant_config.weight_block_size[0] + ffn_in_x, ffn_in_x_scale_tensor = paddle.incubate.nn.functional.fp8_quant_blockwise( + ffn_out, using_pow2_scale=False ) - ffn_in_x_scale_tensor = ffn_in_x_scale_tensor.transpose([1, 0]).contiguous().transpose([1, 0]) + ffn_in_x_scale_tensor = ffn_in_x_scale_tensor.T[: ffn_in_x.shape[0]] del ffn_out ffn_out = paddle.empty( @@ -473,7 +472,12 @@ class DeepGemmFusedMoeMethod(MoEMethodBase): tmp = count_tokens_per_expert_func(topk_ids, layer.num_experts) - recv_x, recv_x_scale = fastdeploy.model_executor.ops.gpu.per_token_quant(x, 128) + recv_x, recv_x_scale = paddle.incubate.nn.functional.fp8_quant_blockwise( + x, + using_pow2_scale=False, + output_scale_transpose=False, + ) + recv_x_scale = recv_x_scale[: recv_x.shape[0]] ( permute_input, diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py index c332adf8cc..37b7782105 100644 --- a/fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py +++ b/fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py @@ -1228,7 +1228,10 @@ def python_op_fused_moe_kernel_paddle( from .triton_moe_kernels import fused_moe_kernel_paddle - x_q, x_scale = fastdeploy.model_executor.ops.gpu.per_token_quant(x, quant_config.weight_block_size[0]) + x_q, x_scale = paddle.incubate.nn.functional.fp8_quant_blockwise( + x, using_pow2_scale=False, output_scale_transpose=False + ) + x_scale = x_scale[: x.shape[0]] fused_moe_kernel_paddle[grid]( x_q, @@ -1279,9 +1282,10 @@ def python_op_fused_moe_kernel_paddle( grid = (ceil_div(max_num_tokens_padded, config["BLOCK_SIZE_M"]) * ceil_div(hidden_size, config["BLOCK_SIZE_N"]),) - x_q, x_scale = fastdeploy.model_executor.ops.gpu.per_token_quant( - intermediate_cache2, quant_config.weight_block_size[0] + x_q, x_scale = paddle.incubate.nn.functional.fp8_quant_blockwise( + intermediate_cache2, using_pow2_scale=False, output_scale_transpose=False ) + x_scale = x_scale[: x_q.shape[0]] fused_moe_kernel_paddle[grid]( x_q, diff --git a/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py b/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py index d5af8106ff..a995d18abe 100644 --- a/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py +++ b/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py @@ -18,7 +18,6 @@ from typing import Optional import paddle -import fastdeploy from fastdeploy import envs from fastdeploy.model_executor.layers.linear import ( MergedColumnParallelLinear, @@ -264,9 +263,10 @@ class BlockWiseFP8LinearMethod(QuantMethodBase): layer.weight_scale_inv.set_value(weight_scale) def apply(self, layer, x): - x, x_scale_tensor = fastdeploy.model_executor.ops.gpu.per_token_quant_padding( - x, self.quant_config.weight_block_size[0] + x, x_scale_tensor = paddle.incubate.nn.functional.fp8_quant_blockwise( + x, using_pow2_scale=False, output_scale_transpose=True ) + x_scale_tensor = x_scale_tensor.T linear_out = paddle.empty((x.shape[0], layer.output_size), dtype=paddle.bfloat16) linear_out = deep_gemm_fp8_fp8_bf16_nt( x, x_scale_tensor, layer.weight, layer.weight_scale_inv, linear_out, layer.output_size diff --git a/fastdeploy/model_executor/layers/utils.py b/fastdeploy/model_executor/layers/utils.py index 66c347f680..7f739a8f62 100644 --- a/fastdeploy/model_executor/layers/utils.py +++ b/fastdeploy/model_executor/layers/utils.py @@ -236,19 +236,12 @@ def per_block_cast_to_fp8(x: Tensor, block_size: list = [128, 128]) -> Tuple[Ten dtype=x.dtype, ) x_padded[:m, :n] = x - x_view = paddle.view( - x_padded, - (-1, block_size[0], x_padded.shape[1] // block_size[1], block_size[1]), - ) + from paddle.incubate.nn.functional.fp8 import fp8_quant_blockwise - x_abs = paddle.abs(x_view).astype(paddle.float32) - x_amax = paddle.amax(x_abs, axis=(1, 3), keepdim=True) - x_amax = paddle.clip(x_amax, min=1e-4) - x_scaled = (x_view * (448.0 / x_amax)).astype(paddle.float8_e4m3fn) - - return x_scaled.view_as(x_padded)[:m, :n].contiguous(), ( - paddle.view(x_amax / 448.0, (x_view.shape[0], x_view.shape[2])) + x_q, scale = fp8_quant_blockwise( + x_padded, quant_method="128x128", input_transpose=False, output_scale_transpose=False, using_pow2_scale=False ) + return x_q[:m, :n].contiguous(), scale def per_token_cast_to_fp8(x: Tensor) -> Tuple[Tensor, Tensor]: diff --git a/tests/ce/server/test_logprobs.py b/tests/ce/server/test_logprobs.py index 83ca89486c..3674b3a6b9 100644 --- a/tests/ce/server/test_logprobs.py +++ b/tests/ce/server/test_logprobs.py @@ -25,10 +25,10 @@ def test_unstream_with_logprobs(): # 校验返回内容与概率信息 assert resp_json["choices"][0]["message"]["content"] == "牛顿的" assert resp_json["choices"][0]["logprobs"]["content"][0]["token"] == "牛顿" - assert resp_json["choices"][0]["logprobs"]["content"][0]["logprob"] == -0.031025361269712448 + assert resp_json["choices"][0]["logprobs"]["content"][0]["logprob"] == -0.03113006055355072 assert resp_json["choices"][0]["logprobs"]["content"][0]["top_logprobs"][0] == { "token": "牛顿", - "logprob": -0.031025361269712448, + "logprob": -0.03113006055355072, "bytes": [231, 137, 155, 233, 161, 191], "top_logprobs": None, } @@ -102,10 +102,10 @@ def test_stream_with_logprobs(): # 校验概率字段 assert result_chunk["choices"][0]["delta"]["content"] == "牛顿" assert result_chunk["choices"][0]["logprobs"]["content"][0]["token"] == "牛顿" - assert result_chunk["choices"][0]["logprobs"]["content"][0]["logprob"] == -0.031025361269712448 + assert result_chunk["choices"][0]["logprobs"]["content"][0]["logprob"] == -0.03113006055355072 assert result_chunk["choices"][0]["logprobs"]["content"][0]["top_logprobs"][0] == { "token": "牛顿", - "logprob": -0.031025361269712448, + "logprob": -0.03113006055355072, "bytes": [231, 137, 155, 233, 161, 191], } @@ -187,10 +187,10 @@ def test_stream_with_temp_scaled_logprobs(): # 校验概率字段 assert result_chunk["choices"][0]["delta"]["content"] == "牛顿" assert result_chunk["choices"][0]["logprobs"]["content"][0]["token"] == "牛顿" - assert result_chunk["choices"][0]["logprobs"]["content"][0]["logprob"] == -0.006811376195400953 + assert result_chunk["choices"][0]["logprobs"]["content"][0]["logprob"] == -0.0068125599063932896 assert result_chunk["choices"][0]["logprobs"]["content"][0]["top_logprobs"][0] == { "token": "牛顿", - "logprob": -0.006811376195400953, + "logprob": -0.0068125599063932896, "bytes": [231, 137, 155, 233, 161, 191], } diff --git a/tests/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py b/tests/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py index 1ee3c99a19..1bdc751eb4 100644 --- a/tests/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py +++ b/tests/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py @@ -205,7 +205,7 @@ def test_consistency_between_runs(api_url, headers, consistent_payload): # base result base_path = os.getenv("MODEL_PATH") if base_path: - base_file = os.path.join(base_path, "ernie-4_5-vl-base-tp2-dev-0113") + base_file = os.path.join(base_path, "ernie-4_5-vl-base-tp2-dev-0115") else: base_file = "ernie-4_5-vl-base-tp2-dev-0113" with open(base_file, "r") as f: diff --git a/tests/e2e/test_EB_VL_Lite_serving.py b/tests/e2e/test_EB_VL_Lite_serving.py index 4e9631e8ed..7f52497e6b 100644 --- a/tests/e2e/test_EB_VL_Lite_serving.py +++ b/tests/e2e/test_EB_VL_Lite_serving.py @@ -204,7 +204,7 @@ def test_consistency_between_runs(api_url, headers, consistent_payload): # base result base_path = os.getenv("MODEL_PATH") if base_path: - base_file = os.path.join(base_path, "ernie-4_5-vl-base-tp2-dev-0113") + base_file = os.path.join(base_path, "ernie-4_5-vl-base-tp2-dev-0115") else: base_file = "ernie-4_5-vl-base-tp2-dev-0113" with open(base_file, "r") as f: diff --git a/tests/e2e/test_Qwen2_5_VL_serving.py b/tests/e2e/test_Qwen2_5_VL_serving.py index 37362e7532..ff2ae24e20 100644 --- a/tests/e2e/test_Qwen2_5_VL_serving.py +++ b/tests/e2e/test_Qwen2_5_VL_serving.py @@ -179,7 +179,7 @@ def test_consistency_between_runs(api_url, headers, consistent_payload): f_o.close() # base result - content2 = "这张图片展示了一群人在进行某种活动。前景中有两个孩子和一个成年人,他们似乎在观看或参与某个艺术创作过程。成年人手里拿着一个扇子,上面有各种颜色的颜料,看起来像是在指导孩子们如何使用颜料。孩子们的表情专注,似乎对这个活动很感兴趣。背景中还有其他人在进行类似的活动,环境看起来像是在一个室内空间,可能是教室或工作室。整体氛围显得非常温馨和积极。" + content2 = "这张图片展示了一群人在进行手工艺活动。前景中有两个孩子和一个成年人,他们似乎在制作或展示某种手工艺品。成年人手里拿着一个扇子,上面有彩色的图案,可能是通过某种方式绘制或涂鸦而成。孩子们看起来很专注,可能是在观察或参与这个过程。\n\n背景中还有其他几个人,其中一个人穿着粉色的衣服,背对着镜头。整个场景看起来像是在一个室内环境中,光线充足,氛围轻松愉快。" # Verify that result is same as the base result assert content1 == content2 diff --git a/tests/e2e/test_ernie_21b_mtp.py b/tests/e2e/test_ernie_21b_mtp.py index 3b160bdf57..f5837387de 100644 --- a/tests/e2e/test_ernie_21b_mtp.py +++ b/tests/e2e/test_ernie_21b_mtp.py @@ -339,16 +339,16 @@ def test_mtp_accept_ratio(api_url): print("\nresult:\n", result) base_path = os.getenv("MODEL_PATH") - baseline_path = os.path.join(base_path, "21b_mtp_accept_ratio_baseline.txt") + baseline_path = os.path.join(base_path, "21b_mtp_accept_ratio_baseline_dev.txt") with open(baseline_path, "r", encoding="utf-8") as f: baseline = f.read() baseline_ratio = { - "accepted_tokens": 131, + "accepted_tokens": 139, "rejected_tokens": 23, - "accept_ratio": 0.4122137404580153, - "average_accept_length": 1.7012987012987013, - "accepted_tokens_per_head": [77, 54], - "accept_ratio_per_head": [0.7012987012987013], + "accept_ratio": 0.41726618705035967, + "average_accept_length": 1.7160493827160495, + "accepted_tokens_per_head": [81, 58], + "accept_ratio_per_head": [0.7160493827160493], } response = send_request(url=api_url, payload=payload) diff --git a/tests/e2e/utils/rollout_routing_repaly_test_utils.py b/tests/e2e/utils/rollout_routing_repaly_test_utils.py index 499bbbed68..6e2f82a5c4 100644 --- a/tests/e2e/utils/rollout_routing_repaly_test_utils.py +++ b/tests/e2e/utils/rollout_routing_repaly_test_utils.py @@ -151,9 +151,9 @@ def check_routing_replay_chat_completion(openai_client, moe_layer_num: int, mode cur_save_routing_path = f"./R3_tmp/routing_replay_output_{model_name}/" model_path = os.getenv("MODEL_PATH") if model_path: - baseline_path = os.path.join(model_path, f"R3_BaseLine/routing_replay_output_baseline_{model_name}") + baseline_path = os.path.join(model_path, f"R3_BaseLine_dev/routing_replay_output_baseline_{model_name}") else: - baseline_path = f"./R3_BaseLine/routing_replay_output_baseline_{model_name}" + baseline_path = f"./R3_BaseLine_dev/routing_replay_output_baseline_{model_name}" stream_baseline_path = os.path.join(baseline_path, "r3_chat_completion_stream") nonstream_baseline_path = os.path.join(baseline_path, "r3_chat_completion_nonstream") diff --git a/tests/layers/test_activation.py b/tests/layers/test_activation.py index 70f011d396..b564c26752 100644 --- a/tests/layers/test_activation.py +++ b/tests/layers/test_activation.py @@ -84,8 +84,11 @@ class TestSiluAndMul(unittest.TestCase): layer = SiluAndMul(fd_config) x = paddle.ones([2, 2]) out = layer.forward(x) - self.assertTrue((out.numpy() == 1).all()) - mock_fused.assert_called_once() + if layer.bias is None and layer.quant_scale == -1: + self.assertTrue((out.numpy() == 0.73105854).all()) + else: + self.assertTrue((out.numpy() == 1).all()) + mock_fused.assert_called_once() # Test forward computation on GCU platform @patch( diff --git a/tests/model_loader/test_torch_model.py b/tests/model_loader/test_torch_model.py index bc8252a442..7ab1533f10 100644 --- a/tests/model_loader/test_torch_model.py +++ b/tests/model_loader/test_torch_model.py @@ -140,7 +140,7 @@ def test_model_against_baseline( # Get baseline suffix from config model_config = hugging_face_model_param_map.get(model_name_or_path, {}) - baseline_suffix = model_config.get("baseline_suffix", "tp2") + baseline_suffix = model_config.get("baseline_suffix", "tp2-dev") baseline_filename = f"{model_name_or_path}-{baseline_suffix}" if base_path: diff --git a/tests/operators/test_per_token_quant.py b/tests/operators/test_per_token_quant.py deleted file mode 100644 index 23972ce53c..0000000000 --- a/tests/operators/test_per_token_quant.py +++ /dev/null @@ -1,187 +0,0 @@ -""" -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License" -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" - -import unittest - -import numpy as np -import paddle -import paddle.nn.functional as F - -from fastdeploy.model_executor.ops.gpu import per_token_quant, per_token_quant_padding - -paddle.seed(2024) - - -def per_token_quant_paddle(input_tensor, block_size): - MAX_VALUE = 448.0 - epsilon = 1e-10 - - input_shape = input_tensor.shape - token_num = input_shape[0] - hidden_size = input_shape[1] - - # According to https://github.com/PaddlePaddle/FastDeploy/pull/3659 - padding_size = (block_size - hidden_size % block_size) % block_size - - padded_input = input_tensor - if padding_size > 0: - padded_input = F.pad(input_tensor, pad=[0, padding_size], mode="constant", value=0.0) - - padded_hidden_size = hidden_size + padding_size - hidden_size_scale = padded_hidden_size // block_size - - reshaped_input = paddle.reshape(padded_input, [token_num, hidden_size_scale, block_size]).astype("float32") - - max_abs_val = paddle.max(paddle.abs(reshaped_input), axis=-1, keepdim=True) - max_abs_val = paddle.clip(max_abs_val, min=epsilon) - scale = max_abs_val / MAX_VALUE - - quanted_value = reshaped_input / scale - - quanted_x_padded_reshaped = quanted_value.to(paddle.float8_e4m3fn) - quanted_x_padded = paddle.reshape(quanted_x_padded_reshaped, [token_num, padded_hidden_size]) - - quanted_x = quanted_x_padded[:, :hidden_size] - - quanted_scale = paddle.squeeze(scale, axis=-1) - - return quanted_x, quanted_scale - - -def per_token_quant_padding_paddle(input_tensor, block_size, dtype): - quanted_x, intermediate_scale = per_token_quant_paddle(input_tensor, block_size) - token_num = input_tensor.shape[0] - - tma_alignment_elements = 4 - padded_token_num = ((token_num + tma_alignment_elements - 1) // tma_alignment_elements) * tma_alignment_elements - - hidden_size_scale = intermediate_scale.shape[1] - padded_scale = paddle.zeros([padded_token_num, hidden_size_scale], dtype="float32") - - padded_scale[:token_num, :] = intermediate_scale - - return quanted_x, padded_scale - - -class TestPerTokenQuant(unittest.TestCase): - def get_input(self, shape, dtype): - return paddle.randn(shape=shape, dtype=dtype) - - def setUp(self) -> None: - self.dtype = paddle.float16 - self.token_num = 4 - self.hidden_size = 500 - self.block_size = 128 - self.input_tensor = self.get_input(shape=[self.token_num, self.hidden_size], dtype=self.dtype) - - def test_per_token_quant(self): - paddle_output, paddle_output_scale = per_token_quant_paddle(self.input_tensor, self.block_size) - output, output_scale = per_token_quant(self.input_tensor, self.block_size) - - np.testing.assert_allclose(paddle_output_scale.numpy(), output_scale.numpy(), rtol=1e-6) - - output_rel_diff = paddle.mean( - paddle.abs(output.to(paddle.float32) - paddle_output.to(paddle.float32)) - ) / paddle.mean(paddle.abs(paddle_output.to(paddle.float32))) - - assert output_rel_diff < 0.001 - - -class TestPerTokenQuantCase1(TestPerTokenQuant): - def setUp(self) -> None: - self.dtype = paddle.float16 - self.token_num = 4 - self.hidden_size = 128 * 6 - self.block_size = 128 - self.input_tensor = self.get_input(shape=[self.token_num, self.hidden_size], dtype=self.dtype) - - -class TestPerTokenQuantCase2(TestPerTokenQuant): - def setUp(self) -> None: - self.dtype = paddle.bfloat16 - self.token_num = 4 - self.hidden_size = 500 - self.block_size = 128 - self.input_tensor = self.get_input(shape=[self.token_num, self.hidden_size], dtype=self.dtype) - - -class TestPerTokenQuantCase3(TestPerTokenQuant): - def setUp(self) -> None: - self.dtype = paddle.bfloat16 - self.token_num = 4 - self.hidden_size = 128 * 6 - self.block_size = 128 - self.input_tensor = self.get_input(shape=[self.token_num, self.hidden_size], dtype=self.dtype) - - -class TestPerTokenQuantPadding(TestPerTokenQuant): - def setUp(self) -> None: - self.dtype = paddle.float16 - self.token_num = 6 - self.hidden_size = 128 * 4 - self.block_size = 128 - self.input_tensor = self.get_input(shape=[self.token_num, self.hidden_size], dtype=self.dtype) - - def test_per_token_quant_padding(self): - paddle_output, paddle_output_scale = per_token_quant_padding_paddle( - self.input_tensor, self.block_size, self.dtype - ) - output, output_scale = per_token_quant_padding(self.input_tensor, self.block_size) - - self.assertEqual(paddle_output_scale.shape, output_scale.shape) - np.testing.assert_allclose( - paddle_output_scale[0 : self.token_num].numpy(), - output_scale[0 : self.token_num].numpy(), - rtol=1e-5, - atol=1e-5, - ) - - output_rel_diff = paddle.mean( - paddle.abs(output.to(paddle.float32) - paddle_output.to(paddle.float32)) - ) / paddle.mean(paddle.abs(paddle_output.to(paddle.float32)) + 1e-9) - - assert output_rel_diff < 0.001 - - -class TestPerTokenQuantPaddingCase1(TestPerTokenQuantPadding): - def setUp(self) -> None: - self.dtype = paddle.float16 - self.token_num = 8 - self.hidden_size = 128 * 4 - self.block_size = 128 - self.input_tensor = self.get_input(shape=[self.token_num, self.hidden_size], dtype=self.dtype) - - -class TestPerTokenQuantPaddingCase2(TestPerTokenQuantPadding): - def setUp(self) -> None: - self.dtype = paddle.bfloat16 - self.token_num = 6 - self.hidden_size = 128 * 4 - self.block_size = 128 - self.input_tensor = self.get_input(shape=[self.token_num, self.hidden_size], dtype=self.dtype) - - -class TestPerTokenQuantPaddingCase3(TestPerTokenQuantPadding): - def setUp(self) -> None: - self.dtype = paddle.bfloat16 - self.token_num = 8 - self.hidden_size = 128 * 4 - self.block_size = 128 - self.input_tensor = self.get_input(shape=[self.token_num, self.hidden_size], dtype=self.dtype) - - -if __name__ == "__main__": - unittest.main()