Merge branch 'develop' into gbd_android

DefTruth
2022-11-04 21:23:26 +08:00
committed by GitHub
196 changed files with 4759 additions and 1684 deletions
+2
@@ -38,3 +38,5 @@ coverage
*.local
yalc.*
.yalc
examples/vision/collect_quantize_cc.sh
examples/vision/tests_quantize
+10 -8
@@ -50,7 +50,6 @@ if(ANDROID)
endif()
############################# Basic Options for FastDeploy ################################
option(ENABLE_PADDLE_FRONTEND "Whether to enable PaddlePaddle frontend to support load paddle model in fastdeploy." ON)
option(WITH_GPU "Whether WITH_GPU=ON, will enable onnxruntime-gpu/paddle-inference-gpu/poros-gpu" OFF)
option(WITH_IPU "Whether WITH_IPU=ON, will enable paddle-inference-ipu" OFF)
option(ENABLE_ORT_BACKEND "Whether to enable onnxruntime backend." OFF)
@@ -190,13 +189,8 @@ if(WITH_SW)
add_definitions(-DEIGEN_AVOID_THREAD_LOCAL)
endif()
if(ENABLE_PADDLE_FRONTEND)
add_definitions(-DENABLE_PADDLE_FRONTEND)
include(${PROJECT_SOURCE_DIR}/cmake/paddle2onnx.cmake)
list(APPEND DEPEND_LIBS external_paddle2onnx)
endif(ENABLE_PADDLE_FRONTEND)
if(ENABLE_ORT_BACKEND)
set(ENABLE_PADDLE_FRONTEND ON)
add_definitions(-DENABLE_ORT_BACKEND)
list(APPEND ALL_DEPLOY_SRCS ${DEPLOY_ORT_SRCS})
include(${PROJECT_SOURCE_DIR}/cmake/onnxruntime.cmake)
@@ -224,6 +218,7 @@ if(ENABLE_PADDLE_BACKEND)
endif()
if(ENABLE_OPENVINO_BACKEND)
set(ENABLE_PADDLE_FRONTEND ON)
add_definitions(-DENABLE_OPENVINO_BACKEND)
list(APPEND ALL_DEPLOY_SRCS ${DEPLOY_OPENVINO_SRCS})
include(${PROJECT_SOURCE_DIR}/cmake/openvino.cmake)
@@ -329,6 +324,7 @@ if(WITH_IPU)
endif()
if(ENABLE_TRT_BACKEND)
set(ENABLE_PADDLE_FRONTEND ON)
if(APPLE OR ANDROID OR IOS)
message(FATAL_ERROR "Cannot enable tensorrt backend in mac/ios/android os, please set -DENABLE_TRT_BACKEND=OFF.")
endif()
@@ -382,7 +378,6 @@ endif()
if(ENABLE_VISION)
add_definitions(-DENABLE_VISION)
# set(ENABLE_VISION_VISUALIZE ON)
add_definitions(-DENABLE_VISION_VISUALIZE)
if(ENABLE_OPENCV_CUDA)
if(NOT WITH_GPU)
@@ -424,6 +419,13 @@ if(ENABLE_TEXT)
include(${PROJECT_SOURCE_DIR}/cmake/faster_tokenizer.cmake)
endif()
if(ENABLE_PADDLE_FRONTEND)
add_definitions(-DENABLE_PADDLE_FRONTEND)
include(${PROJECT_SOURCE_DIR}/cmake/paddle2onnx.cmake)
list(APPEND DEPEND_LIBS external_paddle2onnx)
endif(ENABLE_PADDLE_FRONTEND)
configure_file(${PROJECT_SOURCE_DIR}/FastDeploy.cmake.in ${PROJECT_SOURCE_DIR}/FastDeploy.cmake @ONLY)
configure_file(${PROJECT_SOURCE_DIR}/python/fastdeploy/c_lib_wrap.py.in ${PROJECT_SOURCE_DIR}/python/fastdeploy/c_lib_wrap.py)
configure_file(${PROJECT_SOURCE_DIR}/python/scripts/process_libraries.py.in ${PROJECT_SOURCE_DIR}/python/scripts/process_libraries.py)
+16 -12
@@ -30,22 +30,22 @@ English | [简体中文](README_CN.md)
## 📣 Recent Updates
- 🔥 **2022.10.15 Release FastDeploy [release v0.3.0](https://github.com/PaddlePaddle/FastDeploy/tree/release/0.3.0)** <br>
- 🔥 **2022.10.31 Release FastDeploy [release v0.5.0](https://github.com/PaddlePaddle/FastDeploy/tree/release/0.5.0)** <br>
- **New deployment upgrade: more backends and more CV models supported**
- Support Paddle Inference TensorRT, providing a seamless deployment experience alongside the other integrated inference engines: TensorRT, OpenVINO, ONNX Runtime, Paddle Lite and Paddle Inference
- Support Graphcore IPU through Paddle Inference
- Support the tracking model [PP-Tracking](./examples/vision/tracking/pptracking) and the [RobustVideoMatting](./examples/vision/matting) model
- 🔥 **2022.10.24 Release FastDeploy [release v0.4.0](https://github.com/PaddlePaddle/FastDeploy/tree/release/0.4.0)** <br>
- **New server-side deployment upgrade: more CV and NLP models supported**
- Integrate OpenVINO, providing a seamless deployment experience alongside the other integrated inference engines: TensorRT, ONNX Runtime and Paddle Inference
- Support [one-click model quantization](tools/quantization), improving model inference speed by 1.5 to 2 times on CPU & GPU platforms; supported quantized models include YOLOv7, YOLOv6, YOLOv5, etc.
- Integrate Paddle Lite, providing a seamless deployment experience alongside the other integrated inference engines: TensorRT, OpenVINO, ONNX Runtime and Paddle Inference
- Support [lightweight detection models](examples/vision/detection/paddledetection/android) and [classification models](examples/vision/classification/paddleclas/android) on the Android platform; download the demos to try them out
- End-to-end optimization on GPU: [YOLO series](examples/vision/detection) models' end-to-end inference speeds up from 43 ms to 25 ms
- Web and Mini Program deployment: new [OCR and other CV models](examples/application/js) capability
- Support [TinyPose](examples/vision/keypointdetection/tiny_pose) and [PicoDet+TinyPose](examples/vision/keypointdetection/det_keypoint_unite) pipeline deployment
- New CV models include PP-OCRv3, PP-OCRv2, PP-TinyPose, PP-Matting, etc., with [end-to-end deployment demos](examples/vision/detection/)
- New information extraction model UIE, with [end-to-end deployment demos](examples/text/uie)
- 🔥 **2022.8.18 Release FastDeploy [release v0.2.0](https://github.com/PaddlePaddle/FastDeploy/tree/release%2F0.2.0)** <br>
- **New server-side deployment upgrade: faster inference performance, more CV models supported**
- Release a high-performance inference engine SDK for x86 CPUs and NVIDIA GPUs, with a significant increase in inference speed
- Integrate Paddle Inference, ONNX Runtime, TensorRT and other inference engines, providing a seamless deployment experience
- Support the full range of object detection models such as YOLOv7, YOLOv6, YOLOv5 and PP-YOLOE, with [end-to-end deployment demos](examples/vision/detection/)
- Support over 40 key models and [demo examples](examples/vision/) covering face detection, face recognition, real-time portrait matting, image segmentation and more
- Support deployment in both Python and C++
- **Support Rockchip, Amlogic, NXP and other NPU chips for edge-device deployment**
- Release the lightweight object detection [Picodet-NPU deployment demo](https://github.com/PaddlePaddle/Paddle-Lite-Demo/tree/develop/object_detection/linux/picodet_detection), providing full INT8 quantized inference capability
## Contents
@@ -197,6 +197,7 @@ Notes: ✅: already supported; ❔: to be supported in the future; ❌: not supp
| <font size=2> Detection | <font size=2> [PaddleDetection/PP-YOLO](./examples/vision/detection/paddledetection) | <font size=2> [Python](./examples/vision/detection/paddledetection/python)/[C++](./examples/vision/detection/paddledetection/cpp) | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ |❔|❔|
| <font size=2> Detection | <font size=2> [PaddleDetection/PP-YOLOv2](./examples/vision/detection/paddledetection) | <font size=2> [Python](./examples/vision/detection/paddledetection/python)/[C++](./examples/vision/detection/paddledetection/cpp) | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ |❔|❔|
| <font size=2> Detection | <font size=2> [PaddleDetection/FasterRCNN](./examples/vision/detection/paddledetection) | <font size=2> [Python](./examples/vision/detection/paddledetection/python)/[C++](./examples/vision/detection/paddledetection/cpp) | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ |❔|❔|
| <font size=2> Detection | <font size=2> [PaddleDetection/PP-Tracking](./examples/vision/tracking/pptracking) | <font size=2> [Python](./examples/vision/tracking/pptracking/python)/[C++](./examples/vision/tracking/pptracking/cpp) | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ |❔|❔|
| <font size=2> Detection | <font size=2> [Megvii-BaseDetection/YOLOX](./examples/vision/detection/yolox) | <font size=2> [Python](./examples/vision/detection/yolox/python)/[C++](./examples/vision/detection/yolox/cpp) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |❔|❔|
| <font size=2> Detection | <font size=2> [WongKinYiu/YOLOv7](./examples/vision/detection/yolov7) | <font size=2> [Python](./examples/vision/detection/yolov7/python)/[C++](./examples/vision/detection/yolov7/cpp) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |❔|❔|
| <font size=2> Detection | <font size=2> [meituan/YOLOv6](./examples/vision/detection/yolov6) | <font size=2> [Python](./examples/vision/detection/yolov6/python)/[C++](./examples/vision/detection/yolov6/cpp) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |❔|❔|
@@ -222,10 +223,13 @@ Notes: ✅: already supported; ❔: to be supported in the future; ❌: not supp
| <font size=2> Face Recognition | <font size=2> [insightface/PartialFC](./examples/vision/faceid/insightface) | <font size=2> [Python](./examples/vision/faceid/insightface/python)/[C++](./examples/vision/faceid/insightface/cpp) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |❔|❔|
| <font size=2> Face Recognition | <font size=2> [insightface/VPL](./examples/vision/faceid/insightface) | <font size=2> [Python](./examples/vision/faceid/insightface/python)/[C++](./examples/vision/faceid/insightface/cpp) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |❔|❔|
| <font size=2> Matting | <font size=2> [ZHKKKe/MODNet](./examples/vision/matting/modnet) | <font size=2> [Python](./examples/vision/matting/modnet/python)/[C++](./examples/vision/matting/modnet/cpp) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅| ❌ |❔|❔|
| <font size=2> Matting | <font size=2> [PeterL1n/RobustVideoMatting](./examples/vision/matting/rvm) | <font size=2> [Python](./examples/vision/matting/rvm/python)/[C++](./examples/vision/matting/rvm/cpp) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅| ❌ |❔|❔|
| <font size=2> Matting | <font size=2> [PaddleSeg/PP-Matting](./examples/vision/matting/ppmatting) | <font size=2> [Python](./examples/vision/matting/ppmatting/python)/[C++](./examples/vision/matting/ppmatting/cpp) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅| ❌ |❔|❔|
| <font size=2> Matting | <font size=2> [PaddleSeg/PP-HumanMatting](./examples/vision/matting/modnet) | <font size=2> [Python](./examples/vision/matting/ppmatting/python)/[C++](./examples/vision/matting/ppmatting/cpp) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅| ❌ |❔|❔|
| <font size=2> Matting | <font size=2> [PaddleSeg/ModNet](./examples/vision/matting/modnet) | <font size=2> [Python](./examples/vision/matting/ppmatting/python)/[C++](./examples/vision/matting/ppmatting/cpp) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅| ❌ |❔|❔|
| <font size=2> Information Extraction | <font size=2> [PaddleNLP/UIE](./examples/text/uie) | <font size=2> [Python](./examples/text/uie/python)/[C++](./examples/text/uie/cpp) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅| ❌ |❔|❔|
| <font size=2>Text Classification | <font size=2> [PaddleNLP/Ernie-3.0](./examples/text/ernie-3.0) | <font size=2> Python/C++ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅| ❌ |❔|❔|
| <font size=2> Text-to-Speech| <font size=2> [PaddleSpeech/PP-TTS](./examples/audio/pp-tts) | <font size=2> [Python](./examples/audio/pp-tts/python)/C++ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅| ❌ |❔|❔|
## Edge-Side Deployment
File mode changed: regular → executable
+43 -25
@@ -50,16 +50,16 @@ def parse_arguments():
parser.add_argument(
"--backend",
type=str,
default="ort",
help="inference backend, ort, ov, trt, paddle, paddle_trt.")
default="default",
help="inference backend, default, ort, ov, trt, paddle, paddle_trt.")
parser.add_argument(
"--enable_trt_fp16",
type=bool,
type=ast.literal_eval,
default=False,
help="whether enable fp16 in trt backend")
parser.add_argument(
"--enable_collect_memory_info",
type=bool,
type=ast.literal_eval,
default=False,
help="whether enable collect memory info")
args = parser.parse_args()
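The change from `type=bool` to `type=ast.literal_eval` above fixes a classic argparse pitfall: argparse applies `bool()` to the raw command-line string, and any non-empty string, including `"False"`, is truthy. A minimal standalone sketch of the difference (not FastDeploy-specific):

```python
import argparse
import ast

parser = argparse.ArgumentParser()
# Broken: bool("False") is True, so passing "False" still enables the flag.
parser.add_argument("--broken_flag", type=bool, default=False)
# Fixed: ast.literal_eval("False") parses the Python literal and yields False.
parser.add_argument("--enable_trt_fp16", type=ast.literal_eval, default=False)

args = parser.parse_args(["--broken_flag", "False", "--enable_trt_fp16", "False"])
print(args.broken_flag)      # True  (surprising)
print(args.enable_trt_fp16)  # False (intended)
```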
@@ -70,26 +70,43 @@ def build_option(args):
option = fd.RuntimeOption()
device = args.device
backend = args.backend
enable_trt_fp16 = args.enable_trt_fp16
option.set_cpu_thread_num(args.cpu_num_thread)
if device == "gpu":
option.use_gpu(args.device_id)
if backend == "trt" or backend == "paddle_trt":
assert device == "gpu", "the trt backend need device==gpu"
option.use_trt_backend()
if backend == "paddle_trt":
option.enable_paddle_to_trt()
if args.enable_trt_fp16:
option.enable_trt_fp16()
elif backend == "ov":
assert device == "cpu", "the openvino backend need device==cpu"
option.use_openvino_backend()
elif backend == "paddle":
option.use_paddle_backend()
elif backend == "ort":
option.use_ort_backend()
option.use_gpu()
if backend == "ort":
option.use_ort_backend()
elif backend == "paddle":
option.use_paddle_backend()
elif backend in ["trt", "paddle_trt"]:
option.use_trt_backend()
if backend == "paddle_trt":
option.enable_paddle_to_trt()
if enable_trt_fp16:
option.enable_trt_fp16()
elif backend == "default":
return option
else:
raise Exception(
"While inference with GPU, only support default/ort/paddle/trt/paddle_trt now, {} is not supported.".
format(backend))
elif device == "cpu":
if backend == "ort":
option.use_ort_backend()
elif backend == "ov":
option.use_openvino_backend()
elif backend == "paddle":
option.use_paddle_backend()
elif backend == "default":
return option
else:
raise Exception(
"While inference with CPU, only support default/ort/ov/paddle now, {} is not supported.".
format(backend))
else:
print("%s is an unsupported backend" % backend)
raise Exception(
"Only support device CPU/GPU now, {} is not supported.".format(
device))
return option
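For context, the refactored `build_option` maps each (device, backend) pair onto `fd.RuntimeOption` calls, and the new `default` backend returns the option untouched so FastDeploy selects a backend itself. A short usage sketch built only from the option methods visible in this diff (a sketch, not the full benchmark script):

```python
import fastdeploy as fd

# Equivalent of: --device gpu --backend paddle_trt --enable_trt_fp16 True
option = fd.RuntimeOption()
option.set_cpu_thread_num(1)
option.use_gpu(0)               # device_id 0
option.use_trt_backend()
option.enable_paddle_to_trt()   # paddle_trt: drive TensorRT via Paddle Inference
option.enable_trt_fp16()        # optional FP16 mode for the TRT engine
```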
@@ -123,6 +140,7 @@ if __name__ == '__main__':
config_file = os.path.join(args.model, "inference_cls.yaml")
gpu_id = args.device_id
enable_collect_memory_info = args.enable_collect_memory_info
end2end_statis = list()
cpu_mem = list()
gpu_mem = list()
@@ -149,7 +167,7 @@ if __name__ == '__main__':
start = time.time()
result = model.predict(im)
end2end_statis.append(time.time() - start)
if args.enable_collect_memory_info:
if enable_collect_memory_info:
gpu_util.append(get_current_gputil(gpu_id))
cm, gm = get_current_memory_mb(gpu_id)
cpu_mem.append(cm)
@@ -159,7 +177,7 @@ if __name__ == '__main__':
warmup_iter = args.iter_num // 5
end2end_statis_repeat = end2end_statis[warmup_iter:]
if args.enable_collect_memory_info:
if enable_collect_memory_info:
cpu_mem_repeat = cpu_mem[warmup_iter:]
gpu_mem_repeat = gpu_mem[warmup_iter:]
gpu_util_repeat = gpu_util[warmup_iter:]
@@ -167,14 +185,14 @@ if __name__ == '__main__':
dump_result = dict()
dump_result["runtime"] = runtime_statis["avg_time"] * 1000
dump_result["end2end"] = np.mean(end2end_statis_repeat) * 1000
if args.enable_collect_memory_info:
if enable_collect_memory_info:
dump_result["cpu_rss_mb"] = np.mean(cpu_mem_repeat)
dump_result["gpu_rss_mb"] = np.mean(gpu_mem_repeat)
dump_result["gpu_util"] = np.mean(gpu_util_repeat)
f.writelines("Runtime(ms): {} \n".format(str(dump_result["runtime"])))
f.writelines("End2End(ms): {} \n".format(str(dump_result["end2end"])))
if args.enable_collect_memory_info:
if enable_collect_memory_info:
f.writelines("cpu_rss_mb: {} \n".format(
str(dump_result["cpu_rss_mb"])))
f.writelines("gpu_rss_mb: {} \n".format(
File mode changed: regular → executable
+43 -25
@@ -52,16 +52,16 @@ def parse_arguments():
parser.add_argument(
"--backend",
type=str,
default="ort",
help="inference backend, ort, ov, trt, paddle, paddle_trt.")
default="default",
help="inference backend, default, ort, ov, trt, paddle, paddle_trt.")
parser.add_argument(
"--enable_trt_fp16",
type=bool,
type=ast.literal_eval,
default=False,
help="whether enable fp16 in trt backend")
parser.add_argument(
"--enable_collect_memory_info",
type=bool,
type=ast.literal_eval,
default=False,
help="whether enable collect memory info")
args = parser.parse_args()
@@ -72,26 +72,43 @@ def build_option(args):
option = fd.RuntimeOption()
device = args.device
backend = args.backend
enable_trt_fp16 = args.enable_trt_fp16
option.set_cpu_thread_num(args.cpu_num_thread)
if device == "gpu":
option.use_gpu(args.device_id)
if backend == "trt" or backend == "paddle_trt":
assert device == "gpu", "the trt backend need device==gpu"
option.use_trt_backend()
if backend == "paddle_trt":
option.enable_paddle_to_trt()
if args.enable_trt_fp16:
option.enable_trt_fp16()
elif backend == "ov":
assert device == "cpu", "the openvino backend need device==cpu"
option.use_openvino_backend()
elif backend == "paddle":
option.use_paddle_backend()
elif backend == "ort":
option.use_ort_backend()
option.use_gpu()
if backend == "ort":
option.use_ort_backend()
elif backend == "paddle":
option.use_paddle_backend()
elif backend in ["trt", "paddle_trt"]:
option.use_trt_backend()
if backend == "paddle_trt":
option.enable_paddle_to_trt()
if enable_trt_fp16:
option.enable_trt_fp16()
elif backend == "default":
return option
else:
raise Exception(
"While inference with GPU, only support default/ort/paddle/trt/paddle_trt now, {} is not supported.".
format(backend))
elif device == "cpu":
if backend == "ort":
option.use_ort_backend()
elif backend == "ov":
option.use_openvino_backend()
elif backend == "paddle":
option.use_paddle_backend()
elif backend == "default":
return option
else:
raise Exception(
"While inference with CPU, only support default/ort/ov/paddle now, {} is not supported.".
format(backend))
else:
print("%s is an unsupported backend" % backend)
raise Exception(
"Only support device CPU/GPU now, {} is not supported.".format(
device))
return option
@@ -125,6 +142,7 @@ if __name__ == '__main__':
config_file = os.path.join(args.model, "infer_cfg.yml")
gpu_id = args.device_id
enable_collect_memory_info = args.enable_collect_memory_info
end2end_statis = list()
cpu_mem = list()
gpu_mem = list()
@@ -169,7 +187,7 @@ if __name__ == '__main__':
start = time.time()
result = model.predict(im)
end2end_statis.append(time.time() - start)
if args.enable_collect_memory_info:
if enable_collect_memory_info:
gpu_util.append(get_current_gputil(gpu_id))
cm, gm = get_current_memory_mb(gpu_id)
cpu_mem.append(cm)
@@ -179,7 +197,7 @@ if __name__ == '__main__':
warmup_iter = args.iter_num // 5
end2end_statis_repeat = end2end_statis[warmup_iter:]
if args.enable_collect_memory_info:
if enable_collect_memory_info:
cpu_mem_repeat = cpu_mem[warmup_iter:]
gpu_mem_repeat = gpu_mem[warmup_iter:]
gpu_util_repeat = gpu_util[warmup_iter:]
@@ -187,14 +205,14 @@ if __name__ == '__main__':
dump_result = dict()
dump_result["runtime"] = runtime_statis["avg_time"] * 1000
dump_result["end2end"] = np.mean(end2end_statis_repeat) * 1000
if args.enable_collect_memory_info:
if enable_collect_memory_info:
dump_result["cpu_rss_mb"] = np.mean(cpu_mem_repeat)
dump_result["gpu_rss_mb"] = np.mean(gpu_mem_repeat)
dump_result["gpu_util"] = np.mean(gpu_util_repeat)
f.writelines("Runtime(ms): {} \n".format(str(dump_result["runtime"])))
f.writelines("End2End(ms): {} \n".format(str(dump_result["end2end"])))
if args.enable_collect_memory_info:
if enable_collect_memory_info:
f.writelines("cpu_rss_mb: {} \n".format(
str(dump_result["cpu_rss_mb"])))
f.writelines("gpu_rss_mb: {} \n".format(
File mode changed: regular → executable
+43 -25
@@ -50,16 +50,16 @@ def parse_arguments():
parser.add_argument(
"--backend",
type=str,
default="ort",
help="inference backend, ort, ov, trt, paddle, paddle_trt.")
default="default",
help="inference backend, default, ort, ov, trt, paddle, paddle_trt.")
parser.add_argument(
"--enable_trt_fp16",
type=bool,
type=ast.literal_eval,
default=False,
help="whether enable fp16 in trt backend")
parser.add_argument(
"--enable_collect_memory_info",
type=bool,
type=ast.literal_eval,
default=False,
help="whether enable collect memory info")
args = parser.parse_args()
@@ -70,26 +70,43 @@ def build_option(args):
option = fd.RuntimeOption()
device = args.device
backend = args.backend
enable_trt_fp16 = args.enable_trt_fp16
option.set_cpu_thread_num(args.cpu_num_thread)
if device == "gpu":
option.use_gpu(args.device_id)
if backend == "trt" or backend == "paddle_trt":
assert device == "gpu", "the trt backend need device==gpu"
option.use_trt_backend()
if backend == "paddle_trt":
option.enable_paddle_to_trt()
if args.enable_trt_fp16:
option.enable_trt_fp16()
elif backend == "ov":
assert device == "cpu", "the openvino backend need device==cpu"
option.use_openvino_backend()
elif backend == "paddle":
option.use_paddle_backend()
elif backend == "ort":
option.use_ort_backend()
option.use_gpu()
if backend == "ort":
option.use_ort_backend()
elif backend == "paddle":
option.use_paddle_backend()
elif backend in ["trt", "paddle_trt"]:
option.use_trt_backend()
if backend == "paddle_trt":
option.enable_paddle_to_trt()
if enable_trt_fp16:
option.enable_trt_fp16()
elif backend == "default":
return option
else:
raise Exception(
"While inference with GPU, only support default/ort/paddle/trt/paddle_trt now, {} is not supported.".
format(backend))
elif device == "cpu":
if backend == "ort":
option.use_ort_backend()
elif backend == "ov":
option.use_openvino_backend()
elif backend == "paddle":
option.use_paddle_backend()
elif backend == "default":
return option
else:
raise Exception(
"While inference with CPU, only support default/ort/ov/paddle now, {} is not supported.".
format(backend))
else:
print("%s is an unsupported backend" % backend)
raise Exception(
"Only support device CPU/GPU now, {} is not supported.".format(
device))
return option
@@ -123,6 +140,7 @@ if __name__ == '__main__':
config_file = os.path.join(args.model, "deploy.yaml")
gpu_id = args.device_id
enable_collect_memory_info = args.enable_collect_memory_info
end2end_statis = list()
cpu_mem = list()
gpu_mem = list()
@@ -148,7 +166,7 @@ if __name__ == '__main__':
start = time.time()
result = model.predict(im)
end2end_statis.append(time.time() - start)
if args.enable_collect_memory_info:
if enable_collect_memory_info:
gpu_util.append(get_current_gputil(gpu_id))
cm, gm = get_current_memory_mb(gpu_id)
cpu_mem.append(cm)
@@ -158,7 +176,7 @@ if __name__ == '__main__':
warmup_iter = args.iter_num // 5
end2end_statis_repeat = end2end_statis[warmup_iter:]
if args.enable_collect_memory_info:
if enable_collect_memory_info:
cpu_mem_repeat = cpu_mem[warmup_iter:]
gpu_mem_repeat = gpu_mem[warmup_iter:]
gpu_util_repeat = gpu_util[warmup_iter:]
@@ -166,14 +184,14 @@ if __name__ == '__main__':
dump_result = dict()
dump_result["runtime"] = runtime_statis["avg_time"] * 1000
dump_result["end2end"] = np.mean(end2end_statis_repeat) * 1000
if args.enable_collect_memory_info:
if enable_collect_memory_info:
dump_result["cpu_rss_mb"] = np.mean(cpu_mem_repeat)
dump_result["gpu_rss_mb"] = np.mean(gpu_mem_repeat)
dump_result["gpu_util"] = np.mean(gpu_util_repeat)
f.writelines("Runtime(ms): {} \n".format(str(dump_result["runtime"])))
f.writelines("End2End(ms): {} \n".format(str(dump_result["end2end"])))
if args.enable_collect_memory_info:
if enable_collect_memory_info:
f.writelines("cpu_rss_mb: {} \n".format(
str(dump_result["cpu_rss_mb"])))
f.writelines("gpu_rss_mb: {} \n".format(
File mode changed: regular → executable
+43 -25
@@ -52,16 +52,16 @@ def parse_arguments():
parser.add_argument(
"--backend",
type=str,
default="ort",
help="inference backend, ort, ov, trt, paddle, paddle_trt.")
default="default",
help="inference backend, default, ort, ov, trt, paddle, paddle_trt.")
parser.add_argument(
"--enable_trt_fp16",
type=bool,
type=ast.literal_eval,
default=False,
help="whether enable fp16 in trt backend")
parser.add_argument(
"--enable_collect_memory_info",
type=bool,
type=ast.literal_eval,
default=False,
help="whether enable collect memory info")
args = parser.parse_args()
@@ -72,26 +72,43 @@ def build_option(args):
option = fd.RuntimeOption()
device = args.device
backend = args.backend
enable_trt_fp16 = args.enable_trt_fp16
option.set_cpu_thread_num(args.cpu_num_thread)
if device == "gpu":
option.use_gpu(args.device_id)
if backend == "trt" or backend == "paddle_trt":
assert device == "gpu", "the trt backend need device==gpu"
option.use_trt_backend()
if backend == "paddle_trt":
option.enable_paddle_to_trt()
if args.enable_trt_fp16:
option.enable_trt_fp16()
elif backend == "ov":
assert device == "cpu", "the openvino backend need device==cpu"
option.use_openvino_backend()
elif backend == "paddle":
option.use_paddle_backend()
elif backend == "ort":
option.use_ort_backend()
option.use_gpu()
if backend == "ort":
option.use_ort_backend()
elif backend == "paddle":
option.use_paddle_backend()
elif backend in ["trt", "paddle_trt"]:
option.use_trt_backend()
if backend == "paddle_trt":
option.enable_paddle_to_trt()
if enable_trt_fp16:
option.enable_trt_fp16()
elif backend == "default":
return option
else:
raise Exception(
"While inference with GPU, only support default/ort/paddle/trt/paddle_trt now, {} is not supported.".
format(backend))
elif device == "cpu":
if backend == "ort":
option.use_ort_backend()
elif backend == "ov":
option.use_openvino_backend()
elif backend == "paddle":
option.use_paddle_backend()
elif backend == "default":
return option
else:
raise Exception(
"While inference with CPU, only support default/ort/ov/paddle now, {} is not supported.".
format(backend))
else:
print("%s is an unsupported backend" % backend)
raise Exception(
"Only support device CPU/GPU now, {} is not supported.".format(
device))
return option
@@ -123,6 +140,7 @@ if __name__ == '__main__':
model_file = args.model
gpu_id = args.device_id
enable_collect_memory_info = args.enable_collect_memory_info
end2end_statis = list()
cpu_mem = list()
gpu_mem = list()
@@ -161,7 +179,7 @@ if __name__ == '__main__':
start = time.time()
result = model.predict(im)
end2end_statis.append(time.time() - start)
if args.enable_collect_memory_info:
if enable_collect_memory_info:
gpu_util.append(get_current_gputil(gpu_id))
cm, gm = get_current_memory_mb(gpu_id)
cpu_mem.append(cm)
@@ -171,7 +189,7 @@ if __name__ == '__main__':
warmup_iter = args.iter_num // 5
end2end_statis_repeat = end2end_statis[warmup_iter:]
if args.enable_collect_memory_info:
if enable_collect_memory_info:
cpu_mem_repeat = cpu_mem[warmup_iter:]
gpu_mem_repeat = gpu_mem[warmup_iter:]
gpu_util_repeat = gpu_util[warmup_iter:]
@@ -179,14 +197,14 @@ if __name__ == '__main__':
dump_result = dict()
dump_result["runtime"] = runtime_statis["avg_time"] * 1000
dump_result["end2end"] = np.mean(end2end_statis_repeat) * 1000
if args.enable_collect_memory_info:
if enable_collect_memory_info:
dump_result["cpu_rss_mb"] = np.mean(cpu_mem_repeat)
dump_result["gpu_rss_mb"] = np.mean(gpu_mem_repeat)
dump_result["gpu_util"] = np.mean(gpu_util_repeat)
f.writelines("Runtime(ms): {} \n".format(str(dump_result["runtime"])))
f.writelines("End2End(ms): {} \n".format(str(dump_result["end2end"])))
if args.enable_collect_memory_info:
if enable_collect_memory_info:
f.writelines("cpu_rss_mb: {} \n".format(
str(dump_result["cpu_rss_mb"])))
f.writelines("gpu_rss_mb: {} \n".format(
+65 -30
@@ -38,36 +38,71 @@ ENDIF(WIN32)
INCLUDE_DIRECTORIES(${GFLAGS_INCLUDE_DIR})
ExternalProject_Add(
extern_gflags
${EXTERNAL_PROJECT_LOG_ARGS}
${SHALLOW_CLONE}
GIT_REPOSITORY ${GFLAGS_REPOSITORY}
GIT_TAG ${GFLAGS_TAG}
PREFIX ${GFLAGS_PREFIX_DIR}
UPDATE_COMMAND ""
BUILD_COMMAND ${BUILD_COMMAND}
INSTALL_COMMAND ${INSTALL_COMMAND}
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
-DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
-DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
-DBUILD_STATIC_LIBS=ON
-DCMAKE_INSTALL_PREFIX=${GFLAGS_INSTALL_DIR}
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DBUILD_TESTING=OFF
-DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
${EXTERNAL_OPTIONAL_ARGS}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GFLAGS_INSTALL_DIR}
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
BUILD_BYPRODUCTS ${GFLAGS_LIBRARIES}
)
if(ANDROID)
ExternalProject_Add(
extern_gflags
${EXTERNAL_PROJECT_LOG_ARGS}
${SHALLOW_CLONE}
GIT_REPOSITORY ${GFLAGS_REPOSITORY}
GIT_TAG ${GFLAGS_TAG}
PREFIX ${GFLAGS_PREFIX_DIR}
UPDATE_COMMAND ""
BUILD_COMMAND ${BUILD_COMMAND}
INSTALL_COMMAND ${INSTALL_COMMAND}
CMAKE_ARGS -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE}
-DANDROID_ABI=${ANDROID_ABI}
-DANDROID_NDK=${ANDROID_NDK}
-DANDROID_PLATFORM=${ANDROID_PLATFORM}
-DANDROID_STL=c++_static
-DANDROID_TOOLCHAIN=${ANDROID_TOOLCHAIN}
-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
-DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
-DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
-DBUILD_STATIC_LIBS=ON
-DCMAKE_INSTALL_PREFIX=${GFLAGS_INSTALL_DIR}
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DBUILD_TESTING=OFF
-DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
${EXTERNAL_OPTIONAL_ARGS}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GFLAGS_INSTALL_DIR}
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
BUILD_BYPRODUCTS ${GFLAGS_LIBRARIES}
)
else()
ExternalProject_Add(
extern_gflags
${EXTERNAL_PROJECT_LOG_ARGS}
${SHALLOW_CLONE}
GIT_REPOSITORY ${GFLAGS_REPOSITORY}
GIT_TAG ${GFLAGS_TAG}
PREFIX ${GFLAGS_PREFIX_DIR}
UPDATE_COMMAND ""
BUILD_COMMAND ${BUILD_COMMAND}
INSTALL_COMMAND ${INSTALL_COMMAND}
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
-DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
-DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
-DBUILD_STATIC_LIBS=ON
-DCMAKE_INSTALL_PREFIX=${GFLAGS_INSTALL_DIR}
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DBUILD_TESTING=OFF
-DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
${EXTERNAL_OPTIONAL_ARGS}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GFLAGS_INSTALL_DIR}
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
BUILD_BYPRODUCTS ${GFLAGS_LIBRARIES}
)
endif()
ADD_LIBRARY(gflags STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET gflags PROPERTY IMPORTED_LOCATION ${GFLAGS_LIBRARIES})
ADD_DEPENDENCIES(gflags extern_gflags)
+2 -1
@@ -43,13 +43,14 @@ else()
endif(WIN32)
set(PADDLE2ONNX_URL_BASE "https://bj.bcebos.com/fastdeploy/third_libs/")
set(PADDLE2ONNX_VERSION "1.0.1")
set(PADDLE2ONNX_VERSION "1.0.2rc")
if(WIN32)
set(PADDLE2ONNX_FILE "paddle2onnx-win-x64-${PADDLE2ONNX_VERSION}.zip")
if(NOT CMAKE_CL_64)
set(PADDLE2ONNX_FILE "paddle2onnx-win-x86-${PADDLE2ONNX_VERSION}.zip")
endif()
elseif(APPLE)
set(PADDLE2ONNX_VERSION "1.0.1")
if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "arm64")
set(PADDLE2ONNX_FILE "paddle2onnx-osx-arm64-${PADDLE2ONNX_VERSION}.tgz")
else()
+1
@@ -14,3 +14,4 @@ FastDeploy defines different result structs by vision task type (`fastd
| MattingResult | [C++/Python docs](./matting_result.md) | Image/video matting result | MODNet, RVM series models, etc. |
| OCRResult | [C++/Python docs](./ocr_result.md) | Text box detection, orientation classification and text recognition result | OCR series models, etc. |
| MOTResult | [C++/Python docs](./mot_result.md) | Multi-object tracking result | PP-Tracking series models, etc. |
| HeadPoseResult | [C++/Python docs](./headpose_result.md) | Head pose estimation result | FSANet series models, etc. |
@@ -0,0 +1,25 @@
# HeadPoseResult (Head Pose Result)
HeadPoseResult is defined in `fastdeploy/vision/common/result.h` and represents a head pose estimation result.
## C++ Definition
`fastdeploy::vision::HeadPoseResult`
```c++
struct HeadPoseResult {
  std::vector<float> euler_angles;
  void Clear();
  std::string Str();
};
```
- **euler_angles**: Member variable; the Euler angles predicted from a single face image, stored in (yaw, pitch, roll) order. Yaw is the horizontal rotation, pitch the vertical rotation and roll the in-plane rotation, each in the range [-90, +90] degrees
- **Clear()**: Member function; clears the results stored in the struct
- **Str()**: Member function; outputs the struct's contents as a string (for debugging)
## Python Definition
`fastdeploy.vision.HeadPoseResult`
- **euler_angles**(list of float): Member variable; the Euler angles predicted from a single face image, stored in (yaw, pitch, roll) order. Yaw is the horizontal rotation, pitch the vertical rotation and roll the in-plane rotation, each in the range [-90, +90] degrees
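A short Python usage sketch for this result type; the loader name `fd.vision.headpose.FSANet` and the file paths are illustrative assumptions, not confirmed API:

```python
import cv2
import fastdeploy as fd

# Hypothetical FSANet-style head-pose model and paths; adjust to the real example.
model = fd.vision.headpose.FSANet("fsanet-var.onnx")
im = cv2.imread("face.jpg")
result = model.predict(im)  # -> HeadPoseResult

yaw, pitch, roll = result.euler_angles  # (yaw, pitch, roll), each in [-90, +90] degrees
print(result)  # Str()-style debug dump of the struct
```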
-1
@@ -37,4 +37,3 @@ fastdeploy.vision.MOTResult
- **ids**(list of list(float)): Member variable; the ids of all targets in a single frame, with the same number of elements as `boxes`
- **scores**(list of float): Member variable; the confidence of all targets detected in a single frame
- **class_ids**(list of int): Member variable; the category of each target detected in a single frame
+17
@@ -16,6 +16,7 @@ API: `fastdeploy.vision.SegmentationResult`, which returns:
- **score_map**(list of float): Member variable; the predicted per-pixel segmentation class probabilities, aligned one-to-one with label_map (when the model is exported with `--output_op argmax`), or the softmax-normalized probabilities (when exported with `--output_op softmax`, or with `--output_op none` while setting the model attribute `apply_softmax=true` at initialization).
- **shape**(list of int): Member variable; the size of the output image, as `H*W`.
## DetectionResult
DetectionResult is defined in `fastdeploy/vision/common/result.h` and represents the detected boxes, object categories and confidences.
@@ -40,6 +41,7 @@ API: `fastdeploy.vision.FaceDetectionResult`, which returns:
- **landmarks**(list of list(float)): Member variable; the landmarks of all faces detected in a single image.
- **landmarks_per_face**(int): Member variable; the number of landmarks in each face box.
## KeyPointDetectionResult
KeyPointDetectionResult is defined in `fastdeploy/vision/common/result.h` and represents the coordinates and confidences of the target's keypoints in the image.
@@ -70,6 +72,7 @@ API: `fastdeploy.vision.MattingResult`, which returns:
- **contain_foreground**(bool): Whether the prediction includes the foreground.
- **shape**(list of int): The shape of the output: when `contain_foreground` is `false` the shape only contains `(H,W)`; when `contain_foreground` is `true` it contains `(H,W,C)`, where C is generally 3.
## OCRResult
OCRResult is defined in `fastdeploy/vision/common/result.h` and represents the text boxes detected and recognized in an image, their orientation classification, and the text content inside each box.
@@ -79,3 +82,17 @@ API: `fastdeploy.vision.OCRResult`, which returns:
- **rec_scores**(list of float): Member variable; the confidence of the text recognized in each box, with the same number of elements as `boxes.size()`.
- **cls_scores**(list of float): Member variable; the confidence of each text box's classification result, with the same number of elements as `boxes.size()`.
- **cls_labels**(list of int): Member variable; the orientation category of each text box, with the same number of elements as `boxes.size()`.
## FaceAlignmentResult
FaceAlignmentResult is defined in `fastdeploy/vision/common/result.h` and represents face landmarks.
API: `fastdeploy.vision.FaceAlignmentResult`, which returns:
- **landmarks**(list of list(float)): Member variable; all landmarks detected in a single face image
## HeadPoseResult
HeadPoseResult is defined in `fastdeploy/vision/common/result.h` and represents a head pose estimation result.
API: `fastdeploy.vision.HeadPoseResult`, which returns:
- **euler_angles**(list of float): Member variable; the Euler angles predicted from a single face image, stored in (yaw, pitch, roll) order. Yaw is the horizontal rotation, pitch the vertical rotation and roll the in-plane rotation, each in the range [-90, +90] degrees
+20
@@ -10,6 +10,7 @@ API: `fastdeploy.vision.ClassifyResult`, The ClassifyResult will return:
- **scores**(list of float):Member variables that indicate the confidence level of a single image on the corresponding classification result, the number of which is determined by the `topk` passed in when using the classification model, e.g. the confidence level of a Top 5 classification can be returned.
## SegmentationResult
The code of SegmentationResult is defined in `fastdeploy/vision/common/result.h` and is used to indicate the segmentation category predicted for each pixel in the image and the probability of the segmentation category.
@@ -33,6 +34,7 @@ API: `fastdeploy.vision.Mask`, The Mask will return:
- **data**:Member variable indicating a detected mask.
- **shape**:Member variable representing the shape of the mask, e.g. `(H,W)`.
## FaceDetectionResult
The FaceDetectionResult code is defined in `fastdeploy/vision/common/result.h` and is used to indicate the target frames detected by face detection, face landmarks, target confidence and the number of landmarks per face.
@@ -42,6 +44,7 @@ API: `fastdeploy.vision.FaceDetectionResult`, The FaceDetectionResult will retur
- **landmarks**(list of list(float)): Member variables that represent the key points of all faces detected by a single image.
- **landmarks_per_face**(int):Member variable indicating the number of key points in each face frame.
## KeyPointDetectionResult
The KeyPointDetectionResult code is defined in `fastdeploy/vision/common/result.h` and is used to indicate the coordinates and confidence of each keypoint of the target behavior in the image.
@@ -55,12 +58,14 @@ API:`fastdeploy.vision.KeyPointDetectionResult`, The KeyPointDetectionResult wil
- `J`: num_joints, the number of keypoints for a target
- **num_joints**(int): Member variable, representing the number of keypoints for a target
## FaceRecognitionResult
The FaceRecognitionResult code is defined in `fastdeploy/vision/common/result.h` and is used to indicate the embedding of the image features by the face recognition model.
API: `fastdeploy.vision.FaceRecognitionResult`, The FaceRecognitionResult will return:
- **embedding**(list of float): Member variable holding the final feature embedding extracted by the face recognition model; it can be used to compute feature similarity between faces.
## MattingResult
The MattingResult code is defined in `fastdeploy/vision/common/result.h` and is used to indicate the value of alpha transparency predicted by the model, the predicted outlook, etc.
@@ -70,6 +75,7 @@ API:`fastdeploy.vision.MattingResult`, The MattingResult will return:
- **contain_foreground**(bool):Indicates whether the predicted outcome includes the foreground.
- **shape**(list of int): When `contain_foreground` is false, the shape only contains `(H,W)`; when `contain_foreground` is `true`, the shape contains `(H,W,C)`, where C is generally 3.
## OCRResult
The OCRResult code is defined in `fastdeploy/vision/common/result.h` and is used to indicate the text box detected in the image, the text box orientation classification, and the text content recognized inside the text box.
@@ -79,3 +85,17 @@ API:`fastdeploy.vision.OCRResult`, The OCRResult will return:
- **rec_scores**(list of float):Member variable indicating the confidence level of the text identified in the box, the number of elements is the same as `boxes.size()`.
- **cls_scores**(list of float):Member variable indicating the confidence level of the classification result of the text box, with the same number of elements as `boxes.size()`.
- **cls_labels**(list of int):Member variable indicating the orientation category of the text box, the number of elements is the same as `boxes.size()`.
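Because all of these lists are aligned with `boxes`, iterating them in parallel is the natural access pattern. A minimal sketch, assuming `result` is an OCRResult returned by a model's `predict` call:

```python
# Each index refers to the same detected text box in every list.
for box, rec_score, cls_score, cls_label in zip(
        result.boxes, result.rec_scores, result.cls_scores, result.cls_labels):
    print(f"box={box} rec_conf={rec_score:.3f} "
          f"orientation={cls_label} cls_conf={cls_score:.3f}")
```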
## FaceAlignmentResult
The code of FaceAlignmentResult is defined in `fastdeploy/vision/common/result.h` and is used to indicate the key points of the face.
API: `fastdeploy.vision.FaceAlignmentResult`, The FaceAlignmentResult will return:
- **landmarks**(list of list(float)):Member variables that represent the all key points detected from a single face image.
## HeadPoseResult
The code of HeadPoseResult is defined in `fastdeploy/vision/common/result.h` and is used to indicate the head pose result.
API: `fastdeploy.vision.HeadPoseResult`, The HeadPoseResult will return:
- **euler_angles**(list of float): Member variable representing the Euler angles predicted from a single face image, stored in (yaw, pitch, roll) order; yaw is the horizontal angle, pitch the vertical angle and roll the roll angle, each in the range [-90, +90] degrees.
+3 -1
@@ -1,5 +1,8 @@
# RKNPU2 Model Deployment
## Environment Setup
RKNPU2 model export is only supported on the x86 Linux platform; for installation, see the [RKNPU2 model export environment setup guide](./install_rknn_toolkit2.md)
## Converting ONNX Models to RKNN
ONNX models cannot invoke the NPU on RK chips directly; they must first be converted to RKNN models. See the [conversion guide](./export.md) for the detailed workflow
@@ -61,4 +64,3 @@ int infer_scrfd_npu() {
- [RKNPU2 on-board environment setup](../../build_and_install/rknpu2.md)
- [rknn_toolkit2 installation guide](./install_rknn_toolkit2.md)
- [ONNX-to-RKNN conversion guide](./export.md)
+108 -37
@@ -2,22 +2,22 @@
# Quantization Acceleration
Quantization is a popular model compression technique; a quantized model is smaller and infers faster.
FastDeploy, built on PaddleSlim, integrates a one-click model quantization tool, and also supports deploying the quantized models to accelerate inference.
Built on PaddleSlim's Auto Compression Toolkit (ACT), FastDeploy provides a one-click automatic model compression tool. Compression can combine several strategies, currently mainly offline quantization and quantized distillation training. FastDeploy also supports deploying the compressed models, helping users accelerate inference. This document focuses on deploying quantized models with FastDeploy.
## Quantized Model Deployment Across FastDeploy Backends and Hardware
Multiple inference backends in FastDeploy currently support deploying quantized models on different hardware, as follows:
| Hardware / Backend | ONNX Runtime | Paddle Inference | TensorRT |
| :-----------| :-------- | :--------------- | :------- |
| CPU | Supported | Supported | |
| GPU | | | Supported |
| Hardware / Backend | ONNX Runtime | Paddle Inference | TensorRT | Paddle-TensorRT |
| :-----------| :-------- | :--------------- | :------- | :------- |
| CPU | Supported | Supported | | |
| GPU | | | Supported | Supported |
## Model Quantization
### Quantization Methods
Based on PaddleSlim, FastDeploy currently offers quantized distillation training and offline quantization. Quantized distillation training obtains the quantized model through training, while offline quantization quantizes the model without training. FastDeploy can deploy the quantized models produced by either method.
Based on PaddleSlim, FastDeploy's one-click automatic compression currently offers quantized distillation training and offline quantization. Quantized distillation training obtains the quantized model through training, while offline quantization quantizes the model without training. FastDeploy can deploy the quantized models produced by either method.
The two methods compare as follows:
| Method | Quantization time | Quantized model accuracy | Model size | Inference speed |
@@ -25,44 +25,115 @@ FastDeploy, built on PaddleSlim, integrates a one-click model quantization tool; FastDe
| Offline quantization | No training needed, fast | Slightly lower than quantized distillation training | Same for both | Same for both |
| Quantized distillation training | Requires training, somewhat slower | Small accuracy loss vs. the unquantized model | Same for both | Same for both |
### Quantizing Models with FastDeploy's One-Click Quantization Tool
FastDeploy, built on PaddleSlim, provides users with a one-click model quantization tool; see the following document to quantize a model
- [FastDeploy one-click model quantization](../../tools/quantization/)
Once the quantized model is produced, it can be deployed with FastDeploy.
### Quantizing Models with FastDeploy's One-Click Automatic Compression Tool
Built on PaddleSlim's Auto Compression Toolkit (ACT), FastDeploy provides users with a one-click automatic model compression tool; see the following document for one-click automatic compression
- [FastDeploy one-click automatic model compression](../../tools/auto_compression/)
Once the compressed model is produced, it can be deployed with FastDeploy.
## Quantization benchmark
## Quantized Model Benchmark
The models currently supported by FastDeploy quantization are listed below:
The Runtime and end-to-end Benchmarks of models that FastDeploy supports for automatic compression and has validated for deployment are listed below.
Notes on the benchmark tables:
- Runtime latency is the model's inference latency on each runtime, including the CPU->GPU data copy, GPU inference, and the GPU->CPU data copy, excluding each model's pre/post-processing.
- End-to-end latency is the model's latency in a real inference scenario, including pre/post-processing.
- All measured latencies are averages over 1000 inference runs, in milliseconds.
- INT8 + FP16 means the FP16 inference option is enabled on the runtime while running the INT8 quantized model.
- INT8 + FP16 + PM additionally enables the pinned-memory option, which speeds up GPU->CPU data copies.
- Max speedup is the FP32 latency divided by the fastest INT8 inference latency.
- For quantized distillation training, the quantized model is trained on a small unlabeled dataset and the accuracy is verified on the full validation set; the reported INT8 accuracy is therefore not the best achievable.
- CPU: Intel(R) Xeon(R) Gold 6271C, with the CPU thread count fixed to 1 in all tests; GPU: Tesla T4; TensorRT 8.4.15.
### YOLO Series
| Model | Backend | Hardware | FP32 latency | INT8 latency | Speedup | FP32 mAP | INT8 mAP | Method |
| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |
| [YOLOv5s](../../examples/vision/detection/yolov5/quantize/) | TensorRT | GPU | 14.13 | 11.22 | 1.26 | 37.6 | 36.6 | Quantized distillation training |
| [YOLOv5s](../../examples/vision/detection/yolov5/quantize/) | ONNX Runtime | CPU | 183.68 | 100.39 | 1.83 | 37.6 | 33.1 | Quantized distillation training |
| [YOLOv5s](../../examples/vision/detection/yolov5/quantize/) | Paddle Inference | CPU | 226.36 | 152.27 | 1.48 | 37.6 | 36.8 | Quantized distillation training |
| [YOLOv6s](../../examples/vision/detection/yolov6/quantize/) | TensorRT | GPU | 12.89 | 8.92 | 1.45 | 42.5 | 40.6 | Quantized distillation training |
| [YOLOv6s](../../examples/vision/detection/yolov6/quantize/) | ONNX Runtime | CPU | 345.85 | 131.81 | 2.60 | 42.5 | 36.1 | Quantized distillation training |
| [YOLOv6s](../../examples/vision/detection/yolov6/quantize/) | Paddle Inference | CPU | 366.41 | 131.70 | 2.78 | 42.5 | 41.2 | Quantized distillation training |
| [YOLOv7](../../examples/vision/detection/yolov7/quantize/) | TensorRT | GPU | 30.43 | 15.40 | 1.98 | 51.1 | 50.8 | Quantized distillation training |
| [YOLOv7](../../examples/vision/detection/yolov7/quantize/) | ONNX Runtime | CPU | 971.27 | 471.88 | 2.06 | 51.1 | 42.5 | Quantized distillation training |
| [YOLOv7](../../examples/vision/detection/yolov7/quantize/) | Paddle Inference | CPU | 1015.70 | 562.41 | 1.82 | 51.1 | 46.3 | Quantized distillation training |
#### Runtime Benchmark
| Model | Backend | Hardware | FP32 Runtime latency | INT8 Runtime latency | INT8 + FP16 Runtime latency | INT8 + FP16 + PM Runtime latency | Max speedup | FP32 mAP | INT8 mAP | Method |
| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |----- |----- |
| [YOLOv5s](../../examples/vision/detection/yolov5/quantize/) | TensorRT | GPU | 7.87 | 4.51 | 4.31 | 3.17 | 2.48 | 37.6 | 36.7 | Quantized distillation training |
| [YOLOv5s](../../examples/vision/detection/yolov5/quantize/) | Paddle-TensorRT | GPU | 7.99 | None | 4.46 | 3.31 | 2.41 | 37.6 | 36.8 | Quantized distillation training |
| [YOLOv5s](../../examples/vision/detection/yolov5/quantize/) | ONNX Runtime | CPU | 176.41 | 91.90 | None | None | 1.90 | 37.6 | 33.1 | Quantized distillation training |
| [YOLOv5s](../../examples/vision/detection/yolov5/quantize/) | Paddle Inference | CPU | 213.73 | 130.19 | None | None | 1.64 | 37.6 | 35.2 | Quantized distillation training |
| [YOLOv6s](../../examples/vision/detection/yolov6/quantize/) | TensorRT | GPU | 9.47 | 3.23 | 4.09 | 2.81 | 3.37 | 42.5 | 40.7 | Quantized distillation training |
| [YOLOv6s](../../examples/vision/detection/yolov6/quantize/) | Paddle-TensorRT | GPU | 9.31 | None | 4.17 | 2.95 | 3.16 | 42.5 | 40.7 | Quantized distillation training |
| [YOLOv6s](../../examples/vision/detection/yolov6/quantize/) | ONNX Runtime | CPU | 334.65 | 126.38 | None | None | 2.65 | 42.5 | 36.8 | Quantized distillation training |
| [YOLOv6s](../../examples/vision/detection/yolov6/quantize/) | Paddle Inference | CPU | 352.87 | 123.12 | None | None | 2.87 | 42.5 | 40.8 | Quantized distillation training |
| [YOLOv7](../../examples/vision/detection/yolov7/quantize/) | TensorRT | GPU | 27.47 | 6.52 | 6.74 | 5.19 | 5.29 | 51.1 | 50.4 | Quantized distillation training |
| [YOLOv7](../../examples/vision/detection/yolov7/quantize/) | Paddle-TensorRT | GPU | 27.87 | None | 6.91 | 5.86 | 4.76 | 51.1 | 50.4 | Quantized distillation training |
| [YOLOv7](../../examples/vision/detection/yolov7/quantize/) | ONNX Runtime | CPU | 996.65 | 467.15 | None | None | 2.13 | 51.1 | 43.3 | Quantized distillation training |
| [YOLOv7](../../examples/vision/detection/yolov7/quantize/) | Paddle Inference | CPU | 995.85 | 477.93 | None | None | 2.08 | 51.1 | 46.2 | Quantized distillation training |
#### End-to-End Benchmark
| Model | Backend | Hardware | FP32 end-to-end latency | INT8 end-to-end latency | INT8 + FP16 end-to-end latency | INT8 + FP16 + PM end-to-end latency | Max speedup | FP32 mAP | INT8 mAP | Method |
| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |----- |----- |
| [YOLOv5s](../../examples/vision/detection/yolov5/quantize/) | TensorRT | GPU | 24.61 | 21.20 | 20.78 | 20.94 | 1.18 | 37.6 | 36.7 | Quantized distillation training |
| [YOLOv5s](../../examples/vision/detection/yolov5/quantize/) | Paddle-TensorRT | GPU | 23.53 | None | 21.98 | 19.84 | 1.28 | 37.6 | 36.8 | Quantized distillation training |
| [YOLOv5s](../../examples/vision/detection/yolov5/quantize/) | ONNX Runtime | CPU | 197.323 | 110.99 | None | None | 1.78 | 37.6 | 33.1 | Quantized distillation training |
| [YOLOv5s](../../examples/vision/detection/yolov5/quantize/) | Paddle Inference | CPU | 235.73 | 144.82 | None | None | 1.63 | 37.6 | 35.2 | Quantized distillation training |
| [YOLOv6s](../../examples/vision/detection/yolov6/quantize/) | TensorRT | GPU | 15.66 | 11.30 | 10.25 | 9.59 | 1.63 | 42.5 | 40.7 | Quantized distillation training |
| [YOLOv6s](../../examples/vision/detection/yolov6/quantize/) | Paddle-TensorRT | GPU | 15.03 | None | 11.36 | 9.32 | 1.61 | 42.5 | 40.7 | Quantized distillation training |
| [YOLOv6s](../../examples/vision/detection/yolov6/quantize/) | ONNX Runtime | CPU | 348.21 | 126.38 | None | None | 2.82 | 42.5 | 36.8 | Quantized distillation training |
| [YOLOv6s](../../examples/vision/detection/yolov6/quantize/) | Paddle Inference | CPU | 352.87 | 121.64 | None | None | 3.04 | 42.5 | 40.8 | Quantized distillation training |
| [YOLOv7](../../examples/vision/detection/yolov7/quantize/) | TensorRT | GPU | 36.47 | 18.81 | 20.33 | 17.58 | 2.07 | 51.1 | 50.4 | Quantized distillation training |
| [YOLOv7](../../examples/vision/detection/yolov7/quantize/) | Paddle-TensorRT | GPU | 37.06 | None | 20.26 | 17.53 | 2.11 | 51.1 | 50.4 | Quantized distillation training |
| [YOLOv7](../../examples/vision/detection/yolov7/quantize/) | ONNX Runtime | CPU | 988.85 | 478.08 | None | None | 2.07 | 51.1 | 43.3 | Quantized distillation training |
| [YOLOv7](../../examples/vision/detection/yolov7/quantize/) | Paddle Inference | CPU | 1031.73 | 500.12 | None | None | 2.06 | 51.1 | 46.2 | Quantized distillation training |
The data above are the end-to-end inference performance of the models deployed with FastDeploy, before and after quantization.
- Test data: images from the COCO2017 validation set.
- Inference latency is the average end-to-end latency (including pre/post-processing), in milliseconds.
- CPU: Intel(R) Xeon(R) Gold 6271C; GPU: Tesla T4; TensorRT 8.4.15; CPU threads fixed to 1 in all tests.
### PaddleClas Series
| Model | Backend | Hardware | FP32 latency | INT8 latency | Speedup | FP32 Top1 | INT8 Top1 | Method |
| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |
| [ResNet50_vd](../../examples/vision/classification/paddleclas/quantize/) | ONNX Runtime | CPU | 86.87 | 59.32 | 1.46 | 79.12 | 78.87 | Offline quantization |
| [ResNet50_vd](../../examples/vision/classification/paddleclas/quantize/) | TensorRT | GPU | 7.85 | 5.42 | 1.45 | 79.12 | 79.06 | Offline quantization |
| [MobileNetV1_ssld](../../examples/vision/classification/paddleclas/quantize/) | ONNX Runtime | CPU | 40.32 | 16.87 | 2.39 | 77.89 | 75.09 | Offline quantization |
| [MobileNetV1_ssld](../../examples/vision/classification/paddleclas/quantize/) | TensorRT | GPU | 5.10 | 3.35 | 1.52 | 77.89 | 76.86 | Offline quantization |
#### Runtime Benchmark
| Model | Backend | Hardware | FP32 Runtime latency | INT8 Runtime latency | INT8 + FP16 Runtime latency | INT8 + FP16 + PM Runtime latency | Max speedup | FP32 Top1 | INT8 Top1 | Method |
| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |----- |----- |
| [ResNet50_vd](../../examples/vision/classification/paddleclas/quantize/) | TensorRT | GPU | 3.55 | 0.99 | 0.98 | 1.06 | 3.62 | 79.12 | 79.06 | Offline quantization |
| [ResNet50_vd](../../examples/vision/classification/paddleclas/quantize/) | Paddle-TensorRT | GPU | 3.46 | None | 0.87 | 1.03 | 3.98 | 79.12 | 79.06 | Offline quantization |
| [ResNet50_vd](../../examples/vision/classification/paddleclas/quantize/) | ONNX Runtime | CPU | 76.14 | 35.43 | None | None | 2.15 | 79.12 | 78.87 | Offline quantization |
| [ResNet50_vd](../../examples/vision/classification/paddleclas/quantize/) | Paddle Inference | CPU | 76.21 | 24.01 | None | None | 3.17 | 79.12 | 78.55 | Offline quantization |
| [MobileNetV1_ssld](../../examples/vision/classification/paddleclas/quantize/) | TensorRT | GPU | 0.91 | 0.43 | 0.49 | 0.54 | 2.12 | 77.89 | 76.86 | Offline quantization |
| [MobileNetV1_ssld](../../examples/vision/classification/paddleclas/quantize/) | Paddle-TensorRT | GPU | 0.88 | None | 0.49 | 0.51 | 1.80 | 77.89 | 76.86 | Offline quantization |
| [MobileNetV1_ssld](../../examples/vision/classification/paddleclas/quantize/) | ONNX Runtime | CPU | 30.53 | 9.59 | None | None | 3.18 | 77.89 | 75.09 | Offline quantization |
| [MobileNetV1_ssld](../../examples/vision/classification/paddleclas/quantize/) | Paddle Inference | CPU | 12.29 | 4.68 | None | None | 2.62 | 77.89 | 71.36 | Offline quantization |
The data above are the end-to-end inference performance of the models deployed with FastDeploy, before and after quantization.
- Test data: images from the ImageNet-2012 validation set.
- Inference latency is the average end-to-end latency (including pre/post-processing), in milliseconds.
- CPU: Intel(R) Xeon(R) Gold 6271C; GPU: Tesla T4; TensorRT 8.4.15; CPU threads fixed to 1 in all tests.
#### End-to-End Benchmark
| Model | Backend | Hardware | FP32 end-to-end latency | INT8 end-to-end latency | INT8 + FP16 end-to-end latency | INT8 + FP16 + PM end-to-end latency | Max speedup | FP32 Top1 | INT8 Top1 | Method |
| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |----- |----- |
| [ResNet50_vd](../../examples/vision/classification/paddleclas/quantize/) | TensorRT | GPU | 4.92 | 2.28 | 2.24 | 2.23 | 2.21 | 79.12 | 79.06 | Offline quantization |
| [ResNet50_vd](../../examples/vision/classification/paddleclas/quantize/) | Paddle-TensorRT | GPU | 4.48 | None | 2.09 | 2.10 | 2.14 | 79.12 | 79.06 | Offline quantization |
| [ResNet50_vd](../../examples/vision/classification/paddleclas/quantize/) | ONNX Runtime | CPU | 77.43 | 41.90 | None | None | 1.85 | 79.12 | 78.87 | Offline quantization |
| [ResNet50_vd](../../examples/vision/classification/paddleclas/quantize/) | Paddle Inference | CPU | 80.60 | 27.75 | None | None | 2.90 | 79.12 | 78.55 | Offline quantization |
| [MobileNetV1_ssld](../../examples/vision/classification/paddleclas/quantize/) | TensorRT | GPU | 2.19 | 1.48 | 1.57 | 1.57 | 1.48 | 77.89 | 76.86 | Offline quantization |
| [MobileNetV1_ssld](../../examples/vision/classification/paddleclas/quantize/) | Paddle-TensorRT | GPU | 2.04 | None | 1.47 | 1.45 | 1.41 | 77.89 | 76.86 | Offline quantization |
| [MobileNetV1_ssld](../../examples/vision/classification/paddleclas/quantize/) | ONNX Runtime | CPU | 34.02 | 12.97 | None | None | 2.62 | 77.89 | 75.09 | Offline quantization |
| [MobileNetV1_ssld](../../examples/vision/classification/paddleclas/quantize/) | Paddle Inference | CPU | 16.31 | 7.42 | None | None | 2.20 | 77.89 | 71.36 | Offline quantization |
### PaddleDetection Series
#### Runtime Benchmark
| Model | Backend | Hardware | FP32 Runtime latency | INT8 Runtime latency | INT8 + FP16 Runtime latency | INT8 + FP16 + PM Runtime latency | Max speedup | FP32 mAP | INT8 mAP | Method |
| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |----- |----- |
| [ppyoloe_crn_l_300e_coco](../../examples/vision/detection/paddledetection/quantize) | TensorRT | GPU | 27.90 | 6.39 | 6.44 | 5.95 | 4.67 | 51.4 | 50.7 | Quantized distillation training |
| [ppyoloe_crn_l_300e_coco](../../examples/vision/detection/paddledetection/quantize) | Paddle-TensorRT | GPU | 30.89 | None | 13.78 | 14.01 | 2.24 | 51.4 | 50.5 | Quantized distillation training |
| [ppyoloe_crn_l_300e_coco](../../examples/vision/detection/paddledetection/quantize) | ONNX Runtime | CPU | 1057.82 | 449.52 | None | None | 2.35 | 51.4 | 50.0 | Quantized distillation training |
NOTE:
- TensorRT is faster than Paddle-TensorRT because the multiclass_nms3 operator is removed at runtime
#### End-to-End Benchmark
| Model | Backend | Hardware | FP32 end-to-end latency | INT8 end-to-end latency | INT8 + FP16 end-to-end latency | INT8 + FP16 + PM end-to-end latency | Max speedup | FP32 mAP | INT8 mAP | Method |
| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |----- |----- |
| [ppyoloe_crn_l_300e_coco](../../examples/vision/detection/paddledetection/quantize) | TensorRT | GPU | 35.75 | 15.42 | 20.70 | 20.85 | 2.32 | 51.4 | 50.7 | Quantized distillation training |
| [ppyoloe_crn_l_300e_coco](../../examples/vision/detection/paddledetection/quantize) | Paddle-TensorRT | GPU | 33.48 | None | 18.47 | 18.03 | 1.81 | 51.4 | 50.5 | Quantized distillation training |
| [ppyoloe_crn_l_300e_coco](../../examples/vision/detection/paddledetection/quantize) | ONNX Runtime | CPU | 1067.17 | 461.037 | None | None | 2.31 | 51.4 | 50.0 | Quantized distillation training |
### PaddleSeg Series
#### Runtime Benchmark
| Model | Backend | Hardware | FP32 Runtime latency | INT8 Runtime latency | INT8 + FP16 Runtime latency | INT8 + FP16 + PM Runtime latency | Max speedup | FP32 mIoU | INT8 mIoU | Method |
| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |----- |----- |
| [PP-LiteSeg-T(STDC1)-cityscapes](../../examples/vision/segmentation/paddleseg/quantize) | Paddle Inference | CPU | 1138.04 | 602.62 | None | None | 1.89 | 77.37 | 71.62 | Quantized distillation training |
#### End-to-End Benchmark
| Model | Backend | Hardware | FP32 end-to-end latency | INT8 end-to-end latency | INT8 + FP16 end-to-end latency | INT8 + FP16 + PM end-to-end latency | Max speedup | FP32 mIoU | INT8 mIoU | Method |
| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |----- |----- |
| [PP-LiteSeg-T(STDC1)-cityscapes](../../examples/vision/segmentation/paddleseg/quantize) | Paddle Inference | CPU | 4726.65 | 4134.91 | None | None | 1.14 | 77.37 | 71.62 | Quantized distillation training |
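The INT8 + FP16 and + PM columns above correspond to runtime options stacked on top of an INT8 model. A hedged configuration sketch follows; `enable_pinned_memory()` is an assumed method name for the pinned-memory option, and the model path is illustrative:

```python
import fastdeploy as fd

option = fd.RuntimeOption()
option.use_gpu(0)
option.use_trt_backend()    # TensorRT reads INT8 quantization info from the model
option.enable_trt_fp16()    # "+ FP16": non-quantized ops fall back to FP16
# "+ PM": pinned host memory to accelerate GPU->CPU copies (assumed API name).
option.enable_pinned_memory()

# Illustrative quantized YOLOv5s model file.
model = fd.vision.detection.YOLOv5("yolov5s_quant.onnx", runtime_option=option)
```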
+71 -3
@@ -1,11 +1,79 @@
[English](../en/quantize.md) | 简体中文
# Quantization Acceleration
Quantization is a popular model compression technique; a quantized model is smaller and infers faster.
FastDeploy, built on PaddleSlim, integrates a one-click model quantization tool, and also supports deploying the quantized models to accelerate inference.
Briefly introduce the principle of quantization acceleration.
Which hardware and backends currently support quantization
## Quantized Model Deployment Across FastDeploy Backends and Hardware
Multiple inference backends in FastDeploy currently support deploying quantized models on different hardware, as follows:
| Hardware / Backend | ONNX Runtime | Paddle Inference | TensorRT |
| :-----------| :-------- | :--------------- | :------- |
| CPU | Supported | Supported | |
| GPU | | | Supported |
## Model Quantization
### Quantization Methods
Based on PaddleSlim, FastDeploy currently offers quantized distillation training and offline quantization. Quantized distillation training obtains the quantized model through training, while offline quantization quantizes the model without training. FastDeploy can deploy the quantized models produced by either method.
The two methods compare as follows:
| Method | Quantization time | Quantized model accuracy | Model size | Inference speed |
| :-----------| :--------| :-------| :------- | :------- |
| Offline quantization | No training needed, fast | Slightly lower than quantized distillation training | Same for both | Same for both |
| Quantized distillation training | Requires training, somewhat slower | Small accuracy loss vs. the unquantized model | Same for both | Same for both |
### Quantizing Models with FastDeploy's One-Click Quantization Tool
FastDeploy, built on PaddleSlim, provides users with a one-click model quantization tool; see the following document to quantize a model.
- [FastDeploy one-click model quantization](../../tools/quantization/)
Once the quantized model is produced, it can be deployed with FastDeploy.
## Quantization Examples
The models currently supported by FastDeploy quantization are shown in the table below:
A table here, listing the currently supported quantized models (linking to the corresponding examples), with accuracy and performance
### YOLO Series
| Model | Backend | Hardware | FP32 latency | INT8 latency | Speedup | FP32 mAP | INT8 mAP | Method |
| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |
| [YOLOv5s](../../examples/vision/detection/yolov5/quantize/) | TensorRT | GPU | 8.79 | 5.17 | 1.70 | 37.6 | 36.6 | Quantized distillation training |
| [YOLOv5s](../../examples/vision/detection/yolov5/quantize/) | ONNX Runtime | CPU | 176.34 | 92.95 | 1.90 | 37.6 | 33.1 | Quantized distillation training |
| [YOLOv5s](../../examples/vision/detection/yolov5/quantize/) | Paddle Inference | CPU | 217.05 | 133.31 | 1.63 | 37.6 | 36.8 | Quantized distillation training |
| [YOLOv6s](../../examples/vision/detection/yolov6/quantize/) | TensorRT | GPU | 8.60 | 5.16 | 1.67 | 42.5 | 40.6 | Quantized distillation training |
| [YOLOv6s](../../examples/vision/detection/yolov6/quantize/) | ONNX Runtime | CPU | 338.60 | 128.58 | 2.60 | 42.5 | 36.1 | Quantized distillation training |
| [YOLOv6s](../../examples/vision/detection/yolov6/quantize/) | Paddle Inference | CPU | 356.62 | 125.72 | 2.84 | 42.5 | 41.2 | Quantized distillation training |
| [YOLOv7](../../examples/vision/detection/yolov7/quantize/) | TensorRT | GPU | 24.57 | 9.40 | 2.61 | 51.1 | 50.8 | Quantized distillation training |
| [YOLOv7](../../examples/vision/detection/yolov7/quantize/) | ONNX Runtime | CPU | 976.88 | 462.69 | 2.11 | 51.1 | 42.5 | Quantized distillation training |
| [YOLOv7](../../examples/vision/detection/yolov7/quantize/) | Paddle Inference | CPU | 1022.55 | 490.87 | 2.08 | 51.1 | 46.3 | Quantized distillation training |
The data above are the Runtime inference performance of the models deployed with FastDeploy, before and after quantization.
- Test data: images from the COCO2017 validation set.
- Inference latency is the latency on each runtime, in milliseconds.
- CPU: Intel(R) Xeon(R) Gold 6271C; GPU: Tesla T4; TensorRT 8.4.15; CPU threads fixed to 1 in all tests.
### PaddleDetection Series
| Model | Backend | Hardware | FP32 latency | INT8 latency | Speedup | FP32 mAP | INT8 mAP | Method |
| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |
| [ppyoloe_crn_l_300e_coco](../../examples/vision/detection/paddledetection/quantize) | TensorRT | GPU | 24.52 | 11.53 | 2.13 | 51.4 | 50.7 | Quantized distillation training |
| [ppyoloe_crn_l_300e_coco](../../examples/vision/detection/paddledetection/quantize) | ONNX Runtime | CPU | 1085.62 | 457.56 | 2.37 | 51.4 | 50.0 | Quantized distillation training |
The data above are the Runtime inference performance of the models deployed with FastDeploy, before and after quantization.
- Test images are from the COCO val2017 set.
- Inference latency is the latency on each runtime, in milliseconds.
- CPU: Intel(R) Xeon(R) Gold 6271C; GPU: Tesla T4; TensorRT 8.4.15; CPU threads fixed to 1 in all tests.
### PaddleClas series
| Model | Inference backend | Hardware | FP32 latency (ms) | INT8 latency (ms) | Speedup | FP32 Top1 | INT8 Top1 | Quantization method |
| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |
| [ResNet50_vd](../../examples/vision/classification/paddleclas/quantize/) | ONNX Runtime | CPU | 77.20 | 40.08 | 1.93 | 79.12 | 78.87 | Post-training quantization |
| [ResNet50_vd](../../examples/vision/classification/paddleclas/quantize/) | TensorRT | GPU | 3.70 | 1.80 | 2.06 | 79.12 | 79.06 | Post-training quantization |
| [MobileNetV1_ssld](../../examples/vision/classification/paddleclas/quantize/) | ONNX Runtime | CPU | 30.99 | 10.24 | 3.03 | 77.89 | 75.09 | Post-training quantization |
| [MobileNetV1_ssld](../../examples/vision/classification/paddleclas/quantize/) | TensorRT | GPU | 1.80 | 0.58 | 3.10 | 77.89 | 76.86 | Post-training quantization |
The data above compare Runtime inference performance in FastDeploy deployments before and after quantization; a timing sketch follows these notes.
- Test data: images from the ImageNet-2012 validation set.
- Inference latency is the latency of inference on each Runtime, in milliseconds.
- CPU: Intel(R) Xeon(R) Gold 6271C; GPU: Tesla T4; TensorRT 8.4.15; the number of CPU threads is fixed to 1 in all tests.
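A rough way to reproduce such numbers is a timing loop around `predict` (same API assumptions as the sketch near the top of this page; note that `predict` includes pre/post-processing, so this measures end-to-end rather than pure Runtime latency):

```python
import time

import cv2
import fastdeploy as fd

option = fd.RuntimeOption()
option.use_cpu()
option.use_ort_backend()
option.set_cpu_thread_num(1)   # the tables above pin CPU inference to 1 thread

# Hypothetical quantized-model paths; see the deployment examples below.
model = fd.vision.classification.PaddleClasModel(
    "resnet50_vd_ptq/model.pdmodel", "resnet50_vd_ptq/model.pdiparams",
    "resnet50_vd_ptq/inference_cls.yaml", runtime_option=option)

im = cv2.imread("ILSVRC2012_val_00000010.jpeg")
model.predict(im)              # warm-up run, excluded from timing

runs = 100
start = time.time()
for _ in range(runs):
    model.predict(im)
print("mean latency: %.2f ms" % ((time.time() - start) * 1000 / runs))
```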
Regular → Executable
+11 -22
View File
@@ -37,12 +37,6 @@ function(config_fastdeploy_executable_link_flags TARGET_NAME)
endif()
endfunction()
# Usage: add_fastdeploy_executable_cc_files(xxx_var vision detection)
function(add_fastdeploy_executable_cc_files CC_FILES_VAR FIELD SUB_FIELD)
file(GLOB_RECURSE _EXAMPLE_SRCS ${PROJECT_SOURCE_DIR}/examples/${FIELD}/${SUB_FIELD}/*/cpp/*.cc)
set(${CC_FILES_VAR} ${_EXAMPLE_SRCS} PARENT_SCOPE)
endfunction()
set(EXAMPLES_NUM 0)
function(add_fastdeploy_executable FIELD CC_FILE)
# temp target name/file var in function scope
@@ -55,7 +49,11 @@ function(add_fastdeploy_executable FIELD CC_FILE)
add_executable(${TEMP_TARGET_NAME} ${TEMP_TARGET_FILE})
target_link_libraries(${TEMP_TARGET_NAME} PUBLIC fastdeploy)
if(TARGET gflags)
target_link_libraries(${TEMP_TARGET_NAME} PRIVATE gflags pthread)
if(UNIX AND (NOT APPLE) AND (NOT ANDROID))
target_link_libraries(${TEMP_TARGET_NAME} PRIVATE gflags pthread)
else()
target_link_libraries(${TEMP_TARGET_NAME} PRIVATE gflags)
endif()
endif()
config_fastdeploy_executable_link_flags(${TEMP_TARGET_NAME})
math(EXPR _EXAMPLES_NUM "${EXAMPLES_NUM} + 1")
@@ -78,22 +76,13 @@ if(BUILD_EXAMPLES AND ENABLE_VISION)
if(EXISTS ${PROJECT_SOURCE_DIR}/examples/vision)
message(STATUS "")
message(STATUS "*************FastDeploy Vision Examples Summary**********")
set(ALL_VISION_SUD_FIELDS classification
detection
facedet
faceid
keypointdetection
matting
ocr
segmentation)
if(NOT ANDROID)
list(APPEND ALL_VISION_SUD_FIELDS tracking)
file(GLOB_RECURSE ALL_VISION_EXAMPLE_SRCS ${PROJECT_SOURCE_DIR}/examples/vision/*/*/cpp/*.cc)
if(ANDROID)
file(GLOB_RECURSE TRACKING_SRCS ${PROJECT_SOURCE_DIR}/examples/vision/tracking/*/cpp/*.cc)
list(REMOVE_ITEM ALL_VISION_EXAMPLE_SRCS ${TRACKING_SRCS})
endif()
foreach(_SUB_FIELD ${ALL_VISION_SUD_FIELDS})
add_fastdeploy_executable_cc_files(_SUB_CC_FILES vision ${_SUB_FIELD})
foreach(_CC_FILE ${_SUB_CC_FILES})
add_fastdeploy_executable(vision ${_CC_FILE})
endforeach()
foreach(_CC_FILE ${ALL_VISION_EXAMPLE_SRCS})
add_fastdeploy_executable(vision ${_CC_FILE})
endforeach()
message(STATUS " [FastDeploy Executable Path] : ${EXECUTABLE_OUTPUT_PATH}")
endif()
+2 -1
View File
@@ -1,3 +1,4 @@
[English](README_en.md) | 简体中文
# Front-end AI applications
@@ -19,7 +20,7 @@
|Object detection|[ScrewDetection、FaceDetection](./web_demo/src/pages/cv/detection/)| <img src="https://user-images.githubusercontent.com/26592129/196874536-b7fa2c0a-d71f-4271-8c40-f9088bfad3c9.png" height="200px">|
|Portrait segmentation with background replacement|[HumanSeg](./web_demo/src/pages/cv/segmentation/HumanSeg)|<img src="https://user-images.githubusercontent.com/26592129/196874452-4ef2e770-fbb3-4a35-954b-f871716d6669.png" height="200px">|
|Object recognition|[GestureRecognition、ItemIdentification](./web_demo/src/pages/cv/recognition/)|<img src="https://user-images.githubusercontent.com/26592129/196874416-454e6bb0-4ebd-4b51-a88a-8c40614290ae.png" height="200px">|
|PP-OCRv3|[TextDetection、TextRecognition](./web_demo/src/pages/cv/ocr/)|<img src="https://user-images.githubusercontent.com/26592129/196874354-1b5eecb0-f273-403c-aa6c-4463bf6d78db.png" height="200px">|
|OCR|[TextDetection、TextRecognition](./web_demo/src/pages/cv/ocr/)|<img src="https://user-images.githubusercontent.com/26592129/196874354-1b5eecb0-f273-403c-aa6c-4463bf6d78db.png" height="200px">|
## WeChat mini-program demo usage
+40
View File
@@ -0,0 +1,40 @@
English | [简体中文](README.md)
# Front-end AI application
Advances in artificial intelligence are driving industrial upgrades in computer vision (CV) and natural language processing (NLP). Meanwhile, steadily growing compute on PCs and mobile devices, iterative improvements in model compression technology, and a continuous stream of innovative use cases have created good conditions for deploying AI models in the browser to achieve front-end intelligence.
To address the difficulty of deploying deep learning models on the front end, Baidu has open-sourced Paddle.js, a front-end deployment framework that makes it easy to bring deep learning models into front-end projects.
# Introduction to Paddle.js
[Paddle.js](https://github.com/PaddlePaddle/Paddle.js) is a web sub-project of Baidu `PaddlePaddle`, an open-source deep learning framework that runs in the browser. `Paddle.js` can load deep learning models trained with `PaddlePaddle` and convert them into browser-friendly models through its model conversion tool `paddlejs-converter`, making online inference easy. `Paddle.js` supports the `WebGL/WebGPU/WebAssembly` backends in browsers and can also run in Baidu and WeChat mini-program environments.
Finally, with `Paddle.js` we can ship AI features in front-end scenarios such as browsers and mini-programs, including but not limited to object detection, image segmentation, OCR, and item classification.
## Web Demo
Refer to this [document](./WebDemo_en.md) for the steps to run the computer vision demos in the browser.
|demo|web demo directory|visualization|
|-|-|-|
|object detection|[ScrewDetection、FaceDetection](./web_demo/src/pages/cv/detection/)| <img src="https://user-images.githubusercontent.com/26592129/196874536-b7fa2c0a-d71f-4271-8c40-f9088bfad3c9.png" height="200px">|
|human segmentation|[HumanSeg](./web_demo/src/pages/cv/segmentation/HumanSeg)|<img src="https://user-images.githubusercontent.com/26592129/196874452-4ef2e770-fbb3-4a35-954b-f871716d6669.png" height="200px">|
|classification|[GestureRecognition、ItemIdentification](./web_demo/src/pages/cv/recognition/)|<img src="https://user-images.githubusercontent.com/26592129/196874416-454e6bb0-4ebd-4b51-a88a-8c40614290ae.png" height="200px">|
|OCR|[TextDetection、TextRecognition](./web_demo/src/pages/cv/ocr/)|<img src="https://user-images.githubusercontent.com/26592129/196874354-1b5eecb0-f273-403c-aa6c-4463bf6d78db.png" height="200px">|
## Wechat Mini-program
See the WeChat mini-program [document](./mini_program/README.md) for how to run the official demos.
|Name|Directory|
|-|-|
|OCR Text Detection| [ocrdetectXcx](./mini_program/ocrdetectXcx/) |
|OCR Text Recognition| [ocrXcx](./mini_program/ocrXcx/) |
|Object detection| coming soon |
|Image segmentation| coming soon |
|Item classification| coming soon |
## Contributor
Thanks to PaddlePaddle Developer Expert (PPDE) Chen Qianhe (GitHub: [chenqianhe](https://github.com/chenqianhe)) for contributing the web demo and mini-programs.
+2
View File
@@ -1,3 +1,5 @@
[English](WebDemo_en.md) | 简体中文
# Web Demo introduction
- [Introduction](#0)
+176
View File
@@ -0,0 +1,176 @@
English | [简体中文](WebDemo.md)
# Introduction to Web Demo
- [Introduction](#0)
- [1. Quick Start](#1)
- [2. npm package call](#2)
- [3. Model Replacement](#3)
- [4. Custom hyperparameters](#4)
- [5. Other](#5)
<a name="0"></a>
## Introduction
Based on [Paddle.js](https://github.com/PaddlePaddle/Paddle.js), this project implements computer vision tasks such as object detection, portrait segmentation, OCR, and item classification in the browser.
|demo name|web demo component|source directory|npm package|
|-|-|-|-|
|Face Detection|[FaceDetection](./web_demo/src/pages/cv/detection/FaceDetection/)| [facedetect](./package/packages/paddlejs-models/facedetect)|[@paddle-js-models/facedetect](https://www.npmjs.com/package/@paddle-js-models/facedetect)|
|Screw Detection|[ScrewDetection](./web_demo/src/pages/cv/detection/ScrewDetection)| [detect](./package/packages/paddlejs-models/detect)|[@paddle-js-models/detect](https://www.npmjs.com/package/@paddle-js-models/detect)|
|Portrait segmentation background replacement|[HumanSeg](./web_demo/src/pages/cv/segmentation/HumanSeg)|[humanseg](./package/packages/paddlejs-models/humanseg)|[@paddle-js-models/humanseg](https://www.npmjs.com/package/@paddle-js-models/humanseg)|
|Gesture Recognition AI Guessing Shell|[GestureRecognition](./web_demo/src/pages/cv/recognition/GestureRecognition)|[gesture](./package/packages/paddlejs-models/gesture)|[@paddle-js-models/gesture](https://www.npmjs.com/package/@paddle-js-models/gesture)|
|1000 Item Identification|[ItemIdentification](./web_demo/src/pages/cv/recognition/ItemIdentification)|[mobilenet](./package/packages/paddlejs-models/mobilenet)|[@paddle-js-models/mobilenet](https://www.npmjs.com/package/@paddle-js-models/mobilenet)|
|Text Detection|[TextDetection](./web_demo/src/pages/cv/ocr/TextDetection)|[ocrdetection](./package/packages/paddlejs-models/ocrdetection)|[@paddle-js-models/ocrdet](https://www.npmjs.com/package/@paddle-js-models/ocrdet)|
|Text Recognition|[TextRecognition](./web_demo/src/pages/cv/ocr/TextRecognition)|[ocr](./package/packages/paddlejs-models/ocr)|[@paddle-js-models/ocr](https://www.npmjs.com/package/@paddle-js-models/ocr)|
<a name="1"></a>
## 1. Quick Start
This section describes how to run the official demo directly in the browser.
**1. Install Node.js**
Download the `Node.js` installation package suitable for your platform from the `Node.js` official website https://nodejs.org/en/download/ and install it.
**2. Install demo dependencies and start**
Execute the following command in the `./web_demo` directory:
````
# install dependencies
npm install
# start demo
npm run dev
````
Open the URL `http://localhost:5173/main/index.html` in the browser to quickly experience running computer vision tasks in the browser.
![22416f4a3e7d63f950b838be3cd11e80](https://user-images.githubusercontent.com/26592129/196685868-93ab53bd-cb2e-44ff-a56b-50c1781b8679.jpg)
<a name="2"></a>
## 2. npm package call
This section introduces how to use npm packages. Each demo provides an easy-to-use interface. Users only need to initialize and upload images to get the results. The steps are as follows:
1. Call the module
2. Initialize the model
3. Pass in input, perform prediction
Taking OCR as an example, in a front-end project, the `@paddle-js-models/ocr` package is used as follows:
````
// 1. Call the ocr module
import * as ocr from '@paddle-js-models/ocr';
// 2. Initialize the ocr model
await ocr.init();
// 3. Pass in an image of type HTMLImageElement as input and get the result
const res = await ocr.recognize(img);
// Print the text coordinates and text content obtained by the OCR model
console.log(res.text);
console.log(res.points);
````
<a name="3"></a>
## 3. Model replacement
Due to the limited computing resources of the front-end environment, deploying deep learning models on the front end places stricter requirements on model performance. In short, models need to be lightweight enough: in principle, the smaller the model's input shape and model size, the lower its FLOPs, and the smoother it runs on the front end. Empirically, a model deployed with `Paddle.js` should ideally stay under *5 MB*; the practical limit depends on the hardware and computing resources.
In practical applications, models are often customized according to vertical scenarios, and the official demo supports modifying incoming parameters to replace models.
Take the OCR demo as an example: the [ocr.init() function](https://github.com/PaddlePaddle/FastDeploy/tree/develop/examples/application/js/package/packages/paddlejs-models/ocr/src/index.ts#L52) contains the default model link used at initialization. To replace the model, follow the steps below.
Step 1: Convert the model to js format:
````
# Install paddlejsconverter
pip3 install paddlejsconverter
# Convert the model format, the input model is the inference model
paddlejsconverter --modelPath=./inference.pdmodel --paramPath=./inference.pdiparams --outputDir=./ --useGPUOpt=True
# Note: The useGPUOpt option is not enabled by default. If the model is used on the gpu backend (webgl/webgpu), enable useGPUOpt. If the model is running on (wasm/plain js), do not enable it.
````
After a successful export, files such as `model.json` and `chunk_1.dat` appear in the local directory; these are the network structure and binary model parameters of the js model.
Step 2: Upload the exported js model to a server that supports cross-domain access. For the CORS configuration of the server, refer to the following image:
![image](https://user-images.githubusercontent.com/26592129/196612669-5233137a-969c-49eb-b8c7-71bef5088686.png)
Step 3: Modify the code to replace the default model. Taking the OCR demo as an example, modify the [model initialization code](https://github.com/PaddlePaddle/FastDeploy/tree/develop/examples/application/js/web_demo/src/pages/cv/ocr/TextRecognition/TextRecognition.vue#L64) in the OCR web demo, i.e. change
````
await ocr.init();
````
into:
````
// the first parameter is an object carrying the new model link
await ocr.init({modelPath: "https://js-models.bj.bcebos.com/PaddleOCR/PP-OCRv3/ch_PP-OCRv3_det_infer_js_960/model.json"});
````
Re-run the following command in the demo directory to try out the new model:
````
npm run dev
````
<a name="4"></a>
## 4. Custom hyperparameters
**Custom preprocessing parameters**
In different computer vision tasks, different models may use different preprocessing parameters, such as mean, std, and keep_ratio. After replacing the model, the preprocessing parameters also need to be modified. The npm packages published by Paddle.js provide a simple way to customize preprocessing parameters: just pass the custom parameters into the model initialization function.
````
// initialize with default parameters
await model.init();
// initialize with custom parameters
const Config = {mean: [0.5, 0.5, 0.5], std: [0.5, 0.5, 0.5], keepratio: false};
await model.init(Config);
````
Taking the OCR text detection demo as an example, to modify the mean and std parameters of the model preprocessing, you only need to pass in the custom mean and std parameters when the model is initialized.
````
await ocr.init();
````
change into:
````
// pass the custom preprocessing parameters as the first parameter at initialization
const detConfig = {mean: [0.5, 0.5, 0.5], std: [0.5, 0.5, 0.5]};
await ocr.init(detConfig);
````
**Custom postprocessing parameters**
Similarly, the npm packages published by Paddle.js also allow customizing post-processing parameters.
````
// run with default parameters
await model.predict();
// run with custom post-processing parameters
const postConfig = {thresh: 0.5};
await model.predict(postConfig);
````
Taking the OCR text detection demo as an example, you can enlarge the detected text boxes by modifying the text detection post-processing parameters. In the OCR web demo, modify the [model prediction code](https://github.com/PaddlePaddle/FastDeploy/tree/develop/examples/application/web_demo/src/pages/cv/ocr/TextRecognition/TextRecognition.vue#L99), i.e. change
````
const res = await ocr.recognize(img, { canvas: canvas.value });
````
into:
````
// define hyperparameters, raising unclip_ratio from the default 1.5 to 3.5
const detConfig = {shape: 960, thresh: 0.3, box_thresh: 0.6, unclip_ratio: 3.5};
const res = await ocr.recognize(img, { canvas: canvas.value }, detConfig);
````
Note: Different tasks have different post-processing parameters. For detailed parameters, please refer to the API in the npm package.
<a name="5"></a>
## 5. Others
The converted model of `Paddle.js` can not only be used in the browser, but also can be run in the Baidu mini-program and WeChat mini-program environment.
|Name|Directory|
|-|-|
|OCR Text Detection| [ocrdetectXcx](./mini_program/ocrdetectXcx/) |
|OCR Text Recognition| [ocrXcx](./mini_program/ocrXcx/) |
|Object detection| coming soon |
|Image segmentation| coming soon |
|Item classification| coming soon |
+11 -11
View File
@@ -1,3 +1,4 @@
[English](README_en.md) | 简体中文
# Paddle.js微信小程序Demo
@@ -100,27 +101,26 @@ wx.canvasGetImageData({
<a name="4"></a>
## 4. FAQ
### 4.1 Error `Invalid context type [webgl2] for Canvas#getContext`
Safe to ignore; it does not affect normal code execution or the demo functionality
- 4.1 Error `Invalid context type [webgl2] for Canvas#getContext`
### 4.2 Preview shows no result
**A:** Safe to ignore; it does not affect normal code execution or the demo functionality
Try real-device debugging
- 4.2 Preview shows no result
### 4.3 The WeChat developer tool shows a black screen, followed by a flood of errors
**A:** Try real-device debugging
Restart the WeChat developer tool
- 4.3 The WeChat developer tool shows a black screen, followed by a flood of errors
### 4.4 Simulator and real-device results are inconsistent; the simulator fails to detect text, etc.
**A:** Restart the WeChat developer tool
Trust the real-device result;
- 4.4 Simulator and real-device results are inconsistent; the simulator fails to detect text, etc.
If the simulator fails to detect text, try arbitrarily changing the code (add/remove line breaks, etc.) and compiling again
**A:** Trust the real-device result; if the simulator fails to detect text, try arbitrarily changing the code (add/remove line breaks, etc.) and compiling again
### 4.5 Prompts such as "no response for a long time" appear while debugging or running on the phone
- 4.5 Prompts such as "no response for a long time" appear while debugging or running on the phone
Please keep waiting; model inference takes some time
**A:** Please keep waiting; model inference takes some time
@@ -0,0 +1,125 @@
English | [中文](README.md)
# Paddle.js WeChat mini-program Demo
- [1. Introduction](#1)
- [2. Project Start](#2)
* [2.1 Preparations](#21)
* [2.2 Startup steps](#22)
* [2.3 Visualization](#23)
- [3. Model inference pipeline](#3)
- [4. FAQ](#4)
<a name="1"></a>
## 1. Introduction
This directory contains text detection and text recognition mini-program demos that use [Paddle.js](https://github.com/PaddlePaddle/Paddle.js) and the [Paddle.js WeChat mini-program plugin](https://mp.weixin.qq.com/wxopen/plugindevdoc?appid=wx7138a7bb793608c3&token=956931339&lang=zh_CN) to draw text detection boxes in the mini-program using the computing power of the user's device.
<a name="2"></a>
## 2. Project Start
<a name="21"></a>
### 2.1 Preparations
* [Apply for a WeChat mini-program account](https://mp.weixin.qq.com/)
* [WeChat Mini Program Developer Tools](https://developers.weixin.qq.com/miniprogram/dev/devtools/download.html)
* Front-end development environment preparation: node, npm
* Configure the server domain name in the mini-program management console, or enable "Do not verify valid domain names" in the developer tool
For details, please refer to the [documentation](https://mp.weixin.qq.com/wxamp/devprofile/get_profile?token=1132303404&lang=zh_CN).
<a name="22"></a>
### 2.2 Startup steps
#### **1. Clone the demo code**
````sh
git clone https://github.com/PaddlePaddle/FastDeploy
cd FastDeploy/examples/application/js/mini_program
````
#### **2. Enter the mini-program directory and install dependencies**
````sh
# Run the text recognition demo and enter the ocrXcx directory
cd ./ocrXcx && npm install
# Run the text detection demo and enter the ocrdetectXcx directory
# cd ./ocrdetectXcx && npm install
````
#### **3. WeChat mini-program import code**
Open WeChat Developer Tools --> Import --> Select a directory and enter relevant information
#### **4. Add Paddle.js WeChat mini-program plugin**
Mini Program Management Interface --> Settings --> Third Party Settings --> Plugin Management --> Add Plugins --> Search for `wx7138a7bb793608c3` and add
[Reference document](https://developers.weixin.qq.com/miniprogram/dev/framework/plugin/using.html)
#### **5. Build dependencies**
Click on the menu bar in the developer tools: Tools --> Build npm
Reason: the node_modules directory is not included when compiling, uploading, and packaging. For a mini-program to use npm packages, it must go through the "Build npm" process. Once the build completes, a miniprogram_npm directory is generated containing the built and packaged npm packages, which are what the mini-program actually uses.
[Reference Documentation](https://developers.weixin.qq.com/miniprogram/dev/devtools/npm.html)
<a name="23"></a>
### 2.3 Visualization
<img src="https://user-images.githubusercontent.com/43414102/157648579-cdbbee61-9866-4364-9edd-a97ac0eda0c1.png" width="300px">
<a name="3"></a>
## 3. Model inference pipeline
```typescript
// Introduce paddlejs and paddlejs-plugin, register the mini-program environment variables and the appropriate backend
import * as paddlejs from '@paddlejs/paddlejs-core';
import '@paddlejs/paddlejs-backend-webgl';
const plugin = requirePlugin('paddlejs-plugin');
plugin.register(paddlejs, wx);
// Initialize the inference engine
const runner = new paddlejs.Runner({modelPath, feedShape, mean, std});
await runner.init();
// get image information
wx.canvasGetImageData({
canvasId: canvasId,
x: 0,
y: 0,
width: canvas.width,
height: canvas.height,
success(res) {
// inference prediction
runner.predict({
data: res.data,
width: canvas.width,
height: canvas.height,
}, function (data) {
// get the inference result
console.log(data)
});
}
});
```
<a name="4"></a>
## 4. FAQ
- 4.1 An error occurs: `Invalid context type [webgl2] for Canvas#getContext`
**A:** Safe to ignore; it does not affect normal code execution or the demo functionality
- 4.2 Preview shows no result
**A:** It is recommended to try real-device debugging
- 4.3 The WeChat developer tool shows a black screen, followed by a flood of errors
**A:** Restart the WeChat developer tool
- 4.4 Simulator and real-device results are inconsistent; the simulator fails to detect text, etc.
**A:** Trust the real-device result;
if the simulator fails to detect text, try arbitrarily changing the code (add/remove line breaks, etc.) and compiling again
- 4.5 Prompts such as "no response for a long time" appear while debugging or running on the phone
**A:** Please keep waiting; model inference takes some time
@@ -1,3 +1,5 @@
[English](README_en.md) | 简体中文
# Introduction to the Paddle.js Model Module
This part is a model library developed on top of Paddle.js; it mainly provides models that can be directly imported and used on the web side.
@@ -0,0 +1,41 @@
English | [简体中文](README.md)
# Introduction to the Paddle.js Model Module
This part is a model library developed on top of Paddle.js, which mainly provides the ability to directly import and use models on the web side.
| demo name | source directory | npm package |
| - | - | - |
| face detection | [facedetect](./packages/paddlejs-models/facedetect) | [@paddle-js-models/facedetect](https://www.npmjs.com/package/@paddle-js-models/facedetect) |
| Screw detection | [detect](./packages/paddlejs-models/detect) | [@paddle-js-models/detect](https://www.npmjs.com/package/@paddle-js-models/detect) |
| Portrait segmentation background replacement | [humanseg](./packages/paddlejs-models/humanseg) | [@paddle-js-models/humanseg](https://www.npmjs.com/package/@paddle-js-models/humanseg) |
| Gesture Recognition AI Guessing Shell | [gesture](./packages/paddlejs-models/gesture) | [@paddle-js-models/gesture](https://www.npmjs.com/package/@paddle-js-models/gesture) |
| 1000 Item Recognition | [mobilenet](./packages/paddlejs-models/mobilenet) | [@paddle-js-models/mobilenet](https://www.npmjs.com/package/@paddle-js-models/mobilenet) |
| Text Detection | [ocrdetection](./packages/paddlejs-models/ocrdetection) | [@paddle-js-models/ocrdet](https://www.npmjs.com/package/@paddle-js-models/ocrdet) |
| Text Recognition | [ocr](./packages/paddlejs-models/ocr) | [@paddle-js-models/ocr](https://www.npmjs.com/package/@paddle-js-models/ocr) |
## Usage
This part is a monorepo built with `pnpm`.
### Install dependencies
````sh
pnpm i
````
### Development
See `package.json` for development testing with `yalc`.
````sh
pnpm run dev:xxx
````
### Overall Introduction
1. Uses rollup to package CommonJS and ES builds in one pass, and is extensible; at present the dependent cv library has some issues, and there is no configuration for UMD packaging.
2. Generates the d.ts file via api-extractor during packaging, so the generated package supports TypeScript imports.
3. Supports testing based on jest and reports test coverage, etc.
4. Maintains code style with ts and eslint to keep code development quality high.
5. Generates custom keywords with conventional-changelog-cli and generates the changelog accordingly.
6. Implements local packaging, development, and testing with yalc.
Regular → Executable
+4
View File
@@ -8,10 +8,14 @@
| Segmentation | Semantic segmentation: takes an image and returns the class and confidence of every pixel | [SegmentationResult](../../docs/api/vision_results/segmentation_result.md) |
| Classification | Image classification: takes an image and returns the classification result and confidence | [ClassifyResult](../../docs/api/vision_results/classification_result.md) |
| FaceDetection | Face detection: takes an image, detects face locations, and returns detection box coordinates and face keypoints | [FaceDetectionResult](../../docs/api/vision_results/face_detection_result.md) |
| FaceAlignment | Face alignment (facial keypoint detection): takes an image and returns facial keypoints | [FaceAlignmentResult](../../docs/api/vision_results/face_alignment_result.md) |
| KeypointDetection | Keypoint detection: takes an image and returns the coordinates and confidence of each keypoint of the person's action in the image | [KeyPointDetectionResult](../../docs/api/vision_results/keypointdetection_result.md) |
| FaceRecognition | Face recognition: takes an image and returns a face-feature embedding that can be used for similarity computation | [FaceRecognitionResult](../../docs/api/vision_results/face_recognition_result.md) |
| Matting | Matting: takes an image and returns the alpha value of each foreground pixel | [MattingResult](../../docs/api/vision_results/matting_result.md) |
| OCR | Text box detection, classification, and in-box text recognition: takes an image and returns text box coordinates, text box orientation classes, and the text inside the boxes | [OCRResult](../../docs/api/vision_results/ocr_result.md) |
| MOT | Multi-object tracking: takes an image, detects object locations, and returns detection box coordinates, object ids, and class confidences | [MOTResult](../../docs/api/vision_results/mot_result.md) |
| HeadPose | Head pose estimation: returns head Euler angles | [HeadPoseResult](../../docs/api/vision_results/headpose_result.md) |
## FastDeploy API design
Vision models share a fairly uniform task paradigm. When designing the API (both C++ and Python), FastDeploy splits vision model deployment into four steps, as sketched below.
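A rough Python illustration of that flow (an assumption based on the deployment examples elsewhere in this document; the text cuts off before naming the steps, so the numbering here is only indicative, and the model class depends on the task):

```python
import cv2
import fastdeploy as fd

# 1. Configure the runtime: device and inference backend.
option = fd.RuntimeOption()
option.use_cpu()

# 2. Load the model; detection is used here, other tasks have analogous classes.
model = fd.vision.detection.PPYOLOE(
    "model/model.pdmodel", "model/model.pdiparams",
    "model/infer_cfg.yml", runtime_option=option)

# 3. Predict: pre-processing, inference, and post-processing happen inside predict().
im = cv2.imread("test.jpg")
result = model.predict(im)   # returns a DetectionResult (see the table above)

# 4. Visualize the structured result.
vis = fd.vision.vis_detection(im, result, score_threshold=0.5)
cv2.imwrite("vis.jpg", vis)
```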
@@ -1,25 +1,48 @@
# PaddleClas quantized model deployment
FastDeploy supports deploying quantized models and provides a one-click model quantization tool.
Users can quantize models themselves with the one-click quantization tool and deploy the result, or directly download and deploy the quantized models provided by FastDeploy.
FastDeploy supports deploying quantized models and provides a one-click automated model compression tool.
Users can quantize models themselves with the one-click automated compression tool and deploy the result, or directly download and deploy the quantized models provided by FastDeploy.
## FastDeploy one-click model quantization tool
FastDeploy provides a one-click quantization tool that quantizes a model from just one configuration file.
For a detailed tutorial, see: [one-click model quantization tool](../../../../../tools/quantization/)
## FastDeploy one-click automated model compression tool
FastDeploy provides a one-click automated model compression tool that quantizes a model from just one configuration file.
For a detailed tutorial, see: [one-click automated model compression tool](../../../../../tools/auto_compression/)
Note: inference with a quantized classification model still requires the inference_cls.yaml file from the FP32 model folder. A self-quantized model folder does not contain this yaml file; copy it from the FP32 model folder into the quantized model folder, for example as sketched below.
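A minimal copy sketch (both directory names are hypothetical placeholders):

```python
import shutil

# Copy the preprocessing config that quantization does not produce
# from the original FP32 model folder into the quantized model folder.
shutil.copy("ResNet50_vd_infer/inference_cls.yaml",
            "resnet50_vd_ptq/inference_cls.yaml")
```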
## Download quantized PaddleClas models
Users can also directly download and deploy the quantized models in the table below.
| Model | Inference backend | Hardware | FP32 latency (ms) | INT8 latency (ms) | Speedup | FP32 Top1 | INT8 Top1 | Quantization method |
| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |
| [ResNet50_vd](https://bj.bcebos.com/paddlehub/fastdeploy/resnet50_vd_ptq.tar) | ONNX Runtime | CPU | 86.87 | 59.32 | 1.46 | 79.12 | 78.87 | Post-training quantization |
| [ResNet50_vd](https://bj.bcebos.com/paddlehub/fastdeploy/resnet50_vd_ptq.tar) | TensorRT | GPU | 7.85 | 5.42 | 1.45 | 79.12 | 79.06 | Post-training quantization |
| [MobileNetV1_ssld](https://bj.bcebos.com/paddlehub/fastdeploy/mobilenetv1_ssld_ptq.tar) | ONNX Runtime | CPU | 40.32 | 16.87 | 2.39 | 77.89 | 75.09 | Post-training quantization |
| [MobileNetV1_ssld](https://bj.bcebos.com/paddlehub/fastdeploy/mobilenetv1_ssld_ptq.tar) | TensorRT | GPU | 5.10 | 3.35 | 1.52 | 77.89 | 76.86 | Post-training quantization |
The data above compare end-to-end inference performance in FastDeploy deployments before and after quantization.
- Test images are from the ImageNet-2012 validation set.
- Inference latency is the average end-to-end latency (including pre/post-processing), in milliseconds.
- CPU: Intel(R) Xeon(R) Gold 6271C; GPU: Tesla T4; TensorRT 8.4.15; the number of CPU threads is fixed to 1 in all tests.
Notes on the benchmark tables:
- Runtime latency is the model's inference latency on each Runtime, including the CPU->GPU data copy, GPU inference, and GPU->CPU data copy time, but not the model's pre/post-processing time.
- End-to-end latency is the model's latency in a real inference scenario, including its pre/post-processing.
- All latencies are averages over 1000 inference runs, in milliseconds.
- INT8 + FP16 means the FP16 inference option is enabled on the Runtime while running the INT8 quantized model.
- INT8 + FP16 + PM means pinned memory is additionally enabled on top of INT8 + FP16, which speeds up GPU->CPU data copies (see the sketch after these notes).
- Max speedup is the FP32 latency divided by the fastest INT8 latency.
- For the quantized distillation training strategy, the quantized model is trained on a small amount of unlabeled data, and accuracy is validated on the full validation set; the INT8 accuracy shown is therefore not the best achievable INT8 accuracy.
- CPU: Intel(R) Xeon(R) Gold 6271C, with CPU threads fixed to 1 in all tests. GPU: Tesla T4, TensorRT 8.4.15.
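A minimal sketch of how those two options might be enabled from Python (the helper names are assumptions based on the FastDeploy RuntimeOption used elsewhere in this document; verify them against your release):

```python
import fastdeploy as fd

option = fd.RuntimeOption()
option.use_gpu()
option.use_trt_backend()
# INT8 + FP16: also allow FP16 kernels while running the INT8 quantized model.
option.enable_trt_fp16()        # assumed helper name
# INT8 + FP16 + PM: use pinned (page-locked) host memory for faster GPU->CPU copies.
option.enable_pinned_memory()   # assumed helper name
```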
### Runtime Benchmark
| Model | Inference backend | Hardware | FP32 Runtime latency | INT8 Runtime latency | INT8 + FP16 Runtime latency | INT8+FP16+PM Runtime latency | Max speedup | FP32 Top1 | INT8 Top1 | Quantization method |
| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |----- |----- |
| [ResNet50_vd](https://bj.bcebos.com/paddlehub/fastdeploy/resnet50_vd_ptq.tar) | TensorRT | GPU | 3.55 | 0.99|0.98|1.06 | 3.62 | 79.12 | 79.06 | Post-training quantization |
| [ResNet50_vd](https://bj.bcebos.com/paddlehub/fastdeploy/resnet50_vd_ptq.tar) | Paddle-TensorRT | GPU | 3.46 |None |0.87|1.03 | 3.98 | 79.12 | 79.06 | Post-training quantization |
| [ResNet50_vd](https://bj.bcebos.com/paddlehub/fastdeploy/resnet50_vd_ptq.tar) | ONNX Runtime | CPU | 76.14 | 35.43 |None|None | 2.15 | 79.12 | 78.87 | Post-training quantization |
| [ResNet50_vd](https://bj.bcebos.com/paddlehub/fastdeploy/resnet50_vd_ptq.tar) | Paddle Inference | CPU | 76.21 | 24.01 |None|None | 3.17 | 79.12 | 78.55 | Post-training quantization |
| [MobileNetV1_ssld](https://bj.bcebos.com/paddlehub/fastdeploy/mobilenetv1_ssld_ptq.tar) | TensorRT | GPU | 0.91 | 0.43 |0.49 | 0.54 | 2.12 |77.89 | 76.86 | Post-training quantization |
| [MobileNetV1_ssld](https://bj.bcebos.com/paddlehub/fastdeploy/mobilenetv1_ssld_ptq.tar) | Paddle-TensorRT | GPU | 0.88| None| 0.49|0.51 | 1.80 |77.89 | 76.86 | Post-training quantization |
| [MobileNetV1_ssld](https://bj.bcebos.com/paddlehub/fastdeploy/mobilenetv1_ssld_ptq.tar) | ONNX Runtime | CPU | 30.53 | 9.59|None|None | 3.18 |77.89 | 75.09 | Post-training quantization |
| [MobileNetV1_ssld](https://bj.bcebos.com/paddlehub/fastdeploy/mobilenetv1_ssld_ptq.tar) | Paddle Inference | CPU | 12.29 | 4.68 | None|None|2.62 |77.89 | 71.36 | Post-training quantization |
### End-to-end Benchmark
| Model | Inference backend | Hardware | FP32 end-to-end latency | INT8 end-to-end latency | INT8 + FP16 end-to-end latency | INT8+FP16+PM end-to-end latency | Max speedup | FP32 Top1 | INT8 Top1 | Quantization method |
| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |----- |----- |
| [ResNet50_vd](https://bj.bcebos.com/paddlehub/fastdeploy/resnet50_vd_ptq.tar) | TensorRT | GPU | 4.92| 2.28|2.24|2.23 | 2.21 | 79.12 | 79.06 | Post-training quantization |
| [ResNet50_vd](https://bj.bcebos.com/paddlehub/fastdeploy/resnet50_vd_ptq.tar) | Paddle-TensorRT | GPU | 4.48|None |2.09|2.10 | 2.14 | 79.12 | 79.06 | Post-training quantization |
| [ResNet50_vd](https://bj.bcebos.com/paddlehub/fastdeploy/resnet50_vd_ptq.tar) | ONNX Runtime | CPU | 77.43 | 41.90 |None|None | 1.85 | 79.12 | 78.87 | Post-training quantization |
| [ResNet50_vd](https://bj.bcebos.com/paddlehub/fastdeploy/resnet50_vd_ptq.tar) | Paddle Inference | CPU | 80.60 | 27.75 |None|None | 2.90 | 79.12 | 78.55 | Post-training quantization |
| [MobileNetV1_ssld](https://bj.bcebos.com/paddlehub/fastdeploy/mobilenetv1_ssld_ptq.tar) | TensorRT | GPU | 2.19 | 1.48|1.57| 1.57 | 1.48 |77.89 | 76.86 | Post-training quantization |
| [MobileNetV1_ssld](https://bj.bcebos.com/paddlehub/fastdeploy/mobilenetv1_ssld_ptq.tar) | Paddle-TensorRT | GPU | 2.04| None| 1.47|1.45 | 1.41 |77.89 | 76.86 | Post-training quantization |
| [MobileNetV1_ssld](https://bj.bcebos.com/paddlehub/fastdeploy/mobilenetv1_ssld_ptq.tar) | ONNX Runtime | CPU | 34.02 | 12.97|None|None | 2.62 |77.89 | 75.09 | Post-training quantization |
| [MobileNetV1_ssld](https://bj.bcebos.com/paddlehub/fastdeploy/mobilenetv1_ssld_ptq.tar) | Paddle Inference | CPU | 16.31 | 7.42 | None|None| 2.20 |77.89 | 71.36 | Post-training quantization |
## Detailed deployment documents
@@ -1,4 +1,4 @@
# PaddleClas quantized model Python deployment example
# PaddleClas quantized model C++ deployment example
The `infer.cc` in this directory helps users quickly deploy PaddleClas quantized models on CPU/GPU with accelerated inference.
## Deployment preparation
@@ -8,7 +8,7 @@
### Quantized model preparation
- 1. Users can directly deploy the quantized models provided by FastDeploy.
- 2. Users can quantize a model themselves with FastDeploy's [one-click model quantization tool](../../../../../../tools/quantization/) and deploy the result. (Note: inference with a quantized classification model still requires the inference_cls.yaml file from the FP32 model folder; a self-quantized model folder does not contain this yaml file, so copy it from the FP32 model folder into the quantized model folder.)
- 2. Users can quantize a model themselves with FastDeploy's [one-click automated model compression tool](../../../../../tools/auto_compression/) and deploy the result. (Note: inference with a quantized classification model still requires the inference_cls.yaml file from the FP32 model folder; a self-quantized model folder does not contain this yaml file, so copy it from the FP32 model folder into the quantized model folder.)
## Deployment example with the quantized ResNet50_Vd model
Run the following commands in this directory to build and deploy the quantized model.
@@ -26,8 +26,10 @@ tar -xvf resnet50_vd_ptq.tar
wget https://gitee.com/paddlepaddle/PaddleClas/raw/release/2.4/deploy/images/ImageNet/ILSVRC2012_val_00000010.jpeg
# Run the quantized model on CPU with Paddle Inference
# Run the quantized model on CPU with ONNX Runtime
./infer_demo resnet50_vd_ptq ILSVRC2012_val_00000010.jpeg 0
# Run the quantized model on GPU with TensorRT
./infer_demo resnet50_vd_ptq ILSVRC2012_val_00000010.jpeg 1
# Run the quantized model on GPU with Paddle-TensorRT
./infer_demo resnet50_vd_ptq ILSVRC2012_val_00000010.jpeg 2
```
@@ -21,8 +21,8 @@ const char sep = '/';
void InitAndInfer(const std::string& model_dir, const std::string& image_file,
const fastdeploy::RuntimeOption& option) {
auto model_file = model_dir + sep + "inference.pdmodel";
auto params_file = model_dir + sep + "inference.pdiparams";
auto model_file = model_dir + sep + "model.pdmodel";
auto params_file = model_dir + sep + "model.pdiparams";
auto config_file = model_dir + sep + "inference_cls.yaml";
auto model = fastdeploy::vision::classification::PaddleClasModel(
@@ -67,7 +67,11 @@ int main(int argc, char* argv[]) {
option.UseGpu();
option.UseTrtBackend();
option.SetTrtInputShape("inputs",{1, 3, 224, 224});
}
} else if (flag == 2) {
option.UseGpu();
option.UseTrtBackend();
option.EnablePaddleToTrt();
}
std::string model_dir = argv[1];
std::string test_image = argv[2];
@@ -0,0 +1,10 @@
rm -rf build
mkdir build
cd build
#/xieyunyao/project/FastDeploy
cmake .. -DFASTDEPLOY_INSTALL_DIR=/xieyunyao/project/FastDeploy
make -j
@@ -8,7 +8,7 @@
### Quantized model preparation
- 1. Users can directly deploy the quantized models provided by FastDeploy.
- 2. Users can quantize a model themselves with FastDeploy's [one-click model quantization tool](../../../../../../tools/quantization/) and deploy the result. (Note: inference with a quantized classification model still requires the inference_cls.yaml file from the FP32 model folder; a self-quantized model folder does not contain this yaml file, so copy it from the FP32 model folder into the quantized model folder.)
- 2. Users can quantize a model themselves with FastDeploy's [one-click automated model compression tool](../../tools/auto_compression/) and deploy the result. (Note: inference with a quantized classification model still requires the inference_cls.yaml file from the FP32 model folder; a self-quantized model folder does not contain this yaml file, so copy it from the FP32 model folder into the quantized model folder.)
## Deployment example with the quantized ResNet50_Vd model
@@ -22,8 +22,10 @@ wget https://bj.bcebos.com/paddlehub/fastdeploy/resnet50_vd_ptq.tar
tar -xvf resnet50_vd_ptq.tar
wget https://gitee.com/paddlepaddle/PaddleClas/raw/release/2.4/deploy/images/ImageNet/ILSVRC2012_val_00000010.jpeg
# Run the quantized model on CPU with Paddle Inference
# Run the quantized model on CPU with ONNX Runtime
python infer.py --model resnet50_vd_ptq --image ILSVRC2012_val_00000010.jpeg --device cpu --backend ort
# Run the quantized model on GPU with TensorRT
python infer.py --model resnet50_vd_ptq --image ILSVRC2012_val_00000010.jpeg --device gpu --backend trt
# Run the quantized model on GPU with Paddle-TensorRT
python infer.py --model resnet50_vd_ptq --image ILSVRC2012_val_00000010.jpeg --device gpu --backend pptrt
```
@@ -48,6 +48,11 @@ def build_option(args):
) == "gpu", "TensorRT backend require inferences on device GPU."
option.use_trt_backend()
option.set_trt_input_shape("inputs", min_shape=[1, 3, 224, 224])
elif args.backend.lower() == "pptrt":
assert args.device.lower(
) == "gpu", "TensorRT backend require inference on device GPU."
option.use_trt_backend()
option.enable_paddle_to_trt()
elif args.backend.lower() == "ort":
option.use_ort_backend()
elif args.backend.lower() == "paddle":
@@ -1,22 +1,43 @@
# PaddleDetection quantized model deployment
FastDeploy supports deploying quantized models and provides a one-click model quantization tool.
Users can quantize models themselves with the one-click quantization tool and deploy the result, or directly download and deploy the quantized models provided by FastDeploy.
FastDeploy supports deploying quantized models and provides a one-click automated model compression tool.
Users can quantize models themselves with the one-click automated compression tool and deploy the result, or directly download and deploy the quantized models provided by FastDeploy.
## FastDeploy one-click model quantization tool
FastDeploy provides a one-click quantization tool that quantizes a model from just one configuration file.
For a detailed tutorial, see: [one-click model quantization tool](../../../../../tools/quantization/)
## FastDeploy one-click automated model compression tool
FastDeploy provides a one-click automated model compression tool that quantizes a model from just one configuration file.
For a detailed tutorial, see: [one-click automated model compression tool](../../../../../tools/auto_compression/)
## Download the quantized PP-YOLOE-l model
Users can also directly download and deploy the quantized models in the table below.
| Model | Inference backend | Hardware | FP32 latency (ms) | INT8 latency (ms) | Speedup | FP32 mAP | INT8 mAP | Quantization method |
| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |
| [ppyoloe_crn_l_300e_coco](https://bj.bcebos.com/paddlehub/fastdeploy/ppyoloe_crn_l_300e_coco_qat.tar) | TensorRT | GPU | 43.83 | 31.57 | 1.39 | 51.4 | 50.7 | Quantized distillation training |
| [ppyoloe_crn_l_300e_coco](https://bj.bcebos.com/paddlehub/fastdeploy/ppyoloe_crn_l_300e_coco_qat.tar) | ONNX Runtime | CPU | 1085.18 | 475.55 | 2.29 | 51.4 | 50.0 | Quantized distillation training |
Users can also directly download and deploy the quantized models in the table below (click the model name to download).
Notes on the benchmark tables:
- Runtime latency is the model's inference latency on each Runtime, including the CPU->GPU data copy, GPU inference, and GPU->CPU data copy time, but not the model's pre/post-processing time.
- End-to-end latency is the model's latency in a real inference scenario, including its pre/post-processing.
- All latencies are averages over 1000 inference runs, in milliseconds.
- INT8 + FP16 means the FP16 inference option is enabled on the Runtime while running the INT8 quantized model.
- INT8 + FP16 + PM means pinned memory is additionally enabled on top of INT8 + FP16, which speeds up GPU->CPU data copies.
- Max speedup is the FP32 latency divided by the fastest INT8 latency.
- For the quantized distillation training strategy, the quantized model is trained on a small amount of unlabeled data, and accuracy is validated on the full validation set; the INT8 accuracy shown is therefore not the best achievable INT8 accuracy.
- CPU: Intel(R) Xeon(R) Gold 6271C, with CPU threads fixed to 1 in all tests. GPU: Tesla T4, TensorRT 8.4.15.
#### Runtime Benchmark
| Model | Inference backend | Hardware | FP32 Runtime latency | INT8 Runtime latency | INT8 + FP16 Runtime latency | INT8+FP16+PM Runtime latency | Max speedup | FP32 mAP | INT8 mAP | Quantization method |
| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |----- |----- |
| [ppyoloe_crn_l_300e_coco](https://bj.bcebos.com/paddlehub/fastdeploy/ppyoloe_crn_l_300e_coco_qat.tar) | TensorRT | GPU | 27.90 | 6.39 |6.44|5.95 | 4.67 | 51.4 | 50.7 | Quantized distillation training |
| [ppyoloe_crn_l_300e_coco](https://bj.bcebos.com/paddlehub/fastdeploy/ppyoloe_crn_l_300e_coco_qat.tar) | Paddle-TensorRT | GPU | 30.89 |None | 13.78 |14.01 | 2.24 | 51.4 | 50.5 | Quantized distillation training |
| [ppyoloe_crn_l_300e_coco](https://bj.bcebos.com/paddlehub/fastdeploy/ppyoloe_crn_l_300e_coco_qat.tar) | ONNX Runtime | CPU | 1057.82 | 449.52 |None|None | 2.35 |51.4 | 50.0 | Quantized distillation training |
NOTE:
- TensorRT is faster than Paddle-TensorRT because the multiclass_nms3 operator is removed at runtime
#### End-to-end Benchmark
| Model | Inference backend | Hardware | FP32 end-to-end latency | INT8 end-to-end latency | INT8 + FP16 end-to-end latency | INT8+FP16+PM end-to-end latency | Max speedup | FP32 mAP | INT8 mAP | Quantization method |
| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |----- |----- |
| [ppyoloe_crn_l_300e_coco](https://bj.bcebos.com/paddlehub/fastdeploy/ppyoloe_crn_l_300e_coco_qat.tar) | TensorRT | GPU | 35.75 | 15.42 |20.70|20.85 | 2.32 | 51.4 | 50.7 | Quantized distillation training |
| [ppyoloe_crn_l_300e_coco](https://bj.bcebos.com/paddlehub/fastdeploy/ppyoloe_crn_l_300e_coco_qat.tar) | Paddle-TensorRT | GPU | 33.48 |None | 18.47 |18.03 | 1.81 | 51.4 | 50.5 | Quantized distillation training |
| [ppyoloe_crn_l_300e_coco](https://bj.bcebos.com/paddlehub/fastdeploy/ppyoloe_crn_l_300e_coco_qat.tar) | ONNX Runtime | CPU | 1067.17 | 461.037 |None|None | 2.31 |51.4 | 50.0 | Quantized distillation training |
The data above compare end-to-end inference performance in FastDeploy deployments before and after quantization.
- Test images are from the COCO val2017 set.
- Inference latency is the average end-to-end latency (including pre/post-processing), in milliseconds.
- CPU: Intel(R) Xeon(R) Gold 6271C; GPU: Tesla T4; TensorRT 8.4.15; the number of CPU threads is fixed to 1 in all tests.
## Detailed deployment documents
@@ -9,7 +9,7 @@
### Quantized model preparation
- 1. Users can directly deploy the quantized models provided by FastDeploy.
- 2. Users can quantize a model themselves with FastDeploy's [one-click model quantization tool](../../../../../../tools/quantization/) and deploy the result. (Note: inference with the quantized model still requires the infer_cfg.yml file from the FP32 model folder; a self-quantized model folder does not contain this yaml file, so copy it from the FP32 model folder into the quantized model folder.)
- 2. Users can quantize a model themselves with FastDeploy's [one-click automated model compression tool](../../tools/auto_compression/) and deploy the result. (Note: inference with the quantized model still requires the infer_cfg.yml file from the FP32 model folder; a self-quantized model folder does not contain this yaml file, so copy it from the FP32 model folder into the quantized model folder.)
## Deployment example with the quantized PP-YOLOE-l model
Run the following commands in this directory to build and deploy the quantized model.
@@ -30,4 +30,6 @@ wget https://gitee.com/paddlepaddle/PaddleDetection/raw/release/2.4/demo/0000000
./infer_ppyoloe_demo ppyoloe_crn_l_300e_coco_qat 000000014439.jpg 0
# Run the quantized model on GPU with TensorRT
./infer_ppyoloe_demo ppyoloe_crn_l_300e_coco_qat 000000014439.jpg 1
# Run the quantized model on GPU with Paddle-TensorRT
./infer_ppyoloe_demo ppyoloe_crn_l_300e_coco_qat 000000014439.jpg 2
```
@@ -71,7 +71,15 @@ int main(int argc, char* argv[]) {
option.UseTrtBackend();
option.SetTrtInputShape("inputs",{1, 3, 640, 640});
option.SetTrtInputShape("scale_factor",{1,2});
} else if (flag == 2) {
option.UseGpu();
option.UseTrtBackend();
option.EnablePaddleToTrt();
}
else if (flag == 3) {
option.UseCpu();
option.UsePaddleBackend();
}
std::string model_dir = argv[1];
std::string test_image = argv[2];
@@ -8,7 +8,7 @@
### Quantized model preparation
- 1. Users can directly deploy the quantized models provided by FastDeploy.
- 2. Users can quantize a model themselves with FastDeploy's [one-click model quantization tool](../../../../../../tools/quantization/) and deploy the result. (Note: inference with the quantized model still requires the infer_cfg.yml file from the FP32 model folder; a self-quantized model folder does not contain this yaml file, so copy it from the FP32 model folder into the quantized model folder.)
- 2. Users can quantize a model themselves with FastDeploy's [one-click automated model compression tool](../../tools/auto_compression/) and deploy the result. (Note: inference with the quantized model still requires the infer_cfg.yml file from the FP32 model folder; a self-quantized model folder does not contain this yaml file, so copy it from the FP32 model folder into the quantized model folder.)
## Deployment example with the quantized PP-YOLOE-l model
@@ -26,4 +26,6 @@ wget https://gitee.com/paddlepaddle/PaddleDetection/raw/release/2.4/demo/0000000
python infer_ppyoloe.py --model ppyoloe_crn_l_300e_coco_qat --image 000000014439.jpg --device cpu --backend ort
# Run the quantized model on GPU with TensorRT
python infer_ppyoloe.py --model ppyoloe_crn_l_300e_coco_qat --image 000000014439.jpg --device gpu --backend trt
# Run the quantized model on GPU with Paddle-TensorRT
python infer_ppyoloe.py --model ppyoloe_crn_l_300e_coco_qat --image 000000014439.jpg --device gpu --backend pptrt
```
@@ -49,6 +49,11 @@ def build_option(args):
option.set_trt_cache_file(os.path.join(args.model, "model.trt"))
option.set_trt_input_shape("image", min_shape=[1, 3, 640, 640])
option.set_trt_input_shape("scale_factor", min_shape=[1, 2])
elif args.backend.lower() == "pptrt":
assert args.device.lower(
) == "gpu", "TensorRT backend require inference on device GPU."
option.use_trt_backend()
option.enable_paddle_to_trt()
elif args.backend.lower() == "ort":
option.use_ort_backend()
elif args.backend.lower() == "paddle":
@@ -1,22 +1,42 @@
# YOLOv5 quantized model deployment
FastDeploy supports deploying quantized models and provides a one-click model quantization tool.
Users can quantize models themselves with the one-click quantization tool and deploy the result, or directly download and deploy the quantized models provided by FastDeploy.
FastDeploy supports deploying quantized models and provides a one-click automated model compression tool.
Users can quantize models themselves with the one-click automated compression tool and deploy the result, or directly download and deploy the quantized models provided by FastDeploy.
## FastDeploy one-click model quantization tool
FastDeploy provides a one-click quantization tool that quantizes a model from just one configuration file.
For a detailed tutorial, see: [one-click model quantization tool](../../../../../tools/quantization/)
## FastDeploy one-click automated model compression tool
FastDeploy provides a one-click automated model compression tool that quantizes a model from just one configuration file.
For a detailed tutorial, see: [one-click automated model compression tool](../../../../../tools/auto_compression/)
## Download the quantized YOLOv5s model
Users can also directly download and deploy the quantized models in the table below.
| Model | Inference backend | Hardware | FP32 latency (ms) | INT8 latency (ms) | Speedup | FP32 mAP | INT8 mAP | Quantization method |
| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |
| [YOLOv5s](https://bj.bcebos.com/paddlehub/fastdeploy/yolov5s_quant.tar) | TensorRT | GPU | 14.13 | 11.22 | 1.26 | 37.6 | 36.6 | Quantized distillation training |
| [YOLOv5s](https://bj.bcebos.com/paddlehub/fastdeploy/yolov5s_quant.tar) | Paddle Inference | CPU | 226.36 | 152.27 | 1.48 | 37.6 | 36.8 | Quantized distillation training |
Users can also directly download and deploy the quantized models in the table below (click the model name to download).
Notes on the benchmark tables:
- Runtime latency is the model's inference latency on each Runtime, including the CPU->GPU data copy, GPU inference, and GPU->CPU data copy time, but not the model's pre/post-processing time.
- End-to-end latency is the model's latency in a real inference scenario, including its pre/post-processing.
- All latencies are averages over 1000 inference runs, in milliseconds.
- INT8 + FP16 means the FP16 inference option is enabled on the Runtime while running the INT8 quantized model.
- INT8 + FP16 + PM means pinned memory is additionally enabled on top of INT8 + FP16, which speeds up GPU->CPU data copies.
- Max speedup is the FP32 latency divided by the fastest INT8 latency.
- For the quantized distillation training strategy, the quantized model is trained on a small amount of unlabeled data, and accuracy is validated on the full validation set; the INT8 accuracy shown is therefore not the best achievable INT8 accuracy.
- CPU: Intel(R) Xeon(R) Gold 6271C, with CPU threads fixed to 1 in all tests. GPU: Tesla T4, TensorRT 8.4.15.
#### Runtime Benchmark
| Model | Inference backend | Hardware | FP32 Runtime latency | INT8 Runtime latency | INT8 + FP16 Runtime latency | INT8+FP16+PM Runtime latency | Max speedup | FP32 mAP | INT8 mAP | Quantization method |
| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |----- |----- |
| [YOLOv5s](https://bj.bcebos.com/paddlehub/fastdeploy/yolov5s_quant.tar) | TensorRT | GPU | 7.87 | 4.51 | 4.31 | 3.17 | 2.48 | 37.6 | 36.7 | Quantized distillation training |
| [YOLOv5s](https://bj.bcebos.com/paddlehub/fastdeploy/yolov5s_quant.tar) | Paddle-TensorRT | GPU | 7.99 | None | 4.46 | 3.31 | 2.41 | 37.6 | 36.8 | Quantized distillation training |
| [YOLOv5s](https://bj.bcebos.com/paddlehub/fastdeploy/yolov5s_quant.tar) | ONNX Runtime | CPU | 176.41 | 91.90 | None | None | 1.90 | 37.6 | 33.1 | Quantized distillation training |
| [YOLOv5s](https://bj.bcebos.com/paddlehub/fastdeploy/yolov5s_quant.tar) | Paddle Inference| CPU | 213.73 | 130.19 | None | None | 1.64 |37.6 | 35.2 | Quantized distillation training |
#### End-to-end Benchmark
| Model | Inference backend | Hardware | FP32 end-to-end latency | INT8 end-to-end latency | INT8 + FP16 end-to-end latency | INT8+FP16+PM end-to-end latency | Max speedup | FP32 mAP | INT8 mAP | Quantization method |
| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |----- |----- |
| [YOLOv5s](https://bj.bcebos.com/paddlehub/fastdeploy/yolov5s_quant.tar) | TensorRT | GPU | 24.61 | 21.20 | 20.78 | 20.94 | 1.18 | 37.6 | 36.7 | Quantized distillation training |
| [YOLOv5s](https://bj.bcebos.com/paddlehub/fastdeploy/yolov5s_quant.tar) | Paddle-TensorRT | GPU | 23.53 | None | 21.98 | 19.84 | 1.28 | 37.6 | 36.8 | Quantized distillation training |
| [YOLOv5s](https://bj.bcebos.com/paddlehub/fastdeploy/yolov5s_quant.tar) | ONNX Runtime | CPU | 197.323 | 110.99 | None | None | 1.78 | 37.6 | 33.1 | Quantized distillation training |
| [YOLOv5s](https://bj.bcebos.com/paddlehub/fastdeploy/yolov5s_quant.tar) | Paddle Inference| CPU | 235.73 | 144.82 | None | None | 1.63 |37.6 | 35.2 | Quantized distillation training |
The data above compare end-to-end inference performance in FastDeploy deployments before and after quantization.
- Test images are from the COCO val2017 set.
- Inference latency is the average end-to-end latency (including pre/post-processing), in milliseconds.
- CPU: Intel(R) Xeon(R) Gold 6271C; GPU: Tesla T4; TensorRT 8.4.15; the number of CPU threads is fixed to 1 in all tests.
## Detailed deployment documents
@@ -9,7 +9,7 @@
### Quantized model preparation
- 1. Users can directly deploy the quantized models provided by FastDeploy.
- 2. Users can quantize a model themselves with FastDeploy's [one-click model quantization tool](../../../../../../tools/quantization/) and deploy the result.
- 2. Users can quantize a model themselves with FastDeploy's [one-click automated model compression tool](../../tools/auto_compression/) and deploy the result.
## Deployment example with the quantized YOLOv5s model
Run the following commands in this directory to build and deploy the quantized model.
@@ -31,4 +31,6 @@ wget https://gitee.com/paddlepaddle/PaddleDetection/raw/release/2.4/demo/0000000
./infer_demo yolov5s_quant 000000014439.jpg 0
# Run the quantized model on GPU with TensorRT
./infer_demo yolov5s_quant 000000014439.jpg 1
# Run the quantized model on GPU with Paddle-TensorRT
./infer_demo yolov5s_quant 000000014439.jpg 2
```
@@ -68,7 +68,11 @@ int main(int argc, char* argv[]) {
} else if (flag == 1) {
option.UseGpu();
option.UseTrtBackend();
}
} else if (flag == 2) {
option.UseGpu();
option.UseTrtBackend();
option.EnablePaddleToTrt();
}
std::string model_dir = argv[1];
std::string test_image = argv[2];
@@ -8,7 +8,7 @@
### Quantized model preparation
- 1. Users can directly deploy the quantized models provided by FastDeploy.
- 2. Users can quantize a model themselves with FastDeploy's [one-click model quantization tool](../../../../../../tools/quantization/) and deploy the result.
- 2. Users can quantize a model themselves with FastDeploy's [one-click automated model compression tool](../../tools/auto_compression/) and deploy the result.
## Deployment example with the quantized YOLOv5s model
@@ -26,4 +26,6 @@ wget https://gitee.com/paddlepaddle/PaddleDetection/raw/release/2.4/demo/0000000
python infer.py --model yolov5s_quant --image 000000014439.jpg --device cpu --backend paddle
# Run the quantized model on GPU with TensorRT
python infer.py --model yolov5s_quant --image 000000014439.jpg --device gpu --backend trt
# Run the quantized model on GPU with Paddle-TensorRT
python infer.py --model yolov5s_quant --image 000000014439.jpg --device gpu --backend pptrt
```
@@ -47,6 +47,11 @@ def build_option(args):
assert args.device.lower(
) == "gpu", "TensorRT backend require inference on device GPU."
option.use_trt_backend()
elif args.backend.lower() == "pptrt":
assert args.device.lower(
) == "gpu", "TensorRT backend require inference on device GPU."
option.use_trt_backend()
option.enable_paddle_to_trt()
elif args.backend.lower() == "ort":
option.use_ort_backend()
elif args.backend.lower() == "paddle":
@@ -1,23 +1,42 @@
# YOLOv6 quantized model deployment
FastDeploy supports deploying quantized models and provides a one-click model quantization tool.
Users can quantize models themselves with the one-click quantization tool and deploy the result, or directly download and deploy the quantized models provided by FastDeploy.
## FastDeploy one-click model quantization tool
FastDeploy provides a one-click quantization tool that quantizes a model from just one configuration file.
For a detailed tutorial, see: [one-click model quantization tool](../../../../../tools/quantization/)
FastDeploy supports deploying quantized models and provides a one-click automated model compression tool.
Users can quantize models themselves with the one-click automated compression tool and deploy the result, or directly download and deploy the quantized models provided by FastDeploy.
## FastDeploy one-click automated model compression tool
FastDeploy provides a one-click automated model compression tool that quantizes a model from just one configuration file.
For a detailed tutorial, see: [one-click automated model compression tool](../../../../../tools/auto_compression/)
## Download the quantized YOLOv6s model
Users can also directly download and deploy the quantized models in the table below.
Users can also directly download and deploy the quantized models in the table below (click the model name to download).
Notes on the benchmark tables:
- Runtime latency is the model's inference latency on each Runtime, including the CPU->GPU data copy, GPU inference, and GPU->CPU data copy time, but not the model's pre/post-processing time.
- End-to-end latency is the model's latency in a real inference scenario, including its pre/post-processing.
- All latencies are averages over 1000 inference runs, in milliseconds.
- INT8 + FP16 means the FP16 inference option is enabled on the Runtime while running the INT8 quantized model.
- INT8 + FP16 + PM means pinned memory is additionally enabled on top of INT8 + FP16, which speeds up GPU->CPU data copies.
- Max speedup is the FP32 latency divided by the fastest INT8 latency.
- For the quantized distillation training strategy, the quantized model is trained on a small amount of unlabeled data, and accuracy is validated on the full validation set; the INT8 accuracy shown is therefore not the best achievable INT8 accuracy.
- CPU: Intel(R) Xeon(R) Gold 6271C, with CPU threads fixed to 1 in all tests. GPU: Tesla T4, TensorRT 8.4.15.
#### Runtime Benchmark
| Model | Inference backend | Hardware | FP32 Runtime latency | INT8 Runtime latency | INT8 + FP16 Runtime latency | INT8+FP16+PM Runtime latency | Max speedup | FP32 mAP | INT8 mAP | Quantization method |
| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |----- |----- |
| [YOLOv6s](https://bj.bcebos.com/paddlehub/fastdeploy/yolov6s_ptq_model.tar) | TensorRT | GPU | 9.47 | 3.23 | 4.09 |2.81 | 3.37 | 42.5 | 40.7 | Quantized distillation training |
| [YOLOv6s](https://bj.bcebos.com/paddlehub/fastdeploy/yolov6s_ptq_model.tar) | Paddle-TensorRT | GPU | 9.31 | None| 4.17 | 2.95 | 3.16 | 42.5 | 40.7 | Quantized distillation training |
| [YOLOv6s](https://bj.bcebos.com/paddlehub/fastdeploy/yolov6s_ptq_model.tar) | ONNX Runtime | CPU | 334.65 | 126.38 | None | None| 2.65 |42.5| 36.8 | Quantized distillation training |
| [YOLOv6s](https://bj.bcebos.com/paddlehub/fastdeploy/yolov6s_ptq_model.tar) | Paddle Inference | CPU | 352.87 | 123.12 |None | None| 2.87 |42.5| 40.8 | Quantized distillation training |
#### End-to-end Benchmark
| Model | Inference backend | Hardware | FP32 end-to-end latency | INT8 end-to-end latency | INT8 + FP16 end-to-end latency | INT8+FP16+PM end-to-end latency | Max speedup | FP32 mAP | INT8 mAP | Quantization method |
| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |----- |----- |
| [YOLOv6s](https://bj.bcebos.com/paddlehub/fastdeploy/yolov6s_ptq_model.tar) | TensorRT | GPU | 15.66 | 11.30 | 10.25 |9.59 | 1.63 | 42.5 | 40.7 | Quantized distillation training |
| [YOLOv6s](https://bj.bcebos.com/paddlehub/fastdeploy/yolov6s_ptq_model.tar) | Paddle-TensorRT | GPU | 15.03 | None| 11.36 | 9.32 | 1.61 | 42.5 | 40.7 | Quantized distillation training |
| [YOLOv6s](https://bj.bcebos.com/paddlehub/fastdeploy/yolov6s_ptq_model.tar) | ONNX Runtime | CPU | 348.21 | 126.38 | None | None| 2.82 |42.5| 36.8 | Quantized distillation training |
| [YOLOv6s](https://bj.bcebos.com/paddlehub/fastdeploy/yolov6s_ptq_model.tar) | Paddle Inference | CPU | 352.87 | 121.64 |None | None| 3.04 |42.5| 40.8 | Quantized distillation training |
| Model | Inference backend | Hardware | FP32 latency (ms) | INT8 latency (ms) | Speedup | FP32 mAP | INT8 mAP | Quantization method |
| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- | ------ |
| [YOLOv6s](https://bj.bcebos.com/paddlehub/fastdeploy/yolov6s_quant.tar) | TensorRT | GPU | 12.89 | 8.92 | 1.45 | 42.5 | 40.6 | Quantized distillation training |
| [YOLOv6s](https://bj.bcebos.com/paddlehub/fastdeploy/yolov6s_quant.tar) | Paddle Inference | CPU | 366.41 | 131.70 | 2.78 |42.5| 41.2 | Quantized distillation training |
The data above compare end-to-end inference performance in FastDeploy deployments before and after quantization.
- Test images are from the COCO val2017 set.
- Inference latency is the average end-to-end latency (including pre/post-processing), in milliseconds.
- CPU: Intel(R) Xeon(R) Gold 6271C; GPU: Tesla T4; TensorRT 8.4.15; the number of CPU threads is fixed to 1 in all tests.
## Detailed deployment documents
@@ -9,7 +9,7 @@
### Quantized model preparation
- 1. Users can directly deploy the quantized models provided by FastDeploy.
- 2. Users can quantize a model themselves with FastDeploy's [one-click model quantization tool](../../../../../../tools/quantization/) and deploy the result.
- 2. Users can quantize a model themselves with FastDeploy's [one-click automated model compression tool](../../tools/auto_compression/) and deploy the result.
## Deployment example with the quantized YOLOv6s model
Run the following commands in this directory to build and deploy the quantized model.
@@ -22,13 +22,15 @@ cmake .. -DFASTDEPLOY_INSTALL_DIR=${PWD}/fastdeploy-linux-x64-0.4.0
make -j
# Download the yolov6s quantized model files and test image provided by FastDeploy
wget https://bj.bcebos.com/paddlehub/fastdeploy/yolov6s_quant.tar
tar -xvf yolov6s_quant.tar
wget https://bj.bcebos.com/paddlehub/fastdeploy/yolov6s_qat_model.tar
tar -xvf yolov6s_qat_model.tar
wget https://gitee.com/paddlepaddle/PaddleDetection/raw/release/2.4/demo/000000014439.jpg
# Run the quantized model on CPU with Paddle Inference
./infer_demo yolov6s_quant 000000014439.jpg 0
./infer_demo yolov6s_qat_model 000000014439.jpg 0
# Run the quantized model on GPU with TensorRT
./infer_demo yolov6s_quant 000000014439.jpg 1
./infer_demo yolov6s_qat_model 000000014439.jpg 1
# Run the quantized model on GPU with Paddle-TensorRT
./infer_demo yolov6s_qat_model 000000014439.jpg 2
```
@@ -68,7 +68,11 @@ int main(int argc, char* argv[]) {
} else if (flag == 1) {
option.UseGpu();
option.UseTrtBackend();
}
} else if (flag == 2) {
option.UseGpu();
option.UseTrtBackend();
option.EnablePaddleToTrt();
}
std::string model_dir = argv[1];
std::string test_image = argv[2];
@@ -8,7 +8,7 @@
### Quantized model preparation
- 1. Users can directly deploy the quantized models provided by FastDeploy.
- 2. Users can quantize a model themselves with FastDeploy's [one-click model quantization tool](../../../../../../tools/quantization/) and deploy the result.
- 2. Users can quantize a model themselves with FastDeploy's [one-click automated model compression tool](../../tools/auto_compression/) and deploy the result.
## Deployment example with the quantized YOLOv6s model
```bash
@@ -17,12 +17,14 @@ git clone https://github.com/PaddlePaddle/FastDeploy.git
cd examples/slim/yolov6/python
# Download the yolov6s quantized model files and test image provided by FastDeploy
wget https://bj.bcebos.com/paddlehub/fastdeploy/yolov6s_quant.tar
tar -xvf yolov6s_quant.tar
wget https://bj.bcebos.com/paddlehub/fastdeploy/yolov6s_qat_model.tar
tar -xvf yolov6s_qat_model.tar
wget https://gitee.com/paddlepaddle/PaddleDetection/raw/release/2.4/demo/000000014439.jpg
# Run the quantized model on CPU with Paddle Inference
python infer.py --model yolov6s_quant --image 000000014439.jpg --device cpu --backend paddle
python infer.py --model yolov6s_qat_model --image 000000014439.jpg --device cpu --backend paddle
# Run the quantized model on GPU with TensorRT
python infer.py --model yolov6s_quant --image 000000014439.jpg --device gpu --backend trt
python infer.py --model yolov6s_qat_model --image 000000014439.jpg --device gpu --backend trt
# Run the quantized model on GPU with Paddle-TensorRT
python infer.py --model yolov6s_qat_model --image 000000014439.jpg --device gpu --backend pptrt
```
@@ -47,6 +47,11 @@ def build_option(args):
assert args.device.lower(
) == "gpu", "TensorRT backend require inference on device GPU."
option.use_trt_backend()
elif args.backend.lower() == "pptrt":
assert args.device.lower(
) == "gpu", "TensorRT backend require inference on device GPU."
option.use_trt_backend()
option.enable_paddle_to_trt()
elif args.backend.lower() == "ort":
option.use_ort_backend()
elif args.backend.lower() == "paddle":
@@ -1,23 +1,40 @@
# YOLOv7 quantized model deployment
FastDeploy supports deploying quantized models and provides a one-click model quantization tool.
Users can quantize models themselves with the one-click quantization tool and deploy the result, or directly download and deploy the quantized models provided by FastDeploy.
FastDeploy supports deploying quantized models and provides a one-click automated model compression tool.
Users can quantize models themselves with the one-click automated compression tool and deploy the result, or directly download and deploy the quantized models provided by FastDeploy.
## FastDeploy one-click model quantization tool
FastDeploy provides a one-click quantization tool that quantizes a model from just one configuration file.
For a detailed tutorial, see: [one-click model quantization tool](../../../../../tools/quantization/)
## FastDeploy one-click automated model compression tool
FastDeploy provides a one-click automated model compression tool that quantizes a model from just one configuration file.
For a detailed tutorial, see: [one-click automated model compression tool](../../../../../tools/auto_compression/)
## Download the quantized YOLOv7 model
Users can also directly download and deploy the quantized models in the table below.
Users can also directly download and deploy the quantized models in the table below (click the model name to download).
| Model | Inference backend | Hardware | FP32 latency (ms) | INT8 latency (ms) | Speedup | FP32 mAP | INT8 mAP | Quantization method |
| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |
| [YOLOv7](https://bj.bcebos.com/paddlehub/fastdeploy/yolov7_quant.tar) | TensorRT | GPU | 30.43 | 15.40 | 1.98 | 51.1| 50.8 | Quantized distillation training |
| [YOLOv7](https://bj.bcebos.com/paddlehub/fastdeploy/yolov7_quant.tar) | Paddle Inference | CPU | 1015.70 | 562.41 | 1.82 |51.1 | 46.3 | Quantized distillation training |
The data above compare end-to-end inference performance in FastDeploy deployments before and after quantization.
- Test images are from the COCO val2017 set.
- Inference latency is the average end-to-end latency (including pre/post-processing), in milliseconds.
- CPU: Intel(R) Xeon(R) Gold 6271C; GPU: Tesla T4; TensorRT 8.4.15; the number of CPU threads is fixed to 1 in all tests.
Notes on the benchmark tables:
- Runtime latency is the model's inference latency on each Runtime, including the CPU->GPU data copy, GPU inference, and GPU->CPU data copy time, but not the model's pre/post-processing time.
- End-to-end latency is the model's latency in a real inference scenario, including its pre/post-processing.
- All latencies are averages over 1000 inference runs, in milliseconds.
- INT8 + FP16 means the FP16 inference option is enabled on the Runtime while running the INT8 quantized model.
- INT8 + FP16 + PM means pinned memory is additionally enabled on top of INT8 + FP16, which speeds up GPU->CPU data copies.
- Max speedup is the FP32 latency divided by the fastest INT8 latency.
- For the quantized distillation training strategy, the quantized model is trained on a small amount of unlabeled data, and accuracy is validated on the full validation set; the INT8 accuracy shown is therefore not the best achievable INT8 accuracy.
- CPU: Intel(R) Xeon(R) Gold 6271C, with CPU threads fixed to 1 in all tests. GPU: Tesla T4, TensorRT 8.4.15.
#### Runtime Benchmark
| Model | Inference backend | Hardware | FP32 Runtime latency | INT8 Runtime latency | INT8 + FP16 Runtime latency | INT8+FP16+PM Runtime latency | Max speedup | FP32 mAP | INT8 mAP | Quantization method |
| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |----- |----- |
| [YOLOv7](https://bj.bcebos.com/paddlehub/fastdeploy/yolov7_quant.tar) | TensorRT | GPU | 27.47 | 6.52 | 6.74| 5.19| 5.29 | 51.1| 50.4 | Quantized distillation training |
| [YOLOv7](https://bj.bcebos.com/paddlehub/fastdeploy/yolov7_quant.tar) | Paddle-TensorRT | GPU | 27.87|None|6.91|5.86 | 4.76 | 51.1| 50.4 | Quantized distillation training |
| [YOLOv7](https://bj.bcebos.com/paddlehub/fastdeploy/yolov7_quant.tar) | ONNX Runtime | CPU | 996.65 | 467.15 |None|None | 2.13 | 51.1 | 43.3 | Quantized distillation training |
| [YOLOv7](https://bj.bcebos.com/paddlehub/fastdeploy/yolov7_quant.tar) | Paddle Inference | CPU | 995.85 | 477.93|None|None | 2.08 |51.1 | 46.2 | Quantized distillation training |
#### End-to-end Benchmark
| Model | Inference backend | Hardware | FP32 end-to-end latency | INT8 end-to-end latency | INT8 + FP16 end-to-end latency | INT8+FP16+PM end-to-end latency | Max speedup | FP32 mAP | INT8 mAP | Quantization method |
| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |----- |----- |
| [YOLOv7](https://bj.bcebos.com/paddlehub/fastdeploy/yolov7_quant.tar) | TensorRT | GPU | 36.47 | 18.81 | 20.33| 17.58| 2.07 | 51.1| 50.4 | Quantized distillation training |
| [YOLOv7](https://bj.bcebos.com/paddlehub/fastdeploy/yolov7_quant.tar) | Paddle-TensorRT | GPU | 37.06|None|20.26|17.53 | 2.11 | 51.1| 50.4 | Quantized distillation training |
| [YOLOv7](https://bj.bcebos.com/paddlehub/fastdeploy/yolov7_quant.tar) | ONNX Runtime | CPU | 988.85 | 478.08 |None|None | 2.07 | 51.1 | 43.3 | Quantized distillation training |
| [YOLOv7](https://bj.bcebos.com/paddlehub/fastdeploy/yolov7_quant.tar) | Paddle Inference | CPU | 1031.73 | 500.12|None|None | 2.06 |51.1 | 46.2 | Quantized distillation training |
## Detailed deployment documents
@@ -9,7 +9,7 @@
### Quantized model preparation
- 1. Users can directly deploy the quantized models provided by FastDeploy.
- 2. Users can quantize a model themselves with FastDeploy's [one-click model quantization tool](../../../../../../tools/quantization/) and deploy the result.
- 2. Users can quantize a model themselves with FastDeploy's [one-click automated model compression tool](../../tools/auto_compression/) and deploy the result.
## Deployment example with the quantized YOLOv7 model
Run the following commands in this directory to build and deploy the quantized model.
@@ -31,4 +31,6 @@ wget https://gitee.com/paddlepaddle/PaddleDetection/raw/release/2.4/demo/0000000
./infer_demo yolov7_quant 000000014439.jpg 0
# Run the quantized model on GPU with TensorRT
./infer_demo yolov7_quant 000000014439.jpg 1
# Run the quantized model on GPU with Paddle-TensorRT
./infer_demo yolov7_quant 000000014439.jpg 2
```
@@ -68,7 +68,11 @@ int main(int argc, char* argv[]) {
} else if (flag == 1) {
option.UseGpu();
option.UseTrtBackend();
}
} else if (flag == 2) {
option.UseGpu();
option.UseTrtBackend();
option.EnablePaddleToTrt();
}
std::string model_dir = argv[1];
std::string test_image = argv[2];
@@ -8,7 +8,7 @@
### Quantized Model Preparation
- 1. Users can directly deploy the quantized models provided by FastDeploy.
- 2. Users can quantize models themselves with FastDeploy's [one-click model quantization tool](../../../../../../tools/quantization/) and deploy the resulting quantized models.
- 2. Users can quantize models themselves with FastDeploy's [one-click automatic model compression tool](../../tools/auto_compression/) and deploy the resulting quantized models.
## Deploying the Quantized YOLOv7 Model as an Example
```bash
@@ -25,4 +25,6 @@ wget https://gitee.com/paddlepaddle/PaddleDetection/raw/release/2.4/demo/0000000
python infer.py --model yolov7_quant --image 000000014439.jpg --device cpu --backend paddle
# Run the quantized model with TensorRT on GPU
python infer.py --model yolov7_quant --image 000000014439.jpg --device gpu --backend trt
# Run the quantized model with Paddle-TensorRT on GPU
python infer.py --model yolov7_quant --image 000000014439.jpg --device gpu --backend pptrt
```
@@ -47,6 +47,11 @@ def build_option(args):
assert args.device.lower(
) == "gpu", "TensorRT backend requires inference on device GPU."
option.use_trt_backend()
elif args.backend.lower() == "pptrt":
assert args.device.lower(
) == "gpu", "TensorRT backend requires inference on device GPU."
option.use_trt_backend()
option.enable_paddle_to_trt()
elif args.backend.lower() == "ort":
option.use_ort_backend()
elif args.backend.lower() == "paddle":
+5 -1
View File
@@ -11,4 +11,8 @@ include_directories(${FASTDEPLOY_INCS})
add_executable(infer_demo ${PROJECT_SOURCE_DIR}/infer.cc)
# FastDeploy
target_link_libraries(infer_demo ${FASTDEPLOY_LIBS} gflags pthread)
if(UNIX AND (NOT APPLE) AND (NOT ANDROID))
target_link_libraries(infer_demo ${FASTDEPLOY_LIBS} gflags pthread)
else()
target_link_libraries(infer_demo ${FASTDEPLOY_LIBS} gflags)
endif()
+7 -7
View File
@@ -35,9 +35,9 @@ void PrintUsage() {
}
bool CreateRuntimeOption(fastdeploy::RuntimeOption* option) {
if (FLAG_device == "gpu") {
if (FLAGS_device == "gpu") {
option->UseGpu();
if (FLAG_backend == "ort") {
if (FLAGS_backend == "ort") {
option->UseOrtBackend();
} else if (FLAGS_backend == "paddle") {
option->UsePaddleBackend();
@@ -54,24 +54,24 @@ bool CreateRuntimeOption(fastdeploy::RuntimeOption* option) {
} else if (FLAGS_backend == "default") {
return true;
} else {
std::cout << "While inference with GPU, only support default/ort/paddle/trt/paddle_trt now, " << FLAG_backend << " is not supported." << std::endl;
std::cout << "While inference with GPU, only support default/ort/paddle/trt/paddle_trt now, " << FLAGS_backend << " is not supported." << std::endl;
return false;
}
} else if (FLAG_device == "cpu") {
} else if (FLAGS_device == "cpu") {
if (FLAGS_backend == "ort") {
option->UseOrtBackend();
} else if (FLAGS_backend == "ov") {
option->UseOpenVINOBackend();
} else if (FLAGS_backend == "paddle") {
option->UsePaddleBackend();
} else if (FLAGS_backend = "default") {
} else if (FLAGS_backend == "default") {
return true;
} else {
std::cout << "While inference with CPU, only support default/ort/ov/paddle now, " << FLAG_backend << " is not supported." << std::endl;
std::cout << "While inference with CPU, only support default/ort/ov/paddle now, " << FLAGS_backend << " is not supported." << std::endl;
return false;
}
} else {
std::cerr << "Only support device CPU/GPU now, " << FLAG_device << " is not supported." << std::endl;
std::cerr << "Only support device CPU/GPU now, " << FLAGS_device << " is not supported." << std::endl;
return false;
}
-1
View File
@@ -16,7 +16,6 @@ cd FastDeploy/examples/vision/facealign/pfld/python
## Original ONNX model
wget https://bj.bcebos.com/paddlehub/fastdeploy/pfld-106-lite.onnx
wget https://bj.bcebos.com/paddlehub/fastdeploy/facealign_input.png
# CPU inference
python infer.py --model pfld-106-lite.onnx --image facealign_input.png --device cpu
# GPU inference
+3 -3
View File
@@ -17,11 +17,11 @@ def parse_arguments():
parser.add_argument(
"--backend",
type=str,
default="ort",
help="inference backend, ort, ov, trt, paddle, paddle_trt.")
default="default",
help="inference backend, default, ort, ov, trt, paddle, paddle_trt.")
parser.add_argument(
"--enable_trt_fp16",
type=bool,
type=ast.literal_eval,
default=False,
help="whether enable fp16 in trt/paddle_trt backend")
return parser.parse_args()
+7
View File
@@ -0,0 +1,7 @@
# Head Pose Models
FastDeploy currently supports the deployment of the following head pose models
| Model | Description | Model Format | Version |
| :--- | :--- | :------- | :--- |
| [omasaht/headpose-fsanet-pytorch](./fsanet) | FSANet series models | ONNX | [CommitID:002549c](https://github.com/omasaht/headpose-fsanet-pytorch/commit/002549c) |
+25
View File
@@ -0,0 +1,25 @@
# FSANet Model Deployment
## Model Version
- [FSANet](https://github.com/omasaht/headpose-fsanet-pytorch/commit/002549c)
## Supported Model List
FastDeploy currently supports the deployment of the following models
- [FSANet models](https://github.com/omasaht/headpose-fsanet-pytorch)
## Download Pre-trained Models
For developers' convenience, the exported FSANet models are provided below and can be downloaded directly.
| Model | Parameter Size | Accuracy | Notes |
|:---------------------------------------------------------------- |:----- |:----- | :------ |
| [fsanet-1x1.onnx](https://bj.bcebos.com/paddlehub/fastdeploy/fsanet-1x1.onnx) | 1.2MB | - | - |
| [fsanet-var.onnx](https://bj.bcebos.com/paddlehub/fastdeploy/fsanet-var.onnx) | 1.2MB | - | - |
## Detailed Deployment Documentation
- [Python Deployment](python)
- [C++ Deployment](cpp)
+18
View File
@@ -0,0 +1,18 @@
PROJECT(infer_demo C CXX)
CMAKE_MINIMUM_REQUIRED (VERSION 3.10)
# Locate the downloaded FastDeploy SDK
option(FASTDEPLOY_INSTALL_DIR "Path of downloaded fastdeploy sdk.")
include(${FASTDEPLOY_INSTALL_DIR}/utils/gflags.cmake)
include(${FASTDEPLOY_INSTALL_DIR}/FastDeploy.cmake)
# Add FastDeploy headers to the include path
include_directories(${FASTDEPLOY_INCS})
add_executable(infer_demo ${PROJECT_SOURCE_DIR}/infer.cc)
# Link against FastDeploy (pthread is required on Linux)
if(UNIX AND (NOT APPLE) AND (NOT ANDROID))
target_link_libraries(infer_demo ${FASTDEPLOY_LIBS} gflags pthread)
else()
target_link_libraries(infer_demo ${FASTDEPLOY_LIBS} gflags)
endif()
@@ -0,0 +1,74 @@
# FSANet C++ Deployment Example
This directory provides `infer.cc`, an example that quickly deploys FSANet on CPU/GPU, and on GPU with TensorRT acceleration.
Before deployment, confirm the following two steps
- 1. The hardware and software environment meets the requirements; see [FastDeploy Environment Requirements](../../../../../docs/cn/build_and_install/download_prebuilt_libraries.md)
- 2. Download the prebuilt deployment library and sample code for your development environment; see [FastDeploy Prebuilt Libraries](../../../../../docs/cn/build_and_install/download_prebuilt_libraries.md)
Taking CPU inference on Linux as an example, run the following commands in this directory to complete the build and test. FastDeploy 0.6.0 or above (x.x.x >= 0.6.0) is required for FSANet support
```bash
mkdir build
cd build
wget https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-linux-x64-x.x.x.tgz
tar xvf fastdeploy-linux-x64-x.x.x.tgz
cmake .. -DFASTDEPLOY_INSTALL_DIR=${PWD}/fastdeploy-linux-x64-x.x.x
make -j
# Download the officially converted FSANet model file and a test image
wget https://bj.bcebos.com/paddlehub/fastdeploy/fsanet-var.onnx
wget https://bj.bcebos.com/paddlehub/fastdeploy/headpose_input.png
# CPU inference
./infer_demo --model fsanet-var.onnx --image headpose_input.png --device cpu
# GPU inference
./infer_demo --model fsanet-var.onnx --image headpose_input.png --device gpu
# TensorRT inference on GPU
./infer_demo --model fsanet-var.onnx --image headpose_input.png --device gpu --backend trt
```
The visualized result is shown below
<div width="520">
<img width="500" height="514" float="left" src="https://user-images.githubusercontent.com/19977378/198279932-3eee424e-98a2-4249-bdeb-0f79127cbc9d.png">
</div>
The commands above apply only to Linux or macOS. For using the SDK on Windows, refer to:
- [How to use the FastDeploy C++ SDK on Windows](../../../../../docs/cn/faq/use_sdk_on_windows.md)
## FSANet C++ Interface
### FSANet Class
```c++
fastdeploy::vision::headpose::FSANet(
const string& model_file,
const string& params_file = "",
const RuntimeOption& runtime_option = RuntimeOption(),
const ModelFormat& model_format = ModelFormat::ONNX)
```
Loads and initializes the FSANet model, where model_file is the exported model in ONNX format.
**Parameters**
> * **model_file**(str): Path to the model file
> * **params_file**(str): Path to the parameters file; pass an empty string when the model format is ONNX
> * **runtime_option**(RuntimeOption): Backend inference configuration; if not set, the default configuration is used
> * **model_format**(ModelFormat): Model format, ONNX by default
#### Predict Function
> ```c++
> FSANet::Predict(cv::Mat* im, HeadPoseResult* result)
> ```
>
> Model prediction interface: takes an input image and directly outputs the head pose prediction.
>
> **Parameters**
>
> > * **im**: Input image; note it must be in HWC, BGR format
> > * **result**: Head pose prediction result; see [Vision Model Prediction Results](../../../../../docs/api/vision_results/) for the HeadPoseResult description
### Class Member Variables
Users can modify the following preprocessing parameters to suit their needs, which affects the final inference and deployment results
> > * **size**(vector&lt;int&gt;): Changes the resize dimensions used during preprocessing; contains two integers, [width, height]. Default is [112, 112]
- [Model Introduction](../../)
- [Python Deployment](../python)
- [Vision Model Prediction Results](../../../../../docs/api/vision_results/)
- [How to switch the model inference backend](../../../../../docs/cn/faq/how_to_change_backend.md)
@@ -0,0 +1,110 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "fastdeploy/vision.h"
#include "gflags/gflags.h"
DEFINE_string(model, "", "Directory of the inference model.");
DEFINE_string(image, "", "Path of the image file.");
DEFINE_string(device, "cpu",
"Type of inference device, support 'cpu' or 'gpu'.");
DEFINE_string(backend, "default",
"The inference runtime backend, support: ['default', 'ort', "
"'paddle', 'ov', 'trt', 'paddle_trt']");
DEFINE_bool(use_fp16, false, "Whether to use FP16 mode, only support 'trt' and 'paddle_trt' backend");
void PrintUsage() {
std::cout << "Usage: infer_demo --model model_path --image img_path --device [cpu|gpu] --backend "
"[default|ort|paddle|ov|trt|paddle_trt] "
"--use_fp16 false"
<< std::endl;
std::cout << "Default value of device: cpu" << std::endl;
std::cout << "Default value of backend: default" << std::endl;
std::cout << "Default value of use_fp16: false" << std::endl;
}
bool CreateRuntimeOption(fastdeploy::RuntimeOption* option) {
if (FLAGS_device == "gpu") {
option->UseGpu();
if (FLAGS_backend == "ort") {
option->UseOrtBackend();
} else if (FLAGS_backend == "paddle") {
option->UsePaddleBackend();
} else if (FLAGS_backend == "trt" ||
FLAGS_backend == "paddle_trt") {
option->UseTrtBackend();
option->SetTrtInputShape("images", {1, 3, 64, 64});
if (FLAGS_backend == "paddle_trt") {
option->EnablePaddleToTrt();
}
if (FLAGS_use_fp16) {
option->EnableTrtFP16();
}
} else if (FLAGS_backend == "default") {
return true;
} else {
std::cout << "While inference with GPU, only support default/ort/paddle/trt/paddle_trt now, " << FLAGS_backend << " is not supported." << std::endl;
return false;
}
} else if (FLAGS_device == "cpu") {
if (FLAGS_backend == "ort") {
option->UseOrtBackend();
} else if (FLAGS_backend == "ov") {
option->UseOpenVINOBackend();
} else if (FLAGS_backend == "paddle") {
option->UsePaddleBackend();
} else if (FLAGS_backend == "default") {
return true;
} else {
std::cout << "While inference with CPU, only support default/ort/ov/paddle now, " << FLAGS_backend << " is not supported." << std::endl;
return false;
}
} else {
std::cerr << "Only support device CPU/GPU now, " << FLAGS_device << " is not supported." << std::endl;
return false;
}
return true;
}
int main(int argc, char* argv[]) {
google::ParseCommandLineFlags(&argc, &argv, true);
auto option = fastdeploy::RuntimeOption();
if (!CreateRuntimeOption(&option)) {
PrintUsage();
return -1;
}
auto model = fastdeploy::vision::headpose::FSANet(FLAGS_model, "", option);
if (!model.Initialized()) {
std::cerr << "Failed to initialize." << std::endl;
return -1;
}
auto im = cv::imread(FLAGS_image);
auto im_bak = im.clone();
fastdeploy::vision::HeadPoseResult res;
if (!model.Predict(&im, &res)) {
std::cerr << "Failed to predict." << std::endl;
return -1;
}
std::cout << res.Str() << std::endl;
auto vis_im = fastdeploy::vision::VisHeadPose(im_bak, res);
cv::imwrite("vis_result.jpg", vis_im);
std::cout << "Visualized result saved in ./vis_result.jpg" << std::endl;
return 0;
}
@@ -0,0 +1,67 @@
# FSANet Python Deployment Example
Before deployment, confirm the following two steps
- 1. The hardware and software environment meets the requirements; see [FastDeploy Environment Requirements](../../../../../docs/cn/build_and_install/download_prebuilt_libraries.md)
- 2. Install the FastDeploy Python whl package; see [FastDeploy Python Installation](../../../../../docs/cn/build_and_install/download_prebuilt_libraries.md)
This directory provides `infer.py`, an example that quickly deploys FSANet on CPU/GPU, and on GPU with TensorRT acceleration. FastDeploy >= 0.6.0 is required for FSANet support. Run the following script to complete deployment
```bash
# Download the deployment example code
git clone https://github.com/PaddlePaddle/FastDeploy.git
cd FastDeploy/examples/vision/headpose/fsanet/python
# Download the FSANet model file and a test image
## Original ONNX model
wget https://bj.bcebos.com/paddlehub/fastdeploy/fsanet-var.onnx
wget https://bj.bcebos.com/paddlehub/fastdeploy/headpose_input.png
# CPU inference
python infer.py --model fsanet-var.onnx --image headpose_input.png --device cpu
# GPU inference
python infer.py --model fsanet-var.onnx --image headpose_input.png --device gpu
# TensorRT inference
python infer.py --model fsanet-var.onnx --image headpose_input.png --device gpu --backend trt
```
The visualized result is shown below
<div width="520">
<img width="500" height="514" float="left" src="https://user-images.githubusercontent.com/19977378/198279932-3eee424e-98a2-4249-bdeb-0f79127cbc9d.png">
</div>
## FSANet Python Interface
```python
fd.vision.headpose.FSANet(model_file, params_file=None, runtime_option=None, model_format=ModelFormat.ONNX)
```
Loads and initializes the FSANet model, where model_file is the exported model in ONNX format
**Parameters**
> * **model_file**(str): Path to the model file
> * **params_file**(str): Path to the parameters file; no need to set it when the model is in ONNX format
> * **runtime_option**(RuntimeOption): Backend inference configuration; None uses the default configuration
> * **model_format**(ModelFormat): Model format, ONNX by default
### predict Function
> ```python
> FSANet.predict(input_image)
> ```
>
> Model prediction interface: takes an input image and directly outputs the head pose prediction.
>
> **Parameters**
>
> > * **input_image**(np.ndarray): Input data; note it must be in HWC, BGR format
> **Returns**
>
> > Returns a `fastdeploy.vision.HeadPoseResult` struct; see [Vision Model Prediction Results](../../../../../docs/api/vision_results/) for its description
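A minimal usage sketch tying the interface above together, using the model and image downloaded in the example at the top of this page:
```python
import cv2
import fastdeploy as fd

# Load the ONNX model with the default RuntimeOption.
model = fd.vision.headpose.FSANet("fsanet-var.onnx")
# Input must be HWC, BGR -- cv2.imread returns exactly that.
im = cv2.imread("headpose_input.png")
result = model.predict(im)  # returns fastdeploy.vision.HeadPoseResult
print(result)
```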
## Other Documentation
- [FSANet Model Introduction](..)
- [FSANet C++ Deployment](../cpp)
- [Model Prediction Results](../../../../../docs/api/vision_results/)
- [How to switch the model inference backend](../../../../../docs/cn/faq/how_to_change_backend.md)
@@ -0,0 +1,88 @@
import fastdeploy as fd
import cv2
import os
def parse_arguments():
import argparse
import ast
parser = argparse.ArgumentParser()
parser.add_argument("--model", required=True, help="Path of FSANet model.")
parser.add_argument("--image", type=str, help="Path of test image file.")
parser.add_argument(
"--device",
type=str,
default='cpu',
help="Type of inference device, support 'cpu' or 'gpu'.")
parser.add_argument(
"--backend",
type=str,
default="default",
help="inference backend, default, ort, ov, trt, paddle, paddle_trt.")
parser.add_argument(
"--enable_trt_fp16",
type=ast.literal_eval,
default=False,
help="whether enable fp16 in trt/paddle_trt backend")
return parser.parse_args()
def build_option(args):
option = fd.RuntimeOption()
device = args.device
backend = args.backend
enable_trt_fp16 = args.enable_trt_fp16
if device == "gpu":
option.use_gpu()
if backend == "ort":
option.use_ort_backend()
elif backend == "paddle":
option.use_paddle_backend()
elif backend in ["trt", "paddle_trt"]:
option.use_trt_backend()
option.set_trt_input_shape("input", [1, 3, 64, 64])
if backend == "paddle_trt":
option.enable_paddle_to_trt()
if enable_trt_fp16:
option.enable_trt_fp16()
elif backend == "default":
return option
else:
raise Exception(
"While inference with GPU, only support default/ort/paddle/trt/paddle_trt now, {} is not supported.".
format(backend))
elif device == "cpu":
if backend == "ort":
option.use_ort_backend()
elif backend == "ov":
option.use_openvino_backend()
elif backend == "paddle":
option.use_paddle_backend()
elif backend == "default":
return option
else:
raise Exception(
"While inference with CPU, only support default/ort/ov/paddle now, {} is not supported.".
format(backend))
else:
raise Exception(
"Only support device CPU/GPU now, {} is not supported.".format(
device))
return option
args = parse_arguments()
# Configure the runtime and load the model
runtime_option = build_option(args)
model = fd.vision.headpose.FSANet(args.model, runtime_option=runtime_option)
# for image
im = cv2.imread(args.image)
result = model.predict(im.copy())
print(result)
# Visualize the result
vis_im = fd.vision.vis_headpose(im, result)
cv2.imwrite("visualized_result.jpg", vis_im)
print("Visualized result saved in ./visualized_result.jpg")
@@ -0,0 +1,36 @@
# PaddleSeg Quantized Model Deployment
FastDeploy supports deploying quantized models and provides a one-click automatic model compression tool.
Users can quantize models themselves with the one-click automatic model compression tool and deploy the result, or directly download the quantized models provided by FastDeploy for deployment.
## FastDeploy One-Click Automatic Model Compression Tool
FastDeploy provides a one-click automatic model compression tool that quantizes a model from a single configuration file.
For a detailed tutorial, see: [One-Click Automatic Model Compression Tool](../../../../../tools/auto_compression/)
Note: A quantized segmentation model still needs the deploy.yaml file from the FP32 model folder. A self-quantized model folder does not contain this yaml file; simply copy it over from the FP32 model folder, for example as sketched below.
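A minimal Python sketch of that copy step (both directory names are placeholders for your own FP32 and quantized model folders):
```python
import shutil

# deploy.yaml lives in the FP32 model folder; the quantized folder needs it too.
shutil.copy("PP_LiteSeg_FP32_infer/deploy.yaml",
            "PP_LiteSeg_quant_infer/deploy.yaml")
```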
## Download Quantized PaddleSeg Models
Users can also directly download the quantized models in the table below for deployment. (Click a model name to download.)
Notes on the benchmark tables:
- Runtime latency is the model's inference latency on each runtime, including CPU->GPU data copy, GPU inference, and GPU->CPU data copy. It excludes each model's pre- and post-processing time.
- End-to-end latency is the model's latency in a real inference scenario, including pre- and post-processing.
- All latencies are averages over 1000 inference runs, in milliseconds.
- INT8 + FP16 means the FP16 inference option is enabled in the runtime while running the INT8 quantized model.
- INT8 + FP16 + PM means that, in addition to INT8 and FP16, the pinned-memory option is enabled, which speeds up GPU->CPU data copies.
- Max speedup is the FP32 latency divided by the fastest INT8 latency.
- When the strategy is quantization-aware distillation training, the quantized model is trained on a small unlabeled dataset and its accuracy is verified on the full validation set; the reported INT8 accuracy is not necessarily the best achievable.
- CPU is an Intel(R) Xeon(R) Gold 6271C with the thread count fixed to 1 in all tests; GPU is a Tesla T4 with TensorRT 8.4.15.
#### Runtime Benchmark
| Model | Inference Backend | Hardware | FP32 Runtime Latency | INT8 Runtime Latency | INT8 + FP16 Runtime Latency | INT8 + FP16 + PM Runtime Latency | Max Speedup | FP32 mIoU | INT8 mIoU | Quantization Method |
| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |----- |----- |
| [PP-LiteSeg-T(STDC1)-cityscapes](https://bj.bcebos.com/paddlehub/fastdeploy/PP_LiteSeg_T_STDC1_cityscapes_without_argmax_infer_QAT_new.tar) | Paddle Inference | CPU | 1138.04| 602.62 |None|None | 1.89 |77.37 | 71.62 | Quantization-aware distillation training |
#### End-to-End Benchmark
| Model | Inference Backend | Hardware | FP32 End-to-End Latency | INT8 End-to-End Latency | INT8 + FP16 End-to-End Latency | INT8 + FP16 + PM End-to-End Latency | Max Speedup | FP32 mIoU | INT8 mIoU | Quantization Method |
| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |----- |----- |
| [PP-LiteSeg-T(STDC1)-cityscapes](https://bj.bcebos.com/paddlehub/fastdeploy/PP_LiteSeg_T_STDC1_cityscapes_without_argmax_infer_QAT_new.tar) | Paddle Inference | CPU | 4726.65| 4134.91|None|None | 1.14 |77.37 | 71.62 | Quantization-aware distillation training |
## Detailed Deployment Documentation
- [Python Deployment](python)
- [C++ Deployment](cpp)
@@ -0,0 +1,14 @@
PROJECT(infer_demo C CXX)
CMAKE_MINIMUM_REQUIRED (VERSION 3.12)
# Locate the downloaded FastDeploy SDK
option(FASTDEPLOY_INSTALL_DIR "Path of downloaded fastdeploy sdk.")
include(${FASTDEPLOY_INSTALL_DIR}/FastDeploy.cmake)
# Add FastDeploy headers to the include path
include_directories(${FASTDEPLOY_INCS})
add_executable(infer_demo ${PROJECT_SOURCE_DIR}/infer.cc)
# Link against the FastDeploy libraries
target_link_libraries(infer_demo ${FASTDEPLOY_LIBS})
@@ -0,0 +1,30 @@
# PaddleSeg Quantized Model C++ Deployment Example
This directory provides `infer.cc` to help users quickly deploy PaddleSeg quantized models on CPU/GPU for accelerated inference.
## Deployment Preparation
### FastDeploy Environment Setup
- 1. The hardware and software environment meets the requirements; see [FastDeploy Environment Requirements](../../../../../../docs/cn/build_and_install/download_prebuilt_libraries.md)
- 2. Install the FastDeploy Python whl package; see [FastDeploy Python Installation](../../../../../../docs/cn/build_and_install/download_prebuilt_libraries.md)
### Quantized Model Preparation
- 1. Users can directly deploy the quantized models provided by FastDeploy.
- 2. Users can quantize models themselves with FastDeploy's [one-click automatic model compression tool](../../tools/auto_compression/) and deploy the resulting quantized models. (Note: a quantized segmentation model still needs the deploy.yaml file from the FP32 model folder; a self-quantized model folder does not contain it, so copy it over from the FP32 model folder.)
## Deploying the Quantized PP_LiteSeg_T_STDC1_cityscapes Model as an Example
Run the following commands in this directory to complete the build and deploy the quantized model.
```bash
mkdir build
cd build
wget https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-linux-x64-0.3.0.tgz
tar xvf fastdeploy-linux-x64-0.3.0.tgz
cmake .. -DFASTDEPLOY_INSTALL_DIR=${PWD}/fastdeploy-linux-x64-0.3.0
make -j
# Download the PP_LiteSeg_T_STDC1_cityscapes quantized model file and a test image provided by FastDeploy
wget https://bj.bcebos.com/paddlehub/fastdeploy/PP_LiteSeg_T_STDC1_cityscapes_without_argmax_infer_PTQ.tar
tar -xvf PP_LiteSeg_T_STDC1_cityscapes_without_argmax_infer_PTQ.tar
wget https://paddleseg.bj.bcebos.com/dygraph/demo/cityscapes_demo.png
# Run the quantized model with Paddle Inference on CPU
./infer_demo PP_LiteSeg_T_STDC1_cityscapes_without_argmax_infer_PTQ cityscapes_demo.png 1
```
@@ -0,0 +1,100 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "fastdeploy/vision.h"
#ifdef WIN32
const char sep = '\\';
#else
const char sep = '/';
#endif
void InitAndInfer(const std::string& model_dir, const std::string& image_file,
const fastdeploy::RuntimeOption& option) {
auto model_file = model_dir + sep + "model.pdmodel";
auto params_file = model_dir + sep + "model.pdiparams";
auto config_file = model_dir + sep + "deploy.yaml";
auto model = fastdeploy::vision::segmentation::PaddleSegModel(
model_file, params_file, config_file, option);
assert(model.Initialized());
auto im = cv::imread(image_file);
auto im_bak = im.clone();
fastdeploy::vision::SegmentationResult res;
if (!model.Predict(&im, &res)) {
std::cerr << "Failed to predict." << std::endl;
return;
}
std::cout << res.Str() << std::endl;
}
int main(int argc, char* argv[]) {
if (argc < 4) {
std::cout << "Usage: infer_demo path/to/quant_model "
"path/to/image "
"run_option, "
"e.g ./infer_demo ./PP_LiteSeg_T_STDC1_cityscapes_without_argmax_infer_PTQ ./test.jpeg 0"
<< std::endl;
std::cout << "The data type of run_option is int, 0: run on cpu with ORT "
"backend; 1: run "
"on cpu with Paddle Inference backend. "
<< std::endl;
return -1;
}
fastdeploy::RuntimeOption option;
int flag = std::atoi(argv[3]);
if (flag == 0) {
option.UseCpu();
option.UseOrtBackend();
std::cout << "Use ORT!" << std::endl;
} else if (flag == 1) {
option.UseCpu();
option.UsePaddleBackend();
std::cout << "Use Paddle Inference!" << std::endl;
}
std::string model_dir = argv[1];
std::string test_image = argv[2];
InitAndInfer(model_dir, test_image, option);
return 0;
}
@@ -0,0 +1,28 @@
# PaddleSeg Quantized Model Python Deployment Example
This directory provides `infer.py` to help users quickly deploy PaddleSeg quantized models on CPU/GPU for accelerated inference.
## Deployment Preparation
### FastDeploy Environment Setup
- 1. The hardware and software environment meets the requirements; see [FastDeploy Environment Requirements](../../../../../../docs/cn/build_and_install/download_prebuilt_libraries.md)
- 2. Install the FastDeploy Python whl package; see [FastDeploy Python Installation](../../../../../../docs/cn/build_and_install/download_prebuilt_libraries.md)
### Quantized Model Preparation
- 1. Users can directly deploy the quantized models provided by FastDeploy.
- 2. Users can quantize models themselves with FastDeploy's [one-click automatic model compression tool](../../tools/auto_compression/) and deploy the resulting quantized models. (Note: a quantized segmentation model still needs the deploy.yaml file from the FP32 model folder; a self-quantized model folder does not contain it, so copy it over from the FP32 model folder.)
## Deploying the Quantized PP_LiteSeg_T_STDC1_cityscapes Model as an Example
```bash
# Download the deployment example code
git clone https://github.com/PaddlePaddle/FastDeploy.git
cd FastDeploy/examples/vision/segmentation/paddleseg/quantize/python
# Download the PP_LiteSeg_T_STDC1_cityscapes quantized model file and a test image provided by FastDeploy
wget https://bj.bcebos.com/paddlehub/fastdeploy/PP_LiteSeg_T_STDC1_cityscapes_without_argmax_infer_PTQ.tar
tar -xvf PP_LiteSeg_T_STDC1_cityscapes_without_argmax_infer_PTQ.tar
wget https://paddleseg.bj.bcebos.com/dygraph/demo/cityscapes_demo.png
# Run the quantized model with Paddle Inference on CPU
python infer.py --model PP_LiteSeg_T_STDC1_cityscapes_without_argmax_infer_PTQ --image cityscapes_demo.png --device cpu --backend paddle
```
@@ -0,0 +1,76 @@
import fastdeploy as fd
import cv2
import os
def parse_arguments():
import argparse
import ast
parser = argparse.ArgumentParser()
parser.add_argument(
"--model", required=True, help="Path of PaddleSeg model.")
parser.add_argument(
"--image", required=True, help="Path of test image file.")
parser.add_argument(
"--device",
type=str,
default='cpu',
help="Type of inference device, support 'cpu' or 'gpu'.")
parser.add_argument(
"--backend",
type=str,
default="default",
help="Type of inference backend, support ort/trt/paddle/openvino."
)
parser.add_argument(
"--device_id",
type=int,
default=0,
help="Define which GPU card used to run model.")
parser.add_argument(
"--cpu_thread_num",
type=int,
default=9,
help="Number of threads while inference on CPU.")
return parser.parse_args()
def build_option(args):
option = fd.RuntimeOption()
if args.device.lower() == "gpu":
option.use_gpu(0)
option.set_cpu_thread_num(args.cpu_thread_num)
if args.backend.lower() == "trt":
assert args.device.lower(
) == "gpu", "TensorRT backend requires inference on device GPU."
option.use_trt_backend()
option.set_trt_cache_file(os.path.join(args.model, "model.trt"))
option.set_trt_input_shape("x", [1, 3, 256, 256], [1, 3, 1024, 1024],
[1, 3, 2048, 2048])
elif args.backend.lower() == "ort":
option.use_ort_backend()
elif args.backend.lower() == "paddle":
option.use_paddle_backend()
elif args.backend.lower() == "openvino":
assert args.device.lower(
) == "cpu", "OpenVINO backend requires inference on device CPU."
option.use_openvino_backend()
return option
args = parse_arguments()
# Configure the runtime and load the model
runtime_option = build_option(args)
model_file = os.path.join(args.model, "model.pdmodel")
params_file = os.path.join(args.model, "model.pdiparams")
config_file = os.path.join(args.model, "deploy.yaml")
model = fd.vision.segmentation.PaddleSegModel(
model_file, params_file, config_file, runtime_option=runtime_option)
# Predict the segmentation result for the image
im = cv2.imread(args.image)
result = model.predict(im.copy())
print(result)
@@ -4,49 +4,103 @@
- [PaddleSeg develop](https://github.com/PaddlePaddle/PaddleSeg/tree/develop)
FastDeploy currently supports the deployment of the following models
FastDeploy currently supports deploying the following PPSeg models with RKNPU2 inference:
- [U-Net series models](https://github.com/PaddlePaddle/PaddleSeg/blob/release/2.6/configs/unet/README.md)
- [PP-LiteSeg series models](https://github.com/PaddlePaddle/PaddleSeg/blob/release/2.6/configs/pp_liteseg/README.md)
- [PP-HumanSeg series models](https://github.com/PaddlePaddle/PaddleSeg/blob/release/2.6/contrib/PP-HumanSeg/README.md)
- [FCN series models](https://github.com/PaddlePaddle/PaddleSeg/blob/release/2.6/configs/fcn/README.md)
- [DeepLabV3 series models](https://github.com/PaddlePaddle/PaddleSeg/blob/release/2.6/configs/deeplabv3/README.md)
[Note] If you are deploying **PP-Matting**, **PP-HumanMatting**, or **ModNet**, refer to [Matting Model Deployment](../../matting)
| Model | Parameter File Size | Input Shape | mIoU | mIoU (flip) | mIoU (ms+flip) |
|:---------------------------------------------------------------------------------------------------------------------------------------------|:-------|:---------|:-------|:------------|:---------------|
| [Unet-cityscapes](https://bj.bcebos.com/paddlehub/fastdeploy/Unet_cityscapes_without_argmax_infer.tgz) | 52MB | 1024x512 | 65.00% | 66.02% | 66.89% |
| [PP-LiteSeg-T(STDC1)-cityscapes](https://bj.bcebos.com/paddlehub/fastdeploy/PP_LiteSeg_T_STDC1_cityscapes_without_argmax_infer.tgz) | 31MB | 1024x512 | 77.04% | 77.73% | 77.46% |
| [PP-HumanSegV1-Lite (general portrait segmentation model)](https://bj.bcebos.com/paddlehub/fastdeploy/PP_HumanSegV1_Lite_infer.tgz) | 543KB | 192x192 | 86.2% | - | - |
| [PP-HumanSegV2-Lite (general portrait segmentation model)](https://bj.bcebos.com/paddle2onnx/libs/PP_HumanSegV2_Lite_192x192_infer.tgz) | 12MB | 192x192 | 92.52% | - | - |
| [PP-HumanSegV2-Mobile (general portrait segmentation model)](https://bj.bcebos.com/paddlehub/fastdeploy/PP_HumanSegV2_Mobile_192x192_infer.tgz) | 29MB | 192x192 | 93.13% | - | - |
| [PP-HumanSegV1-Server (general portrait segmentation model)](https://bj.bcebos.com/paddlehub/fastdeploy/PP_HumanSegV1_Server_infer.tgz) | 103MB | 512x512 | 96.47% | - | - |
| [Portait-PP-HumanSegV2_Lite (portrait segmentation model)](https://bj.bcebos.com/paddlehub/fastdeploy/Portrait_PP_HumanSegV2_Lite_256x144_infer.tgz) | 3.6MB | 256x144 | 96.63% | - | - |
| [FCN-HRNet-W18-cityscapes](https://bj.bcebos.com/paddlehub/fastdeploy/FCN_HRNet_W18_cityscapes_without_argmax_infer.tgz) | 37MB | 1024x512 | 78.97% | 79.49% | 79.74% |
| [Deeplabv3-ResNet101-OS8-cityscapes](https://bj.bcebos.com/paddlehub/fastdeploy/Deeplabv3_ResNet101_OS8_cityscapes_without_argmax_infer.tgz) | 150MB | 1024x512 | 79.90% | 80.22% | 80.47% |
## Prepare and Convert PaddleSeg Deployment Models
Before deploying on RKNPU, the Paddle model must be converted to an RKNN model; the steps are as follows:
* Convert the Paddle dynamic-graph model to an ONNX model; see the [PaddleSeg model export guide](https://github.com/PaddlePaddle/PaddleSeg/tree/release/2.6/contrib/PP-HumanSeg)
* For converting the ONNX model to an RKNN model, see the [conversion guide](../../../../../docs/cn/faq/rknpu2/export.md).
Before deploying on RKNPU, the model must be converted to an RKNN model; the process can generally be simplified to:
* Paddle dynamic-graph model -> ONNX model -> RKNN model.
* For converting the Paddle dynamic-graph model to an ONNX model, see the ([PaddleSeg model export guide](https://github.com/PaddlePaddle/PaddleSeg/tree/release/2.6/contrib/PP-HumanSeg)).
* For converting the ONNX model to an RKNN model, see the [conversion guide](../../../../../docs/cn/faq/rknpu2/export.md).
Taking PPHumanSeg as an example, after obtaining the ONNX model, the steps to convert it for RK3588 are as follows:
* Write the config.yaml file
```yaml
model_path: ./portrait_pp_humansegv2_lite_256x144_pretrained.onnx
output_folder: ./
target_platform: RK3588
normalize:
mean: [0.5,0.5,0.5]
std: [0.5,0.5,0.5]
outputs: None
```
* Run the conversion script
```bash
python /path/to/fastdeploy/tools/rknpu2/export.py --config_path=/path/to/fastdeploy/tools/rknpu2/config/ppset_config.yaml
```
## Download Pre-trained Models
## Model Conversion Example
For developers' convenience, some models exported by PaddleSeg are provided below (exported with `--input_shape`, `--output_op none`, and `--without_argmax`); developers can download and use them directly
The following takes Portait-PP-HumanSegV2_Lite (portrait segmentation model) as an example to show how to convert a PPSeg model into an RKNN model
```bash
# Clone the Paddle2ONNX repository
git clone https://github.com/PaddlePaddle/Paddle2ONNX
| Task | Model | Model Version (tested versions) | Size | ONNX/RKNN Supported | ONNX/RKNN Speed (ms) |
|------------------|-------------------|--------------------------------------------------------------------------------------------------------------------------------------------|-----|---------------|-----------------|
| Segmentation | PP-LiteSeg | [PP_LiteSeg_T_STDC1_cityscapes](https://bj.bcebos.com/fastdeploy/models/rknn2/PP_LiteSeg_T_STDC1_cityscapes_without_argmax_infer_3588.tgz) | - | True/True | 6634/5598 |
| Segmentation | PP-HumanSegV2Lite | [portrait](https://bj.bcebos.com/fastdeploy/models/rknn2/portrait_pp_humansegv2_lite_256x144_inference_model_without_softmax_3588.tgz) | - | True/True | 456/266 |
| Segmentation | PP-HumanSegV2Lite | [human](https://bj.bcebos.com/fastdeploy/models/rknn2/human_pp_humansegv2_lite_192x192_pretrained_3588.tgz) | - | True/True | 496/256 |
# Download the Paddle static-graph model and fix its input shape
## Enter the directory used to fix the static-graph model's input shape
cd Paddle2ONNX/tools/paddle
## Download and extract the Paddle static-graph model
wget https://bj.bcebos.com/paddlehub/fastdeploy/Portrait_PP_HumanSegV2_Lite_256x144_infer.tgz
tar xvf Portrait_PP_HumanSegV2_Lite_256x144_infer.tgz
python paddle_infer_shape.py --model_dir Portrait_PP_HumanSegV2_Lite_256x144_infer/ \
--model_filename model.pdmodel \
--params_filename model.pdiparams \
--save_dir Portrait_PP_HumanSegV2_Lite_256x144_infer \
--input_shape_dict="{'x':[1,3,144,256]}"
# Convert the static-graph model to ONNX; note that save_file must match the archive name
paddle2onnx --model_dir Portrait_PP_HumanSegV2_Lite_256x144_infer \
--model_filename model.pdmodel \
--params_filename model.pdiparams \
--save_file Portrait_PP_HumanSegV2_Lite_256x144_infer/Portrait_PP_HumanSegV2_Lite_256x144_infer.onnx \
--enable_dev_version True
# Convert the ONNX model to an RKNN model
# Copy the ONNX model directory to the FastDeploy root directory
cp -r ./Portrait_PP_HumanSegV2_Lite_256x144_infer /path/to/Fastdeploy
# Convert the model; the output is generated under the Portrait_PP_HumanSegV2_Lite_256x144_infer directory
python tools/rknpu2/export.py --config_path tools/rknpu2/config/RK3588/Portrait_PP_HumanSegV2_Lite_256x144_infer.yaml
```
## Modify the yaml Configuration File
In the **model conversion example** above we fixed the model's input shape, so the corresponding yaml file must be modified as well (a scripted version of this edit is sketched after the modified yaml file below):
**Original yaml file**
```yaml
Deploy:
input_shape:
- -1
- 3
- -1
- -1
model: model.pdmodel
output_dtype: float32
output_op: none
params: model.pdiparams
transforms:
- target_size:
- 256
- 144
type: Resize
- type: Normalize
```
**Modified yaml file**
```yaml
Deploy:
input_shape:
- 1
- 3
- 144
- 256
model: model.pdmodel
output_dtype: float32
output_op: none
params: model.pdiparams
transforms:
- target_size:
- 256
- 144
type: Resize
- type: Normalize
```
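If you prefer to script this edit rather than editing by hand, a minimal sketch with PyYAML (an assumption — install it with `pip install pyyaml`) could look like this:
```python
import yaml

# Rewrite Deploy.input_shape in deploy.yaml to the fixed NCHW shape
# (1, 3, 144, 256) used by the RKNN conversion above.
with open("deploy.yaml") as f:
    cfg = yaml.safe_load(f)
cfg["Deploy"]["input_shape"] = [1, 3, 144, 256]
with open("deploy.yaml", "w") as f:
    yaml.safe_dump(cfg, f)
```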
## Detailed Deployment Documentation
- [RKNN overall deployment tutorial](../../../../../docs/cn/faq/rknpu2.md)
- [RKNN overall deployment tutorial](../../../../../docs/cn/faq/rknpu2/rknpu2.md)
- [C++ Deployment](cpp)
- [Python Deployment](python)
- [Python Deployment](python)
@@ -41,13 +41,7 @@ the fastdeploy-0.0.3 directory; move it into the thirdpartys directory.
### Copy the model and configuration files into the model folder
During the Paddle dynamic-graph model -> Paddle static-graph model -> ONNX model conversion, an ONNX file and a corresponding yaml configuration file are generated; place the configuration file in the model folder.
The RKNN-converted model file must also be copied into model. Converted files are provided here; run the following commands to download and use them (the model files are for RK3588; for RK3568 you need to [convert the PPSeg RKNN model](../README.md) again).
```bash
cd model
wget https://bj.bcebos.com/fastdeploy/models/rknn2/human_pp_humansegv2_lite_192x192_pretrained_3588.tgz
tar xvf human_pp_humansegv2_lite_192x192_pretrained_3588.tgz
cp -r ./human_pp_humansegv2_lite_192x192_pretrained_3588 ./model
```
The RKNN-converted model file must also be copied into model; run the following commands to download and use it (the model files are for RK3588; for RK3568 you need to [convert the PPSeg RKNN model](../README.md) again).
### Prepare test images in the image folder
```bash
@@ -81,4 +75,4 @@ RKNPU requires model inputs in NHWC format, and the image normalization step
- [Model Introduction](../../)
- [Python Deployment](../python)
- [Convert PPSeg RKNN Model guide](../README.md)
- [Convert PPSeg RKNN Model guide](../README.md)
@@ -1,3 +1,16 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <iostream>
#include <string>
#include "fastdeploy/vision.h"
@@ -40,11 +53,11 @@ std::string GetModelPath(std::string& model_path, const std::string& device) {
void InferHumanPPHumansegv2Lite(const std::string& device) {
std::string model_file =
"./model/human_pp_humansegv2_lite_192x192_pretrained_3588/"
"human_pp_humansegv2_lite_192x192_pretrained_3588.";
"./model/Portrait_PP_HumanSegV2_Lite_256x144_infer/"
"Portrait_PP_HumanSegV2_Lite_256x144_infer_rk3588.";
std::string params_file;
std::string config_file =
"./model/human_pp_humansegv2_lite_192x192_pretrained_3588/deploy.yaml";
"./model/Portrait_PP_HumanSegV2_Lite_256x144_infer/deploy.yaml";
fastdeploy::RuntimeOption option = GetOption(device);
fastdeploy::ModelFormat format = GetFormat(device);
@@ -2,7 +2,7 @@
Before deployment, confirm the following two steps
- 1. The hardware and software environment meets the requirements; see [FastDeploy Environment Requirements](../../../../../../docs/cn/build_and_install/rknpu2.md)
- 1. The hardware and software environment meets the requirements; see [FastDeploy Environment Requirements](../../../../../../docs/cn/build_and_install/rknpu2.md)
[Note] If you are deploying **PP-Matting**, **PP-HumanMatting**, or **ModNet**, refer to [Matting Model Deployment](../../../matting)
@@ -13,17 +13,13 @@
git clone https://github.com/PaddlePaddle/FastDeploy.git
cd FastDeploy/examples/vision/segmentation/paddleseg/python
# Download the model
wget https://bj.bcebos.com/fastdeploy/models/rknn2/human_pp_humansegv2_lite_192x192_pretrained_3588.tgz
tar xvf human_pp_humansegv2_lite_192x192_pretrained_3588.tgz
# Download test images
wget https://paddleseg.bj.bcebos.com/dygraph/pp_humanseg_v2/images.zip
unzip images.zip
# Inference
python3 infer.py --model_file ./human_pp_humansegv2_lite_192x192_pretrained_3588/human_pp_humansegv2_lite_192x192_pretrained_3588.rknn \
--config_file ./human_pp_humansegv2_lite_192x192_pretrained_3588/deploy.yaml \
python3 infer.py --model_file ./Portrait_PP_HumanSegV2_Lite_256x144_infer/Portrait_PP_HumanSegV2_Lite_256x144_infer_rk3588.rknn \
--config_file ./Portrait_PP_HumanSegV2_Lite_256x144_infer/deploy.yaml \
--image images/portrait_heng.jpg
```
@@ -1,3 +1,16 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import fastdeploy as fd
import cv2
import os
@@ -30,7 +43,11 @@ model_file = args.model_file
params_file = ""
config_file = args.config_file
model = fd.vision.segmentation.PaddleSegModel(
model_file, params_file, config_file, runtime_option=runtime_option,model_format=fd.ModelFormat.RKNN)
model_file,
params_file,
config_file,
runtime_option=runtime_option,
model_format=fd.ModelFormat.RKNN)
model.disable_normalize_and_permute()
@@ -33,25 +33,29 @@ void CpuInfer(const std::string& model_dir, const std::string& video_file) {
}
fastdeploy::vision::MOTResult result;
fastdeploy::vision::tracking::TrailRecorder recorder;
// during each prediction, data is inserted into the recorder. As the number of predictions increases,
// the memory will continue to grow. You can cancel the insertion through 'UnbindRecorder'.
// int count = 0; // unbind condition
model.BindRecorder(&recorder);
cv::Mat frame;
int frame_id=0;
cv::VideoCapture capture(video_file);
// according to the time of prediction to calculate fps
float fps= 0.0f;
while (capture.read(frame)) {
if (frame.empty()) {
break;
break;
}
if (!model.Predict(&frame, &result)) {
std::cerr << "Failed to predict." << std::endl;
return;
std::cerr << "Failed to predict." << std::endl;
return;
}
// e.g., adding the following code cancels the trail data binding
// if(count++ == 10) model.UnbindRecorder();
// std::cout << result.Str() << std::endl;
cv::Mat out_img = fastdeploy::vision::VisMOT(frame, result, fps , frame_id);
cv::Mat out_img = fastdeploy::vision::VisMOT(frame, result, 0.0, &recorder);
cv::imshow("mot",out_img);
cv::waitKey(30);
frame_id++;
}
model.UnbindRecorder();
capture.release();
cv::destroyAllWindows();
}
@@ -72,25 +76,29 @@ void GpuInfer(const std::string& model_dir, const std::string& video_file) {
}
fastdeploy::vision::MOTResult result;
fastdeploy::vision::tracking::TrailRecorder trail_recorder;
// during each prediction, data is inserted into the recorder. As the number of predictions increases,
// the memory will continue to grow. You can cancel the insertion through 'UnbindRecorder'.
// int count = 0; // unbind condition
model.BindRecorder(&trail_recorder);
cv::Mat frame;
int frame_id=0;
cv::VideoCapture capture(video_file);
// according to the time of prediction to calculate fps
float fps= 0.0f;
while (capture.read(frame)) {
if (frame.empty()) {
break;
break;
}
if (!model.Predict(&frame, &result)) {
std::cerr << "Failed to predict." << std::endl;
return;
std::cerr << "Failed to predict." << std::endl;
return;
}
// e.g., adding the following code cancels the trail data binding
//if(count++ == 10) model.UnbindRecorder();
// std::cout << result.Str() << std::endl;
cv::Mat out_img = fastdeploy::vision::VisMOT(frame, result, fps , frame_id);
cv::Mat out_img = fastdeploy::vision::VisMOT(frame, result, 0.0, &trail_recorder);
cv::imshow("mot",out_img);
cv::waitKey(30);
frame_id++;
}
model.UnbindRecorder();
capture.release();
cv::destroyAllWindows();
}
@@ -112,11 +120,13 @@ void TrtInfer(const std::string& model_dir, const std::string& video_file) {
}
fastdeploy::vision::MOTResult result;
fastdeploy::vision::tracking::TrailRecorder recorder;
//during each prediction, data is inserted into the recorder. As the number of predictions increases,
//the memory will continue to grow. You can cancel the insertion through 'UnbindRecorder'.
// int count = 0; // unbind condition
model.BindRecorder(&recorder);
cv::Mat frame;
int frame_id=0;
cv::VideoCapture capture(video_file);
// according to the time of prediction to calculate fps
float fps= 0.0f;
while (capture.read(frame)) {
if (frame.empty()) {
break;
@@ -125,12 +135,14 @@ void TrtInfer(const std::string& model_dir, const std::string& video_file) {
std::cerr << "Failed to predict." << std::endl;
return;
}
// e.g., adding the following code cancels the trail data binding
// if(count++ == 10) model.UnbindRecorder();
// std::cout << result.Str() << std::endl;
cv::Mat out_img = fastdeploy::vision::VisMOT(frame, result, fps , frame_id);
cv::Mat out_img = fastdeploy::vision::VisMOT(frame, result, 0.0, &recorder);
cv::imshow("mot",out_img);
cv::waitKey(30);
frame_id++;
}
model.UnbindRecorder();
capture.release();
cv::destroyAllWindows();
}
@@ -14,7 +14,6 @@
import fastdeploy as fd
import cv2
import time
import os
@@ -60,20 +59,26 @@ config_file = os.path.join(args.model, "infer_cfg.yml")
model = fd.vision.tracking.PPTracking(
model_file, params_file, config_file, runtime_option=runtime_option)
# Initialize the trail recorder
recorder = fd.vision.tracking.TrailRecorder()
# Bind the recorder. Note: each prediction inserts data into trail_recorder,
# so memory keeps growing as predictions accumulate; unbind via unbind_recorder()
model.bind_recorder(recorder)
# Predict tracking results for the video
cap = cv2.VideoCapture(args.video)
frame_id = 0
# count = 0
while True:
start_time = time.time()
frame_id = frame_id+1
_, frame = cap.read()
if frame is None:
break
result = model.predict(frame)
end_time = time.time()
fps = 1.0/(end_time-start_time)
img = fd.vision.vis_mot(frame, result, fps, frame_id)
# count += 1
# if count == 10:
# model.unbind_recorder()
img = fd.vision.vis_mot(frame, result, 0.0, recorder)
cv2.imshow("video", img)
cv2.waitKey(30)
if cv2.waitKey(30) == ord("q"):
break
model.unbind_recorder()
cap.release()
cv2.destroyAllWindows()
+6
View File
@@ -21,6 +21,7 @@
#include "fastdeploy/backends/common/multiclass_nms.h"
#include "fastdeploy/core/fd_tensor.h"
#include "fastdeploy/core/fd_type.h"
namespace fastdeploy {
@@ -63,6 +64,11 @@ class BaseBackend {
virtual std::vector<TensorInfo> GetOutputInfos() = 0;
virtual bool Infer(std::vector<FDTensor>& inputs,
std::vector<FDTensor>* outputs) = 0;
virtual std::unique_ptr<BaseBackend> Clone(void *stream = nullptr,
int device_id = -1) {
FDERROR << "Clone is not supported." << std::endl;
return nullptr;
}
};
} // namespace fastdeploy
+37 -8
View File
@@ -74,6 +74,8 @@ ov::element::Type FDDataTypeToOV(const FDDataType& type) {
return ov::element::f32;
}
ov::Core OpenVINOBackend::core_;
void OpenVINOBackend::InitTensorInfo(
const std::vector<ov::Output<ov::Node>>& ov_outputs,
std::map<std::string, TensorInfo>* tensor_infos) {
@@ -96,10 +98,6 @@ bool OpenVINOBackend::InitFromPaddle(const std::string& model_file,
return false;
}
option_ = option;
ov::AnyMap properties;
if (option_.cpu_thread_num > 0) {
properties["INFERENCE_NUM_THREADS"] = option_.cpu_thread_num;
}
std::shared_ptr<ov::Model> model = core_.read_model(model_file, params_file);
@@ -149,7 +147,19 @@ bool OpenVINOBackend::InitFromPaddle(const std::string& model_file,
output_infos_.push_back(iter->second);
}
ov::AnyMap properties;
if (option_.cpu_thread_num > 0) {
properties["INFERENCE_NUM_THREADS"] = option_.cpu_thread_num;
}
if (option_.ov_num_streams == -1) {
properties["NUM_STREAMS"] = ov::streams::AUTO;
} else if (option_.ov_num_streams == -2) {
properties["NUM_STREAMS"] = ov::streams::NUMA;
} else if (option_.ov_num_streams > 0) {
properties["NUM_STREAMS"] = option_.ov_num_streams;
}
compiled_model_ = core_.compile_model(model, "CPU", properties);
request_ = compiled_model_.create_infer_request();
initialized_ = true;
return true;
@@ -185,10 +195,6 @@ bool OpenVINOBackend::InitFromOnnx(const std::string& model_file,
return false;
}
option_ = option;
ov::AnyMap properties;
if (option_.cpu_thread_num > 0) {
properties["INFERENCE_NUM_THREADS"] = option_.cpu_thread_num;
}
std::shared_ptr<ov::Model> model = core_.read_model(model_file);
@@ -238,8 +244,21 @@ bool OpenVINOBackend::InitFromOnnx(const std::string& model_file,
output_infos_.push_back(iter->second);
}
ov::AnyMap properties;
if (option_.cpu_thread_num > 0) {
properties["INFERENCE_NUM_THREADS"] = option_.cpu_thread_num;
}
if (option_.ov_num_streams == -1) {
properties["NUM_STREAMS"] = ov::streams::AUTO;
} else if (option_.ov_num_streams == -2) {
properties["NUM_STREAMS"] = ov::streams::NUMA;
} else if (option_.ov_num_streams > 0) {
properties["NUM_STREAMS"] = option_.ov_num_streams;
}
compiled_model_ = core_.compile_model(model, "CPU", properties);
request_ = compiled_model_.create_infer_request();
initialized_ = true;
return true;
}
@@ -281,4 +300,14 @@ bool OpenVINOBackend::Infer(std::vector<FDTensor>& inputs,
return true;
}
std::unique_ptr<BaseBackend> OpenVINOBackend::Clone(void *stream, int device_id) {
std::unique_ptr<BaseBackend> new_backend = utils::make_unique<OpenVINOBackend>();
auto casted_backend = dynamic_cast<OpenVINOBackend*>(new_backend.get());
casted_backend->option_ = option_;
casted_backend->request_ = compiled_model_.create_infer_request();
casted_backend->input_infos_.assign(input_infos_.begin(), input_infos_.end());
casted_backend->output_infos_.assign(output_infos_.begin(), output_infos_.end());
return new_backend;
}
} // namespace fastdeploy
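For reference, the NUM_STREAMS mapping introduced above (-1 -> AUTO, -2 -> NUMA, > 0 -> explicit count) sets OpenVINO's standard compile properties. A minimal sketch in OpenVINO's own Python API, purely for illustration and independent of FastDeploy (the model path is a placeholder):
```python
from openvino.runtime import Core

core = Core()
model = core.read_model("model.onnx")  # placeholder path
# NUM_STREAMS takes an explicit count (or the AUTO/NUMA hints),
# mirroring ov_num_streams in OpenVINOBackend above.
compiled = core.compile_model(model, "CPU", {
    "NUM_STREAMS": "4",            # ov_num_streams > 0
    "INFERENCE_NUM_THREADS": "8",  # cpu_thread_num
})
request = compiled.create_infer_request()
```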
+8 -2
View File
@@ -20,17 +20,20 @@
#include <vector>
#include "fastdeploy/backends/backend.h"
#include "fastdeploy/utils/unique_ptr.h"
#include "openvino/openvino.hpp"
namespace fastdeploy {
struct OpenVINOBackendOption {
int cpu_thread_num = 8;
int cpu_thread_num = -1;
int ov_num_streams = 1;
std::map<std::string, std::vector<int64_t>> shape_infos;
};
class OpenVINOBackend : public BaseBackend {
public:
static ov::Core core_;
OpenVINOBackend() {}
virtual ~OpenVINOBackend() = default;
@@ -54,10 +57,13 @@ class OpenVINOBackend : public BaseBackend {
std::vector<TensorInfo> GetInputInfos() override;
std::vector<TensorInfo> GetOutputInfos() override;
std::unique_ptr<BaseBackend> Clone(void *stream = nullptr,
int device_id = -1) override;
private:
void InitTensorInfo(const std::vector<ov::Output<ov::Node>>& ov_outputs,
std::map<std::string, TensorInfo>* tensor_infos);
ov::Core core_;
ov::CompiledModel compiled_model_;
ov::InferRequest request_;
OpenVINOBackendOption option_;
+8 -11
View File
@@ -80,21 +80,18 @@ bool OrtBackend::InitFromPaddle(const std::string& model_file,
<< std::endl;
return false;
}
#ifdef ENABLE_PADDLE_FRONTEND
char* model_content_ptr;
int model_content_size = 0;
std::vector<paddle2onnx::CustomOp> custom_ops;
for (auto& item : option.custom_op_info_) {
paddle2onnx::CustomOp op;
strcpy(op.op_name, item.first.c_str());
strcpy(op.export_op_name, item.second.c_str());
custom_ops.emplace_back(op);
}
#ifdef ENABLE_PADDLE_FRONTEND
paddle2onnx::CustomOp op;
strcpy(op.op_name, "multiclass_nms3");
strcpy(op.export_op_name, "MultiClassNMS");
if (!paddle2onnx::Export(model_file.c_str(), params_file.c_str(),
&model_content_ptr, &model_content_size, 11, true,
verbose, true, true, true, custom_ops.data(),
custom_ops.size())) {
verbose, true, true, true, &op,
1)) {
FDERROR << "Error occurred while exporting PaddlePaddle to ONNX format."
<< std::endl;
return false;
@@ -106,7 +103,7 @@ bool OrtBackend::InitFromPaddle(const std::string& model_file,
model_content_ptr = nullptr;
return InitFromOnnx(onnx_model_proto, option, true);
#else
FDERROR << "Didn't compile with PaddlePaddle frontend, you can try to "
FDERROR << "Didn't compile with PaddlePaddle Frontend, you can try to "
"call `InitFromOnnx` instead."
<< std::endl;
#endif
@@ -216,6 +216,30 @@ bool PaddleBackend::Infer(std::vector<FDTensor>& inputs,
return true;
}
std::unique_ptr<BaseBackend> PaddleBackend::Clone(void *stream, int device_id) {
std::unique_ptr<BaseBackend> new_backend = utils::make_unique<PaddleBackend>();
auto casted_backend = dynamic_cast<PaddleBackend*>(new_backend.get());
if(device_id > 0 && option_.use_gpu == true && device_id != option_.gpu_id) {
auto clone_option = option_;
clone_option.gpu_id = device_id;
clone_option.external_stream_ = stream;
casted_backend->InitFromPaddle(clone_option.model_file,
clone_option.params_file,
clone_option);
FDWARNING << "The target device id:"
<< device_id
<< " is different from current device id:"
<< option_.gpu_id
<< ", cannot share memory with current engine."
<< std::endl;
return new_backend;
}
casted_backend->inputs_desc_.assign(inputs_desc_.begin(), inputs_desc_.end());
casted_backend->outputs_desc_.assign(outputs_desc_.begin(), outputs_desc_.end());
casted_backend->predictor_ = std::move(predictor_->Clone(stream));
return new_backend;
}
#ifdef ENABLE_TRT_BACKEND
void PaddleBackend::SetTRTDynamicShapeToConfig(const PaddleBackendOption& option) {
std::map<std::string, std::vector<int>> max_shape;
@@ -24,6 +24,7 @@
#include "paddle2onnx/converter.h"
#endif
#include "paddle_inference_api.h" // NOLINT
#include "fastdeploy/utils/unique_ptr.h"
#ifdef ENABLE_TRT_BACKEND
#include "fastdeploy/backends/tensorrt/trt_backend.h"
@@ -43,6 +44,9 @@ struct IpuOption {
};
struct PaddleBackendOption {
std::string model_file = ""; // Path of model file
std::string params_file = ""; // Path of parameters file, can be empty
#ifdef WITH_GPU
bool use_gpu = true;
#else
@@ -110,6 +114,9 @@ class PaddleBackend : public BaseBackend {
int NumOutputs() const override { return outputs_desc_.size(); }
std::unique_ptr<BaseBackend> Clone(void *stream = nullptr,
int device_id = -1) override;
TensorInfo GetInputInfo(int index) override;
TensorInfo GetOutputInfo(int index) override;
std::vector<TensorInfo> GetInputInfos() override;
@@ -15,11 +15,27 @@
namespace fastdeploy {
RKNPU2Backend::~RKNPU2Backend() {
if (input_attrs != nullptr) {
free(input_attrs);
// Release memory uniformly here
if (input_attrs_ != nullptr) {
free(input_attrs_);
}
if (output_attrs != nullptr) {
free(output_attrs);
if (output_attrs_ != nullptr) {
free(output_attrs_);
}
for (uint32_t i = 0; i < io_num.n_input; i++) {
rknn_destroy_mem(ctx, input_mems_[i]);
}
if(input_mems_ != nullptr){
free(input_mems_);
}
for (uint32_t i = 0; i < io_num.n_output; i++) {
rknn_destroy_mem(ctx, output_mems_[i]);
}
if(output_mems_ != nullptr){
free(output_mems_);
}
}
/***************************************************************
@@ -150,56 +166,85 @@ bool RKNPU2Backend::GetModelInputOutputInfos() {
}
// Get detailed input parameters
input_attrs =
input_attrs_ =
(rknn_tensor_attr*)malloc(sizeof(rknn_tensor_attr) * io_num.n_input);
memset(input_attrs, 0, io_num.n_input * sizeof(rknn_tensor_attr));
memset(input_attrs_, 0, io_num.n_input * sizeof(rknn_tensor_attr));
inputs_desc_.resize(io_num.n_input);
// create input tensor memory
// rknn_tensor_mem* input_mems[io_num.n_input];
input_mems_ = (rknn_tensor_mem**)malloc(sizeof(rknn_tensor_mem*) * io_num.n_input);
// get input info and copy to input tensor info
for (uint32_t i = 0; i < io_num.n_input; i++) {
input_attrs[i].index = i;
input_attrs_[i].index = i;
// query info
ret = rknn_query(ctx, RKNN_QUERY_INPUT_ATTR, &(input_attrs[i]),
ret = rknn_query(ctx, RKNN_QUERY_INPUT_ATTR, &(input_attrs_[i]),
sizeof(rknn_tensor_attr));
if (ret != RKNN_SUCC) {
printf("rknn_init error! ret=%d\n", ret);
return false;
}
std::string temp_name = input_attrs[i].name;
std::vector<int> temp_shape{};
temp_shape.resize(input_attrs[i].n_dims);
for (int j = 0; j < input_attrs[i].n_dims; j++) {
temp_shape[j] = (int)input_attrs[i].dims[j];
if((input_attrs_[i].fmt != RKNN_TENSOR_NHWC) &&
(input_attrs_[i].fmt != RKNN_TENSOR_UNDEFINED)){
FDERROR << "rknpu2_backend only supports NHWC or UNDEFINED input formats" << std::endl;
}
// copy input_attrs_ to input tensor info
std::string temp_name = input_attrs_[i].name;
std::vector<int> temp_shape{};
temp_shape.resize(input_attrs_[i].n_dims);
for (int j = 0; j < input_attrs_[i].n_dims; j++) {
temp_shape[j] = (int)input_attrs_[i].dims[j];
}
FDDataType temp_dtype =
fastdeploy::RKNPU2Backend::RknnTensorTypeToFDDataType(
input_attrs[i].type);
input_attrs_[i].type);
TensorInfo temp_input_info = {temp_name, temp_shape, temp_dtype};
inputs_desc_[i] = temp_input_info;
}
// Get detailed output parameters
output_attrs =
output_attrs_ =
(rknn_tensor_attr*)malloc(sizeof(rknn_tensor_attr) * io_num.n_output);
memset(output_attrs, 0, io_num.n_output * sizeof(rknn_tensor_attr));
memset(output_attrs_, 0, io_num.n_output * sizeof(rknn_tensor_attr));
outputs_desc_.resize(io_num.n_output);
// Create output tensor memory
output_mems_ = (rknn_tensor_mem**)malloc(sizeof(rknn_tensor_mem*) * io_num.n_output);
for (uint32_t i = 0; i < io_num.n_output; i++) {
output_attrs[i].index = i;
output_attrs_[i].index = i;
// query info
ret = rknn_query(ctx, RKNN_QUERY_OUTPUT_ATTR, &(output_attrs[i]),
ret = rknn_query(ctx, RKNN_QUERY_OUTPUT_ATTR, &(output_attrs_[i]),
sizeof(rknn_tensor_attr));
if (ret != RKNN_SUCC) {
FDERROR << "rknn_query fail! ret = " << ret << std::endl;
return false;
}
std::string temp_name = output_attrs[i].name;
std::vector<int> temp_shape{};
temp_shape.resize(output_attrs[i].n_dims);
for (int j = 0; j < output_attrs[i].n_dims; j++) {
temp_shape[j] = (int)output_attrs[i].dims[j];
// If the output dimension is 3, the runtime will automatically change it to 4.
// Obviously, this is wrong, and manual correction is required here.
int n_dims = output_attrs_[i].n_dims;
if((n_dims == 4) && (output_attrs_[i].dims[3] == 1)){
n_dims--;
FDWARNING << "The output["
<< i
<< "].shape[3] is 1, remove this dim."
<< std::endl;
}
// copy output_attrs_ to output tensor
std::string temp_name = output_attrs_[i].name;
std::vector<int> temp_shape{};
temp_shape.resize(n_dims);
for (int j = 0; j < n_dims; j++) {
temp_shape[j] = (int)output_attrs_[i].dims[j];
}
FDDataType temp_dtype =
fastdeploy::RKNPU2Backend::RknnTensorTypeToFDDataType(
output_attrs[i].type);
output_attrs_[i].type);
TensorInfo temp_input_info = {temp_name, temp_shape, temp_dtype};
outputs_desc_[i] = temp_input_info;
}
@@ -254,82 +299,77 @@ bool RKNPU2Backend::Infer(std::vector<FDTensor>& inputs,
return false;
}
// the input size only can be one
if (inputs.size() > 1) {
FDERROR << "[RKNPU2Backend] Size of the inputs only support 1."
<< std::endl;
return false;
if(!this->infer_init){
for (uint32_t i = 0; i < io_num.n_input; i++) {
// Judge whether the input and output types are the same
rknn_tensor_type input_type =
fastdeploy::RKNPU2Backend::FDDataTypeToRknnTensorType(inputs[i].dtype);
if (input_type != input_attrs_[i].type) {
FDWARNING << "The input tensor type != model's inputs type."
<< "The input_type need " << get_type_string(input_attrs_[i].type)
<< ",but inputs["<< i << "].type is " << get_type_string(input_type)
<< std::endl;
}
// Create input tensor memory
input_attrs_[i].type = input_type;
input_attrs_[i].size = inputs[0].Nbytes();
input_attrs_[i].size_with_stride = inputs[0].Nbytes();
input_attrs_[i].pass_through = 0;
input_mems_[i] = rknn_create_mem(ctx, inputs[i].Nbytes());
if (input_mems_[i] == nullptr) {
FDERROR << "rknn_create_mem input_mems_ error." << std::endl;
return false;
}
// Set input tensor memory
ret = rknn_set_io_mem(ctx, input_mems_[i], &input_attrs_[i]);
if (ret != RKNN_SUCC) {
FDERROR << "input tensor memory rknn_set_io_mem fail! ret=" << ret
<< std::endl;
return false;
}
}
for (uint32_t i = 0; i < io_num.n_output; ++i) {
// Most post-processing does not support the fp16 format.
// The unified output here is float32
uint32_t output_size = output_attrs_[i].n_elems * sizeof(float);
output_mems_[i] = rknn_create_mem(ctx, output_size);
if (output_mems_[i] == nullptr) {
FDERROR << "rknn_create_mem output_mems_ error." << std::endl;
return false;
}
// The default output type depends on the model; float32 is required here to compute top5
output_attrs_[i].type = RKNN_TENSOR_FLOAT32;
ret = rknn_set_io_mem(ctx, output_mems_[i], &output_attrs_[i]);
// set output memory and attribute
if (ret != RKNN_SUCC) {
FDERROR << "output tensor memory rknn_set_io_mem fail! ret=" << ret
<< std::endl;
return false;
}
}
this->infer_init = true;
}
// Judge whether the input and output types are the same
rknn_tensor_type input_type =
fastdeploy::RKNPU2Backend::FDDataTypeToRknnTensorType(inputs[0].dtype);
if (input_type != input_attrs[0].type) {
FDWARNING << "The input tensor type != model's inputs type."
<< "The input_type need " << get_type_string(input_attrs[0].type)
<< ",but inputs[0].type is " << get_type_string(input_type)
<< std::endl;
}
rknn_tensor_format input_layout =
RKNN_TENSOR_NHWC; // RK3588 only support NHWC
input_attrs[0].type = input_type;
input_attrs[0].fmt = input_layout;
input_attrs[0].size = inputs[0].Nbytes();
input_attrs[0].size_with_stride = inputs[0].Nbytes();
input_attrs[0].pass_through = 0;
// create input tensor memory
rknn_tensor_mem* input_mems[1];
input_mems[0] = rknn_create_mem(ctx, inputs[0].Nbytes());
if (input_mems[0] == nullptr) {
FDERROR << "rknn_create_mem input_mems error." << std::endl;
return false;
}
// Copy input data to input tensor memory
uint32_t width = input_attrs[0].dims[2];
uint32_t stride = input_attrs[0].w_stride;
if (width == stride) {
if (inputs[0].Data() == nullptr) {
FDERROR << "inputs[0].Data is NULL." << std::endl;
return false;
}
memcpy(input_mems[0]->virt_addr, inputs[0].Data(), inputs[0].Nbytes());
} else {
FDERROR << "[RKNPU2Backend] only support width == stride." << std::endl;
return false;
}
// Create output tensor memory
rknn_tensor_mem* output_mems[io_num.n_output];
for (uint32_t i = 0; i < io_num.n_output; ++i) {
// Most post-processing does not support the fp16 format.
// The unified output here is float32
uint32_t output_size = output_attrs[i].n_elems * sizeof(float);
output_mems[i] = rknn_create_mem(ctx, output_size);
}
// Set input tensor memory
ret = rknn_set_io_mem(ctx, input_mems[0], &input_attrs[0]);
if (ret != RKNN_SUCC) {
FDERROR << "input tensor memory rknn_set_io_mem fail! ret=" << ret
<< std::endl;
return false;
}
// Set output tensor memory
for (uint32_t i = 0; i < io_num.n_output; ++i) {
// default output type is depend on model, this requires float32 to compute top5
output_attrs[i].type = RKNN_TENSOR_FLOAT32;
ret = rknn_set_io_mem(ctx, output_mems[i], &output_attrs[i]);
// set output memory and attribute
if (ret != RKNN_SUCC) {
FDERROR << "output tensor memory rknn_set_io_mem fail! ret=" << ret
<< std::endl;
for (uint32_t i = 0; i < io_num.n_input; i++) {
uint32_t width = input_attrs_[i].dims[2];
uint32_t stride = input_attrs_[i].w_stride;
if (width == stride) {
if (inputs[i].Data() == nullptr) {
FDERROR << "inputs[0].Data is NULL." << std::endl;
return false;
}
memcpy(input_mems_[i]->virt_addr, inputs[i].Data(), inputs[i].Nbytes());
} else {
FDERROR << "[RKNPU2Backend] only support width == stride." << std::endl;
return false;
}
}
// run rknn
ret = rknn_run(ctx, nullptr);
@@ -337,7 +377,6 @@ bool RKNPU2Backend::Infer(std::vector<FDTensor>& inputs,
FDERROR << "rknn run error! ret=" << ret << std::endl;
return false;
}
rknn_destroy_mem(ctx, input_mems[0]);
// get result
outputs->resize(outputs_desc_.size());
@@ -349,9 +388,8 @@ bool RKNPU2Backend::Infer(std::vector<FDTensor>& inputs,
}
(*outputs)[i].Resize(temp_shape, outputs_desc_[i].dtype,
outputs_desc_[i].name);
memcpy((*outputs)[i].MutableData(), (float*)output_mems[i]->virt_addr,
memcpy((*outputs)[i].MutableData(), (float*)output_mems_[i]->virt_addr,
(*outputs)[i].Nbytes());
rknn_destroy_mem(ctx, output_mems[i]);
}
return true;
@@ -86,8 +86,13 @@ class RKNPU2Backend : public BaseBackend {
std::vector<TensorInfo> inputs_desc_;
std::vector<TensorInfo> outputs_desc_;
rknn_tensor_attr* input_attrs = nullptr;
rknn_tensor_attr* output_attrs = nullptr;
rknn_tensor_attr* input_attrs_ = nullptr;
rknn_tensor_attr* output_attrs_ = nullptr;
rknn_tensor_mem** input_mems_;
rknn_tensor_mem** output_mems_;
bool infer_init = false;
RKNPU2BackendOption option_;
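Taken together, the Infer() changes above switch the backend to an init-once, zero-copy I/O scheme: on the first call the driver-side buffers are created with rknn_create_mem() and bound with rknn_set_io_mem(), and every later call only memcpys the new input and runs. A minimal standalone sketch of that flow, assuming a valid rknn_context and pre-queried tensor attributes (the helper and struct names are illustrative, error handling trimmed):

#include <cstring>
#include "rknn_api.h"

// One input / one output for brevity; mirrors the init-once pattern above.
struct ZeroCopyIO {
  rknn_tensor_mem* in_mem = nullptr;
  rknn_tensor_mem* out_mem = nullptr;
  bool bound = false;
};

bool InferZeroCopy(rknn_context ctx, rknn_tensor_attr* in_attr,
                   rknn_tensor_attr* out_attr, ZeroCopyIO* io,
                   const void* data, uint32_t nbytes) {
  if (!io->bound) {
    // First call: allocate driver memory and bind it to the model's I/O.
    io->in_mem = rknn_create_mem(ctx, nbytes);
    io->out_mem = rknn_create_mem(ctx, out_attr->n_elems * sizeof(float));
    in_attr->size = nbytes;
    out_attr->type = RKNN_TENSOR_FLOAT32;  // fp32 for post-processing
    if (rknn_set_io_mem(ctx, io->in_mem, in_attr) != RKNN_SUCC) return false;
    if (rknn_set_io_mem(ctx, io->out_mem, out_attr) != RKNN_SUCC) return false;
    io->bound = true;
  }
  // Every later call: copy the new input into the bound buffer and run.
  std::memcpy(io->in_mem->virt_addr, data, nbytes);
  return rknn_run(ctx, nullptr) == RKNN_SUCC;
}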
+55 -32
View File
@@ -124,48 +124,20 @@ bool TrtBackend::InitFromPaddle(const std::string& model_file,
option_ = option;
#ifdef ENABLE_PADDLE_FRONTEND
std::vector<paddle2onnx::CustomOp> custom_ops;
for (auto& item : option_.custom_op_info_) {
paddle2onnx::CustomOp op;
std::strcpy(op.op_name, item.first.c_str());
std::strcpy(op.export_op_name, item.second.c_str());
custom_ops.emplace_back(op);
}
char* model_content_ptr;
int model_content_size = 0;
char* calibration_cache_ptr;
int calibration_cache_size = 0;
if (!paddle2onnx::Export(model_file.c_str(), params_file.c_str(),
&model_content_ptr, &model_content_size, 11, true,
verbose, true, true, true, custom_ops.data(),
custom_ops.size(), "tensorrt",
verbose, true, true, true, nullptr,
0, "tensorrt",
&calibration_cache_ptr, &calibration_cache_size)) {
FDERROR << "Error occured while export PaddlePaddle to ONNX format."
<< std::endl;
return false;
}
if (option_.remove_multiclass_nms_) {
char* new_model = nullptr;
int new_model_size = 0;
if (!paddle2onnx::RemoveMultiClassNMS(model_content_ptr, model_content_size,
&new_model, &new_model_size)) {
FDERROR << "Try to remove MultiClassNMS failed." << std::endl;
return false;
}
delete[] model_content_ptr;
std::string onnx_model_proto(new_model, new_model + new_model_size);
delete[] new_model;
if (calibration_cache_size) {
std::string calibration_str(
calibration_cache_ptr,
calibration_cache_ptr + calibration_cache_size);
calibration_str_ = calibration_str;
delete[] calibration_cache_ptr;
}
return InitFromOnnx(onnx_model_proto, option, true);
}
std::string onnx_model_proto(model_content_ptr,
model_content_ptr + model_content_size);
delete[] model_content_ptr;
@@ -313,6 +285,7 @@ bool TrtBackend::Infer(std::vector<FDTensor>& inputs,
BuildTrtEngine();
}
cudaSetDevice(option_.gpu_id);
SetInputs(inputs);
AllocateOutputsBuffer(outputs);
@@ -384,13 +357,17 @@ void TrtBackend::GetInputOutputInfo() {
outputs_device_buffer_[name] = FDDeviceBuffer(dtype);
casted_output_tensors_[name] = FDTensor();
}
io_name_index_[name] = i;
}
bindings_.resize(num_binds);
}
void TrtBackend::SetInputs(const std::vector<FDTensor>& inputs) {
for (const auto& item : inputs) {
auto idx = engine_->getBindingIndex(item.name.c_str());
// auto idx = engine_->getBindingIndex(item.name.c_str());
auto iter = io_name_index_.find(item.name);
FDASSERT(iter != io_name_index_.end(), "TrtBackend SetInputs cannot find name: %s", item.name.c_str());
auto idx = iter->second;
std::vector<int> shape(item.shape.begin(), item.shape.end());
auto dims = ToDims(shape);
context_->setBindingDimensions(idx, dims);
@@ -438,7 +415,10 @@ void TrtBackend::AllocateOutputsBuffer(std::vector<FDTensor>* outputs) {
outputs->resize(outputs_desc_.size());
}
for (size_t i = 0; i < outputs_desc_.size(); ++i) {
auto idx = engine_->getBindingIndex(outputs_desc_[i].name.c_str());
// auto idx = engine_->getBindingIndex(outputs_desc_[i].name.c_str());
auto idx_iter = io_name_index_.find(outputs_desc_[i].name);
FDASSERT(idx_iter != io_name_index_.end(), "TrtBackend cannot find output name: %s", outputs_desc_[i].name.c_str());
auto idx = idx_iter->second;
auto output_dims = context_->getBindingDimensions(idx);
// find the original index of output
@@ -701,4 +681,47 @@ std::vector<TensorInfo> TrtBackend::GetOutputInfos() {
return infos;
}
std::unique_ptr<BaseBackend> TrtBackend::Clone(void *stream, int device_id) {
std::unique_ptr<BaseBackend> new_backend = utils::make_unique<TrtBackend>();
auto casted_backend = dynamic_cast<TrtBackend*>(new_backend.get());
if(device_id > 0 && device_id != option_.gpu_id) {
auto clone_option = option_;
clone_option.gpu_id = device_id;
clone_option.external_stream_ = stream;
if (option_.model_format == ModelFormat::ONNX) {
FDASSERT(casted_backend->InitFromOnnx(option_.model_file, clone_option),
"Clone model from ONNX failed while initialize TrtBackend.");
} else {
FDASSERT(casted_backend->InitFromPaddle(option_.model_file,
option_.params_file, clone_option),
"Clone model from Paddle failed while initialize TrtBackend.");
}
FDWARNING << "The target device id:"
<< device_id
<< " is different from current device id:"
<< option_.gpu_id
<< ", cannot share memory with current engine."
<< std::endl;
return new_backend;
}
cudaSetDevice(option_.gpu_id);
casted_backend->option_.gpu_id = option_.gpu_id;
if (stream) {
casted_backend->stream_ = reinterpret_cast<cudaStream_t>(stream);
} else {
FDASSERT(cudaStreamCreate(&casted_backend->stream_) == 0,
"[ERROR] Error occurred while calling cudaStreamCreate() during Clone().");
}
casted_backend->inputs_desc_.assign(inputs_desc_.begin(), inputs_desc_.end());
casted_backend->outputs_desc_.assign(outputs_desc_.begin(), outputs_desc_.end());
casted_backend->outputs_order_.insert(outputs_order_.begin(), outputs_order_.end());
casted_backend->shape_range_info_.insert(shape_range_info_.begin(), shape_range_info_.end());
casted_backend->engine_ = engine_;
casted_backend->context_ = std::shared_ptr<nvinfer1::IExecutionContext>(
casted_backend->engine_->createExecutionContext());
casted_backend->GetInputOutputInfo();
FDINFO << "TRTBackend clone finish." << std::endl;
return new_backend;
}
} // namespace fastdeploy
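The key idea in Clone() above is that an nvinfer1::ICudaEngine can back several execution contexts at once, so a clone on the same GPU keeps the weights in device memory only once and merely needs its own context and stream. A minimal sketch of that sharing, assuming the engine is already built (helper name is illustrative):

#include <memory>
#include "NvInfer.h"

// Each clone gets its own IExecutionContext; the ICudaEngine (and its
// weights on the GPU) stays shared via the shared_ptr.
std::shared_ptr<nvinfer1::IExecutionContext> MakeCloneContext(
    const std::shared_ptr<nvinfer1::ICudaEngine>& engine) {
  return std::shared_ptr<nvinfer1::IExecutionContext>(
      engine->createExecutionContext());
}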
+10 -5
View File
@@ -25,6 +25,7 @@
#include "NvOnnxParser.h"
#include "fastdeploy/backends/backend.h"
#include "fastdeploy/backends/tensorrt/utils.h"
#include "fastdeploy/utils/unique_ptr.h"
class Int8EntropyCalibrator2 : public nvinfer1::IInt8EntropyCalibrator2 {
public:
@@ -45,7 +46,7 @@ class Int8EntropyCalibrator2 : public nvinfer1::IInt8EntropyCalibrator2 {
void writeCalibrationCache(const void* cache,
size_t length) noexcept override {
std::cout << "NOT IMPLEMENT." << std::endl;
fastdeploy::FDERROR << "NOT IMPLEMENT." << std::endl;
}
private:
@@ -62,6 +63,11 @@ struct TrtValueInfo {
};
struct TrtBackendOption {
std::string model_file = ""; // Path of model file
std::string params_file = ""; // Path of parameters file, can be empty
// format of input model
ModelFormat model_format = ModelFormat::AUTOREC;
int gpu_id = 0;
bool enable_fp16 = false;
bool enable_int8 = false;
@@ -73,10 +79,6 @@ struct TrtBackendOption {
std::string serialize_file = "";
bool enable_pinned_memory = false;
void* external_stream_ = nullptr;
// inside parameter, maybe remove next version
bool remove_multiclass_nms_ = false;
std::map<std::string, std::string> custom_op_info_;
};
std::vector<int> toVec(const nvinfer1::Dims& dim);
@@ -103,6 +105,8 @@ class TrtBackend : public BaseBackend {
TensorInfo GetOutputInfo(int index);
std::vector<TensorInfo> GetInputInfos() override;
std::vector<TensorInfo> GetOutputInfos() override;
std::unique_ptr<BaseBackend> Clone(void *stream = nullptr,
int device_id = -1) override;
~TrtBackend() {
if (parser_) {
@@ -123,6 +127,7 @@ class TrtBackend : public BaseBackend {
std::vector<TrtValueInfo> outputs_desc_;
std::map<std::string, FDDeviceBuffer> inputs_device_buffer_;
std::map<std::string, FDDeviceBuffer> outputs_device_buffer_;
std::map<std::string, int> io_name_index_;
std::string calibration_str_;
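The io_name_index_ map declared above is filled once in GetInputOutputInfo() so that SetInputs() and AllocateOutputsBuffer() can resolve a binding index with a map lookup instead of calling engine_->getBindingIndex() on every inference. A small standalone sketch of the pattern (names here are not the class members):

#include <map>
#include <string>

// Filled once while enumerating bindings; read on every Infer() call.
std::map<std::string, int> io_name_index;

int BindingIndexOf(const std::string& name) {
  auto it = io_name_index.find(name);
  return it == io_name_index.end() ? -1 : it->second;  // -1: not found
}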
+27
View File
@@ -182,4 +182,31 @@ const FDDataType TypeToDataType<uint8_t>::dtype = UINT8;
template <>
const FDDataType TypeToDataType<int8_t>::dtype = INT8;
std::string Str(const ModelFormat& f) {
if (f == ModelFormat::PADDLE) {
return "ModelFormat::PADDLE";
} else if (f == ModelFormat::ONNX) {
return "ModelFormat::ONNX";
} else if (f == ModelFormat::RKNN) {
return "ModelFormat::RKNN";
} else if (f == ModelFormat::TORCHSCRIPT) {
return "ModelFormat::TORCHSCRIPT";
}
return "UNKNOWN-ModelFormat";
}
std::ostream& operator<<(std::ostream& out, const ModelFormat& format) {
if (format == ModelFormat::PADDLE) {
out << "ModelFormat::PADDLE";
} else if (format == ModelFormat::ONNX) {
out << "ModelFormat::ONNX";
} else if (format == ModelFormat::RKNN) {
out << "ModelFormat::RKNN";
} else if (format == ModelFormat::TORCHSCRIPT) {
out << "ModelFormat::TORCHSCRIPT";
} else {
out << "UNKNOWN-ModelFormat";
}
return out;
}
} // namespace fastdeploy
+12
View File
@@ -65,4 +65,16 @@ struct FASTDEPLOY_DECL TypeToDataType {
static const FDDataType dtype;
};
/*! Deep learning model format */
enum ModelFormat {
AUTOREC, ///< Auto recognize the model format by model file name
PADDLE, ///< Model with paddlepaddle format
ONNX, ///< Model with ONNX format
RKNN, ///< Model with RKNN format
TORCHSCRIPT, ///< Model with TorchScript format
};
FASTDEPLOY_DECL std::ostream& operator<<(std::ostream& out,
const ModelFormat& format);
} // namespace fastdeploy
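Since the enum and its stream operator now live next to the other core types, printing a format needs nothing beyond this header. A trivial usage sketch (the include path is an assumption):

#include <iostream>
#include "fastdeploy/core/fd_type.h"  // assumed header location

int main() {
  fastdeploy::ModelFormat fmt = fastdeploy::ModelFormat::RKNN;
  std::cout << fmt << std::endl;  // prints "ModelFormat::RKNN"
  return 0;
}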
+32 -35
View File
@@ -102,19 +102,6 @@ std::string Str(const Backend& b) {
return "UNKNOWN-Backend";
}
std::string Str(const ModelFormat& f) {
if (f == ModelFormat::PADDLE) {
return "ModelFormat::PADDLE";
} else if (f == ModelFormat::ONNX) {
return "ModelFormat::ONNX";
}else if (f == ModelFormat::RKNN) {
return "ModelFormat::RKNN";
} else if (f == ModelFormat::TORCHSCRIPT) {
return "ModelFormat::TORCHSCRIPT";
}
return "UNKNOWN-ModelFormat";
}
std::ostream& operator<<(std::ostream& out, const Backend& backend) {
if (backend == Backend::ORT) {
out << "Backend::ORT";
@@ -135,20 +122,6 @@ std::ostream& operator<<(std::ostream& out, const Backend& backend) {
return out;
}
std::ostream& operator<<(std::ostream& out, const ModelFormat& format) {
if (format == ModelFormat::PADDLE) {
out << "ModelFormat::PADDLE";
} else if (format == ModelFormat::ONNX) {
out << "ModelFormat::ONNX";
} else if (format == ModelFormat::RKNN) {
out << "ModelFormat::RKNN";
} else if (format == ModelFormat::TORCHSCRIPT) {
out << "ModelFormat::TORCHSCRIPT";
}
out << "UNKNOWN-ModelFormat";
return out;
}
bool CheckModelFormat(const std::string& model_file,
const ModelFormat& model_format) {
if (model_format == ModelFormat::PADDLE) {
@@ -411,6 +384,10 @@ void RuntimeOption::SetTrtCacheFile(const std::string& cache_file_path) {
trt_serialize_file = cache_file_path;
}
void RuntimeOption::SetOpenVINOStreams(int num_streams) {
ov_num_streams = num_streams;
}
bool Runtime::Compile(std::vector<std::vector<FDTensor>>& prewarm_tensors,
const RuntimeOption& _option) {
#ifdef ENABLE_POROS_BACKEND
@@ -582,6 +559,8 @@ bool Runtime::Infer(std::vector<FDTensor>& input_tensors,
void Runtime::CreatePaddleBackend() {
#ifdef ENABLE_PADDLE_BACKEND
auto pd_option = PaddleBackendOption();
pd_option.model_file = option.model_file;
pd_option.params_file = option.params_file;
pd_option.enable_mkldnn = option.pd_enable_mkldnn;
pd_option.enable_log_info = option.pd_enable_log_info;
pd_option.mkldnn_cache_size = option.pd_mkldnn_cache_size;
@@ -642,6 +621,7 @@ void Runtime::CreateOpenVINOBackend() {
#ifdef ENABLE_OPENVINO_BACKEND
auto ov_option = OpenVINOBackendOption();
ov_option.cpu_thread_num = option.cpu_thread_num;
ov_option.ov_num_streams = option.ov_num_streams;
FDASSERT(option.model_format == ModelFormat::PADDLE ||
option.model_format == ModelFormat::ONNX,
"OpenVINOBackend only support model format of ModelFormat::PADDLE / "
@@ -675,10 +655,6 @@ void Runtime::CreateOrtBackend() {
ort_option.gpu_id = option.device_id;
ort_option.external_stream_ = option.external_stream_;
// TODO(jiangjiajun): inside usage, maybe remove this later
ort_option.remove_multiclass_nms_ = option.remove_multiclass_nms_;
ort_option.custom_op_info_ = option.custom_op_info_;
FDASSERT(option.model_format == ModelFormat::PADDLE ||
option.model_format == ModelFormat::ONNX,
"OrtBackend only support model format of ModelFormat::PADDLE / "
@@ -703,6 +679,9 @@ void Runtime::CreateOrtBackend() {
void Runtime::CreateTrtBackend() {
#ifdef ENABLE_TRT_BACKEND
auto trt_option = TrtBackendOption();
trt_option.model_file = option.model_file;
trt_option.params_file = option.params_file;
trt_option.model_format = option.model_format;
trt_option.gpu_id = option.device_id;
trt_option.enable_fp16 = option.trt_enable_fp16;
trt_option.enable_int8 = option.trt_enable_int8;
@@ -715,10 +694,6 @@ void Runtime::CreateTrtBackend() {
trt_option.enable_pinned_memory = option.enable_pinned_memory;
trt_option.external_stream_ = option.external_stream_;
// TODO(jiangjiajun): inside usage, maybe remove this later
trt_option.remove_multiclass_nms_ = option.remove_multiclass_nms_;
trt_option.custom_op_info_ = option.custom_op_info_;
FDASSERT(option.model_format == ModelFormat::PADDLE ||
option.model_format == ModelFormat::ONNX,
"TrtBackend only support model format of ModelFormat::PADDLE / "
@@ -779,4 +754,26 @@ void Runtime::CreateRKNPU2Backend() {
#endif
}
Runtime* Runtime::Clone(void* stream, int device_id) {
Runtime* runtime = new Runtime();
if (option.backend != Backend::OPENVINO
&& option.backend != Backend::PDINFER
&& option.backend != Backend::TRT
) {
runtime->Init(option);
FDWARNING << "Only OpenVINO/Paddle Inference/TensorRT support \
clone engine to reduce CPU/GPU memory usage now. For "
<< option.backend
<< ", FastDeploy will create a new engine which \
will not share memory with the current runtime."
<< std::endl;
return runtime;
}
FDINFO << "Runtime Clone with Backend:: " << Str(option.backend) << " in " << Str(option.device)
<< "." << std::endl;
runtime->option = option;
runtime->backend_ = backend_->Clone(stream, device_id);
return runtime;
}
} // namespace fastdeploy
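A hedged usage sketch of the new Runtime::Clone() (model paths are placeholders; only OpenVINO/Paddle Inference/TensorRT actually share memory with the parent, as the warning above explains):

#include <memory>
#include "fastdeploy/runtime.h"  // assumed header location

int main() {
  fastdeploy::RuntimeOption opt;
  opt.SetModelPath("model.pdmodel", "model.pdiparams");  // placeholder paths
  opt.UseGpu(0);
  opt.UseTrtBackend();

  fastdeploy::Runtime runtime;
  if (!runtime.Init(opt)) return -1;

  // Same device, no explicit stream: the clone shares the TensorRT engine
  // and only creates a new execution context and CUDA stream.
  std::unique_ptr<fastdeploy::Runtime> worker(runtime.Clone());
  return 0;
}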
Executable → Regular
+39 -35
View File
@@ -35,38 +35,27 @@ namespace fastdeploy {
/*! Inference backend supported in FastDeploy */
enum Backend {
UNKNOWN, ///< Unknown inference backend
UNKNOWN, ///< Unknown inference backend
ORT, ///< ONNX Runtime, support Paddle/ONNX format model, CPU / Nvidia GPU
TRT, ///< TensorRT, support Paddle/ONNX format model, Nvidia GPU only
PDINFER, ///< Paddle Inference, support Paddle format model, CPU / Nvidia GPU
POROS, ///< Poros, support TorchScript format model, CPU / Nvidia GPU
OPENVINO, ///< Intel OpenVINO, support Paddle/ONNX format, CPU only
TRT, ///< TensorRT, support Paddle/ONNX format model, Nvidia GPU only
PDINFER, ///< Paddle Inference, support Paddle format model, CPU / Nvidia GPU
POROS, ///< Poros, support TorchScript format model, CPU / Nvidia GPU
OPENVINO, ///< Intel OpenVINO, support Paddle/ONNX format, CPU only
LITE, ///< Paddle Lite, support Paddle format model, ARM CPU only
RKNPU2, ///< RKNPU2, support RKNN format model, Rockchip NPU only
};
/*! Deep learning model format */
enum ModelFormat {
AUTOREC, ///< Auto recognize the model format by model file name
PADDLE, ///< Model with paddlepaddle format
ONNX, ///< Model with ONNX format
RKNN, ///< Model with RKNN format
TORCHSCRIPT, ///< Model with TorchScript format
};
FASTDEPLOY_DECL std::ostream& operator<<(std::ostream& out,
const Backend& backend);
FASTDEPLOY_DECL std::ostream& operator<<(std::ostream& out,
const ModelFormat& format);
/*! Paddle Lite power mode for mobile device. */
enum LitePowerMode {
LITE_POWER_HIGH = 0, ///< Use Lite Backend with high power mode
LITE_POWER_LOW = 1, ///< Use Lite Backend with low power mode
LITE_POWER_FULL = 2, ///< Use Lite Backend with full power mode
LITE_POWER_NO_BIND = 3, ///< Use Lite Backend with no bind power mode
LITE_POWER_RAND_HIGH = 4, ///< Use Lite Backend with rand high mode
LITE_POWER_RAND_LOW = 5 ///< Use Lite Backend with rand low power mode
LITE_POWER_HIGH = 0, ///< Use Lite Backend with high power mode
LITE_POWER_LOW = 1, ///< Use Lite Backend with low power mode
LITE_POWER_FULL = 2, ///< Use Lite Backend with full power mode
LITE_POWER_NO_BIND = 3, ///< Use Lite Backend with no bind power mode
LITE_POWER_RAND_HIGH = 4, ///< Use Lite Backend with rand high mode
LITE_POWER_RAND_LOW = 5 ///< Use Lite Backend with rand low power mode
};
FASTDEPLOY_DECL std::string Str(const Backend& b);
@@ -105,8 +94,10 @@ struct FASTDEPLOY_DECL RuntimeOption {
/// Use Nvidia GPU to inference
void UseGpu(int gpu_id = 0);
void UseRKNPU2(fastdeploy::rknpu2::CpuName rknpu2_name = fastdeploy::rknpu2::CpuName::RK3588,
fastdeploy::rknpu2::CoreMask rknpu2_core = fastdeploy::rknpu2::CoreMask::RKNN_NPU_CORE_0);
void UseRKNPU2(fastdeploy::rknpu2::CpuName rknpu2_name
= fastdeploy::rknpu2::CpuName::RK3588,
fastdeploy::rknpu2::CoreMask rknpu2_core
= fastdeploy::rknpu2::CoreMask::RKNN_NPU_CORE_0);
void SetExternalStream(void* external_stream);
@@ -242,6 +233,11 @@ struct FASTDEPLOY_DECL RuntimeOption {
*/
void DisablePaddleTrtCollectShape();
/**
* @brief Set the number of streams used by the OpenVINO backend
*/
void SetOpenVINOStreams(int num_streams);
/** \brief Use Graphcore IPU to run inference.
*
* \param[in] device_num the number of IPUs.
@@ -331,19 +327,19 @@ struct FASTDEPLOY_DECL RuntimeOption {
int unconst_ops_thres = -1;
std::string poros_file = "";
// ======Only for OpenVINO Backend=======
int ov_num_streams = 1;
// ======Only for RKNPU2 Backend=======
fastdeploy::rknpu2::CpuName rknpu2_cpu_name_ = fastdeploy::rknpu2::CpuName::RK3588;
fastdeploy::rknpu2::CoreMask rknpu2_core_mask_ = fastdeploy::rknpu2::CoreMask::RKNN_NPU_CORE_AUTO;
fastdeploy::rknpu2::CpuName rknpu2_cpu_name_
= fastdeploy::rknpu2::CpuName::RK3588;
fastdeploy::rknpu2::CoreMask rknpu2_core_mask_
= fastdeploy::rknpu2::CoreMask::RKNN_NPU_CORE_AUTO;
std::string model_file = ""; // Path of model file
std::string params_file = ""; // Path of parameters file, can be empty
ModelFormat model_format = ModelFormat::AUTOREC; // format of input model
// inside parameters, only for inside usage
// remove multiclass_nms in Paddle2ONNX
bool remove_multiclass_nms_ = false;
// for Paddle2ONNX to export custom operators
std::map<std::string, std::string> custom_op_info_;
std::string params_file = ""; // Path of parameters file, can be empty
// format of input model
ModelFormat model_format = ModelFormat::AUTOREC;
};
/*! @brief Runtime object used to inference the loaded model on different devices
@@ -390,6 +386,14 @@ struct FASTDEPLOY_DECL Runtime {
*/
std::vector<TensorInfo> GetOutputInfos();
/** \brief Clone a new Runtime when multiple instances of the same model are created
*
* \param[in] stream CUDA stream, default is nullptr
* \return the new cloned Runtime*
*/
Runtime* Clone(void* stream = nullptr,
int device_id = -1);
RuntimeOption option;
private:
@@ -401,4 +405,4 @@ struct FASTDEPLOY_DECL Runtime {
void CreateRKNPU2Backend();
std::unique_ptr<BaseBackend> backend_;
};
} // namespace fastdeploy
} // namespace fastdeploy
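The new ov_num_streams field flows straight into CreateOpenVINOBackend() shown earlier. Setting it is a one-liner on the option object; the sketch below uses illustrative values:

fastdeploy::RuntimeOption opt;
opt.UseCpu();
opt.UseOpenVINOBackend();
opt.SetCpuThreadNum(8);     // existing option
opt.SetOpenVINOStreams(4);  // new in this diff: forwarded to ov_num_streams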
+1
View File
@@ -51,6 +51,7 @@
#include "fastdeploy/vision/ocr/ppocr/recognizer.h"
#include "fastdeploy/vision/segmentation/ppseg/model.h"
#include "fastdeploy/vision/tracking/pptracking/model.h"
#include "fastdeploy/vision/headpose/contrib/fsanet.h"
#endif
#include "fastdeploy/vision/visualize/visualize.h"
@@ -13,6 +13,7 @@
// limitations under the License.
#include "fastdeploy/vision/classification/ppcls/model.h"
#include "fastdeploy/vision/utils/utils.h"
#include "yaml-cpp/yaml.h"
@@ -108,7 +109,7 @@ bool PaddleClasModel::Preprocess(Mat* mat, FDTensor* output) {
int height = mat->Height();
output->name = InputInfoOfRuntime(0).name;
output->SetExternalData({1, channel, height, width}, FDDataType::FP32,
mat->GetOpenCVMat()->ptr());
mat->Data());
return true;
}
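The switch to mat->Data() above keeps the zero-copy contract of SetExternalData(): the tensor only borrows the buffer, so the Mat (or any other owner) must outlive the tensor. A minimal sketch of the pattern with a plain buffer (include path and function name are assumptions):

#include "fastdeploy/core/fd_tensor.h"  // assumed header location

void WrapBuffer() {
  // `buffer` is borrowed, not copied; keep it alive while `tensor` is used.
  static float buffer[1 * 3 * 224 * 224];
  fastdeploy::FDTensor tensor;
  tensor.SetExternalData({1, 3, 224, 224}, fastdeploy::FDDataType::FP32,
                         buffer);
}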
+12 -21
View File
@@ -13,54 +13,45 @@
// limitations under the License.
#include "fastdeploy/vision/common/processors/base.h"
#include "fastdeploy/vision/common/processors/proc_lib.h"
#include "fastdeploy/utils/utils.h"
namespace fastdeploy {
namespace vision {
ProcLib Processor::default_lib = ProcLib::DEFAULT;
bool Processor::operator()(Mat* mat, ProcLib lib) {
// if default_lib is set
// then use default_lib
ProcLib target = lib;
if (default_lib != ProcLib::DEFAULT) {
target = default_lib;
if (lib == ProcLib::DEFAULT) {
target = DefaultProcLib::default_lib;
}
if (target == ProcLib::FLYCV) {
#ifdef ENABLE_FLYCV
if (mat->mat_type != ProcLib::FLYCV) {
if (mat->layout != Layout::HWC) {
FDERROR << "Cannot convert cv::Mat to fcv::Mat while layout is not HWC." << std::endl;
}
fcv::Mat fcv_mat = ConvertOpenCVMatToFalconCV(*(mat->GetOpenCVMat()));
mat->SetMat(fcv_mat);
}
return ImplByFalconCV(mat);
return ImplByFlyCV(mat);
#else
FDASSERT(false, "FastDeploy didn't compile with FalconCV.");
FDASSERT(false, "FastDeploy didn't compile with FlyCV.");
#endif
}
// DEFAULT & OPENCV
return ImplByOpenCV(mat);
}
void EnableFlyCV() {
#ifdef ENABLE_FLYCV
Processor::default_lib = ProcLib::FLYCV;
DefaultProcLib::default_lib = ProcLib::FLYCV;
FDINFO << "Will change to use image processing library "
<< Processor::default_lib << std::endl;
<< DefaultProcLib::default_lib << std::endl;
#else
FDWARNING << "FastDeploy didn't compile with FlyCV, "
"will fallback to use OpenCV instead."
"will fallback to use OpenCV instead."
<< std::endl;
#endif
}
void DisableFlyCV() {
Processor::default_lib = ProcLib::OPENCV;
DefaultProcLib::default_lib = ProcLib::OPENCV;
FDINFO << "Will change to use image processing library "
<< Processor::default_lib << std::endl;
<< DefaultProcLib::default_lib << std::endl;
}
} // namespace vision
+7 -9
View File
@@ -22,7 +22,9 @@
namespace fastdeploy {
namespace vision {
/*! @brief Enable using FlyCV to process image while deploy vision models. Currently, FlyCV in only available on ARM(Linux aarch64/Android), so will fallback to using OpenCV in other platform
/*! @brief Enable using FlyCV to process images while deploying vision models.
* Currently, FlyCV is only available on ARM (Linux aarch64/Android), so
* other platforms fall back to OpenCV.
*/
FASTDEPLOY_DECL void EnableFlyCV();
@@ -35,21 +37,17 @@ class FASTDEPLOY_DECL Processor {
// All the functions in `Processor` will be forced to use
// default_lib if this flag is set.
// DEFAULT means this flag is not set.
static ProcLib default_lib;
// static ProcLib default_lib;
virtual std::string Name() = 0;
virtual bool ImplByOpenCV(Mat* mat) = 0;
virtual bool ImplByFalconCV(Mat* mat) {
FDASSERT(false,
"%s is not implemented with FalconCV, please use OpenCV instead.",
Name().c_str());
return false;
virtual bool ImplByFlyCV(Mat* mat) {
return ImplByOpenCV(mat);
}
virtual bool operator()(Mat* mat,
ProcLib lib = ProcLib::OPENCV);
virtual bool operator()(Mat* mat, ProcLib lib = ProcLib::DEFAULT);
};
} // namespace vision
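With the new ProcLib::DEFAULT resolution above, switching the whole preprocessing pipeline is a single global call, and any processor without a FlyCV override silently falls back to its OpenCV implementation. A usage sketch (the umbrella include is an assumption):

#include "fastdeploy/vision.h"  // assumed umbrella header

void PreferFlyCV() {
  fastdeploy::vision::EnableFlyCV();   // ProcLib::DEFAULT now maps to FlyCV
  // ... run vision models: each Processor::operator()(mat) dispatches to
  // ImplByFlyCV(), which defaults to ImplByOpenCV() when not overridden.
  fastdeploy::vision::DisableFlyCV();  // back to OpenCV
}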
+5 -5
View File
@@ -36,8 +36,8 @@ bool Cast::ImplByOpenCV(Mat* mat) {
}
#ifdef ENABLE_FLYCV
bool Cast::ImplByFalconCV(Mat* mat) {
fcv::Mat* im = mat->GetFalconCVMat();
bool Cast::ImplByFlyCV(Mat* mat) {
fcv::Mat* im = mat->GetFlyCVMat();
if (dtype_ == "float" && mat->Type() == FDDataType::FP32) {
return true;
}
@@ -46,18 +46,18 @@ bool Cast::ImplByFalconCV(Mat* mat) {
}
if (mat->layout != Layout::HWC) {
FDERROR
<< "While using Falcon to cast image, the image must be layout of HWC."
<< "While using FlyCV to cast image, the image must be layout of HWC."
<< std::endl;
return false;
}
if (dtype_ == "float") {
fcv::Mat new_im;
auto fcv_type = CreateFalconCVDataType(FDDataType::FP32, im->channels());
auto fcv_type = CreateFlyCVDataType(FDDataType::FP32, im->channels());
im->convert_to(new_im, fcv_type);
mat->SetMat(new_im);
} else if (dtype_ == "double") {
fcv::Mat new_im;
auto fcv_type = CreateFalconCVDataType(FDDataType::FP64, im->channels());
auto fcv_type = CreateFlyCVDataType(FDDataType::FP64, im->channels());
im->convert_to(new_im, fcv_type);
mat->SetMat(new_im);
} else {
+3 -5
View File
@@ -24,15 +24,13 @@ class FASTDEPLOY_DECL Cast : public Processor {
explicit Cast(const std::string& dtype = "float") : dtype_(dtype) {}
bool ImplByOpenCV(Mat* mat);
#ifdef ENABLE_FLYCV
bool ImplByFalconCV(Mat* mat);
bool ImplByFlyCV(Mat* mat);
#endif
std::string Name() { return "Cast"; }
static bool Run(Mat* mat, const std::string& dtype,
ProcLib lib = ProcLib::OPENCV);
ProcLib lib = ProcLib::DEFAULT);
std::string GetDtype() const {
return dtype_;
}
std::string GetDtype() const { return dtype_; }
private:
std::string dtype_;
@@ -36,8 +36,8 @@ bool CenterCrop::ImplByOpenCV(Mat* mat) {
}
#ifdef ENABLE_FLYCV
bool CenterCrop::ImplByFalconCV(Mat* mat) {
fcv::Mat* im = mat->GetFalconCVMat();
bool CenterCrop::ImplByFlyCV(Mat* mat) {
fcv::Mat* im = mat->GetFlyCVMat();
int height = static_cast<int>(im->height());
int width = static_cast<int>(im->width());
if (height < height_ || width < width_) {
@@ -62,5 +62,5 @@ bool CenterCrop::Run(Mat* mat, const int& width, const int& height,
return c(mat, lib);
}
} // namespace vision
} // namespace fastdeploy
} // namespace vision
} // namespace fastdeploy
@@ -24,12 +24,12 @@ class FASTDEPLOY_DECL CenterCrop : public Processor {
CenterCrop(int width, int height) : height_(height), width_(width) {}
bool ImplByOpenCV(Mat* mat);
#ifdef ENABLE_FLYCV
bool ImplByFalconCV(Mat* mat);
bool ImplByFlyCV(Mat* mat);
#endif
std::string Name() { return "CenterCrop"; }
static bool Run(Mat* mat, const int& width, const int& height,
ProcLib lib = ProcLib::OPENCV);
ProcLib lib = ProcLib::DEFAULT);
private:
int height_;
@@ -25,10 +25,11 @@ bool BGR2RGB::ImplByOpenCV(Mat* mat) {
}
#ifdef ENABLE_FLYCV
bool BGR2RGB::ImplByFalconCV(Mat* mat) {
fcv::Mat* im = mat->GetFalconCVMat();
bool BGR2RGB::ImplByFlyCV(Mat* mat) {
fcv::Mat* im = mat->GetFlyCVMat();
if (im->channels() != 3) {
FDERROR << "[BGR2RGB] The channel of input image must be 3, but not it's " << im->channels() << "." << std::endl;
FDERROR << "[BGR2RGB] The channel of input image must be 3, but not it's "
<< im->channels() << "." << std::endl;
return false;
}
fcv::Mat new_im;
@@ -47,10 +48,11 @@ bool RGB2BGR::ImplByOpenCV(Mat* mat) {
}
#ifdef ENABLE_FLYCV
bool RGB2BGR::ImplByFalconCV(Mat* mat) {
fcv::Mat* im = mat->GetFalconCVMat();
bool RGB2BGR::ImplByFlyCV(Mat* mat) {
fcv::Mat* im = mat->GetFlyCVMat();
if (im->channels() != 3) {
FDERROR << "[RGB2BGR] The channel of input image must be 3, but not it's " << im->channels() << "." << std::endl;
FDERROR << "[RGB2BGR] The channel of input image must be 3, but not it's "
<< im->channels() << "." << std::endl;
return false;
}
fcv::Mat new_im;

Some files were not shown because too many files have changed in this diff.