From a231c9e7f3893f917d59c4ba58272200e12ffb6d Mon Sep 17 00:00:00 2001 From: yunyaoXYY <109218879+yunyaoXYY@users.noreply.github.com> Date: Wed, 2 Nov 2022 20:29:29 +0800 Subject: [PATCH 01/18] [Quantization] Update quantized model deployment examples and update readme. (#377) * Add PaddleOCR Support * Add PaddleOCR Support * Add PaddleOCRv3 Support * Add PaddleOCRv3 Support * Update README.md * Update README.md * Update README.md * Update README.md * Add PaddleOCRv3 Support * Add PaddleOCRv3 Supports * Add PaddleOCRv3 Suport * Fix Rec diff * Remove useless functions * Remove useless comments * Add PaddleOCRv2 Support * Add PaddleOCRv3 & PaddleOCRv2 Support * remove useless parameters * Add utils of sorting det boxes * Fix code naming convention * Fix code naming convention * Fix code naming convention * Fix bug in the Classify process * Imporve OCR Readme * Fix diff in Cls model * Update Model Download Link in Readme * Fix diff in PPOCRv2 * Improve OCR readme * Imporve OCR readme * Improve OCR readme * Improve OCR readme * Imporve OCR readme * Improve OCR readme * Fix conflict * Add readme for OCRResult * Improve OCR readme * Add OCRResult readme * Improve OCR readme * Improve OCR readme * Add Model Quantization Demo * Fix Model Quantization Readme * Fix Model Quantization Readme * Add the function to do PTQ quantization * Improve quant tools readme * Improve quant tool readme * Improve quant tool readme * Add PaddleInference-GPU for OCR Rec model * Add QAT method to fastdeploy-quantization tool * Remove examples/slim for now * Move configs folder * Add Quantization Support for Classification Model * Imporve ways of importing preprocess * Upload YOLO Benchmark on readme * Upload YOLO Benchmark on readme * Upload YOLO Benchmark on readme * Improve Quantization configs and readme * Add support for multi-inputs model * Add backends and params file for YOLOv7 * Add quantized model deployment support for YOLO series * Fix YOLOv5 quantize readme * Fix YOLO quantize readme * Fix YOLO quantize readme * Improve quantize YOLO readme * Improve quantize YOLO readme * Improve quantize YOLO readme * Improve quantize YOLO readme * Improve quantize YOLO readme * Fix bug, change Fronted to ModelFormat * Change Fronted to ModelFormat * Add examples to deploy quantized paddleclas models * Fix readme * Add quantize Readme * Add quantize Readme * Add quantize Readme * Modify readme of quantization tools * Modify readme of quantization tools * Improve quantization tools readme * Improve quantization readme * Improve PaddleClas quantized model deployment readme * Add PPYOLOE-l quantized deployment examples * Improve quantization tools readme * Improve Quantize Readme * Fix conflicts * Fix conflicts * improve readme * Improve quantization tools and readme * Improve quantization tools and readme * Add quantized deployment examples for PaddleSeg model * Fix cpp readme * Fix memory leak of reader_wrapper function * Fix model file name in PaddleClas quantization examples * Update Runtime and E2E benchmark * Update Runtime and E2E benchmark * Rename quantization tools to auto compression tools * Remove PPYOLOE data when deployed on MKLDNN * Fix readme * Support PPYOLOE with OR without NMS and update readme * Update Readme * Update configs and readme * Update configs and readme * Add Paddle-TensorRT backend in quantized model deploy examples * Support PPYOLOE+ series --- docs/cn/quantize.md | 145 +++++-- docs/en/quantize.md | 74 +++- .../paddleclas/quantize/README.md | 53 ++- .../paddleclas/quantize/cpp/README.md | 8 
+- .../paddleclas/quantize/cpp/infer.cc | 10 +- .../paddleclas/quantize/cpp/ocr.sh | 10 + .../paddleclas/quantize/python/README.md | 6 +- .../paddleclas/quantize/python/infer.py | 5 + .../paddledetection/quantize/README.md | 49 ++- .../paddledetection/quantize/cpp/README.md | 4 +- .../quantize/cpp/infer_ppyoloe.cc | 8 + .../paddledetection/quantize/python/README.md | 4 +- .../quantize/python/infer_ppyoloe.py | 5 + .../detection/yolov5/quantize/README.md | 48 ++- .../detection/yolov5/quantize/cpp/README.md | 4 +- .../detection/yolov5/quantize/cpp/infer.cc | 6 +- .../yolov5/quantize/python/README.md | 4 +- .../detection/yolov5/quantize/python/infer.py | 5 + .../detection/yolov6/quantize/README.md | 49 ++- .../detection/yolov6/quantize/cpp/README.md | 12 +- .../detection/yolov6/quantize/cpp/infer.cc | 6 +- .../yolov6/quantize/python/README.md | 12 +- .../detection/yolov6/quantize/python/infer.py | 5 + .../detection/yolov7/quantize/README.md | 45 +- .../detection/yolov7/quantize/cpp/README.md | 4 +- .../detection/yolov7/quantize/cpp/infer.cc | 6 +- .../yolov7/quantize/python/README.md | 4 +- .../detection/yolov7/quantize/python/infer.py | 5 + .../segmentation/paddleseg/quantize/README.md | 36 ++ .../paddleseg/quantize/cpp/CMakeLists.txt | 14 + .../paddleseg/quantize/cpp/README.md | 30 ++ .../paddleseg/quantize/cpp/infer.cc | 100 +++++ .../paddleseg/quantize/python/README.md | 28 ++ .../paddleseg/quantize/python/infer.py | 76 ++++ tools/auto_compression/README.md | 129 ++++++ tools/auto_compression/configs/README.md | 54 +++ .../mobilenetv1_ssld_quant.yaml | 9 +- .../classification/resnet50_vd_quant.yaml | 5 +- .../detection/ppyoloe_plus_withNMS_quant.yaml | 39 ++ .../detection/ppyoloe_withNMS_quant.yaml} | 10 +- .../configs/detection/yolov5s_quant.yaml | 8 +- .../configs/detection/yolov6s_quant.yaml | 7 +- .../configs/detection/yolov7_quant.yaml | 7 +- .../segmentation/pp_liteseg_quant.yaml | 37 ++ .../fd_auto_compress}/__init__.py | 0 .../fd_auto_compress/dataset.py | 388 ++++++++++++++++++ .../fd_auto_compress/fd_auto_compress.py} | 102 +++-- .../requirements.txt | 0 tools/auto_compression/setup.py | 26 ++ tools/quantization/README.md | 108 ----- tools/quantization/configs/README.md | 51 --- tools/quantization/fdquant/dataset.py | 150 ------- tools/quantization/setup.py | 25 -- 53 files changed, 1514 insertions(+), 521 deletions(-) create mode 100644 examples/vision/classification/paddleclas/quantize/cpp/ocr.sh create mode 100644 examples/vision/segmentation/paddleseg/quantize/README.md create mode 100644 examples/vision/segmentation/paddleseg/quantize/cpp/CMakeLists.txt create mode 100644 examples/vision/segmentation/paddleseg/quantize/cpp/README.md create mode 100644 examples/vision/segmentation/paddleseg/quantize/cpp/infer.cc create mode 100644 examples/vision/segmentation/paddleseg/quantize/python/README.md create mode 100644 examples/vision/segmentation/paddleseg/quantize/python/infer.py create mode 100644 tools/auto_compression/README.md create mode 100644 tools/auto_compression/configs/README.md rename tools/{quantization => auto_compression}/configs/classification/mobilenetv1_ssld_quant.yaml (83%) rename tools/{quantization => auto_compression}/configs/classification/resnet50_vd_quant.yaml (90%) create mode 100644 tools/auto_compression/configs/detection/ppyoloe_plus_withNMS_quant.yaml rename tools/{quantization/configs/detection/ppyoloe_l_quant.yaml => auto_compression/configs/detection/ppyoloe_withNMS_quant.yaml} (73%) rename tools/{quantization => 
auto_compression}/configs/detection/yolov5s_quant.yaml (78%) rename tools/{quantization => auto_compression}/configs/detection/yolov6s_quant.yaml (81%) rename tools/{quantization => auto_compression}/configs/detection/yolov7_quant.yaml (79%) create mode 100644 tools/auto_compression/configs/segmentation/pp_liteseg_quant.yaml rename tools/{quantization/fdquant => auto_compression/fd_auto_compress}/__init__.py (100%) create mode 100644 tools/auto_compression/fd_auto_compress/dataset.py rename tools/{quantization/fdquant/fdquant.py => auto_compression/fd_auto_compress/fd_auto_compress.py} (59%) rename tools/{quantization => auto_compression}/requirements.txt (100%) create mode 100644 tools/auto_compression/setup.py delete mode 100644 tools/quantization/README.md delete mode 100644 tools/quantization/configs/README.md delete mode 100644 tools/quantization/fdquant/dataset.py delete mode 100644 tools/quantization/setup.py diff --git a/docs/cn/quantize.md b/docs/cn/quantize.md index 6277c8385c..7717176f63 100644 --- a/docs/cn/quantize.md +++ b/docs/cn/quantize.md @@ -2,22 +2,22 @@ # 量化加速 量化是一种流行的模型压缩方法,量化后的模型拥有更小的体积和更快的推理速度. -FastDeploy基于PaddleSlim, 集成了一键模型量化的工具, 同时, FastDeploy支持推理部署量化后的模型, 帮助用户实现推理加速. +FastDeploy基于PaddleSlim的Auto Compression Toolkit(ACT), 给用户提供了一键模型自动化压缩的工具. FastDeploy一键模型自动化压缩可包含多种策略, 目前主要采用离线量化和量化蒸馏训练. 同时, FastDeploy支持部署压缩后的模型, 帮助用户实现推理加速. 本文主要描述量化模型在FastDeploy上的部署. ## FastDeploy 多个引擎和硬件支持量化模型部署 当前,FastDeploy中多个推理后端可以在不同硬件上支持量化模型的部署. 支持情况如下: -| 硬件/推理后端 | ONNX Runtime | Paddle Inference | TensorRT | -| :-----------| :-------- | :--------------- | :------- | -| CPU | 支持 | 支持 | | -| GPU | | | 支持 | +| 硬件/推理后端 | ONNX Runtime | Paddle Inference | TensorRT | Paddle-TensorRT | +| :-----------| :-------- | :--------------- | :------- | :------- | +| CPU | 支持 | 支持 | | | +| GPU | | | 支持 | 支持 | ## 模型量化 ### 量化方法 -基于PaddleSlim,目前FastDeploy提供的的量化方法有量化蒸馏训练和离线量化,量化蒸馏训练通过模型训练来获得量化模型,离线量化不需要模型训练即可完成模型的量化。 FastDeploy 对两种方式产出的量化模型均能部署。 +基于PaddleSlim,目前FastDeploy一键模型自动化压缩提供的量化方法有量化蒸馏训练和离线量化,量化蒸馏训练通过模型训练来获得量化模型,离线量化不需要模型训练即可完成模型的量化。 FastDeploy 对两种方式产出的量化模型均能部署。 两种方法的主要对比如下表所示: | 量化方法 | 量化过程耗时 | 量化模型精度 | 模型体积 | 推理速度 | @@ -25,44 +25,115 @@ FastDeploy基于PaddleSlim, 集成了一键模型量化的工具, 同时, FastDe | 离线量化 | 无需训练,耗时短 | 比量化蒸馏训练稍低 | 两者一致 | 两者一致 | | 量化蒸馏训练 | 需要训练,耗时稍高 | 较未量化模型有少量损失 | 两者一致 |两者一致 | -### 使用FastDeploy一键模型量化工具来量化模型 -Fastdeploy基于PaddleSlim, 为用户提供了一键模型量化的工具,请参考如下文档进行模型量化。 -- [FastDeploy 一键模型量化](../../tools/quantization/) -当用户获得产出的量化模型之后,即可以使用FastDeploy来部署量化模型。 +### 使用FastDeploy一键模型自动化压缩工具来量化模型 +FastDeploy基于PaddleSlim的Auto Compression Toolkit(ACT), 给用户提供了一键模型自动化压缩的工具,请参考如下文档进行一键模型自动化压缩。 +- [FastDeploy 一键模型自动化压缩](../../tools/auto_compression/) +当用户获得产出的压缩模型之后,即可以使用FastDeploy来部署压缩模型。 -## 量化benchmark +## 量化模型 Benchmark -目前, FastDeploy已支持的模型量化如下表所示: +目前, FastDeploy支持自动化压缩,并完成部署测试的模型的Runtime Benchmark和端到端Benchmark如下所示. + +Benchmark表格说明: +- Runtime时延为模型在各种Runtime上的推理时延,包含CPU->GPU数据拷贝,GPU推理,GPU->CPU数据拷贝时间. 不包含模型各自的前后处理时间. +- 端到端时延为模型在实际推理场景中的时延, 包含模型的前后处理. +- 所测时延均为推理1000次后求得的平均值, 单位是毫秒. +- INT8 + FP16 为在推理INT8量化模型的同时, 给Runtime开启FP16推理选项 +- INT8 + FP16 + PM, 为在推理INT8量化模型和开启FP16的同时, 开启使用Pinned Memory的选项,可加速GPU->CPU数据拷贝的速度 (相关Runtime选项的设置可参考下方示例) +- 最大加速比, 为FP32时延除以INT8推理的最快时延. +- 策略为量化蒸馏训练时, 采用少量无标签数据集训练得到量化模型, 并在全量验证集上验证精度, INT8精度并不代表最高的INT8精度. +- CPU为Intel(R) Xeon(R) Gold 6271C, 所有测试中固定CPU线程数为1. GPU为Tesla T4, TensorRT版本8.4.15.
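上述 INT8 + FP16 与 INT8 + FP16 + PM 两种测试配置对应的Runtime选项, 可参考下面的最小Python示意(其中 `enable_trt_fp16` 与 `enable_pinned_memory` 为示意所假设的接口名, 请以实际安装的FastDeploy版本API为准):

```python
import fastdeploy as fd

option = fd.RuntimeOption()
option.use_gpu()
option.use_trt_backend()
# 对应表格中的 INT8 + FP16: 给TensorRT后端开启FP16推理选项 (假设的接口名)
option.enable_trt_fp16()
# 对应表格中的 INT8 + FP16 + PM: 开启Pinned Memory, 加速GPU->CPU数据拷贝 (假设的接口名)
option.enable_pinned_memory()
```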
### YOLO 系列 -| 模型 |推理后端 |部署硬件 | FP32推理时延 | INT8推理时延 | 加速比 | FP32 mAP | INT8 mAP | 量化方式 | -| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- | -| [YOLOv5s](../../examples/vision/detection/yolov5/quantize/) | TensorRT | GPU | 14.13 | 11.22 | 1.26 | 37.6 | 36.6 | 量化蒸馏训练 | -| [YOLOv5s](../../examples/vision/detection/yolov5/quantize/) | ONNX Runtime | CPU | 183.68 | 100.39 | 1.83 | 37.6 | 33.1 |量化蒸馏训练 | -| [YOLOv5s](../../examples/vision/detection/yolov5/quantize/) | Paddle Inference | CPU | 226.36 | 152.27 | 1.48 |37.6 | 36.8 | 量化蒸馏训练 | -| [YOLOv6s](../../examples/vision/detection/yolov6/quantize/) | TensorRT | GPU | 12.89 | 8.92 | 1.45 | 42.5 | 40.6|量化蒸馏训练 | -| [YOLOv6s](../../examples/vision/detection/yolov6/quantize/) | ONNX Runtime | CPU | 345.85 | 131.81 | 2.60 |42.5| 36.1|量化蒸馏训练 | -| [YOLOv6s](../../examples/vision/detection/yolov6/quantize/) | Paddle Inference | CPU | 366.41 | 131.70 | 2.78 |42.5| 41.2|量化蒸馏训练 | -| [YOLOv7](../../examples/vision/detection/yolov7/quantize/) | TensorRT | GPU | 30.43 | 15.40 | 1.98 | 51.1| 50.8|量化蒸馏训练 | -| [YOLOv7](../../examples/vision/detection/yolov7/quantize/) | ONNX Runtime | CPU | 971.27 | 471.88 | 2.06 | 51.1 | 42.5|量化蒸馏训练 | -| [YOLOv7](../../examples/vision/detection/yolov7/quantize/) | Paddle Inference | CPU | 1015.70 | 562.41 | 1.82 |51.1 | 46.3|量化蒸馏训练 | +#### Runtime Benchmark +| 模型 |推理后端 |部署硬件 | FP32 Runtime时延 | INT8 Runtime时延 | INT8 + FP16 Runtime时延 | INT8+FP16+PM Runtime时延 | 最大加速比 | FP32 mAP | INT8 mAP | 量化方式 | +| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |----- |----- | +| [YOLOv5s](../../examples/vision/detection/yolov5/quantize/) | TensorRT | GPU | 7.87 | 4.51 | 4.31 | 3.17 | 2.48 | 37.6 | 36.7 | 量化蒸馏训练 | +| [YOLOv5s](../../examples/vision/detection/yolov5/quantize/) | Paddle-TensorRT | GPU | 7.99 | None | 4.46 | 3.31 | 2.41 | 37.6 | 36.8 | 量化蒸馏训练 | +| [YOLOv5s](../../examples/vision/detection/yolov5/quantize/) | ONNX Runtime | CPU | 176.41 | 91.90 | None | None | 1.90 | 37.6 | 33.1 |量化蒸馏训练 | +| [YOLOv5s](../../examples/vision/detection/yolov5/quantize/) | Paddle Inference| CPU | 213.73 | 130.19 | None | None | 1.64 |37.6 | 35.2 | 量化蒸馏训练 | +| [YOLOv6s](../../examples/vision/detection/yolov6/quantize/) | TensorRT | GPU | 9.47 | 3.23 | 4.09 |2.81 | 3.37 | 42.5 | 40.7|量化蒸馏训练 | +| [YOLOv6s](../../examples/vision/detection/yolov6/quantize/) | Paddle-TensorRT | GPU | 9.31 | None| 4.17 | 2.95 | 3.16 | 42.5 | 40.7|量化蒸馏训练 | +| [YOLOv6s](../../examples/vision/detection/yolov6/quantize/) | ONNX Runtime | CPU | 334.65 | 126.38 | None | None| 2.65 |42.5| 36.8|量化蒸馏训练 | +| [YOLOv6s](../../examples/vision/detection/yolov6/quantize/) | Paddle Inference | CPU | 352.87 | 123.12 |None | None| 2.87 |42.5| 40.8|量化蒸馏训练 | +| [YOLOv7](../../examples/vision/detection/yolov7/quantize/) | TensorRT | GPU | 27.47 | 6.52 | 6.74| 5.19| 5.29 | 51.1| 50.4|量化蒸馏训练 | +| [YOLOv7](../../examples/vision/detection/yolov7/quantize/) | Paddle-TensorRT | GPU | 27.87|None|6.91|5.86 | 4.76 | 51.1| 50.4|量化蒸馏训练 | +| [YOLOv7](../../examples/vision/detection/yolov7/quantize/) | ONNX Runtime | CPU | 996.65 | 467.15 |None|None | 2.13 | 51.1 | 43.3|量化蒸馏训练 | +| [YOLOv7](../../examples/vision/detection/yolov7/quantize/) | Paddle Inference | CPU | 995.85 | 477.93|None|None | 2.08 |51.1 | 46.2|量化蒸馏训练 | + +#### 端到端 Benchmark +| 模型 |推理后端 |部署硬件 | FP32 End2End时延 | INT8 End2End时延 | INT8 + FP16 End2End时延 | INT8+FP16+PM End2End时延 | 最大加速比 | FP32 mAP | INT8 mAP | 量化方式 | +| 
------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |----- |----- | +| [YOLOv5s](../../examples/vision/detection/yolov5/quantize/) | TensorRT | GPU | 24.61 | 21.20 | 20.78 | 20.94 | 1.18 | 37.6 | 36.7 | 量化蒸馏训练 | +| [YOLOv5s](../../examples/vision/detection/yolov5/quantize/) | Paddle-TensorRT | GPU | 23.53 | None | 21.98 | 19.84 | 1.28 | 37.6 | 36.8 | 量化蒸馏训练 | +| [YOLOv5s](../../examples/vision/detection/yolov5/quantize/) | ONNX Runtime | CPU | 197.323 | 110.99 | None | None | 1.78 | 37.6 | 33.1 |量化蒸馏训练 | +| [YOLOv5s](../../examples/vision/detection/yolov5/quantize/) | Paddle Inference| CPU | 235.73 | 144.82 | None | None | 1.63 |37.6 | 35.2 | 量化蒸馏训练 | +| [YOLOv6s](../../examples/vision/detection/yolov6/quantize/) | TensorRT | GPU | 15.66 | 11.30 | 10.25 |9.59 | 1.63 | 42.5 | 40.7|量化蒸馏训练 | +| [YOLOv6s](../../examples/vision/detection/yolov6/quantize/) | Paddle-TensorRT | GPU | 15.03 | None| 11.36 | 9.32 | 1.61 | 42.5 | 40.7|量化蒸馏训练 | +| [YOLOv6s](../../examples/vision/detection/yolov6/quantize/) | ONNX Runtime | CPU | 348.21 | 126.38 | None | None| 2.82 |42.5| 36.8|量化蒸馏训练 | +| [YOLOv6s](../../examples/vision/detection/yolov6/quantize/) | Paddle Inference | CPU | 352.87 | 121.64 |None | None| 3.04 |42.5| 40.8|量化蒸馏训练 | +| [YOLOv7](../../examples/vision/detection/yolov7/quantize/) | TensorRT | GPU | 36.47 | 18.81 | 20.33| 17.58| 2.07 | 51.1| 50.4|量化蒸馏训练 | +| [YOLOv7](../../examples/vision/detection/yolov7/quantize/) | Paddle-TensorRT | GPU | 37.06|None|20.26|17.53 | 2.11 | 51.1| 50.4|量化蒸馏训练 | +| [YOLOv7](../../examples/vision/detection/yolov7/quantize/) | ONNX Runtime | CPU | 988.85 | 478.08 |None|None | 2.07 | 51.1 | 43.3|量化蒸馏训练 | +| [YOLOv7](../../examples/vision/detection/yolov7/quantize/) | Paddle Inference | CPU | 1031.73 | 500.12|None|None | 2.06 |51.1 | 46.2|量化蒸馏训练 | -上表中的数据, 为模型量化前后,在FastDeploy部署的端到端推理性能. -- 测试数据为COCO2017验证集中的图片. -- 推理时延为端到端推理(包含前后处理)的平均时延, 单位是毫秒. -- CPU为Intel(R) Xeon(R) Gold 6271C, GPU为Tesla T4, TensorRT版本8.4.15, 所有测试中固定CPU线程数为1. 
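以上表中的YOLOv5s量化模型为例, 其Python部署的最小示意如下(假设量化模型目录为 yolov5s_quant, 其中模型文件名为 model.pdmodel/model.pdiparams, 具体请以各模型 quantize 示例目录中的 infer.py 为准):

```python
import cv2
import fastdeploy as fd

option = fd.RuntimeOption()
option.use_gpu()
option.use_trt_backend()  # 在GPU上使用TensorRT推理INT8量化模型

# 量化模型为Paddle格式, 需要显式指定model_format
model = fd.vision.detection.YOLOv5(
    "yolov5s_quant/model.pdmodel",
    "yolov5s_quant/model.pdiparams",
    runtime_option=option,
    model_format=fd.ModelFormat.PADDLE)

im = cv2.imread("000000014439.jpg")
result = model.predict(im)
print(result)
```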
### PaddleClas系列 -| 模型 |推理后端 |部署硬件 | FP32推理时延 | INT8推理时延 | 加速比 | FP32 Top1 | INT8 Top1 |量化方式 | -| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- | -| [ResNet50_vd](../../examples/vision/classification/paddleclas/quantize/) | ONNX Runtime | CPU | 86.87 | 59 .32 | 1.46 | 79.12 | 78.87| 离线量化| -| [ResNet50_vd](../../examples/vision/classification/paddleclas/quantize/) | TensorRT | GPU | 7.85 | 5.42 | 1.45 | 79.12 | 79.06 | 离线量化 | -| [MobileNetV1_ssld](../../examples/vision/classification/paddleclas/quantize/) | ONNX Runtime | CPU | 40.32 | 16.87 | 2.39 |77.89 | 75.09 |离线量化 | -| [MobileNetV1_ssld](../../examples/vision/classification/paddleclas/quantize/) | TensorRT | GPU | 5.10 | 3.35 | 1.52 |77.89 | 76.86 | 离线量化 | +#### Runtime Benchmark +| 模型 |推理后端 |部署硬件 | FP32 Runtime时延 | INT8 Runtime时延 | INT8 + FP16 Runtime时延 | INT8+FP16+PM Runtime时延 | 最大加速比 | FP32 Top1 | INT8 Top1 | 量化方式 | +| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |----- |----- | +| [ResNet50_vd](../../examples/vision/classification/paddleclas/quantize/) | TensorRT | GPU | 3.55 | 0.99|0.98|1.06 | 3.62 | 79.12 | 79.06 | 离线量化 | +| [ResNet50_vd](../../examples/vision/classification/paddleclas/quantize/) | Paddle-TensorRT | GPU | 3.46 |None |0.87|1.03 | 3.98 | 79.12 | 79.06 | 离线量化 | +| [ResNet50_vd](../../examples/vision/classification/paddleclas/quantize/) | ONNX Runtime | CPU | 76.14 | 35.43 |None|None | 2.15 | 79.12 | 78.87| 离线量化| +| [ResNet50_vd](../../examples/vision/classification/paddleclas/quantize/) | Paddle Inference | CPU | 76.21 | 24.01 |None|None | 3.17 | 79.12 | 78.55 | 离线量化| +| [MobileNetV1_ssld](../../examples/vision/classification/paddleclas/quantize/) | TensorRT | GPU | 0.91 | 0.43 |0.49 | 0.54 | 2.12 |77.89 | 76.86 | 离线量化 | +| [MobileNetV1_ssld](../../examples/vision/classification/paddleclas/quantize/) | Paddle-TensorRT | GPU | 0.88| None| 0.49|0.51 | 1.80 |77.89 | 76.86 | 离线量化 | +| [MobileNetV1_ssld](../../examples/vision/classification/paddleclas/quantize/) | ONNX Runtime | CPU | 30.53 | 9.59|None|None | 3.18 |77.89 | 75.09 |离线量化 | +| [MobileNetV1_ssld](../../examples/vision/classification/paddleclas/quantize/) | Paddle Inference | CPU | 12.29 | 4.68 | None|None|2.62 |77.89 | 71.36 |离线量化 | -上表中的数据, 为模型量化前后,在FastDeploy部署的端到端推理性能. -- 测试数据为ImageNet-2012验证集中的图片. -- 推理时延为端到端推理(包含前后处理)的平均时延, 单位是毫秒. -- CPU为Intel(R) Xeon(R) Gold 6271C, GPU为Tesla T4, TensorRT版本8.4.15, 所有测试中固定CPU线程数为1. 
+#### 端到端 Benchmark +| 模型 |推理后端 |部署硬件 | FP32 End2End时延 | INT8 End2End时延 | INT8 + FP16 End2End时延 | INT8+FP16+PM End2End时延 | 最大加速比 | FP32 Top1 | INT8 Top1 | 量化方式 | +| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |----- |----- | +| [ResNet50_vd](../../examples/vision/classification/paddleclas/quantize/) | TensorRT | GPU | 4.92| 2.28|2.24|2.23 | 2.21 | 79.12 | 79.06 | 离线量化 | +| [ResNet50_vd](../../examples/vision/classification/paddleclas/quantize/) | Paddle-TensorRT | GPU | 4.48|None |2.09|2.10 | 2.14 | 79.12 | 79.06 | 离线量化 | +| [ResNet50_vd](../../examples/vision/classification/paddleclas/quantize/) | ONNX Runtime | CPU | 77.43 | 41.90 |None|None | 1.85 | 79.12 | 78.87| 离线量化| +| [ResNet50_vd](../../examples/vision/classification/paddleclas/quantize/) | Paddle Inference | CPU | 80.60 | 27.75 |None|None | 2.90 | 79.12 | 78.55 | 离线量化| +| [MobileNetV1_ssld](../../examples/vision/classification/paddleclas/quantize/) | TensorRT | GPU | 2.19 | 1.48|1.57| 1.57 | 1.48 |77.89 | 76.86 | 离线量化 | +| [MobileNetV1_ssld](../../examples/vision/classification/paddleclas/quantize/) | Paddle-TensorRT | GPU | 2.04| None| 1.47|1.45 | 1.41 |77.89 | 76.86 | 离线量化 | +| [MobileNetV1_ssld](../../examples/vision/classification/paddleclas/quantize/) | ONNX Runtime | CPU | 34.02 | 12.97|None|None | 2.62 |77.89 | 75.09 |离线量化 | +| [MobileNetV1_ssld](../../examples/vision/classification/paddleclas/quantize/) | Paddle Inference | CPU | 16.31 | 7.42 | None|None| 2.20 |77.89 | 71.36 |离线量化 | + + + +### PaddleDetection系列 +#### Runtime Benchmark +| 模型 |推理后端 |部署硬件 | FP32 Runtime时延 | INT8 Runtime时延 | INT8 + FP16 Runtime时延 | INT8+FP16+PM Runtime时延 | 最大加速比 | FP32 mAP | INT8 mAP | 量化方式 | +| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |----- |----- | +| [ppyoloe_crn_l_300e_coco](../../examples/vision/detection/paddledetection/quantize ) | TensorRT | GPU | 27.90 | 6.39 |6.44|5.95 | 4.67 | 51.4 | 50.7 | 量化蒸馏训练 | +| [ppyoloe_crn_l_300e_coco](../../examples/vision/detection/paddledetection/quantize ) | Paddle-TensorRT | GPU | 30.89 |None | 13.78 |14.01 | 2.24 | 51.4 | 50.5| 量化蒸馏训练 | +| [ppyoloe_crn_l_300e_coco](../../examples/vision/detection/paddledetection/quantize) | ONNX Runtime | CPU | 1057.82 | 449.52 |None|None | 2.35 |51.4 | 50.0 |量化蒸馏训练 | + +NOTE: +- TensorRT比Paddle-TensorRT快的原因是在runtime移除了multiclass_nms3算子 + +#### 端到端 Benchmark +| 模型 |推理后端 |部署硬件 | FP32 End2End时延 | INT8 End2End时延 | INT8 + FP16 End2End时延 | INT8+FP16+PM End2End时延 | 最大加速比 | FP32 mAP | INT8 mAP | 量化方式 | +| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |----- |----- | +| [ppyoloe_crn_l_300e_coco](../../examples/vision/detection/paddledetection/quantize ) | TensorRT | GPU | 35.75 | 15.42 |20.70|20.85 | 2.32 | 51.4 | 50.7 | 量化蒸馏训练 | +| [ppyoloe_crn_l_300e_coco](../../examples/vision/detection/paddledetection/quantize ) | Paddle-TensorRT | GPU | 33.48 |None | 18.47 |18.03 | 1.81 | 51.4 | 50.5| 量化蒸馏训练 | +| [ppyoloe_crn_l_300e_coco](../../examples/vision/detection/paddledetection/quantize) | ONNX Runtime | CPU | 1067.17 | 461.037 |None|None | 2.31 |51.4 | 50.0 |量化蒸馏训练 | + + + +### PaddleSeg系列 +#### Runtime Benchmark +| 模型 |推理后端 |部署硬件 | FP32 Runtime时延 | INT8 Runtime时延 | INT8 + FP16 Runtime时延 | INT8+FP16+PM Runtime时延 | 最大加速比 | FP32 mIoU | INT8 mIoU | 量化方式 | +| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |----- |-----
| +| [PP-LiteSeg-T(STDC1)-cityscapes](../../examples/vision/segmentation/paddleseg/quantize) | Paddle Inference | CPU | 1138.04| 602.62 |None|None | 1.89 |77.37 | 71.62 |量化蒸馏训练 | + +#### 端到端 Benchmark +| 模型 |推理后端 |部署硬件 | FP32 End2End时延 | INT8 End2End时延 | INT8 + FP16 End2End时延 | INT8+FP16+PM End2End时延 | 最大加速比 | FP32 mIoU | INT8 mIoU | 量化方式 | +| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |----- |----- | +| [PP-LiteSeg-T(STDC1)-cityscapes](../../examples/vision/segmentation/paddleseg/quantize) | Paddle Inference | CPU | 4726.65| 4134.91|None|None | 1.14 |77.37 | 71.62 |量化蒸馏训练 | diff --git a/docs/en/quantize.md b/docs/en/quantize.md index eb626c6e6c..effce0700b 100644 --- a/docs/en/quantize.md +++ b/docs/en/quantize.md @@ -1,11 +1,79 @@ [English](../en/quantize.md) | 简体中文 # 量化加速 +量化是一种流行的模型压缩方法,量化后的模型拥有更小的体积和更快的推理速度. +FastDeploy基于PaddleSlim, 集成了一键模型量化的工具, 同时, FastDeploy支持部署量化后的模型, 帮助用户实现推理加速. -简要介绍量化加速的原理。 -目前量化支持在哪些硬件及后端的使用 +## FastDeploy 多个引擎和硬件支持量化模型部署 +当前,FastDeploy中多个推理后端可以在不同硬件上支持量化模型的部署. 支持情况如下: + +| 硬件/推理后端 | ONNX Runtime | Paddle Inference | TensorRT | +| :-----------| :-------- | :--------------- | :------- | +| CPU | 支持 | 支持 | | +| GPU | | | 支持 | + + +## 模型量化 + +### 量化方法 +基于PaddleSlim, 目前FastDeploy提供的量化方法有量化蒸馏训练和离线量化, 量化蒸馏训练通过模型训练来获得量化模型, 离线量化不需要模型训练即可完成模型的量化. FastDeploy 对两种方式产出的量化模型均能部署. + +两种方法的主要对比如下表所示: +| 量化方法 | 量化过程耗时 | 量化模型精度 | 模型体积 | 推理速度 | +| :-----------| :--------| :-------| :------- | :------- | +| 离线量化 | 无需训练,耗时短 | 比量化蒸馏训练稍低 | 两者一致 | 两者一致 | +| 量化蒸馏训练 | 需要训练,耗时稍高 | 较未量化模型有少量损失 | 两者一致 |两者一致 | + +### 使用FastDeploy一键模型量化工具来量化模型 +FastDeploy基于PaddleSlim, 为用户提供了一键模型量化的工具,请参考如下文档进行模型量化. +- [FastDeploy 一键模型量化](../../tools/auto_compression/) +当用户获得产出的量化模型之后,即可以使用FastDeploy来部署量化模型. + ## 量化示例 +目前, FastDeploy已支持的模型量化如下表所示: -这里一个表格,展示目前支持的量化列表(跳转到相应的example下去),精度、性能 +### YOLO 系列 +| 模型 |推理后端 |部署硬件 | FP32推理时延 | INT8推理时延 | 加速比 | FP32 mAP | INT8 mAP | 量化方式 | +| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- | +| [YOLOv5s](../../examples/vision/detection/yolov5/quantize/) | TensorRT | GPU | 8.79 | 5.17 | 1.70 | 37.6 | 36.6 | 量化蒸馏训练 | +| [YOLOv5s](../../examples/vision/detection/yolov5/quantize/) | ONNX Runtime | CPU | 176.34 | 92.95 | 1.90 | 37.6 | 33.1 |量化蒸馏训练 | +| [YOLOv5s](../../examples/vision/detection/yolov5/quantize/) | Paddle Inference | CPU | 217.05 | 133.31 | 1.63 |37.6 | 36.8 | 量化蒸馏训练 | +| [YOLOv6s](../../examples/vision/detection/yolov6/quantize/) | TensorRT | GPU | 8.60 | 5.16 | 1.67 | 42.5 | 40.6|量化蒸馏训练 | +| [YOLOv6s](../../examples/vision/detection/yolov6/quantize/) | ONNX Runtime | CPU | 338.60 | 128.58 | 2.60 |42.5| 36.1|量化蒸馏训练 | +| [YOLOv6s](../../examples/vision/detection/yolov6/quantize/) | Paddle Inference | CPU | 356.62 | 125.72 | 2.84 |42.5| 41.2|量化蒸馏训练 | +| [YOLOv7](../../examples/vision/detection/yolov7/quantize/) | TensorRT | GPU | 24.57 | 9.40 | 2.61 | 51.1| 50.8|量化蒸馏训练 | +| [YOLOv7](../../examples/vision/detection/yolov7/quantize/) | ONNX Runtime | CPU | 976.88 | 462.69 | 2.11 | 51.1 | 42.5|量化蒸馏训练 | +| [YOLOv7](../../examples/vision/detection/yolov7/quantize/) | Paddle Inference | CPU | 1022.55 | 490.87 | 2.08 |51.1 | 46.3|量化蒸馏训练 | + +上表中的数据, 为模型量化前后,在FastDeploy部署的Runtime推理性能. +- 测试数据为COCO2017验证集中的图片. +- 推理时延为在不同Runtime上推理的时延, 单位是毫秒. +- CPU为Intel(R) Xeon(R) Gold 6271C, GPU为Tesla T4, TensorRT版本8.4.15, 所有测试中固定CPU线程数为1.
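以上表中的ResNet50_vd离线量化模型为例, 在CPU上使用ONNX Runtime后端部署的最小Python示意如下(模型目录与文件名参考本文PaddleClas量化示例, 仅供示意):

```python
import cv2
import fastdeploy as fd

option = fd.RuntimeOption()
option.use_cpu()
option.use_ort_backend()  # 对应表中 ONNX Runtime + CPU 的组合

# 量化后的分类模型仍需FP32模型文件夹下的inference_cls.yaml配置文件
model = fd.vision.classification.PaddleClasModel(
    "resnet50_vd_ptq/model.pdmodel",
    "resnet50_vd_ptq/model.pdiparams",
    "resnet50_vd_ptq/inference_cls.yaml",
    runtime_option=option)

result = model.predict(cv2.imread("ILSVRC2012_val_00000010.jpeg"))
print(result)
```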
+ + +### PaddleDetection系列 +| 模型 |推理后端 |部署硬件 | FP32推理时延 | INT8推理时延 | 加速比 | FP32 mAP | INT8 mAP |量化方式 | +| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- | +| [ppyoloe_crn_l_300e_coco](../../examples/vision/detection/paddledetection/quantize ) | TensorRT | GPU | 24.52 | 11.53 | 2.13 | 51.4 | 50.7 | 量化蒸馏训练 | +| [ppyoloe_crn_l_300e_coco](../../examples/vision/detection/paddledetection/quantize) | ONNX Runtime | CPU | 1085.62 | 457.56 | 2.37 |51.4 | 50.0 |量化蒸馏训练 | + +上表中的数据, 为模型量化前后,在FastDeploy部署的Runtime推理性能. +- 测试图片为COCO val2017中的图片. +- 推理时延为在不同Runtime上推理的时延, 单位是毫秒. +- CPU为Intel(R) Xeon(R) Gold 6271C, GPU为Tesla T4, TensorRT版本8.4.15, 所有测试中固定CPU线程数为1. + + +### PaddleClas系列 +| 模型 |推理后端 |部署硬件 | FP32推理时延 | INT8推理时延 | 加速比 | FP32 Top1 | INT8 Top1 |量化方式 | +| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- | +| [ResNet50_vd](../../examples/vision/classification/paddleclas/quantize/) | ONNX Runtime | CPU | 77.20 | 40.08 | 1.93 | 79.12 | 78.87| 离线量化| +| [ResNet50_vd](../../examples/vision/classification/paddleclas/quantize/) | TensorRT | GPU | 3.70 | 1.80 | 2.06 | 79.12 | 79.06 | 离线量化 | +| [MobileNetV1_ssld](../../examples/vision/classification/paddleclas/quantize/) | ONNX Runtime | CPU | 30.99 | 10.24 | 3.03 |77.89 | 75.09 |离线量化 | +| [MobileNetV1_ssld](../../examples/vision/classification/paddleclas/quantize/) | TensorRT | GPU | 1.80 | 0.58 | 3.10 |77.89 | 76.86 | 离线量化 | + +上表中的数据, 为模型量化前后,在FastDeploy部署的Runtime推理性能. +- 测试数据为ImageNet-2012验证集中的图片. +- 推理时延为在不同Runtime上推理的时延, 单位是毫秒. +- CPU为Intel(R) Xeon(R) Gold 6271C, GPU为Tesla T4, TensorRT版本8.4.15, 所有测试中固定CPU线程数为1. diff --git a/examples/vision/classification/paddleclas/quantize/README.md b/examples/vision/classification/paddleclas/quantize/README.md index 3a100a823a..6e3f78b4d5 100644 --- a/examples/vision/classification/paddleclas/quantize/README.md +++ b/examples/vision/classification/paddleclas/quantize/README.md @@ -1,25 +1,48 @@ # PaddleClas 量化模型部署 -FastDeploy已支持部署量化模型,并提供一键模型量化的工具. -用户可以使用一键模型量化工具,自行对模型量化后部署, 也可以直接下载FastDeploy提供的量化模型进行部署. +FastDeploy已支持部署量化模型,并提供一键模型自动化压缩的工具. +用户可以使用一键模型自动化压缩工具,自行对模型量化后部署, 也可以直接下载FastDeploy提供的量化模型进行部署. -## FastDeploy一键模型量化工具 -FastDeploy 提供了一键量化工具, 能够简单地通过输入一个配置文件, 对模型进行量化. -详细教程请见: [一键模型量化工具](../../../../../tools/quantization/) +## FastDeploy一键模型自动化压缩工具 +FastDeploy 提供了一键模型自动化压缩工具, 能够简单地通过输入一个配置文件, 对模型进行量化. +详细教程请见: [一键模型自动化压缩工具](../../../../../tools/auto_compression/) 注意: 推理量化后的分类模型仍然需要FP32模型文件夹下的inference_cls.yaml文件, 自行量化的模型文件夹内不包含此yaml文件, 用户从FP32模型文件夹下复制此yaml文件到量化后的模型文件夹内即可。 ## 下载量化完成的PaddleClas模型 用户也可以直接下载下表中的量化模型进行部署. -| 模型 |推理后端 |部署硬件 | FP32推理时延 | INT8推理时延 | 加速比 | FP32 Top1 | INT8 Top1 |量化方式 | -| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- | -| [ResNet50_vd](https://bj.bcebos.com/paddlehub/fastdeploy/resnet50_vd_ptq.tar) | ONNX Runtime | CPU | 86.87 | 59 .32 | 1.46 | 79.12 | 78.87| 离线量化| -| [ResNet50_vd](https://bj.bcebos.com/paddlehub/fastdeploy/resnet50_vd_ptq.tar) | TensorRT | GPU | 7.85 | 5.42 | 1.45 | 79.12 | 79.06 | 离线量化 | -| [MobileNetV1_ssld](https://bj.bcebos.com/paddlehub/fastdeploy/mobilenetv1_ssld_ptq.tar) | ONNX Runtime | CPU | 40.32 | 16.87 | 2.39 |77.89 | 75.09 |离线量化 | -| [MobileNetV1_ssld](https://bj.bcebos.com/paddlehub/fastdeploy/mobilenetv1_ssld_ptq.tar) | TensorRT | GPU | 5.10 | 3.35 | 1.52 |77.89 | 76.86 | 离线量化 | -上表中的数据, 为模型量化前后,在FastDeploy部署的端到端推理性能. -- 测试图片为ImageNet-2012验证集中的图片. 
-- 推理时延为端到端推理(包含前后处理)的平均时延, 单位是毫秒. -- CPU为Intel(R) Xeon(R) Gold 6271C, GPU为Tesla T4, TensorRT版本8.4.15, 所有测试中固定CPU线程数为1. +Benchmark表格说明: +- Runtime时延为模型在各种Runtime上的推理时延,包含CPU->GPU数据拷贝,GPU推理,GPU->CPU数据拷贝时间. 不包含模型各自的前后处理时间. +- 端到端时延为模型在实际推理场景中的时延, 包含模型的前后处理. +- 所测时延均为推理1000次后求得的平均值, 单位是毫秒. +- INT8 + FP16 为在推理INT8量化模型的同时, 给Runtime开启FP16推理选项 +- INT8 + FP16 + PM, 为在推理INT8量化模型和开启FP16的同时, 开启使用Pinned Memory的选项,可加速GPU->CPU数据拷贝的速度 +- 最大加速比, 为FP32时延除以INT8推理的最快时延. +- 策略为量化蒸馏训练时, 采用少量无标签数据集训练得到量化模型, 并在全量验证集上验证精度, INT8精度并不代表最高的INT8精度. +- CPU为Intel(R) Xeon(R) Gold 6271C, 所有测试中固定CPU线程数为1. GPU为Tesla T4, TensorRT版本8.4.15. + +### Runtime Benchmark +| 模型 |推理后端 |部署硬件 | FP32 Runtime时延 | INT8 Runtime时延 | INT8 + FP16 Runtime时延 | INT8+FP16+PM Runtime时延 | 最大加速比 | FP32 Top1 | INT8 Top1 | 量化方式 | +| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |----- |----- | +| [ResNet50_vd](https://bj.bcebos.com/paddlehub/fastdeploy/resnet50_vd_ptq.tar) | TensorRT | GPU | 3.55 | 0.99|0.98|1.06 | 3.62 | 79.12 | 79.06 | 离线量化 | +| [ResNet50_vd](https://bj.bcebos.com/paddlehub/fastdeploy/resnet50_vd_ptq.tar) | Paddle-TensorRT | GPU | 3.46 |None |0.87|1.03 | 3.98 | 79.12 | 79.06 | 离线量化 | +| [ResNet50_vd](https://bj.bcebos.com/paddlehub/fastdeploy/resnet50_vd_ptq.tar) | ONNX Runtime | CPU | 76.14 | 35.43 |None|None | 2.15 | 79.12 | 78.87| 离线量化| +| [ResNet50_vd](https://bj.bcebos.com/paddlehub/fastdeploy/resnet50_vd_ptq.tar) | Paddle Inference | CPU | 76.21 | 24.01 |None|None | 3.17 | 79.12 | 78.55 | 离线量化| +| [MobileNetV1_ssld](https://bj.bcebos.com/paddlehub/fastdeploy/mobilenetv1_ssld_ptq.tar) | TensorRT | GPU | 0.91 | 0.43 |0.49 | 0.54 | 2.12 |77.89 | 76.86 | 离线量化 | +| [MobileNetV1_ssld](https://bj.bcebos.com/paddlehub/fastdeploy/mobilenetv1_ssld_ptq.tar) | Paddle-TensorRT | GPU | 0.88| None| 0.49|0.51 | 1.80 |77.89 | 76.86 | 离线量化 | +| [MobileNetV1_ssld](https://bj.bcebos.com/paddlehub/fastdeploy/mobilenetv1_ssld_ptq.tar) | ONNX Runtime | CPU | 30.53 | 9.59|None|None | 3.18 |77.89 | 75.09 |离线量化 | +| [MobileNetV1_ssld](https://bj.bcebos.com/paddlehub/fastdeploy/mobilenetv1_ssld_ptq.tar) | Paddle Inference | CPU | 12.29 | 4.68 | None|None|2.62 |77.89 | 71.36 |离线量化 | + +### 端到端 Benchmark +| 模型 |推理后端 |部署硬件 | FP32 End2End时延 | INT8 End2End时延 | INT8 + FP16 End2End时延 | INT8+FP16+PM End2End时延 | 最大加速比 | FP32 Top1 | INT8 Top1 | 量化方式 | +| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |----- |----- | +| [ResNet50_vd](https://bj.bcebos.com/paddlehub/fastdeploy/resnet50_vd_ptq.tar) | TensorRT | GPU | 4.92| 2.28|2.24|2.23 | 2.21 | 79.12 | 79.06 | 离线量化 | +| [ResNet50_vd](https://bj.bcebos.com/paddlehub/fastdeploy/resnet50_vd_ptq.tar) | Paddle-TensorRT | GPU | 4.48|None |2.09|2.10 | 2.14 | 79.12 | 79.06 | 离线量化 | +| [ResNet50_vd](https://bj.bcebos.com/paddlehub/fastdeploy/resnet50_vd_ptq.tar) | ONNX Runtime | CPU | 77.43 | 41.90 |None|None | 1.85 | 79.12 | 78.87| 离线量化| +| [ResNet50_vd](https://bj.bcebos.com/paddlehub/fastdeploy/resnet50_vd_ptq.tar) | Paddle Inference | CPU | 80.60 | 27.75 |None|None | 2.90 | 79.12 | 78.55 | 离线量化| +| [MobileNetV1_ssld](https://bj.bcebos.com/paddlehub/fastdeploy/mobilenetv1_ssld_ptq.tar) | TensorRT | GPU | 2.19 | 1.48|1.57| 1.57 | 1.48 |77.89 | 76.86 | 离线量化 | +| [MobileNetV1_ssld](https://bj.bcebos.com/paddlehub/fastdeploy/mobilenetv1_ssld_ptq.tar) | Paddle-TensorRT | GPU | 2.04| None| 1.47|1.45 | 1.41 |77.89 | 76.86 | 离线量化 | +| 
[MobileNetV1_ssld](https://bj.bcebos.com/paddlehub/fastdeploy/mobilenetv1_ssld_ptq.tar) | ONNX Runtime | CPU | 34.02 | 12.97|None|None | 2.62 |77.89 | 75.09 |离线量化 | +| [MobileNetV1_ssld](https://bj.bcebos.com/paddlehub/fastdeploy/mobilenetv1_ssld_ptq.tar) | Paddle Inference | CPU | 16.31 | 7.42 | None|None| 2.20 |77.89 | 71.36 |离线量化 | ## 详细部署文档 diff --git a/examples/vision/classification/paddleclas/quantize/cpp/README.md b/examples/vision/classification/paddleclas/quantize/cpp/README.md index 0f9bfc8f32..e2e625dbd5 100644 --- a/examples/vision/classification/paddleclas/quantize/cpp/README.md +++ b/examples/vision/classification/paddleclas/quantize/cpp/README.md @@ -1,4 +1,4 @@ -# PaddleClas 量化模型 Python部署示例 +# PaddleClas 量化模型 C++部署示例 本目录下提供的`infer.cc`,可以帮助用户快速完成PaddleClas量化模型在CPU/GPU上的部署推理加速. ## 部署准备 @@ -8,7 +8,7 @@ ### 量化模型准备 - 1. 用户可以直接使用由FastDeploy提供的量化模型进行部署. -- 2. 用户可以使用FastDeploy提供的[一键模型量化工具](../../../../../../tools/quantization/),自行进行模型量化, 并使用产出的量化模型进行部署.(注意: 推理量化后的分类模型仍然需要FP32模型文件夹下的inference_cls.yaml文件, 自行量化的模型文件夹内不包含此yaml文件, 用户从FP32模型文件夹下复制此yaml文件到量化后的模型文件夹内即可.) +- 2. 用户可以使用FastDeploy提供的[一键模型自动化压缩工具](../../../../../../tools/auto_compression/),自行进行模型量化, 并使用产出的量化模型进行部署.(注意: 推理量化后的分类模型仍然需要FP32模型文件夹下的inference_cls.yaml文件, 自行量化的模型文件夹内不包含此yaml文件, 用户从FP32模型文件夹下复制此yaml文件到量化后的模型文件夹内即可.) ## 以量化后的ResNet50_Vd模型为例, 进行部署 在本目录执行如下命令即可完成编译,以及量化模型部署. @@ -26,8 +26,10 @@ tar -xvf resnet50_vd_ptq.tar wget https://gitee.com/paddlepaddle/PaddleClas/raw/release/2.4/deploy/images/ImageNet/ILSVRC2012_val_00000010.jpeg -# 在CPU上使用Paddle-Inference推理量化模型 +# 在CPU上使用ONNX Runtime推理量化模型 ./infer_demo resnet50_vd_ptq ILSVRC2012_val_00000010.jpeg 0 # 在GPU上使用TensorRT推理量化模型 ./infer_demo resnet50_vd_ptq ILSVRC2012_val_00000010.jpeg 1 +# 在GPU上使用Paddle-TensorRT推理量化模型 +./infer_demo resnet50_vd_ptq ILSVRC2012_val_00000010.jpeg 2 ``` diff --git a/examples/vision/classification/paddleclas/quantize/cpp/infer.cc b/examples/vision/classification/paddleclas/quantize/cpp/infer.cc index ed4f05a243..b0a774feb4 100644 --- a/examples/vision/classification/paddleclas/quantize/cpp/infer.cc +++ b/examples/vision/classification/paddleclas/quantize/cpp/infer.cc @@ -21,8 +21,8 @@ const char sep = '/'; void InitAndInfer(const std::string& model_dir, const std::string& image_file, const fastdeploy::RuntimeOption& option) { - auto model_file = model_dir + sep + "inference.pdmodel"; - auto params_file = model_dir + sep + "inference.pdiparams"; + auto model_file = model_dir + sep + "model.pdmodel"; + auto params_file = model_dir + sep + "model.pdiparams"; auto config_file = model_dir + sep + "inference_cls.yaml"; auto model = fastdeploy::vision::classification::PaddleClasModel( @@ -67,7 +67,11 @@ int main(int argc, char* argv[]) { option.UseGpu(); option.UseTrtBackend(); option.SetTrtInputShape("inputs",{1, 3, 224, 224}); - } + } else if (flag == 2) { + option.UseGpu(); + option.UseTrtBackend(); + option.EnablePaddleToTrt(); + } std::string model_dir = argv[1]; std::string test_image = argv[2]; diff --git a/examples/vision/classification/paddleclas/quantize/cpp/ocr.sh b/examples/vision/classification/paddleclas/quantize/cpp/ocr.sh new file mode 100644 index 0000000000..90ad6a9e33 --- /dev/null +++ b/examples/vision/classification/paddleclas/quantize/cpp/ocr.sh @@ -0,0 +1,8 @@ +rm -rf build +mkdir build + +cd build + +cmake .. 
-DFASTDEPLOY_INSTALL_DIR=${PWD}/fastdeploy-linux-x64-0.4.0 + +make -j diff --git a/examples/vision/classification/paddleclas/quantize/python/README.md b/examples/vision/classification/paddleclas/quantize/python/README.md index 5da97d48a0..00fd7bef9c 100644 --- a/examples/vision/classification/paddleclas/quantize/python/README.md +++ b/examples/vision/classification/paddleclas/quantize/python/README.md @@ -8,7 +8,7 @@ ### 量化模型准备 - 1. 用户可以直接使用由FastDeploy提供的量化模型进行部署. -- 2. 用户可以使用FastDeploy提供的[一键模型量化工具](../../../../../../tools/quantization/),自行进行模型量化, 并使用产出的量化模型进行部署.(注意: 推理量化后的分类模型仍然需要FP32模型文件夹下的inference_cls.yaml文件, 自行量化的模型文件夹内不包含此yaml文件, 用户从FP32模型文件夹下复制此yaml文件到量化后的模型文件夹内即可.) +- 2. 用户可以使用FastDeploy提供的[一键模型自动化压缩工具](../../../../../../tools/auto_compression/),自行进行模型量化, 并使用产出的量化模型进行部署.(注意: 推理量化后的分类模型仍然需要FP32模型文件夹下的inference_cls.yaml文件, 自行量化的模型文件夹内不包含此yaml文件, 用户从FP32模型文件夹下复制此yaml文件到量化后的模型文件夹内即可.) ## 以量化后的ResNet50_Vd模型为例, 进行部署 @@ -22,8 +22,10 @@ wget https://bj.bcebos.com/paddlehub/fastdeploy/resnet50_vd_ptq.tar tar -xvf resnet50_vd_ptq.tar wget https://gitee.com/paddlepaddle/PaddleClas/raw/release/2.4/deploy/images/ImageNet/ILSVRC2012_val_00000010.jpeg -# 在CPU上使用Paddle-Inference推理量化模型 +# 在CPU上使用ONNX Runtime推理量化模型 python infer.py --model resnet50_vd_ptq --image ILSVRC2012_val_00000010.jpeg --device cpu --backend ort # 在GPU上使用TensorRT推理量化模型 python infer.py --model resnet50_vd_ptq --image ILSVRC2012_val_00000010.jpeg --device gpu --backend trt +# 在GPU上使用Paddle-TensorRT推理量化模型 +python infer.py --model resnet50_vd_ptq --image ILSVRC2012_val_00000010.jpeg --device gpu --backend pptrt ``` diff --git a/examples/vision/classification/paddleclas/quantize/python/infer.py b/examples/vision/classification/paddleclas/quantize/python/infer.py index 0a4df17680..e981744bdb 100644 --- a/examples/vision/classification/paddleclas/quantize/python/infer.py +++ b/examples/vision/classification/paddleclas/quantize/python/infer.py @@ -48,6 +48,11 @@ def build_option(args): ) == "gpu", "TensorRT backend require inferences on device GPU." option.use_trt_backend() option.set_trt_input_shape("inputs", min_shape=[1, 3, 224, 224]) + elif args.backend.lower() == "pptrt": + assert args.device.lower( + ) == "gpu", "TensorRT backend require inference on device GPU." + option.use_trt_backend() + option.enable_paddle_to_trt() elif args.backend.lower() == "ort": option.use_ort_backend() elif args.backend.lower() == "paddle": diff --git a/examples/vision/detection/paddledetection/quantize/README.md b/examples/vision/detection/paddledetection/quantize/README.md index f3e87e70d9..8c6f1feeef 100644 --- a/examples/vision/detection/paddledetection/quantize/README.md +++ b/examples/vision/detection/paddledetection/quantize/README.md @@ -1,22 +1,43 @@ # PaddleDetection 量化模型部署 -FastDeploy已支持部署量化模型,并提供一键模型量化的工具. -用户可以使用一键模型量化工具,自行对模型量化后部署, 也可以直接下载FastDeploy提供的量化模型进行部署. +FastDeploy已支持部署量化模型,并提供一键模型自动化压缩的工具. +用户可以使用一键模型自动化压缩工具,自行对模型量化后部署, 也可以直接下载FastDeploy提供的量化模型进行部署. -## FastDeploy一键模型量化工具 -FastDeploy 提供了一键量化工具, 能够简单地通过输入一个配置文件, 对模型进行量化. -详细教程请见: [一键模型量化工具](../../../../../tools/quantization/) +## FastDeploy一键模型自动化压缩工具 +FastDeploy 提供了一键模型自动化压缩工具, 能够简单地通过输入一个配置文件, 对模型进行量化. +详细教程请见: [一键模型自动化压缩工具](../../../../../tools/auto_compression/) ## 下载量化完成的PP-YOLOE-l模型 -用户也可以直接下载下表中的量化模型进行部署.
-| 模型 |推理后端 |部署硬件 | FP32推理时延 | INT8推理时延 | 加速比 | FP32 mAP | INT8 mAP |量化方式 | -| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- | -| [ppyoloe_crn_l_300e_coco](https://bj.bcebos.com/paddlehub/fastdeploy/ppyoloe_crn_l_300e_coco_qat.tar ) | TensorRT | GPU | 43.83 | 31.57 | 1.39 | 51.4 | 50.7 | 量化蒸馏训练 | -| [ppyoloe_crn_l_300e_coco](https://bj.bcebos.com/paddlehub/fastdeploy/ppyoloe_crn_l_300e_coco_qat.tar ) | ONNX Runtime | CPU | 1085.18 | 475.55 | 2.29 |51.4 | 50.0 |量化蒸馏训练 | +用户也可以直接下载下表中的量化模型进行部署.(点击模型名字即可下载) + + +Benchmark表格说明: +- Runtime时延为模型在各种Runtime上的推理时延,包含CPU->GPU数据拷贝,GPU推理,GPU->CPU数据拷贝时间. 不包含模型各自的前后处理时间. +- 端到端时延为模型在实际推理场景中的时延, 包含模型的前后处理. +- 所测时延均为推理1000次后求得的平均值, 单位是毫秒. +- INT8 + FP16 为在推理INT8量化模型的同时, 给Runtime开启FP16推理选项 +- INT8 + FP16 + PM, 为在推理INT8量化模型和开启FP16的同时, 开启使用Pinned Memory的选项,可加速GPU->CPU数据拷贝的速度 +- 最大加速比, 为FP32时延除以INT8推理的最快时延. +- 策略为量化蒸馏训练时, 采用少量无标签数据集训练得到量化模型, 并在全量验证集上验证精度, INT8精度并不代表最高的INT8精度. +- CPU为Intel(R) Xeon(R) Gold 6271C, 所有测试中固定CPU线程数为1. GPU为Tesla T4, TensorRT版本8.4.15. + + +#### Runtime Benchmark +| 模型 |推理后端 |部署硬件 | FP32 Runtime时延 | INT8 Runtime时延 | INT8 + FP16 Runtime时延 | INT8+FP16+PM Runtime时延 | 最大加速比 | FP32 mAP | INT8 mAP | 量化方式 | +| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |----- |----- | +| [ppyoloe_crn_l_300e_coco](https://bj.bcebos.com/paddlehub/fastdeploy/ppyoloe_crn_l_300e_coco_qat.tar ) | TensorRT | GPU | 27.90 | 6.39 |6.44|5.95 | 4.67 | 51.4 | 50.7 | 量化蒸馏训练 | +| [ppyoloe_crn_l_300e_coco](https://bj.bcebos.com/paddlehub/fastdeploy/ppyoloe_crn_l_300e_coco_qat.tar ) | Paddle-TensorRT | GPU | 30.89 |None | 13.78 |14.01 | 2.24 | 51.4 | 50.5 | 量化蒸馏训练 | +| [ppyoloe_crn_l_300e_coco](https://bj.bcebos.com/paddlehub/fastdeploy/ppyoloe_crn_l_300e_coco_qat.tar) | ONNX Runtime | CPU | 1057.82 | 449.52 |None|None | 2.35 |51.4 | 50.0 |量化蒸馏训练 | + +NOTE: +- TensorRT比Paddle-TensorRT快的原因是在runtime移除了multiclass_nms3算子 + +#### 端到端 Benchmark +| 模型 |推理后端 |部署硬件 | FP32 End2End时延 | INT8 End2End时延 | INT8 + FP16 End2End时延 | INT8+FP16+PM End2End时延 | 最大加速比 | FP32 mAP | INT8 mAP | 量化方式 | +| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |----- |----- | +| [ppyoloe_crn_l_300e_coco](https://bj.bcebos.com/paddlehub/fastdeploy/ppyoloe_crn_l_300e_coco_qat.tar ) | TensorRT | GPU | 35.75 | 15.42 |20.70|20.85 | 2.32 | 51.4 | 50.7 | 量化蒸馏训练 | +| [ppyoloe_crn_l_300e_coco](https://bj.bcebos.com/paddlehub/fastdeploy/ppyoloe_crn_l_300e_coco_qat.tar ) | Paddle-TensorRT | GPU | 33.48 |None | 18.47 |18.03 | 1.81 | 51.4 | 50.5 | 量化蒸馏训练 | +| [ppyoloe_crn_l_300e_coco](https://bj.bcebos.com/paddlehub/fastdeploy/ppyoloe_crn_l_300e_coco_qat.tar) | ONNX Runtime | CPU | 1067.17 | 461.037 |None|None | 2.31 |51.4 | 50.0 |量化蒸馏训练 | -上表中的数据, 为模型量化前后,在FastDeploy部署的端到端推理性能. -- 测试图片为COCO val2017中的图片. -- 推理时延为端到端推理(包含前后处理)的平均时延, 单位是毫秒. -- CPU为Intel(R) Xeon(R) Gold 6271C, GPU为Tesla T4, TensorRT版本8.4.15, 所有测试中固定CPU线程数为1. ## 详细部署文档 diff --git a/examples/vision/detection/paddledetection/quantize/cpp/README.md b/examples/vision/detection/paddledetection/quantize/cpp/README.md index 034957ffd5..42bf40acbe 100644 --- a/examples/vision/detection/paddledetection/quantize/cpp/README.md +++ b/examples/vision/detection/paddledetection/quantize/cpp/README.md @@ -9,7 +9,7 @@ ### 量化模型准备 - 1. 用户可以直接使用由FastDeploy提供的量化模型进行部署. -- 2.
用户可以使用FastDeploy提供的[一键模型量化工具](../../../../../../tools/quantization/),自行进行模型量化, 并使用产出的量化模型进行部署.(注意: 推理量化后的分类模型仍然需要FP32模型文件夹下的infer_cfg.yml文件, 自行量化的模型文件夹内不包含此yaml文件, 用户从FP32模型文件夹下复制此yaml文件到量化后的模型文件夹内即可.) +- 2. 用户可以使用FastDeploy提供的[一键模型自动化压缩工具](../../../../../../tools/auto_compression/),自行进行模型量化, 并使用产出的量化模型进行部署.(注意: 推理量化后的检测模型仍然需要FP32模型文件夹下的infer_cfg.yml文件, 自行量化的模型文件夹内不包含此yaml文件, 用户从FP32模型文件夹下复制此yaml文件到量化后的模型文件夹内即可.) ## 以量化后的PP-YOLOE-l模型为例, 进行部署 在本目录执行如下命令即可完成编译,以及量化模型部署. @@ -30,4 +30,6 @@ wget https://gitee.com/paddlepaddle/PaddleDetection/raw/release/2.4/demo/0000000 ./infer_ppyoloe_demo ppyoloe_crn_l_300e_coco_qat 000000014439.jpg 0 # 在GPU上使用TensorRT推理量化模型 ./infer_ppyoloe_demo ppyoloe_crn_l_300e_coco_qat 000000014439.jpg 1 +# 在GPU上使用Paddle-TensorRT推理量化模型 +./infer_ppyoloe_demo ppyoloe_crn_l_300e_coco_qat 000000014439.jpg 2 ``` diff --git a/examples/vision/detection/paddledetection/quantize/cpp/infer_ppyoloe.cc b/examples/vision/detection/paddledetection/quantize/cpp/infer_ppyoloe.cc index 9ed06b5756..4d2abd3fc0 100644 --- a/examples/vision/detection/paddledetection/quantize/cpp/infer_ppyoloe.cc +++ b/examples/vision/detection/paddledetection/quantize/cpp/infer_ppyoloe.cc @@ -71,7 +71,15 @@ int main(int argc, char* argv[]) { option.UseTrtBackend(); option.SetTrtInputShape("inputs",{1, 3, 640, 640}); option.SetTrtInputShape("scale_factor",{1,2}); + } else if (flag == 2) { + option.UseGpu(); + option.UseTrtBackend(); + option.EnablePaddleToTrt(); } + else if (flag == 3) { + option.UseCpu(); + option.UsePaddleBackend(); + } std::string model_dir = argv[1]; std::string test_image = argv[2]; diff --git a/examples/vision/detection/paddledetection/quantize/python/README.md b/examples/vision/detection/paddledetection/quantize/python/README.md index 9535df5c3e..cecb5a1401 100644 --- a/examples/vision/detection/paddledetection/quantize/python/README.md +++ b/examples/vision/detection/paddledetection/quantize/python/README.md @@ -8,7 +8,7 @@ ### 量化模型准备 - 1. 用户可以直接使用由FastDeploy提供的量化模型进行部署. -- 2. 用户可以使用FastDeploy提供的[一键模型量化工具](../../../../../../tools/quantization/),自行进行模型量化, 并使用产出的量化模型进行部署.(注意: 推理量化后的分类模型仍然需要FP32模型文件夹下的infer_cfg.yml文件, 自行量化的模型文件夹内不包含此yaml文件, 用户从FP32模型文件夹下复制此yaml文件到量化后的模型文件夹内即可.) +- 2. 用户可以使用FastDeploy提供的[一键模型自动化压缩工具](../../../../../../tools/auto_compression/),自行进行模型量化, 并使用产出的量化模型进行部署.(注意: 推理量化后的检测模型仍然需要FP32模型文件夹下的infer_cfg.yml文件, 自行量化的模型文件夹内不包含此yaml文件, 用户从FP32模型文件夹下复制此yaml文件到量化后的模型文件夹内即可. 使用TensorRT后端时所需的输入shape设置可参考下方示例.)
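PP-YOLOE模型有image和scale_factor两个输入, 使用TensorRT后端部署量化模型时需要分别设置输入shape, 下面是基于本目录infer_ppyoloe.py中用法的最小示意:

```python
import os
import fastdeploy as fd

option = fd.RuntimeOption()
option.use_gpu()
option.use_trt_backend()
# PP-YOLOE是多输入模型, TensorRT后端需要为每个输入设置shape
option.set_trt_input_shape("image", min_shape=[1, 3, 640, 640])
option.set_trt_input_shape("scale_factor", min_shape=[1, 2])
# 缓存构建好的TensorRT引擎, 避免每次运行重复构建
option.set_trt_cache_file(os.path.join("ppyoloe_crn_l_300e_coco_qat", "model.trt"))
```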
## 以量化后的PP-YOLOE-l模型为例, 进行部署 @@ -26,4 +26,6 @@ wget https://gitee.com/paddlepaddle/PaddleDetection/raw/release/2.4/demo/0000000 python infer_ppyoloe.py --model ppyoloe_crn_l_300e_coco_qat --image 000000014439.jpg --device cpu --backend ort # 在GPU上使用TensorRT推理量化模型 python infer_ppyoloe.py --model ppyoloe_crn_l_300e_coco_qat --image 000000014439.jpg --device gpu --backend trt +# 在GPU上使用Paddle-TensorRT推理量化模型 +python infer_ppyoloe.py --model ppyoloe_crn_l_300e_coco_qat --image 000000014439.jpg --device gpu --backend pptrt ``` diff --git a/examples/vision/detection/paddledetection/quantize/python/infer_ppyoloe.py b/examples/vision/detection/paddledetection/quantize/python/infer_ppyoloe.py index 85f3c9d551..59e602f6e6 100644 --- a/examples/vision/detection/paddledetection/quantize/python/infer_ppyoloe.py +++ b/examples/vision/detection/paddledetection/quantize/python/infer_ppyoloe.py @@ -49,6 +49,11 @@ def build_option(args): option.set_trt_cache_file(os.path.join(args.model, "model.trt")) option.set_trt_input_shape("image", min_shape=[1, 3, 640, 640]) option.set_trt_input_shape("scale_factor", min_shape=[1, 2]) + elif args.backend.lower() == "pptrt": + assert args.device.lower( + ) == "gpu", "TensorRT backend require inference on device GPU." + option.use_trt_backend() + option.enable_paddle_to_trt() elif args.backend.lower() == "ort": option.use_ort_backend() elif args.backend.lower() == "paddle": diff --git a/examples/vision/detection/yolov5/quantize/README.md b/examples/vision/detection/yolov5/quantize/README.md index 16dff9e84c..853718381f 100644 --- a/examples/vision/detection/yolov5/quantize/README.md +++ b/examples/vision/detection/yolov5/quantize/README.md @@ -1,22 +1,42 @@ # YOLOv5量化模型部署 -FastDeploy已支持部署量化模型,并提供一键模型量化的工具. -用户可以使用一键模型量化工具,自行对模型量化后部署, 也可以直接下载FastDeploy提供的量化模型进行部署. +FastDeploy已支持部署量化模型,并提供一键模型自动化压缩的工具. +用户可以使用一键模型自动化压缩工具,自行对模型量化后部署, 也可以直接下载FastDeploy提供的量化模型进行部署. -## FastDeploy一键模型量化工具 -FastDeploy 提供了一键量化工具, 能够简单地通过输入一个配置文件, 对模型进行量化. -详细教程请见: [一键模型量化工具](../../../../../tools/quantization/) +## FastDeploy一键模型自动化压缩工具 +FastDeploy 提供了一键模型自动化压缩工具, 能够简单地通过输入一个配置文件, 对模型进行量化. +详细教程请见: [一键模型自动化压缩工具](../../../../../tools/auto_compression/) ## 下载量化完成的YOLOv5s模型 -用户也可以直接下载下表中的量化模型进行部署. -| 模型 |推理后端 |部署硬件 | FP32推理时延 | INT8推理时延 | 加速比 | FP32 mAP | INT8 mAP |量化方式 | -| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- | -| [YOLOv5s](https://bj.bcebos.com/paddlehub/fastdeploy/yolov5s_quant.tar) | TensorRT | GPU | 14.13 | 11.22 | 1.26 | 37.6 | 36.6 | 量化蒸馏训练 | -| [YOLOv5s](https://bj.bcebos.com/paddlehub/fastdeploy/yolov5s_quant.tar) | Paddle Inference | CPU | 226.36 | 152.27 | 1.48 |37.6 | 36.8 |量化蒸馏训练 | +用户也可以直接下载下表中的量化模型进行部署.(点击模型名字即可下载) + +Benchmark表格说明: +- Runtime时延为模型在各种Runtime上的推理时延,包含CPU->GPU数据拷贝,GPU推理,GPU->CPU数据拷贝时间. 不包含模型各自的前后处理时间. +- 端到端时延为模型在实际推理场景中的时延, 包含模型的前后处理. +- 所测时延均为推理1000次后求得的平均值, 单位是毫秒. +- INT8 + FP16 为在推理INT8量化模型的同时, 给Runtime开启FP16推理选项 +- INT8 + FP16 + PM, 为在推理INT8量化模型和开启FP16的同时, 开启使用Pinned Memory的选项,可加速GPU->CPU数据拷贝的速度 +- 最大加速比, 为FP32时延除以INT8推理的最快时延. +- 策略为量化蒸馏训练时, 采用少量无标签数据集训练得到量化模型, 并在全量验证集上验证精度, INT8精度并不代表最高的INT8精度. +- CPU为Intel(R) Xeon(R) Gold 6271C, 所有测试中固定CPU线程数为1. GPU为Tesla T4, TensorRT版本8.4.15.
+ + +#### Runtime Benchmark +| 模型 |推理后端 |部署硬件 | FP32 Runtime时延 | INT8 Runtime时延 | INT8 + FP16 Runtime时延 | INT8+FP16+PM Runtime时延 | 最大加速比 | FP32 mAP | INT8 mAP | 量化方式 | +| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |----- |----- | +| [YOLOv5s](https://bj.bcebos.com/paddlehub/fastdeploy/yolov5s_quant.tar) | TensorRT | GPU | 7.87 | 4.51 | 4.31 | 3.17 | 2.48 | 37.6 | 36.7 | 量化蒸馏训练 | +| [YOLOv5s](https://bj.bcebos.com/paddlehub/fastdeploy/yolov5s_quant.tar) | Paddle-TensorRT | GPU | 7.99 | None | 4.46 | 3.31 | 2.41 | 37.6 | 36.8 | 量化蒸馏训练 | +| [YOLOv5s](https://bj.bcebos.com/paddlehub/fastdeploy/yolov5s_quant.tar) | ONNX Runtime | CPU | 176.41 | 91.90 | None | None | 1.90 | 37.6 | 33.1 |量化蒸馏训练 | +| [YOLOv5s](https://bj.bcebos.com/paddlehub/fastdeploy/yolov5s_quant.tar) | Paddle Inference| CPU | 213.73 | 130.19 | None | None | 1.64 |37.6 | 35.2 | 量化蒸馏训练 | + +#### 端到端 Benchmark +| 模型 |推理后端 |部署硬件 | FP32 End2End时延 | INT8 End2End时延 | INT8 + FP16 End2End时延 | INT8+FP16+PM End2End时延 | 最大加速比 | FP32 mAP | INT8 mAP | 量化方式 | +| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |----- |----- | +| [YOLOv5s](https://bj.bcebos.com/paddlehub/fastdeploy/yolov5s_quant.tar) | TensorRT | GPU | 24.61 | 21.20 | 20.78 | 20.94 | 1.18 | 37.6 | 36.7 | 量化蒸馏训练 | +| [YOLOv5s](https://bj.bcebos.com/paddlehub/fastdeploy/yolov5s_quant.tar) | Paddle-TensorRT | GPU | 23.53 | None | 21.98 | 19.84 | 1.28 | 37.6 | 36.8 | 量化蒸馏训练 | +| [YOLOv5s](https://bj.bcebos.com/paddlehub/fastdeploy/yolov5s_quant.tar) | ONNX Runtime | CPU | 197.323 | 110.99 | None | None | 1.78 | 37.6 | 33.1 |量化蒸馏训练 | +| [YOLOv5s](https://bj.bcebos.com/paddlehub/fastdeploy/yolov5s_quant.tar) | Paddle Inference| CPU | 235.73 | 144.82 | None | None | 1.63 |37.6 | 35.2 | 量化蒸馏训练 | + -上表中的数据, 为模型量化前后,在FastDeploy部署的端到端推理性能. -- 测试图片为COCO val2017中的图片. -- 推理时延为端到端推理(包含前后处理)的平均时延, 单位是毫秒. -- CPU为Intel(R) Xeon(R) Gold 6271C, GPU为Tesla T4, TensorRT版本8.4.15, 所有测试中固定CPU线程数为1. ## 详细部署文档 diff --git a/examples/vision/detection/yolov5/quantize/cpp/README.md b/examples/vision/detection/yolov5/quantize/cpp/README.md index 21f351a0e3..7d76bad514 100644 --- a/examples/vision/detection/yolov5/quantize/cpp/README.md +++ b/examples/vision/detection/yolov5/quantize/cpp/README.md @@ -9,7 +9,7 @@ ### 量化模型准备 - 1. 用户可以直接使用由FastDeploy提供的量化模型进行部署. -- 2. 用户可以使用FastDeploy提供的[一键模型量化工具](../../../../../../tools/quantization/),自行进行模型量化, 并使用产出的量化模型进行部署. +- 2. 用户可以使用FastDeploy提供的[一键模型自动化压缩工具](../../../../../../tools/auto_compression/),自行进行模型量化, 并使用产出的量化模型进行部署. ## 以量化后的YOLOv5s模型为例, 进行部署 在本目录执行如下命令即可完成编译,以及量化模型部署.
@@ -31,4 +31,6 @@ wget https://gitee.com/paddlepaddle/PaddleDetection/raw/release/2.4/demo/0000000 ./infer_demo yolov5s_quant 000000014439.jpg 0 # 在GPU上使用TensorRT推理量化模型 ./infer_demo yolov5s_quant 000000014439.jpg 1 +# 在GPU上使用Paddle-TensorRT推理量化模型 +./infer_demo yolov5s_quant 000000014439.jpg 2 ``` diff --git a/examples/vision/detection/yolov5/quantize/cpp/infer.cc b/examples/vision/detection/yolov5/quantize/cpp/infer.cc index 88a9e15fc1..54e9d6dc17 100644 --- a/examples/vision/detection/yolov5/quantize/cpp/infer.cc +++ b/examples/vision/detection/yolov5/quantize/cpp/infer.cc @@ -68,7 +68,11 @@ int main(int argc, char* argv[]) { } else if (flag == 1) { option.UseGpu(); option.UseTrtBackend(); - } + } else if (flag == 2) { + option.UseGpu(); + option.UseTrtBackend(); + option.EnablePaddleToTrt(); + } std::string model_dir = argv[1]; std::string test_image = argv[2]; diff --git a/examples/vision/detection/yolov5/quantize/python/README.md b/examples/vision/detection/yolov5/quantize/python/README.md index 00c92dc842..9aa03a8cc0 100644 --- a/examples/vision/detection/yolov5/quantize/python/README.md +++ b/examples/vision/detection/yolov5/quantize/python/README.md @@ -8,7 +8,7 @@ ### 量化模型准备 - 1. 用户可以直接使用由FastDeploy提供的量化模型进行部署. -- 2. 用户可以使用FastDeploy提供的[一键模型量化工具](../../../../../../tools/quantization/),自行进行模型量化, 并使用产出的量化模型进行部署. +- 2. 用户可以使用FastDeploy提供的[一键模型自动化压缩工具](../../../../../../tools/auto_compression/),自行进行模型量化, 并使用产出的量化模型进行部署. ## 以量化后的YOLOv5s模型为例, 进行部署 @@ -26,4 +26,6 @@ wget https://gitee.com/paddlepaddle/PaddleDetection/raw/release/2.4/demo/0000000 python infer.py --model yolov5s_quant --image 000000014439.jpg --device cpu --backend paddle # 在GPU上使用TensorRT推理量化模型 python infer.py --model yolov5s_quant --image 000000014439.jpg --device gpu --backend trt +# 在GPU上使用Paddle-TensorRT推理量化模型 +python infer.py --model yolov5s_quant --image 000000014439.jpg --device gpu --backend pptrt ``` diff --git a/examples/vision/detection/yolov5/quantize/python/infer.py b/examples/vision/detection/yolov5/quantize/python/infer.py index aa56ef18bf..2e420c3605 100644 --- a/examples/vision/detection/yolov5/quantize/python/infer.py +++ b/examples/vision/detection/yolov5/quantize/python/infer.py @@ -47,6 +47,11 @@ def build_option(args): assert args.device.lower( ) == "gpu", "TensorRT backend require inference on device GPU." option.use_trt_backend() + elif args.backend.lower() == "pptrt": + assert args.device.lower( + ) == "gpu", "TensorRT backend require inference on device GPU." + option.use_trt_backend() + option.enable_paddle_to_trt() elif args.backend.lower() == "ort": option.use_ort_backend() elif args.backend.lower() == "paddle": diff --git a/examples/vision/detection/yolov6/quantize/README.md b/examples/vision/detection/yolov6/quantize/README.md index 594d59e5c6..04af3f6896 100644 --- a/examples/vision/detection/yolov6/quantize/README.md +++ b/examples/vision/detection/yolov6/quantize/README.md @@ -1,23 +1,42 @@ # YOLOv6量化模型部署 -FastDeploy已支持部署量化模型,并提供一键模型量化的工具. -用户可以使用一键模型量化工具,自行对模型量化后部署, 也可以直接下载FastDeploy提供的量化模型进行部署. - -## FastDeploy一键模型量化工具 -FastDeploy 提供了一键量化工具, 能够简单地通过输入一个配置文件, 对模型进行量化. -详细教程请见: [一键模型量化工具](../../../../../tools/quantization/) +FastDeploy已支持部署量化模型,并提供一键模型自动化压缩的工具. +用户可以使用一键模型自动化压缩工具,自行对模型量化后部署, 也可以直接下载FastDeploy提供的量化模型进行部署. +## FastDeploy一键模型自动化压缩工具 +FastDeploy 提供了一键模型自动化压缩工具, 能够简单地通过输入一个配置文件, 对模型进行量化. +详细教程请见: [一键模型自动化压缩工具](../../../../../tools/auto_compression/) ## 下载量化完成的YOLOv6s模型 -用户也可以直接下载下表中的量化模型进行部署.
+用户也可以直接下载下表中的量化模型进行部署.(点击模型名字即可下载) + +Benchmark表格说明: +- Runtime时延为模型在各种Runtime上的推理时延,包含CPU->GPU数据拷贝,GPU推理,GPU->CPU数据拷贝时间. 不包含模型各自的前后处理时间. +- 端到端时延为模型在实际推理场景中的时延, 包含模型的前后处理. +- 所测时延均为推理1000次后求得的平均值, 单位是毫秒. +- INT8 + FP16 为在推理INT8量化模型的同时, 给Runtime开启FP16推理选项 +- INT8 + FP16 + PM, 为在推理INT8量化模型和开启FP16的同时, 开启使用Pinned Memory的选项,可加速GPU->CPU数据拷贝的速度 +- 最大加速比, 为FP32时延除以INT8推理的最快时延. +- 策略为量化蒸馏训练时, 采用少量无标签数据集训练得到量化模型, 并在全量验证集上验证精度, INT8精度并不代表最高的INT8精度. +- CPU为Intel(R) Xeon(R) Gold 6271C, 所有测试中固定CPU线程数为1. GPU为Tesla T4, TensorRT版本8.4.15. + +#### Runtime Benchmark +| 模型 |推理后端 |部署硬件 | FP32 Runtime时延 | INT8 Runtime时延 | INT8 + FP16 Runtime时延 | INT8+FP16+PM Runtime时延 | 最大加速比 | FP32 mAP | INT8 mAP | 量化方式 | +| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |----- |----- | +| [YOLOv6s](https://bj.bcebos.com/paddlehub/fastdeploy/yolov6s_ptq_model.tar) | TensorRT | GPU | 9.47 | 3.23 | 4.09 |2.81 | 3.37 | 42.5 | 40.7|量化蒸馏训练 | +| [YOLOv6s](https://bj.bcebos.com/paddlehub/fastdeploy/yolov6s_ptq_model.tar) | Paddle-TensorRT | GPU | 9.31 | None| 4.17 | 2.95 | 3.16 | 42.5 | 40.7|量化蒸馏训练 | +| [YOLOv6s](https://bj.bcebos.com/paddlehub/fastdeploy/yolov6s_ptq_model.tar) | ONNX Runtime | CPU | 334.65 | 126.38 | None | None| 2.65 |42.5| 36.8|量化蒸馏训练 | +| [YOLOv6s](https://bj.bcebos.com/paddlehub/fastdeploy/yolov6s_ptq_model.tar) | Paddle Inference | CPU | 352.87 | 123.12 |None | None| 2.87 |42.5| 40.8|量化蒸馏训练 | + + +#### 端到端 Benchmark +| 模型 |推理后端 |部署硬件 | FP32 End2End时延 | INT8 End2End时延 | INT8 + FP16 End2End时延 | INT8+FP16+PM End2End时延 | 最大加速比 | FP32 mAP | INT8 mAP | 量化方式 | +| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |----- |----- | +| [YOLOv6s](https://bj.bcebos.com/paddlehub/fastdeploy/yolov6s_ptq_model.tar) | TensorRT | GPU | 15.66 | 11.30 | 10.25 |9.59 | 1.63 | 42.5 | 40.7|量化蒸馏训练 | +| [YOLOv6s](https://bj.bcebos.com/paddlehub/fastdeploy/yolov6s_ptq_model.tar) | Paddle-TensorRT | GPU | 15.03 | None| 11.36 | 9.32 | 1.61 | 42.5 | 40.7|量化蒸馏训练 | +| [YOLOv6s](https://bj.bcebos.com/paddlehub/fastdeploy/yolov6s_ptq_model.tar) | ONNX Runtime | CPU | 348.21 | 126.38 | None | None| 2.82 |42.5| 36.8|量化蒸馏训练 | +| [YOLOv6s](https://bj.bcebos.com/paddlehub/fastdeploy/yolov6s_ptq_model.tar) | Paddle Inference | CPU | 352.87 | 121.64 |None | None| 3.04 |42.5| 40.8|量化蒸馏训练 | + -| 模型 |推理后端 |部署硬件 | FP32推理时延 | INT8推理时延 | 加速比 | FP32 mAP | INT8 mAP | 量化方式 | -| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- | ------ | -| [YOLOv6s](https://bj.bcebos.com/paddlehub/fastdeploy/yolov6s_quant.tar) | TensorRT | GPU | 12.89 | 8.92 | 1.45 | 42.5 | 40.6| 量化蒸馏训练 | -| [YOLOv6s](https://bj.bcebos.com/paddlehub/fastdeploy/yolov6s_quant.tar) | Paddle Inference | CPU | 366.41 | 131.70 | 2.78 |42.5| 41.2|量化蒸馏训练 | -上表中的数据, 为模型量化前后,在FastDeploy部署的端到端推理性能. -- 测试图片为COCO val2017中的图片. -- 推理时延为端到端推理(包含前后处理)的平均时延, 单位是毫秒. -- CPU为Intel(R) Xeon(R) Gold 6271C, GPU为Tesla T4, TensorRT版本8.4.15, 所有测试中固定CPU线程数为1. ## 详细部署文档 diff --git a/examples/vision/detection/yolov6/quantize/cpp/README.md b/examples/vision/detection/yolov6/quantize/cpp/README.md index 14a2a94e72..bf2208fab7 100644 --- a/examples/vision/detection/yolov6/quantize/cpp/README.md +++ b/examples/vision/detection/yolov6/quantize/cpp/README.md @@ -9,7 +9,7 @@ ### 量化模型准备 - 1. 用户可以直接使用由FastDeploy提供的量化模型进行部署. -- 2. 用户可以使用FastDeploy提供的[一键模型量化工具](../../../../../../tools/quantization/),自行进行模型量化, 并使用产出的量化模型进行部署. +- 2.
用户可以使用FastDeploy提供的[一键模型自动化压缩工具](../../../../../../tools/auto_compression/),自行进行模型量化, 并使用产出的量化模型进行部署. ## 以量化后的YOLOv6s模型为例, 进行部署 在本目录执行如下命令即可完成编译,以及量化模型部署. @@ -22,13 +22,15 @@ cmake .. -DFASTDEPLOY_INSTALL_DIR=${PWD}/fastdeploy-linux-x64-0.4.0 make -j #下载FastDeploy提供的yolov6s量化模型文件和测试图片 -wget https://bj.bcebos.com/paddlehub/fastdeploy/yolov6s_quant.tar -tar -xvf yolov6s_quant.tar +wget https://bj.bcebos.com/paddlehub/fastdeploy/yolov6s_qat_model.tar +tar -xvf yolov6s_qat_model.tar wget https://gitee.com/paddlepaddle/PaddleDetection/raw/release/2.4/demo/000000014439.jpg # 在CPU上使用Paddle-Inference推理量化模型 -./infer_demo yolov6s_quant 000000014439.jpg 0 +./infer_demo yolov6s_qat_model 000000014439.jpg 0 # 在GPU上使用TensorRT推理量化模型 -./infer_demo yolov6s_quant 000000014439.jpg 1 +./infer_demo yolov6s_qat_model 000000014439.jpg 1 +# 在GPU上使用Paddle-TensorRT推理量化模型 +./infer_demo yolov6s_qat_model 000000014439.jpg 2 ``` diff --git a/examples/vision/detection/yolov6/quantize/cpp/infer.cc b/examples/vision/detection/yolov6/quantize/cpp/infer.cc index f7a9d2c165..64f4d9f22b 100644 --- a/examples/vision/detection/yolov6/quantize/cpp/infer.cc +++ b/examples/vision/detection/yolov6/quantize/cpp/infer.cc @@ -68,7 +68,11 @@ int main(int argc, char* argv[]) { } else if (flag == 1) { option.UseGpu(); option.UseTrtBackend(); - } + } else if (flag == 2) { + option.UseGpu(); + option.UseTrtBackend(); + option.EnablePaddleToTrt(); + } std::string model_dir = argv[1]; std::string test_image = argv[2]; diff --git a/examples/vision/detection/yolov6/quantize/python/README.md b/examples/vision/detection/yolov6/quantize/python/README.md index 03208f46d5..5f70a02c84 100644 --- a/examples/vision/detection/yolov6/quantize/python/README.md +++ b/examples/vision/detection/yolov6/quantize/python/README.md @@ -8,7 +8,7 @@ ### 量化模型准备 - 1. 用户可以直接使用由FastDeploy提供的量化模型进行部署. -- 2. 用户可以使用FastDeploy提供的[一键模型量化工具](../../../../../../tools/quantization/),自行进行模型量化, 并使用产出的量化模型进行部署. +- 2. 用户可以使用FastDeploy提供的[一键模型自动化压缩工具](../../../../../../tools/auto_compression/),自行进行模型量化, 并使用产出的量化模型进行部署. ## 以量化后的YOLOv6s模型为例, 进行部署 ```bash # 下载部署示例代码 git clone https://github.com/PaddlePaddle/FastDeploy.git cd examples/slim/yolov6/python #下载FastDeploy提供的yolov6s量化模型文件和测试图片 -wget https://bj.bcebos.com/paddlehub/fastdeploy/yolov6s_quant.tar -tar -xvf yolov6s_quant.tar +wget https://bj.bcebos.com/paddlehub/fastdeploy/yolov6s_qat_model.tar +tar -xvf yolov6s_qat_model.tar wget https://gitee.com/paddlepaddle/PaddleDetection/raw/release/2.4/demo/000000014439.jpg # 在CPU上使用Paddle-Inference推理量化模型 -python infer.py --model yolov6s_quant --image 000000014439.jpg --device cpu --backend paddle +python infer.py --model yolov6s_qat_model --image 000000014439.jpg --device cpu --backend paddle # 在GPU上使用TensorRT推理量化模型 -python infer.py --model yolov6s_quant --image 000000014439.jpg --device gpu --backend trt +python infer.py --model yolov6s_qat_model --image 000000014439.jpg --device gpu --backend trt +# 在GPU上使用Paddle-TensorRT推理量化模型 +python infer.py --model yolov6s_qat_model --image 000000014439.jpg --device gpu --backend pptrt ``` diff --git a/examples/vision/detection/yolov6/quantize/python/infer.py b/examples/vision/detection/yolov6/quantize/python/infer.py index ec06022724..d34c7cd597 100644 --- a/examples/vision/detection/yolov6/quantize/python/infer.py +++ b/examples/vision/detection/yolov6/quantize/python/infer.py @@ -47,6 +47,11 @@ def build_option(args): assert args.device.lower( ) == "gpu", "TensorRT backend requires inference on device GPU." 
option.use_trt_backend() + elif args.backend.lower() == "pptrt": + assert args.device.lower( + ) == "gpu", "TensorRT backend requires inference on device GPU." + option.use_trt_backend() + option.enable_paddle_to_trt() elif args.backend.lower() == "ort": option.use_ort_backend() elif args.backend.lower() == "paddle": diff --git a/examples/vision/detection/yolov7/quantize/README.md b/examples/vision/detection/yolov7/quantize/README.md index 6d29ea3f36..5795325680 100644 --- a/examples/vision/detection/yolov7/quantize/README.md +++ b/examples/vision/detection/yolov7/quantize/README.md @@ -1,23 +1,40 @@ # YOLOv7量化模型部署 -FastDeploy已支持部署量化模型,并提供一键模型量化的工具. -用户可以使用一键模型量化工具,自行对模型量化后部署, 也可以直接下载FastDeploy提供的量化模型进行部署. +FastDeploy已支持部署量化模型,并提供一键模型自动化压缩的工具. +用户可以使用一键模型自动化压缩工具,自行对模型量化后部署, 也可以直接下载FastDeploy提供的量化模型进行部署. -## FastDeploy一键模型量化工具 -FastDeploy 提供了一键量化工具, 能够简单地通过输入一个配置文件, 对模型进行量化. -详细教程请见: [一键模型量化工具](../../../../../tools/quantization/) +## FastDeploy一键模型自动化压缩工具 +FastDeploy 提供了一键模型自动化压缩工具, 能够简单地通过输入一个配置文件, 对模型进行量化. +详细教程请见: [一键模型自动化压缩工具](../../../../../tools/auto_compression/) ## 下载量化完成的YOLOv7模型 -用户也可以直接下载下表中的量化模型进行部署. +用户也可以直接下载下表中的量化模型进行部署.(点击模型名字即可下载) -| 模型 |推理后端 |部署硬件 | FP32推理时延 | INT8推理时延 | 加速比 | FP32 mAP | INT8 mAP | 量化方式 | -| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- | -| [YOLOv7](https://bj.bcebos.com/paddlehub/fastdeploy/yolov7_quant.tar) | TensorRT | GPU | 30.43 | 15.40 | 1.98 | 51.1| 50.8| 量化蒸馏训练 | -| [YOLOv7](https://bj.bcebos.com/paddlehub/fastdeploy/yolov7_quant.tar) | Paddle Inference | CPU | 1015.70 | 562.41 | 1.82 |51.1 | 46.3| 量化蒸馏训练 | -上表中的数据, 为模型量化前后,在FastDeploy部署的端到端推理性能. -- 测试图片为COCO val2017中的图片. -- 推理时延为端到端推理(包含前后处理)的平均时延, 单位是毫秒. -- CPU为Intel(R) Xeon(R) Gold 6271C, GPU为Tesla T4, TensorRT版本8.4.15, 所有测试中固定CPU线程数为1. +Benchmark表格说明: +- Runtime时延为模型在各种Runtime上的推理时延,包含CPU->GPU数据拷贝,GPU推理,GPU->CPU数据拷贝时间. 不包含模型各自的前后处理时间. +- 端到端时延为模型在实际推理场景中的时延, 包含模型的前后处理. +- 所测时延均为推理1000次后求得的平均值, 单位是毫秒. +- INT8 + FP16 为在推理INT8量化模型的同时, 给Runtime 开启FP16推理选项 +- INT8 + FP16 + PM, 为在推理INT8量化模型和开启FP16的同时, 开启使用Pinned Memory的选项,可加速GPU->CPU数据拷贝的速度 +- 最大加速比, 为FP32时延除以INT8推理的最快时延,得到最大加速比. +- 策略为量化蒸馏训练时, 采用少量无标签数据集训练得到量化模型, 并在全量验证集上验证精度, INT8精度并不代表最高的INT8精度. +- CPU为Intel(R) Xeon(R) Gold 6271C, 所有测试中固定CPU线程数为1. GPU为Tesla T4, TensorRT版本8.4.15. 
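(Editor's note: the "INT8 + FP16 + PM" combination described in the notes above corresponds roughly to the RuntimeOption sketch below. `use_gpu`/`use_trt_backend`/`enable_trt_fp16` all appear in the examples in this patch; `enable_pinned_memory` is an assumption and should be checked against the installed FastDeploy version.)

```python
# Sketch of the "INT8 + FP16 + PM" benchmark settings; not part of this patch.
import fastdeploy as fd

option = fd.RuntimeOption()
option.use_gpu(0)              # the benchmarks above ran on a Tesla T4
option.use_trt_backend()       # INT8 kernels come from the quantized model itself
option.enable_trt_fp16()       # "+ FP16": allow FP16 for layers left in floating point
option.enable_pinned_memory()  # "+ PM": assumed API; pinned host memory to speed up
                               # GPU->CPU result copies
```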
+ +#### Runtime Benchmark +| 模型 |推理后端 |部署硬件 | FP32 Runtime时延 | INT8 Runtime时延 | INT8 + FP16 Runtime时延 | INT8+FP16+PM Runtime时延 | 最大加速比 | FP32 mAP | INT8 mAP | 量化方式 | +| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |----- |----- | +| [YOLOv7](https://bj.bcebos.com/paddlehub/fastdeploy/yolov7_quant.tar) | TensorRT | GPU | 27.47 | 6.52 | 6.74| 5.19| 5.29 | 51.1| 50.4|量化蒸馏训练 | +| [YOLOv7](https://bj.bcebos.com/paddlehub/fastdeploy/yolov7_quant.tar) | Paddle-TensorRT | GPU | 27.87|None|6.91|5.86 | 4.76 | 51.1| 50.4|量化蒸馏训练 | +| [YOLOv7](https://bj.bcebos.com/paddlehub/fastdeploy/yolov7_quant.tar) | ONNX Runtime | CPU | 996.65 | 467.15 |None|None | 2.13 | 51.1 | 43.3|量化蒸馏训练 | +| [YOLOv7](https://bj.bcebos.com/paddlehub/fastdeploy/yolov7_quant.tar) | Paddle Inference | CPU | 995.85 | 477.93|None|None | 2.08 |51.1 | 46.2|量化蒸馏训练 | + +#### 端到端 Benchmark +| 模型 |推理后端 |部署硬件 | FP32 端到端时延 | INT8 端到端时延 | INT8 + FP16 端到端时延 | INT8+FP16+PM 端到端时延 | 最大加速比 | FP32 mAP | INT8 mAP | 量化方式 | +| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |----- |----- | +| [YOLOv7](https://bj.bcebos.com/paddlehub/fastdeploy/yolov7_quant.tar) | TensorRT | GPU | 36.47 | 18.81 | 20.33| 17.58| 2.07 | 51.1| 50.4|量化蒸馏训练 | +| [YOLOv7](https://bj.bcebos.com/paddlehub/fastdeploy/yolov7_quant.tar) | Paddle-TensorRT | GPU | 37.06|None|20.26|17.53 | 2.11 | 51.1| 50.4|量化蒸馏训练 | +| [YOLOv7](https://bj.bcebos.com/paddlehub/fastdeploy/yolov7_quant.tar) | ONNX Runtime | CPU | 988.85 | 478.08 |None|None | 2.07 | 51.1 | 43.3|量化蒸馏训练 | +| [YOLOv7](https://bj.bcebos.com/paddlehub/fastdeploy/yolov7_quant.tar) | Paddle Inference | CPU | 1031.73 | 500.12|None|None | 2.06 |51.1 | 46.2|量化蒸馏训练 | ## 详细部署文档 diff --git a/examples/vision/detection/yolov7/quantize/cpp/README.md b/examples/vision/detection/yolov7/quantize/cpp/README.md index 705edda0e6..53110591e0 100644 --- a/examples/vision/detection/yolov7/quantize/cpp/README.md +++ b/examples/vision/detection/yolov7/quantize/cpp/README.md @@ -9,7 +9,7 @@ ### 量化模型准备 - 1. 用户可以直接使用由FastDeploy提供的量化模型进行部署. -- 2. 用户可以使用FastDeploy提供的[一键模型量化工具](../../../../../../tools/quantization/),自行进行模型量化, 并使用产出的量化模型进行部署. +- 2. 用户可以使用FastDeploy提供的[一键模型自动化压缩工具](../../../../../../tools/auto_compression/),自行进行模型量化, 并使用产出的量化模型进行部署. ## 以量化后的YOLOv7模型为例, 进行部署 在本目录执行如下命令即可完成编译,以及量化模型部署. 
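(Editor's note: run option 2 added by the hunk below, like the `--backend pptrt` flag in the Python demos, selects the Paddle-TensorRT path. A minimal Python sketch of what that flag configures follows; the option calls appear verbatim in this patch, while the model construction and the `model.pdmodel`/`model.pdiparams` file names inside the downloaded folder are assumptions for illustration.)

```python
# Minimal sketch of the Paddle-TensorRT ("pptrt") path wired into these demos.
import fastdeploy as fd

option = fd.RuntimeOption()
option.use_gpu()               # Paddle-TRT requires a GPU device
option.use_trt_backend()
option.enable_paddle_to_trt()  # run TensorRT through Paddle Inference's TRT
                               # subgraph engine instead of the native TRT backend

# Hypothetical usage with the quantized YOLOv7 model downloaded above:
model = fd.vision.detection.YOLOv7(
    "yolov7_quant/model.pdmodel",    # assumed file names inside the tarball
    "yolov7_quant/model.pdiparams",
    runtime_option=option,
    model_format=fd.ModelFormat.PADDLE)
```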
@@ -31,4 +31,6 @@ wget https://gitee.com/paddlepaddle/PaddleDetection/raw/release/2.4/demo/0000000 ./infer_demo yolov7_quant 000000014439.jpg 0 # 在GPU上使用TensorRT推理量化模型 ./infer_demo yolov7_quant 000000014439.jpg 1 +# 在GPU上使用Paddle-TensorRT推理量化模型 +./infer_demo yolov7_quant 000000014439.jpg 2 ``` diff --git a/examples/vision/detection/yolov7/quantize/cpp/infer.cc b/examples/vision/detection/yolov7/quantize/cpp/infer.cc index 45cba4b29a..8a656adee7 100644 --- a/examples/vision/detection/yolov7/quantize/cpp/infer.cc +++ b/examples/vision/detection/yolov7/quantize/cpp/infer.cc @@ -68,7 +68,11 @@ int main(int argc, char* argv[]) { } else if (flag == 1) { option.UseGpu(); option.UseTrtBackend(); - } + } else if (flag == 2) { + option.UseGpu(); + option.UseTrtBackend(); + option.EnablePaddleToTrt(); + } std::string model_dir = argv[1]; std::string test_image = argv[2]; diff --git a/examples/vision/detection/yolov7/quantize/python/README.md b/examples/vision/detection/yolov7/quantize/python/README.md index 1ccc026fd4..ac1c44889b 100644 --- a/examples/vision/detection/yolov7/quantize/python/README.md +++ b/examples/vision/detection/yolov7/quantize/python/README.md @@ -8,7 +8,7 @@ ### 量化模型准备 - 1. 用户可以直接使用由FastDeploy提供的量化模型进行部署. -- 2. 用户可以使用FastDeploy提供的[一键模型量化工具](../../../../../../tools/quantization/),自行进行模型量化, 并使用产出的量化模型进行部署. +- 2. 用户可以使用FastDeploy提供的[一键模型自动化压缩工具](../../../../../../tools/auto_compression/),自行进行模型量化, 并使用产出的量化模型进行部署. ## 以量化后的YOLOv7模型为例, 进行部署 ```bash @@ -25,4 +25,6 @@ wget https://gitee.com/paddlepaddle/PaddleDetection/raw/release/2.4/demo/0000000 python infer.py --model yolov7_quant --image 000000014439.jpg --device cpu --backend paddle # 在GPU上使用TensorRT推理量化模型 python infer.py --model yolov7_quant --image 000000014439.jpg --device gpu --backend trt +# 在GPU上使用Paddle-TensorRT推理量化模型 +python infer.py --model yolov7_quant --image 000000014439.jpg --device gpu --backend pptrt ``` diff --git a/examples/vision/detection/yolov7/quantize/python/infer.py b/examples/vision/detection/yolov7/quantize/python/infer.py index 3c42679e76..4790a4d94f 100644 --- a/examples/vision/detection/yolov7/quantize/python/infer.py +++ b/examples/vision/detection/yolov7/quantize/python/infer.py @@ -47,6 +47,11 @@ def build_option(args): assert args.device.lower( ) == "gpu", "TensorRT backend requires inference on device GPU." option.use_trt_backend() + elif args.backend.lower() == "pptrt": + assert args.device.lower( + ) == "gpu", "TensorRT backend requires inference on device GPU." + option.use_trt_backend() + option.enable_paddle_to_trt() elif args.backend.lower() == "ort": option.use_ort_backend() elif args.backend.lower() == "paddle": diff --git a/examples/vision/segmentation/paddleseg/quantize/README.md b/examples/vision/segmentation/paddleseg/quantize/README.md new file mode 100644 index 0000000000..6199c653ac --- /dev/null +++ b/examples/vision/segmentation/paddleseg/quantize/README.md @@ -0,0 +1,36 @@ +# PaddleSeg 量化模型部署 +FastDeploy已支持部署量化模型,并提供一键模型自动化压缩的工具. +用户可以使用一键模型自动化压缩工具,自行对模型量化后部署, 也可以直接下载FastDeploy提供的量化模型进行部署. + +## FastDeploy一键模型自动化压缩工具 +FastDeploy 提供了一键模型自动化压缩工具, 能够简单地通过输入一个配置文件, 对模型进行量化. +详细教程请见: [一键模型自动化压缩工具](../../../../../tools/auto_compression/) +注意: 推理量化后的分割模型仍然需要FP32模型文件夹下的deploy.yaml文件, 自行量化的模型文件夹内不包含此yaml文件, 用户从FP32模型文件夹下复制此yaml文件到量化后的模型文件夹内即可. + +## 下载量化完成的PaddleSeg模型 +用户也可以直接下载下表中的量化模型进行部署.(点击模型名字即可下载) + +Benchmark表格说明: +- Runtime时延为模型在各种Runtime上的推理时延,包含CPU->GPU数据拷贝,GPU推理,GPU->CPU数据拷贝时间. 不包含模型各自的前后处理时间. +- 端到端时延为模型在实际推理场景中的时延, 包含模型的前后处理. +- 所测时延均为推理1000次后求得的平均值, 单位是毫秒. 
+- INT8 + FP16 为在推理INT8量化模型的同时, 给Runtime 开启FP16推理选项 +- INT8 + FP16 + PM, 为在推理INT8量化模型和开启FP16的同时, 开启使用Pinned Memory的选项,可加速GPU->CPU数据拷贝的速度 +- 最大加速比, 为FP32时延除以INT8推理的最快时延,得到最大加速比. +- 策略为量化蒸馏训练时, 采用少量无标签数据集训练得到量化模型, 并在全量验证集上验证精度, INT8精度并不代表最高的INT8精度. +- CPU为Intel(R) Xeon(R) Gold 6271C, 所有测试中固定CPU线程数为1. GPU为Tesla T4, TensorRT版本8.4.15. + +#### Runtime Benchmark +| 模型 |推理后端 |部署硬件 | FP32 Runtime时延 | INT8 Runtime时延 | INT8 + FP16 Runtime时延 | INT8+FP16+PM Runtime时延 | 最大加速比 | FP32 mIoU | INT8 mIoU | 量化方式 | +| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |----- |----- | +| [PP-LiteSeg-T(STDC1)-cityscapes](https://bj.bcebos.com/paddlehub/fastdeploy/PP_LiteSeg_T_STDC1_cityscapes_without_argmax_infer_QAT_new.tar) | Paddle Inference | CPU | 1138.04| 602.62 |None|None | 1.89 |77.37 | 71.62 |量化蒸馏训练 | + +#### 端到端 Benchmark +| 模型 |推理后端 |部署硬件 | FP32 端到端时延 | INT8 端到端时延 | INT8 + FP16 端到端时延 | INT8+FP16+PM 端到端时延 | 最大加速比 | FP32 mIoU | INT8 mIoU | 量化方式 | +| ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |----- |----- | +| [PP-LiteSeg-T(STDC1)-cityscapes](https://bj.bcebos.com/paddlehub/fastdeploy/PP_LiteSeg_T_STDC1_cityscapes_without_argmax_infer_QAT_new.tar) | Paddle Inference | CPU | 4726.65| 4134.91|None|None | 1.14 |77.37 | 71.62 |量化蒸馏训练 | + +## 详细部署文档 + +- [Python部署](python) +- [C++部署](cpp) diff --git a/examples/vision/segmentation/paddleseg/quantize/cpp/CMakeLists.txt b/examples/vision/segmentation/paddleseg/quantize/cpp/CMakeLists.txt new file mode 100644 index 0000000000..fea1a2888b --- /dev/null +++ b/examples/vision/segmentation/paddleseg/quantize/cpp/CMakeLists.txt @@ -0,0 +1,14 @@ +PROJECT(infer_demo C CXX) +CMAKE_MINIMUM_REQUIRED (VERSION 3.12) + +# 指定下载解压后的fastdeploy库路径 +option(FASTDEPLOY_INSTALL_DIR "Path of downloaded fastdeploy sdk.") + +include(${FASTDEPLOY_INSTALL_DIR}/FastDeploy.cmake) + +# 添加FastDeploy依赖头文件 +include_directories(${FASTDEPLOY_INCS}) + +add_executable(infer_demo ${PROJECT_SOURCE_DIR}/infer.cc) +# 添加FastDeploy库依赖 +target_link_libraries(infer_demo ${FASTDEPLOY_LIBS}) diff --git a/examples/vision/segmentation/paddleseg/quantize/cpp/README.md b/examples/vision/segmentation/paddleseg/quantize/cpp/README.md new file mode 100644 index 0000000000..fa334fba41 --- /dev/null +++ b/examples/vision/segmentation/paddleseg/quantize/cpp/README.md @@ -0,0 +1,30 @@ +# PaddleSeg 量化模型 C++部署示例 +本目录下提供的`infer.cc`,可以帮助用户快速完成PaddleSeg量化模型在CPU/GPU上的部署推理加速. + +## 部署准备 +### FastDeploy环境准备 +- 1. 软硬件环境满足要求,参考[FastDeploy环境要求](../../../../../../docs/cn/build_and_install/download_prebuilt_libraries.md) +- 2. FastDeploy C++预编译库安装,参考[FastDeploy安装](../../../../../../docs/cn/build_and_install/download_prebuilt_libraries.md) + +### 量化模型准备 +- 1. 用户可以直接使用由FastDeploy提供的量化模型进行部署. +- 2. 用户可以使用FastDeploy提供的[一键模型自动化压缩工具](../../../../../../tools/auto_compression/),自行进行模型量化, 并使用产出的量化模型进行部署.(注意: 推理量化后的分割模型仍然需要FP32模型文件夹下的deploy.yaml文件, 自行量化的模型文件夹内不包含此yaml文件, 用户从FP32模型文件夹下复制此yaml文件到量化后的模型文件夹内即可.) + +## 以量化后的PP_LiteSeg_T_STDC1_cityscapes模型为例, 进行部署 +在本目录执行如下命令即可完成编译,以及量化模型部署. +```bash +mkdir build +cd build +wget https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-linux-x64-0.3.0.tgz +tar xvf fastdeploy-linux-x64-0.3.0.tgz +cmake .. 
-DFASTDEPLOY_INSTALL_DIR=${PWD}/fastdeploy-linux-x64-0.3.0 +make -j + +#下载FastDeploy提供的PP_LiteSeg_T_STDC1_cityscapes量化模型文件和测试图片 +wget https://bj.bcebos.com/paddlehub/fastdeploy/PP_LiteSeg_T_STDC1_cityscapes_without_argmax_infer_PTQ.tar +tar -xvf PP_LiteSeg_T_STDC1_cityscapes_without_argmax_infer_PTQ.tar +wget https://paddleseg.bj.bcebos.com/dygraph/demo/cityscapes_demo.png + +# 在CPU上使用Paddle-Inference推理量化模型 +./infer_demo PP_LiteSeg_T_STDC1_cityscapes_without_argmax_infer_PTQ cityscapes_demo.png 1 +``` diff --git a/examples/vision/segmentation/paddleseg/quantize/cpp/infer.cc b/examples/vision/segmentation/paddleseg/quantize/cpp/infer.cc new file mode 100644 index 0000000000..3e7240dd85 --- /dev/null +++ b/examples/vision/segmentation/paddleseg/quantize/cpp/infer.cc @@ -0,0 +1,77 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "fastdeploy/vision.h" +#ifdef WIN32 +const char sep = '\\'; +#else +const char sep = '/'; +#endif + +void InitAndInfer(const std::string& model_dir, const std::string& image_file, + const fastdeploy::RuntimeOption& option) { + auto model_file = model_dir + sep + "model.pdmodel"; + auto params_file = model_dir + sep + "model.pdiparams"; + auto config_file = model_dir + sep + "deploy.yaml"; + + auto model = fastdeploy::vision::segmentation::PaddleSegModel( + model_file, params_file, config_file, option); + + assert(model.Initialized()); + + auto im = cv::imread(image_file); + auto im_bak = im.clone(); + + fastdeploy::vision::SegmentationResult res; + if (!model.Predict(&im, &res)) { + std::cerr << "Failed to predict." << std::endl; + return; + } + + std::cout << res.Str() << std::endl; + +} + +int main(int argc, char* argv[]) { + if (argc < 4) { + std::cout << "Usage: infer_demo path/to/quant_model " + "path/to/image " + "run_option, " + "e.g ./infer_demo ./PP_LiteSeg_T_STDC1_cityscapes_without_argmax_infer_PTQ ./cityscapes_demo.png 0" + << std::endl; + std::cout << "The data type of run_option is int, 0: run on cpu with ORT " + "backend; 1: run " + "on gpu with TensorRT backend. 
" + << std::endl; + return -1; + } + + fastdeploy::RuntimeOption option; + int flag = std::atoi(argv[3]); + + if (flag == 0) { + option.UseCpu(); + option.UseOrtBackend(); + std::cout<<"Use ORT!"< target_size_max: + im_scale = float(target_size_max) / float(im_size_max) + im_scale_x = im_scale + im_scale_y = im_scale + + return im_scale_y, im_scale_x + + +def yolo_image_preprocess(img, target_shape=[640, 640]): + # Resize image + im_scale_y, im_scale_x = generate_scale(img, target_shape) + img = cv2.resize( + img, + None, + None, + fx=im_scale_x, + fy=im_scale_y, + interpolation=cv2.INTER_LINEAR) + # Pad + im_h, im_w = img.shape[:2] + h, w = target_shape[:] + if h != im_h or w != im_w: + canvas = np.ones((h, w, 3), dtype=np.float32) + canvas *= np.array([114.0, 114.0, 114.0], dtype=np.float32) + canvas[0:im_h, 0:im_w, :] = img.astype(np.float32) + img = canvas + img = np.transpose(img / 255, [2, 0, 1]) + + return img.astype(np.float32) + + +""" +Preprocess for PaddleClas model +""" + + +def cls_resize_short(img, target_size): + + img_h, img_w = img.shape[:2] + percent = float(target_size) / min(img_w, img_h) + w = int(round(img_w * percent)) + h = int(round(img_h * percent)) + + return cv2.resize(img, (w, h), interpolation=cv2.INTER_LINEAR) + + +def crop_image(img, target_size, center): + + height, width = img.shape[:2] + size = target_size + + if center == True: + w_start = (width - size) // 2 + h_start = (height - size) // 2 + else: + w_start = np.random.randint(0, width - size + 1) + h_start = np.random.randint(0, height - size + 1) + w_end = w_start + size + h_end = h_start + size + + return img[h_start:h_end, w_start:w_end, :] + + +def cls_image_preprocess(img): + + # resize + img = cls_resize_short(img, target_size=256) + # crop + img = crop_image(img, target_size=224, center=True) + + #ToCHWImage & Normalize + img = np.transpose(img / 255, [2, 0, 1]) + + img_mean = np.array([0.485, 0.456, 0.406]).reshape((3, 1, 1)) + img_std = np.array([0.229, 0.224, 0.225]).reshape((3, 1, 1)) + img -= img_mean + img /= img_std + + return img.astype(np.float32) + + +""" +Preprocess for PPYOLOE +""" + + +def ppdet_resize_no_keepratio(img, target_shape=[640, 640]): + im_shape = img.shape + + resize_h, resize_w = target_shape + im_scale_y = resize_h / im_shape[0] + im_scale_x = resize_w / im_shape[1] + + scale_factor = np.asarray([im_scale_y, im_scale_x], dtype=np.float32) + return cv2.resize( + img, None, None, fx=im_scale_x, fy=im_scale_y, + interpolation=2), scale_factor + + +def ppyoloe_withNMS_image_preprocess(img): + + img, scale_factor = ppdet_resize_no_keepratio(img, target_shape=[640, 640]) + + img = np.transpose(img / 255, [2, 0, 1]) + + img_mean = np.array([0.485, 0.456, 0.406]).reshape((3, 1, 1)) + img_std = np.array([0.229, 0.224, 0.225]).reshape((3, 1, 1)) + img -= img_mean + img /= img_std + + return img.astype(np.float32), scale_factor + + +def ppyoloe_plus_withNMS_image_preprocess(img): + + img, scale_factor = ppdet_resize_no_keepratio(img, target_shape=[640, 640]) + + img = np.transpose(img / 255, [2, 0, 1]) + + return img.astype(np.float32), scale_factor + + +""" +Preprocess for PP_LiteSeg + +""" + + +def ppseg_cityscapes_ptq_preprocess(img): + + #ToCHWImage & Normalize + img = np.transpose(img / 255.0, [2, 0, 1]) + + img_mean = np.array([0.5, 0.5, 0.5]).reshape((3, 1, 1)) + img_std = np.array([0.5, 0.5, 0.5]).reshape((3, 1, 1)) + img -= img_mean + img /= img_std + + return img.astype(np.float32) + + +def ResizeStepScaling(img, + min_scale_factor=0.75, + max_scale_factor=1.25, + 
scale_step_size=0.25): + # refer form ppseg + if min_scale_factor == max_scale_factor: + scale_factor = min_scale_factor + elif scale_step_size == 0: + scale_factor = np.random.uniform(min_scale_factor, max_scale_factor) + else: + num_steps = int((max_scale_factor - min_scale_factor) / scale_step_size + + 1) + scale_factors = np.linspace(min_scale_factor, max_scale_factor, + num_steps).tolist() + np.random.shuffle(scale_factors) + scale_factor = scale_factors[0] + + w = int(round(scale_factor * img.shape[1])) + h = int(round(scale_factor * img.shape[0])) + + img = cv2.resize(img, (w, h), interpolation=cv2.INTER_LINEAR) + + return img + + +def RandomPaddingCrop(img, + crop_size=(512, 512), + im_padding_value=(127.5, 127.5, 127.5), + label_padding_value=255): + + if isinstance(crop_size, list) or isinstance(crop_size, tuple): + if len(crop_size) != 2: + raise ValueError( + 'Type of `crop_size` is list or tuple. It should include 2 elements, but it is {}' + .format(crop_size)) + else: + raise TypeError( + "The type of `crop_size` is invalid. It should be list or tuple, but it is {}" + .format(type(crop_size))) + + if isinstance(crop_size, int): + crop_width = crop_size + crop_height = crop_size + else: + crop_width = crop_size[0] + crop_height = crop_size[1] + + img_height = img.shape[0] + img_width = img.shape[1] + + if img_height == crop_height and img_width == crop_width: + return img + else: + pad_height = max(crop_height - img_height, 0) + pad_width = max(crop_width - img_width, 0) + if (pad_height > 0 or pad_width > 0): + img = cv2.copyMakeBorder( + img, + 0, + pad_height, + 0, + pad_width, + cv2.BORDER_CONSTANT, + value=im_padding_value) + + img_height = img.shape[0] + img_width = img.shape[1] + + if crop_height > 0 and crop_width > 0: + h_off = np.random.randint(img_height - crop_height + 1) + w_off = np.random.randint(img_width - crop_width + 1) + + img = img[h_off:(crop_height + h_off), w_off:(w_off + crop_width + ), :] + + return img + + +def RandomHorizontalFlip(img, prob=0.5): + if random.random() < prob: + + if len(img.shape) == 3: + img = img[:, ::-1, :] + elif len(img.shape) == 2: + img = img[:, ::-1] + + return img + else: + return img + + +def brightness(im, brightness_lower, brightness_upper): + brightness_delta = np.random.uniform(brightness_lower, brightness_upper) + im = ImageEnhance.Brightness(im).enhance(brightness_delta) + return im + + +def contrast(im, contrast_lower, contrast_upper): + contrast_delta = np.random.uniform(contrast_lower, contrast_upper) + im = ImageEnhance.Contrast(im).enhance(contrast_delta) + return im + + +def saturation(im, saturation_lower, saturation_upper): + saturation_delta = np.random.uniform(saturation_lower, saturation_upper) + im = ImageEnhance.Color(im).enhance(saturation_delta) + return im + + +def hue(im, hue_lower, hue_upper): + hue_delta = np.random.uniform(hue_lower, hue_upper) + im = np.array(im.convert('HSV')) + im[:, :, 0] = im[:, :, 0] + hue_delta + im = Image.fromarray(im, mode='HSV').convert('RGB') + return im + + +def sharpness(im, sharpness_lower, sharpness_upper): + sharpness_delta = np.random.uniform(sharpness_lower, sharpness_upper) + im = ImageEnhance.Sharpness(im).enhance(sharpness_delta) + return im + + +def RandomDistort(img, + brightness_range=0.5, + brightness_prob=0.5, + contrast_range=0.5, + contrast_prob=0.5, + saturation_range=0.5, + saturation_prob=0.5, + hue_range=18, + hue_prob=0.5, + sharpness_range=0.5, + sharpness_prob=0): + + brightness_lower = 1 - brightness_range + brightness_upper = 1 + 
brightness_range + contrast_lower = 1 - contrast_range + contrast_upper = 1 + contrast_range + saturation_lower = 1 - saturation_range + saturation_upper = 1 + saturation_range + hue_lower = -hue_range + hue_upper = hue_range + sharpness_lower = 1 - sharpness_range + sharpness_upper = 1 + sharpness_range + ops = [brightness, contrast, saturation, hue, sharpness] + random.shuffle(ops) + params_dict = { + 'brightness': { + 'brightness_lower': brightness_lower, + 'brightness_upper': brightness_upper + }, + 'contrast': { + 'contrast_lower': contrast_lower, + 'contrast_upper': contrast_upper + }, + 'saturation': { + 'saturation_lower': saturation_lower, + 'saturation_upper': saturation_upper + }, + 'hue': { + 'hue_lower': hue_lower, + 'hue_upper': hue_upper + }, + 'sharpness': { + 'sharpness_lower': sharpness_lower, + 'sharpness_upper': sharpness_upper, + } + } + prob_dict = { + 'brightness': brightness_prob, + 'contrast': contrast_prob, + 'saturation': saturation_prob, + 'hue': hue_prob, + 'sharpness': sharpness_prob + } + + img = img.astype('uint8') + img = Image.fromarray(img) + + for id in range(len(ops)): + params = params_dict[ops[id].__name__] + prob = prob_dict[ops[id].__name__] + params['im'] = img + if np.random.uniform(0, 1) < prob: + img = ops[id](**params) + img = np.asarray(img).astype('float32') + return img + + +def ppseg_cityscapes_qat_preprocess(img): + + min_scale_factor = 0.5 + max_scale_factor = 2.0 + scale_step_size = 0.25 + + crop_size = (1024, 512) + + brightness_range = 0.5 + contrast_range = 0.5 + saturation_range = 0.5 + + img = ResizeStepScaling( + img, min_scale_factor=0.5, max_scale_factor=2.0, scale_step_size=0.25) + img = RandomPaddingCrop(img, crop_size=(1024, 512)) + img = RandomHorizontalFlip(img) + img = RandomDistort( + img, brightness_range=0.5, contrast_range=0.5, saturation_range=0.5) + + img = np.transpose(img / 255.0, [2, 0, 1]) + img_mean = np.array([0.5, 0.5, 0.5]).reshape((3, 1, 1)) + img_std = np.array([0.5, 0.5, 0.5]).reshape((3, 1, 1)) + img -= img_mean + img /= img_std + return img.astype(np.float32) diff --git a/tools/quantization/fdquant/fdquant.py b/tools/auto_compression/fd_auto_compress/fd_auto_compress.py similarity index 59% rename from tools/quantization/fdquant/fdquant.py rename to tools/auto_compression/fd_auto_compress/fd_auto_compress.py index 4d2bb511e1..145f4a4687 100644 --- a/tools/quantization/fdquant/fdquant.py +++ b/tools/auto_compression/fd_auto_compress/fd_auto_compress.py @@ -22,7 +22,7 @@ import paddle from paddleslim.common import load_config, load_onnx_model from paddleslim.auto_compression import AutoCompression from paddleslim.quant import quant_post_static -from fdquant.dataset import * +from fd_auto_compress.dataset import * def argsparser(): @@ -53,16 +53,33 @@ def argsparser(): return parser -def reader_wrapper(reader, input_list=None): - def gen(): - for data_list in reader: +def reader_wrapper(reader, input_list): + + if isinstance(input_list, list) and len(input_list) == 1: + input_name = input_list[0] + + def gen(): in_dict = {} - for data in data_list: - for i, input_name in enumerate(input_list): - in_dict[input_name] = data[i] + for i, data in enumerate(reader()): + imgs = np.array(data[0]) + in_dict[input_name] = imgs yield in_dict - return gen + return gen + + if isinstance(input_list, list) and len(input_list) > 1: + + def gen(): + for idx, data in enumerate(reader()): + in_dict = {} + for i in range(len(input_list)): + input_name = input_list[i] + feed_data = np.array(data[0][i]) + in_dict[input_name] = 
feed_data + + yield in_dict + + return gen def main(): @@ -75,31 +92,32 @@ def main(): assert FLAGS.devices in ['cpu', 'gpu', 'xpu', 'npu'] paddle.set_device(FLAGS.devices) - global global_config - all_config = load_config(FLAGS.config_path) - assert "Global" in all_config, f"Key 'Global' not found in config file. \n{all_config}" - global_config = all_config["Global"] - input_list = global_config['input_list'] - assert os.path.exists(global_config[ - 'image_path']), "image_path does not exist!" - paddle.vision.image.set_image_backend('cv2') - # transform could be customized. - train_dataset = paddle.vision.datasets.ImageFolder( - global_config['image_path'], - transform=eval(global_config['preprocess'])) - train_loader = paddle.io.DataLoader( - train_dataset, - batch_size=1, - shuffle=True, - drop_last=True, - num_workers=0) - train_loader = reader_wrapper(train_loader, input_list=input_list) - eval_func = None - - # ACT compression if FLAGS.method == 'QAT': + + all_config = load_config(FLAGS.config_path) + assert "Global" in all_config, f"Key 'Global' not found in config file. \n{all_config}" + global_config = all_config["Global"] + input_list = global_config['input_list'] + + assert os.path.exists(global_config[ + 'qat_image_path']), "image_path does not exist!" + paddle.vision.image.set_image_backend('cv2') + # transform could be customized. + train_dataset = paddle.vision.datasets.ImageFolder( + global_config['qat_image_path'], + transform=eval(global_config['qat_preprocess'])) + train_loader = paddle.io.DataLoader( + train_dataset, + batch_size=global_config['qat_batch_size'], + shuffle=True, + drop_last=True, + num_workers=0) + train_loader = reader_wrapper(train_loader, input_list=input_list) + eval_func = None + + # ACT compression ac = AutoCompression( model_dir=global_config['model_dir'], model_filename=global_config['model_filename'], @@ -113,6 +131,28 @@ def main(): # PTQ compression if FLAGS.method == 'PTQ': + # Read Global config and prepare dataset + all_config = load_config(FLAGS.config_path) + assert "Global" in all_config, f"Key 'Global' not found in config file. \n{all_config}" + global_config = all_config["Global"] + input_list = global_config['input_list'] + + assert os.path.exists(global_config[ + 'ptq_image_path']), "image_path does not exist!" + + paddle.vision.image.set_image_backend('cv2') + # transform could be customized. + val_dataset = paddle.vision.datasets.ImageFolder( + global_config['ptq_image_path'], + transform=eval(global_config['ptq_preprocess'])) + val_loader = paddle.io.DataLoader( + val_dataset, + batch_size=1, + shuffle=True, + drop_last=True, + num_workers=0) + val_loader = reader_wrapper(val_loader, input_list=input_list) + # Read PTQ config assert "PTQ" in all_config, f"Key 'PTQ' not found in config file. 
\n{all_config}" ptq_config = all_config["PTQ"] @@ -134,7 +174,7 @@ def main(): executor=exe, model_dir=inference_model_path, quantize_model_path=FLAGS.save_dir, - data_loader=train_loader, + data_loader=val_loader, model_filename=global_config["model_filename"], params_filename=global_config["params_filename"], batch_size=32, diff --git a/tools/quantization/requirements.txt b/tools/auto_compression/requirements.txt similarity index 100% rename from tools/quantization/requirements.txt rename to tools/auto_compression/requirements.txt diff --git a/tools/auto_compression/setup.py b/tools/auto_compression/setup.py new file mode 100644 index 0000000000..9ee3ce28fb --- /dev/null +++ b/tools/auto_compression/setup.py @@ -0,0 +1,26 @@ +import setuptools +import fd_auto_compress + +long_description = "fastdeploy-auto-compression is a toolkit for model auto compression of FastDeploy.\n\n" +long_description += "Usage: fastdeploy_auto_compress --config_path=./yolov7_tiny_qat_dis.yaml --method='QAT' --save_dir='../v7_qat_outmodel/' \n" + +with open("requirements.txt") as fin: + REQUIRED_PACKAGES = fin.read() + +setuptools.setup( + name="fastdeploy-auto-compression", # name of package + description="A toolkit for model auto compression of FastDeploy.", + long_description=long_description, + long_description_content_type="text/plain", + packages=setuptools.find_packages(), + install_requires=REQUIRED_PACKAGES, + classifiers=[ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: Apache Software License", + "Operating System :: OS Independent", + ], + license='Apache 2.0', + entry_points={ + 'console_scripts': + ['fastdeploy_auto_compress=fd_auto_compress.fd_auto_compress:main', ] + }) diff --git a/tools/quantization/README.md b/tools/quantization/README.md deleted file mode 100644 index 4459d526d2..0000000000 --- a/tools/quantization/README.md +++ /dev/null @@ -1,108 +0,0 @@ -# FastDeploy 一键模型量化 -FastDeploy基于PaddleSlim, 给用户提供了一键模型量化的工具, 支持离线量化和量化蒸馏训练. -本文档以Yolov5s为例, 供用户参考如何安装并执行FastDeploy的一键模型量化. - -## 1.安装 - -### 环境依赖 - -1.用户参考PaddlePaddle官网, 安装develop版本 -``` -https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/develop/install/pip/linux-pip.html -``` - -2.安装paddleslim-develop版本 -```bash -git clone https://github.com/PaddlePaddle/PaddleSlim.git & cd PaddleSlim -python setup.py install -``` - -### FastDeploy-Quantization 安装方式 -用户在当前目录下,运行如下命令: -``` -python setup.py install -``` - -## 2.使用方式 - -### 一键量化示例 - -#### 离线量化 - -##### 1. 准备模型和Calibration数据集 -用户需要自行准备待量化模型与Calibration数据集. -本例中用户可执行以下命令, 下载待量化的yolov5s.onnx模型和我们为用户准备的Calibration数据集示例. - -```shell -# 下载yolov5.onnx -wget https://paddle-slim-models.bj.bcebos.com/act/yolov5s.onnx - -# 下载数据集, 此Calibration数据集为COCO val2017中的前320张图片 -wget https://bj.bcebos.com/paddlehub/fastdeploy/COCO_val_320.tar.gz -tar -xvf COCO_val_320.tar.gz -``` - -##### 2.使用fastdeploy_quant命令,执行一键模型量化: -以下命令是对yolov5s模型进行量化, 用户若想量化其他模型, 替换config_path为configs文件夹下的其他模型配置文件即可. -```shell -fastdeploy_quant --config_path=./configs/detection/yolov5s_quant.yaml --method='PTQ' --save_dir='./yolov5s_ptq_model/' -``` -【说明】离线量化(训练后量化):post-training quantization,缩写是PTQ - -##### 3.参数说明 - -目前用户只需要提供一个定制的模型config文件,并指定量化方法和量化后的模型保存路径即可完成量化. 
- -| 参数 | 作用 | -| -------------------- | ------------------------------------------------------------ | -| --config_path | 一键量化所需要的量化配置文件.[详解](./configs/README.md) | -| --method | 量化方式选择, 离线量化选PTQ,量化蒸馏训练选QAT | -| --save_dir | 产出的量化后模型路径, 该模型可直接在FastDeploy部署 | - - - -#### 量化蒸馏训练 - -##### 1.准备待量化模型和训练数据集 -FastDeploy目前的量化蒸馏训练,只支持无标注图片训练,训练过程中不支持评估模型精度. -数据集为真实预测场景下的图片,图片数量依据数据集大小来定,尽量覆盖所有部署场景. 此例中,我们为用户准备了COCO2017验证集中的前320张图片. -注: 如果用户想通过量化蒸馏训练的方法,获得精度更高的量化模型, 可以自行准备更多的数据, 以及训练更多的轮数. - -```shell -# 下载yolov5.onnx -wget https://paddle-slim-models.bj.bcebos.com/act/yolov5s.onnx - -# 下载数据集, 此Calibration数据集为COCO2017验证集中的前320张图片 -wget https://bj.bcebos.com/paddlehub/fastdeploy/COCO_val_320.tar.gz -tar -xvf COCO_val_320.tar.gz -``` - -##### 2.使用fastdeploy_quant命令,执行一键模型量化: -以下命令是对yolov5s模型进行量化, 用户若想量化其他模型, 替换config_path为configs文件夹下的其他模型配置文件即可. -```shell -# 执行命令默认为单卡训练,训练前请指定单卡GPU, 否则在训练过程中可能会卡住. -export CUDA_VISIBLE_DEVICES=0 -fastdeploy_quant --config_path=./configs/detection/yolov5s_quant.yaml --method='QAT' --save_dir='./yolov5s_qat_model/' -``` - -##### 3.参数说明 - -目前用户只需要提供一个定制的模型config文件,并指定量化方法和量化后的模型保存路径即可完成量化. - -| 参数 | 作用 | -| -------------------- | ------------------------------------------------------------ | -| --config_path | 一键量化所需要的量化配置文件.[详解](./configs/README.md)| -| --method | 量化方式选择, 离线量化选PTQ,量化蒸馏训练选QAT | -| --save_dir | 产出的量化后模型路径, 该模型可直接在FastDeploy部署 | - - -## 3. FastDeploy 部署量化模型 -用户在获得量化模型之后,即可以使用FastDeploy进行部署, 部署文档请参考: -具体请用户参考示例文档: -- [YOLOv5 量化模型部署](../../examples/vision/detection/yolov5/quantize/) - -- [YOLOv6 量化模型部署](../../examples/vision/detection/yolov6/quantize/) - -- [YOLOv7 量化模型部署](../../examples/vision/detection/yolov7/quantize/) - -- [PadddleClas 量化模型部署](../../examples/vision/classification/paddleclas/quantize/) diff --git a/tools/quantization/configs/README.md b/tools/quantization/configs/README.md deleted file mode 100644 index 7bab2de34b..0000000000 --- a/tools/quantization/configs/README.md +++ /dev/null @@ -1,51 +0,0 @@ -# FastDeploy 量化配置文件说明 -FastDeploy 量化配置文件中,包含了全局配置,量化蒸馏训练配置,离线量化配置和训练配置. 
-用户除了直接使用FastDeploy提供在本目录的配置文件外,可以按需求自行修改相关配置文件 - -## 实例解读 - -``` -# 全局配置 -Global: - model_dir: ./yolov5s.onnx #输入模型的路径 - format: 'onnx' #输入模型的格式, paddle模型请选择'paddle' - model_filename: model.pdmodel #量化后转为paddle格式模型的模型名字 - params_filename: model.pdiparams #量化后转为paddle格式模型的参数名字 - image_path: ./COCO_val_320 #离线量化或者量化蒸馏训练使用的数据集路径 - arch: YOLOv5 #模型结构 - input_list: ['x2paddle_images'] #待量化的模型的输入名字 - preprocess: yolo_image_preprocess #模型量化时,对数据做的预处理函数, 用户可以在 ../fdquant/dataset.py 中修改或自行编写新的预处理函数 - -#量化蒸馏训练配置 -Distillation: - alpha: 1.0 #蒸馏loss所占权重 - loss: soft_label #蒸馏loss算法 - -Quantization: - onnx_format: true #是否采用ONNX量化标准格式, 要在FastDeploy上部署, 必须选true - use_pact: true #量化训练是否使用PACT方法 - activation_quantize_type: 'moving_average_abs_max' #激活量化方式 - quantize_op_types: #需要进行量化的OP - - conv2d - - depthwise_conv2d - -#离线量化配置 -PTQ: - calibration_method: 'avg' #离线量化的激活校准算法, 可选: avg, abs_max, hist, KL, mse, emd - skip_tensor_list: None #用户可指定跳过某些conv层,不进行量化 - -#训练参数配置 -TrainConfig: - train_iter: 3000 - learning_rate: 0.00001 - optimizer_builder: - optimizer: - type: SGD - weight_decay: 4.0e-05 - target_metric: 0.365 - -``` -## 更多详细配置方法 - -FastDeploy一键量化功能由PaddeSlim助力, 更详细的量化配置方法请参考: -[自动化压缩超参详细教程](https://github.com/PaddlePaddle/PaddleSlim/blob/develop/example/auto_compression/hyperparameter_tutorial.md) diff --git a/tools/quantization/fdquant/dataset.py b/tools/quantization/fdquant/dataset.py deleted file mode 100644 index a373d973de..0000000000 --- a/tools/quantization/fdquant/dataset.py +++ /dev/null @@ -1,150 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import cv2 -import os -import numpy as np -import paddle - - -def generate_scale(im, target_shape): - origin_shape = im.shape[:2] - im_size_min = np.min(origin_shape) - im_size_max = np.max(origin_shape) - target_size_min = np.min(target_shape) - target_size_max = np.max(target_shape) - im_scale = float(target_size_min) / float(im_size_min) - if np.round(im_scale * im_size_max) > target_size_max: - im_scale = float(target_size_max) / float(im_size_max) - im_scale_x = im_scale - im_scale_y = im_scale - - return im_scale_y, im_scale_x - - -def yolo_image_preprocess(img, target_shape=[640, 640]): - # Resize image - im_scale_y, im_scale_x = generate_scale(img, target_shape) - img = cv2.resize( - img, - None, - None, - fx=im_scale_x, - fy=im_scale_y, - interpolation=cv2.INTER_LINEAR) - # Pad - im_h, im_w = img.shape[:2] - h, w = target_shape[:] - if h != im_h or w != im_w: - canvas = np.ones((h, w, 3), dtype=np.float32) - canvas *= np.array([114.0, 114.0, 114.0], dtype=np.float32) - canvas[0:im_h, 0:im_w, :] = img.astype(np.float32) - img = canvas - img = np.transpose(img / 255, [2, 0, 1]) - - return img.astype(np.float32) - - -def cls_resize_short(img, target_size): - - img_h, img_w = img.shape[:2] - percent = float(target_size) / min(img_w, img_h) - w = int(round(img_w * percent)) - h = int(round(img_h * percent)) - - return cv2.resize(img, (w, h), interpolation=cv2.INTER_LINEAR) - - -def crop_image(img, target_size, center): - - height, width = img.shape[:2] - size = target_size - - if center == True: - w_start = (width - size) // 2 - h_start = (height - size) // 2 - else: - w_start = np.random.randint(0, width - size + 1) - h_start = np.random.randint(0, height - size + 1) - w_end = w_start + size - h_end = h_start + size - - return img[h_start:h_end, w_start:w_end, :] - - -def cls_image_preprocess(img): - - # resize - img = cls_resize_short(img, target_size=256) - # crop - img = crop_image(img, target_size=224, center=True) - - #ToCHWImage & Normalize - img = np.transpose(img / 255, [2, 0, 1]) - - img_mean = np.array([0.485, 0.456, 0.406]).reshape((3, 1, 1)) - img_std = np.array([0.229, 0.224, 0.225]).reshape((3, 1, 1)) - img -= img_mean - img /= img_std - - return img.astype(np.float32) - - -def ppdet_resize_no_keepratio(img, target_shape=[640, 640]): - im_shape = img.shape - - resize_h, resize_w = target_shape - im_scale_y = resize_h / im_shape[0] - im_scale_x = resize_w / im_shape[1] - - scale_factor = np.asarray([im_scale_y, im_scale_x], dtype=np.float32) - return cv2.resize( - img, None, None, fx=im_scale_x, fy=im_scale_y, - interpolation=2), scale_factor - - -def ppdet_normliaze(img, is_scale=True): - - mean = [0.485, 0.456, 0.406] - std = [0.229, 0.224, 0.225] - img = img.astype(np.float32, copy=False) - - if is_scale: - scale = 1.0 / 255.0 - img *= scale - - mean = np.array(mean)[np.newaxis, np.newaxis, :] - std = np.array(std)[np.newaxis, np.newaxis, :] - img -= mean - img /= std - return img - - -def hwc_to_chw(img): - img = img.transpose((2, 0, 1)) - return img - - -def ppdet_image_preprocess(img): - - img, scale_factor = ppdet_resize_no_keepratio(img, target_shape=[640, 640]) - - img = np.transpose(img / 255, [2, 0, 1]) - - img_mean = np.array([0.485, 0.456, 0.406]).reshape((3, 1, 1)) - img_std = np.array([0.229, 0.224, 0.225]).reshape((3, 1, 1)) - img -= img_mean - img /= img_std - - return img.astype(np.float32), scale_factor diff --git a/tools/quantization/setup.py b/tools/quantization/setup.py deleted file mode 100644 index a0c0c2fc0f..0000000000 --- 
a/tools/quantization/setup.py +++ /dev/null @@ -1,25 +0,0 @@ -import setuptools -import fdquant - -long_description = "FDQuant is a toolkit for model quantization of FastDeploy.\n\n" -long_description += "Usage: fastdeploy_quant --config_path=./yolov7_tiny_qat_dis.yaml --method='QAT' --save_dir='../v7_qat_outmodel/' \n" - -with open("requirements.txt") as fin: - REQUIRED_PACKAGES = fin.read() - -setuptools.setup( - name="fastdeploy-quantization", # name of package - description="A toolkit for model quantization of FastDeploy.", - long_description=long_description, - long_description_content_type="text/plain", - packages=setuptools.find_packages(), - install_requires=REQUIRED_PACKAGES, - classifiers=[ - "Programming Language :: Python :: 3", - "License :: OSI Approved :: Apache Software License", - "Operating System :: OS Independent", - ], - license='Apache 2.0', - entry_points={ - 'console_scripts': ['fastdeploy_quant=fdquant.fdquant:main', ] - }) From 39229bf4e0213d2819b5c86090a4cfa7054b1dd5 Mon Sep 17 00:00:00 2001 From: WJJ1995 Date: Wed, 2 Nov 2022 20:30:00 +0800 Subject: [PATCH 02/18] [Bug Fix] Fixed PFLD infer demo (#482) * support face alignment PFLD * add PFLD demo * fixed FaceAlignmentResult * fixed bugs * fixed img size * fixed readme * deal with comments * fixed readme * add pfld testcase * update infer.py * add gflags for example * update c++ readme * add gflags in example * fixed for ci * fixed gflags.cmake * deal with comments * update infer demo * fixed bug in infer.cc Co-authored-by: Jason --- examples/vision/facealign/pfld/cpp/infer.cc | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) mode change 100644 => 100755 examples/vision/facealign/pfld/cpp/infer.cc diff --git a/examples/vision/facealign/pfld/cpp/infer.cc b/examples/vision/facealign/pfld/cpp/infer.cc old mode 100644 new mode 100755 index 5b926709fa..3c51d9016f --- a/examples/vision/facealign/pfld/cpp/infer.cc +++ b/examples/vision/facealign/pfld/cpp/infer.cc @@ -35,9 +35,9 @@ void PrintUsage() { } bool CreateRuntimeOption(fastdeploy::RuntimeOption* option) { - if (FLAG_device == "gpu") { + if (FLAGS_device == "gpu") { option->UseGpu(); - if (FLAG_backend == "ort") { + if (FLAGS_backend == "ort") { option->UseOrtBackend(); } else if (FLAGS_backend == "paddle") { option->UsePaddleBackend(); @@ -54,24 +54,24 @@ bool CreateRuntimeOption(fastdeploy::RuntimeOption* option) { } else if (FLAGS_backend == "default") { return true; } else { - std::cout << "While inference with GPU, only support default/ort/paddle/trt/paddle_trt now, " << FLAG_backend << " is not supported." << std::endl; + std::cout << "While inference with GPU, only support default/ort/paddle/trt/paddle_trt now, " << FLAGS_backend << " is not supported." << std::endl; return false; } - } else if (FLAG_device == "cpu") { + } else if (FLAGS_device == "cpu") { if (FLAGS_backend == "ort") { option->UseOrtBackend(); } else if (FLAGS_backend == "ov") { option->UseOpenVINOBackend(); } else if (FLAGS_backend == "paddle") { option->UsePaddleBackend(); - } else if (FLAGS_backend = "default") { + } else if (FLAGS_backend == "default") { return true; } else { - std::cout << "While inference with CPU, only support default/ort/ov/paddle now, " << FLAG_backend << " is not supported." << std::endl; + std::cout << "While inference with CPU, only support default/ort/ov/paddle now, " << FLAGS_backend << " is not supported." << std::endl; return false; } } else { - std::cerr << "Only support device CPU/GPU now, " << FLAG_device << " is not supported." 
<< std::endl; + std::cerr << "Only support device CPU/GPU now, " << FLAGS_device << " is not supported." << std::endl; return false; } From 3fbfee07109f2c750ccdd0ae72da569262234d72 Mon Sep 17 00:00:00 2001 From: WJJ1995 Date: Wed, 2 Nov 2022 20:41:40 +0800 Subject: [PATCH 03/18] [Benchmark] Update benchmark build_option (#485) * add paddle_trt in benchmark * update benchmark in device --- benchmark/benchmark_ppcls.py | 80 ++++++++++++++++++++++++------------ benchmark/benchmark_ppdet.py | 80 ++++++++++++++++++++++++------------ benchmark/benchmark_ppseg.py | 80 ++++++++++++++++++++++++------------ benchmark/benchmark_yolo.py | 80 ++++++++++++++++++++++++------------ 4 files changed, 216 insertions(+), 104 deletions(-) mode change 100644 => 100755 benchmark/benchmark_ppcls.py mode change 100644 => 100755 benchmark/benchmark_ppdet.py mode change 100644 => 100755 benchmark/benchmark_ppseg.py mode change 100644 => 100755 benchmark/benchmark_yolo.py diff --git a/benchmark/benchmark_ppcls.py b/benchmark/benchmark_ppcls.py old mode 100644 new mode 100755 index 5b05bfe668..914ace71b0 --- a/benchmark/benchmark_ppcls.py +++ b/benchmark/benchmark_ppcls.py @@ -22,9 +22,19 @@ import GPUtil import time +def str2bool(v): + if isinstance(v, bool): + return v + if v.lower() == 'true': + return True + elif v.lower() == 'false': + return False + else: + raise argparse.ArgumentTypeError('Boolean value expected.') + + def parse_arguments(): import argparse - import ast parser = argparse.ArgumentParser() parser.add_argument( "--model", required=True, help="Path of PaddleClas model.") @@ -50,16 +60,16 @@ def parse_arguments(): parser.add_argument( "--backend", type=str, - default="ort", - help="inference backend, ort, ov, trt, paddle, paddle_trt.") + default="default", + help="inference backend, default, ort, ov, trt, paddle, paddle_trt.") parser.add_argument( "--enable_trt_fp16", - type=bool, + type=str2bool, default=False, help="whether enable fp16 in trt backend") parser.add_argument( "--enable_collect_memory_info", - type=bool, + type=str2bool, default=False, help="whether enable collect memory info") args = parser.parse_args() @@ -70,26 +80,43 @@ def build_option(args): option = fd.RuntimeOption() device = args.device backend = args.backend + enable_trt_fp16 = args.enable_trt_fp16 option.set_cpu_thread_num(args.cpu_num_thread) if device == "gpu": - option.use_gpu(args.device_id) - - if backend == "trt" or backend == "paddle_trt": - assert device == "gpu", "the trt backend need device==gpu" - option.use_trt_backend() - if backend == "paddle_trt": - option.enable_paddle_to_trt() - if args.enable_trt_fp16: - option.enable_trt_fp16() - elif backend == "ov": - assert device == "cpu", "the openvino backend need device==cpu" - option.use_openvino_backend() - elif backend == "paddle": - option.use_paddle_backend() - elif backend == "ort": - option.use_ort_backend() + option.use_gpu() + if backend == "ort": + option.use_ort_backend() + elif backend == "paddle": + option.use_paddle_backend() + elif backend in ["trt", "paddle_trt"]: + option.use_trt_backend() + if backend == "paddle_trt": + option.enable_paddle_to_trt() + if enable_trt_fp16: + option.enable_trt_fp16() + elif backend == "default": + return option + else: + raise Exception( + "While inference with GPU, only support default/ort/paddle/trt/paddle_trt now, {} is not supported.". 
+ format(backend)) + elif device == "cpu": + if backend == "ort": + option.use_ort_backend() + elif backend == "ov": + option.use_openvino_backend() + elif backend == "paddle": + option.use_paddle_backend() + elif backend == "default": + return option + else: + raise Exception( + "While inference with CPU, only support default/ort/ov/paddle now, {} is not supported.". + format(backend)) else: - print("%s is an unsupported backend" % backend) + raise Exception( + "Only support device CPU/GPU now, {} is not supported.".format( + device)) return option @@ -123,6 +150,7 @@ if __name__ == '__main__': config_file = os.path.join(args.model, "inference_cls.yaml") gpu_id = args.device_id + enable_collect_memory_info = args.enable_collect_memory_info end2end_statis = list() cpu_mem = list() gpu_mem = list() @@ -149,7 +177,7 @@ if __name__ == '__main__': start = time.time() result = model.predict(im) end2end_statis.append(time.time() - start) - if args.enable_collect_memory_info: + if enable_collect_memory_info: gpu_util.append(get_current_gputil(gpu_id)) cm, gm = get_current_memory_mb(gpu_id) cpu_mem.append(cm) @@ -159,7 +187,7 @@ if __name__ == '__main__': warmup_iter = args.iter_num // 5 end2end_statis_repeat = end2end_statis[warmup_iter:] - if args.enable_collect_memory_info: + if enable_collect_memory_info: cpu_mem_repeat = cpu_mem[warmup_iter:] gpu_mem_repeat = gpu_mem[warmup_iter:] gpu_util_repeat = gpu_util[warmup_iter:] @@ -167,14 +195,14 @@ if __name__ == '__main__': dump_result = dict() dump_result["runtime"] = runtime_statis["avg_time"] * 1000 dump_result["end2end"] = np.mean(end2end_statis_repeat) * 1000 - if args.enable_collect_memory_info: + if enable_collect_memory_info: dump_result["cpu_rss_mb"] = np.mean(cpu_mem_repeat) dump_result["gpu_rss_mb"] = np.mean(gpu_mem_repeat) dump_result["gpu_util"] = np.mean(gpu_util_repeat) f.writelines("Runtime(ms): {} \n".format(str(dump_result["runtime"]))) f.writelines("End2End(ms): {} \n".format(str(dump_result["end2end"]))) - if args.enable_collect_memory_info: + if enable_collect_memory_info: f.writelines("cpu_rss_mb: {} \n".format( str(dump_result["cpu_rss_mb"]))) f.writelines("gpu_rss_mb: {} \n".format( diff --git a/benchmark/benchmark_ppdet.py b/benchmark/benchmark_ppdet.py old mode 100644 new mode 100755 index 2f192de065..cb8d47f44e --- a/benchmark/benchmark_ppdet.py +++ b/benchmark/benchmark_ppdet.py @@ -24,9 +24,19 @@ import GPUtil import time +def str2bool(v): + if isinstance(v, bool): + return v + if v.lower() == 'true': + return True + elif v.lower() == 'false': + return False + else: + raise argparse.ArgumentTypeError('Boolean value expected.') + + def parse_arguments(): import argparse - import ast parser = argparse.ArgumentParser() parser.add_argument( "--model", required=True, help="Path of PaddleDetection model.") @@ -52,16 +62,16 @@ def parse_arguments(): parser.add_argument( "--backend", type=str, - default="ort", - help="inference backend, ort, ov, trt, paddle, paddle_trt.") + default="default", + help="inference backend, default, ort, ov, trt, paddle, paddle_trt.") parser.add_argument( "--enable_trt_fp16", - type=bool, + type=str2bool, default=False, help="whether enable fp16 in trt backend") parser.add_argument( "--enable_collect_memory_info", - type=bool, + type=str2bool, default=False, help="whether enable collect memory info") args = parser.parse_args() @@ -72,26 +82,43 @@ def build_option(args): option = fd.RuntimeOption() device = args.device backend = args.backend + enable_trt_fp16 = args.enable_trt_fp16 
option.set_cpu_thread_num(args.cpu_num_thread) if device == "gpu": - option.use_gpu(args.device_id) - - if backend == "trt" or backend == "paddle_trt": - assert device == "gpu", "the trt backend need device==gpu" - option.use_trt_backend() - if backend == "paddle_trt": - option.enable_paddle_to_trt() - if args.enable_trt_fp16: - option.enable_trt_fp16() - elif backend == "ov": - assert device == "cpu", "the openvino backend need device==cpu" - option.use_openvino_backend() - elif backend == "paddle": - option.use_paddle_backend() - elif backend == "ort": - option.use_ort_backend() + option.use_gpu() + if backend == "ort": + option.use_ort_backend() + elif backend == "paddle": + option.use_paddle_backend() + elif backend in ["trt", "paddle_trt"]: + option.use_trt_backend() + if backend == "paddle_trt": + option.enable_paddle_to_trt() + if enable_trt_fp16: + option.enable_trt_fp16() + elif backend == "default": + return option + else: + raise Exception( + "While inference with GPU, only support default/ort/paddle/trt/paddle_trt now, {} is not supported.". + format(backend)) + elif device == "cpu": + if backend == "ort": + option.use_ort_backend() + elif backend == "ov": + option.use_openvino_backend() + elif backend == "paddle": + option.use_paddle_backend() + elif backend == "default": + return option + else: + raise Exception( + "While inference with CPU, only support default/ort/ov/paddle now, {} is not supported.". + format(backend)) else: - print("%s is an unsupported backend" % backend) + raise Exception( + "Only support device CPU/GPU now, {} is not supported.".format( + device)) return option @@ -125,6 +152,7 @@ if __name__ == '__main__': config_file = os.path.join(args.model, "infer_cfg.yml") gpu_id = args.device_id + enable_collect_memory_info = args.enable_collect_memory_info end2end_statis = list() cpu_mem = list() gpu_mem = list() @@ -169,7 +197,7 @@ if __name__ == '__main__': start = time.time() result = model.predict(im) end2end_statis.append(time.time() - start) - if args.enable_collect_memory_info: + if enable_collect_memory_info: gpu_util.append(get_current_gputil(gpu_id)) cm, gm = get_current_memory_mb(gpu_id) cpu_mem.append(cm) @@ -179,7 +207,7 @@ if __name__ == '__main__': warmup_iter = args.iter_num // 5 end2end_statis_repeat = end2end_statis[warmup_iter:] - if args.enable_collect_memory_info: + if enable_collect_memory_info: cpu_mem_repeat = cpu_mem[warmup_iter:] gpu_mem_repeat = gpu_mem[warmup_iter:] gpu_util_repeat = gpu_util[warmup_iter:] @@ -187,14 +215,14 @@ if __name__ == '__main__': dump_result = dict() dump_result["runtime"] = runtime_statis["avg_time"] * 1000 dump_result["end2end"] = np.mean(end2end_statis_repeat) * 1000 - if args.enable_collect_memory_info: + if enable_collect_memory_info: dump_result["cpu_rss_mb"] = np.mean(cpu_mem_repeat) dump_result["gpu_rss_mb"] = np.mean(gpu_mem_repeat) dump_result["gpu_util"] = np.mean(gpu_util_repeat) f.writelines("Runtime(ms): {} \n".format(str(dump_result["runtime"]))) f.writelines("End2End(ms): {} \n".format(str(dump_result["end2end"]))) - if args.enable_collect_memory_info: + if enable_collect_memory_info: f.writelines("cpu_rss_mb: {} \n".format( str(dump_result["cpu_rss_mb"]))) f.writelines("gpu_rss_mb: {} \n".format( diff --git a/benchmark/benchmark_ppseg.py b/benchmark/benchmark_ppseg.py old mode 100644 new mode 100755 index 81e0db797a..2c7a37c2f1 --- a/benchmark/benchmark_ppseg.py +++ b/benchmark/benchmark_ppseg.py @@ -22,9 +22,19 @@ import GPUtil import time +def str2bool(v): + if isinstance(v, bool): + return 
v + if v.lower() == 'true': + return True + elif v.lower() == 'false': + return False + else: + raise argparse.ArgumentTypeError('Boolean value expected.') + + def parse_arguments(): import argparse - import ast parser = argparse.ArgumentParser() parser.add_argument( "--model", required=True, help="Path of PaddleSeg model.") @@ -50,16 +60,16 @@ def parse_arguments(): parser.add_argument( "--backend", type=str, - default="ort", - help="inference backend, ort, ov, trt, paddle, paddle_trt.") + default="default", + help="inference backend, default, ort, ov, trt, paddle, paddle_trt.") parser.add_argument( "--enable_trt_fp16", - type=bool, + type=str2bool, default=False, help="whether enable fp16 in trt backend") parser.add_argument( "--enable_collect_memory_info", - type=bool, + type=str2bool, default=False, help="whether enable collect memory info") args = parser.parse_args() @@ -70,26 +80,43 @@ def build_option(args): option = fd.RuntimeOption() device = args.device backend = args.backend + enable_trt_fp16 = args.enable_trt_fp16 option.set_cpu_thread_num(args.cpu_num_thread) if device == "gpu": - option.use_gpu(args.device_id) - - if backend == "trt" or backend == "paddle_trt": - assert device == "gpu", "the trt backend need device==gpu" - option.use_trt_backend() - if backend == "paddle_trt": - option.enable_paddle_to_trt() - if args.enable_trt_fp16: - option.enable_trt_fp16() - elif backend == "ov": - assert device == "cpu", "the openvino backend need device==cpu" - option.use_openvino_backend() - elif backend == "paddle": - option.use_paddle_backend() - elif backend == "ort": - option.use_ort_backend() + option.use_gpu() + if backend == "ort": + option.use_ort_backend() + elif backend == "paddle": + option.use_paddle_backend() + elif backend in ["trt", "paddle_trt"]: + option.use_trt_backend() + if backend == "paddle_trt": + option.enable_paddle_to_trt() + if enable_trt_fp16: + option.enable_trt_fp16() + elif backend == "default": + return option + else: + raise Exception( + "While inference with GPU, only support default/ort/paddle/trt/paddle_trt now, {} is not supported.". + format(backend)) + elif device == "cpu": + if backend == "ort": + option.use_ort_backend() + elif backend == "ov": + option.use_openvino_backend() + elif backend == "paddle": + option.use_paddle_backend() + elif backend == "default": + return option + else: + raise Exception( + "While inference with CPU, only support default/ort/ov/paddle now, {} is not supported.". 
+ format(backend)) else: - print("%s is an unsupported backend" % backend) + raise Exception( + "Only support device CPU/GPU now, {} is not supported.".format( + device)) return option @@ -123,6 +150,7 @@ if __name__ == '__main__': config_file = os.path.join(args.model, "deploy.yaml") gpu_id = args.device_id + enable_collect_memory_info = args.enable_collect_memory_info end2end_statis = list() cpu_mem = list() gpu_mem = list() @@ -148,7 +176,7 @@ if __name__ == '__main__': start = time.time() result = model.predict(im) end2end_statis.append(time.time() - start) - if args.enable_collect_memory_info: + if enable_collect_memory_info: gpu_util.append(get_current_gputil(gpu_id)) cm, gm = get_current_memory_mb(gpu_id) cpu_mem.append(cm) @@ -158,7 +186,7 @@ if __name__ == '__main__': warmup_iter = args.iter_num // 5 end2end_statis_repeat = end2end_statis[warmup_iter:] - if args.enable_collect_memory_info: + if enable_collect_memory_info: cpu_mem_repeat = cpu_mem[warmup_iter:] gpu_mem_repeat = gpu_mem[warmup_iter:] gpu_util_repeat = gpu_util[warmup_iter:] @@ -166,14 +194,14 @@ if __name__ == '__main__': dump_result = dict() dump_result["runtime"] = runtime_statis["avg_time"] * 1000 dump_result["end2end"] = np.mean(end2end_statis_repeat) * 1000 - if args.enable_collect_memory_info: + if enable_collect_memory_info: dump_result["cpu_rss_mb"] = np.mean(cpu_mem_repeat) dump_result["gpu_rss_mb"] = np.mean(gpu_mem_repeat) dump_result["gpu_util"] = np.mean(gpu_util_repeat) f.writelines("Runtime(ms): {} \n".format(str(dump_result["runtime"]))) f.writelines("End2End(ms): {} \n".format(str(dump_result["end2end"]))) - if args.enable_collect_memory_info: + if enable_collect_memory_info: f.writelines("cpu_rss_mb: {} \n".format( str(dump_result["cpu_rss_mb"]))) f.writelines("gpu_rss_mb: {} \n".format( diff --git a/benchmark/benchmark_yolo.py b/benchmark/benchmark_yolo.py old mode 100644 new mode 100755 index 65e89a516a..f534c43f3e --- a/benchmark/benchmark_yolo.py +++ b/benchmark/benchmark_yolo.py @@ -24,9 +24,19 @@ import GPUtil import time +def str2bool(v): + if isinstance(v, bool): + return v + if v.lower() == 'true': + return True + elif v.lower() == 'false': + return False + else: + raise argparse.ArgumentTypeError('Boolean value expected.') + + def parse_arguments(): import argparse - import ast parser = argparse.ArgumentParser() parser.add_argument( "--model", required=True, help="Path of Yolo onnx model.") @@ -52,16 +62,16 @@ def parse_arguments(): parser.add_argument( "--backend", type=str, - default="ort", - help="inference backend, ort, ov, trt, paddle, paddle_trt.") + default="default", + help="inference backend, default, ort, ov, trt, paddle, paddle_trt.") parser.add_argument( "--enable_trt_fp16", - type=bool, + type=str2bool, default=False, help="whether enable fp16 in trt backend") parser.add_argument( "--enable_collect_memory_info", - type=bool, + type=str2bool, default=False, help="whether enable collect memory info") args = parser.parse_args() @@ -72,26 +82,43 @@ def build_option(args): option = fd.RuntimeOption() device = args.device backend = args.backend + enable_trt_fp16 = args.enable_trt_fp16 option.set_cpu_thread_num(args.cpu_num_thread) if device == "gpu": - option.use_gpu(args.device_id) - - if backend == "trt" or backend == "paddle_trt": - assert device == "gpu", "the trt backend need device==gpu" - option.use_trt_backend() - if backend == "paddle_trt": - option.enable_paddle_to_trt() - if args.enable_trt_fp16: - option.enable_trt_fp16() - elif backend == "ov": - assert device == 
"cpu", "the openvino backend need device==cpu" - option.use_openvino_backend() - elif backend == "paddle": - option.use_paddle_backend() - elif backend == "ort": - option.use_ort_backend() + option.use_gpu() + if backend == "ort": + option.use_ort_backend() + elif backend == "paddle": + option.use_paddle_backend() + elif backend in ["trt", "paddle_trt"]: + option.use_trt_backend() + if backend == "paddle_trt": + option.enable_paddle_to_trt() + if enable_trt_fp16: + option.enable_trt_fp16() + elif backend == "default": + return option + else: + raise Exception( + "While inference with GPU, only support default/ort/paddle/trt/paddle_trt now, {} is not supported.". + format(backend)) + elif device == "cpu": + if backend == "ort": + option.use_ort_backend() + elif backend == "ov": + option.use_openvino_backend() + elif backend == "paddle": + option.use_paddle_backend() + elif backend == "default": + return option + else: + raise Exception( + "While inference with CPU, only support default/ort/ov/paddle now, {} is not supported.". + format(backend)) else: - print("%s is an unsupported backend" % backend) + raise Exception( + "Only support device CPU/GPU now, {} is not supported.".format( + device)) return option @@ -123,6 +150,7 @@ if __name__ == '__main__': model_file = args.model gpu_id = args.device_id + enable_collect_memory_info = args.enable_collect_memory_info end2end_statis = list() cpu_mem = list() gpu_mem = list() @@ -161,7 +189,7 @@ if __name__ == '__main__': start = time.time() result = model.predict(im) end2end_statis.append(time.time() - start) - if args.enable_collect_memory_info: + if enable_collect_memory_info: gpu_util.append(get_current_gputil(gpu_id)) cm, gm = get_current_memory_mb(gpu_id) cpu_mem.append(cm) @@ -171,7 +199,7 @@ if __name__ == '__main__': warmup_iter = args.iter_num // 5 end2end_statis_repeat = end2end_statis[warmup_iter:] - if args.enable_collect_memory_info: + if enable_collect_memory_info: cpu_mem_repeat = cpu_mem[warmup_iter:] gpu_mem_repeat = gpu_mem[warmup_iter:] gpu_util_repeat = gpu_util[warmup_iter:] @@ -179,14 +207,14 @@ if __name__ == '__main__': dump_result = dict() dump_result["runtime"] = runtime_statis["avg_time"] * 1000 dump_result["end2end"] = np.mean(end2end_statis_repeat) * 1000 - if args.enable_collect_memory_info: + if enable_collect_memory_info: dump_result["cpu_rss_mb"] = np.mean(cpu_mem_repeat) dump_result["gpu_rss_mb"] = np.mean(gpu_mem_repeat) dump_result["gpu_util"] = np.mean(gpu_util_repeat) f.writelines("Runtime(ms): {} \n".format(str(dump_result["runtime"]))) f.writelines("End2End(ms): {} \n".format(str(dump_result["end2end"]))) - if args.enable_collect_memory_info: + if enable_collect_memory_info: f.writelines("cpu_rss_mb: {} \n".format( str(dump_result["cpu_rss_mb"]))) f.writelines("gpu_rss_mb: {} \n".format( From 328212f27058415ec357b6a001deff3c0b1c0ed3 Mon Sep 17 00:00:00 2001 From: yunyaoXYY <109218879+yunyaoXYY@users.noreply.github.com> Date: Thu, 3 Nov 2022 00:17:25 +0800 Subject: [PATCH 04/18] Fix PP-OCR Rec model bug (#484) * Imporve OCR Readme * Improve OCR Readme * Improve OCR Readme * Improve OCR Readme * Improve OCR Readme * Add Initialize function to PP-OCR * Add Initialize function to PP-OCR * Add Initialize function to PP-OCR * Make all the model links come from PaddleOCR * Improve OCR readme * Improve OCR readme * Improve OCR readme * Improve OCR readme * Add Readme for vision results * Add Readme for vision results * Add Readme for vision results * Add Readme for vision results * Add Readme for vision results * 
Add Readme for vision results * Add Readme for vision results * Add Readme for vision results * Add Readme for vision results * Add Readme for vision results * Add check for label file in postprocess of Rec model * Add check for label file in postprocess of Rec model * Add check for label file in postprocess of Rec model * Add check for label file in postprocess of Rec model * Add check for label file in postprocess of Rec model * Add check for label file in postprocess of Rec model * Add comments to create API docs * Improve OCR comments * Rename OCR and add comments * Make sure previous python example works * Make sure previous python example works * Fix Rec model bug * Fix Rec model bug * Fix rec model bug Co-authored-by: Jason --- fastdeploy/vision/ocr/ppocr/recognizer.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fastdeploy/vision/ocr/ppocr/recognizer.cc b/fastdeploy/vision/ocr/ppocr/recognizer.cc index 4ca52df12d..f0564ce339 100644 --- a/fastdeploy/vision/ocr/ppocr/recognizer.cc +++ b/fastdeploy/vision/ocr/ppocr/recognizer.cc @@ -164,8 +164,10 @@ bool Recognizer::Postprocess(FDTensor& infer_result, } last_index = argmax_idx; } - score /= count; - + score /= (count + 1e-6); + if (count == 0 || std::isnan(score)) { + score = 0.f; + } std::get<0>(*rec_result) = str_res; std::get<1>(*rec_result) = score; From 22d60fdadfc51d6c05a837acf88caf76b0e72d8c Mon Sep 17 00:00:00 2001 From: ChaoII <849453582@qq.com> Date: Thu, 3 Nov 2022 09:57:07 +0800 Subject: [PATCH 05/18] [Model] add tracking trail on vis_mot (#461) * add override mark * delete some * recovery * recovery * add tracking * add tracking py_bind and example * add pptracking * add pptracking * iomanip head file * add opencv_video lib * add python libs package Signed-off-by: ChaoII <849453582@qq.com> * complete comments Signed-off-by: ChaoII <849453582@qq.com> * add jdeTracker_ member variable Signed-off-by: ChaoII <849453582@qq.com> * add 'FASTDEPLOY_DECL' macro Signed-off-by: ChaoII <849453582@qq.com> * remove kwargs params Signed-off-by: ChaoII <849453582@qq.com> * [Doc]update pptracking docs * delete 'ENABLE_PADDLE_FRONTEND' switch * add pptracking unit test * update pptracking unit test Signed-off-by: ChaoII <849453582@qq.com> * modify test video file path and remove trt test * update unit test model url * remove 'FASTDEPLOY_DECL' macro Signed-off-by: ChaoII <849453582@qq.com> * fix build python packages about pptracking on win32 Signed-off-by: ChaoII <849453582@qq.com> * update comment Signed-off-by: ChaoII <849453582@qq.com> * add pptracking model explain Signed-off-by: ChaoII <849453582@qq.com> * add tracking trail on vis_mot * add tracking trail * modify code for some suggestion * remove unused import * fix import bug Signed-off-by: ChaoII <849453582@qq.com> Co-authored-by: Jason --- docs/api/vision_results/mot_result.md | 1 - examples/vision/README.md | 22 ++-- .../vision/tracking/pptracking/cpp/infer.cc | 54 +++++---- .../tracking/pptracking/python/infer.py | 21 ++-- fastdeploy/vision/common/result.h | 2 + .../vision/tracking/pptracking/model.cc | 33 +++++- fastdeploy/vision/tracking/pptracking/model.h | 27 +++++ .../tracking/pptracking/pptracking_pybind.cc | 9 +- .../vision/tracking/pptracking/trajectory.cc | 4 +- .../vision/tracking/pptracking/trajectory.h | 4 +- fastdeploy/vision/visualize/mot.cc | 104 ++++++++---------- fastdeploy/vision/visualize/visualize.h | 7 +- .../vision/visualize/visualize_pybind.cc | 9 +- python/fastdeploy/vision/tracking/__init__.py | 7 +- 
.../vision/tracking/pptracking/__init__.py | 15 +++ .../fastdeploy/vision/visualize/__init__.py | 5 +- 16 files changed, 208 insertions(+), 116 deletions(-) diff --git a/docs/api/vision_results/mot_result.md b/docs/api/vision_results/mot_result.md index 0dd7cda712..4ce8d6cfb3 100644 --- a/docs/api/vision_results/mot_result.md +++ b/docs/api/vision_results/mot_result.md @@ -37,4 +37,3 @@ fastdeploy.vision.MOTResult - **ids**(list of list(float)):成员变量,表示单帧画面中所有目标的id,其元素个数与`boxes`一致 - **scores**(list of float): 成员变量,表示单帧画面检测出来的所有目标置信度 - **class_ids**(list of int): 成员变量,表示单帧画面出来的所有目标类别 - diff --git a/examples/vision/README.md b/examples/vision/README.md index ca56edd489..03cdf7f404 100644 --- a/examples/vision/README.md +++ b/examples/vision/README.md @@ -2,16 +2,18 @@ 本目录下提供了各类视觉模型的部署,主要涵盖以下任务类型 -| 任务类型 | 说明 | 预测结果结构体 | -|:-------------- |:----------------------------------- |:-------------------------------------------------------------------------------- | -| Detection | 目标检测,输入图像,检测图像中物体位置,并返回检测框坐标及类别和置信度 | [DetectionResult](../../docs/api/vision_results/detection_result.md) | -| Segmentation | 语义分割,输入图像,给出图像中每个像素的分类及置信度 | [SegmentationResult](../../docs/api/vision_results/segmentation_result.md) | -| Classification | 图像分类,输入图像,给出图像的分类结果和置信度 | [ClassifyResult](../../docs/api/vision_results/classification_result.md) | -| FaceDetection | 人脸检测,输入图像,检测图像中人脸位置,并返回检测框坐标及人脸关键点 | [FaceDetectionResult](../../docs/api/vision_results/face_detection_result.md) | -| KeypointDetection | 关键点检测,输入图像,返回图像中人物行为的各个关键点坐标和置信度 | [KeyPointDetectionResult](../../docs/api/vision_results/keypointdetection_result.md) | -| FaceRecognition | 人脸识别,输入图像,返回可用于相似度计算的人脸特征的embedding | [FaceRecognitionResult](../../docs/api/vision_results/face_recognition_result.md) | -| Matting | 抠图,输入图像,返回图片的前景每个像素点的Alpha值 | [MattingResult](../../docs/api/vision_results/matting_result.md) | -| OCR | 文本框检测,分类,文本框内容识别,输入图像,返回文本框坐标,文本框的方向类别以及框内的文本内容 | [OCRResult](../../docs/api/vision_results/ocr_result.md) | +| 任务类型 | 说明 | 预测结果结构体 | +|:------------------|:------------------------------------------------|:-------------------------------------------------------------------------------------| +| Detection | 目标检测,输入图像,检测图像中物体位置,并返回检测框坐标及类别和置信度 | [DetectionResult](../../docs/api/vision_results/detection_result.md) | +| Segmentation | 语义分割,输入图像,给出图像中每个像素的分类及置信度 | [SegmentationResult](../../docs/api/vision_results/segmentation_result.md) | +| Classification | 图像分类,输入图像,给出图像的分类结果和置信度 | [ClassifyResult](../../docs/api/vision_results/classification_result.md) | +| FaceDetection | 人脸检测,输入图像,检测图像中人脸位置,并返回检测框坐标及人脸关键点 | [FaceDetectionResult](../../docs/api/vision_results/face_detection_result.md) | +| KeypointDetection | 关键点检测,输入图像,返回图像中人物行为的各个关键点坐标和置信度 | [KeyPointDetectionResult](../../docs/api/vision_results/keypointdetection_result.md) | +| FaceRecognition | 人脸识别,输入图像,返回可用于相似度计算的人脸特征的embedding | [FaceRecognitionResult](../../docs/api/vision_results/face_recognition_result.md) | +| Matting | 抠图,输入图像,返回图片的前景每个像素点的Alpha值 | [MattingResult](../../docs/api/vision_results/matting_result.md) | +| OCR | 文本框检测,分类,文本框内容识别,输入图像,返回文本框坐标,文本框的方向类别以及框内的文本内容 | [OCRResult](../../docs/api/vision_results/ocr_result.md) | +| MOT | 多目标跟踪,输入图像,检测图像中物体位置,并返回检测框坐标,对象id及类别置信度 | [MOTResult](../../docs/api/vision_results/mot_result.md) | + ## FastDeploy API设计 视觉模型具有较有统一任务范式,在设计API时(包括C++/Python),FastDeploy将视觉模型的部署拆分为四个步骤 diff --git a/examples/vision/tracking/pptracking/cpp/infer.cc b/examples/vision/tracking/pptracking/cpp/infer.cc index 709159eb42..58b4d4b615 100644 --- 
a/examples/vision/tracking/pptracking/cpp/infer.cc
+++ b/examples/vision/tracking/pptracking/cpp/infer.cc
@@ -33,25 +33,29 @@ void CpuInfer(const std::string& model_dir, const std::string& video_file) {
   }
 
   fastdeploy::vision::MOTResult result;
+  fastdeploy::vision::tracking::TrailRecorder recorder;
+  // During each prediction, data is inserted into the recorder. As the number of predictions increases,
+  // the memory will keep growing. You can stop the insertion through 'UnbindRecorder'.
+  // int count = 0;  // unbind condition
+  model.BindRecorder(&recorder);
   cv::Mat frame;
-  int frame_id=0;
   cv::VideoCapture capture(video_file);
-  // according to the time of prediction to calculate fps
-  float fps= 0.0f;
   while (capture.read(frame)) {
     if (frame.empty()) {
-    break;
+      break;
     }
     if (!model.Predict(&frame, &result)) {
-    std::cerr << "Failed to predict." << std::endl;
-    return;
+      std::cerr << "Failed to predict." << std::endl;
+      return;
     }
+    // e.g. adding this code cancels the trail data binding after 10 frames
+    // if (count++ == 10) model.UnbindRecorder();
     // std::cout << result.Str() << std::endl;
-    cv::Mat out_img = fastdeploy::vision::VisMOT(frame, result, fps , frame_id);
+    cv::Mat out_img = fastdeploy::vision::VisMOT(frame, result, 0.0, &recorder);
     cv::imshow("mot",out_img);
     cv::waitKey(30);
-    frame_id++;
   }
+  model.UnbindRecorder();
   capture.release();
   cv::destroyAllWindows();
 }
@@ -72,25 +76,29 @@ void GpuInfer(const std::string& model_dir, const std::string& video_file) {
   }
 
   fastdeploy::vision::MOTResult result;
+  fastdeploy::vision::tracking::TrailRecorder trail_recorder;
+  // During each prediction, data is inserted into the recorder. As the number of predictions increases,
+  // the memory will keep growing. You can stop the insertion through 'UnbindRecorder'.
+  // int count = 0;  // unbind condition
+  model.BindRecorder(&trail_recorder);
   cv::Mat frame;
-  int frame_id=0;
   cv::VideoCapture capture(video_file);
-  // according to the time of prediction to calculate fps
-  float fps= 0.0f;
   while (capture.read(frame)) {
     if (frame.empty()) {
-    break;
+      break;
     }
     if (!model.Predict(&frame, &result)) {
-    std::cerr << "Failed to predict." << std::endl;
-    return;
+      std::cerr << "Failed to predict." << std::endl;
+      return;
     }
+    // e.g. adding this code cancels the trail data binding after 10 frames
+    // if (count++ == 10) model.UnbindRecorder();
     // std::cout << result.Str() << std::endl;
-    cv::Mat out_img = fastdeploy::vision::VisMOT(frame, result, fps , frame_id);
+    cv::Mat out_img = fastdeploy::vision::VisMOT(frame, result, 0.0, &trail_recorder);
     cv::imshow("mot",out_img);
     cv::waitKey(30);
-    frame_id++;
   }
+  model.UnbindRecorder();
   capture.release();
   cv::destroyAllWindows();
 }
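The recorder bound in the hunks above only grows while it stays bound, which is why the commented-out `count` logic exists: unbinding once enough history has accumulated is the intended way to bound memory. Below is a minimal Python sketch of the same pattern, assuming the `fastdeploy` wheel built with the `bind_recorder`/`unbind_recorder` bindings introduced later in this patch; the threshold constant and file names are illustrative only, not part of the patch.

```python
# Sketch only: bound the trail recorder's memory by unbinding after a fixed
# number of frames. MAX_TRAIL_FRAMES and the paths are illustrative.
import cv2
import fastdeploy as fd

MAX_TRAIL_FRAMES = 100  # assumed threshold, analogous to the C++ `count` check

model = fd.vision.tracking.PPTracking(
    "model.pdmodel", "model.pdiparams", "infer_cfg.yml")
recorder = fd.vision.tracking.TrailRecorder()
model.bind_recorder(recorder)

cap = cv2.VideoCapture("video.mp4")
frame_count = 0
while True:
    ok, frame = cap.read()
    if not ok:
        break
    result = model.predict(frame)
    frame_count += 1
    if frame_count == MAX_TRAIL_FRAMES:
        # unbinding clears the stored center points and stops further growth
        model.unbind_recorder()
cap.release()
```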
@@ -112,11 +120,13 @@ void TrtInfer(const std::string& model_dir, const std::string& video_file) {
   }
 
   fastdeploy::vision::MOTResult result;
+  fastdeploy::vision::tracking::TrailRecorder recorder;
+  // During each prediction, data is inserted into the recorder. As the number of predictions increases,
+  // the memory will keep growing. You can stop the insertion through 'UnbindRecorder'.
+  // int count = 0;  // unbind condition
+  model.BindRecorder(&recorder);
   cv::Mat frame;
-  int frame_id=0;
   cv::VideoCapture capture(video_file);
-  // according to the time of prediction to calculate fps
-  float fps= 0.0f;
   while (capture.read(frame)) {
     if (frame.empty()) {
       break;
     }
     if (!model.Predict(&frame, &result)) {
       std::cerr << "Failed to predict." << std::endl;
       return;
     }
+    // e.g. adding this code cancels the trail data binding after 10 frames
+    // if (count++ == 10) model.UnbindRecorder();
     // std::cout << result.Str() << std::endl;
-    cv::Mat out_img = fastdeploy::vision::VisMOT(frame, result, fps , frame_id);
+    cv::Mat out_img = fastdeploy::vision::VisMOT(frame, result, 0.0, &recorder);
     cv::imshow("mot",out_img);
     cv::waitKey(30);
-    frame_id++;
   }
+  model.UnbindRecorder();
   capture.release();
   cv::destroyAllWindows();
 }
diff --git a/examples/vision/tracking/pptracking/python/infer.py b/examples/vision/tracking/pptracking/python/infer.py
index 39681e7e53..378d89bc14 100644
--- a/examples/vision/tracking/pptracking/python/infer.py
+++ b/examples/vision/tracking/pptracking/python/infer.py
@@ -14,7 +14,6 @@
 import fastdeploy as fd
 import cv2
-import time
 import os
@@ -60,20 +59,26 @@
 config_file = os.path.join(args.model, "infer_cfg.yml")
 model = fd.vision.tracking.PPTracking(
     model_file, params_file, config_file, runtime_option=runtime_option)
+# Initialize the trail recorder
+recorder = fd.vision.tracking.TrailRecorder()
+# Bind the recorder. Note: every prediction inserts data into the recorder,
+# so memory keeps growing; call unbind_recorder() to cancel the binding.
+model.bind_recorder(recorder)
 # 预测图片分割结果
 cap = cv2.VideoCapture(args.video)
-frame_id = 0
+# count = 0
 while True:
-    start_time = time.time()
-    frame_id = frame_id+1
     _, frame = cap.read()
     if frame is None:
         break
     result = model.predict(frame)
-    end_time = time.time()
-    fps = 1.0/(end_time-start_time)
-    img = fd.vision.vis_mot(frame, result, fps, frame_id)
+    # count += 1
+    # if count == 10:
+    #     model.unbind_recorder()
+    img = fd.vision.vis_mot(frame, result, 0.0, recorder)
     cv2.imshow("video", img)
-    cv2.waitKey(30)
+    if cv2.waitKey(30) == ord("q"):
+        break
+model.unbind_recorder()
 cap.release()
 cv2.destroyAllWindows()
diff --git a/fastdeploy/vision/common/result.h b/fastdeploy/vision/common/result.h
index 9e613470f2..1acca31409 100755
--- a/fastdeploy/vision/common/result.h
+++ b/fastdeploy/vision/common/result.h
@@ -14,6 +14,7 @@
 #pragma once
 #include "fastdeploy/fastdeploy_model.h"
 #include "opencv2/core/core.hpp"
+#include <array>
 namespace fastdeploy {
 /** \brief All C++ FastDeploy Vision Models APIs are defined inside this namespace
@@ -171,6 +172,7 @@ struct FASTDEPLOY_DECL MOTResult : public BaseResult {
   /** \brief The classify label id for all the tracking object */
   std::vector<int> class_ids;
+  ResultType type = ResultType::MOT;
   /// Clear MOT result
   void Clear();
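On the C++ side (the model.h/model.cc changes that follow), the recorder is simply a map from object id to the ordered list of box centers, appended once per predicted frame. A rough Python equivalent of that bookkeeping is shown here for orientation; the boxes and ids are made-up values, and nothing beyond the standard library is assumed.

```python
# Sketch of the TrailRecorder bookkeeping implemented in C++ below:
# records maps an object id to the ordered list of its box centers.
from collections import defaultdict

records = defaultdict(list)  # id -> [(cx, cy), ...]

def add_centers(boxes, ids):
    # one call per predicted frame, mirroring the loop added to
    # PPTracking::Postprocess
    for (x1, y1, x2, y2), obj_id in zip(boxes, ids):
        records[obj_id].append((int((x1 + x2) / 2), int((y1 + y2) / 2)))

add_centers([(10, 20, 50, 80), (100, 40, 160, 120)], [1, 2])
add_centers([(14, 22, 54, 82)], [1])
print(records[1])  # [(30, 50), (34, 52)] -- the points VisMOT draws as a trail
```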
diff --git a/fastdeploy/vision/tracking/pptracking/model.cc b/fastdeploy/vision/tracking/pptracking/model.cc
index 97d4e1ab94..0ae550ad24 100644
--- a/fastdeploy/vision/tracking/pptracking/model.cc
+++ b/fastdeploy/vision/tracking/pptracking/model.cc
@@ -161,9 +161,7 @@ bool PPTracking::Initialize() {
     return false;
   }
   // create JDETracker instance
-  std::unique_ptr<JDETracker> jdeTracker(new JDETracker);
-  jdeTracker_ = std::move(jdeTracker);
-
+  jdeTracker_ = std::unique_ptr<JDETracker>(new JDETracker);
   return true;
 }
@@ -245,7 +243,6 @@ bool PPTracking::Postprocess(std::vector<FDTensor>& infer_result, MOTResult *res
   cv::Mat dets(bbox_shape[0], 6, CV_32FC1, bbox_data);
   cv::Mat emb(bbox_shape[0], emb_shape[1], CV_32FC1, emb_data);
-  result->Clear();
   std::vector<Track> tracks;
   std::vector<bool> valid;
@@ -264,7 +261,6 @@ bool PPTracking::Postprocess(std::vector<FDTensor>& infer_result, MOTResult *res
     result->boxes.push_back(box);
     result->ids.push_back(1);
     result->scores.push_back(*dets.ptr<float>(0, 4));
-
   } else {
     std::vector<Track>::iterator titer;
     for (titer = tracks.begin(); titer != tracks.end(); ++titer) {
@@ -285,9 +281,36 @@ bool PPTracking::Postprocess(std::vector<FDTensor>& infer_result, MOTResult *res
       }
     }
   }
+  if (!is_record_trail_) return true;
+  int nums = result->boxes.size();
+  for (int i = 0; i < nums; i++) {
+    float center_x = (result->boxes[i][0] + result->boxes[i][2]) / 2;
+    float center_y = (result->boxes[i][1] + result->boxes[i][3]) / 2;
+    int id = result->ids[i];
+    recorder_->Add(id, {int(center_x), int(center_y)});
+  }
   return true;
 }
+void PPTracking::BindRecorder(TrailRecorder* recorder) {
+  recorder_ = recorder;
+  is_record_trail_ = true;
+}
+
+void PPTracking::UnbindRecorder() {
+  is_record_trail_ = false;
+  std::map<int, std::vector<std::array<int, 2>>>::iterator iter;
+  for (iter = recorder_->records.begin(); iter != recorder_->records.end(); iter++) {
+    iter->second.clear();
+    iter->second.shrink_to_fit();
+  }
+  recorder_->records.clear();
+  std::map<int, std::vector<std::array<int, 2>>>().swap(recorder_->records);
+  recorder_ = nullptr;
+}
+
 }  // namespace tracking
 }  // namespace vision
 }  // namespace fastdeploy
diff --git a/fastdeploy/vision/tracking/pptracking/model.h b/fastdeploy/vision/tracking/pptracking/model.h
index dc8f44f9d5..3d78d05fbc 100755
--- a/fastdeploy/vision/tracking/pptracking/model.h
+++ b/fastdeploy/vision/tracking/pptracking/model.h
@@ -14,6 +14,7 @@
 #pragma once
+#include <map>
 #include "fastdeploy/vision/common/processors/transform.h"
 #include "fastdeploy/fastdeploy_model.h"
 #include "fastdeploy/vision/common/result.h"
@@ -22,6 +23,21 @@
 namespace fastdeploy {
 namespace vision {
 namespace tracking {
+struct TrailRecorder {
+  std::map<int, std::vector<std::array<int, 2>>> records;
+  void Add(int id, const std::array<int, 2>& record);
+};
+
+inline void TrailRecorder::Add(int id, const std::array<int, 2>& record) {
+  auto iter = records.find(id);
+  if (iter != records.end()) {
+    auto trail = records[id];
+    trail.push_back(record);
+    records[id] = trail;
+  } else {
+    records[id] = {record};
+  }
+}
 class FASTDEPLOY_DECL PPTracking: public FastDeployModel {
  public:
@@ -49,6 +65,14 @@
    * \return true if the prediction successed, otherwise false
    */
   virtual bool Predict(cv::Mat* img, MOTResult* result);
+  /** \brief Bind a tracking trail recorder
+   *
+   * \param[in] recorder The recorder stores the trail (object id and center point sequence) of each tracked object
+   */
+  void BindRecorder(TrailRecorder* recorder);
+  /** \brief Cancel the binding and clear the recorded trail information
+   */
+  void UnbindRecorder();
 private:
   bool BuildPreprocessPipelineFromConfig();
@@ -65,8 +89,11 @@
   float conf_thresh_;
   float tracked_thresh_;
   float min_box_area_;
+  bool is_record_trail_ = false;
   std::unique_ptr<JDETracker> jdeTracker_;
+  TrailRecorder* recorder_ = nullptr;
 };
+
 }  // namespace tracking
 }  // namespace vision
 }  // namespace fastdeploy
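Once the pybind11 bindings that follow are in place, `TrailRecorder.records` is readable from Python, so trails can be post-processed outside of `VisMOT`. A hedged sketch under that assumption: it presumes a `recorder` that was bound to a `PPTracking` model and has already accumulated points, and the displacement metric is an illustrative choice, not part of the patch.

```python
# Sketch: consuming TrailRecorder.records from Python after some predictions.
def trail_summary(recorder):
    """Summarize each object's trail from TrailRecorder.records."""
    summary = {}
    for obj_id, centers in recorder.records.items():
        # centers is the ordered list of [x, y] box centers for this object
        if len(centers) < 2:
            continue
        (x0, y0), (x1, y1) = centers[0], centers[-1]
        # straight-line displacement between first and last recorded center
        displacement = ((x1 - x0) ** 2 + (y1 - y0) ** 2) ** 0.5
        summary[obj_id] = {"points": len(centers), "displacement": displacement}
    return summary
```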
diff --git a/fastdeploy/vision/tracking/pptracking/pptracking_pybind.cc b/fastdeploy/vision/tracking/pptracking/pptracking_pybind.cc
index d56437ad50..a5638628ed 100644
--- a/fastdeploy/vision/tracking/pptracking/pptracking_pybind.cc
+++ b/fastdeploy/vision/tracking/pptracking/pptracking_pybind.cc
@@ -15,6 +15,11 @@
 namespace fastdeploy {
 void BindPPTracking(pybind11::module &m) {
+
+  pybind11::class_<vision::tracking::TrailRecorder>(m, "TrailRecorder")
+      .def(pybind11::init<>())
+      .def_readwrite("records", &vision::tracking::TrailRecorder::records)
+      .def("add", &vision::tracking::TrailRecorder::Add);
   pybind11::class_<vision::tracking::PPTracking, FastDeployModel>(
       m, "PPTracking")
       .def(pybind11::init<std::string, std::string, std::string,
                           RuntimeOption, ModelFormat>())
diff --git a/fastdeploy/vision/tracking/pptracking/trajectory.cc b/fastdeploy/vision/tracking/pptracking/trajectory.cc
--- a/fastdeploy/vision/tracking/pptracking/trajectory.cc
+++ b/fastdeploy/vision/tracking/pptracking/trajectory.cc
   update_embedding(traj->current_embedding);
 }
-void Trajectory::activate(int &cnt,int timestamp_) {
+void Trajectory::activate(int &cnt, int timestamp_) {
   id = next_id(cnt);
   TKalmanFilter::init(cv::Mat(xyah));
   length = 0;
@@ -130,7 +130,7 @@ void Trajectory::activate(int &cnt,int timestamp_) {
   starttime = timestamp_;
 }
-void Trajectory::reactivate(Trajectory *traj,int &cnt, int timestamp_, bool newid) {
+void Trajectory::reactivate(Trajectory *traj, int &cnt, int timestamp_, bool newid) {
   TKalmanFilter::correct(cv::Mat(traj->xyah));
   update_embedding(traj->current_embedding);
   length = 0;
diff --git a/fastdeploy/vision/tracking/pptracking/trajectory.h b/fastdeploy/vision/tracking/pptracking/trajectory.h
index a869f84099..793419ce13 100644
--- a/fastdeploy/vision/tracking/pptracking/trajectory.h
+++ b/fastdeploy/vision/tracking/pptracking/trajectory.h
@@ -74,8 +74,8 @@ class FASTDEPLOY_DECL Trajectory : public TKalmanFilter {
   virtual void update(Trajectory *traj, int timestamp,
                       bool update_embedding = true);
-  virtual void activate(int& cnt, int timestamp);
-  virtual void reactivate(Trajectory *traj, int & cnt,int timestamp, bool newid = false);
+  virtual void activate(int &cnt, int timestamp);
+  virtual void reactivate(Trajectory *traj, int &cnt, int timestamp, bool newid = false);
   virtual void mark_lost(void);
   virtual void mark_removed(void);
diff --git a/fastdeploy/vision/visualize/mot.cc b/fastdeploy/vision/visualize/mot.cc
index 9877b2d4e0..a04fda8e7a 100644
--- a/fastdeploy/vision/visualize/mot.cc
+++ b/fastdeploy/vision/visualize/mot.cc
@@ -25,73 +25,63 @@ cv::Scalar GetMOTBoxColor(int idx) {
   return color;
 }
-
-cv::Mat VisMOT(const cv::Mat &img, const MOTResult &results, float fps, int frame_id) {
-
+cv::Mat VisMOT(const cv::Mat &img, const MOTResult &results,
+               float score_threshold, tracking::TrailRecorder* recorder) {
   cv::Mat vis_img = img.clone();
   int im_h = img.rows;
   int im_w = img.cols;
   float text_scale = std::max(1, static_cast<int>(im_w / 1600.));
   float text_thickness = 2.;
   float line_thickness = std::max(1, static_cast<int>(im_w / 500.));
-
-  std::ostringstream oss;
-  oss << std::setiosflags(std::ios::fixed) << std::setprecision(4);
-  oss << "frame: " << frame_id << " ";
-  oss << "fps: " << fps << " ";
-  oss << "num: " << results.boxes.size();
-  std::string text = oss.str();
-
-  cv::Point origin;
-  origin.x = 0;
-  origin.y = static_cast<int>(15 * text_scale);
-  cv::putText(vis_img,
-              text,
-              origin,
-              cv::FONT_HERSHEY_PLAIN,
-              text_scale,
-              cv::Scalar(0, 0, 255),
-              text_thickness);
-
   for (int i = 0; i < results.boxes.size(); ++i) {
-    const int obj_id = results.ids[i];
-    const float score = results.scores[i];
+    if (results.scores[i] < score_threshold) {
+      continue;
+    }
+    const int obj_id = results.ids[i];
+    const float score = results.scores[i];
+    cv::Scalar color = GetMOTBoxColor(obj_id);
+    if (recorder != nullptr) {
+      int id = results.ids[i];
+      auto iter = recorder->records.find(id);
+      if (iter != recorder->records.end()) {
+        for (int j = 0; j < iter->second.size(); j++) {
+          cv::Point center(iter->second[j][0], iter->second[j][1]);
+          cv::circle(vis_img, center, text_thickness, color);
+        }
+      }
+    }
+    cv::Point pt1 = cv::Point(results.boxes[i][0], 
results.boxes[i][1]); - cv::Point pt2 = cv::Point(results.boxes[i][2], results.boxes[i][3]); - cv::Point id_pt = - cv::Point(results.boxes[i][0], results.boxes[i][1] + 10); - cv::Point score_pt = - cv::Point(results.boxes[i][0], results.boxes[i][1] - 10); - cv::rectangle(vis_img, pt1, pt2, color, line_thickness); + std::ostringstream soss; + soss << std::setiosflags(std::ios::fixed) << std::setprecision(2); + soss << score; + std::string score_text = soss.str(); - std::ostringstream idoss; - idoss << std::setiosflags(std::ios::fixed) << std::setprecision(4); - idoss << obj_id; - std::string id_text = idoss.str(); - - cv::putText(vis_img, - id_text, - id_pt, - cv::FONT_HERSHEY_PLAIN, - text_scale, - cv::Scalar(0, 255, 255), - text_thickness); - - std::ostringstream soss; - soss << std::setiosflags(std::ios::fixed) << std::setprecision(2); - soss << score; - std::string score_text = soss.str(); - - cv::putText(vis_img, - score_text, - score_pt, - cv::FONT_HERSHEY_PLAIN, - text_scale, - cv::Scalar(0, 255, 255), - text_thickness); + cv::putText(vis_img, + score_text, + score_pt, + cv::FONT_HERSHEY_PLAIN, + text_scale, + color, + text_thickness); } return vis_img; } diff --git a/fastdeploy/vision/visualize/visualize.h b/fastdeploy/vision/visualize/visualize.h index 2141a27647..d874409d0c 100644 --- a/fastdeploy/vision/visualize/visualize.h +++ b/fastdeploy/vision/visualize/visualize.h @@ -17,6 +17,8 @@ #include "fastdeploy/vision/common/result.h" #include "opencv2/imgproc/imgproc.hpp" +#include "fastdeploy/vision/tracking/pptracking/model.h" + namespace fastdeploy { namespace vision { @@ -81,8 +83,9 @@ FASTDEPLOY_DECL cv::Mat VisMatting(const cv::Mat& im, bool remove_small_connected_area = false); FASTDEPLOY_DECL cv::Mat VisOcr(const cv::Mat& im, const OCRResult& ocr_result); -FASTDEPLOY_DECL cv::Mat VisMOT(const cv::Mat& img,const MOTResult& results, float fps=0.0, int frame_id=0); - +FASTDEPLOY_DECL cv::Mat VisMOT(const cv::Mat& img, const MOTResult& results, + float score_threshold = 0.0f, + tracking::TrailRecorder* recorder = nullptr); FASTDEPLOY_DECL cv::Mat SwapBackground( const cv::Mat& im, const cv::Mat& background, const MattingResult& result, bool remove_small_connected_area = false); diff --git a/fastdeploy/vision/visualize/visualize_pybind.cc b/fastdeploy/vision/visualize/visualize_pybind.cc index 8cf8b71654..7633579cc8 100644 --- a/fastdeploy/vision/visualize/visualize_pybind.cc +++ b/fastdeploy/vision/visualize/visualize_pybind.cc @@ -86,9 +86,9 @@ void BindVisualize(pybind11::module& m) { return TensorToPyArray(out); }) .def("vis_mot", - [](pybind11::array& im_data, vision::MOTResult& result,float fps, int frame_id) { + [](pybind11::array& im_data, vision::MOTResult& result,float score_threshold, vision::tracking::TrailRecorder record) { auto im = PyArrayToCvMat(im_data); - auto vis_im = vision::VisMOT(im, result,fps,frame_id); + auto vis_im = vision::VisMOT(im, result, score_threshold, &record); FDTensor out; vision::Mat(vis_im).ShareWithTensor(&out); return TensorToPyArray(out); @@ -185,9 +185,10 @@ void BindVisualize(pybind11::module& m) { return TensorToPyArray(out); }) .def_static("vis_mot", - [](pybind11::array& im_data, vision::MOTResult& result,float fps, int frame_id) { + [](pybind11::array& im_data, vision::MOTResult& result,float score_threshold, + vision::tracking::TrailRecorder* record) { auto im = PyArrayToCvMat(im_data); - auto vis_im = vision::VisMOT(im, result,fps,frame_id); + auto vis_im = vision::VisMOT(im, result, score_threshold, record); FDTensor out; 
               vision::Mat(vis_im).ShareWithTensor(&out);
               return TensorToPyArray(out);
diff --git a/python/fastdeploy/vision/tracking/__init__.py b/python/fastdeploy/vision/tracking/__init__.py
index 946dfd9716..d21c975e97 100644
--- a/python/fastdeploy/vision/tracking/__init__.py
+++ b/python/fastdeploy/vision/tracking/__init__.py
@@ -12,5 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from __future__ import absolute_import
-
+from ... import c_lib_wrap as C
 from .pptracking import PPTracking
+
+try:
+    TrailRecorder = C.vision.tracking.TrailRecorder
+except:
+    pass
diff --git a/python/fastdeploy/vision/tracking/pptracking/__init__.py b/python/fastdeploy/vision/tracking/pptracking/__init__.py
index 89ca2a7b01..d26b4ba1f4 100644
--- a/python/fastdeploy/vision/tracking/pptracking/__init__.py
+++ b/python/fastdeploy/vision/tracking/pptracking/__init__.py
@@ -48,3 +48,18 @@ class PPTracking(FastDeployModel):
         """
         assert input_image is not None, "The input image data is None."
         return self._model.predict(input_image)
+
+    def bind_recorder(self, val):
+        """ Bind a tracking trail recorder
+
+        :param val: (TrailRecorder) trail recorder, which contains each object's id and center point sequence
+        :return: None
+        """
+        self._model.bind_recorder(val)
+
+    def unbind_recorder(self):
+        """ Cancel the binding of the tracking trail recorder
+
+        :return: None
+        """
+        self._model.unbind_recorder()
diff --git a/python/fastdeploy/vision/visualize/__init__.py b/python/fastdeploy/vision/visualize/__init__.py
index b7f7c7b141..ddbd8758e7 100755
--- a/python/fastdeploy/vision/visualize/__init__.py
+++ b/python/fastdeploy/vision/visualize/__init__.py
@@ -15,6 +15,7 @@
 from __future__ import absolute_import
 import logging
 from ... import c_lib_wrap as C
+import cv2
 def vis_detection(im_data,
@@ -106,5 +107,5 @@ def vis_ppocr(im_data, det_result):
     return C.vision.vis_ppocr(im_data, det_result)
-def vis_mot(im_data, mot_result, fps, frame_id):
-    return C.vision.vis_mot(im_data, mot_result, fps, frame_id)
+def vis_mot(im_data, mot_result, score_threshold=0.0, records=None):
+    return C.vision.vis_mot(im_data, mot_result, score_threshold, records)
From 1db85e0ad91615e3cb4185950b7be9f41b46f5bf Mon Sep 17 00:00:00 2001
From: WJJ1995
Date: Thu, 3 Nov 2022 11:23:03 +0800
Subject: [PATCH 06/18] [Benchmark] Update benchmark (#488)

* add paddle_trt in benchmark

* update benchmark in device

* update benchmark
---
 benchmark/benchmark_ppcls.py | 16 +++-------------
 benchmark/benchmark_ppdet.py | 16 +++-------------
 benchmark/benchmark_ppseg.py | 16 +++-------------
 benchmark/benchmark_yolo.py  | 16 +++-------------
 4 files changed, 12 insertions(+), 52 deletions(-)
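One caveat about the `str2bool` → `ast.literal_eval` swap this patch applies to the four benchmark scripts below: `ast.literal_eval` parses Python literals only, so `--enable_trt_fp16 True` works, but the lowercase `true`/`false` spellings that the removed `str2bool` helper accepted will now raise. A quick illustration:

```python
import ast

ast.literal_eval("True")   # -> True
ast.literal_eval("False")  # -> False
ast.literal_eval("true")   # raises ValueError: malformed node or string
```

Callers therefore need to pass the capitalized literals on the command line.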
diff --git a/benchmark/benchmark_ppcls.py b/benchmark/benchmark_ppcls.py
index 914ace71b0..039a07cc9e 100755
--- a/benchmark/benchmark_ppcls.py
+++ b/benchmark/benchmark_ppcls.py
@@ -22,19 +22,9 @@
 import GPUtil
 import time
-def str2bool(v):
-    if isinstance(v, bool):
-        return v
-    if v.lower() == 'true':
-        return True
-    elif v.lower() == 'false':
-        return False
-    else:
-        raise argparse.ArgumentTypeError('Boolean value expected.')
-
-
 def parse_arguments():
     import argparse
+    import ast
     parser = argparse.ArgumentParser()
     parser.add_argument(
         "--model", required=True, help="Path of PaddleClas model.")
@@ -64,12 +54,12 @@ def parse_arguments():
         help="inference backend, default, ort, ov, trt, paddle, paddle_trt.")
     parser.add_argument(
         "--enable_trt_fp16",
-        type=str2bool,
+        type=ast.literal_eval,
         default=False,
         help="whether enable fp16 in trt backend")
     parser.add_argument(
         "--enable_collect_memory_info",
-        type=str2bool,
+        type=ast.literal_eval,
         default=False,
         help="whether enable collect memory info")
     args = parser.parse_args()
diff --git a/benchmark/benchmark_ppdet.py b/benchmark/benchmark_ppdet.py
index cb8d47f44e..6cabc4d4e9 100755
--- a/benchmark/benchmark_ppdet.py
+++ b/benchmark/benchmark_ppdet.py
@@ -24,19 +24,9 @@
 import GPUtil
 import time
-def str2bool(v):
-    if isinstance(v, bool):
-        return v
-    if v.lower() == 'true':
-        return True
-    elif v.lower() == 'false':
-        return False
-    else:
-        raise argparse.ArgumentTypeError('Boolean value expected.')
-
-
 def parse_arguments():
     import argparse
+    import ast
     parser = argparse.ArgumentParser()
     parser.add_argument(
         "--model", required=True, help="Path of PaddleDetection model.")
@@ -66,12 +56,12 @@ def parse_arguments():
         help="inference backend, default, ort, ov, trt, paddle, paddle_trt.")
     parser.add_argument(
         "--enable_trt_fp16",
-        type=str2bool,
+        type=ast.literal_eval,
         default=False,
         help="whether enable fp16 in trt backend")
     parser.add_argument(
         "--enable_collect_memory_info",
-        type=str2bool,
+        type=ast.literal_eval,
         default=False,
         help="whether enable collect memory info")
     args = parser.parse_args()
diff --git a/benchmark/benchmark_ppseg.py b/benchmark/benchmark_ppseg.py
index 2c7a37c2f1..ef57e37150 100755
--- a/benchmark/benchmark_ppseg.py
+++ b/benchmark/benchmark_ppseg.py
@@ -22,19 +22,9 @@
 import GPUtil
 import time
-def str2bool(v):
-    if isinstance(v, bool):
-        return v
-    if v.lower() == 'true':
-        return True
-    elif v.lower() == 'false':
-        return False
-    else:
-        raise argparse.ArgumentTypeError('Boolean value expected.')
-
-
 def parse_arguments():
     import argparse
+    import ast
     parser = argparse.ArgumentParser()
     parser.add_argument(
         "--model", required=True, help="Path of PaddleSeg model.")
@@ -64,12 +54,12 @@ def parse_arguments():
         help="inference backend, default, ort, ov, trt, paddle, paddle_trt.")
     parser.add_argument(
         "--enable_trt_fp16",
-        type=str2bool,
+        type=ast.literal_eval,
        default=False,
         help="whether enable fp16 in trt backend")
     parser.add_argument(
         "--enable_collect_memory_info",
-        type=str2bool,
+        type=ast.literal_eval,
         default=False,
         help="whether enable collect memory info")
     args = parser.parse_args()
diff --git a/benchmark/benchmark_yolo.py b/benchmark/benchmark_yolo.py
index f534c43f3e..aa6927c833 100755
--- a/benchmark/benchmark_yolo.py
+++ b/benchmark/benchmark_yolo.py
@@ -24,19 +24,9 @@
 import GPUtil
 import time
-def str2bool(v):
-    if isinstance(v, bool):
-        return v
-    if v.lower() == 'true':
-        return True
-    elif v.lower() == 'false':
-        return False
-    else:
-        raise argparse.ArgumentTypeError('Boolean value expected.')
-
-
 def parse_arguments():
     import argparse
+    import ast
     parser = argparse.ArgumentParser()
     parser.add_argument(
         "--model", required=True, help="Path of Yolo onnx model.")
@@ -66,12 +56,12 @@ def parse_arguments():
         help="inference backend, default, ort, ov, trt, paddle, paddle_trt.")
     parser.add_argument(
         "--enable_trt_fp16",
-        type=str2bool,
+        type=ast.literal_eval,
         default=False,
         help="whether enable fp16 in trt backend")
     parser.add_argument(
         "--enable_collect_memory_info",
-        type=str2bool,
+        type=ast.literal_eval,
         default=False,
         help="whether enable collect memory info")
     args = parser.parse_args()
From 9339d52cea2319afe94a44064bb88c3b6beabe85 Mon Sep 17 00:00:00 2001
From: Double_V
Date: Thu, 3 Nov 2022 15:16:35 +0800
Subject: [PATCH 07/18] [Doc] add en readme for js (#479)

* add contributor

* add package readme

* refine ocr readme

* refine ocr readme
readme * add en readme about js * fix bugs * refine doc --- examples/application/js/README.md | 3 +- examples/application/js/README_en.md | 40 ++++ examples/application/js/WebDemo.md | 2 + examples/application/js/WebDemo_en.md | 176 ++++++++++++++++++ .../application/js/mini_program/README.md | 22 +-- .../application/js/mini_program/README_en.md | 125 +++++++++++++ examples/application/js/package/README.md | 2 + examples/application/js/package/README_en.md | 41 ++++ 8 files changed, 399 insertions(+), 12 deletions(-) create mode 100644 examples/application/js/README_en.md create mode 100644 examples/application/js/WebDemo_en.md create mode 100644 examples/application/js/mini_program/README_en.md create mode 100644 examples/application/js/package/README_en.md diff --git a/examples/application/js/README.md b/examples/application/js/README.md index 23fd53e091..25fed0f1e2 100644 --- a/examples/application/js/README.md +++ b/examples/application/js/README.md @@ -1,3 +1,4 @@ +[English](README_en.md) | 简体中文 # 前端AI应用 @@ -19,7 +20,7 @@ |目标检测|[ScrewDetection、FaceDetection](./web_demo/src/pages/cv/detection/)| | |人像分割背景替换|[HumanSeg](./web_demo/src/pages/cv/segmentation/HumanSeg)|| |物体识别|[GestureRecognition、ItemIdentification](./web_demo/src/pages/cv/recognition/)|| -|PP-OCRv3|[TextDetection、TextRecognition](./web_demo/src/pages/cv/ocr/)|| +|OCR|[TextDetection、TextRecognition](./web_demo/src/pages/cv/ocr/)|| ## 微信小程序Demo使用 diff --git a/examples/application/js/README_en.md b/examples/application/js/README_en.md new file mode 100644 index 0000000000..7506b9e8bc --- /dev/null +++ b/examples/application/js/README_en.md @@ -0,0 +1,40 @@ +English | [简体中文](README.md) + +# Front-end AI application + +The development of artificial intelligence technology has led to industrial upgrading in the fields of computer vision(CV) and natural language processing(NLP). In addition, the deployment of AI models in browsers to achieve front-end intelligence has already provided good basic conditions with the steady increase in computing power on PCs and mobile devices, iterative updates of model compression technologies, and the continuous emergence of various innovative needs. +In response to the difficulty of deploying AI deep learning models on the front-end, Baidu has open-sourced the Paddle.js front-end deep learning model deployment framework, which can easily deploy deep learning models into front-end projects. + +# Introduction of Paddle.js + +[Paddle.js](https://github.com/PaddlePaddle/Paddle.js) is a web sub-project of Baidu `PaddlePaddle`, an open source deep learning framework running in the browser. `Paddle.js` can load the deep learning model trained by `PaddlePaddle`, and convert it into a browser-friendly model through the model conversion tool `paddlejs-converter` of `Paddle.js`, which is easy to use for online reasoning and prediction. `Paddle.js` supports running in browsers of `WebGL/WebGPU/WebAssembly`, and can also run in the environment of Baidu applet and WeChat applet. + +Finally, we can launch AI functions in front-end application scenarios such as browsers and mini-program using `Paddle.js`, including but not limited to AI capabilities such as object detection, image segmentation, OCR, and item classification. + +## Web Demo + +Refer to this [document](./WebDemo_en.md) for steps to run computer vision demo in the browser. 
+ +|demo|web demo directory|visualization| +|-|-|-| +|object detection|[ScrewDetection、FaceDetection](./web_demo/src/pages/cv/detection/)| | +|human segmentation|[HumanSeg](./web_demo/src/pages/cv/segmentation/HumanSeg)|| +|classification|[GestureRecognition、ItemIdentification](./web_demo/src/pages/cv/recognition/)|| +|OCR|[TextDetection、TextRecognition](./web_demo/src/pages/cv/ocr/)|| + + +## Wechat Mini-program + +Run the official demo reference in the WeChat mini-program [document](./mini_program/README.md) + +|Name|Directory| +|-|-| +|OCR Text Detection| [ocrdetecXcx](./mini_program/ocrdetectXcx/) | +|OCR Text Recognition| [ocrXcx](./mini_program/ocrXcx/) | +|object detection| coming soon | +|Image segmentation | coming soon | +|Item Category| coming soon | + +## Contributor + +Thanks to Paddle Paddle Developer Expert (PPDE) Chen Qianhe (github: [chenqianhe](https://github.com/chenqianhe)) for the Web demo, mini-program. \ No newline at end of file diff --git a/examples/application/js/WebDemo.md b/examples/application/js/WebDemo.md index a1928edf42..068a4b0017 100644 --- a/examples/application/js/WebDemo.md +++ b/examples/application/js/WebDemo.md @@ -1,3 +1,5 @@ +[English](WebDemo_en.md) | 简体中文 + # Web Demo介绍 - [简介](#0) diff --git a/examples/application/js/WebDemo_en.md b/examples/application/js/WebDemo_en.md new file mode 100644 index 0000000000..9962ad163e --- /dev/null +++ b/examples/application/js/WebDemo_en.md @@ -0,0 +1,176 @@ +English | [简体中文](WebDemo.md) + +# Introduction to Web Demo + +- [Introduction](#0) +- [1. Quick Start](#1) +- [2. npm package call](#2) +- [3. Model Replacement](#3) +- [4. custom hyperparameters](#4) +- [5. Other](#5) + + +## Introduction + +Based on [Paddle.js](https://github.com/PaddlePaddle/Paddle.js), this project implements computer vision tasks such as target detection, portrait segmentation, OCR, and item classification in the browser. 
+ + +|demo name|web demo component|source directory|npm package| +|-|-|-|-| +|Face Detection|[FaceDetection](./web_demo/src/pages/cv/detection/FaceDetection/)| [facedetect](./package/packages/paddlejs-models/facedetect)|[@paddle-js-models/ facedetect](https://www.npmjs.com/package/@paddle-js-models/facedetect)| +|Screw Detection|[ScrewDetection](./web_demo/src/pages/cv/detection/ScrewDetection)| [detect](./package/packages/paddlejs-models/detect)|[@paddle-js-models/detect](https://www.npmjs.com/package/@paddle-js-models/detect)| +|Portrait segmentation background replacement|[HumanSeg](./web_demo/src/pages/cv/segmentation/HumanSeg)|[humanseg](./package/packages/paddlejs-models/humanseg)|[@paddle-js-models/ humanseg](https://www.npmjs.com/package/@paddle-js-models/humanseg)| +|Gesture Recognition AI Guessing Shell|[GestureRecognition](./web_demo/src/pages/cv/recognition/GestureRecognition)|[gesture](./package/packages/paddlejs-models/gesture)|[@paddle-js- models/gesture](https://www.npmjs.com/package/@paddle-js-models/gesture)| +|1000 Item Identification|[ItemIdentification](./web_demo/src/pages/cv/recognition/ItemIdentification)|[mobilenet](./package/packages/paddlejs-models/mobilenet)|[@paddle-js-models/ mobilenet](https://www.npmjs.com/package/@paddle-js-models/mobilenet)| +|Text Detection|[TextDetection](./web_demo/src/pages/cv/ocr/TextDetection)|[ocrdetection](./package/packages/paddlejs-models/ocrdetection)|[@paddle-js-models/ocrdet](https://www.npmjs.com/package/@paddle-js-models/ocrdet)| +|Text Recognition|[TextRecognition](./web_demo/src/pages/cv/ocr/TextRecognition)|[ocr](./package/packages/paddlejs-models/ocr)|[@paddle-js-models/ocr](https://www.npmjs.com/package/@paddle-js-models/ocr)| + + + +## 1. Quick Start + +This section describes how to run the official demo directly in the browser. + +**1. Install Node.js** + +Download the `Node.js` installation package suitable for your platform from the `Node.js` official website https://nodejs.org/en/download/ and install it. + +**2. Install demo dependencies and start** +Execute the following command in the `./web_demo` directory: + +```` +# install dependencies +npm install +# start demo +npm run dev +```` + +Open the URL `http://localhost:5173/main/index.html` in the browser to quickly experience running computer vision tasks in the browser. + +![22416f4a3e7d63f950b838be3cd11e80](https://user-images.githubusercontent.com/26592129/196685868-93ab53bd-cb2e-44ff-a56b-50c1781b8679.jpg) + + + +## 2. npm package call + +This section introduces how to use npm packages. Each demo provides an easy-to-use interface. Users only need to initialize and upload images to get the results. The steps are as follows: +1. Call the module +2. Initialize the model +3. Pass in input, perform prediction + +Taking OCR as an example, in a front-end project, the `@paddle-js-models/ocr` package is used as follows: + +```` +// 1. Call the ocr module +import * as ocr from '@paddle-js-models/ocr'; + +// 2. Initialize the ocr model +await ocr.init(); + +// 3. Pass in an image of type HTMLImageElement as input and get the result +const res = await ocr.recognize(img); + +// Print the text coordinates and text content obtained by the OCR model +console.log(res.text); +console.log(res.points); +```` + + +## 3. Model replacement + +Due to the limitations of the front-end environment and computing resources, when deploying deep learning models on the front-end, we have stricter requirements on the performance of the models. 
In short, the models need to be lightweight enough. In theory, the smaller the input shape of the model and the smaller the model size, the smaller the flops of the corresponding model, and the smoother the front-end operation. Based on experience, the model storage deployed with `Paddle.js` should not exceed *5M* as much as possible, and the actual situation depends on the hardware and computing resources. + +In practical applications, models are often customized according to vertical scenarios, and the official demo supports modifying incoming parameters to replace models. + +Take the OCR demo as an example, [ocr.init()function](https://github.com/PaddlePaddle/FastDeploy/tree/develop/examples/application/js/package/packages/paddlejs-models/ocr/src/index.ts#L52), contains the default initialization model link, if you want to replace the model, please refer to the following steps. + +Step 1: Convert the model to js format: +```` +# Install paddlejsconverter +pip3 install paddlejsconverter +# Convert the model format, the input model is the inference model +paddlejsconverter --modelPath=./inference.pdmodel --paramPath=./inference.pdiparams --outputDir=./ --useGPUOpt=True +# Note: The useGPUOpt option is not enabled by default. If the model is used on the gpu backend (webgl/webgpu), enable useGPUOpt. If the model is running on (wasm/plain js), do not enable it. +```` + +After the export is successful, files such as `model.json chunk_1.dat` will appear in the local directory, which are the network structure and model parameter binary files corresponding to the js model. + +Step 2: Upload the exported js model to a server that supports cross-domain access. For the CORS configuration of the server, refer to the following image: +![image](https://user-images.githubusercontent.com/26592129/196612669-5233137a-969c-49eb-b8c7-71bef5088686.png) + + +Step 3: Modify the code to replace the default model. Take the OCR demo as an example, modify the [model initialization code](https://github.com/PaddlePaddle/FastDeploy/tree/develop/examples/application/js/web_demo/src/pages/cv/ocr/TextRecognition/TextRecognition.vue#L64) in the OCR web demo , i.e. + +```` +await ocr.init(); +change into: +await ocr.init({modelPath: "https://js-models.bj.bcebos.com/PaddleOCR/PP-OCRv3/ch_PP-OCRv3_det_infer_js_960/model.json"}); # The first parameter passes in the new text Check dictionary type parameter +```` + +Re-execute the following command in the demo directory to experience the new model effect. +```` +npm run dev +```` + + +## 4. custom hyperparameters + +**Custom preprocessing parameters** + +In different computer vision tasks, different models may have different preprocessing parameters, such as mean, std, keep_ratio and other parameters. After replacing the model, the preprocessing parameters also need to be modified. A simple solution for customizing preprocessing parameters is provided in the npm package published by paddle.js. You only need to pass in custom parameters when calling the model initialization function. + +```` +# Default parameter initialization +await model.init(); + +Custom parameter initialization +const Config = {mean: [0.5, 0.5, 0.5], std: [0.5, 0.5, 0.5], keepratio: false}; +await model.init(Config); +```` + +Taking the OCR text detection demo as an example, to modify the mean and std parameters of the model preprocessing, you only need to pass in the custom mean and std parameters when the model is initialized. 
+```` +await ocr.init(); +change into: +const detConfig = {mean: [0.5, 0.5, 0.5], std: [0.5, 0.5, 0.5]}; +await ocr.init(detConfig); # The first parameter passes in the new text detection model link +```` + +**Custom postprocessing parameters** + +Similarly, the npm package published by paddle.js also provides a custom solution for post-processing parameters. + +```` +# run with default parameters +await model.predict(); + +# custom post-processing parameters +const postConfig = {thresh: 0.5}; +await model.predict(Config); +```` + +Take the OCR text detection demo as an example, modify the parameters of the text detection post-processing to achieve the effect of expanding the text detection frame, and modify the OCR web demo to execute the [model prediction code](https://github.com/PaddlePaddle/FastDeploy/tree/develop/examples/application/web_demo/src/pages/cv/ocr/TextRecognition/TextRecognition.vue#L99), ie: + +```` +const res = await ocr.recognize(img, { canvas: canvas.value }); +change into: +// Define hyperparameters, increase the unclip_ratio parameter from 1.5 to 3.5 +const detConfig = {shape: 960, thresh: 0.3, box_thresh: 0.6, unclip_ratio:3.5}; +const res = await ocr.recognize(img, { canvas: canvas.value }, detConfig); +```` + +Note: Different tasks have different post-processing parameters. For detailed parameters, please refer to the API in the npm package. + + +## 5. Others + +The converted model of `Paddle.js` can not only be used in the browser, but also can be run in the Baidu mini-program and WeChat mini-program environment. + +|Name|Directory| +|-|-| +|OCR Text Detection| [ocrdetecXcx](./mini_program/ocrdetectXcx/) | +|OCR Text Recognition| [ocrXcx](./mini_program/ocrXcx/) | +|target detection| coming soon | +| Image segmentation | coming soon | +|Item Category| coming soon | + diff --git a/examples/application/js/mini_program/README.md b/examples/application/js/mini_program/README.md index 92c6ae7a3a..2782f701e3 100644 --- a/examples/application/js/mini_program/README.md +++ b/examples/application/js/mini_program/README.md @@ -1,3 +1,4 @@ +[English](README_en.md) | 简体中文 # Paddle.js微信小程序Demo @@ -100,27 +101,26 @@ wx.canvasGetImageData({ ## 4. 常见问题 -### 4.1 出现报错 `Invalid context type [webgl2] for Canvas#getContext` -可以不管,不影响正常代码运行和demo功能 +- 4.1 出现报错 `Invalid context type [webgl2] for Canvas#getContext` -### 4.2 预览看不到结果 + **答:** 可以不管,不影响正常代码运行和demo功能 -建议尝试真机调试 +- 4.2 预览看不到结果 -### 4.3 微信开发者工具出现黑屏,然后出现超多报错 + **答:** 建议尝试真机调试 -重启微信开发者工具 +- 4.3 微信开发者工具出现黑屏,然后出现超多报错 -### 4.4 模拟和真机调试结果不一致;模拟检测不到文本等 + **答:** 重启微信开发者工具 -可以以真机为准; +- 4.4 模拟和真机调试结果不一致;模拟检测不到文本等 -模拟检测不到文本等可以尝试随意改动下代码(增删换行等)再点击编译 + **答:** 可以以真机为准;模拟检测不到文本等可以尝试随意改动下代码(增删换行等)再点击编译 -### 4.5 手机调试或运行时出现 长时间无反应等提示 +- 4.5 手机调试或运行时出现 长时间无反应等提示 -请继续等待,模型推理需要一定时间 + **答:** 请继续等待,模型推理需要一定时间 diff --git a/examples/application/js/mini_program/README_en.md b/examples/application/js/mini_program/README_en.md new file mode 100644 index 0000000000..602ddf7885 --- /dev/null +++ b/examples/application/js/mini_program/README_en.md @@ -0,0 +1,125 @@ +English | [中文](README.md) + +# Paddle.js WeChat mini-program Demo + +- [1. Introduction](#1) +- [2. Project Start](#2) + * [2.1 Preparations](#21) + * [2.2 Startup steps](#22) + * [2.3 visualization](#23) +- [3. Model inference pipeline](#3) +- [4. 
FAQ](#4) + + +## 1 Introduction + + +This directory contains the text detection, text recognition mini-program demo, by using [Paddle.js](https://github.com/PaddlePaddle/Paddle.js) and [Paddle.js WeChat mini-program plugin](https://mp.weixin.qq.com/wxopen/plugindevdoc?appid=wx7138a7bb793608c3&token=956931339&lang=zh_CN) to complete the text detection frame selection effect on the mini-program using the computing power of the user terminal. + + +## 2. Project start + + +### 2.1 Preparations +* [Apply for a WeChat mini-program account](https://mp.weixin.qq.com/) +* [WeChat Mini Program Developer Tools](https://developers.weixin.qq.com/miniprogram/dev/devtools/download.html) +* Front-end development environment preparation: node, npm +* Configure the server domain name in the mini-program management background, or open the developer tool [do not verify the legal domain name] + +For details, please refer to [document.](https://mp.weixin.qq.com/wxamp/devprofile/get_profile?token=1132303404&lang=zh_CN) + + +### 2.2 Startup steps + +#### **1. Clone the demo code** +````sh +git clone https://github.com/PaddlePaddle/FastDeploy +cd FastDeploy/examples/application/js/mini_program +```` + +#### **2. Enter the mini-program directory and install dependencies** + +````sh +# Run the text recognition demo and enter the ocrXcx directory +cd ./ocrXcx && npm install +# Run the text detection demo and enter the ocrdetectXcx directory +# cd ./ocrdetectXcx && npm install +```` + +#### **3. WeChat mini-program import code** +Open WeChat Developer Tools --> Import --> Select a directory and enter relevant information + +#### **4. Add Paddle.js WeChat mini-program plugin** +Mini Program Management Interface --> Settings --> Third Party Settings --> Plugin Management --> Add Plugins --> Search for `wx7138a7bb793608c3` and add +[Reference document](https://developers.weixin.qq.com/miniprogram/dev/framework/plugin/using.html) + +#### **5. Build dependencies** +Click on the menu bar in the developer tools: Tools --> Build npm + +Reason: The node_modules directory will not be involved in compiling, uploading and packaging. If a small program wants to use npm packages, it must go through the process of "building npm". After the construction is completed, a miniprogram_npm directory will be generated, which will store the built and packaged npm packages. It is the npm package that the mini-program actually uses. * +[Reference Documentation](https://developers.weixin.qq.com/miniprogram/dev/devtools/npm.html) + + +### 2.3 visualization + + + + +## 3. Model inference pipeline + +```typescript +// Introduce paddlejs and paddlejs-plugin, register the mini-program environment variables and the appropriate backend +import * as paddlejs from '@paddlejs/paddlejs-core'; +import '@paddlejs/paddlejs-backend-webgl'; +const plugin = requirePlugin('paddlejs-plugin'); +plugin.register(paddlejs, wx); + +// Initialize the inference engine +const runner = new paddlejs.Runner({modelPath, feedShape, mean, std}); +await runner.init(); + +// get image information +wx.canvasGetImageData({ + canvasId: canvasId, + x: 0, + y: 0, + width: canvas.width, + height: canvas.height, + success(res) { + // inference prediction + runner.predict({ + data: res.data, + width: canvas.width, + height: canvas.height, + }, function (data) { + // get the inference result + console.log(data) + }); + } +}); +```` + + +## 4. 
+
+
+## 4. FAQ
+
+- 4.1 An error occurs: `Invalid context type [webgl2] for Canvas#getContext`
+
+  **A:** This can be ignored; it does not affect the code or the demo.
+
+- 4.2 No result is shown in preview
+
+  **A:** It is recommended to try debugging on a real device.
+
+- 4.3 The WeChat developer tool shows a black screen, followed by a flood of errors
+
+  **A:** Restart the WeChat Developer Tools.
+
+- 4.4 The simulator and real-device results are inconsistent; the simulator fails to detect text, etc.
+
+  **A:** Take the real-device result as authoritative.
+  If the simulator fails to detect text, try making a trivial change to the code (adding or deleting a line break, etc.) and compiling again.
+
+
+- 4.5 The phone shows prompts such as "no response for a long time" while debugging or running
+
+  **A:** Please keep waiting; model inference takes some time
\ No newline at end of file
diff --git a/examples/application/js/package/README.md b/examples/application/js/package/README.md
index f2430fd2df..35aa73da62 100644
--- a/examples/application/js/package/README.md
+++ b/examples/application/js/package/README.md
@@ -1,3 +1,5 @@
+[English](README_en.md) | 简体中文
+
 # Paddle.js Model Module介绍
 
 该部分是基于 Paddle.js 进行开发的模型库,主要提供 Web 端可直接引入使用模型的能力。
diff --git a/examples/application/js/package/README_en.md b/examples/application/js/package/README_en.md
new file mode 100644
index 0000000000..77243b6b98
--- /dev/null
+++ b/examples/application/js/package/README_en.md
@@ -0,0 +1,41 @@
+English | [简体中文](README.md)
+
+# Introduction to the Paddle.js Model Module
+
+This part is a model library developed on top of Paddle.js. It mainly provides models that can be imported and used directly on the web side (a usage sketch is given at the end of this document).
+
+| Demo name | Source directory | npm package |
+| - | - | - |
+| Face detection | [facedetect](./packages/paddlejs-models/facedetect) | [@paddle-js-models/facedetect](https://www.npmjs.com/package/@paddle-js-models/facedetect) |
+| Screw detection | [detect](./packages/paddlejs-models/detect) | [@paddle-js-models/detect](https://www.npmjs.com/package/@paddle-js-models/detect) |
+| Portrait segmentation and background replacement | [humanseg](./packages/paddlejs-models/humanseg) | [@paddle-js-models/humanseg](https://www.npmjs.com/package/@paddle-js-models/humanseg) |
+| Gesture recognition (AI rock-paper-scissors) | [gesture](./packages/paddlejs-models/gesture) | [@paddle-js-models/gesture](https://www.npmjs.com/package/@paddle-js-models/gesture) |
+| 1000-class item recognition | [mobilenet](./packages/paddlejs-models/mobilenet) | [@paddle-js-models/mobilenet](https://www.npmjs.com/package/@paddle-js-models/mobilenet) |
+| Text detection | [ocrdetection](./packages/paddlejs-models/ocrdetection) | [@paddle-js-models/ocrdet](https://www.npmjs.com/package/@paddle-js-models/ocrdet) |
+| Text recognition | [ocr](./packages/paddlejs-models/ocr) | [@paddle-js-models/ocr](https://www.npmjs.com/package/@paddle-js-models/ocr) |
+
+## Usage
+
+This part is a monorepo built with `pnpm`.
+
+### Install dependencies
+
+````sh
+pnpm i
+````
+
+### Development
+See `package.json` for the available scripts; local development testing is done with `yalc`.
+
+````sh
+pnpm run dev:xxx
+````
+
+### Overall Introduction
+
+1. rollup is used to build both CommonJS and ES module outputs in one pass, and the setup is extensible; at present the dependent cv library has some issues, and no UMD build is configured.
+2. d.ts files are generated with api-extractor during packaging, so the published packages can be imported from TypeScript.
+3. Testing is based on jest, with reporting of test coverage.
+4. Code style is maintained with TypeScript and ESLint to keep development consistent.
+5. Custom commit keywords are defined for conventional-changelog-cli, and the changelog is generated from them.
+6. Local package development and testing are based on yalc.
\ No newline at end of file
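+
+As a quick illustration of how these packages are consumed, here is a minimal sketch based on the OCR web demo earlier in this document. The `init`/`recognize` calls mirror that demo; the image and canvas lookups are placeholders you would adapt to your page, and the exact result shape should be checked against the package API.
+
+````
+import * as ocr from '@paddle-js-models/ocr';
+
+// Load the OCR models once at startup
+await ocr.init();
+
+// Run recognition on an image element, drawing detection boxes onto a canvas
+const img = document.getElementById('image') as HTMLImageElement;
+const canvas = document.getElementById('canvas') as HTMLCanvasElement;
+const res = await ocr.recognize(img, { canvas });
+console.log(res); // inspect the recognized text in the result object
+````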
 | Matting | [PaddleSeg/PP-Matting](./examples/vision/matting/ppmatting) | [Python](./examples/vision/matting/ppmatting/python)/[C++](./examples/vision/matting/ppmatting/cpp) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅| ❌ |❔|❔|
 | Matting | [PaddleSeg/PP-HumanMatting](./examples/vision/matting/modnet) | [Python](./examples/vision/matting/ppmatting/python)/[C++](./examples/vision/matting/ppmatting/cpp) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅| ❌ |❔|❔|
 | Matting | [PaddleSeg/ModNet](./examples/vision/matting/modnet) | [Python](./examples/vision/matting/ppmatting/python)/[C++](./examples/vision/matting/ppmatting/cpp) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅| ❌ |❔|❔|

From 696c75412e5c2c524eb23b4c996cd18fe14cf486 Mon Sep 17 00:00:00 2001
From: leiqing <54695910+leiqing1@users.noreply.github.com>
Date: Thu, 3 Nov 2022 17:05:44 +0800
Subject: [PATCH 09/18] Update README_EN.md

---
 README_EN.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README_EN.md b/README_EN.md
index b7cded7639..852190b02c 100644
--- a/README_EN.md
+++ b/README_EN.md
@@ -228,6 +228,8 @@ Notes: ✅: already supported; ❔: to be supported in the future; ❌: not supp
 | Matting | [PaddleSeg/PP-HumanMatting](./examples/vision/matting/modnet) | [Python](./examples/vision/matting/ppmatting/python)/[C++](./examples/vision/matting/ppmatting/cpp) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅| ❌ |❔|❔|
 | Matting | [PaddleSeg/ModNet](./examples/vision/matting/modnet) | [Python](./examples/vision/matting/ppmatting/python)/[C++](./examples/vision/matting/ppmatting/cpp) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅| ❌ |❔|❔|
 | Information Extraction | [PaddleNLP/UIE](./examples/text/uie) | [Python](./examples/text/uie/python)/[C++](./examples/text/uie/cpp) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅| ❌ |❔|❔|
+| Text Classification | [PaddleNLP/Ernie-3.0](./examples/text/ernie-3.0) | Python/C++ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅| ❌ |❔|❔|
+| Text-to-Speech | [PaddleSpeech/PP-TTS](./examples/audio/pp-tts) | [Python](./examples/audio/pp-tts/python)/C++ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅| ❌ |❔|❔|
 
 ## Edge-Side Deployment

From 5b1fecd66813b1e3041b3e4b5042186f316a96e1 Mon Sep 17 00:00:00 2001
From: leiqing <54695910+leiqing1@users.noreply.github.com>
Date: Thu, 3 Nov 2022 22:29:08 +0800
Subject: [PATCH 10/18] Update README_EN.md

---
 README_EN.md | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/README_EN.md b/README_EN.md
index 852190b02c..e03dc65408 100644
--- a/README_EN.md
+++ b/README_EN.md
@@ -30,22 +30,22 @@ English | [简体中文](README_CN.md)
 
 ## 📣 Recent Updates
 
-- 🔥 **2022.10.15:Release FastDeploy [release v0.3.0](https://github.com/PaddlePaddle/FastDeploy/tree/release/0.3.0)**
+- 🔥 **2022.10.31: Release FastDeploy [release v0.5.0](https://github.com/PaddlePaddle/FastDeploy/tree/release/0.5.0)**
+  - **New deployment upgrade: support more backends and more CV models**
+    - Support Paddle Inference TensorRT, providing a seamless deployment experience alongside the other integrated inference engines: TensorRT, OpenVINO, ONNX Runtime, Paddle Lite, and Paddle Inference;
+    - Support Graphcore IPU through Paddle Inference;
+    - Support the tracking model [PP-Tracking](./examples/vision/tracking/pptracking) and the [RobustVideoMatting](./examples/vision/matting) model
+
+- 🔥 **2022.10.24: Release FastDeploy [release v0.4.0](https://github.com/PaddlePaddle/FastDeploy/tree/release/0.4.0)**
- **New server-side deployment upgrade: support more CV model and NLP model**
-    - Integrate OpenVINO and provide a seamless deployment experience with other inference engines include TensorRT、ONNX Runtime、Paddle Inference;
-    - Support [one-click model quantization](tools/quantization) to improve model inference speed by 1.5 to 2 times on CPU & GPU platform. The supported quantized model are YOLOv7, YOLOv6, YOLOv5, etc.
+    - Integrate Paddle Lite and provide a seamless deployment experience with the other integrated inference engines: TensorRT, OpenVINO, ONNX Runtime, and Paddle Inference;
+    - Support [lightweight detection models](examples/vision/detection/paddledetection/android) and [classification models](examples/vision/classification/paddleclas/android) on the Android platform, with downloadable demos to try out
+    - End-to-end optimization on GPU: [YOLO series](examples/vision/detection) models speed up from 43 ms to 25 ms end-to-end;
+    - New Web and mini-program deployment capability for [OCR and other CV models](examples/application/js);
+    - Support [TinyPose](examples/vision/keypointdetection/tiny_pose) and [PicoDet+TinyPose](examples/vision/keypointdetection/det_keypoint_unite) pipeline deployment;
     - New CV models include PP-OCRv3, PP-OCRv2, PP-TinyPose, PP-Matting, etc. and provides [end-to-end deployment demos](examples/vision/detection/)
     - New information extraction model is UIE, and provides [end-to-end deployment demos](examples/text/uie).
 
-- 🔥 **2022.8.18:Release FastDeploy [release v0.2.0](https://github.com/PaddlePaddle/FastDeploy/tree/release%2F0.2.0)**
- - **New server-side deployment upgrade: faster inference performance, support more CV model** - - Release high-performance inference engine SDK based on x86 CPUs and NVIDIA GPUs, with significant increase in inference speed - - Integrate Paddle Inference, ONNX Runtime, TensorRT and other inference engines and provide a seamless deployment experience - - Supports full range of object detection models such as YOLOv7, YOLOv6, YOLOv5, PP-YOLOE and provides [end-to-end deployment demos](examples/vision/detection/) - - Support over 40 key models and [demo examples](examples/vision/) including face detection, face recognition, real-time portrait matting, image segmentation. - - Support deployment in both Python and C++ - - **Supports Rockchip, Amlogic, NXP and other NPU chip deployment capabilities on edge device deployment** - - Release Lightweight Object Detection [Picodet-NPU deployment demo](https://github.com/PaddlePaddle/Paddle-Lite-Demo/tree/develop/object_detection/linux/picodet_detection), providing the full quantized inference capability for INT8. ## Contents From a36d49a973a303a2d0524971044745a807fe7d36 Mon Sep 17 00:00:00 2001 From: DefTruth <31974251+DefTruth@users.noreply.github.com> Date: Fri, 4 Nov 2022 09:19:03 +0800 Subject: [PATCH 11/18] [FlyCV] optimize the integration of FlyCV (#492) * [Backend] fix lite backend save model error * [Backend] fixed typos * [FlyCV] optimize the integration of FlyCV * [cmake] close some tests options * [cmake] close some test option * [FlyCV] remove un-need warnings * [FlyCV] remove un-need GetMat method * [FlyCV] optimize FlyCV codes * [cmake] remove un-need cmake function in examples/CMakelists * [cmake] support gflags for Android --- .gitignore | 2 + cmake/gflags.cmake | 95 +++++++++----- examples/CMakeLists.txt | 33 ++--- .../vision/classification/ppcls/model.cc | 3 +- fastdeploy/vision/common/processors/base.cc | 115 +++++++++++++++-- fastdeploy/vision/common/processors/base.h | 21 +-- fastdeploy/vision/common/processors/cast.cc | 10 +- fastdeploy/vision/common/processors/cast.h | 8 +- .../vision/common/processors/center_crop.cc | 8 +- .../vision/common/processors/center_crop.h | 4 +- .../common/processors/color_space_convert.cc | 14 +- .../common/processors/color_space_convert.h | 8 +- .../vision/common/processors/convert.cc | 11 +- fastdeploy/vision/common/processors/convert.h | 4 +- fastdeploy/vision/common/processors/crop.cc | 4 +- fastdeploy/vision/common/processors/crop.h | 4 +- .../vision/common/processors/hwc2chw.cc | 18 ++- fastdeploy/vision/common/processors/hwc2chw.h | 4 +- .../vision/common/processors/letter_box.h | 6 +- .../common/processors/limit_by_stride.cc | 8 +- .../common/processors/limit_by_stride.h | 4 +- .../vision/common/processors/limit_long.cc | 12 +- .../vision/common/processors/limit_long.h | 4 +- .../vision/common/processors/limit_short.cc | 9 +- .../vision/common/processors/limit_short.h | 4 +- fastdeploy/vision/common/processors/mat.cc | 120 ++++++------------ fastdeploy/vision/common/processors/mat.h | 79 +++++++----- .../vision/common/processors/normalize.cc | 20 +-- .../vision/common/processors/normalize.h | 12 +- .../processors/normalize_and_permute.cc | 36 +++--- .../common/processors/normalize_and_permute.h | 11 +- fastdeploy/vision/common/processors/pad.cc | 13 +- fastdeploy/vision/common/processors/pad.h | 4 +- .../vision/common/processors/pad_to_size.cc | 9 +- .../vision/common/processors/pad_to_size.h | 4 +- fastdeploy/vision/common/processors/resize.cc | 13 +- 
fastdeploy/vision/common/processors/resize.h | 4 +- .../vision/common/processors/resize_by_long.h | 2 +- .../common/processors/resize_by_short.cc | 6 +- .../common/processors/resize_by_short.h | 4 +- .../vision/common/processors/stride_pad.cc | 7 +- .../vision/common/processors/stride_pad.h | 4 +- .../vision/common/processors/transform.h | 52 ++++---- fastdeploy/vision/common/processors/utils.cc | 67 ++++++++-- fastdeploy/vision/common/processors/utils.h | 14 +- .../vision/common/processors/warp_affine.h | 15 +-- fastdeploy/vision/detection/contrib/yolov5.cc | 49 +++---- fastdeploy/vision/detection/contrib/yolov6.cc | 30 +++-- fastdeploy/vision/detection/contrib/yolov7.cc | 30 +++-- fastdeploy/vision/matting/contrib/modnet.cc | 7 +- fastdeploy/vision/matting/contrib/rvm.cc | 33 +++-- .../vision/matting/ppmatting/ppmatting.cc | 14 +- fastdeploy/vision/segmentation/ppseg/model.cc | 31 +++-- .../vision/tracking/pptracking/trajectory.h | 9 +- fastdeploy/vision/utils/crop_image.cc | 14 +- fastdeploy/vision/utils/utils.h | 31 +++-- 56 files changed, 670 insertions(+), 487 deletions(-) mode change 100755 => 100644 fastdeploy/vision/common/processors/letter_box.h mode change 100755 => 100644 fastdeploy/vision/matting/contrib/modnet.cc mode change 100755 => 100644 fastdeploy/vision/matting/contrib/rvm.cc mode change 100755 => 100644 fastdeploy/vision/matting/ppmatting/ppmatting.cc mode change 100755 => 100644 fastdeploy/vision/segmentation/ppseg/model.cc diff --git a/.gitignore b/.gitignore index db38f7705d..2edbc3103c 100644 --- a/.gitignore +++ b/.gitignore @@ -38,3 +38,5 @@ coverage *.local yalc.* .yalc +examples/vision/collect_quantize_cc.sh +examples/vision/tests_quantize \ No newline at end of file diff --git a/cmake/gflags.cmake b/cmake/gflags.cmake index 3402383cb7..08d5e49c81 100644 --- a/cmake/gflags.cmake +++ b/cmake/gflags.cmake @@ -38,36 +38,71 @@ ENDIF(WIN32) INCLUDE_DIRECTORIES(${GFLAGS_INCLUDE_DIR}) -ExternalProject_Add( - extern_gflags - ${EXTERNAL_PROJECT_LOG_ARGS} - ${SHALLOW_CLONE} - GIT_REPOSITORY ${GFLAGS_REPOSITORY} - GIT_TAG ${GFLAGS_TAG} - PREFIX ${GFLAGS_PREFIX_DIR} - UPDATE_COMMAND "" - BUILD_COMMAND ${BUILD_COMMAND} - INSTALL_COMMAND ${INSTALL_COMMAND} - CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} - -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} - -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} - -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} - -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} - -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} - -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} - -DBUILD_STATIC_LIBS=ON - -DCMAKE_INSTALL_PREFIX=${GFLAGS_INSTALL_DIR} - -DCMAKE_POSITION_INDEPENDENT_CODE=ON - -DBUILD_TESTING=OFF - -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} - ${EXTERNAL_OPTIONAL_ARGS} - CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GFLAGS_INSTALL_DIR} - -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON - -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} - BUILD_BYPRODUCTS ${GFLAGS_LIBRARIES} -) - +if(ANDROID) + ExternalProject_Add( + extern_gflags + ${EXTERNAL_PROJECT_LOG_ARGS} + ${SHALLOW_CLONE} + GIT_REPOSITORY ${GFLAGS_REPOSITORY} + GIT_TAG ${GFLAGS_TAG} + PREFIX ${GFLAGS_PREFIX_DIR} + UPDATE_COMMAND "" + BUILD_COMMAND ${BUILD_COMMAND} + INSTALL_COMMAND ${INSTALL_COMMAND} + CMAKE_ARGS -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE} + -DANDROID_ABI=${ANDROID_ABI} + -DANDROID_NDK=${ANDROID_NDK} + -DANDROID_PLATFORM=${ANDROID_PLATFORM} + -DANDROID_STL=c++_static + -DANDROID_TOOLCHAIN=${ANDROID_TOOLCHAIN} + 
-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} + -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} + -DBUILD_STATIC_LIBS=ON + -DCMAKE_INSTALL_PREFIX=${GFLAGS_INSTALL_DIR} + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DBUILD_TESTING=OFF + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + ${EXTERNAL_OPTIONAL_ARGS} + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GFLAGS_INSTALL_DIR} + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} + BUILD_BYPRODUCTS ${GFLAGS_LIBRARIES} + ) +else() + ExternalProject_Add( + extern_gflags + ${EXTERNAL_PROJECT_LOG_ARGS} + ${SHALLOW_CLONE} + GIT_REPOSITORY ${GFLAGS_REPOSITORY} + GIT_TAG ${GFLAGS_TAG} + PREFIX ${GFLAGS_PREFIX_DIR} + UPDATE_COMMAND "" + BUILD_COMMAND ${BUILD_COMMAND} + INSTALL_COMMAND ${INSTALL_COMMAND} + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} + -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} + -DBUILD_STATIC_LIBS=ON + -DCMAKE_INSTALL_PREFIX=${GFLAGS_INSTALL_DIR} + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DBUILD_TESTING=OFF + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + ${EXTERNAL_OPTIONAL_ARGS} + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GFLAGS_INSTALL_DIR} + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} + BUILD_BYPRODUCTS ${GFLAGS_LIBRARIES} + ) +endif() ADD_LIBRARY(gflags STATIC IMPORTED GLOBAL) SET_PROPERTY(TARGET gflags PROPERTY IMPORTED_LOCATION ${GFLAGS_LIBRARIES}) ADD_DEPENDENCIES(gflags extern_gflags) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 1f14fa8897..e0d99a30ab 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -37,12 +37,6 @@ function(config_fastdeploy_executable_link_flags TARGET_NAME) endif() endfunction() -# Usage: add_fastdeploy_executable_cc_files(xxx_var vision detection) -function(add_fastdeploy_executable_cc_files CC_FILES_VAR FIELD SUB_FIELD) - file(GLOB_RECURSE _EXAMPLE_SRCS ${PROJECT_SOURCE_DIR}/examples/${FIELD}/${SUB_FIELD}/*/cpp/*.cc) - set(${CC_FILES_VAR} ${_EXAMPLE_SRCS} PARENT_SCOPE) -endfunction() - set(EXAMPLES_NUM 0) function(add_fastdeploy_executable FIELD CC_FILE) # temp target name/file var in function scope @@ -55,7 +49,11 @@ function(add_fastdeploy_executable FIELD CC_FILE) add_executable(${TEMP_TARGET_NAME} ${TEMP_TARGET_FILE}) target_link_libraries(${TEMP_TARGET_NAME} PUBLIC fastdeploy) if(TARGET gflags) - target_link_libraries(${TEMP_TARGET_NAME} PRIVATE gflags pthread) + if(NOT ANDROID) + target_link_libraries(${TEMP_TARGET_NAME} PRIVATE gflags pthread) + else() + target_link_libraries(${TEMP_TARGET_NAME} PRIVATE gflags) + endif() endif() config_fastdeploy_executable_link_flags(${TEMP_TARGET_NAME}) math(EXPR _EXAMPLES_NUM "${EXAMPLES_NUM} + 1") @@ -78,22 +76,13 @@ if(BUILD_EXAMPLES AND ENABLE_VISION) if(EXISTS ${PROJECT_SOURCE_DIR}/examples/vision) message(STATUS "") message(STATUS "*************FastDeploy Vision Examples Summary**********") - set(ALL_VISION_SUD_FIELDS classification - detection - facedet - faceid - keypointdetection - matting - ocr - segmentation) - if(NOT ANDROID) - list(APPEND 
ALL_VISION_SUD_FIELDS tracking) + file(GLOB_RECURSE ALL_VISION_EXAMPLE_SRCS ${PROJECT_SOURCE_DIR}/examples/vision/*/*/cpp/*.cc) + if(ANDROID) + file(GLOB_RECURSE TRACKING_SRCS ${PROJECT_SOURCE_DIR}/examples/vision/tracking/*/cpp/*.cc) + list(REMOVE_ITEM ALL_VISION_EXAMPLE_SRCS ${TRACKING_SRCS}) endif() - foreach(_SUB_FIELD ${ALL_VISION_SUD_FIELDS}) - add_fastdeploy_executable_cc_files(_SUB_CC_FILES vision ${_SUB_FIELD}) - foreach(_CC_FILE ${_SUB_CC_FILES}) - add_fastdeploy_executable(vision ${_CC_FILE}) - endforeach() + foreach(_CC_FILE ${ALL_VISION_EXAMPLE_SRCS}) + add_fastdeploy_executable(vision ${_CC_FILE}) endforeach() message(STATUS " [FastDeploy Executable Path] : ${EXECUTABLE_OUTPUT_PATH}") endif() diff --git a/fastdeploy/vision/classification/ppcls/model.cc b/fastdeploy/vision/classification/ppcls/model.cc index 57e3541117..3eed25c6cd 100644 --- a/fastdeploy/vision/classification/ppcls/model.cc +++ b/fastdeploy/vision/classification/ppcls/model.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "fastdeploy/vision/classification/ppcls/model.h" + #include "fastdeploy/vision/utils/utils.h" #include "yaml-cpp/yaml.h" @@ -108,7 +109,7 @@ bool PaddleClasModel::Preprocess(Mat* mat, FDTensor* output) { int height = mat->Height(); output->name = InputInfoOfRuntime(0).name; output->SetExternalData({1, channel, height, width}, FDDataType::FP32, - mat->GetOpenCVMat()->ptr()); + mat->Data()); return true; } diff --git a/fastdeploy/vision/common/processors/base.cc b/fastdeploy/vision/common/processors/base.cc index 4e26297020..f7831ae638 100644 --- a/fastdeploy/vision/common/processors/base.cc +++ b/fastdeploy/vision/common/processors/base.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "fastdeploy/vision/common/processors/base.h" + #include "fastdeploy/utils/utils.h" namespace fastdeploy { @@ -21,27 +22,18 @@ namespace vision { ProcLib Processor::default_lib = ProcLib::DEFAULT; bool Processor::operator()(Mat* mat, ProcLib lib) { - // if default_lib is set - // then use default_lib ProcLib target = lib; - if (default_lib != ProcLib::DEFAULT) { + if (lib == ProcLib::DEFAULT) { target = default_lib; } - if (target == ProcLib::FLYCV) { #ifdef ENABLE_FLYCV - if (mat->mat_type != ProcLib::FLYCV) { - if (mat->layout != Layout::HWC) { - FDERROR << "Cannot convert cv::Mat to fcv::Mat while layout is not HWC." << std::endl; - } - fcv::Mat fcv_mat = ConvertOpenCVMatToFalconCV(*(mat->GetOpenCVMat())); - mat->SetMat(fcv_mat); - } - return ImplByFalconCV(mat); + return ImplByFlyCV(mat); #else - FDASSERT(false, "FastDeploy didn't compile with FalconCV."); + FDASSERT(false, "FastDeploy didn't compile with FlyCV."); #endif } + // DEFAULT & OPENCV return ImplByOpenCV(mat); } @@ -52,7 +44,7 @@ void EnableFlyCV() { << Processor::default_lib << std::endl; #else FDWARNING << "FastDeploy didn't compile with FlyCV, " - "will fallback to use OpenCV instead." + "will fallback to use OpenCV instead." 
<< std::endl;
 #endif
 }
 
@@ -63,5 +55,100 @@ void DisableFlyCV() {
             << Processor::default_lib << std::endl;
 }
 
+cv::Mat CreateOpenCVMatFromTensor(const FDTensor& tensor) {
+  FDDataType type = tensor.dtype;
+  FDASSERT(tensor.shape.size() == 3,
+           "When create FD Mat from tensor, tensor shape should be 3-Dim, HWC "
+           "layout");
+  int64_t height = tensor.shape[0];
+  int64_t width = tensor.shape[1];
+  int64_t channel = tensor.shape[2];
+  cv::Mat ocv_mat;
+  // reference to outside FDTensor, zero copy
+  switch (type) {
+    case FDDataType::UINT8:
+      ocv_mat = cv::Mat(height, width, CV_8UC(channel),
+                        const_cast<void*>(tensor.Data()));
+      break;
+    case FDDataType::INT8:
+      ocv_mat = cv::Mat(height, width, CV_8SC(channel),
+                        const_cast<void*>(tensor.Data()));
+      break;
+    case FDDataType::INT16:
+      ocv_mat = cv::Mat(height, width, CV_16SC(channel),
+                        const_cast<void*>(tensor.Data()));
+      break;
+    case FDDataType::INT32:
+      ocv_mat = cv::Mat(height, width, CV_32SC(channel),
+                        const_cast<void*>(tensor.Data()));
+      break;
+    case FDDataType::FP32:
+      ocv_mat = cv::Mat(height, width, CV_32FC(channel),
+                        const_cast<void*>(tensor.Data()));
+      break;
+    case FDDataType::FP64:
+      ocv_mat = cv::Mat(height, width, CV_64FC(channel),
+                        const_cast<void*>(tensor.Data()));
+      break;
+    default:
+      FDASSERT(false,
+               "Tensor type %d is not supported while calling "
+               "CreateFDMatFromTensor.",
+               type);
+      break;
+  }
+  return ocv_mat;
+}
+
+#ifdef ENABLE_FLYCV
+fcv::Mat CreateFlyCVMatFromTensor(const FDTensor& tensor) {
+  FDDataType type = tensor.dtype;
+  FDASSERT(tensor.shape.size() == 3,
+           "When create FD Mat from tensor, tensor shape should be 3-Dim, HWC "
+           "layout");
+  int64_t height = tensor.shape[0];
+  int64_t width = tensor.shape[1];
+  int64_t channel = tensor.shape[2];
+  fcv::Mat fcv_mat;
+  auto fcv_type = CreateFlyCVDataType(type, static_cast<int>(channel));
+  switch (type) {
+    case FDDataType::UINT8:
+      fcv_mat =
+          fcv::Mat(width, height, fcv_type, const_cast<void*>(tensor.Data()));
+      break;
+    case FDDataType::FP32:
+      fcv_mat =
+          fcv::Mat(width, height, fcv_type, const_cast<void*>(tensor.Data()));
+      break;
+    case FDDataType::FP64:
+      fcv_mat =
+          fcv::Mat(width, height, fcv_type, const_cast<void*>(tensor.Data()));
+      break;
+    default:
+      FDASSERT(false,
+               "Tensor type %d is not supported while calling "
+               "CreateFDMatFromTensor.",
+               type);
+      break;
+  }
+  return fcv_mat;
+}
+#endif
+
+Mat CreateFDMatFromTensor(const FDTensor& tensor) {
+  if (Processor::default_lib == ProcLib::FLYCV) {
+#ifdef ENABLE_FLYCV
+    fcv::Mat fcv_mat = CreateFlyCVMatFromTensor(tensor);
+    Mat mat = Mat(fcv_mat);
+    return mat;
+#else
+    FDASSERT(false, "FastDeploy wasn't compiled with FlyCV!");
+#endif
+  }
+  cv::Mat ocv_mat = CreateOpenCVMatFromTensor(tensor);
+  Mat mat = Mat(ocv_mat);
+  return mat;
+}
+
 }  // namespace vision
 }  // namespace fastdeploy
diff --git a/fastdeploy/vision/common/processors/base.h b/fastdeploy/vision/common/processors/base.h
index d985f348fc..bfd1e00856 100644
--- a/fastdeploy/vision/common/processors/base.h
+++ b/fastdeploy/vision/common/processors/base.h
@@ -22,7 +22,9 @@ namespace fastdeploy {
 
 namespace vision {
 
-/*! @brief Enable using FlyCV to process image while deploy vision models. Currently, FlyCV in only available on ARM(Linux aarch64/Android), so will fallback to using OpenCV in other platform
+/*! @brief Enable using FlyCV to process image while deploying vision models.
+ * Currently, FlyCV is only available on ARM(Linux aarch64/Android), so it
+ * will fall back to OpenCV on other platforms
  */
 FASTDEPLOY_DECL void EnableFlyCV();
 
@@ -41,16 +43,19 @@ class FASTDEPLOY_DECL Processor {
 
   virtual bool ImplByOpenCV(Mat* mat) = 0;
 
-  virtual bool ImplByFalconCV(Mat* mat) {
-    FDASSERT(false,
-             "%s is not implemented with FalconCV, please use OpenCV instead.",
-             Name().c_str());
-    return false;
+  virtual bool ImplByFlyCV(Mat* mat) {
+    return ImplByOpenCV(mat);
   }
 
-  virtual bool operator()(Mat* mat,
-                          ProcLib lib = ProcLib::OPENCV);
+  virtual bool operator()(Mat* mat, ProcLib lib = ProcLib::DEFAULT);
 };
 
+// Create OpenCV/FlyCV/FD Mat from FD Tensor
+cv::Mat CreateOpenCVMatFromTensor(const FDTensor& tensor);
+#ifdef ENABLE_FLYCV
+fcv::Mat CreateFlyCVMatFromTensor(const FDTensor& tensor);
+#endif
+Mat CreateFDMatFromTensor(const FDTensor& tensor);
+
 }  // namespace vision
 }  // namespace fastdeploy
diff --git a/fastdeploy/vision/common/processors/cast.cc b/fastdeploy/vision/common/processors/cast.cc
index 62aad3778f..0ca04a504e 100644
--- a/fastdeploy/vision/common/processors/cast.cc
+++ b/fastdeploy/vision/common/processors/cast.cc
@@ -36,8 +36,8 @@ bool Cast::ImplByOpenCV(Mat* mat) {
 }
 
 #ifdef ENABLE_FLYCV
-bool Cast::ImplByFalconCV(Mat* mat) {
-  fcv::Mat* im = mat->GetFalconCVMat();
+bool Cast::ImplByFlyCV(Mat* mat) {
+  fcv::Mat* im = mat->GetFlyCVMat();
   if (dtype_ == "float" && mat->Type() == FDDataType::FP32) {
     return true;
   }
@@ -46,18 +46,18 @@
   if (mat->layout != Layout::HWC) {
     FDERROR
-        << "While using Falcon to cast image, the image must be layout of HWC."
+        << "While using FlyCV to cast image, the image must be layout of HWC."
         << std::endl;
     return false;
   }
   if (dtype_ == "float") {
     fcv::Mat new_im;
-    auto fcv_type = CreateFalconCVDataType(FDDataType::FP32, im->channels());
+    auto fcv_type = CreateFlyCVDataType(FDDataType::FP32, im->channels());
     im->convert_to(new_im, fcv_type);
     mat->SetMat(new_im);
   } else if (dtype_ == "double") {
     fcv::Mat new_im;
-    auto fcv_type = CreateFalconCVDataType(FDDataType::FP64, im->channels());
+    auto fcv_type = CreateFlyCVDataType(FDDataType::FP64, im->channels());
     im->convert_to(new_im, fcv_type);
     mat->SetMat(new_im);
   } else {
diff --git a/fastdeploy/vision/common/processors/cast.h b/fastdeploy/vision/common/processors/cast.h
index a438fb9743..891ae334c8 100644
--- a/fastdeploy/vision/common/processors/cast.h
+++ b/fastdeploy/vision/common/processors/cast.h
@@ -24,15 +24,13 @@
   explicit Cast(const std::string& dtype = "float") : dtype_(dtype) {}
   bool ImplByOpenCV(Mat* mat);
 #ifdef ENABLE_FLYCV
-  bool ImplByFalconCV(Mat* mat);
+  bool ImplByFlyCV(Mat* mat);
 #endif
   std::string Name() { return "Cast"; }
 
   static bool Run(Mat* mat, const std::string& dtype,
-                  ProcLib lib = ProcLib::OPENCV);
+                  ProcLib lib = ProcLib::DEFAULT);
 
-  std::string GetDtype() const {
-    return dtype_;
-  }
+  std::string GetDtype() const { return dtype_; }
 
  private:
   std::string dtype_;
diff --git a/fastdeploy/vision/common/processors/center_crop.cc b/fastdeploy/vision/common/processors/center_crop.cc
index 6aca3acd09..af7c744482 100644
--- a/fastdeploy/vision/common/processors/center_crop.cc
+++ b/fastdeploy/vision/common/processors/center_crop.cc
@@ -36,8 +36,8 @@ bool CenterCrop::ImplByOpenCV(Mat* mat) {
 }
 
 #ifdef ENABLE_FLYCV
-bool CenterCrop::ImplByFalconCV(Mat* mat) {
-  fcv::Mat* im = mat->GetFalconCVMat();
+bool CenterCrop::ImplByFlyCV(Mat* mat) {
+  fcv::Mat* im = mat->GetFlyCVMat();
   int height = static_cast<int>(im->height());
   int width = static_cast<int>(im->width());
   if (height < height_ || width < width_) {
@@ -62,5 +62,5 @@ bool CenterCrop::Run(Mat* mat, const int& width, const int& height,
   return c(mat, lib);
 }
 
-} // namespace vision
-} // namespace fastdeploy
+}  // namespace vision
+}  // namespace fastdeploy
diff --git a/fastdeploy/vision/common/processors/center_crop.h b/fastdeploy/vision/common/processors/center_crop.h
index befeeb5a35..05f594249c 100644
--- a/fastdeploy/vision/common/processors/center_crop.h
+++ b/fastdeploy/vision/common/processors/center_crop.h
@@ -24,12 +24,12 @@ class FASTDEPLOY_DECL CenterCrop : public Processor {
   CenterCrop(int width, int height) : height_(height), width_(width) {}
   bool ImplByOpenCV(Mat* mat);
 #ifdef ENABLE_FLYCV
-  bool ImplByFalconCV(Mat* mat);
+  bool ImplByFlyCV(Mat* mat);
 #endif
   std::string Name() { return "CenterCrop"; }
 
   static bool Run(Mat* mat, const int& width, const int& height,
-                  ProcLib lib = ProcLib::OPENCV);
+                  ProcLib lib = ProcLib::DEFAULT);
 
  private:
   int height_;
diff --git a/fastdeploy/vision/common/processors/color_space_convert.cc b/fastdeploy/vision/common/processors/color_space_convert.cc
index 15e8aa4265..4ccfb65100 100644
--- a/fastdeploy/vision/common/processors/color_space_convert.cc
+++ b/fastdeploy/vision/common/processors/color_space_convert.cc
@@ -25,10 +25,11 @@ bool BGR2RGB::ImplByOpenCV(Mat* mat) {
 }
 
 #ifdef ENABLE_FLYCV
-bool BGR2RGB::ImplByFalconCV(Mat* mat) {
-  fcv::Mat* im = mat->GetFalconCVMat();
+bool BGR2RGB::ImplByFlyCV(Mat* mat) {
+  fcv::Mat* im = mat->GetFlyCVMat();
   if (im->channels() != 3) {
-    FDERROR << "[BGR2RGB] The channel of input image must be 3, but not it's " << im->channels() << "." << std::endl;
+    FDERROR << "[BGR2RGB] The channel of input image must be 3, but now it's "
+            << im->channels() << "." << std::endl;
     return false;
   }
   fcv::Mat new_im;
@@ -47,10 +48,11 @@ bool RGB2BGR::ImplByOpenCV(Mat* mat) {
 }
 
 #ifdef ENABLE_FLYCV
-bool RGB2BGR::ImplByFalconCV(Mat* mat) {
-  fcv::Mat* im = mat->GetFalconCVMat();
+bool RGB2BGR::ImplByFlyCV(Mat* mat) {
+  fcv::Mat* im = mat->GetFlyCVMat();
   if (im->channels() != 3) {
-    FDERROR << "[RGB2BGR] The channel of input image must be 3, but not it's " << im->channels() << "." << std::endl;
+    FDERROR << "[RGB2BGR] The channel of input image must be 3, but now it's "
+            << im->channels() << "." << std::endl;
     return false;
   }
   fcv::Mat new_im;
diff --git a/fastdeploy/vision/common/processors/color_space_convert.h b/fastdeploy/vision/common/processors/color_space_convert.h
index ad66acd4f3..e090bc62d5 100644
--- a/fastdeploy/vision/common/processors/color_space_convert.h
+++ b/fastdeploy/vision/common/processors/color_space_convert.h
@@ -23,22 +23,22 @@ class FASTDEPLOY_DECL BGR2RGB : public Processor {
  public:
   bool ImplByOpenCV(Mat* mat);
 #ifdef ENABLE_FLYCV
-  bool ImplByFalconCV(Mat* mat);
+  bool ImplByFlyCV(Mat* mat);
 #endif
   virtual std::string Name() { return "BGR2RGB"; }
 
-  static bool Run(Mat* mat, ProcLib lib = ProcLib::OPENCV);
+  static bool Run(Mat* mat, ProcLib lib = ProcLib::DEFAULT);
 };
 
 class FASTDEPLOY_DECL RGB2BGR : public Processor {
  public:
   bool ImplByOpenCV(Mat* mat);
 #ifdef ENABLE_FLYCV
-  bool ImplByFalconCV(Mat* mat);
+  bool ImplByFlyCV(Mat* mat);
 #endif
   std::string Name() { return "RGB2BGR"; }
 
-  static bool Run(Mat* mat, ProcLib lib = ProcLib::OPENCV);
+  static bool Run(Mat* mat, ProcLib lib = ProcLib::DEFAULT);
 };
 }  // namespace vision
 }  // namespace fastdeploy
diff --git a/fastdeploy/vision/common/processors/convert.cc b/fastdeploy/vision/common/processors/convert.cc
index 01c0673701..c50d2aa6d4 100644
--- a/fastdeploy/vision/common/processors/convert.cc
+++ b/fastdeploy/vision/common/processors/convert.cc
@@ -40,17 +40,18 @@ bool Convert::ImplByOpenCV(Mat* mat) {
 }
 
 #ifdef ENABLE_FLYCV
-bool Convert::ImplByFalconCV(Mat* mat) {
-  fcv::Mat* im = mat->GetFalconCVMat();
-  FDASSERT(im->channels() == 3, "Only support 3-channels image in FalconCV.");
+bool Convert::ImplByFlyCV(Mat* mat) {
+  fcv::Mat* im = mat->GetFlyCVMat();
+  FDASSERT(im->channels() == 3, "Only support 3-channels image in FlyCV.");
   std::vector<float> mean(3, 0);
   std::vector<float> std(3, 0);
   for (size_t i = 0; i < 3; ++i) {
-    std[i] = 1.0 / alpha_[i];
+    std[i] = 1.0 / alpha_[i];
     mean[i] = -1 * beta_[i] * std[i];
   }
   fcv::Mat new_im;
-  fcv::normalize_to_submean_to_reorder(*im, mean, std, std::vector<uint32_t>(), new_im, true);
+  fcv::normalize_to_submean_to_reorder(*im, mean, std, std::vector<uint32_t>(),
+                                       new_im, true);
   mat->SetMat(new_im);
   return true;
 }
diff --git a/fastdeploy/vision/common/processors/convert.h b/fastdeploy/vision/common/processors/convert.h
index 197316a52c..6f26c45282 100644
--- a/fastdeploy/vision/common/processors/convert.h
+++ b/fastdeploy/vision/common/processors/convert.h
@@ -24,7 +24,7 @@ class FASTDEPLOY_DECL Convert : public Processor {
 
   bool ImplByOpenCV(Mat* mat);
 #ifdef ENABLE_FLYCV
-  bool ImplByFalconCV(Mat* mat);
+  bool ImplByFlyCV(Mat* mat);
 #endif
   std::string Name() { return "Convert"; }
 
   // The default behavior is the same as OpenCV's convertTo method.
   static bool Run(Mat* mat, const std::vector<float>& alpha,
                   const std::vector<float>& beta,
-                  ProcLib lib = ProcLib::OPENCV);
+                  ProcLib lib = ProcLib::DEFAULT);
 
  private:
   std::vector<float> alpha_;
diff --git a/fastdeploy/vision/common/processors/crop.cc b/fastdeploy/vision/common/processors/crop.cc
index fc3a41ecba..ccb06f6539 100644
--- a/fastdeploy/vision/common/processors/crop.cc
+++ b/fastdeploy/vision/common/processors/crop.cc
@@ -37,8 +37,8 @@ bool Crop::ImplByOpenCV(Mat* mat) {
 }
 
 #ifdef ENABLE_FLYCV
-bool Crop::ImplByFalconCV(Mat* mat) {
-  fcv::Mat* im = mat->GetFalconCVMat();
+bool Crop::ImplByFlyCV(Mat* mat) {
+  fcv::Mat* im = mat->GetFlyCVMat();
   int height = static_cast<int>(im->height());
   int width = static_cast<int>(im->width());
   if (height < height_ + offset_h_ || width < width_ + offset_w_) {
diff --git a/fastdeploy/vision/common/processors/crop.h b/fastdeploy/vision/common/processors/crop.h
index 276d7d6cf6..369edb610f 100644
--- a/fastdeploy/vision/common/processors/crop.h
+++ b/fastdeploy/vision/common/processors/crop.h
@@ -31,12 +31,12 @@ class FASTDEPLOY_DECL Crop : public Processor {
 
   bool ImplByOpenCV(Mat* mat);
 #ifdef ENABLE_FLYCV
-  bool ImplByFalconCV(Mat* mat);
+  bool ImplByFlyCV(Mat* mat);
 #endif
   std::string Name() { return "Crop"; }
 
   static bool Run(Mat* mat, int offset_w, int offset_h, int width, int height,
-                  ProcLib lib = ProcLib::OPENCV);
+                  ProcLib lib = ProcLib::DEFAULT);
 
  private:
   int offset_w_;
diff --git a/fastdeploy/vision/common/processors/hwc2chw.cc b/fastdeploy/vision/common/processors/hwc2chw.cc
index 3af9b559d5..9db5c09ffd 100644
--- a/fastdeploy/vision/common/processors/hwc2chw.cc
+++ b/fastdeploy/vision/common/processors/hwc2chw.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "fastdeploy/vision/common/processors/hwc2chw.h"
+
 #include "fastdeploy/function/transpose.h"
 
 namespace fastdeploy {
@@ -41,18 +42,21 @@ bool HWC2CHW::ImplByOpenCV(Mat* mat) {
 }
 
 #ifdef ENABLE_FLYCV
-bool HWC2CHW::ImplByFalconCV(Mat* mat) {
+bool HWC2CHW::ImplByFlyCV(Mat* mat) {
   if (mat->layout != Layout::HWC) {
-    FDERROR << "HWC2CHW: The input data is not Layout::HWC format!" << std::endl;
+    FDERROR << "HWC2CHW: The input data is not Layout::HWC format!"
+            << std::endl;
     return false;
   }
   if (mat->Type() != FDDataType::FP32) {
-    FDERROR << "HWC2CHW: Only support float data while use FalconCV, but now it's " << mat->Type() << "." << std::endl;
+    FDERROR << "HWC2CHW: Only support float data while using FlyCV, but now it's "
+            << mat->Type() << "." << std::endl;
     return false;
   }
-  fcv::Mat* im = mat->GetFalconCVMat();
+  fcv::Mat* im = mat->GetFlyCVMat();
   fcv::Mat new_im;
-  fcv::normalize_to_submean_to_reorder(*im, {0.0, 0.0, 0.0}, {1.0, 1.0, 1.0}, std::vector<uint32_t>(), new_im, false);
+  fcv::normalize_to_submean_to_reorder(*im, {0.0, 0.0, 0.0}, {1.0, 1.0, 1.0},
+                                       std::vector<uint32_t>(), new_im, false);
   mat->SetMat(new_im);
   mat->layout = Layout::CHW;
   return true;
@@ -64,5 +68,5 @@ bool HWC2CHW::Run(Mat* mat, ProcLib lib) {
   return h(mat, lib);
 }
 
-} // namespace vision
-} // namespace fastdeploy
+}  // namespace vision
+}  // namespace fastdeploy
diff --git a/fastdeploy/vision/common/processors/hwc2chw.h b/fastdeploy/vision/common/processors/hwc2chw.h
index a21de7b4c3..535a1887b5 100644
--- a/fastdeploy/vision/common/processors/hwc2chw.h
+++ b/fastdeploy/vision/common/processors/hwc2chw.h
@@ -23,11 +23,11 @@ class FASTDEPLOY_DECL HWC2CHW : public Processor {
  public:
   bool ImplByOpenCV(Mat* mat);
 #ifdef ENABLE_FLYCV
-  bool ImplByFalconCV(Mat* mat);
+  bool ImplByFlyCV(Mat* mat);
 #endif
   std::string Name() { return "HWC2CHW"; }
 
-  static bool Run(Mat* mat, ProcLib lib = ProcLib::OPENCV);
+  static bool Run(Mat* mat, ProcLib lib = ProcLib::DEFAULT);
 };
 }  // namespace vision
 }  // namespace fastdeploy
diff --git a/fastdeploy/vision/common/processors/letter_box.h b/fastdeploy/vision/common/processors/letter_box.h
old mode 100755
new mode 100644
index 5b99ada2ee..f69647e892
--- a/fastdeploy/vision/common/processors/letter_box.h
+++ b/fastdeploy/vision/common/processors/letter_box.h
@@ -23,8 +23,8 @@ class LetterBoxResize : public Processor {
  public:
   LetterBoxResize(const std::vector<int>& target_size,
                   const std::vector<float>& color) {
-    target_size_ = target_size;
-    color_ = color;
+    target_size_ = target_size;
+    color_ = color;
   }
 
   bool ImplByOpenCV(Mat* mat);
 
   static bool Run(Mat* mat, const std::vector<int>& target_size,
                   const std::vector<float>& color,
-                  ProcLib lib = ProcLib::OPENCV);
+                  ProcLib lib = ProcLib::DEFAULT);
 
  private:
   std::vector<int> target_size_;
diff --git a/fastdeploy/vision/common/processors/limit_by_stride.cc b/fastdeploy/vision/common/processors/limit_by_stride.cc
index 85f3088b80..3efbc752f0 100644
--- a/fastdeploy/vision/common/processors/limit_by_stride.cc
+++ b/fastdeploy/vision/common/processors/limit_by_stride.cc
@@ -38,8 +38,8 @@ bool LimitByStride::ImplByOpenCV(Mat* mat) {
 }
 
 #ifdef ENABLE_FLYCV
-bool LimitByStride::ImplByFalconCV(Mat* mat) {
-  fcv::Mat* im = mat->GetFalconCVMat();
+bool LimitByStride::ImplByFlyCV(Mat* mat) {
+  fcv::Mat* im = mat->GetFlyCVMat();
   int origin_w = im->width();
   int origin_h = im->height();
   int rw = origin_w - origin_w % stride_;
@@ -59,16 +59,14 @@
   } else if (interp_ == 2) {
     interp_method = fcv::InterpolationType::INTER_CUBIC;
   } else {
-    FDERROR << "LimitByStride: Only support interp_ be 0/1/2 with FalconCV, but "
+    FDERROR << "LimitByStride: Only support interp_ be 0/1/2 with FlyCV, but "
               "now it's "
            << interp_ << "." 
<< std::endl; return false; } fcv::Mat new_im; - FDERROR << "Before " << im->width() << " " << im->height() << std::endl; fcv::resize(*im, new_im, fcv::Size(rw, rh), 0, 0, interp_method); - FDERROR << "After " << new_im.width() << " " << new_im.height() << std::endl; mat->SetMat(new_im); mat->SetWidth(new_im.width()); mat->SetHeight(new_im.height()); diff --git a/fastdeploy/vision/common/processors/limit_by_stride.h b/fastdeploy/vision/common/processors/limit_by_stride.h index 465bc065d4..a3751df2e9 100644 --- a/fastdeploy/vision/common/processors/limit_by_stride.h +++ b/fastdeploy/vision/common/processors/limit_by_stride.h @@ -29,12 +29,12 @@ class FASTDEPLOY_DECL LimitByStride : public Processor { // Resize Mat* mat to make the size divisible by stride_. bool ImplByOpenCV(Mat* mat); #ifdef ENABLE_FLYCV - bool ImplByFalconCV(Mat* mat); + bool ImplByFlyCV(Mat* mat); #endif std::string Name() { return "LimitByStride"; } static bool Run(Mat* mat, int stride = 32, int interp = 1, - ProcLib lib = ProcLib::OPENCV); + ProcLib lib = ProcLib::DEFAULT); private: int interp_; diff --git a/fastdeploy/vision/common/processors/limit_long.cc b/fastdeploy/vision/common/processors/limit_long.cc index 32eaad1923..7021f131b4 100644 --- a/fastdeploy/vision/common/processors/limit_long.cc +++ b/fastdeploy/vision/common/processors/limit_long.cc @@ -39,8 +39,8 @@ bool LimitLong::ImplByOpenCV(Mat* mat) { } #ifdef ENABLE_FLYCV -bool LimitLong::ImplByFalconCV(Mat* mat) { - fcv::Mat* im = mat->GetFalconCVMat(); +bool LimitLong::ImplByFlyCV(Mat* mat) { + fcv::Mat* im = mat->GetFlyCVMat(); int origin_w = im->width(); int origin_h = im->height(); int im_size_max = std::max(origin_w, origin_h); @@ -64,16 +64,13 @@ bool LimitLong::ImplByFalconCV(Mat* mat) { } else if (interp_ == 2) { interp_method = fcv::InterpolationType::INTER_CUBIC; } else { - FDERROR << "LimitLong: Only support interp_ be 0/1/2 with FalconCV, but " + FDERROR << "LimitLong: Only support interp_ be 0/1/2 with FlyCV, but " "now it's " << interp_ << "." << std::endl; return false; } fcv::Mat new_im; - FDERROR << "origin " << im->width() << " " << im->height() << std::endl; - FDERROR << "scale " << scale << std::endl; fcv::resize(*im, new_im, fcv::Size(), scale, scale, interp_method); - FDERROR << "after " << new_im.width() << " " << new_im.height() << std::endl; mat->SetMat(new_im); mat->SetWidth(new_im.width()); mat->SetHeight(new_im.height()); @@ -82,7 +79,8 @@ bool LimitLong::ImplByFalconCV(Mat* mat) { } #endif -bool LimitLong::Run(Mat* mat, int max_long, int min_long, int interp, ProcLib lib) { +bool LimitLong::Run(Mat* mat, int max_long, int min_long, int interp, + ProcLib lib) { auto l = LimitLong(max_long, min_long, interp); return l(mat, lib); } diff --git a/fastdeploy/vision/common/processors/limit_long.h b/fastdeploy/vision/common/processors/limit_long.h index e21ddbf2f4..49055973d2 100644 --- a/fastdeploy/vision/common/processors/limit_long.h +++ b/fastdeploy/vision/common/processors/limit_long.h @@ -34,12 +34,12 @@ class FASTDEPLOY_DECL LimitLong : public Processor { // to min_long_, while scale the short edge proportionally. 
bool ImplByOpenCV(Mat* mat);
 #ifdef ENABLE_FLYCV
-  bool ImplByFalconCV(Mat* mat);
+  bool ImplByFlyCV(Mat* mat);
 #endif
   std::string Name() { return "LimitLong"; }
 
   static bool Run(Mat* mat, int max_long = -1, int min_long = -1,
-                  int interp = 1, ProcLib lib = ProcLib::OPENCV);
+                  int interp = 1, ProcLib lib = ProcLib::DEFAULT);
   int GetMaxLong() const { return max_long_; }
 
  private:
diff --git a/fastdeploy/vision/common/processors/limit_short.cc b/fastdeploy/vision/common/processors/limit_short.cc
index 56eb234f8b..d0f0697c8f 100644
--- a/fastdeploy/vision/common/processors/limit_short.cc
+++ b/fastdeploy/vision/common/processors/limit_short.cc
@@ -41,8 +41,8 @@ bool LimitShort::ImplByOpenCV(Mat* mat) {
 }
 
 #ifdef ENABLE_FLYCV
-bool LimitShort::ImplByFalconCV(Mat* mat) {
-  fcv::Mat* im = mat->GetFalconCVMat();
+bool LimitShort::ImplByFlyCV(Mat* mat) {
+  fcv::Mat* im = mat->GetFlyCVMat();
   int origin_w = im->width();
   int origin_h = im->height();
   int im_size_min = std::min(origin_w, origin_h);
@@ -65,7 +65,7 @@
   } else if (interp_ == 2) {
     interp_method = fcv::InterpolationType::INTER_CUBIC;
   } else {
-    FDERROR << "LimitLong: Only support interp_ be 0/1/2 with FalconCV, but "
+    FDERROR << "LimitShort: Only support interp_ be 0/1/2 with FlyCV, but "
               "now it's "
            << interp_ << "." << std::endl;
     return false;
@@ -81,7 +81,8 @@
 }
 #endif
 
-bool LimitShort::Run(Mat* mat, int max_short, int min_short, int interp, ProcLib lib) {
+bool LimitShort::Run(Mat* mat, int max_short, int min_short, int interp,
+                     ProcLib lib) {
   auto l = LimitShort(max_short, min_short, interp);
   return l(mat, lib);
 }
diff --git a/fastdeploy/vision/common/processors/limit_short.h b/fastdeploy/vision/common/processors/limit_short.h
index 8f6af34673..268fa088fe 100644
--- a/fastdeploy/vision/common/processors/limit_short.h
+++ b/fastdeploy/vision/common/processors/limit_short.h
@@ -34,12 +34,12 @@ class LimitShort : public Processor {
   // to min_short_, while scale the long edge proportionally.
   bool ImplByOpenCV(Mat* mat);
 #ifdef ENABLE_FLYCV
-  bool ImplByFalconCV(Mat* mat);
+  bool ImplByFlyCV(Mat* mat);
 #endif
   std::string Name() { return "LimitShort"; }
 
   static bool Run(Mat* mat, int max_short = -1, int min_short = -1,
-                  int interp = 1, ProcLib lib = ProcLib::OPENCV);
+                  int interp = 1, ProcLib lib = ProcLib::DEFAULT);
   int GetMaxShort() const { return max_short_; }
 
  private:
diff --git a/fastdeploy/vision/common/processors/mat.cc b/fastdeploy/vision/common/processors/mat.cc
index a906ff760a..db380ba740 100644
--- a/fastdeploy/vision/common/processors/mat.cc
+++ b/fastdeploy/vision/common/processors/mat.cc
@@ -12,8 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
#include "fastdeploy/vision/common/processors/mat.h" -#include "fastdeploy/vision/common/processors/utils.h" + #include "fastdeploy/utils/utils.h" +#include "fastdeploy/vision/common/processors/utils.h" namespace fastdeploy { namespace vision { @@ -23,15 +24,16 @@ void* Mat::Data() { #ifdef ENABLE_FLYCV return fcv_mat.data(); #else - FDASSERT(false, "FastDeploy didn't compile with FalconCV, but met data type with fcv::Mat."); + FDASSERT(false, + "FastDeploy didn't compile with FlyCV, but met data type with " + "fcv::Mat."); #endif } return cpu_mat.ptr(); } void Mat::ShareWithTensor(FDTensor* tensor) { - tensor->SetExternalData({Channels(), Height(), Width()}, Type(), - Data()); + tensor->SetExternalData({Channels(), Height(), Width()}, Type(), Data()); tensor->device = Device::CPU; if (layout == Layout::HWC) { tensor->shape = {Height(), Width(), Channels()}; @@ -56,26 +58,28 @@ void Mat::PrintInfo(const std::string& flag) { #ifdef ENABLE_FLYCV fcv::Scalar mean = fcv::mean(fcv_mat); std::cout << flag << ": " - << "DataType=" << Type() << ", " - << "Channel=" << Channels() << ", " - << "Height=" << Height() << ", " - << "Width=" << Width() << ", " - << "Mean="; + << "DataType=" << Type() << ", " + << "Channel=" << Channels() << ", " + << "Height=" << Height() << ", " + << "Width=" << Width() << ", " + << "Mean="; for (int i = 0; i < Channels(); ++i) { std::cout << mean[i] << " "; } std::cout << std::endl; #else - FDASSERT(false, "FastDeploy didn't compile with FalconCV, but met data type with fcv::Mat."); + FDASSERT(false, + "FastDeploy didn't compile with FlyCV, but met data type with " + "fcv::Mat."); #endif } else { cv::Scalar mean = cv::mean(cpu_mat); std::cout << flag << ": " - << "DataType=" << Type() << ", " - << "Channel=" << Channels() << ", " - << "Height=" << Height() << ", " - << "Width=" << Width() << ", " - << "Mean="; + << "DataType=" << Type() << ", " + << "Channel=" << Channels() << ", " + << "Height=" << Height() << ", " + << "Width=" << Width() << ", " + << "Mean="; for (int i = 0; i < Channels(); ++i) { std::cout << mean[i] << " "; } @@ -84,87 +88,35 @@ void Mat::PrintInfo(const std::string& flag) { } FDDataType Mat::Type() { - int type = -1; + int type = -1; if (mat_type == ProcLib::FLYCV) { #ifdef ENABLE_FLYCV - return FalconCVDataTypeToFD(fcv_mat.type()); + return FlyCVDataTypeToFD(fcv_mat.type()); #else - FDASSERT(false, "FastDeploy didn't compile with FalconCV, but met data type with fcv::Mat."); + FDASSERT(false, + "FastDeploy didn't compile with FlyCV, but met data type with " + "fcv::Mat."); #endif } return OpenCVDataTypeToFD(cpu_mat.type()); } -Mat CreateFromTensor(const FDTensor& tensor) { - int type = tensor.dtype; - cv::Mat temp_mat; - FDASSERT(tensor.shape.size() == 3, - "When create FD Mat from tensor, tensor shape should be 3-Dim, HWC " - "layout"); - int64_t height = tensor.shape[0]; - int64_t width = tensor.shape[1]; - int64_t channel = tensor.shape[2]; - switch (type) { - case FDDataType::UINT8: - temp_mat = cv::Mat(height, width, CV_8UC(channel), - const_cast(tensor.Data())); - break; - - case FDDataType::INT8: - temp_mat = cv::Mat(height, width, CV_8SC(channel), - const_cast(tensor.Data())); - break; - - case FDDataType::INT16: - temp_mat = cv::Mat(height, width, CV_16SC(channel), - const_cast(tensor.Data())); - break; - - case FDDataType::INT32: - temp_mat = cv::Mat(height, width, CV_32SC(channel), - const_cast(tensor.Data())); - break; - - case FDDataType::FP32: - temp_mat = cv::Mat(height, width, CV_32FC(channel), - const_cast(tensor.Data())); - 
-      break;
-
-    case FDDataType::FP64:
-      temp_mat = cv::Mat(height, width, CV_64FC(channel),
-                         const_cast<void*>(tensor.Data()));
-      break;
-
-    default:
-      FDASSERT(
-          false,
-          "Tensor type %d is not supported While calling CreateFromTensor.",
-          type);
-      break;
-  }
-  Mat mat = Mat(temp_mat);
-  return mat;
-}
-
-std::ostream& operator<<(std::ostream& out,const ProcLib& p) {
+std::ostream& operator<<(std::ostream& out, const ProcLib& p) {
   switch (p) {
-    case ProcLib::DEFAULT:
-      out << "ProcLib::DEFAULT";
-      break;
-    case ProcLib::OPENCV:
-      out << "ProcLib::OPENCV";
-      break;
-    case ProcLib::FLYCV:
-      out << "ProcLib::FLYCV";
-      break;
-    default:
-      FDASSERT(false, "Unknow type of ProcLib.");
+    case ProcLib::DEFAULT:
+      out << "ProcLib::DEFAULT";
+      break;
+    case ProcLib::OPENCV:
+      out << "ProcLib::OPENCV";
+      break;
+    case ProcLib::FLYCV:
+      out << "ProcLib::FLYCV";
+      break;
+    default:
+      FDASSERT(false, "Unknown type of ProcLib.");
   }
   return out;
 }
 
-
-
 }  // namespace vision
 }  // namespace fastdeploy
diff --git a/fastdeploy/vision/common/processors/mat.h b/fastdeploy/vision/common/processors/mat.h
index 20e56211db..995d497ccf 100644
--- a/fastdeploy/vision/common/processors/mat.h
+++ b/fastdeploy/vision/common/processors/mat.h
@@ -13,19 +13,19 @@
 // limitations under the License.
 #pragma once
 #include "fastdeploy/core/fd_tensor.h"
-#include "opencv2/core/core.hpp"
 #include "fastdeploy/vision/common/processors/utils.h"
+#include "opencv2/core/core.hpp"
 
 namespace fastdeploy {
 namespace vision {
 
-enum class FASTDEPLOY_DECL ProcLib { DEFAULT, OPENCV, FLYCV};
+enum class FASTDEPLOY_DECL ProcLib { DEFAULT, OPENCV, FLYCV };
 
 enum Layout { HWC, CHW };
 
 FASTDEPLOY_DECL std::ostream& operator<<(std::ostream& out, const ProcLib& p);
 
 struct FASTDEPLOY_DECL Mat {
-  explicit Mat(cv::Mat& mat) {
+  explicit Mat(const cv::Mat& mat) {
     cpu_mat = mat;
     layout = Layout::HWC;
     height = cpu_mat.rows;
@@ -34,7 +34,18 @@ struct FASTDEPLOY_DECL Mat {
     mat_type = ProcLib::OPENCV;
   }
 
-  // careful if you use this interface
+#ifdef ENABLE_FLYCV
+  explicit Mat(const fcv::Mat& mat) {
+    fcv_mat = mat;
+    layout = Layout::HWC;
+    height = fcv_mat.height();
+    width = fcv_mat.width();
+    channels = fcv_mat.channels();
+    mat_type = ProcLib::FLYCV;
+  }
+#endif
+
+  // Careful if you use this interface
   // this only used if you don't want to write
   // the original data, and write to a new cv::Mat
   // then replace the old cv::Mat of this structure
@@ -43,15 +54,23 @@
     mat_type = ProcLib::OPENCV;
   }
 
-  inline cv::Mat* GetOpenCVMat() {
-    FDASSERT(mat_type == ProcLib::OPENCV, "Met non cv::Mat data structure.");
-    return &cpu_mat;
-  }
-
-
-  inline const cv::Mat* GetOpenCVMat() const {
-    FDASSERT(mat_type == ProcLib::OPENCV, "Met non cv::Mat data structure.");
-    return &cpu_mat;
+  cv::Mat* GetOpenCVMat() {
+    if (mat_type == ProcLib::OPENCV) {
+      return &cpu_mat;
+    } else if (mat_type == ProcLib::FLYCV) {
+#ifdef ENABLE_FLYCV
+      // Just a reference to fcv_mat, zero copy. After you
+      // call this method, cpu_mat and fcv_mat will point
+      // to the same memory buffer.
+      cpu_mat = ConvertFlyCVMatToOpenCV(fcv_mat);
+      mat_type = ProcLib::OPENCV;
+      return &cpu_mat;
+#else
+      FDASSERT(false, "FastDeploy wasn't compiled with FlyCV!");
+#endif
+    } else {
+      FDASSERT(false, "The mat_type of custom Mat can not be ProcLib::DEFAULT");
+    }
   }
 
 #ifdef ENABLE_FLYCV
@@ -60,9 +79,19 @@
     mat_type = ProcLib::FLYCV;
   }
 
-  inline fcv::Mat* GetFalconCVMat() {
-    FDASSERT(mat_type == ProcLib::FLYCV, "Met non fcv::Mat data strucure.");
-    return &fcv_mat;
+  fcv::Mat* GetFlyCVMat() {
+    if (mat_type == ProcLib::FLYCV) {
+      return &fcv_mat;
+    } else if (mat_type == ProcLib::OPENCV) {
+      // Just a reference to cpu_mat, zero copy. After you
+      // call this method, fcv_mat and cpu_mat will point
+      // to the same memory buffer.
+      fcv_mat = ConvertOpenCVMatToFlyCV(cpu_mat);
+      mat_type = ProcLib::FLYCV;
+      return &fcv_mat;
+    } else {
+      FDASSERT(false, "The mat_type of custom Mat can not be ProcLib::DEFAULT");
+    }
   }
 #endif
 
@@ -73,17 +102,11 @@
   int height;
   int width;
   cv::Mat cpu_mat;
-
 #ifdef ENABLE_FLYCV
   fcv::Mat fcv_mat;
 #endif
 
  public:
-  template <typename T>
-  T* GetMat() {
-    return &cpu_mat;
-  }
-
   FDDataType Type();
   int Channels() const { return channels; }
   int Width() const { return width; }
@@ -97,18 +120,16 @@
   // Only support copy to cpu tensor now
   bool CopyToTensor(FDTensor* tensor);
 
-  // debug functions
-  // TODO(jiangjiajun) Develop a right process pipeline with c++ is not a easy
-  // things
-  // Will add more debug function here to help debug processed image
-  // This function will print shape / mean of each channels of the Mat
+  // Debug functions
+  // TODO(jiangjiajun): developing a proper processing pipeline
+  // in C++ is not easy; more debug functions will be added here
+  // to help debug processed images. This function prints the
+  // shape and per-channel mean of the Mat.
   void PrintInfo(const std::string& flag);
 
   ProcLib mat_type = ProcLib::OPENCV;
   Layout layout = Layout::HWC;
 };
 
-Mat CreateFromTensor(const FDTensor& tensor);
-
 }  // namespace vision
 }  // namespace fastdeploy
diff --git a/fastdeploy/vision/common/processors/normalize.cc b/fastdeploy/vision/common/processors/normalize.cc
index 88d68a65ee..cf4888613c 100644
--- a/fastdeploy/vision/common/processors/normalize.cc
+++ b/fastdeploy/vision/common/processors/normalize.cc
@@ -65,27 +65,29 @@ bool Normalize::ImplByOpenCV(Mat* mat) {
 }
 
 #ifdef ENABLE_FLYCV
-bool Normalize::ImplByFalconCV(Mat* mat) {
-  fcv::Mat* im = mat->GetFalconCVMat();
+bool Normalize::ImplByFlyCV(Mat* mat) {
+  fcv::Mat* im = mat->GetFlyCVMat();
   if (im->channels() != 3) {
-    FDERROR << "Only supports 3-channels image in FalconCV, but now it's " << im->channels() << "." << std::endl;
+    FDERROR << "Only supports 3-channels image in FlyCV, but now it's "
+            << im->channels() << "." << std::endl;
<< std::endl; return false; } std::vector mean(3, 0); std::vector std(3, 0); for (size_t i = 0; i < 3; ++i) { - std[i] = 1.0 / alpha_[i]; + std[i] = 1.0 / alpha_[i]; mean[i] = -1 * beta_[i] * std[i]; } - fcv::Mat new_im(im->width(), im->height(), fcv::FCVImageType::PACKAGE_BGR_F32); - fcv::normalize_to_submean_to_reorder(*im, mean, std, std::vector(), new_im, true); + fcv::Mat new_im(im->width(), im->height(), + fcv::FCVImageType::PACKAGE_BGR_F32); + fcv::normalize_to_submean_to_reorder(*im, mean, std, std::vector(), + new_im, true); mat->SetMat(new_im); return true; } #endif - bool Normalize::Run(Mat* mat, const std::vector& mean, const std::vector& std, bool is_scale, const std::vector& min, @@ -94,5 +96,5 @@ bool Normalize::Run(Mat* mat, const std::vector& mean, return n(mat, lib); } -} // namespace vision -} // namespace fastdeploy +} // namespace vision +} // namespace fastdeploy diff --git a/fastdeploy/vision/common/processors/normalize.h b/fastdeploy/vision/common/processors/normalize.h index 964c19fb2a..515fcd7e6c 100644 --- a/fastdeploy/vision/common/processors/normalize.h +++ b/fastdeploy/vision/common/processors/normalize.h @@ -26,7 +26,7 @@ class FASTDEPLOY_DECL Normalize : public Processor { const std::vector& max = std::vector()); bool ImplByOpenCV(Mat* mat); #ifdef ENABLE_FLYCV - bool ImplByFalconCV(Mat* mat); + bool ImplByFlyCV(Mat* mat); #endif std::string Name() { return "Normalize"; } @@ -44,14 +44,10 @@ class FASTDEPLOY_DECL Normalize : public Processor { const std::vector& std, bool is_scale = true, const std::vector& min = std::vector(), const std::vector& max = std::vector(), - ProcLib lib = ProcLib::OPENCV); + ProcLib lib = ProcLib::DEFAULT); - std::vector GetAlpha() const { - return alpha_; - } - std::vector GetBeta() const { - return beta_; - } + std::vector GetAlpha() const { return alpha_; } + std::vector GetBeta() const { return beta_; } private: std::vector alpha_; diff --git a/fastdeploy/vision/common/processors/normalize_and_permute.cc b/fastdeploy/vision/common/processors/normalize_and_permute.cc index 64daf07837..cb78cc7205 100644 --- a/fastdeploy/vision/common/processors/normalize_and_permute.cc +++ b/fastdeploy/vision/common/processors/normalize_and_permute.cc @@ -17,11 +17,11 @@ namespace fastdeploy { namespace vision { - NormalizeAndPermute::NormalizeAndPermute(const std::vector& mean, - const std::vector& std, bool is_scale, - const std::vector& min, - const std::vector& max) { + const std::vector& std, + bool is_scale, + const std::vector& min, + const std::vector& max) { FDASSERT(mean.size() == std.size(), "Normalize: requires the size of mean equal to the size of std."); std::vector mean_(mean.begin(), mean.end()); @@ -65,7 +65,10 @@ bool NormalizeAndPermute::ImplByOpenCV(Mat* mat) { } cv::Mat res(origin_h, origin_w, CV_32FC(im->channels())); for (int i = 0; i < im->channels(); ++i) { - cv::extractChannel(split_im[i], cv::Mat(origin_h, origin_w, CV_32FC1, res.ptr() + i * origin_h * origin_w * 4), 0); + cv::extractChannel(split_im[i], + cv::Mat(origin_h, origin_w, CV_32FC1, + res.ptr() + i * origin_h * origin_w * 4), + 0); } mat->SetMat(res); @@ -74,38 +77,39 @@ bool NormalizeAndPermute::ImplByOpenCV(Mat* mat) { } #ifdef ENABLE_FLYCV -bool NormalizeAndPermute::ImplByFalconCV(Mat* mat) { +bool NormalizeAndPermute::ImplByFlyCV(Mat* mat) { if (mat->layout != Layout::HWC) { FDERROR << "Only supports input with HWC layout." 
<< std::endl; return false; } - fcv::Mat* im = mat->GetFalconCVMat(); + fcv::Mat* im = mat->GetFlyCVMat(); if (im->channels() != 3) { - FDERROR << "Only supports 3-channels image in FalconCV, but now it's " << im->channels() << "." << std::endl; + FDERROR << "Only supports 3-channels image in FlyCV, but now it's " + << im->channels() << "." << std::endl; return false; } std::vector mean(3, 0); std::vector std(3, 0); for (size_t i = 0; i < 3; ++i) { - std[i] = 1.0 / alpha_[i]; + std[i] = 1.0 / alpha_[i]; mean[i] = -1 * beta_[i] * std[i]; } fcv::Mat new_im; - fcv::normalize_to_submean_to_reorder(*im, mean, std, std::vector(), new_im, false); + fcv::normalize_to_submean_to_reorder(*im, mean, std, std::vector(), + new_im, false); mat->SetMat(new_im); mat->layout = Layout::CHW; return true; } #endif - bool NormalizeAndPermute::Run(Mat* mat, const std::vector& mean, - const std::vector& std, bool is_scale, - const std::vector& min, - const std::vector& max, ProcLib lib) { + const std::vector& std, bool is_scale, + const std::vector& min, + const std::vector& max, ProcLib lib) { auto n = NormalizeAndPermute(mean, std, is_scale, min, max); return n(mat, lib); } -} // namespace vision -} // namespace fastdeploy +} // namespace vision +} // namespace fastdeploy diff --git a/fastdeploy/vision/common/processors/normalize_and_permute.h b/fastdeploy/vision/common/processors/normalize_and_permute.h index 4a154f71f8..ec47665262 100644 --- a/fastdeploy/vision/common/processors/normalize_and_permute.h +++ b/fastdeploy/vision/common/processors/normalize_and_permute.h @@ -21,13 +21,12 @@ namespace vision { class FASTDEPLOY_DECL NormalizeAndPermute : public Processor { public: NormalizeAndPermute(const std::vector& mean, - const std::vector& std, - bool is_scale = true, - const std::vector& min = std::vector(), - const std::vector& max = std::vector()); + const std::vector& std, bool is_scale = true, + const std::vector& min = std::vector(), + const std::vector& max = std::vector()); bool ImplByOpenCV(Mat* mat); #ifdef ENABLE_FLYCV - bool ImplByFalconCV(Mat* mat); + bool ImplByFlyCV(Mat* mat); #endif std::string Name() { return "NormalizeAndPermute"; } @@ -45,7 +44,7 @@ class FASTDEPLOY_DECL NormalizeAndPermute : public Processor { const std::vector& std, bool is_scale = true, const std::vector& min = std::vector(), const std::vector& max = std::vector(), - ProcLib lib = ProcLib::OPENCV); + ProcLib lib = ProcLib::DEFAULT); void SetAlpha(const std::vector& alpha) { alpha_.clear(); diff --git a/fastdeploy/vision/common/processors/pad.cc b/fastdeploy/vision/common/processors/pad.cc index 7352356d02..5068b16fc1 100644 --- a/fastdeploy/vision/common/processors/pad.cc +++ b/fastdeploy/vision/common/processors/pad.cc @@ -53,7 +53,7 @@ bool Pad::ImplByOpenCV(Mat* mat) { } #ifdef ENABLE_FLYCV -bool Pad::ImplByFalconCV(Mat* mat) { +bool Pad::ImplByFlyCV(Mat* mat) { if (mat->layout != Layout::HWC) { FDERROR << "Pad: The input data must be Layout::HWC format!" 
<< std::endl; return false; @@ -70,7 +70,7 @@ bool Pad::ImplByFalconCV(Mat* mat) { << std::endl; return false; } - fcv::Mat* im = mat->GetFalconCVMat(); + fcv::Mat* im = mat->GetFlyCVMat(); fcv::Scalar value; if (value_.size() == 1) { value = fcv::Scalar(value_[0]); @@ -83,7 +83,7 @@ bool Pad::ImplByFalconCV(Mat* mat) { } fcv::Mat new_im; fcv::copy_make_border(*im, new_im, top_, bottom_, left_, right_, - fcv::BorderTypes::BORDER_CONSTANT, value); + fcv::BorderTypes::BORDER_CONSTANT, value); mat->SetMat(new_im); mat->SetHeight(new_im.height()); mat->SetWidth(new_im.width()); @@ -92,11 +92,10 @@ bool Pad::ImplByFalconCV(Mat* mat) { #endif bool Pad::Run(Mat* mat, const int& top, const int& bottom, const int& left, - const int& right, const std::vector& value, - ProcLib lib) { + const int& right, const std::vector& value, ProcLib lib) { auto p = Pad(top, bottom, left, right, value); return p(mat, lib); } -} // namespace vision -} // namespace fastdeploy +} // namespace vision +} // namespace fastdeploy diff --git a/fastdeploy/vision/common/processors/pad.h b/fastdeploy/vision/common/processors/pad.h index 5405564ec4..661632e77c 100644 --- a/fastdeploy/vision/common/processors/pad.h +++ b/fastdeploy/vision/common/processors/pad.h @@ -31,13 +31,13 @@ class FASTDEPLOY_DECL Pad : public Processor { } bool ImplByOpenCV(Mat* mat); #ifdef ENABLE_FLYCV - bool ImplByFalconCV(Mat* mat); + bool ImplByFlyCV(Mat* mat); #endif std::string Name() { return "Pad"; } static bool Run(Mat* mat, const int& top, const int& bottom, const int& left, const int& right, const std::vector& value, - ProcLib lib = ProcLib::OPENCV); + ProcLib lib = ProcLib::DEFAULT); private: int top_; diff --git a/fastdeploy/vision/common/processors/pad_to_size.cc b/fastdeploy/vision/common/processors/pad_to_size.cc index 5956953e04..3480345091 100644 --- a/fastdeploy/vision/common/processors/pad_to_size.cc +++ b/fastdeploy/vision/common/processors/pad_to_size.cc @@ -76,7 +76,7 @@ bool PadToSize::ImplByOpenCV(Mat* mat) { } #ifdef ENABLE_FLYCV -bool PadToSize::ImplByFalconCV(Mat* mat) { +bool PadToSize::ImplByFlyCV(Mat* mat) { if (width_ == -1 || height_ == -1) { return true; } @@ -115,7 +115,7 @@ bool PadToSize::ImplByFalconCV(Mat* mat) { return true; } - fcv::Mat* im = mat->GetFalconCVMat(); + fcv::Mat* im = mat->GetFlyCVMat(); fcv::Scalar value; if (value_.size() == 1) { value = fcv::Scalar(value_[0]); @@ -128,8 +128,9 @@ bool PadToSize::ImplByFalconCV(Mat* mat) { } fcv::Mat new_im; // top, bottom, left, right - fcv::copy_make_border(*im, new_im, 0, height_ - origin_h, 0, width_ - origin_w, - fcv::BorderTypes::BORDER_CONSTANT, value); + fcv::copy_make_border(*im, new_im, 0, height_ - origin_h, 0, + width_ - origin_w, fcv::BorderTypes::BORDER_CONSTANT, + value); mat->SetMat(new_im); mat->SetHeight(height_); mat->SetWidth(width_); diff --git a/fastdeploy/vision/common/processors/pad_to_size.h b/fastdeploy/vision/common/processors/pad_to_size.h index dff8110891..c73cee3c24 100644 --- a/fastdeploy/vision/common/processors/pad_to_size.h +++ b/fastdeploy/vision/common/processors/pad_to_size.h @@ -29,13 +29,13 @@ class FASTDEPLOY_DECL PadToSize : public Processor { } bool ImplByOpenCV(Mat* mat); #ifdef ENABLE_FLYCV - bool ImplByFalconCV(Mat* mat); + bool ImplByFlyCV(Mat* mat); #endif std::string Name() { return "PadToSize"; } static bool Run(Mat* mat, int width, int height, const std::vector& value, - ProcLib lib = ProcLib::OPENCV); + ProcLib lib = ProcLib::DEFAULT); private: int width_; diff --git 
a/fastdeploy/vision/common/processors/resize.cc b/fastdeploy/vision/common/processors/resize.cc index eb1932abc0..28488c2cd4 100644 --- a/fastdeploy/vision/common/processors/resize.cc +++ b/fastdeploy/vision/common/processors/resize.cc @@ -55,12 +55,12 @@ bool Resize::ImplByOpenCV(Mat* mat) { } #ifdef ENABLE_FLYCV -bool Resize::ImplByFalconCV(Mat* mat) { +bool Resize::ImplByFlyCV(Mat* mat) { if (mat->layout != Layout::HWC) { FDERROR << "Resize: The format of input is not HWC." << std::endl; return false; } - fcv::Mat* im = mat->GetFalconCVMat(); + fcv::Mat* im = mat->GetFlyCVMat(); int origin_w = im->width(); int origin_h = im->height(); @@ -79,7 +79,7 @@ bool Resize::ImplByFalconCV(Mat* mat) { } else if (interp_ == 2) { interp_method = fcv::InterpolationType::INTER_CUBIC; } else { - FDERROR << "LimitLong: Only support interp_ be 0/1/2 with FalconCV, but " + FDERROR << "LimitLong: Only support interp_ be 0/1/2 with FlyCV, but " "now it's " << interp_ << "." << std::endl; return false; @@ -99,7 +99,8 @@ bool Resize::ImplByFalconCV(Mat* mat) { mat->SetHeight(new_im.height()); } else if (scale_w_ > 0 && scale_h_ > 0) { fcv::Mat new_im; - fcv::resize(*im, new_im, fcv::Size(0, 0), scale_w_, scale_h_, interp_method); + fcv::resize(*im, new_im, fcv::Size(0, 0), scale_w_, scale_h_, + interp_method); mat->SetMat(new_im); mat->SetWidth(new_im.width()); mat->SetHeight(new_im.height()); @@ -122,5 +123,5 @@ bool Resize::Run(Mat* mat, int width, int height, float scale_w, float scale_h, return r(mat, lib); } -} // namespace vision -} // namespace fastdeploy +} // namespace vision +} // namespace fastdeploy diff --git a/fastdeploy/vision/common/processors/resize.h b/fastdeploy/vision/common/processors/resize.h index b20f0e468c..e6a4ba1b05 100644 --- a/fastdeploy/vision/common/processors/resize.h +++ b/fastdeploy/vision/common/processors/resize.h @@ -33,13 +33,13 @@ class FASTDEPLOY_DECL Resize : public Processor { bool ImplByOpenCV(Mat* mat); #ifdef ENABLE_FLYCV - bool ImplByFalconCV(Mat* mat); + bool ImplByFlyCV(Mat* mat); #endif std::string Name() { return "Resize"; } static bool Run(Mat* mat, int width, int height, float scale_w = -1.0, float scale_h = -1.0, int interp = 1, bool use_scale = false, - ProcLib lib = ProcLib::OPENCV); + ProcLib lib = ProcLib::DEFAULT); bool SetWidthAndHeight(int width, int height) { width_ = width; diff --git a/fastdeploy/vision/common/processors/resize_by_long.h b/fastdeploy/vision/common/processors/resize_by_long.h index 033bc9d5ba..c288e07a43 100644 --- a/fastdeploy/vision/common/processors/resize_by_long.h +++ b/fastdeploy/vision/common/processors/resize_by_long.h @@ -33,7 +33,7 @@ class ResizeByLong : public Processor { static bool Run(Mat* mat, int target_size, int interp = 1, bool use_scale = true, int max_size = -1, - ProcLib lib = ProcLib::OPENCV); + ProcLib lib = ProcLib::DEFAULT); private: double GenerateScale(const int origin_w, const int origin_h); diff --git a/fastdeploy/vision/common/processors/resize_by_short.cc b/fastdeploy/vision/common/processors/resize_by_short.cc index 2fe60b9ee8..5616961f1c 100644 --- a/fastdeploy/vision/common/processors/resize_by_short.cc +++ b/fastdeploy/vision/common/processors/resize_by_short.cc @@ -37,8 +37,8 @@ bool ResizeByShort::ImplByOpenCV(Mat* mat) { } #ifdef ENABLE_FLYCV -bool ResizeByShort::ImplByFalconCV(Mat* mat) { - fcv::Mat* im = mat->GetFalconCVMat(); +bool ResizeByShort::ImplByFlyCV(Mat* mat) { + fcv::Mat* im = mat->GetFlyCVMat(); int origin_w = im->width(); int origin_h = im->height(); double scale = 
GenerateScale(origin_w, origin_h); @@ -51,7 +51,7 @@ bool ResizeByShort::ImplByFalconCV(Mat* mat) { } else if (interp_ == 2) { interp_method = fcv::InterpolationType::INTER_CUBIC; } else { - FDERROR << "LimitLong: Only support interp_ be 0/1/2 with FalconCV, but " + FDERROR << "LimitLong: Only support interp_ be 0/1/2 with FlyCV, but " "now it's " << interp_ << "." << std::endl; return false; diff --git a/fastdeploy/vision/common/processors/resize_by_short.h b/fastdeploy/vision/common/processors/resize_by_short.h index 2ea901ee39..151605bebd 100644 --- a/fastdeploy/vision/common/processors/resize_by_short.h +++ b/fastdeploy/vision/common/processors/resize_by_short.h @@ -30,14 +30,14 @@ class FASTDEPLOY_DECL ResizeByShort : public Processor { } bool ImplByOpenCV(Mat* mat); #ifdef ENABLE_FLYCV - bool ImplByFalconCV(Mat* mat); + bool ImplByFlyCV(Mat* mat); #endif std::string Name() { return "ResizeByShort"; } static bool Run(Mat* mat, int target_size, int interp = 1, bool use_scale = true, const std::vector& max_hw = std::vector(), - ProcLib lib = ProcLib::OPENCV); + ProcLib lib = ProcLib::DEFAULT); private: double GenerateScale(const int origin_w, const int origin_h); diff --git a/fastdeploy/vision/common/processors/stride_pad.cc b/fastdeploy/vision/common/processors/stride_pad.cc index 4b71e34fbe..062b4f22c1 100644 --- a/fastdeploy/vision/common/processors/stride_pad.cc +++ b/fastdeploy/vision/common/processors/stride_pad.cc @@ -64,7 +64,7 @@ bool StridePad::ImplByOpenCV(Mat* mat) { } #ifdef ENABLE_FLYCV -bool StridePad::ImplByFalconCV(Mat* mat) { +bool StridePad::ImplByFlyCV(Mat* mat) { if (mat->layout != Layout::HWC) { FDERROR << "StridePad: The input data must be Layout::HWC format!" << std::endl; @@ -92,7 +92,7 @@ bool StridePad::ImplByFalconCV(Mat* mat) { if (pad_h == 0 && pad_w == 0) { return true; } - fcv::Mat* im = mat->GetFalconCVMat(); + fcv::Mat* im = mat->GetFlyCVMat(); fcv::Scalar value; if (value_.size() == 1) { value = fcv::Scalar(value_[0]); @@ -105,7 +105,8 @@ bool StridePad::ImplByFalconCV(Mat* mat) { } fcv::Mat new_im; // top, bottom, left, right - fcv::copy_make_border(*im, new_im, 0, pad_h, 0, pad_w, fcv::BorderTypes::BORDER_CONSTANT, value); + fcv::copy_make_border(*im, new_im, 0, pad_h, 0, pad_w, + fcv::BorderTypes::BORDER_CONSTANT, value); mat->SetMat(new_im); mat->SetHeight(new_im.height()); mat->SetWidth(new_im.width()); diff --git a/fastdeploy/vision/common/processors/stride_pad.h b/fastdeploy/vision/common/processors/stride_pad.h index 67fe692d48..18eebd54e1 100644 --- a/fastdeploy/vision/common/processors/stride_pad.h +++ b/fastdeploy/vision/common/processors/stride_pad.h @@ -28,13 +28,13 @@ class FASTDEPLOY_DECL StridePad : public Processor { } bool ImplByOpenCV(Mat* mat); #ifdef ENABLE_FLYCV - bool ImplByFalconCV(Mat* mat); + bool ImplByFlyCV(Mat* mat); #endif std::string Name() { return "StridePad"; } static bool Run(Mat* mat, int stride, const std::vector& value = std::vector(), - ProcLib lib = ProcLib::OPENCV); + ProcLib lib = ProcLib::DEFAULT); private: int stride_ = 32; diff --git a/fastdeploy/vision/common/processors/transform.h b/fastdeploy/vision/common/processors/transform.h index 8d1b5010d0..7952caca34 100644 --- a/fastdeploy/vision/common/processors/transform.h +++ b/fastdeploy/vision/common/processors/transform.h @@ -20,59 +20,64 @@ #include "fastdeploy/vision/common/processors/convert.h" #include "fastdeploy/vision/common/processors/crop.h" #include "fastdeploy/vision/common/processors/hwc2chw.h" +#include 
"fastdeploy/vision/common/processors/letter_box.h" #include "fastdeploy/vision/common/processors/limit_by_stride.h" #include "fastdeploy/vision/common/processors/limit_long.h" #include "fastdeploy/vision/common/processors/limit_short.h" #include "fastdeploy/vision/common/processors/normalize.h" +#include "fastdeploy/vision/common/processors/normalize_and_permute.h" #include "fastdeploy/vision/common/processors/pad.h" #include "fastdeploy/vision/common/processors/pad_to_size.h" #include "fastdeploy/vision/common/processors/resize.h" #include "fastdeploy/vision/common/processors/resize_by_long.h" #include "fastdeploy/vision/common/processors/resize_by_short.h" #include "fastdeploy/vision/common/processors/stride_pad.h" -#include "fastdeploy/vision/common/processors/normalize_and_permute.h" #include "fastdeploy/vision/common/processors/warp_affine.h" -#include "fastdeploy/vision/common/processors/letter_box.h" namespace fastdeploy { namespace vision { -inline void FuseNormalizeCast(std::vector>* processors) { +inline void FuseNormalizeCast( + std::vector>* processors) { // Fuse Normalize and Cast int cast_index = -1; for (size_t i = 0; i < processors->size(); ++i) { if ((*processors)[i]->Name() == "Cast") { - if (i == 0) { - continue; - } - if ((*processors)[i - 1]->Name() != "Normalize" && (*processors)[i - 1]->Name() != "NormalizeAndPermute") { - continue; - } - cast_index = i; + if (i == 0) { + continue; + } + if ((*processors)[i - 1]->Name() != "Normalize" && + (*processors)[i - 1]->Name() != "NormalizeAndPermute") { + continue; + } + cast_index = i; } } if (cast_index < 0) { return; } - std::cout << dynamic_cast((*processors)[cast_index].get())->GetDtype() << "-----" << std::endl; - if (dynamic_cast((*processors)[cast_index].get())->GetDtype() != "float") { + if (dynamic_cast((*processors)[cast_index].get())->GetDtype() != + "float") { return; } processors->erase(processors->begin() + cast_index); - FDINFO << (*processors)[cast_index - 1]->Name() << " and Cast are fused to " << (*processors)[cast_index - 1]->Name() << " in preprocessing pipeline." << std::endl; + FDINFO << (*processors)[cast_index - 1]->Name() << " and Cast are fused to " + << (*processors)[cast_index - 1]->Name() + << " in preprocessing pipeline." 
<< std::endl; } -inline void FuseNormalizeHWC2CHW(std::vector>* processors) { +inline void FuseNormalizeHWC2CHW( + std::vector>* processors) { // Fuse Normalize and HWC2CHW to NormalizeAndPermute int hwc2chw_index = -1; for (size_t i = 0; i < processors->size(); ++i) { if ((*processors)[i]->Name() == "HWC2CHW") { if (i == 0) { - continue; + continue; } if ((*processors)[i - 1]->Name() != "Normalize") { - continue; + continue; } hwc2chw_index = i; } @@ -83,10 +88,12 @@ inline void FuseNormalizeHWC2CHW(std::vector>* proces } // Get alpha and beta of Normalize - std::vector alpha = dynamic_cast( - (*processors)[hwc2chw_index - 1].get())->GetAlpha(); - std::vector beta = dynamic_cast( - (*processors)[hwc2chw_index - 1].get())->GetBeta(); + std::vector alpha = + dynamic_cast((*processors)[hwc2chw_index - 1].get()) + ->GetAlpha(); + std::vector beta = + dynamic_cast((*processors)[hwc2chw_index - 1].get()) + ->GetBeta(); // Delete Normalize and HWC2CHW processors->erase(processors->begin() + hwc2chw_index); @@ -99,12 +106,13 @@ inline void FuseNormalizeHWC2CHW(std::vector>* proces // Set alpha and beta auto processor = dynamic_cast( - (*processors)[hwc2chw_index - 1].get()); + (*processors)[hwc2chw_index - 1].get()); processor->SetAlpha(alpha); processor->SetBeta(beta); FDINFO << "Normalize and HWC2CHW are fused to NormalizeAndPermute " - " in preprocessing pipeline." << std::endl; + " in preprocessing pipeline." + << std::endl; } inline void FuseTransforms( diff --git a/fastdeploy/vision/common/processors/utils.cc b/fastdeploy/vision/common/processors/utils.cc index 4e5373ef20..ec3eba4727 100644 --- a/fastdeploy/vision/common/processors/utils.cc +++ b/fastdeploy/vision/common/processors/utils.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "fastdeploy/utils/utils.h" + #include "fastdeploy/vision/common/processors/utils.h" namespace fastdeploy { @@ -37,12 +38,15 @@ FDDataType OpenCVDataTypeToFD(int type) { } else if (type == 6) { return FDDataType::FP64; } else { - FDASSERT(false, "While calling OpenCVDataTypeToFD(), get type = %d, which is not expected.", type); + FDASSERT(false, + "While calling OpenCVDataTypeToFD(), get type = %d, which is not " + "expected.", + type); } } #ifdef ENABLE_FLYCV -FDDataType FalconCVDataTypeToFD(fcv::FCVImageType type) { +FDDataType FlyCVDataTypeToFD(fcv::FCVImageType type) { if (type == fcv::FCVImageType::GRAY_U8) { return FDDataType::UINT8; } else if (type == fcv::FCVImageType::PACKAGE_BGR_U8) { @@ -104,13 +108,14 @@ FDDataType FalconCVDataTypeToFD(fcv::FCVImageType type) { } else if (type == fcv::FCVImageType::GRAY_F64) { return FDDataType::FP64; } - FDASSERT(false, "While calling FalconDataTypeToFD(), get unexpected type:%d.", int(type)); + FDASSERT(false, "While calling FlyCVDataTypeToFD(), get unexpected type:%d.", + int(type)); return FDDataType::UNKNOWN1; } -fcv::FCVImageType CreateFalconCVDataType(FDDataType type, int channel) { +fcv::FCVImageType CreateFlyCVDataType(FDDataType type, int channel) { FDASSERT(channel == 1 || channel == 3 || channel == 4, - "Only support channel be 1/3/4 in Falcon."); + "Only support channel be 1/3/4 in FlyCV."); if (type == FDDataType::UINT8) { if (channel == 1) { return fcv::FCVImageType::GRAY_U8; @@ -132,18 +137,54 @@ fcv::FCVImageType CreateFalconCVDataType(FDDataType type, int channel) { return fcv::FCVImageType::PACKAGE_BGR_F32; } -fcv::Mat ConvertOpenCVMatToFalconCV(cv::Mat& im) { +int CreateOpenCVDataType(FDDataType type, int channel) { + FDASSERT(channel == 1 || channel == 3 || channel == 4, + "Only 
support channel be 1/3/4 in OpenCV."); + if (type == FDDataType::UINT8) { + if (channel == 1) { + return CV_8UC1; + } else if (channel == 3) { + return CV_8UC3; + } else { + return CV_8UC4; + } + } else if (type == FDDataType::FP32) { + if (channel == 1) { + return CV_32FC1; + } else if (channel == 3) { + return CV_32FC3; + } else { + return CV_32FC4; + } + } + FDASSERT(false, "Data type of %s is not supported.", Str(type).c_str()); + return CV_32FC3; +} + +fcv::Mat ConvertOpenCVMatToFlyCV(cv::Mat& im) { int type = im.type() % 8; // 0: uint8; 5: float32; 6: float64 if (type != 0 && type != 5 && type != 6) { - FDASSERT(false, "Only support type of uint8/float/double, but now it's %d.", im.type()); + FDASSERT(false, "Only support type of uint8/float/double, but now it's %d.", + im.type()); } - auto fcv_type = CreateFalconCVDataType(OpenCVDataTypeToFD(im.type()), im.channels()); - return fcv::Mat(im.cols, im.rows, fcv_type, im.ptr()); + auto fcv_type = + CreateFlyCVDataType(OpenCVDataTypeToFD(im.type()), im.channels()); + return fcv::Mat(im.cols, im.rows, fcv_type, im.ptr()); // reference only +} + +cv::Mat ConvertFlyCVMatToOpenCV(fcv::Mat& fim) { + auto fd_dtype = FlyCVDataTypeToFD(fim.type()); + if (fd_dtype != FDDataType::UINT8 && fd_dtype != FDDataType::FP32 && + fd_dtype != FDDataType::FP64) { + FDASSERT(false, "Only support type of uint8/float/double, but now it's %s.", + Str(fd_dtype).c_str()); + } + auto ocv_type = CreateOpenCVDataType(fd_dtype, fim.channels()); + return cv::Mat(fim.height(), fim.width(), ocv_type, + fim.data()); // reference only } #endif - - -} // namespace vision -} // namespace fastdeploy +} // namespace vision +} // namespace fastdeploy diff --git a/fastdeploy/vision/common/processors/utils.h b/fastdeploy/vision/common/processors/utils.h index 57c1309e7f..3b3cfc40dc 100644 --- a/fastdeploy/vision/common/processors/utils.h +++ b/fastdeploy/vision/common/processors/utils.h @@ -14,12 +14,12 @@ #pragma once +#include "fastdeploy/core/fd_tensor.h" #include "fastdeploy/utils/utils.h" #include "opencv2/core/core.hpp" -#include "fastdeploy/core/fd_tensor.h" #ifdef ENABLE_FLYCV -#include "flycv.h" // NOLINT +#include "flycv.h" // NOLINT #endif namespace fastdeploy { @@ -27,14 +27,18 @@ namespace vision { // Convert data type of opencv to FDDataType FDDataType OpenCVDataTypeToFD(int type); +// Create data type of opencv by FDDataType +int CreateOpenCVDataType(FDDataType type, int channel = 1); #ifdef ENABLE_FLYCV // Convert data type of flycv to FDDataType -FDDataType FalconCVDataTypeToFD(fcv::FCVImageType type); +FDDataType FlyCVDataTypeToFD(fcv::FCVImageType type); // Create data type of flycv by FDDataType -fcv::FCVImageType CreateFalconCVDataType(FDDataType type, int channel = 1); +fcv::FCVImageType CreateFlyCVDataType(FDDataType type, int channel = 1); // Convert cv::Mat to fcv::Mat -fcv::Mat ConvertOpenCVMatToFalconCV(cv::Mat& im); +fcv::Mat ConvertOpenCVMatToFlyCV(cv::Mat& im); +// Convert fcv::Mat to fcv::mat +cv::Mat ConvertFlyCVMatToOpenCV(fcv::Mat& fim); #endif } // namespace vision diff --git a/fastdeploy/vision/common/processors/warp_affine.h b/fastdeploy/vision/common/processors/warp_affine.h index 5548d495fc..4349ee206b 100644 --- a/fastdeploy/vision/common/processors/warp_affine.h +++ b/fastdeploy/vision/common/processors/warp_affine.h @@ -21,9 +21,7 @@ namespace vision { class WarpAffine : public Processor { public: - WarpAffine(const cv::Mat& trans_matrix, - int width, int height, - int interp = 1, + WarpAffine(const cv::Mat& trans_matrix, int width, int 
height, int interp = 1, int border_mode = 0, const cv::Scalar& borderValue = cv::Scalar()) { trans_matrix_ = trans_matrix; @@ -37,7 +35,7 @@ class WarpAffine : public Processor { bool ImplByOpenCV(Mat* mat); std::string Name() { return "WarpAffine"; } - bool SetTransformMatrix(const cv::Mat &trans_matrix) { + bool SetTransformMatrix(const cv::Mat& trans_matrix) { trans_matrix_ = trans_matrix; return true; } @@ -46,13 +44,10 @@ class WarpAffine : public Processor { return std::make_tuple(width_, height_); } - static bool Run(Mat* mat, - const cv::Mat& trans_matrix, - int width, int height, - int interp = 1, - int border_mode = 0, + static bool Run(Mat* mat, const cv::Mat& trans_matrix, int width, int height, + int interp = 1, int border_mode = 0, const cv::Scalar& borderValue = cv::Scalar(), - ProcLib lib = ProcLib::OPENCV); + ProcLib lib = ProcLib::DEFAULT); private: cv::Mat trans_matrix_; diff --git a/fastdeploy/vision/detection/contrib/yolov5.cc b/fastdeploy/vision/detection/contrib/yolov5.cc index 6ab4646e08..27f74fd551 100644 --- a/fastdeploy/vision/detection/contrib/yolov5.cc +++ b/fastdeploy/vision/detection/contrib/yolov5.cc @@ -66,7 +66,7 @@ YOLOv5::YOLOv5(const std::string& model_file, const std::string& params_file, valid_cpu_backends = {Backend::OPENVINO, Backend::ORT}; valid_gpu_backends = {Backend::ORT, Backend::TRT}; } else { - valid_cpu_backends = {Backend::PDINFER, Backend::ORT}; + valid_cpu_backends = {Backend::PDINFER, Backend::ORT, Backend::LITE}; valid_gpu_backends = {Backend::PDINFER, Backend::ORT, Backend::TRT}; } runtime_option = custom_option; @@ -184,11 +184,14 @@ void YOLOv5::UseCudaPreprocessing(int max_image_size) { use_cuda_preprocessing_ = true; is_scale_up_ = true; if (input_img_cuda_buffer_host_ == nullptr) { - // prepare input data cache in GPU pinned memory - CUDA_CHECK(cudaMallocHost((void**)&input_img_cuda_buffer_host_, max_image_size * 3)); + // prepare input data cache in GPU pinned memory + CUDA_CHECK(cudaMallocHost((void**)&input_img_cuda_buffer_host_, + max_image_size * 3)); // prepare input data cache in GPU device memory - CUDA_CHECK(cudaMalloc((void**)&input_img_cuda_buffer_device_, max_image_size * 3)); - CUDA_CHECK(cudaMalloc((void**)&input_tensor_cuda_buffer_device_, 3 * size_[0] * size_[1] * sizeof(float))); + CUDA_CHECK( + cudaMalloc((void**)&input_img_cuda_buffer_device_, max_image_size * 3)); + CUDA_CHECK(cudaMalloc((void**)&input_tensor_cuda_buffer_device_, + 3 * size_[0] * size_[1] * sizeof(float))); } #else FDWARNING << "The FastDeploy didn't compile with BUILD_CUDA_SRC=ON." @@ -197,15 +200,17 @@ void YOLOv5::UseCudaPreprocessing(int max_image_size) { #endif } -bool YOLOv5::CudaPreprocess(Mat* mat, FDTensor* output, - std::map>* im_info, - const std::vector& size, - const std::vector padding_value, - bool is_mini_pad, bool is_no_pad, bool is_scale_up, - int stride, float max_wh, bool multi_label) { +bool YOLOv5::CudaPreprocess( + Mat* mat, FDTensor* output, + std::map>* im_info, + const std::vector& size, const std::vector padding_value, + bool is_mini_pad, bool is_no_pad, bool is_scale_up, int stride, + float max_wh, bool multi_label) { #ifdef ENABLE_CUDA_PREPROCESS if (is_mini_pad != false || is_no_pad != false || is_scale_up != true) { - FDERROR << "Preprocessing with CUDA is only available when the arguments satisfy (is_mini_pad=false, is_no_pad=false, is_scale_up=true)." << std::endl; + FDERROR << "Preprocessing with CUDA is only available when the arguments " + "satisfy (is_mini_pad=false, is_no_pad=false, is_scale_up=true)." 
+ << std::endl; return false; } @@ -219,14 +224,15 @@ bool YOLOv5::CudaPreprocess(Mat* mat, FDTensor* output, int src_img_buf_size = mat->Height() * mat->Width() * mat->Channels(); memcpy(input_img_cuda_buffer_host_, mat->Data(), src_img_buf_size); CUDA_CHECK(cudaMemcpyAsync(input_img_cuda_buffer_device_, - input_img_cuda_buffer_host_, - src_img_buf_size, cudaMemcpyHostToDevice, stream)); + input_img_cuda_buffer_host_, src_img_buf_size, + cudaMemcpyHostToDevice, stream)); utils::CudaYoloPreprocess(input_img_cuda_buffer_device_, mat->Width(), mat->Height(), input_tensor_cuda_buffer_device_, size[0], size[1], padding_value, stream); // Record output shape of preprocessed image - (*im_info)["output_shape"] = {static_cast(size[0]), static_cast(size[1])}; + (*im_info)["output_shape"] = {static_cast(size[0]), + static_cast(size[1])}; output->SetExternalData({mat->Channels(), size[0], size[1]}, FDDataType::FP32, input_tensor_cuda_buffer_device_); @@ -339,22 +345,21 @@ bool YOLOv5::Postprocess( bool YOLOv5::Predict(cv::Mat* im, DetectionResult* result, float conf_threshold, float nms_iou_threshold) { - Mat mat(*im); std::map> im_info; if (use_cuda_preprocessing_) { - if (!CudaPreprocess(&mat, &reused_input_tensors[0], &im_info, size_, padding_value_, - is_mini_pad_, is_no_pad_, is_scale_up_, stride_, max_wh_, - multi_label_)) { + if (!CudaPreprocess(&mat, &reused_input_tensors[0], &im_info, size_, + padding_value_, is_mini_pad_, is_no_pad_, is_scale_up_, + stride_, max_wh_, multi_label_)) { FDERROR << "Failed to preprocess input image." << std::endl; return false; } } else { - if (!Preprocess(&mat, &reused_input_tensors[0], &im_info, size_, padding_value_, - is_mini_pad_, is_no_pad_, is_scale_up_, stride_, max_wh_, - multi_label_)) { + if (!Preprocess(&mat, &reused_input_tensors[0], &im_info, size_, + padding_value_, is_mini_pad_, is_no_pad_, is_scale_up_, + stride_, max_wh_, multi_label_)) { FDERROR << "Failed to preprocess input image." << std::endl; return false; } diff --git a/fastdeploy/vision/detection/contrib/yolov6.cc b/fastdeploy/vision/detection/contrib/yolov6.cc index 396dab2540..70f79b9f47 100644 --- a/fastdeploy/vision/detection/contrib/yolov6.cc +++ b/fastdeploy/vision/detection/contrib/yolov6.cc @@ -13,6 +13,7 @@ // limitations under the License. 
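The pinned-memory pattern used above (cudaMallocHost for the staging buffer, cudaMemcpyAsync into device memory on a stream) is what makes the host-to-device copy genuinely asynchronous. A minimal self-contained sketch of that pattern, independent of FastDeploy; buffer size and names are illustrative:

```cpp
#include <cuda_runtime.h>
#include <cstring>
#include <cstdio>

int main() {
  const size_t kMaxImageBytes = 1920 * 1080 * 3;
  unsigned char* host = nullptr;    // page-locked staging buffer
  unsigned char* device = nullptr;  // device-side destination
  cudaMallocHost(reinterpret_cast<void**>(&host), kMaxImageBytes);
  cudaMalloc(reinterpret_cast<void**>(&device), kMaxImageBytes);

  cudaStream_t stream;
  cudaStreamCreate(&stream);
  // In the real pipeline the decoded image is memcpy'd into `host`;
  // because `host` is pinned, the async copy can overlap other stream work.
  std::memset(host, 0, kMaxImageBytes);
  cudaMemcpyAsync(device, host, kMaxImageBytes, cudaMemcpyHostToDevice,
                  stream);
  cudaStreamSynchronize(stream);

  cudaFree(device);
  cudaFreeHost(host);
  cudaStreamDestroy(stream);
  printf("done\n");
  return 0;
}
```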
#include "fastdeploy/vision/detection/contrib/yolov6.h" + #include "fastdeploy/utils/perf.h" #include "fastdeploy/vision/utils/utils.h" #ifdef ENABLE_CUDA_PREPROCESS @@ -69,7 +70,7 @@ YOLOv6::YOLOv6(const std::string& model_file, const std::string& params_file, valid_cpu_backends = {Backend::OPENVINO, Backend::ORT}; valid_gpu_backends = {Backend::ORT, Backend::TRT}; } else { - valid_cpu_backends = {Backend::PDINFER, Backend::ORT}; + valid_cpu_backends = {Backend::PDINFER, Backend::ORT, Backend::LITE}; valid_gpu_backends = {Backend::PDINFER, Backend::ORT, Backend::TRT}; } runtime_option = custom_option; @@ -174,11 +175,14 @@ void YOLOv6::UseCudaPreprocessing(int max_image_size) { use_cuda_preprocessing_ = true; is_scale_up = true; if (input_img_cuda_buffer_host_ == nullptr) { - // prepare input data cache in GPU pinned memory - CUDA_CHECK(cudaMallocHost((void**)&input_img_cuda_buffer_host_, max_image_size * 3)); + // prepare input data cache in GPU pinned memory + CUDA_CHECK(cudaMallocHost((void**)&input_img_cuda_buffer_host_, + max_image_size * 3)); // prepare input data cache in GPU device memory - CUDA_CHECK(cudaMalloc((void**)&input_img_cuda_buffer_device_, max_image_size * 3)); - CUDA_CHECK(cudaMalloc((void**)&input_tensor_cuda_buffer_device_, 3 * size[0] * size[1] * sizeof(float))); + CUDA_CHECK( + cudaMalloc((void**)&input_img_cuda_buffer_device_, max_image_size * 3)); + CUDA_CHECK(cudaMalloc((void**)&input_tensor_cuda_buffer_device_, + 3 * size[0] * size[1] * sizeof(float))); } #else FDWARNING << "The FastDeploy didn't compile with BUILD_CUDA_SRC=ON." @@ -187,11 +191,14 @@ void YOLOv6::UseCudaPreprocessing(int max_image_size) { #endif } -bool YOLOv6::CudaPreprocess(Mat* mat, FDTensor* output, - std::map>* im_info) { +bool YOLOv6::CudaPreprocess( + Mat* mat, FDTensor* output, + std::map>* im_info) { #ifdef ENABLE_CUDA_PREPROCESS if (is_mini_pad != false || is_no_pad != false || is_scale_up != true) { - FDERROR << "Preprocessing with CUDA is only available when the arguments satisfy (is_mini_pad=false, is_no_pad=false, is_scale_up=true)." << std::endl; + FDERROR << "Preprocessing with CUDA is only available when the arguments " + "satisfy (is_mini_pad=false, is_no_pad=false, is_scale_up=true)." + << std::endl; return false; } @@ -205,14 +212,15 @@ bool YOLOv6::CudaPreprocess(Mat* mat, FDTensor* output, int src_img_buf_size = mat->Height() * mat->Width() * mat->Channels(); memcpy(input_img_cuda_buffer_host_, mat->Data(), src_img_buf_size); CUDA_CHECK(cudaMemcpyAsync(input_img_cuda_buffer_device_, - input_img_cuda_buffer_host_, - src_img_buf_size, cudaMemcpyHostToDevice, stream)); + input_img_cuda_buffer_host_, src_img_buf_size, + cudaMemcpyHostToDevice, stream)); utils::CudaYoloPreprocess(input_img_cuda_buffer_device_, mat->Width(), mat->Height(), input_tensor_cuda_buffer_device_, size[0], size[1], padding_value, stream); // Record output shape of preprocessed image - (*im_info)["output_shape"] = {static_cast(size[0]), static_cast(size[1])}; + (*im_info)["output_shape"] = {static_cast(size[0]), + static_cast(size[1])}; output->SetExternalData({mat->Channels(), size[0], size[1]}, FDDataType::FP32, input_tensor_cuda_buffer_device_); diff --git a/fastdeploy/vision/detection/contrib/yolov7.cc b/fastdeploy/vision/detection/contrib/yolov7.cc index 295ee688e3..c3fc1de414 100644 --- a/fastdeploy/vision/detection/contrib/yolov7.cc +++ b/fastdeploy/vision/detection/contrib/yolov7.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "fastdeploy/vision/detection/contrib/yolov7.h" + #include "fastdeploy/utils/perf.h" #include "fastdeploy/vision/utils/utils.h" #ifdef ENABLE_CUDA_PREPROCESS @@ -67,7 +68,7 @@ YOLOv7::YOLOv7(const std::string& model_file, const std::string& params_file, valid_cpu_backends = {Backend::OPENVINO, Backend::ORT}; valid_gpu_backends = {Backend::ORT, Backend::TRT}; } else { - valid_cpu_backends = {Backend::PDINFER, Backend::ORT, Backend::TRT}; + valid_cpu_backends = {Backend::PDINFER, Backend::ORT, Backend::LITE}; valid_gpu_backends = {Backend::PDINFER, Backend::ORT, Backend::TRT}; } runtime_option = custom_option; @@ -172,11 +173,14 @@ void YOLOv7::UseCudaPreprocessing(int max_image_size) { use_cuda_preprocessing_ = true; is_scale_up = true; if (input_img_cuda_buffer_host_ == nullptr) { - // prepare input data cache in GPU pinned memory - CUDA_CHECK(cudaMallocHost((void**)&input_img_cuda_buffer_host_, max_image_size * 3)); + // prepare input data cache in GPU pinned memory + CUDA_CHECK(cudaMallocHost((void**)&input_img_cuda_buffer_host_, + max_image_size * 3)); // prepare input data cache in GPU device memory - CUDA_CHECK(cudaMalloc((void**)&input_img_cuda_buffer_device_, max_image_size * 3)); - CUDA_CHECK(cudaMalloc((void**)&input_tensor_cuda_buffer_device_, 3 * size[0] * size[1] * sizeof(float))); + CUDA_CHECK( + cudaMalloc((void**)&input_img_cuda_buffer_device_, max_image_size * 3)); + CUDA_CHECK(cudaMalloc((void**)&input_tensor_cuda_buffer_device_, + 3 * size[0] * size[1] * sizeof(float))); } #else FDWARNING << "The FastDeploy didn't compile with BUILD_CUDA_SRC=ON." @@ -185,11 +189,14 @@ void YOLOv7::UseCudaPreprocessing(int max_image_size) { #endif } -bool YOLOv7::CudaPreprocess(Mat* mat, FDTensor* output, - std::map>* im_info) { +bool YOLOv7::CudaPreprocess( + Mat* mat, FDTensor* output, + std::map>* im_info) { #ifdef ENABLE_CUDA_PREPROCESS if (is_mini_pad != false || is_no_pad != false || is_scale_up != true) { - FDERROR << "Preprocessing with CUDA is only available when the arguments satisfy (is_mini_pad=false, is_no_pad=false, is_scale_up=true)." << std::endl; + FDERROR << "Preprocessing with CUDA is only available when the arguments " + "satisfy (is_mini_pad=false, is_no_pad=false, is_scale_up=true)." + << std::endl; return false; } @@ -203,14 +210,15 @@ bool YOLOv7::CudaPreprocess(Mat* mat, FDTensor* output, int src_img_buf_size = mat->Height() * mat->Width() * mat->Channels(); memcpy(input_img_cuda_buffer_host_, mat->Data(), src_img_buf_size); CUDA_CHECK(cudaMemcpyAsync(input_img_cuda_buffer_device_, - input_img_cuda_buffer_host_, - src_img_buf_size, cudaMemcpyHostToDevice, stream)); + input_img_cuda_buffer_host_, src_img_buf_size, + cudaMemcpyHostToDevice, stream)); utils::CudaYoloPreprocess(input_img_cuda_buffer_device_, mat->Width(), mat->Height(), input_tensor_cuda_buffer_device_, size[0], size[1], padding_value, stream); // Record output shape of preprocessed image - (*im_info)["output_shape"] = {static_cast(size[0]), static_cast(size[1])}; + (*im_info)["output_shape"] = {static_cast(size[0]), + static_cast(size[1])}; output->SetExternalData({mat->Channels(), size[0], size[1]}, FDDataType::FP32, input_tensor_cuda_buffer_device_); diff --git a/fastdeploy/vision/matting/contrib/modnet.cc b/fastdeploy/vision/matting/contrib/modnet.cc old mode 100755 new mode 100644 index b082665473..b0bc59c66f --- a/fastdeploy/vision/matting/contrib/modnet.cc +++ b/fastdeploy/vision/matting/contrib/modnet.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "fastdeploy/vision/matting/contrib/modnet.h" + #include "fastdeploy/utils/perf.h" #include "fastdeploy/vision/utils/utils.h" @@ -26,8 +27,8 @@ MODNet::MODNet(const std::string& model_file, const std::string& params_file, const RuntimeOption& custom_option, const ModelFormat& model_format) { if (model_format == ModelFormat::ONNX) { - valid_cpu_backends = {Backend::ORT}; - valid_gpu_backends = {Backend::ORT, Backend::TRT}; + valid_cpu_backends = {Backend::ORT}; + valid_gpu_backends = {Backend::ORT, Backend::TRT}; } else { valid_cpu_backends = {Backend::PDINFER, Backend::ORT}; valid_gpu_backends = {Backend::PDINFER, Backend::ORT, Backend::TRT}; @@ -118,7 +119,7 @@ bool MODNet::Postprocess( int numel = ipt_h * ipt_w; int nbytes = numel * sizeof(float); result->Resize(numel); - std::memcpy(result->alpha.data(), alpha_resized.GetOpenCVMat()->data, nbytes); + std::memcpy(result->alpha.data(), alpha_resized.Data(), nbytes); return true; } diff --git a/fastdeploy/vision/matting/contrib/rvm.cc b/fastdeploy/vision/matting/contrib/rvm.cc old mode 100755 new mode 100644 index 04b9b93164..ec8ed19fc4 --- a/fastdeploy/vision/matting/contrib/rvm.cc +++ b/fastdeploy/vision/matting/contrib/rvm.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "fastdeploy/vision/matting/contrib/rvm.h" + #include "fastdeploy/utils/perf.h" #include "fastdeploy/vision/utils/utils.h" @@ -22,12 +23,13 @@ namespace vision { namespace matting { -RobustVideoMatting::RobustVideoMatting(const std::string& model_file, const std::string& params_file, - const RuntimeOption& custom_option, - const ModelFormat& model_format) { +RobustVideoMatting::RobustVideoMatting(const std::string& model_file, + const std::string& params_file, + const RuntimeOption& custom_option, + const ModelFormat& model_format) { if (model_format == ModelFormat::ONNX) { - valid_cpu_backends = {Backend::OPENVINO, Backend::ORT}; - valid_gpu_backends = {Backend::ORT, Backend::TRT}; + valid_cpu_backends = {Backend::OPENVINO, Backend::ORT}; + valid_gpu_backends = {Backend::ORT, Backend::TRT}; } else { valid_cpu_backends = {Backend::PDINFER, Backend::ORT}; valid_gpu_backends = {Backend::PDINFER, Backend::ORT, Backend::TRT}; @@ -52,8 +54,9 @@ bool RobustVideoMatting::Initialize() { return true; } -bool RobustVideoMatting::Preprocess(Mat* mat, FDTensor* output, - std::map>* im_info) { +bool RobustVideoMatting::Preprocess( + Mat* mat, FDTensor* output, + std::map>* im_info) { // Resize int resize_w = size[0]; int resize_h = size[1]; @@ -61,7 +64,7 @@ bool RobustVideoMatting::Preprocess(Mat* mat, FDTensor* output, Resize::Run(mat, resize_w, resize_h); } BGR2RGB::Run(mat); - + // Normalize std::vector alpha = {1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f}; std::vector beta = {0.0f, 0.0f, 0.0f}; @@ -83,7 +86,7 @@ bool RobustVideoMatting::Postprocess( FDASSERT((infer_result.size() == 6), "The default number of output tensor must be 6 according to " "RobustVideoMatting."); - FDTensor& fgr = infer_result.at(0); // fgr (1, 3, h, w) 0.~1. + FDTensor& fgr = infer_result.at(0); // fgr (1, 3, h, w) 0.~1. FDTensor& alpha = infer_result.at(1); // alpha (1, 1, h, w) 0.~1. 
FDASSERT((fgr.shape[0] == 1), "Only support batch = 1 now."); FDASSERT((alpha.shape[0] == 1), "Only support batch = 1 now."); @@ -98,11 +101,11 @@ bool RobustVideoMatting::Postprocess( // update context if (video_mode) { for (size_t i = 0; i < 4; ++i) { - FDTensor& rki = infer_result.at(i+2); + FDTensor& rki = infer_result.at(i + 2); dynamic_inputs_dims_[i] = rki.shape; dynamic_inputs_datas_[i].resize(rki.Numel()); memcpy(dynamic_inputs_datas_[i].data(), rki.Data(), - rki.Numel() * FDDataTypeSize(rki.dtype)); + rki.Numel() * FDDataTypeSize(rki.dtype)); } } @@ -139,8 +142,8 @@ bool RobustVideoMatting::Postprocess( int numel = in_h * in_w; int nbytes = numel * sizeof(float); result->Resize(numel); - memcpy(result->alpha.data(), alpha_resized.GetOpenCVMat()->data, nbytes); - memcpy(result->foreground.data(), fgr_resized.GetOpenCVMat()->data, nbytes); + memcpy(result->alpha.data(), alpha_resized.Data(), nbytes); + memcpy(result->foreground.data(), fgr_resized.Data(), nbytes); return true; } @@ -154,7 +157,9 @@ bool RobustVideoMatting::Predict(cv::Mat* im, MattingResult* result) { im_info["output_shape"] = {mat.Height(), mat.Width()}; // convert vector to FDTensor for (size_t i = 1; i < inputs_nums; ++i) { - input_tensors[i].SetExternalData(dynamic_inputs_dims_[i-1], FDDataType::FP32, dynamic_inputs_datas_[i-1].data()); + input_tensors[i].SetExternalData(dynamic_inputs_dims_[i - 1], + FDDataType::FP32, + dynamic_inputs_datas_[i - 1].data()); input_tensors[i].device = Device::CPU; } if (!Preprocess(&mat, &input_tensors[0], &im_info)) { diff --git a/fastdeploy/vision/matting/ppmatting/ppmatting.cc b/fastdeploy/vision/matting/ppmatting/ppmatting.cc old mode 100755 new mode 100644 index 5fa63a48dd..cde2183075 --- a/fastdeploy/vision/matting/ppmatting/ppmatting.cc +++ b/fastdeploy/vision/matting/ppmatting/ppmatting.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
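The context-carrying loop above is the heart of video mode: after every frame the four recurrent outputs are copied into host buffers, then re-bound as inputs via SetExternalData before the next Predict. A condensed, hedged sketch of that round trip (member names mirror rvm.cc and the states are assumed FP32, as the SetExternalData call above implies; treat it as illustration, not the exact implementation):

```cpp
#include "fastdeploy/core/fd_tensor.h"
#include <cstring>
#include <vector>

using fastdeploy::FDTensor;

// After Postprocess: stash r1..r4 (outputs 2..5) into host buffers.
void CacheStates(std::vector<FDTensor>& outputs,
                 std::vector<std::vector<int64_t>>* dims,
                 std::vector<std::vector<float>>* datas) {
  for (size_t i = 0; i < 4; ++i) {
    FDTensor& rki = outputs.at(i + 2);
    (*dims)[i] = rki.shape;
    (*datas)[i].resize(rki.Numel());
    std::memcpy((*datas)[i].data(), rki.Data(),
                rki.Numel() * sizeof(float));
  }
}

// Before the next Predict: expose the cached buffers as inputs 1..4
// without copying (SetExternalData only records the pointer).
void BindStates(std::vector<FDTensor>* inputs,
                const std::vector<std::vector<int64_t>>& dims,
                std::vector<std::vector<float>>& datas) {
  for (size_t i = 1; i < inputs->size(); ++i) {
    (*inputs)[i].SetExternalData(dims[i - 1], fastdeploy::FDDataType::FP32,
                                 datas[i - 1].data());
  }
}
```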
#include "fastdeploy/vision/matting/ppmatting/ppmatting.h" + #include "fastdeploy/vision/utils/utils.h" #include "yaml-cpp/yaml.h" @@ -171,15 +172,15 @@ bool PPMatting::Postprocess( std::vector dim{0, 2, 3, 1}; Transpose(alpha_tensor, &alpha_tensor, dim); alpha_tensor.Squeeze(0); - Mat mat = CreateFromTensor(alpha_tensor); + Mat mat = CreateFDMatFromTensor(alpha_tensor); auto iter_ipt = im_info.find("input_shape"); auto iter_out = im_info.find("output_shape"); - if (is_fixed_input_shape_){ + if (is_fixed_input_shape_) { double scale_h = static_cast(iter_out->second[0]) / - static_cast(iter_ipt->second[0]); + static_cast(iter_ipt->second[0]); double scale_w = static_cast(iter_out->second[1]) / - static_cast(iter_ipt->second[1]); + static_cast(iter_ipt->second[1]); double actual_scale = std::min(scale_h, scale_w); int size_before_pad_h = round(actual_scale * iter_ipt->second[0]); @@ -188,7 +189,8 @@ bool PPMatting::Postprocess( Crop::Run(&mat, 0, 0, size_before_pad_w, size_before_pad_h); } - Resize::Run(&mat, iter_ipt->second[1], iter_ipt->second[0]); + Resize::Run(&mat, iter_ipt->second[1], iter_ipt->second[0], -1.0f, -1.0f, 1, + false, ProcLib::OPENCV); result->Clear(); // note: must be setup shape before Resize @@ -197,7 +199,7 @@ bool PPMatting::Postprocess( int numel = iter_ipt->second[0] * iter_ipt->second[1]; int nbytes = numel * sizeof(float); result->Resize(numel); - std::memcpy(result->alpha.data(), mat.GetOpenCVMat()->data, nbytes); + std::memcpy(result->alpha.data(), mat.Data(), nbytes); return true; } diff --git a/fastdeploy/vision/segmentation/ppseg/model.cc b/fastdeploy/vision/segmentation/ppseg/model.cc old mode 100755 new mode 100644 index 3c6318232f..58819ef15c --- a/fastdeploy/vision/segmentation/ppseg/model.cc +++ b/fastdeploy/vision/segmentation/ppseg/model.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "fastdeploy/vision/segmentation/ppseg/model.h" + #include "fastdeploy/vision/utils/utils.h" #include "yaml-cpp/yaml.h" @@ -26,7 +27,8 @@ PaddleSegModel::PaddleSegModel(const std::string& model_file, const RuntimeOption& custom_option, const ModelFormat& model_format) { config_file_ = config_file; - valid_cpu_backends = {Backend::OPENVINO, Backend::PDINFER, Backend::ORT, Backend::LITE}; + valid_cpu_backends = {Backend::OPENVINO, Backend::PDINFER, Backend::ORT, + Backend::LITE}; valid_gpu_backends = {Backend::PDINFER, Backend::ORT, Backend::TRT}; valid_rknpu_backends = {Backend::RKNPU2}; runtime_option = custom_option; @@ -68,7 +70,7 @@ bool PaddleSegModel::BuildPreprocessPipelineFromConfig() { FDASSERT(op.IsMap(), "Require the transform information in yaml be Map type."); if (op["type"].as() == "Normalize") { - if(!(this->disable_normalize_and_permute)){ + if (!(this->disable_normalize_and_permute)) { std::vector mean = {0.5, 0.5, 0.5}; std::vector std = {0.5, 0.5, 0.5}; if (op["mean"]) { @@ -102,12 +104,13 @@ bool PaddleSegModel::BuildPreprocessPipelineFromConfig() { int input_width = input_shape[3].as(); if (input_height == -1 || input_width == -1) { FDWARNING << "The exported PaddleSeg model is with dynamic shape input, " - << "which is not supported by ONNX Runtime and Tensorrt. " - << "Only OpenVINO and Paddle Inference are available now. " - << "For using ONNX Runtime or Tensorrt, " - << "Please refer to https://github.com/PaddlePaddle/PaddleSeg/blob/develop/docs/model_export.md" - << " to export model with fixed input shape." - << std::endl; + << "which is not supported by ONNX Runtime and Tensorrt. 
" + << "Only OpenVINO and Paddle Inference are available now. " + << "For using ONNX Runtime or Tensorrt, " + << "Please refer to " + "https://github.com/PaddlePaddle/PaddleSeg/blob/develop/" + "docs/model_export.md" + << " to export model with fixed input shape." << std::endl; valid_cpu_backends = {Backend::OPENVINO, Backend::PDINFER, Backend::LITE}; valid_gpu_backends = {Backend::PDINFER}; } @@ -132,7 +135,7 @@ bool PaddleSegModel::BuildPreprocessPipelineFromConfig() { << "." << std::endl; } } - if(!(this->disable_normalize_and_permute)){ + if (!(this->disable_normalize_and_permute)) { processors_.push_back(std::make_shared()); } return true; @@ -260,7 +263,7 @@ bool PaddleSegModel::Postprocess( infer_result->shape, FDDataType::FP32, static_cast(fp32_result_buffer->data())); } - mat = new Mat(CreateFromTensor(*infer_result)); + mat = new Mat(CreateFDMatFromTensor(*infer_result)); Resize::Run(mat, ipt_w, ipt_h, -1.0f, -1.0f, 1); mat->ShareWithTensor(&new_infer_result); result->shape = new_infer_result.shape; @@ -361,11 +364,13 @@ bool PaddleSegModel::Predict(cv::Mat* im, SegmentationResult* result) { return true; } -void PaddleSegModel::DisableNormalizeAndPermute(){ +void PaddleSegModel::DisableNormalizeAndPermute() { this->disable_normalize_and_permute = true; - // the DisableNormalizeAndPermute function will be invalid if the configuration file is loaded during preprocessing + // the DisableNormalizeAndPermute function will be invalid if the + // configuration file is loaded during preprocessing if (!BuildPreprocessPipelineFromConfig()) { - FDERROR << "Failed to build preprocess pipeline from configuration file." << std::endl; + FDERROR << "Failed to build preprocess pipeline from configuration file." + << std::endl; } } diff --git a/fastdeploy/vision/tracking/pptracking/trajectory.h b/fastdeploy/vision/tracking/pptracking/trajectory.h index 793419ce13..792a5b8a2b 100644 --- a/fastdeploy/vision/tracking/pptracking/trajectory.h +++ b/fastdeploy/vision/tracking/pptracking/trajectory.h @@ -75,7 +75,8 @@ class FASTDEPLOY_DECL Trajectory : public TKalmanFilter { int timestamp, bool update_embedding = true); virtual void activate(int &cnt, int timestamp); - virtual void reactivate(Trajectory *traj, int &cnt, int timestamp, bool newid = false); + virtual void reactivate(Trajectory *traj, int &cnt, + int timestamp, bool newid = false); virtual void mark_lost(void); virtual void mark_removed(void); @@ -229,6 +230,6 @@ inline void Trajectory::mark_lost(void) { state = Lost; } inline void Trajectory::mark_removed(void) { state = Removed; } -} // namespace tracking -} // namespace vision -} // namespace fastdeploy +} // namespace tracking +} // namespace vision +} // namespace fastdeploy diff --git a/fastdeploy/vision/utils/crop_image.cc b/fastdeploy/vision/utils/crop_image.cc index 5d15844b53..1dd545c0f0 100644 --- a/fastdeploy/vision/utils/crop_image.cc +++ b/fastdeploy/vision/utils/crop_image.cc @@ -18,9 +18,9 @@ namespace fastdeploy { namespace vision { namespace utils { -bool CropImageByBox(const Mat& src_im, Mat* dst_im, - const std::vector& box, std::vector* center, - std::vector* scale, const float expandratio) { +bool CropImageByBox(Mat& src_im, Mat* dst_im, + const std::vector& box, std::vector* center, + std::vector* scale, const float expandratio) { const cv::Mat* img = src_im.GetOpenCVMat(); cv::Mat* crop_img = dst_im->GetOpenCVMat(); int xmin = static_cast(box[0]); @@ -34,12 +34,12 @@ bool CropImageByBox(const Mat& src_im, Mat* dst_im, // adjust h or w to keep image ratio, expand the 
shorter edge if (half_h * 3 > half_w * 4) { half_w = half_h * 0.75; - } - int crop_xmin =std::max(0, static_cast(centerx - half_w)); - int crop_ymin =std::max(0, static_cast(centery - half_h)); + } + int crop_xmin = std::max(0, static_cast(centerx - half_w)); + int crop_ymin = std::max(0, static_cast(centery - half_h)); int crop_xmax = std::min(img->cols - 1, static_cast(centerx + half_w)); int crop_ymax = std::min(img->rows - 1, static_cast(centery + half_h)); - + crop_img->create(crop_ymax - crop_ymin, crop_xmax - crop_xmin, img->type()); *crop_img = (*img)(cv::Range(crop_ymin, crop_ymax), cv::Range(crop_xmin, crop_xmax)); diff --git a/fastdeploy/vision/utils/utils.h b/fastdeploy/vision/utils/utils.h index 4d6a006c32..1590922d8a 100644 --- a/fastdeploy/vision/utils/utils.h +++ b/fastdeploy/vision/utils/utils.h @@ -17,6 +17,7 @@ #include #include #include + #include "fastdeploy/core/fd_tensor.h" #include "fastdeploy/utils/utils.h" #include "fastdeploy/vision/common/result.h" @@ -43,7 +44,7 @@ std::vector TopKIndices(const T* array, int array_size, int topk) { std::vector res(topk); std::set searched; for (int32_t i = 0; i < topk; ++i) { - T min = -99999999; + T min = static_cast(-99999999); for (int32_t j = 0; j < array_size; ++j) { if (searched.find(j) != searched.end()) { continue; @@ -75,23 +76,25 @@ FASTDEPLOY_DECL float CosineSimilarity(const std::vector& a, const std::vector& b, bool normalized = true); -bool CropImageByBox(const Mat& src_im, Mat* dst_im, +bool CropImageByBox(Mat& src_im, Mat* dst_im, const std::vector& box, std::vector* center, std::vector* scale, const float expandratio = 0.3); /** - * Function: for keypoint detection model, fine positioning of keypoints in postprocess - * Parameters: - * heatmap: model inference results for keypoint detection models - * dim: shape information of the inference result - * coords: coordinates after refined positioning - * px: px = int(coords[ch * 2] + 0.5) , refer to API detection::GetFinalPredictions - * py: px = int(coords[ch * 2 + 1] + 0.5), refer to API detection::GetFinalPredictions - * index: index information of heatmap pixels - * ch: channel - * Paper reference: DARK postpocessing, Zhang et al. Distribution-Aware Coordinate - * Representation for Human Pose Estimation (CVPR 2020). - */ + * Function: for keypoint detection model, fine positioning of keypoints in + * postprocess + * Parameters: + * heatmap: model inference results for keypoint detection models + * dim: shape information of the inference result + * coords: coordinates after refined positioning + * px: px = int(coords[ch * 2] + 0.5) , refer to API detection::GetFinalPredictions + * py: px = int(coords[ch * 2 + 1] + 0.5), refer to API detection::GetFinalPredictions + * index: index information of heatmap pixels + * ch: channel + * Paper reference: DARK postpocessing, Zhang et al. + * Distribution-Aware Coordinate Representation for Human Pose Estimation (CVPR + * 2020). 
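+ * A compact sketch of the refinement step (from the cited paper, hedged
+ * against this implementation's details): treating the heatmap H as a
+ * Gaussian around the argmax m, one Newton step on D = log H gives
+ *     coord = m - D''(m)^{-1} * D'(m),
+ * with D' and D'' estimated by finite differences over the heatmap
+ * pixels around (px, py).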
+ */ void DarkParse(const std::vector& heatmap, const std::vector& dim, std::vector* coords, const int px, const int py, const int index, const int ch); From ce828ecb386a85e5e9889ab00ac49fb882d5a978 Mon Sep 17 00:00:00 2001 From: Zheng_Bicheng <58363586+Zheng-Bicheng@users.noreply.github.com> Date: Fri, 4 Nov 2022 09:39:23 +0800 Subject: [PATCH 12/18] =?UTF-8?q?[Backend=20And=20DOC]=20=E6=94=B9?= =?UTF-8?q?=E8=BF=9Bppseg=E6=96=87=E6=A1=A3=20+=20=E4=B8=BARKNPU2=E5=90=8E?= =?UTF-8?q?=E7=AB=AF=E6=96=B0=E5=A2=9E=E5=AF=B9=E5=A4=9A=E8=BE=93=E5=85=A5?= =?UTF-8?q?=E6=A8=A1=E5=9E=8B=E7=9A=84=E6=94=AF=E6=8C=81=20(#491)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * 11-02/14:35 * 新增输入数据format错误判断 * 优化推理过程,减少内存分配次数 * 支持多输入rknn模型 * rknn模型输出shape为三维时,输出将被强制对齐为4纬。现在将直接抹除rknn补充的shape,方便部分对输出shape进行判断的模型进行正确的后处理。 * 11-03/17:25 * 支持导出多输入RKNN模型 * 更新各种文档 * ppseg改用Fastdeploy中的模型进行转换 * 11-03/17:25 * 新增开源头 * 11-03/21:48 * 删除无用debug代码,补充注释 --- docs/cn/faq/rknpu2/rknpu2.md | 4 +- .../segmentation/paddleseg/rknpu2/README.md | 128 +++++++--- .../paddleseg/rknpu2/cpp/README.md | 10 +- .../paddleseg/rknpu2/cpp/infer.cc | 19 +- .../paddleseg/rknpu2/python/README.md | 10 +- .../paddleseg/rknpu2/python/infer.py | 19 +- .../backends/rknpu/rknpu2/rknpu2_backend.cc | 234 ++++++++++-------- .../backends/rknpu/rknpu2/rknpu2_backend.h | 9 +- ...rait_PP_HumanSegV2_Lite_256x144_infer.yaml | 7 + ...rait_PP_HumanSegV2_Lite_256x144_infer.yaml | 7 + tools/rknpu2/config/ppseg_config.yaml | 7 - tools/rknpu2/export.py | 24 +- 12 files changed, 303 insertions(+), 175 deletions(-) create mode 100644 tools/rknpu2/config/RK3568/Portrait_PP_HumanSegV2_Lite_256x144_infer.yaml create mode 100644 tools/rknpu2/config/RK3588/Portrait_PP_HumanSegV2_Lite_256x144_infer.yaml delete mode 100644 tools/rknpu2/config/ppseg_config.yaml diff --git a/docs/cn/faq/rknpu2/rknpu2.md b/docs/cn/faq/rknpu2/rknpu2.md index 1e40585d12..775d50b50b 100644 --- a/docs/cn/faq/rknpu2/rknpu2.md +++ b/docs/cn/faq/rknpu2/rknpu2.md @@ -1,5 +1,8 @@ # RKNPU2模型部署 +## 安装环境 +RKNPU2模型导出只支持在x86Linux平台上进行导出,安装流程请参考[RKNPU2模型导出环境配置文档](./install_rknn_toolkit2.md) + ## ONNX模型转换为RKNN模型 ONNX模型不能直接调用RK芯片中的NPU进行运算,需要把ONNX模型转换为RKNN模型,具体流程请查看[转换文档](./export.md) @@ -61,4 +64,3 @@ int infer_scrfd_npu() { - [rknpu2板端环境安装配置](../../build_and_install/rknpu2.md) - [rknn_toolkit2安装文档](./install_rknn_toolkit2.md) - [onnx转换rknn文档](./export.md) - diff --git a/examples/vision/segmentation/paddleseg/rknpu2/README.md b/examples/vision/segmentation/paddleseg/rknpu2/README.md index 5a96661b18..cf708fe2cb 100644 --- a/examples/vision/segmentation/paddleseg/rknpu2/README.md +++ b/examples/vision/segmentation/paddleseg/rknpu2/README.md @@ -4,49 +4,103 @@ - [PaddleSeg develop](https://github.com/PaddlePaddle/PaddleSeg/tree/develop) -目前FastDeploy支持如下模型的部署 +目前FastDeploy使用RKNPU2推理PPSeg支持如下模型的部署: -- [U-Net系列模型](https://github.com/PaddlePaddle/PaddleSeg/blob/release/2.6/configs/unet/README.md) -- [PP-LiteSeg系列模型](https://github.com/PaddlePaddle/PaddleSeg/blob/release/2.6/configs/pp_liteseg/README.md) -- [PP-HumanSeg系列模型](https://github.com/PaddlePaddle/PaddleSeg/blob/release/2.6/contrib/PP-HumanSeg/README.md) -- [FCN系列模型](https://github.com/PaddlePaddle/PaddleSeg/blob/release/2.6/configs/fcn/README.md) -- [DeepLabV3系列模型](https://github.com/PaddlePaddle/PaddleSeg/blob/release/2.6/configs/deeplabv3/README.md) - -【注意】如你部署的为**PP-Matting**、**PP-HumanMatting**以及**ModNet**请参考[Matting模型部署](../../matting) +| 模型 | 参数文件大小 | 输入Shape | mIoU | mIoU (flip) | mIoU (ms+flip) | 
+|:---------------------------------------------------------------------------------------------------------------------------------------------|:-------|:---------|:-------|:------------|:---------------|
+| [Unet-cityscapes](https://bj.bcebos.com/paddlehub/fastdeploy/Unet_cityscapes_without_argmax_infer.tgz)                                        | 52MB   | 1024x512 | 65.00% | 66.02%      | 66.89%         |
+| [PP-LiteSeg-T(STDC1)-cityscapes](https://bj.bcebos.com/paddlehub/fastdeploy/PP_LiteSeg_T_STDC1_cityscapes_without_argmax_infer.tgz)           | 31MB   | 1024x512 | 77.04% | 77.73%      | 77.46%         |
+| [PP-HumanSegV1-Lite(通用人像分割模型)](https://bj.bcebos.com/paddlehub/fastdeploy/PP_HumanSegV1_Lite_infer.tgz)                                 | 543KB  | 192x192  | 86.2%  | -           | -              |
+| [PP-HumanSegV2-Lite(通用人像分割模型)](https://bj.bcebos.com/paddle2onnx/libs/PP_HumanSegV2_Lite_192x192_infer.tgz)                             | 12MB   | 192x192  | 92.52% | -           | -              |
+| [PP-HumanSegV2-Mobile(通用人像分割模型)](https://bj.bcebos.com/paddlehub/fastdeploy/PP_HumanSegV2_Mobile_192x192_infer.tgz)                     | 29MB   | 192x192  | 93.13% | -           | -              |
+| [PP-HumanSegV1-Server(通用人像分割模型)](https://bj.bcebos.com/paddlehub/fastdeploy/PP_HumanSegV1_Server_infer.tgz)                             | 103MB  | 512x512  | 96.47% | -           | -              |
+| [Portrait-PP-HumanSegV2_Lite(肖像分割模型)](https://bj.bcebos.com/paddlehub/fastdeploy/Portrait_PP_HumanSegV2_Lite_256x144_infer.tgz)           | 3.6MB  | 256x144  | 96.63% | -           | -              |
+| [FCN-HRNet-W18-cityscapes](https://bj.bcebos.com/paddlehub/fastdeploy/FCN_HRNet_W18_cityscapes_without_argmax_infer.tgz)                      | 37MB   | 1024x512 | 78.97% | 79.49%      | 79.74%         |
+| [Deeplabv3-ResNet101-OS8-cityscapes](https://bj.bcebos.com/paddlehub/fastdeploy/Deeplabv3_ResNet101_OS8_cityscapes_without_argmax_infer.tgz)  | 150MB  | 1024x512 | 79.90% | 80.22%      | 80.47%         |
 
 ## 准备PaddleSeg部署模型以及转换模型
+RKNPU部署模型前需要将Paddle模型转换成RKNN模型,具体步骤如下:
+* Paddle动态图模型转换为ONNX模型,请参考[PaddleSeg模型导出说明](https://github.com/PaddlePaddle/PaddleSeg/tree/release/2.6/contrib/PP-HumanSeg)
+* ONNX模型转换RKNN模型的过程,请参考[转换文档](../../../../../docs/cn/faq/rknpu2/export.md)进行转换。
 
-RKNPU部署模型前需要将模型转换成RKNN模型,其过程一般可以简化为如下步骤:
-* Paddle动态图模型 -> ONNX模型 -> RKNN模型。
-  * Paddle动态图模型 转换为 ONNX模型的过程请参考([PaddleSeg模型导出说明](https://github.com/PaddlePaddle/PaddleSeg/tree/release/2.6/contrib/PP-HumanSeg))。
-  * 对于ONNX模型 转换 RKNN模型的过程,请参考[转换文档](../../../../../docs/cn/faq/rknpu2/export.md)进行转换。
-  以PPHumanSeg为例,在获取到ONNX模型后,其转换为RK3588步骤如下:
-  * 编写config.yaml文件
-  ```yaml
-  model_path: ./portrait_pp_humansegv2_lite_256x144_pretrained.onnx
-  output_folder: ./
-  target_platform: RK3588
-  normalize:
-    mean: [0.5,0.5,0.5]
-    std: [0.5,0.5,0.5]
-  outputs: None
-  ```
-  * 执行转换代码
-  ```bash
-  python /path/to/fastDeploy/toosl/export.py --config_path=/path/to/fastdeploy/tools/rknpu2/config/ppset_config.yaml
-  ```
-
-## 下载预训练模型
+## 模型转换example
 
-为了方便开发者的测试,下面提供了PaddleSeg导出的部分模型(导出方式为:**指定**`--input_shape`,**指定**`--output_op none`,**指定**`--without_argmax`),开发者可直接下载使用。
+下面以Portrait-PP-HumanSegV2_Lite(肖像分割模型)为例子,教大家如何转换PPSeg模型到RKNN模型。
+```bash
+# 下载Paddle2ONNX仓库
+git clone https://github.com/PaddlePaddle/Paddle2ONNX
 
-| 任务场景 | 模型 | 模型版本(表示已经测试的版本) | 大小 | ONNX/RKNN是否支持 | ONNX/RKNN速度(ms) |
-|------------------|-------------------|--------------------------------------------------------------------------------------------------------------------------------------------|-----|---------------|-----------------|
-| Segmentation | PP-LiteSeg | [PP_LiteSeg_T_STDC1_cityscapes](https://bj.bcebos.com/fastdeploy/models/rknn2/PP_LiteSeg_T_STDC1_cityscapes_without_argmax_infer_3588.tgz) | - | True/True | 6634/5598 |
-| Segmentation | PP-HumanSegV2Lite |
[portrait](https://bj.bcebos.com/fastdeploy/models/rknn2/portrait_pp_humansegv2_lite_256x144_inference_model_without_softmax_3588.tgz) | - | True/True | 456/266 | -| Segmentation | PP-HumanSegV2Lite | [human](https://bj.bcebos.com/fastdeploy/models/rknn2/human_pp_humansegv2_lite_192x192_pretrained_3588.tgz) | - | True/True | 496/256 | +# 下载Paddle静态图模型并为Paddle静态图模型固定输入shape +## 进入为Paddle静态图模型固定输入shape的目录 +cd Paddle2ONNX/tools/paddle +## 下载Paddle静态图模型并解压 +wget https://bj.bcebos.com/paddlehub/fastdeploy/Portrait_PP_HumanSegV2_Lite_256x144_infer.tgz +tar xvf Portrait_PP_HumanSegV2_Lite_256x144_infer.tgz +python paddle_infer_shape.py --model_dir Portrait_PP_HumanSegV2_Lite_256x144_infer/ \ + --model_filename model.pdmodel \ + --params_filename model.pdiparams \ + --save_dir Portrait_PP_HumanSegV2_Lite_256x144_infer \ + --input_shape_dict="{'x':[1,3,144,256]}" + +# 静态图转ONNX模型,注意,这里的save_file请和压缩包名对齐 +paddle2onnx --model_dir Portrait_PP_HumanSegV2_Lite_256x144_infer \ + --model_filename model.pdmodel \ + --params_filename model.pdiparams \ + --save_file Portrait_PP_HumanSegV2_Lite_256x144_infer/Portrait_PP_HumanSegV2_Lite_256x144_infer.onnx \ + --enable_dev_version True + +# ONNX模型转RKNN模型 +# 将ONNX模型目录拷贝到Fastdeploy根目录 +cp -r ./Portrait_PP_HumanSegV2_Lite_256x144_infer /path/to/Fastdeploy +# 转换模型,模型将生成在Portrait_PP_HumanSegV2_Lite_256x144_infer目录下 +python tools/rknpu2/export.py --config_path tools/rknpu2/config/RK3588/Portrait_PP_HumanSegV2_Lite_256x144_infer.yaml +``` + +## 修改yaml配置文件 + +在**模型转换example**中,我们对模型的shape进行了固定,因此对应的yaml文件也要进行修改,如下: + +**原yaml文件** +```yaml +Deploy: + input_shape: + - -1 + - 3 + - -1 + - -1 + model: model.pdmodel + output_dtype: float32 + output_op: none + params: model.pdiparams + transforms: + - target_size: + - 256 + - 144 + type: Resize + - type: Normalize +``` + +**修改后的yaml文件** +```yaml +Deploy: + input_shape: + - 1 + - 3 + - 144 + - 256 + model: model.pdmodel + output_dtype: float32 + output_op: none + params: model.pdiparams + transforms: + - target_size: + - 256 + - 144 + type: Resize + - type: Normalize +``` ## 详细部署文档 -- [RKNN总体部署教程](../../../../../docs/cn/faq/rknpu2.md) +- [RKNN总体部署教程](../../../../../docs/cn/faq/rknpu2/rknpu2.md) - [C++部署](cpp) -- [Python部署](python) \ No newline at end of file +- [Python部署](python) diff --git a/examples/vision/segmentation/paddleseg/rknpu2/cpp/README.md b/examples/vision/segmentation/paddleseg/rknpu2/cpp/README.md index 17defad017..0b47f0baf9 100644 --- a/examples/vision/segmentation/paddleseg/rknpu2/cpp/README.md +++ b/examples/vision/segmentation/paddleseg/rknpu2/cpp/README.md @@ -41,13 +41,7 @@ fastdeploy-0.0.3目录,请移动它至thirdpartys目录下. 
### 拷贝模型文件,以及配置文件至model文件夹
 在Paddle动态图模型 -> Paddle静态图模型 -> ONNX模型的过程中,将生成ONNX文件以及对应的yaml配置文件,请将配置文件存放到model文件夹内。
-转换为RKNN后的模型文件也需要拷贝至model,这里提供了转换好的文件,输入以下命令下载使用(模型文件为RK3588,RK3568需要重新[转换PPSeg RKNN模型](../README.md))。
-```bash
-cd model
-wget https://bj.bcebos.com/fastdeploy/models/rknn2/human_pp_humansegv2_lite_192x192_pretrained_3588.tgz
-tar xvf human_pp_humansegv2_lite_192x192_pretrained_3588.tgz
-cp -r ./human_pp_humansegv2_lite_192x192_pretrained_3588 ./model
-```
+转换为RKNN后的模型文件也需要拷贝至model(模型文件为RK3588,RK3568需要重新[转换PPSeg RKNN模型](../README.md))。
 
 ### 准备测试图片至image文件夹
 ```bash
@@ -81,4 +75,4 @@ RKNPU上对模型的输入要求是使用NHWC格式,且图片归一化操作
 
 - [模型介绍](../../)
 - [Python部署](../python)
-- [转换PPSeg RKNN模型文档](../README.md)
\ No newline at end of file
+- [转换PPSeg RKNN模型文档](../README.md)
diff --git a/examples/vision/segmentation/paddleseg/rknpu2/cpp/infer.cc b/examples/vision/segmentation/paddleseg/rknpu2/cpp/infer.cc
index b379a5739a..9004828b77 100644
--- a/examples/vision/segmentation/paddleseg/rknpu2/cpp/infer.cc
+++ b/examples/vision/segmentation/paddleseg/rknpu2/cpp/infer.cc
@@ -1,3 +1,16 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
 #include <iostream>
 #include <string>
 #include "fastdeploy/vision.h"
@@ -40,11 +53,11 @@ std::string GetModelPath(std::string& model_path, const std::string& device) {
 
 void InferHumanPPHumansegv2Lite(const std::string& device) {
   std::string model_file =
-      "./model/human_pp_humansegv2_lite_192x192_pretrained_3588/"
-      "human_pp_humansegv2_lite_192x192_pretrained_3588.";
+      "./model/Portrait_PP_HumanSegV2_Lite_256x144_infer/"
+      "Portrait_PP_HumanSegV2_Lite_256x144_infer_rk3588.";
   std::string params_file;
   std::string config_file =
-      "./model/human_pp_humansegv2_lite_192x192_pretrained_3588/deploy.yaml";
+      "./model/Portrait_PP_HumanSegV2_Lite_256x144_infer/deploy.yaml";
   fastdeploy::RuntimeOption option = GetOption(device);
   fastdeploy::ModelFormat format = GetFormat(device);
diff --git a/examples/vision/segmentation/paddleseg/rknpu2/python/README.md b/examples/vision/segmentation/paddleseg/rknpu2/python/README.md
index 6e8eaf1991..74aeed2a07 100644
--- a/examples/vision/segmentation/paddleseg/rknpu2/python/README.md
+++ b/examples/vision/segmentation/paddleseg/rknpu2/python/README.md
@@ -2,7 +2,7 @@
 
 在部署前,需确认以下两个步骤
 
-- 1.
软硬件环境满足要求,参考[FastDeploy环境要求](../../../../../../docs/cn/build_and_install/rknpu2.md) 【注意】如你部署的为**PP-Matting**、**PP-HumanMatting**以及**ModNet**请参考[Matting模型部署](../../../matting) @@ -13,17 +13,13 @@ git clone https://github.com/PaddlePaddle/FastDeploy.git cd FastDeploy/examples/vision/segmentation/paddleseg/python -# 下载模型 -wget https://bj.bcebos.com/fastdeploy/models/rknn2/human_pp_humansegv2_lite_192x192_pretrained_3588.tgz -tar xvf human_pp_humansegv2_lite_192x192_pretrained_3588.tgz - # 下载图片 wget https://paddleseg.bj.bcebos.com/dygraph/pp_humanseg_v2/images.zip unzip images.zip # 推理 -python3 infer.py --model_file ./human_pp_humansegv2_lite_192x192_pretrained_3588/human_pp_humansegv2_lite_192x192_pretrained_3588.rknn \ - --config_file ./human_pp_humansegv2_lite_192x192_pretrained_3588/deploy.yaml \ +python3 infer.py --model_file ./Portrait_PP_HumanSegV2_Lite_256x144_infer/Portrait_PP_HumanSegV2_Lite_256x144_infer_rk3588.rknn \ + --config_file ./Portrait_PP_HumanSegV2_Lite_256x144_infer/deploy.yaml \ --image images/portrait_heng.jpg ``` diff --git a/examples/vision/segmentation/paddleseg/rknpu2/python/infer.py b/examples/vision/segmentation/paddleseg/rknpu2/python/infer.py index 2b6034a33e..8841132a99 100644 --- a/examples/vision/segmentation/paddleseg/rknpu2/python/infer.py +++ b/examples/vision/segmentation/paddleseg/rknpu2/python/infer.py @@ -1,3 +1,16 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
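+#
+# 说明:RKNN 模型在导出时已通过配置中的 mean/std 把归一化折叠进模型本身,
+# 且 RKNPU2 要求 NHWC 输入;因此本脚本在创建模型后调用
+# model.disable_normalize_and_permute(),跳过 FastDeploy 侧的 Normalize
+# 与 HWC->CHW 预处理。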
import fastdeploy as fd
import cv2
import os
@@ -30,7 +43,11 @@
 model_file = args.model_file
 params_file = ""
 config_file = args.config_file
 model = fd.vision.segmentation.PaddleSegModel(
-    model_file, params_file, config_file, runtime_option=runtime_option,model_format=fd.ModelFormat.RKNN)
+    model_file,
+    params_file,
+    config_file,
+    runtime_option=runtime_option,
+    model_format=fd.ModelFormat.RKNN)
 
 model.disable_normalize_and_permute()
diff --git a/fastdeploy/backends/rknpu/rknpu2/rknpu2_backend.cc b/fastdeploy/backends/rknpu/rknpu2/rknpu2_backend.cc
index f488a50a6d..2f0618dbe6 100644
--- a/fastdeploy/backends/rknpu/rknpu2/rknpu2_backend.cc
+++ b/fastdeploy/backends/rknpu/rknpu2/rknpu2_backend.cc
@@ -15,11 +15,27 @@
 namespace fastdeploy {
 RKNPU2Backend::~RKNPU2Backend() {
-  if (input_attrs != nullptr) {
-    free(input_attrs);
+  // Release memory uniformly here
+  if (input_attrs_ != nullptr) {
+    free(input_attrs_);
   }
-  if (output_attrs != nullptr) {
-    free(output_attrs);
+
+  if (output_attrs_ != nullptr) {
+    free(output_attrs_);
+  }
+
+  // The mem arrays stay null until the first Infer() call, so guard them
+  // before destroying; otherwise a model that never ran would crash here.
+  if (input_mems_ != nullptr) {
+    for (uint32_t i = 0; i < io_num.n_input; i++) {
+      if (input_mems_[i] != nullptr) {
+        rknn_destroy_mem(ctx, input_mems_[i]);
+      }
+    }
+    free(input_mems_);
+  }
+
+  if (output_mems_ != nullptr) {
+    for (uint32_t i = 0; i < io_num.n_output; i++) {
+      if (output_mems_[i] != nullptr) {
+        rknn_destroy_mem(ctx, output_mems_[i]);
+      }
+    }
+    free(output_mems_);
   }
 }
 /***************************************************************
@@ -150,56 +166,85 @@ bool RKNPU2Backend::GetModelInputOutputInfos() {
   }
 
   // Get detailed input parameters
-  input_attrs =
+  input_attrs_ =
       (rknn_tensor_attr*)malloc(sizeof(rknn_tensor_attr) * io_num.n_input);
-  memset(input_attrs, 0, io_num.n_input * sizeof(rknn_tensor_attr));
+  memset(input_attrs_, 0, io_num.n_input * sizeof(rknn_tensor_attr));
   inputs_desc_.resize(io_num.n_input);
+
+  // create input tensor memory (calloc keeps unused slots null until
+  // Infer() fills them in)
+  input_mems_ =
+      (rknn_tensor_mem**)calloc(io_num.n_input, sizeof(rknn_tensor_mem*));
+
+  // get input info and copy to input tensor info
   for (uint32_t i = 0; i < io_num.n_input; i++) {
-    input_attrs[i].index = i;
+    input_attrs_[i].index = i;
     // query info
-    ret = rknn_query(ctx, RKNN_QUERY_INPUT_ATTR, &(input_attrs[i]),
+    ret = rknn_query(ctx, RKNN_QUERY_INPUT_ATTR, &(input_attrs_[i]),
                      sizeof(rknn_tensor_attr));
     if (ret != RKNN_SUCC) {
      printf("rknn_init error! ret=%d\n", ret);
      return false;
    }
-    std::string temp_name = input_attrs[i].name;
-    std::vector<int> temp_shape{};
-    temp_shape.resize(input_attrs[i].n_dims);
-    for (int j = 0; j < input_attrs[i].n_dims; j++) {
-      temp_shape[j] = (int)input_attrs[i].dims[j];
+    if ((input_attrs_[i].fmt != RKNN_TENSOR_NHWC) &&
+        (input_attrs_[i].fmt != RKNN_TENSOR_UNDEFINED)) {
+      FDERROR << "rknpu2_backend only supports input format NHWC or UNDEFINED"
+              << std::endl;
     }
+    // copy input_attrs_ to input tensor info
+    std::string temp_name = input_attrs_[i].name;
+    std::vector<int> temp_shape{};
+    temp_shape.resize(input_attrs_[i].n_dims);
+    for (int j = 0; j < input_attrs_[i].n_dims; j++) {
+      temp_shape[j] = (int)input_attrs_[i].dims[j];
+    }
     FDDataType temp_dtype =
         fastdeploy::RKNPU2Backend::RknnTensorTypeToFDDataType(
-            input_attrs[i].type);
+            input_attrs_[i].type);
     TensorInfo temp_input_info = {temp_name, temp_shape, temp_dtype};
     inputs_desc_[i] = temp_input_info;
   }
 
   // Get detailed output parameters
-  output_attrs =
+  output_attrs_ =
       (rknn_tensor_attr*)malloc(sizeof(rknn_tensor_attr) * io_num.n_output);
-  memset(output_attrs, 0, io_num.n_output * sizeof(rknn_tensor_attr));
+  memset(output_attrs_, 0, io_num.n_output * sizeof(rknn_tensor_attr));
   outputs_desc_.resize(io_num.n_output);
+
+  // Create output tensor memory (calloc keeps unused slots null)
+  output_mems_ =
+      (rknn_tensor_mem**)calloc(io_num.n_output, sizeof(rknn_tensor_mem*));
+
   for (uint32_t i = 0; i < io_num.n_output; i++) {
-    output_attrs[i].index = i;
+    output_attrs_[i].index = i;
     // query info
-    ret = rknn_query(ctx, RKNN_QUERY_OUTPUT_ATTR, &(output_attrs[i]),
+    ret = rknn_query(ctx, RKNN_QUERY_OUTPUT_ATTR, &(output_attrs_[i]),
                      sizeof(rknn_tensor_attr));
     if (ret != RKNN_SUCC) {
       FDERROR << "rknn_query fail! ret = " << ret << std::endl;
       return false;
     }
-    std::string temp_name = output_attrs[i].name;
-    std::vector<int> temp_shape{};
-    temp_shape.resize(output_attrs[i].n_dims);
-    for (int j = 0; j < output_attrs[i].n_dims; j++) {
-      temp_shape[j] = (int)output_attrs[i].dims[j];
+
+    // The runtime pads 3-D outputs to 4-D with a trailing dimension of 1;
+    // strip that padded dimension here so post-processing that inspects the
+    // output shape sees the real rank.
+    int n_dims = output_attrs_[i].n_dims;
+    if ((n_dims == 4) && (output_attrs_[i].dims[3] == 1)) {
+      n_dims--;
+      FDWARNING << "The output["
+                << i
+                << "].shape[3] is 1, remove this dim."
+                << std::endl;
     }
+
+    // copy output_attrs_ to output tensor
+    std::string temp_name = output_attrs_[i].name;
+    std::vector<int> temp_shape{};
+    temp_shape.resize(n_dims);
+    for (int j = 0; j < n_dims; j++) {
+      temp_shape[j] = (int)output_attrs_[i].dims[j];
+    }
+
     FDDataType temp_dtype =
         fastdeploy::RKNPU2Backend::RknnTensorTypeToFDDataType(
-            output_attrs[i].type);
+            output_attrs_[i].type);
     TensorInfo temp_input_info = {temp_name, temp_shape, temp_dtype};
     outputs_desc_[i] = temp_input_info;
   }
@@ -254,82 +299,77 @@ bool RKNPU2Backend::Infer(std::vector<FDTensor>& inputs,
     return false;
   }
 
-  // the input size only can be one
-  if (inputs.size() > 1) {
-    FDERROR << "[RKNPU2Backend] Size of the inputs only support 1."
-            << std::endl;
-    return false;
+  if (!this->infer_init) {
+    for (uint32_t i = 0; i < io_num.n_input; i++) {
+      // Judge whether the input and output types are the same
+      rknn_tensor_type input_type =
+          fastdeploy::RKNPU2Backend::FDDataTypeToRknnTensorType(inputs[i].dtype);
+      if (input_type != input_attrs_[i].type) {
+        FDWARNING << "The input tensor type != model's inputs type."
+                  << " The model expects input type "
+                  << get_type_string(input_attrs_[i].type)
+                  << ", but inputs[" << i << "].type is "
+                  << get_type_string(input_type) << std::endl;
+      }
+
+      // Create input tensor memory
+      input_attrs_[i].type = input_type;
+      input_attrs_[i].size = inputs[i].Nbytes();
+      input_attrs_[i].size_with_stride = inputs[i].Nbytes();
+      input_attrs_[i].pass_through = 0;
+      input_mems_[i] = rknn_create_mem(ctx, inputs[i].Nbytes());
+      if (input_mems_[i] == nullptr) {
+        FDERROR << "rknn_create_mem input_mems_ error." << std::endl;
+        return false;
+      }
+
+      // Set input tensor memory
+      ret = rknn_set_io_mem(ctx, input_mems_[i], &input_attrs_[i]);
+      if (ret != RKNN_SUCC) {
+        FDERROR << "input tensor memory rknn_set_io_mem fail! ret=" << ret
+                << std::endl;
+        return false;
+      }
+    }
+
+    for (uint32_t i = 0; i < io_num.n_output; ++i) {
+      // Most post-processing does not support the fp16 format.
+      // The unified output here is float32
+      uint32_t output_size = output_attrs_[i].n_elems * sizeof(float);
+      output_mems_[i] = rknn_create_mem(ctx, output_size);
+      if (output_mems_[i] == nullptr) {
+        FDERROR << "rknn_create_mem output_mems_ error." << std::endl;
+        return false;
+      }
+      // default output type is depend on model, this requires float32 to compute top5
+      output_attrs_[i].type = RKNN_TENSOR_FLOAT32;
+      ret = rknn_set_io_mem(ctx, output_mems_[i], &output_attrs_[i]);
+      // set output memory and attribute
+      if (ret != RKNN_SUCC) {
+        FDERROR << "output tensor memory rknn_set_io_mem fail! ret=" << ret
+                << std::endl;
+        return false;
+      }
+    }
+
+    this->infer_init = true;
   }
-
-  // Judge whether the input and output types are the same
-  rknn_tensor_type input_type =
-      fastdeploy::RKNPU2Backend::FDDataTypeToRknnTensorType(inputs[0].dtype);
-  if (input_type != input_attrs[0].type) {
-    FDWARNING << "The input tensor type != model's inputs type."
-              << "The input_type need " << get_type_string(input_attrs[0].type)
-              << ",but inputs[0].type is " << get_type_string(input_type)
-              << std::endl;
-  }
-
-  rknn_tensor_format input_layout =
-      RKNN_TENSOR_NHWC;  // RK3588 only support NHWC
-  input_attrs[0].type = input_type;
-  input_attrs[0].fmt = input_layout;
-  input_attrs[0].size = inputs[0].Nbytes();
-  input_attrs[0].size_with_stride = inputs[0].Nbytes();
-  input_attrs[0].pass_through = 0;
-
-  // create input tensor memory
-  rknn_tensor_mem* input_mems[1];
-  input_mems[0] = rknn_create_mem(ctx, inputs[0].Nbytes());
-  if (input_mems[0] == nullptr) {
-    FDERROR << "rknn_create_mem input_mems error." << std::endl;
-    return false;
-  }
-
+
   // Copy input data to input tensor memory
-  uint32_t width = input_attrs[0].dims[2];
-  uint32_t stride = input_attrs[0].w_stride;
-  if (width == stride) {
-    if (inputs[0].Data() == nullptr) {
-      FDERROR << "inputs[0].Data is NULL." << std::endl;
-      return false;
-    }
-    memcpy(input_mems[0]->virt_addr, inputs[0].Data(), inputs[0].Nbytes());
-  } else {
-    FDERROR << "[RKNPU2Backend] only support width == stride." << std::endl;
-    return false;
-  }
-
-  // Create output tensor memory
-  rknn_tensor_mem* output_mems[io_num.n_output];
-  for (uint32_t i = 0; i < io_num.n_output; ++i) {
-    // Most post-processing does not support the fp16 format.
-    // The unified output here is float32
-    uint32_t output_size = output_attrs[i].n_elems * sizeof(float);
-    output_mems[i] = rknn_create_mem(ctx, output_size);
-  }
-
-  // Set input tensor memory
-  ret = rknn_set_io_mem(ctx, input_mems[0], &input_attrs[0]);
-  if (ret != RKNN_SUCC) {
-    FDERROR << "input tensor memory rknn_set_io_mem fail! ret=" << ret
ret=" << ret - << std::endl; - return false; - } - - // Set output tensor memory - for (uint32_t i = 0; i < io_num.n_output; ++i) { - // default output type is depend on model, this requires float32 to compute top5 - output_attrs[i].type = RKNN_TENSOR_FLOAT32; - ret = rknn_set_io_mem(ctx, output_mems[i], &output_attrs[i]); - // set output memory and attribute - if (ret != RKNN_SUCC) { - FDERROR << "output tensor memory rknn_set_io_mem fail! ret=" << ret - << std::endl; + for (uint32_t i = 0; i < io_num.n_input; i++) { + uint32_t width = input_attrs_[i].dims[2]; + uint32_t stride = input_attrs_[i].w_stride; + if (width == stride) { + if (inputs[i].Data() == nullptr) { + FDERROR << "inputs[0].Data is NULL." << std::endl; + return false; + } + memcpy(input_mems_[i]->virt_addr, inputs[i].Data(), inputs[i].Nbytes()); + } else { + FDERROR << "[RKNPU2Backend] only support width == stride." << std::endl; return false; } } + // run rknn ret = rknn_run(ctx, nullptr); @@ -337,7 +377,6 @@ bool RKNPU2Backend::Infer(std::vector& inputs, FDERROR << "rknn run error! ret=" << ret << std::endl; return false; } - rknn_destroy_mem(ctx, input_mems[0]); // get result outputs->resize(outputs_desc_.size()); @@ -349,9 +388,8 @@ bool RKNPU2Backend::Infer(std::vector& inputs, } (*outputs)[i].Resize(temp_shape, outputs_desc_[i].dtype, outputs_desc_[i].name); - memcpy((*outputs)[i].MutableData(), (float*)output_mems[i]->virt_addr, + memcpy((*outputs)[i].MutableData(), (float*)output_mems_[i]->virt_addr, (*outputs)[i].Nbytes()); - rknn_destroy_mem(ctx, output_mems[i]); } return true; diff --git a/fastdeploy/backends/rknpu/rknpu2/rknpu2_backend.h b/fastdeploy/backends/rknpu/rknpu2/rknpu2_backend.h index 643d600068..1aba24ec3b 100644 --- a/fastdeploy/backends/rknpu/rknpu2/rknpu2_backend.h +++ b/fastdeploy/backends/rknpu/rknpu2/rknpu2_backend.h @@ -86,8 +86,13 @@ class RKNPU2Backend : public BaseBackend { std::vector inputs_desc_; std::vector outputs_desc_; - rknn_tensor_attr* input_attrs = nullptr; - rknn_tensor_attr* output_attrs = nullptr; + rknn_tensor_attr* input_attrs_ = nullptr; + rknn_tensor_attr* output_attrs_ = nullptr; + + rknn_tensor_mem** input_mems_; + rknn_tensor_mem** output_mems_; + + bool infer_init = false; RKNPU2BackendOption option_; diff --git a/tools/rknpu2/config/RK3568/Portrait_PP_HumanSegV2_Lite_256x144_infer.yaml b/tools/rknpu2/config/RK3568/Portrait_PP_HumanSegV2_Lite_256x144_infer.yaml new file mode 100644 index 0000000000..d78a46a665 --- /dev/null +++ b/tools/rknpu2/config/RK3568/Portrait_PP_HumanSegV2_Lite_256x144_infer.yaml @@ -0,0 +1,7 @@ +model_path: ./Portrait_PP_HumanSegV2_Lite_256x144_infer/Portrait_PP_HumanSegV2_Lite_256x144_infer.onnx +output_folder: ./Portrait_PP_HumanSegV2_Lite_256x144_infer +target_platform: RK3568 +normalize: + mean: [[0.5,0.5,0.5]] + std: [[0.5,0.5,0.5]] +outputs: None diff --git a/tools/rknpu2/config/RK3588/Portrait_PP_HumanSegV2_Lite_256x144_infer.yaml b/tools/rknpu2/config/RK3588/Portrait_PP_HumanSegV2_Lite_256x144_infer.yaml new file mode 100644 index 0000000000..98fe9c6b5d --- /dev/null +++ b/tools/rknpu2/config/RK3588/Portrait_PP_HumanSegV2_Lite_256x144_infer.yaml @@ -0,0 +1,7 @@ +model_path: ./Portrait_PP_HumanSegV2_Lite_256x144_infer/Portrait_PP_HumanSegV2_Lite_256x144_infer.onnx +output_folder: ./Portrait_PP_HumanSegV2_Lite_256x144_infer +target_platform: RK3588 +normalize: + mean: [[0.5,0.5,0.5]] + std: [[0.5,0.5,0.5]] +outputs: None diff --git a/tools/rknpu2/config/ppseg_config.yaml b/tools/rknpu2/config/ppseg_config.yaml deleted file mode 100644 
index a029bdb95f..0000000000 --- a/tools/rknpu2/config/ppseg_config.yaml +++ /dev/null @@ -1,7 +0,0 @@ -model_path: ./portrait_pp_humansegv2_lite_256x144_pretrained.onnx -output_folder: ./ -target_platform: RK3588 -normalize: - mean: [0.5,0.5,0.5] - std: [0.5,0.5,0.5] -outputs: None diff --git a/tools/rknpu2/export.py b/tools/rknpu2/export.py index 12dfe18c96..ee43a18091 100644 --- a/tools/rknpu2/export.py +++ b/tools/rknpu2/export.py @@ -2,10 +2,6 @@ # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 @@ -38,11 +34,17 @@ if __name__ == "__main__": model = RKNN(config.verbose) # Config - mean_values = [[255 * mean for mean in yaml_config["normalize"]["mean"]]] - std_values = [[255 * std for std in yaml_config["normalize"]["std"]]] - model.config(mean_values=mean_values, - std_values=std_values, - target_platform=yaml_config["target_platform"]) + if yaml_config["normalize"] == "None": + model.config(target_platform=yaml_config["target_platform"]) + else: + mean_values = [[256 * mean for mean in mean_ls] + for mean_ls in yaml_config["normalize"]["mean"]] + std_values = [[256 * std for std in std_ls] + for std_ls in yaml_config["normalize"]["std"]] + model.config( + mean_values=mean_values, + std_values=std_values, + target_platform=yaml_config["target_platform"]) # Load ONNX model print(type(yaml_config["outputs"])) @@ -50,8 +52,8 @@ if __name__ == "__main__": if yaml_config["outputs"] == "None": ret = model.load_onnx(model=yaml_config["model_path"]) else: - ret = model.load_onnx(model=yaml_config["model_path"], - outputs=yaml_config["outputs"]) + ret = model.load_onnx( + model=yaml_config["model_path"], outputs=yaml_config["outputs"]) assert ret == 0, "Load model failed!" 
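+    # 说明:yaml 中若要省略 normalize/outputs,写的是字符串 "None"(YAML 不会把
+    # None 解析成 null),因此上面用字符串比较来判断;mean/std 以 0~1 量纲书写,
+    # 乘以 256(近似 0~255 像素量纲)换算成 RKNN config() 期望的取值,嵌套列表
+    # 则对应多输入模型,每个输入各有一组 mean/std。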
# Build model From 7150e6405c29c809c24fa22b11eac4271426791b Mon Sep 17 00:00:00 2001 From: WJJ1995 Date: Fri, 4 Nov 2022 11:00:35 +0800 Subject: [PATCH 13/18] [Model] Add FSANet model (#448) * add yolov5cls * fixed bugs * fixed bugs * fixed preprocess bug * add yolov5cls readme * deal with comments * Add YOLOv5Cls Note * add yolov5cls test * add rvm support * support rvm model * add rvm demo * fixed bugs * add rvm readme * add TRT support * add trt support * add rvm test * add EXPORT.md * rename export.md * rm poros doxyen * deal with comments * deal with comments * add rvm video_mode note * add fsanet * fixed bug * update readme * fixed for ci * deal with comments * deal with comments * deal with comments Co-authored-by: Jason Co-authored-by: DefTruth <31974251+DefTruth@users.noreply.github.com> --- docs/api/vision_results/README.md | 1 + docs/api/vision_results/headpose_result.md | 25 ++++ examples/CMakeLists.txt | 2 +- examples/vision/README.md | 24 ++-- .../vision/facealign/pfld/cpp/CMakeLists.txt | 6 +- .../vision/facealign/pfld/python/README.md | 1 - .../vision/facealign/pfld/python/infer.py | 6 +- examples/vision/headpose/README.md | 7 + examples/vision/headpose/fsanet/README.md | 25 ++++ .../vision/headpose/fsanet/cpp/CMakeLists.txt | 18 +++ examples/vision/headpose/fsanet/cpp/README.md | 74 ++++++++++ examples/vision/headpose/fsanet/cpp/infer.cc | 110 +++++++++++++++ .../vision/headpose/fsanet/python/README.md | 67 +++++++++ .../vision/headpose/fsanet/python/infer.py | 88 ++++++++++++ fastdeploy/vision.h | 1 + fastdeploy/vision/common/result.cc | 23 +++ fastdeploy/vision/common/result.h | 22 ++- fastdeploy/vision/headpose/contrib/fsanet.cc | 132 ++++++++++++++++++ fastdeploy/vision/headpose/contrib/fsanet.h | 64 +++++++++ .../vision/headpose/contrib/fsanet_pybind.cc | 31 ++++ fastdeploy/vision/headpose/headpose_pybind.cc | 25 ++++ .../vision/tracking/pptracking/trajectory.h | 0 fastdeploy/vision/vision_pybind.cc | 11 +- fastdeploy/vision/visualize/headpose.cc | 59 ++++++++ fastdeploy/vision/visualize/visualize.h | 8 +- .../vision/visualize/visualize_pybind.cc | 10 ++ python/fastdeploy/vision/__init__.py | 1 + python/fastdeploy/vision/headpose/__init__.py | 16 +++ .../vision/headpose/contrib/__init__.py | 15 ++ .../vision/headpose/contrib/fsanet.py | 68 +++++++++ .../fastdeploy/vision/visualize/__init__.py | 4 + 31 files changed, 922 insertions(+), 22 deletions(-) create mode 100644 docs/api/vision_results/headpose_result.md mode change 100644 => 100755 examples/CMakeLists.txt mode change 100644 => 100755 examples/vision/README.md mode change 100644 => 100755 examples/vision/facealign/pfld/cpp/CMakeLists.txt mode change 100644 => 100755 examples/vision/facealign/pfld/python/README.md mode change 100644 => 100755 examples/vision/facealign/pfld/python/infer.py create mode 100644 examples/vision/headpose/README.md create mode 100644 examples/vision/headpose/fsanet/README.md create mode 100644 examples/vision/headpose/fsanet/cpp/CMakeLists.txt create mode 100644 examples/vision/headpose/fsanet/cpp/README.md create mode 100644 examples/vision/headpose/fsanet/cpp/infer.cc create mode 100644 examples/vision/headpose/fsanet/python/README.md create mode 100644 examples/vision/headpose/fsanet/python/infer.py mode change 100644 => 100755 fastdeploy/vision/common/result.cc create mode 100644 fastdeploy/vision/headpose/contrib/fsanet.cc create mode 100644 fastdeploy/vision/headpose/contrib/fsanet.h create mode 100644 fastdeploy/vision/headpose/contrib/fsanet_pybind.cc create mode 100644 
fastdeploy/vision/headpose/headpose_pybind.cc mode change 100644 => 100755 fastdeploy/vision/tracking/pptracking/trajectory.h mode change 100644 => 100755 fastdeploy/vision/vision_pybind.cc create mode 100644 fastdeploy/vision/visualize/headpose.cc mode change 100644 => 100755 fastdeploy/vision/visualize/visualize.h mode change 100644 => 100755 fastdeploy/vision/visualize/visualize_pybind.cc mode change 100644 => 100755 python/fastdeploy/vision/__init__.py create mode 100644 python/fastdeploy/vision/headpose/__init__.py create mode 100644 python/fastdeploy/vision/headpose/contrib/__init__.py create mode 100644 python/fastdeploy/vision/headpose/contrib/fsanet.py diff --git a/docs/api/vision_results/README.md b/docs/api/vision_results/README.md index 9b2dc50839..62a8e2eb60 100755 --- a/docs/api/vision_results/README.md +++ b/docs/api/vision_results/README.md @@ -14,3 +14,4 @@ FastDeploy根据视觉模型的任务类型,定义了不同的结构体(`fastd | MattingResult | [C++/Python文档](./matting_result.md) | 图片/视频抠图返回结果 | MODNet、RVM系列模型等 | | OCRResult | [C++/Python文档](./ocr_result.md) | 文本框检测,分类和文本识别返回结果 | OCR系列模型等 | | MOTResult | [C++/Python文档](./mot_result.md) | 多目标跟踪返回结果 | pptracking系列模型等 | +| HeadPoseResult | [C++/Python文档](./headpose_result.md) | 头部姿态估计返回结果 | FSANet系列模型等 | diff --git a/docs/api/vision_results/headpose_result.md b/docs/api/vision_results/headpose_result.md new file mode 100644 index 0000000000..d1daa84cc5 --- /dev/null +++ b/docs/api/vision_results/headpose_result.md @@ -0,0 +1,25 @@ +# HeadPoseResult 头部姿态结果 + +HeadPoseResult 代码定义在`fastdeploy/vision/common/result.h`中,用于表明头部姿态结果。 + +## C++ 定义 + +`fastdeploy::vision::HeadPoseResult` + +```c++ +struct HeadPoseResult { + std::vector euler_angles; + void Clear(); + std::string Str(); +}; +``` + +- **euler_angles**: 成员变量,表示单张人脸图片预测的欧拉角,存放的顺序是(yaw, pitch, roll), yaw 代表水平转角,pitch 代表垂直角,roll 代表翻滚角,值域都为 [-90,+90]度 +- **Clear()**: 成员函数,用于清除结构体中存储的结果 +- **Str()**: 成员函数,将结构体中的信息以字符串形式输出(用于Debug) + +## Python 定义 + +`fastdeploy.vision.HeadPoseResult` + +- **euler_angles**(list of float): 成员变量,表示单张人脸图片预测的欧拉角,存放的顺序是(yaw, pitch, roll), yaw 代表水平转角,pitch 代表垂直角,roll 代表翻滚角,值域都为 [-90,+90]度 diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt old mode 100644 new mode 100755 index e0d99a30ab..7118460ea6 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -49,7 +49,7 @@ function(add_fastdeploy_executable FIELD CC_FILE) add_executable(${TEMP_TARGET_NAME} ${TEMP_TARGET_FILE}) target_link_libraries(${TEMP_TARGET_NAME} PUBLIC fastdeploy) if(TARGET gflags) - if(NOT ANDROID) + if(UNIX) target_link_libraries(${TEMP_TARGET_NAME} PRIVATE gflags pthread) else() target_link_libraries(${TEMP_TARGET_NAME} PRIVATE gflags) diff --git a/examples/vision/README.md b/examples/vision/README.md old mode 100644 new mode 100755 index 03cdf7f404..f439d6e721 --- a/examples/vision/README.md +++ b/examples/vision/README.md @@ -2,17 +2,19 @@ 本目录下提供了各类视觉模型的部署,主要涵盖以下任务类型 -| 任务类型 | 说明 | 预测结果结构体 | -|:------------------|:------------------------------------------------|:-------------------------------------------------------------------------------------| -| Detection | 目标检测,输入图像,检测图像中物体位置,并返回检测框坐标及类别和置信度 | [DetectionResult](../../docs/api/vision_results/detection_result.md) | -| Segmentation | 语义分割,输入图像,给出图像中每个像素的分类及置信度 | [SegmentationResult](../../docs/api/vision_results/segmentation_result.md) | -| Classification | 图像分类,输入图像,给出图像的分类结果和置信度 | [ClassifyResult](../../docs/api/vision_results/classification_result.md) | -| FaceDetection | 人脸检测,输入图像,检测图像中人脸位置,并返回检测框坐标及人脸关键点 | 
[FaceDetectionResult](../../docs/api/vision_results/face_detection_result.md) | -| KeypointDetection | 关键点检测,输入图像,返回图像中人物行为的各个关键点坐标和置信度 | [KeyPointDetectionResult](../../docs/api/vision_results/keypointdetection_result.md) | -| FaceRecognition | 人脸识别,输入图像,返回可用于相似度计算的人脸特征的embedding | [FaceRecognitionResult](../../docs/api/vision_results/face_recognition_result.md) | -| Matting | 抠图,输入图像,返回图片的前景每个像素点的Alpha值 | [MattingResult](../../docs/api/vision_results/matting_result.md) | -| OCR | 文本框检测,分类,文本框内容识别,输入图像,返回文本框坐标,文本框的方向类别以及框内的文本内容 | [OCRResult](../../docs/api/vision_results/ocr_result.md) | -| MOT | 多目标跟踪,输入图像,检测图像中物体位置,并返回检测框坐标,对象id及类别置信度 | [MOTResult](../../docs/api/vision_results/mot_result.md) | +| 任务类型 | 说明 | 预测结果结构体 | +|:-------------- |:----------------------------------- |:-------------------------------------------------------------------------------- | +| Detection | 目标检测,输入图像,检测图像中物体位置,并返回检测框坐标及类别和置信度 | [DetectionResult](../../docs/api/vision_results/detection_result.md) | +| Segmentation | 语义分割,输入图像,给出图像中每个像素的分类及置信度 | [SegmentationResult](../../docs/api/vision_results/segmentation_result.md) | +| Classification | 图像分类,输入图像,给出图像的分类结果和置信度 | [ClassifyResult](../../docs/api/vision_results/classification_result.md) | +| FaceDetection | 人脸检测,输入图像,检测图像中人脸位置,并返回检测框坐标及人脸关键点 | [FaceDetectionResult](../../docs/api/vision_results/face_detection_result.md) | +| FaceAlignment | 人脸对齐(人脸关键点检测),输入图像,返回人脸关键点 | [FaceAlignmentResult](../../docs/api/vision_results/face_alignment_result.md) | +| KeypointDetection | 关键点检测,输入图像,返回图像中人物行为的各个关键点坐标和置信度 | [KeyPointDetectionResult](../../docs/api/vision_results/keypointdetection_result.md) | +| FaceRecognition | 人脸识别,输入图像,返回可用于相似度计算的人脸特征的embedding | [FaceRecognitionResult](../../docs/api/vision_results/face_recognition_result.md) | +| Matting | 抠图,输入图像,返回图片的前景每个像素点的Alpha值 | [MattingResult](../../docs/api/vision_results/matting_result.md) | +| OCR | 文本框检测,分类,文本框内容识别,输入图像,返回文本框坐标,文本框的方向类别以及框内的文本内容 | [OCRResult](../../docs/api/vision_results/ocr_result.md) | +| MOT | 多目标跟踪,输入图像,检测图像中物体位置,并返回检测框坐标,对象id及类别置信度 | [MOTResult](../../docs/api/vision_results/mot_result.md) | +| HeadPose | 头部姿态估计,返回头部欧拉角 | [HeadPoseResult](../../docs/api/vision_results/headpose_result.md) | ## FastDeploy API设计 diff --git a/examples/vision/facealign/pfld/cpp/CMakeLists.txt b/examples/vision/facealign/pfld/cpp/CMakeLists.txt old mode 100644 new mode 100755 index c6c754a4b5..be329f69ac --- a/examples/vision/facealign/pfld/cpp/CMakeLists.txt +++ b/examples/vision/facealign/pfld/cpp/CMakeLists.txt @@ -11,4 +11,8 @@ include_directories(${FASTDEPLOY_INCS}) add_executable(infer_demo ${PROJECT_SOURCE_DIR}/infer.cc) # 添加FastDeploy库依赖 -target_link_libraries(infer_demo ${FASTDEPLOY_LIBS} gflags pthread) +if(UNIX) + target_link_libraries(infer_demo ${FASTDEPLOY_LIBS} gflags pthread) +else() + target_link_libraries(infer_demo ${FASTDEPLOY_LIBS} gflags) +endif() diff --git a/examples/vision/facealign/pfld/python/README.md b/examples/vision/facealign/pfld/python/README.md old mode 100644 new mode 100755 index e9fdf545e4..d68e2d083b --- a/examples/vision/facealign/pfld/python/README.md +++ b/examples/vision/facealign/pfld/python/README.md @@ -16,7 +16,6 @@ cd FastDeploy/examples/vision/facealign/pfld/python ## 原版ONNX模型 wget https://bj.bcebos.com/paddlehub/fastdeploy/pfld-106-lite.onnx wget https://bj.bcebos.com/paddlehub/fastdeploy/facealign_input.png - # CPU推理 python infer.py --model pfld-106-lite.onnx --image facealign_input.png --device cpu # GPU推理 diff --git 
a/examples/vision/facealign/pfld/python/infer.py b/examples/vision/facealign/pfld/python/infer.py
old mode 100644
new mode 100755
index d185cb5321..6eb5b720b8
--- a/examples/vision/facealign/pfld/python/infer.py
+++ b/examples/vision/facealign/pfld/python/infer.py
@@ -17,11 +17,11 @@ def parse_arguments():
     parser.add_argument(
         "--backend",
         type=str,
-        default="ort",
-        help="inference backend, ort, ov, trt, paddle, paddle_trt.")
+        default="default",
+        help="inference backend, default, ort, ov, trt, paddle, paddle_trt.")
     parser.add_argument(
         "--enable_trt_fp16",
-        type=bool,
+        type=ast.literal_eval,
         default=False,
         help="whether enable fp16 in trt/paddle_trt backend")
     return parser.parse_args()
diff --git a/examples/vision/headpose/README.md b/examples/vision/headpose/README.md
new file mode 100644
index 0000000000..d4be67871c
--- /dev/null
+++ b/examples/vision/headpose/README.md
@@ -0,0 +1,7 @@
+# 头部姿态模型
+
+FastDeploy目前支持如下头部姿态模型部署
+
+| 模型 | 说明 | 模型格式 | 版本 |
+| :--- | :--- | :------- | :--- |
+| [omasaht/headpose-fsanet-pytorch](./fsanet) | FSANet 系列模型 | ONNX | [CommitID:002549c](https://github.com/omasaht/headpose-fsanet-pytorch/commit/002549c) |
diff --git a/examples/vision/headpose/fsanet/README.md b/examples/vision/headpose/fsanet/README.md
new file mode 100644
index 0000000000..8cddca2cc5
--- /dev/null
+++ b/examples/vision/headpose/fsanet/README.md
@@ -0,0 +1,25 @@
+# FSANet 模型部署
+
+## 模型版本说明
+
+- [FSANet](https://github.com/omasaht/headpose-fsanet-pytorch/commit/002549c)
+
+## 支持模型列表
+
+目前FastDeploy支持如下模型的部署
+
+- [FSANet 模型](https://github.com/omasaht/headpose-fsanet-pytorch)
+
+## 下载预训练模型
+
+为了方便开发者的测试,下面提供了FSANet导出的各系列模型,开发者可直接下载使用。
+
+| 模型 | 参数大小 | 精度 | 备注 |
+|:---------------------------------------------------------------- |:----- |:----- | :------ |
+| [fsanet-1x1.onnx](https://bj.bcebos.com/paddlehub/fastdeploy/fsanet-1x1.onnx) | 1.2MB | - | - |
+| [fsanet-var.onnx](https://bj.bcebos.com/paddlehub/fastdeploy/fsanet-var.onnx) | 1.2MB | - | - |
+
+## 详细部署文档
+
+- [Python部署](python)
+- [C++部署](cpp)
diff --git a/examples/vision/headpose/fsanet/cpp/CMakeLists.txt b/examples/vision/headpose/fsanet/cpp/CMakeLists.txt
new file mode 100644
index 0000000000..be329f69ac
--- /dev/null
+++ b/examples/vision/headpose/fsanet/cpp/CMakeLists.txt
@@ -0,0 +1,18 @@
+PROJECT(infer_demo C CXX)
+CMAKE_MINIMUM_REQUIRED (VERSION 3.10)
+
+# 指定下载解压后的fastdeploy库路径
+option(FASTDEPLOY_INSTALL_DIR "Path of downloaded fastdeploy sdk.")
+include(${FASTDEPLOY_INSTALL_DIR}/utils/gflags.cmake)
+include(${FASTDEPLOY_INSTALL_DIR}/FastDeploy.cmake)
+
+# 添加FastDeploy依赖头文件
+include_directories(${FASTDEPLOY_INCS})
+
+add_executable(infer_demo ${PROJECT_SOURCE_DIR}/infer.cc)
+# 添加FastDeploy库依赖
+if(UNIX)
+  target_link_libraries(infer_demo ${FASTDEPLOY_LIBS} gflags pthread)
+else()
+  target_link_libraries(infer_demo ${FASTDEPLOY_LIBS} gflags)
+endif()
diff --git a/examples/vision/headpose/fsanet/cpp/README.md b/examples/vision/headpose/fsanet/cpp/README.md
new file mode 100644
index 0000000000..9fc719192b
--- /dev/null
+++ b/examples/vision/headpose/fsanet/cpp/README.md
@@ -0,0 +1,74 @@
+# FSANet C++部署示例
+
+本目录下提供`infer.cc`快速完成FSANet在CPU/GPU,以及GPU上通过TensorRT加速部署的示例。
+
+在部署前,需确认以下两个步骤
+
+- 1. 软硬件环境满足要求,参考[FastDeploy环境要求](../../../../../docs/cn/build_and_install/download_prebuilt_libraries.md)
+- 2.
根据开发环境,下载预编译部署库和samples代码,参考[FastDeploy预编译库](../../../../../docs/cn/build_and_install/download_prebuilt_libraries.md) + +以Linux上CPU推理为例,在本目录执行如下命令即可完成编译测试,保证 FastDeploy 版本0.6.0以上(x.x.x >= 0.6.0)支持FSANet模型 + +```bash +mkdir build +cd build +wget https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-linux-x64-x.x.x.tgz +tar xvf fastdeploy-linux-x64-x.x.x.tgz +cmake .. -DFASTDEPLOY_INSTALL_DIR=${PWD}/fastdeploy-linux-x64-x.x.x +make -j + +#下载官方转换好的 FSANet 模型文件和测试图片 +wget https://bj.bcebos.com/paddlehub/fastdeploy/fsanet-var.onnx +wget https://bj.bcebos.com/paddlehub/fastdeploy/headpose_input.png +# CPU推理 +./infer_demo --model fsanet-var.onnx --image headpose_input.png --device cpu +# GPU推理 +./infer_demo --model fsanet-var.onnx --image headpose_input.png --device gpu +# GPU上TensorRT推理 +./infer_demo --model fsanet-var.onnx --image headpose_input.png --device gpu --backend trt +``` + +运行完成可视化结果如下图所示 + +
+ +
+ +以上命令只适用于Linux或MacOS, Windows下SDK的使用方式请参考: +- [如何在Windows中使用FastDeploy C++ SDK](../../../../../docs/cn/faq/use_sdk_on_windows.md) + +## FSANet C++接口 + +### FSANet 类 + +```c++ +fastdeploy::vision::headpose::FSANet( + const string& model_file, + const string& params_file = "", + const RuntimeOption& runtime_option = RuntimeOption(), + const ModelFormat& model_format = ModelFormat::ONNX) +``` +FSANet模型加载和初始化,其中model_file为导出的ONNX模型格式。 +**参数** +> * **model_file**(str): 模型文件路径 +> * **params_file**(str): 参数文件路径,当模型格式为ONNX时,此参数传入空字符串即可 +> * **runtime_option**(RuntimeOption): 后端推理配置,默认为None,即采用默认配置 +> * **model_format**(ModelFormat): 模型格式,默认为ONNX格式 +#### Predict函数 +> ```c++ +> FSANet::Predict(cv::Mat* im, HeadPoseResult* result) +> ``` +> +> 模型预测接口,输入图像直接输出头部姿态预测结果。 +> +> **参数** +> +> > * **im**: 输入图像,注意需为HWC,BGR格式 +> > * **result**: 头部姿态预测结果, HeadPoseResult说明参考[视觉模型预测结果](../../../../../docs/api/vision_results/) +### 类成员变量 +用户可按照自己的实际需求,修改下列预处理参数,从而影响最终的推理和部署效果 +> > * **size**(vector<int>): 通过此参数修改预处理过程中resize的大小,包含两个整型元素,表示[width, height], 默认值为[112, 112] +- [模型介绍](../../) +- [Python部署](../python) +- [视觉模型预测结果](../../../../../docs/api/vision_results/) +- [如何切换模型推理后端引擎](../../../../../docs/cn/faq/how_to_change_backend.md) diff --git a/examples/vision/headpose/fsanet/cpp/infer.cc b/examples/vision/headpose/fsanet/cpp/infer.cc new file mode 100644 index 0000000000..332f492606 --- /dev/null +++ b/examples/vision/headpose/fsanet/cpp/infer.cc @@ -0,0 +1,110 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#include "fastdeploy/vision.h"
+#include "gflags/gflags.h"
+
+DEFINE_string(model, "", "Path of the inference model.");
+DEFINE_string(image, "", "Path of the image file.");
+DEFINE_string(device, "cpu",
+              "Type of inference device, support 'cpu' or 'gpu'.");
+DEFINE_string(backend, "default",
+              "The inference runtime backend, support: ['default', 'ort', "
+              "'paddle', 'ov', 'trt', 'paddle_trt']");
+DEFINE_bool(use_fp16, false, "Whether to use FP16 mode, only support 'trt' and 'paddle_trt' backend");
+
+void PrintUsage() {
+  std::cout << "Usage: infer_demo --model model_path --image img_path --device [cpu|gpu] --backend "
+               "[default|ort|paddle|ov|trt|paddle_trt] "
+               "--use_fp16 false"
+            << std::endl;
+  std::cout << "Default value of device: cpu" << std::endl;
+  std::cout << "Default value of backend: default" << std::endl;
+  std::cout << "Default value of use_fp16: false" << std::endl;
+}
+
+bool CreateRuntimeOption(fastdeploy::RuntimeOption* option) {
+  if (FLAGS_device == "gpu") {
+    option->UseGpu();
+    if (FLAGS_backend == "ort") {
+      option->UseOrtBackend();
+    } else if (FLAGS_backend == "paddle") {
+      option->UsePaddleBackend();
+    } else if (FLAGS_backend == "trt" ||
+               FLAGS_backend == "paddle_trt") {
+      option->UseTrtBackend();
+      option->SetTrtInputShape("input", {1, 3, 64, 64});
+      if (FLAGS_backend == "paddle_trt") {
+        option->EnablePaddleToTrt();
+      }
+      if (FLAGS_use_fp16) {
+        option->EnableTrtFP16();
+      }
+    } else if (FLAGS_backend == "default") {
+      return true;
+    } else {
+      std::cout << "While inference with GPU, only support default/ort/paddle/trt/paddle_trt now, " << FLAGS_backend << " is not supported." << std::endl;
+      return false;
+    }
+  } else if (FLAGS_device == "cpu") {
+    if (FLAGS_backend == "ort") {
+      option->UseOrtBackend();
+    } else if (FLAGS_backend == "ov") {
+      option->UseOpenVINOBackend();
+    } else if (FLAGS_backend == "paddle") {
+      option->UsePaddleBackend();
+    } else if (FLAGS_backend == "default") {
+      return true;
+    } else {
+      std::cout << "While inference with CPU, only support default/ort/ov/paddle now, " << FLAGS_backend << " is not supported." << std::endl;
+      return false;
+    }
+  } else {
+    std::cerr << "Only support device CPU/GPU now, " << FLAGS_device << " is not supported." << std::endl;
+    return false;
+  }
+
+  return true;
+}
+
+int main(int argc, char* argv[]) {
+  google::ParseCommandLineFlags(&argc, &argv, true);
+  auto option = fastdeploy::RuntimeOption();
+  if (!CreateRuntimeOption(&option)) {
+    PrintUsage();
+    return -1;
+  }
+
+  auto model = fastdeploy::vision::headpose::FSANet(FLAGS_model, "", option);
+  if (!model.Initialized()) {
+    std::cerr << "Failed to initialize." << std::endl;
+    return -1;
+  }
+
+  auto im = cv::imread(FLAGS_image);
+  auto im_bak = im.clone();
+
+  fastdeploy::vision::HeadPoseResult res;
+  if (!model.Predict(&im, &res)) {
+    std::cerr << "Failed to predict." << std::endl;
+    return -1;
+  }
+  std::cout << res.Str() << std::endl;
+
+  auto vis_im = fastdeploy::vision::VisHeadPose(im_bak, res);
+  cv::imwrite("vis_result.jpg", vis_im);
+  std::cout << "Visualized result saved in ./vis_result.jpg" << std::endl;
+
+  return 0;
+}
diff --git a/examples/vision/headpose/fsanet/python/README.md b/examples/vision/headpose/fsanet/python/README.md
new file mode 100644
index 0000000000..7863fb1f11
--- /dev/null
+++ b/examples/vision/headpose/fsanet/python/README.md
@@ -0,0 +1,67 @@
+# FSANet Python部署示例
+
+在部署前,需确认以下两个步骤
+
+- 1. 软硬件环境满足要求,参考[FastDeploy环境要求](../../../../../docs/cn/build_and_install/download_prebuilt_libraries.md)
+- 2.
FastDeploy Python whl包安装,参考[FastDeploy Python安装](../../../../../docs/cn/build_and_install/download_prebuilt_libraries.md) + +本目录下提供`infer.py`快速完成FSANet在CPU/GPU,以及GPU上通过TensorRT加速部署的示例,保证 FastDeploy 版本 >= 0.6.0 支持FSANet模型。执行如下脚本即可完成 + +```bash +#下载部署示例代码 +git clone https://github.com/PaddlePaddle/FastDeploy.git +cd FastDeploy/examples/vision/headpose/fsanet/python + +# 下载FSANet模型文件和测试图片 +## 原版ONNX模型 +wget https://bj.bcebos.com/paddlehub/fastdeploy/fsanet-var.onnx +wget https://bj.bcebos.com/paddlehub/fastdeploy/headpose_input.png +# CPU推理 +python infer.py --model fsanet-var.onnx --image headpose_input.png --device cpu +# GPU推理 +python infer.py --model fsanet-var.onnx --image headpose_input.png --device gpu +# TRT推理 +python infer.py --model fsanet-var.onnx --image headpose_input.png --device gpu --backend trt +``` + +运行完成可视化结果如下图所示 + +
+ +
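+作为参考,下面给出一段最小的 Python 调用示意(假设 fsanet-var.onnx 与
+headpose_input.png 已按上文命令下载到当前目录,默认使用 CPU 后端):
+
+```python
+import cv2
+import fastdeploy as fd
+
+# 加载 FSANet 头部姿态模型(ONNX 格式)
+model = fd.vision.headpose.FSANet("fsanet-var.onnx")
+
+# 预测,返回 HeadPoseResult,euler_angles 依次为 (yaw, pitch, roll)
+im = cv2.imread("headpose_input.png")
+result = model.predict(im.copy())
+print(result)
+
+# 可视化并保存
+vis = fd.vision.vis_headpose(im, result)
+cv2.imwrite("vis_result.jpg", vis)
+```
+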
+ +## FSANet Python接口 + +```python +fd.vision.headpose.FSANet(model_file, params_file=None, runtime_option=None, model_format=ModelFormat.ONNX) +``` + +FSANet 模型加载和初始化,其中model_file为导出的ONNX模型格式 + +**参数** + +> * **model_file**(str): 模型文件路径 +> * **params_file**(str): 参数文件路径,当模型格式为ONNX格式时,此参数无需设定 +> * **runtime_option**(RuntimeOption): 后端推理配置,默认为None,即采用默认配置 +> * **model_format**(ModelFormat): 模型格式,默认为ONNX +### predict函数 + +> ```python +> FSANet.predict(input_image) +> ``` +> +> 模型预测结口,输入图像直接输出头部姿态预测结果。 +> +> **参数** +> +> > * **input_image**(np.ndarray): 输入数据,注意需为HWC,BGR格式 +> **返回** +> +> > 返回`fastdeploy.vision.HeadPoseResult`结构体,结构体说明参考文档[视觉模型预测结果](../../../../../docs/api/vision_results/) + +## 其它文档 + +- [FSANet 模型介绍](..) +- [FSANet C++部署](../cpp) +- [模型预测结果说明](../../../../../docs/api/vision_results/) +- [如何切换模型推理后端引擎](../../../../../docs/cn/faq/how_to_change_backend.md) diff --git a/examples/vision/headpose/fsanet/python/infer.py b/examples/vision/headpose/fsanet/python/infer.py new file mode 100644 index 0000000000..36a74bc091 --- /dev/null +++ b/examples/vision/headpose/fsanet/python/infer.py @@ -0,0 +1,88 @@ +import fastdeploy as fd +import cv2 +import os + + +def parse_arguments(): + import argparse + import ast + parser = argparse.ArgumentParser() + parser.add_argument("--model", required=True, help="Path of FSANet model.") + parser.add_argument("--image", type=str, help="Path of test image file.") + parser.add_argument( + "--device", + type=str, + default='cpu', + help="Type of inference device, support 'cpu' or 'gpu'.") + parser.add_argument( + "--backend", + type=str, + default="default", + help="inference backend, default, ort, ov, trt, paddle, paddle_trt.") + parser.add_argument( + "--enable_trt_fp16", + type=ast.literal_eval, + default=False, + help="whether enable fp16 in trt/paddle_trt backend") + return parser.parse_args() + + +def build_option(args): + option = fd.RuntimeOption() + device = args.device + backend = args.backend + enable_trt_fp16 = args.enable_trt_fp16 + if device == "gpu": + option.use_gpu() + if backend == "ort": + option.use_ort_backend() + elif backend == "paddle": + option.use_paddle_backend() + elif backend in ["trt", "paddle_trt"]: + option.use_trt_backend() + option.set_trt_input_shape("input", [1, 3, 64, 64]) + if backend == "paddle_trt": + option.enable_paddle_to_trt() + if enable_trt_fp16: + option.enable_trt_fp16() + elif backend == "default": + return option + else: + raise Exception( + "While inference with GPU, only support default/ort/paddle/trt/paddle_trt now, {} is not supported.". + format(backend)) + elif device == "cpu": + if backend == "ort": + option.use_ort_backend() + elif backend == "ov": + option.use_openvino_backend() + elif backend == "paddle": + option.use_paddle_backend() + elif backend == "default": + return option + else: + raise Exception( + "While inference with CPU, only support default/ort/ov/paddle now, {} is not supported.". 
+ format(backend)) + else: + raise Exception( + "Only support device CPU/GPU now, {} is not supported.".format( + device)) + + return option + + +args = parse_arguments() + +# 配置runtime,加载模型 +runtime_option = build_option(args) +model = fd.vision.headpose.FSANet(args.model, runtime_option=runtime_option) + +# for image +im = cv2.imread(args.image) +result = model.predict(im.copy()) +print(result) +# 可视化结果 +vis_im = fd.vision.vis_headpose(im, result) +cv2.imwrite("visualized_result.jpg", vis_im) +print("Visualized result save in ./visualized_result.jpg") diff --git a/fastdeploy/vision.h b/fastdeploy/vision.h index c3f99a6ca6..d9ceb5dda1 100755 --- a/fastdeploy/vision.h +++ b/fastdeploy/vision.h @@ -51,6 +51,7 @@ #include "fastdeploy/vision/ocr/ppocr/recognizer.h" #include "fastdeploy/vision/segmentation/ppseg/model.h" #include "fastdeploy/vision/tracking/pptracking/model.h" +#include "fastdeploy/vision/headpose/contrib/fsanet.h" #endif #include "fastdeploy/vision/visualize/visualize.h" diff --git a/fastdeploy/vision/common/result.cc b/fastdeploy/vision/common/result.cc old mode 100644 new mode 100755 index ea5cdd1496..760acb51d7 --- a/fastdeploy/vision/common/result.cc +++ b/fastdeploy/vision/common/result.cc @@ -485,5 +485,28 @@ std::string OCRResult::Str() { return no_result; } +void HeadPoseResult::Clear() { + std::vector().swap(euler_angles); +} + +void HeadPoseResult::Reserve(int size) { + euler_angles.resize(size); +} + +void HeadPoseResult::Resize(int size) { + euler_angles.resize(size); +} + +std::string HeadPoseResult::Str() { + std::string out; + + out = "HeadPoseResult: [yaw, pitch, roll]\n"; + out = out + "yaw: " + std::to_string(euler_angles[0]) + "\n" + + "pitch: " + std::to_string(euler_angles[1]) + "\n" + + "roll: " + std::to_string(euler_angles[2]) + "\n"; + return out; +} + + } // namespace vision } // namespace fastdeploy diff --git a/fastdeploy/vision/common/result.h b/fastdeploy/vision/common/result.h index 1acca31409..771bd62b19 100755 --- a/fastdeploy/vision/common/result.h +++ b/fastdeploy/vision/common/result.h @@ -33,7 +33,8 @@ enum FASTDEPLOY_DECL ResultType { FACE_RECOGNITION, MATTING, MASK, - KEYPOINT_DETECTION + KEYPOINT_DETECTION, + HEADPOSE, }; struct FASTDEPLOY_DECL BaseResult { @@ -316,6 +317,25 @@ struct FASTDEPLOY_DECL MattingResult : public BaseResult { std::string Str(); }; +/*! @brief HeadPose result structure for all the headpose models + */ +struct FASTDEPLOY_DECL HeadPoseResult : public BaseResult { + /** \brief EulerAngles for an input image, and the element of `euler_angles` is a vector, contains {yaw, pitch, roll} + */ + std::vector euler_angles; + + ResultType type = ResultType::HEADPOSE; + /// Clear headpose result + void Clear(); + + void Reserve(int size); + + void Resize(int size); + + /// Debug function, convert the result to string to print + std::string Str(); +}; + } // namespace vision } // namespace fastdeploy diff --git a/fastdeploy/vision/headpose/contrib/fsanet.cc b/fastdeploy/vision/headpose/contrib/fsanet.cc new file mode 100644 index 0000000000..59f25ac5a3 --- /dev/null +++ b/fastdeploy/vision/headpose/contrib/fsanet.cc @@ -0,0 +1,132 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "fastdeploy/vision/headpose/contrib/fsanet.h" +#include "fastdeploy/utils/perf.h" +#include "fastdeploy/vision/utils/utils.h" + +namespace fastdeploy { + +namespace vision { + +namespace headpose { + +FSANet::FSANet(const std::string& model_file, + const std::string& params_file, + const RuntimeOption& custom_option, + const ModelFormat& model_format) { + if (model_format == ModelFormat::ONNX) { + valid_cpu_backends = {Backend::OPENVINO, Backend::ORT}; + valid_gpu_backends = {Backend::ORT, Backend::TRT}; + } else { + valid_cpu_backends = {Backend::PDINFER, Backend::ORT}; + valid_gpu_backends = {Backend::PDINFER, Backend::ORT, Backend::TRT}; + } + runtime_option = custom_option; + runtime_option.model_format = model_format; + runtime_option.model_file = model_file; + runtime_option.params_file = params_file; + initialized = Initialize(); +} + +bool FSANet::Initialize() { + // parameters for preprocess + size = {64, 64}; + + if (!InitRuntime()) { + FDERROR << "Failed to initialize fastdeploy backend." << std::endl; + return false; + } + return true; +} + +bool FSANet::Preprocess(Mat* mat, FDTensor* output, + std::map>* im_info) { + // Resize + int resize_w = size[0]; + int resize_h = size[1]; + if (resize_h != mat->Height() || resize_w != mat->Width()) { + Resize::Run(mat, resize_w, resize_h); + } + + // Normalize + std::vector alpha = {1.0f / 128.0f, 1.0f / 128.0f, 1.0f / 128.0f}; + std::vector beta = {-127.5f / 128.0f, -127.5f / 128.0f, -127.5f / 128.0f}; + Convert::Run(mat, alpha, beta); + + // Record output shape of preprocessed image + (*im_info)["output_shape"] = {mat->Height(), mat->Width()}; + + HWC2CHW::Run(mat); + Cast::Run(mat, "float"); + + mat->ShareWithTensor(output); + output->shape.insert(output->shape.begin(), 1); // reshape to n, h, w, c + return true; +} + +bool FSANet::Postprocess(FDTensor& infer_result, HeadPoseResult* result, + const std::map>& im_info) { + FDASSERT(infer_result.shape[0] == 1, "Only support batch = 1 now."); + if (infer_result.dtype != FDDataType::FP32) { + FDERROR << "Only support post process with float32 data." << std::endl; + return false; + } + + auto iter_in = im_info.find("input_shape"); + FDASSERT(iter_in != im_info.end(), + "Cannot find input_shape from im_info."); + int in_h = iter_in->second[0]; + int in_w = iter_in->second[1]; + + result->Clear(); + float* data = static_cast(infer_result.Data()); + for (size_t i = 0; i < 3; ++i) { + result->euler_angles.emplace_back(data[i]); + } + + return true; +} + +bool FSANet::Predict(cv::Mat* im, HeadPoseResult* result) { + Mat mat(*im); + std::vector input_tensors(1); + + std::map> im_info; + + // Record the shape of image and the shape of preprocessed image + im_info["input_shape"] = {mat.Height(), mat.Width()}; + im_info["output_shape"] = {mat.Height(), mat.Width()}; + + if (!Preprocess(&mat, &input_tensors[0], &im_info)) { + FDERROR << "Failed to preprocess input image." << std::endl; + return false; + } + input_tensors[0].name = InputInfoOfRuntime(0).name; + std::vector output_tensors; + if (!Infer(input_tensors, &output_tensors)) { + FDERROR << "Failed to inference." 
diff --git a/fastdeploy/vision/headpose/contrib/fsanet.h b/fastdeploy/vision/headpose/contrib/fsanet.h
new file mode 100644
index 0000000000..8e0ce34622
--- /dev/null
+++ b/fastdeploy/vision/headpose/contrib/fsanet.h
@@ -0,0 +1,64 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "fastdeploy/fastdeploy_model.h"
+#include "fastdeploy/vision/common/processors/transform.h"
+#include "fastdeploy/vision/common/result.h"
+
+namespace fastdeploy {
+
+namespace vision {
+
+namespace headpose {
+/*! @brief FSANet model object used to load a head pose model exported by the FSANet repository.
+ */
+class FASTDEPLOY_DECL FSANet : public FastDeployModel {
+ public:
+  /** \brief Set path of model file and the configuration of runtime.
+   *
+   * \param[in] model_file Path of model file, e.g. ./fsanet-var.onnx
+   * \param[in] params_file Path of parameter file, e.g. fsanet/model.pdiparams; if the model format is ONNX, this parameter will be ignored
+   * \param[in] custom_option RuntimeOption for inference, the default will use cpu, and choose the backend defined in "valid_cpu_backends"
+   * \param[in] model_format Model format of the loaded model, default is ONNX format
+   */
+  FSANet(const std::string& model_file, const std::string& params_file = "",
+         const RuntimeOption& custom_option = RuntimeOption(),
+         const ModelFormat& model_format = ModelFormat::ONNX);
+
+  std::string ModelName() const { return "FSANet"; }
+  /** \brief Predict the head pose result for an input image
+   *
+   * \param[in] im The input image data, comes from cv::imread(), is a 3-D array with layout HWC, BGR format
+   * \param[in] result The output head pose result will be written to this structure
+   * \return true if the prediction succeeded, otherwise false
+   */
+  virtual bool Predict(cv::Mat* im, HeadPoseResult* result);
+
+  /// tuple of (width, height), default (64, 64)
+  std::vector<int> size;
+
+ private:
+  bool Initialize();
+
+  bool Preprocess(Mat* mat, FDTensor* outputs,
+                  std::map<std::string, std::array<int, 2>>* im_info);
+
+  bool Postprocess(FDTensor& infer_result, HeadPoseResult* result,
+                   const std::map<std::string, std::array<int, 2>>& im_info);
+};
+
+}  // namespace headpose
+}  // namespace vision
+}  // namespace fastdeploy
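Given the header above, a minimal C++ caller looks like the sketch below (mine, not part of the patch; the file paths are placeholders, with fsanet-var.onnx taken from the doc comment above):

    #include <cstdio>
    #include "fastdeploy/vision.h"

    int main() {
      // model_format defaults to ONNX, so only the model file is needed.
      auto model = fastdeploy::vision::headpose::FSANet("fsanet-var.onnx");
      if (!model.Initialized()) return -1;
      cv::Mat im = cv::imread("face_crop.jpg");  // a cropped face image
      fastdeploy::vision::HeadPoseResult res;
      if (!model.Predict(&im, &res)) return -1;
      // euler_angles holds three Euler angles in degrees, each in [-90, 90].
      std::printf("angles: %.2f %.2f %.2f\n", res.euler_angles[0],
                  res.euler_angles[1], res.euler_angles[2]);
      return 0;
    }
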
diff --git a/fastdeploy/vision/headpose/contrib/fsanet_pybind.cc b/fastdeploy/vision/headpose/contrib/fsanet_pybind.cc
new file mode 100644
index 0000000000..89a313ac6c
--- /dev/null
+++ b/fastdeploy/vision/headpose/contrib/fsanet_pybind.cc
@@ -0,0 +1,31 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "fastdeploy/pybind/main.h"
+
+namespace fastdeploy {
+void BindFSANet(pybind11::module& m) {
+  pybind11::class_<vision::headpose::FSANet, FastDeployModel>(m, "FSANet")
+      .def(pybind11::init<std::string, std::string, RuntimeOption,
+                          ModelFormat>())
+      .def("predict",
+           [](vision::headpose::FSANet& self, pybind11::array& data) {
+             auto mat = PyArrayToCvMat(data);
+             vision::HeadPoseResult res;
+             self.Predict(&mat, &res);
+             return res;
+           })
+      .def_readwrite("size", &vision::headpose::FSANet::size);
+}
+}  // namespace fastdeploy
diff --git a/fastdeploy/vision/headpose/headpose_pybind.cc b/fastdeploy/vision/headpose/headpose_pybind.cc
new file mode 100644
index 0000000000..4992fee285
--- /dev/null
+++ b/fastdeploy/vision/headpose/headpose_pybind.cc
@@ -0,0 +1,25 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "fastdeploy/pybind/main.h"
+
+namespace fastdeploy {
+
+void BindFSANet(pybind11::module& m);
+
+void BindHeadPose(pybind11::module& m) {
+  auto headpose_module = m.def_submodule("headpose", "Headpose models.");
+  BindFSANet(headpose_module);
+}
+}  // namespace fastdeploy
diff --git a/fastdeploy/vision/tracking/pptracking/trajectory.h b/fastdeploy/vision/tracking/pptracking/trajectory.h
old mode 100644
new mode 100755
diff --git a/fastdeploy/vision/vision_pybind.cc b/fastdeploy/vision/vision_pybind.cc
old mode 100644
new mode 100755
index c6d4494f5d..a1fc6cac68
--- a/fastdeploy/vision/vision_pybind.cc
+++ b/fastdeploy/vision/vision_pybind.cc
@@ -26,6 +26,7 @@ void BindFaceId(pybind11::module& m);
 void BindOcr(pybind11::module& m);
 void BindTracking(pybind11::module& m);
 void BindKeyPointDetection(pybind11::module& m);
+void BindHeadPose(pybind11::module& m);
 #ifdef ENABLE_VISION_VISUALIZE
 void BindVisualize(pybind11::module& m);
 #endif
@@ -113,8 +114,7 @@ void BindVision(pybind11::module& m) {
       .def("__repr__", &vision::MattingResult::Str)
       .def("__str__", &vision::MattingResult::Str);
 
-  pybind11::class_<vision::KeyPointDetectionResult>(m,
-                                                    "KeyPointDetectionResult")
+  pybind11::class_<vision::KeyPointDetectionResult>(m, "KeyPointDetectionResult")
      .def(pybind11::init())
      .def_readwrite("keypoints", &vision::KeyPointDetectionResult::keypoints)
      .def_readwrite("scores", &vision::KeyPointDetectionResult::scores)
@@ -122,6 +122,12 @@ void BindVision(pybind11::module& m) {
      .def("__repr__", &vision::KeyPointDetectionResult::Str)
      .def("__str__", &vision::KeyPointDetectionResult::Str);
 
+  pybind11::class_<vision::HeadPoseResult>(m, "HeadPoseResult")
+      .def(pybind11::init())
+      .def_readwrite("euler_angles", &vision::HeadPoseResult::euler_angles)
+      .def("__repr__", &vision::HeadPoseResult::Str)
+      .def("__str__", &vision::HeadPoseResult::Str);
+
   m.def("enable_flycv", &vision::EnableFlyCV,
         "Enable image preprocessing by FlyCV.");
   m.def("disable_flycv", &vision::DisableFlyCV,
         "Disable image preprocessing by FlyCV, change to use OpenCV.");
@@ -135,6 +141,7 @@ void BindVision(pybind11::module& m) {
   BindOcr(m);
   BindTracking(m);
   BindKeyPointDetection(m);
+  BindHeadPose(m);
 #ifdef ENABLE_VISION_VISUALIZE
   BindVisualize(m);
 #endif
diff --git a/fastdeploy/vision/visualize/headpose.cc b/fastdeploy/vision/visualize/headpose.cc
new file mode 100644
index 0000000000..389d111361
--- /dev/null
+++ b/fastdeploy/vision/visualize/headpose.cc
@@ -0,0 +1,59 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifdef ENABLE_VISION_VISUALIZE
+
+#include "fastdeploy/vision/visualize/visualize.h"
+#include "opencv2/imgproc/imgproc.hpp"
+
+namespace fastdeploy {
+
+namespace vision {
+
+cv::Mat VisHeadPose(const cv::Mat& im, const HeadPoseResult& result,
+                    int size, int line_size) {
+  const float PI = 3.1415926535;
+  auto vis_im = im.clone();
+  int h = im.rows;
+  int w = im.cols;
+  // vis headpose
+  float pitch = result.euler_angles[0] * PI / 180.f;
+  float yaw = -result.euler_angles[1] * PI / 180.f;
+  float roll = result.euler_angles[2] * PI / 180.f;
+
+  int tdx = w / 2;
+  int tdy = h / 2;
+
+  // X-Axis | drawn in red
+  int x1 = static_cast<int>(size * std::cos(yaw) * std::cos(roll)) + tdx;
+  int y1 = static_cast<int>(size * (std::cos(pitch) * std::sin(roll) +
+           std::cos(roll) * std::sin(pitch) * std::sin(yaw))) + tdy;
+  // Y-Axis | drawn in green
+  int x2 = static_cast<int>(-size * std::cos(yaw) * std::sin(roll)) + tdx;
+  int y2 = static_cast<int>(size * (std::cos(pitch) * std::cos(roll) -
+           std::sin(pitch) * std::sin(yaw) * std::sin(roll))) + tdy;
+  // Z-Axis | drawn in blue
+  int x3 = static_cast<int>(size * std::sin(yaw)) + tdx;
+  int y3 = static_cast<int>(-size * std::cos(yaw) * std::sin(pitch)) + tdy;
+
+  cv::line(vis_im, cv::Point2i(tdx, tdy), cv::Point2i(x1, y1), cv::Scalar(0, 0, 255), line_size);
+  cv::line(vis_im, cv::Point2i(tdx, tdy), cv::Point2i(x2, y2), cv::Scalar(0, 255, 0), line_size);
+  cv::line(vis_im, cv::Point2i(tdx, tdy), cv::Point2i(x3, y3), cv::Scalar(255, 0, 0), line_size);
+  return vis_im;
+}
+
+}  // namespace vision
+}  // namespace fastdeploy
+
+#endif
\ No newline at end of file
diff --git a/fastdeploy/vision/visualize/visualize.h b/fastdeploy/vision/visualize/visualize.h
old mode 100644
new mode 100755
index d874409d0c..ea543553fa
--- a/fastdeploy/vision/visualize/visualize.h
+++ b/fastdeploy/vision/visualize/visualize.h
@@ -94,8 +94,12 @@ FASTDEPLOY_DECL cv::Mat SwapBackground(const cv::Mat& im,
                                        const SegmentationResult& result,
                                        int background_label);
 FASTDEPLOY_DECL cv::Mat VisKeypointDetection(const cv::Mat& im,
-                                         const KeyPointDetectionResult& results,
-                                         float conf_threshold = 0.5f);
+                                             const KeyPointDetectionResult& results,
+                                             float conf_threshold = 0.5f);
+FASTDEPLOY_DECL cv::Mat VisHeadPose(const cv::Mat& im,
+                                    const HeadPoseResult& result,
+                                    int size = 50,
+                                    int line_size = 1);
 }  // namespace vision
 }  // namespace fastdeploy
diff --git a/fastdeploy/vision/visualize/visualize_pybind.cc b/fastdeploy/vision/visualize/visualize_pybind.cc
old mode 100644
new mode 100755
index 7633579cc8..739fa7e809
--- a/fastdeploy/vision/visualize/visualize_pybind.cc
+++ b/fastdeploy/vision/visualize/visualize_pybind.cc
@@ -102,6 +102,16 @@ void BindVisualize(pybind11::module& m) {
             FDTensor out;
             vision::Mat(vis_im).ShareWithTensor(&out);
             return TensorToPyArray(out);
+           })
+      .def("vis_headpose",
+           [](pybind11::array& im_data, vision::HeadPoseResult& result,
+              int size, int line_size) {
+             auto im = PyArrayToCvMat(im_data);
+             auto vis_im =
+                 vision::VisHeadPose(im, result, size, line_size);
+             FDTensor out;
+             vision::Mat(vis_im).ShareWithTensor(&out);
+             return TensorToPyArray(out);
            });
 
   pybind11::class_<vision::Visualize>(m, "Visualize")
diff --git a/python/fastdeploy/vision/__init__.py b/python/fastdeploy/vision/__init__.py
old mode 100644
new mode 100755
index d36d0c9af7..047591ccde
--- a/python/fastdeploy/vision/__init__.py
+++ b/python/fastdeploy/vision/__init__.py
@@ -23,6 +23,7 @@ from . import facedet
 from . import facealign
 from . import faceid
 from . import ocr
+from . import headpose
 from . import evaluation
 from .utils import fd_result_to_json
 from .visualize import *
diff --git a/python/fastdeploy/vision/headpose/__init__.py b/python/fastdeploy/vision/headpose/__init__.py
new file mode 100644
index 0000000000..6d90313ef9
--- /dev/null
+++ b/python/fastdeploy/vision/headpose/__init__.py
@@ -0,0 +1,16 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from .contrib.fsanet import FSANet
diff --git a/python/fastdeploy/vision/headpose/contrib/__init__.py b/python/fastdeploy/vision/headpose/contrib/__init__.py
new file mode 100644
index 0000000000..8034e10bfc
--- /dev/null
+++ b/python/fastdeploy/vision/headpose/contrib/__init__.py
@@ -0,0 +1,15 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
diff --git a/python/fastdeploy/vision/headpose/contrib/fsanet.py b/python/fastdeploy/vision/headpose/contrib/fsanet.py
new file mode 100644
index 0000000000..f179293a99
--- /dev/null
+++ b/python/fastdeploy/vision/headpose/contrib/fsanet.py
@@ -0,0 +1,68 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+import logging
+from .... import FastDeployModel, ModelFormat
+from .... import c_lib_wrap as C
+
+
+class FSANet(FastDeployModel):
+    def __init__(self,
+                 model_file,
+                 params_file="",
+                 runtime_option=None,
+                 model_format=ModelFormat.ONNX):
+        """Load a head pose model exported by FSANet.
+
+        :param model_file: (str)Path of model file, e.g. fsanet/fsanet-var.onnx
+        :param params_file: (str)Path of parameters file, if the model_format is ModelFormat.ONNX, this param will be ignored, can be set as empty string
+        :param runtime_option: (fastdeploy.RuntimeOption)RuntimeOption for inference this model, if it's None, will use the default backend on CPU
+        :param model_format: (fastdeploy.ModelFormat)Model format of the loaded model, default is ONNX
+        """
+
+        super(FSANet, self).__init__(runtime_option)
+
+        assert model_format == ModelFormat.ONNX, "FSANet only supports model format of ModelFormat.ONNX now."
+        self._model = C.vision.headpose.FSANet(
+            model_file, params_file, self._runtime_option, model_format)
+        assert self.initialized, "FSANet initialize failed."
+
+    def predict(self, input_image):
+        """Predict the head pose of an input image
+
+        :param input_image: (numpy.ndarray)The input image data, 3-D array with layout HWC, BGR format
+        :return: HeadPoseResult
+        """
+
+        return self._model.predict(input_image)
+
+    @property
+    def size(self):
+        """
+        Returns the preprocess image size, default (64, 64)
+        """
+        return self._model.size
+
+    @size.setter
+    def size(self, wh):
+        """
+        Set the preprocess image size, default (64, 64)
+        """
+        assert isinstance(wh, (list, tuple)),\
+            "The value to set `size` must be of type tuple or list."
+        assert len(wh) == 2,\
+            "The value to set `size` must contain 2 elements, meaning [width, height], but now it contains {} elements.".format(
+                len(wh))
+        self._model.size = wh
diff --git a/python/fastdeploy/vision/visualize/__init__.py b/python/fastdeploy/vision/visualize/__init__.py
index ddbd8758e7..a7f7c69cf2 100755
--- a/python/fastdeploy/vision/visualize/__init__.py
+++ b/python/fastdeploy/vision/visualize/__init__.py
@@ -109,3 +109,7 @@ def vis_ppocr(im_data, det_result):
 
 def vis_mot(im_data, mot_result, score_threshold=0.0, records=None):
     return C.vision.vis_mot(im_data, mot_result, score_threshold, records)
+
+
+def vis_headpose(im_data, headpose_result, size=50, line_size=1):
+    return C.vision.vis_headpose(im_data, headpose_result, size, line_size)
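The axis-endpoint formulas in VisHeadPose above (fastdeploy/vision/visualize/headpose.cc) are the image-plane projection of the rotation R = Rx(pitch) * Ry(yaw) * Rz(roll): each drawn endpoint is size times one column of R, offset by the face center (tdx, tdy). A standalone sketch of that reading (mine, not FastDeploy code):

    #include <cmath>
    #include <cstdio>

    int main() {
      const float pitch = 0.2f, yaw = -0.3f, roll = 0.1f;  // radians
      const float cp = std::cos(pitch), sp = std::sin(pitch);
      const float cy = std::cos(yaw), sy = std::sin(yaw);
      const float cr = std::cos(roll), sr = std::sin(roll);
      // R = Rx(pitch) * Ry(yaw) * Rz(roll), written out entry by entry.
      const float R[3][3] = {
          {cy * cr, -cy * sr, sy},
          {cp * sr + sp * sy * cr, cp * cr - sp * sy * sr, -sp * cy},
          {sp * sr - cp * sy * cr, sp * cr + cp * sy * sr, cp * cy}};
      // Rows 0 and 1 of column k give the projected axis k, matching
      // (x1, y1), (x2, y2), (x3, y3) in VisHeadPose.
      for (int k = 0; k < 3; ++k) {
        std::printf("axis %d -> (%+.4f, %+.4f)\n", k, R[0][k], R[1][k]);
      }
      return 0;
    }
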
From 9fa612c24b5bdffd4874d69fc3ae9e205439a6b5 Mon Sep 17 00:00:00 2001
From: Jason
Date: Fri, 4 Nov 2022 11:46:29 +0800
Subject: [PATCH 14/18] [Backend] Enable TensorRT BatchedNMSDynamic_TRT plugin
 (#449)

* Enable TensorRT EfficientNMS plugin

* remove some temporary code

* Update trt_backend.cc

* Update utils.h
---
 CMakeLists.txt                               | 18 +++++----
 cmake/paddle2onnx.cmake                      |  3 +-
 fastdeploy/backends/ort/ort_backend.cc       | 19 ++++------
 fastdeploy/backends/tensorrt/trt_backend.cc  | 32 +---------------
 fastdeploy/backends/tensorrt/trt_backend.h   |  4 --
 fastdeploy/runtime.cc                        |  8 ----
 fastdeploy/runtime.h                         |  6 ---
 fastdeploy/vision/detection/ppdet/ppyoloe.cc | 40 +++++++-------------
 8 files changed, 36 insertions(+), 94 deletions(-)
 mode change 100755 => 100644 fastdeploy/runtime.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8c5e7522df..2dd5a150a2 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -50,7 +50,6 @@ if(ANDROID)
 endif()
 
 ############################# Basic Options for FastDeploy ################################
-option(ENABLE_PADDLE_FRONTEND "Whether to enable PaddlePaddle frontend to support load paddle model in fastdeploy." ON)
 option(WITH_GPU "Whether WITH_GPU=ON, will enable onnxruntime-gpu/paddle-infernce-gpu/poros-gpu" OFF)
 option(WITH_IPU "Whether WITH_IPU=ON, will enable paddle-infernce-ipu" OFF)
 option(ENABLE_ORT_BACKEND "Whether to enable onnxruntime backend." OFF)
@@ -190,13 +189,8 @@ if(WITH_SW)
   add_definitions(-DEIGEN_AVOID_THREAD_LOCAL)
 endif()
 
-if(ENABLE_PADDLE_FRONTEND)
-  add_definitions(-DENABLE_PADDLE_FRONTEND)
-  include(${PROJECT_SOURCE_DIR}/cmake/paddle2onnx.cmake)
-  list(APPEND DEPEND_LIBS external_paddle2onnx)
-endif(ENABLE_PADDLE_FRONTEND)
-
 if(ENABLE_ORT_BACKEND)
+  set(ENABLE_PADDLE_FRONTEND ON)
   add_definitions(-DENABLE_ORT_BACKEND)
   list(APPEND ALL_DEPLOY_SRCS ${DEPLOY_ORT_SRCS})
   include(${PROJECT_SOURCE_DIR}/cmake/onnxruntime.cmake)
@@ -224,6 +218,7 @@ if(ENABLE_PADDLE_BACKEND)
 endif()
 
 if(ENABLE_OPENVINO_BACKEND)
+  set(ENABLE_PADDLE_FRONTEND ON)
   add_definitions(-DENABLE_OPENVINO_BACKEND)
   list(APPEND ALL_DEPLOY_SRCS ${DEPLOY_OPENVINO_SRCS})
   include(${PROJECT_SOURCE_DIR}/cmake/openvino.cmake)
@@ -329,6 +324,7 @@ if(WITH_IPU)
 endif()
 
 if(ENABLE_TRT_BACKEND)
+  set(ENABLE_PADDLE_FRONTEND ON)
   if(APPLE OR ANDROID OR IOS)
     message(FATAL_ERROR "Cannot enable tensorrt backend in mac/ios/android os, please set -DENABLE_TRT_BACKEND=OFF.")
   endif()
@@ -382,7 +378,6 @@ endif()
 
 if(ENABLE_VISION)
   add_definitions(-DENABLE_VISION)
-#  set(ENABLE_VISION_VISUALIZE ON)
   add_definitions(-DENABLE_VISION_VISUALIZE)
   if(ENABLE_OPENCV_CUDA)
     if(NOT WITH_GPU)
@@ -424,6 +419,13 @@ if(ENABLE_TEXT)
   include(${PROJECT_SOURCE_DIR}/cmake/faster_tokenizer.cmake)
 endif()
 
+if(ENABLE_PADDLE_FRONTEND)
+  add_definitions(-DENABLE_PADDLE_FRONTEND)
+  include(${PROJECT_SOURCE_DIR}/cmake/paddle2onnx.cmake)
+  list(APPEND DEPEND_LIBS external_paddle2onnx)
+endif(ENABLE_PADDLE_FRONTEND)
+
+
 configure_file(${PROJECT_SOURCE_DIR}/FastDeploy.cmake.in ${PROJECT_SOURCE_DIR}/FastDeploy.cmake @ONLY)
 configure_file(${PROJECT_SOURCE_DIR}/python/fastdeploy/c_lib_wrap.py.in ${PROJECT_SOURCE_DIR}/python/fastdeploy/c_lib_wrap.py)
 configure_file(${PROJECT_SOURCE_DIR}/python/scripts/process_libraries.py.in ${PROJECT_SOURCE_DIR}/python/scripts/process_libraries.py)
diff --git a/cmake/paddle2onnx.cmake b/cmake/paddle2onnx.cmake
index de52b6abca..02bcabc716 100755
--- a/cmake/paddle2onnx.cmake
+++ b/cmake/paddle2onnx.cmake
@@ -43,13 +43,14 @@ else()
 endif(WIN32)
 
 set(PADDLE2ONNX_URL_BASE "https://bj.bcebos.com/fastdeploy/third_libs/")
-set(PADDLE2ONNX_VERSION "1.0.1")
+set(PADDLE2ONNX_VERSION "1.0.2rc")
 if(WIN32)
   set(PADDLE2ONNX_FILE "paddle2onnx-win-x64-${PADDLE2ONNX_VERSION}.zip")
   if(NOT CMAKE_CL_64)
     set(PADDLE2ONNX_FILE "paddle2onnx-win-x86-${PADDLE2ONNX_VERSION}.zip")
   endif()
 elseif(APPLE)
+  set(PADDLE2ONNX_VERSION "1.0.1")
   if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "arm64")
     set(PADDLE2ONNX_FILE "paddle2onnx-osx-arm64-${PADDLE2ONNX_VERSION}.tgz")
   else()
diff --git a/fastdeploy/backends/ort/ort_backend.cc b/fastdeploy/backends/ort/ort_backend.cc
index 3b89be6e50..254abac205 100644
--- a/fastdeploy/backends/ort/ort_backend.cc
+++ b/fastdeploy/backends/ort/ort_backend.cc
@@ -80,21 +80,18 @@ bool OrtBackend::InitFromPaddle(const std::string& model_file,
             << std::endl;
     return false;
   }
-#ifdef ENABLE_PADDLE_FRONTEND
   char* model_content_ptr;
   int model_content_size = 0;
-  std::vector<paddle2onnx::CustomOp> custom_ops;
-  for (auto& item : option.custom_op_info_) {
-    paddle2onnx::CustomOp op;
-    strcpy(op.op_name, item.first.c_str());
-    strcpy(op.export_op_name, item.second.c_str());
-    custom_ops.emplace_back(op);
-  }
+#ifdef ENABLE_PADDLE_FRONTEND
+  paddle2onnx::CustomOp op;
+  strcpy(op.op_name, "multiclass_nms3");
+  strcpy(op.export_op_name, "MultiClassNMS");
+
   if (!paddle2onnx::Export(model_file.c_str(), params_file.c_str(),
                            &model_content_ptr, &model_content_size, 11, true,
-                           verbose, true, true, true, custom_ops.data(),
-                           custom_ops.size())) {
+                           verbose, true, true, true, &op,
+                           1)) {
     FDERROR << "Error occured while export PaddlePaddle to ONNX format."
             << std::endl;
     return false;
@@ -106,7 +103,7 @@ bool OrtBackend::InitFromPaddle(const std::string& model_file,
   model_content_ptr = nullptr;
   return InitFromOnnx(onnx_model_proto, option, true);
 #else
-  FDERROR << "Didn't compile with PaddlePaddle frontend, you can try to "
+  FDERROR << "Didn't compile with PaddlePaddle Frontend, you can try to "
              "call `InitFromOnnx` instead."
           << std::endl;
 #endif
diff --git a/fastdeploy/backends/tensorrt/trt_backend.cc b/fastdeploy/backends/tensorrt/trt_backend.cc
index ca10785599..ba6c329512 100755
--- a/fastdeploy/backends/tensorrt/trt_backend.cc
+++ b/fastdeploy/backends/tensorrt/trt_backend.cc
@@ -124,48 +124,20 @@ bool TrtBackend::InitFromPaddle(const std::string& model_file,
   option_ = option;
 
 #ifdef ENABLE_PADDLE_FRONTEND
-  std::vector<paddle2onnx::CustomOp> custom_ops;
-  for (auto& item : option_.custom_op_info_) {
-    paddle2onnx::CustomOp op;
-    std::strcpy(op.op_name, item.first.c_str());
-    std::strcpy(op.export_op_name, item.second.c_str());
-    custom_ops.emplace_back(op);
-  }
   char* model_content_ptr;
   int model_content_size = 0;
   char* calibration_cache_ptr;
   int calibration_cache_size = 0;
   if (!paddle2onnx::Export(model_file.c_str(), params_file.c_str(),
                            &model_content_ptr, &model_content_size, 11, true,
-                           verbose, true, true, true, custom_ops.data(),
-                           custom_ops.size(), "tensorrt",
+                           verbose, true, true, true, nullptr,
+                           0, "tensorrt",
                            &calibration_cache_ptr, &calibration_cache_size)) {
     FDERROR << "Error occured while export PaddlePaddle to ONNX format."
             << std::endl;
     return false;
   }
 
-  if (option_.remove_multiclass_nms_) {
-    char* new_model = nullptr;
-    int new_model_size = 0;
-    if (!paddle2onnx::RemoveMultiClassNMS(model_content_ptr, model_content_size,
-                                          &new_model, &new_model_size)) {
-      FDERROR << "Try to remove MultiClassNMS failed." << std::endl;
-      return false;
-    }
-    delete[] model_content_ptr;
-    std::string onnx_model_proto(new_model, new_model + new_model_size);
-    delete[] new_model;
-    if (calibration_cache_size) {
-      std::string calibration_str(
-          calibration_cache_ptr,
-          calibration_cache_ptr + calibration_cache_size);
-      calibration_str_ = calibration_str;
-      delete[] calibration_cache_ptr;
-    }
-    return InitFromOnnx(onnx_model_proto, option, true);
-  }
-
   std::string onnx_model_proto(model_content_ptr,
                                model_content_ptr + model_content_size);
   delete[] model_content_ptr;
diff --git a/fastdeploy/backends/tensorrt/trt_backend.h b/fastdeploy/backends/tensorrt/trt_backend.h
index 63782e9d0f..cb107af490 100755
--- a/fastdeploy/backends/tensorrt/trt_backend.h
+++ b/fastdeploy/backends/tensorrt/trt_backend.h
@@ -73,10 +73,6 @@ struct TrtBackendOption {
   std::string serialize_file = "";
   bool enable_pinned_memory = false;
   void* external_stream_ = nullptr;
-
-  // inside parameter, maybe remove next version
-  bool remove_multiclass_nms_ = false;
-  std::map<std::string, std::string> custom_op_info_;
 };
 
 std::vector<int> toVec(const nvinfer1::Dims& dim);
diff --git a/fastdeploy/runtime.cc b/fastdeploy/runtime.cc
index 86c533f6e6..0a9dff5358 100755
--- a/fastdeploy/runtime.cc
+++ b/fastdeploy/runtime.cc
@@ -675,10 +675,6 @@ void Runtime::CreateOrtBackend() {
   ort_option.gpu_id = option.device_id;
   ort_option.external_stream_ = option.external_stream_;
 
-  // TODO(jiangjiajun): inside usage, maybe remove this later
-  ort_option.remove_multiclass_nms_ = option.remove_multiclass_nms_;
-  ort_option.custom_op_info_ = option.custom_op_info_;
-
   FDASSERT(option.model_format == ModelFormat::PADDLE ||
                option.model_format == ModelFormat::ONNX,
            "OrtBackend only support model format of ModelFormat::PADDLE / "
@@ -715,10 +711,6 @@ void Runtime::CreateTrtBackend() {
   trt_option.enable_pinned_memory = option.enable_pinned_memory;
   trt_option.external_stream_ = option.external_stream_;
 
-  // TODO(jiangjiajun): inside usage, maybe remove this later
-  trt_option.remove_multiclass_nms_ = option.remove_multiclass_nms_;
-  trt_option.custom_op_info_ = option.custom_op_info_;
-
   FDASSERT(option.model_format == ModelFormat::PADDLE ||
                option.model_format == ModelFormat::ONNX,
            "TrtBackend only support model format of ModelFormat::PADDLE / "
diff --git a/fastdeploy/runtime.h b/fastdeploy/runtime.h
old mode 100755
new mode 100644
index 634c1f6333..e50e262c2f
--- a/fastdeploy/runtime.h
+++ b/fastdeploy/runtime.h
@@ -338,12 +338,6 @@ struct FASTDEPLOY_DECL RuntimeOption {
   std::string model_file = "";   // Path of model file
   std::string params_file = "";  // Path of parameters file, can be empty
   ModelFormat model_format = ModelFormat::AUTOREC;  // format of input model
-
-  // inside parameters, only for inside usage
-  // remove multiclass_nms in Paddle2ONNX
-  bool remove_multiclass_nms_ = false;
-  // for Paddle2ONNX to export custom operators
-  std::map<std::string, std::string> custom_op_info_;
 };
 
 /*! @brief Runtime object used to inference the loaded model on different devices
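The ppyoloe.cc diff below stops reading the second output as a single scalar box count and instead keeps one count per image. As context for that change: when NMS stays fused in the exported model, output 0 is a float tensor with one row per kept detection, laid out as (label_id, score, x1, y1, x2, y2), and output 1 holds the per-image counts. A parsing sketch under that layout (the Det struct and function name are mine, not FastDeploy API):

    #include <cstdio>
    #include <vector>

    struct Det { int label; float score, x1, y1, x2, y2; };

    // boxes: [total_boxes, 6] row-major floats; num_boxes: count per image.
    std::vector<Det> ParseFusedNms(const float* boxes,
                                   const std::vector<int>& num_boxes,
                                   int image_index) {
      int offset = 0;  // skip rows that belong to earlier images
      for (int i = 0; i < image_index; ++i) offset += num_boxes[i];
      std::vector<Det> out;
      out.reserve(num_boxes[image_index]);
      for (int i = 0; i < num_boxes[image_index]; ++i) {
        const float* row = boxes + (offset + i) * 6;
        out.push_back({static_cast<int>(row[0]), row[1], row[2], row[3],
                       row[4], row[5]});
      }
      return out;
    }

    int main() {
      const float boxes[12] = {0, 0.9f, 10, 10, 50, 50,
                               2, 0.8f, 20, 20, 80, 90};
      std::vector<int> counts = {2};  // one image, two detections
      for (const Det& d : ParseFusedNms(boxes, counts, 0)) {
        std::printf("label=%d score=%.2f\n", d.label, d.score);
      }
      return 0;
    }
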
diff --git a/fastdeploy/vision/detection/ppdet/ppyoloe.cc b/fastdeploy/vision/detection/ppdet/ppyoloe.cc
index c041d3ec20..cd6bc463e0 100644
--- a/fastdeploy/vision/detection/ppdet/ppyoloe.cc
+++ b/fastdeploy/vision/detection/ppdet/ppyoloe.cc
@@ -46,13 +46,6 @@ void PPYOLOE::GetNmsInfo() {
 }
 
 bool PPYOLOE::Initialize() {
-#ifdef ENABLE_PADDLE_FRONTEND
-  // remove multiclass_nms3 now
-  // this is a trick operation for ppyoloe while inference on trt
-  GetNmsInfo();
-  runtime_option.remove_multiclass_nms_ = true;
-  runtime_option.custom_op_info_["multiclass_nms3"] = "MultiClassNMS";
-#endif
   if (!BuildPreprocessPipelineFromConfig()) {
     FDERROR << "Failed to build preprocess pipeline from configuration file."
             << std::endl;
@@ -63,16 +56,6 @@ bool PPYOLOE::Initialize() {
     return false;
   }
 
-  if (has_nms_ && runtime_option.backend == Backend::TRT) {
-    FDINFO << "Detected operator multiclass_nms3 in your model, will replace "
-              "it with fastdeploy::backend::MultiClassNMS(background_label="
-           << background_label << ", keep_top_k=" << keep_top_k
-           << ", nms_eta=" << nms_eta << ", nms_threshold=" << nms_threshold
-           << ", score_threshold=" << score_threshold
-           << ", nms_top_k=" << nms_top_k << ", normalized=" << normalized
-           << ")." << std::endl;
-    has_nms_ = false;
-  }
   return true;
 }
 
@@ -198,6 +181,7 @@ bool PPYOLOE::Postprocess(std::vector<FDTensor>& infer_result,
   FDASSERT(infer_result[1].shape[0] == 1,
            "Only support batch = 1 in FastDeploy now.");
 
+  has_nms_ = true;
   if (!has_nms_) {
     int boxes_index = 0;
     int scores_index = 1;
@@ -237,19 +221,23 @@ bool PPYOLOE::Postprocess(std::vector<FDTensor>& infer_result,
                                 nms.out_box_data[i * 6 + 4],
                                 nms.out_box_data[i * 6 + 5]});
     }
   } else {
-    int box_num = 0;
+    std::vector<int> num_boxes(infer_result[1].shape[0]);
     if (infer_result[1].dtype == FDDataType::INT32) {
-      box_num = *(static_cast<int32_t*>(infer_result[1].Data()));
+      int32_t* data = static_cast<int32_t*>(infer_result[1].Data());
+      for (size_t i = 0; i < infer_result[1].shape[0]; ++i) {
+        num_boxes[i] = static_cast<int>(data[i]);
+      }
     } else if (infer_result[1].dtype == FDDataType::INT64) {
-      box_num = *(static_cast<int64_t*>(infer_result[1].Data()));
+      int64_t* data = static_cast<int64_t*>(infer_result[1].Data());
+      for (size_t i = 0; i < infer_result[1].shape[0]; ++i) {
+        num_boxes[i] = static_cast<int>(data[i]);
+      }
-    } else {
-      FDASSERT(
-          false,
-          "The output box_num of PPYOLOE model should be type of int32/int64.");
     }
-    result->Reserve(box_num);
+
+    // Only support batch = 1 now
+    result->Reserve(num_boxes[0]);
     float* box_data = static_cast<float*>(infer_result[0].Data());
-    for (size_t i = 0; i < box_num; ++i) {
+    for (size_t i = 0; i < num_boxes[0]; ++i) {
       result->label_ids.push_back(box_data[i * 6]);
       result->scores.push_back(box_data[i * 6 + 1]);
       result->boxes.emplace_back(
From 40d2f395ffa344fab94ea5488257228fe99be6d5 Mon Sep 17 00:00:00 2001
From: WJJ1995
Date: Fri, 4 Nov 2022 12:14:27 +0800
Subject: [PATCH 15/18] [Doc] Update vision_result doc (#494)

* add paddle_trt in benchmark

* update benchmark in device

* update benchmark

* update result doc
---
 docs/api_docs/python/vision_results_cn.md | 17 +++++++++++++++++
 docs/api_docs/python/vision_results_en.md | 20 ++++++++++++++++++++
 2 files changed, 37 insertions(+)

diff --git a/docs/api_docs/python/vision_results_cn.md b/docs/api_docs/python/vision_results_cn.md
index dab22e6a59..19b2a6662d 100644
--- a/docs/api_docs/python/vision_results_cn.md
+++ b/docs/api_docs/python/vision_results_cn.md
@@ -16,6 +16,7 @@ API:`fastdeploy.vision.SegmentationResult`, 
该结果返回: - **score_map**(list of float): 成员变量,与label_map一一对应的所预测的分割类别概率值(当导出模型时指定`--output_op argmax`)或者经过softmax归一化化后的概率值(当导出模型时指定`--output_op softmax`或者导出模型时指定`--output_op none`同时模型初始化的时候设置模型类成员属性`apply_softmax=true`). - **shape**(list of int): 成员变量,表示输出图片的尺寸,为`H*W`. + ## DetectionResult DetectionResult代码定义在`fastdeploy/vision/common/result.h`中,用于表明图像检测出来的目标框、目标类别和目标置信度. @@ -40,6 +41,7 @@ API:`fastdeploy.vision.FaceDetectionResult` , 该结果返回: - **landmarks**(list of list(float)): 成员变量,表示单张图片检测出来的所有人脸的关键点. - **landmarks_per_face**(int): 成员变量,表示每个人脸框中的关键点的数量. + ## KeyPointDetectionResult KeyPointDetectionResult 代码定义在`fastdeploy/vision/common/result.h`中,用于表明图像中目标行为的各个关键点坐标和置信度。 @@ -70,6 +72,7 @@ API:`fastdeploy.vision.MattingResult`, 该结果返回: - **contain_foreground**(bool): 表示预测的结果是否包含前景. - **shape**(list of int): 表示输出结果的shape,当`contain_foreground`为`false`,shape只包含`(H,W)`,当`contain_foreground`为`true`,shape包含`(H,W,C)`, C一般为3. + ## OCRResult OCRResult代码定义在`fastdeploy/vision/common/result.h`中,用于表明图像检测和识别出来的文本框,文本框方向分类,以及文本框内的文本内容. @@ -79,3 +82,17 @@ API:`fastdeploy.vision.OCRResult`, 该结果返回: - **rec_scores**(list of float): 成员变量,表示文本框内识别出来的文本的置信度,其元素个数与`boxes.size()`一致. - **cls_scores**(list of float): 成员变量,表示文本框的分类结果的置信度,其元素个数与`boxes.size()`一致. - **cls_labels**(list of int): 成员变量,表示文本框的方向分类类别,其元素个数与`boxes.size()`一致. + + +## FaceAlignmentResult +FaceAlignmentResult 代码定义在`fastdeploy/vision/common/result.h`中,用于表明人脸landmarks。 + +API:`fastdeploy.vision.FaceAlignmentResult`, 该结果返回: +- **landmarks**(list of list(float)): 成员变量,表示单张人脸图片检测出来的所有关键点 + + +## HeadPoseResult +HeadPoseResult 代码定义在`fastdeploy/vision/common/result.h`中,用于表明头部姿态结果。 + +API:`fastdeploy.vision.HeadPoseResult`, 该结果返回: +- **euler_angles**(list of float): 成员变量,表示单张人脸图片预测的欧拉角,存放的顺序是(yaw, pitch, roll), yaw 代表水平转角,pitch 代表垂直角,roll 代表翻滚角,值域都为 [-90, +90]度 diff --git a/docs/api_docs/python/vision_results_en.md b/docs/api_docs/python/vision_results_en.md index 513a011d7a..cbf4e2d5ae 100644 --- a/docs/api_docs/python/vision_results_en.md +++ b/docs/api_docs/python/vision_results_en.md @@ -10,6 +10,7 @@ API: `fastdeploy.vision.ClassifyResult`, The ClassifyResult will return: - **scores**(list of float):Member variables that indicate the confidence level of a single image on the corresponding classification result, the number of which is determined by the `topk` passed in when using the classification model, e.g. the confidence level of a Top 5 classification can be returned. + ## SegmentationResult The code of SegmentationResult is defined in `fastdeploy/vision/common/result.h` and is used to indicate the segmentation category predicted for each pixel in the image and the probability of the segmentation category. @@ -33,6 +34,7 @@ API: `fastdeploy.vision.Mask`, The Mask will return: - **data**:Member variable indicating a detected mask. - **shape**:Member variable representing the shape of the mask, e.g. `(H,W)`. + ## FaceDetectionResult The FaceDetectionResult code is defined in `fastdeploy/vision/common/result.h` and is used to indicate the target frames detected by face detection, face landmarks, target confidence and the number of landmarks per face. @@ -42,6 +44,7 @@ API: `fastdeploy.vision.FaceDetectionResult`, The FaceDetectionResult will retur - **landmarks**(list of list(float)): Member variables that represent the key points of all faces detected by a single image. - **landmarks_per_face**(int):Member variable indicating the number of key points in each face frame. 
+
 ## KeyPointDetectionResult
 The KeyPointDetectionResult code is defined in `fastdeploy/vision/common/result.h` and is used to indicate the coordinates and confidence of each keypoint of the target behavior in the image.
@@ -55,12 +58,14 @@ API:`fastdeploy.vision.KeyPointDetectionResult`, The KeyPointDetectionResult wil
   - `J`: num_joints(number of keypoints for a target)
 - **num_joints**(int): Member variable, representing the number of keypoints for a target
 
+
 ## FaceRecognitionResult
 The FaceRecognitionResult code is defined in `fastdeploy/vision/common/result.h` and is used to indicate the embedding of the image features by the face recognition model.
 
 API: `fastdeploy.vision.FaceRecognitionResult`, The FaceRecognitionResult will return:
 - **landmarks_per_face**(list of float):Member variables, which indicate the final extracted features embedding of the face recognition model, can be used to calculate the feature similarity between faces.
 
+
 ## MattingResult
 The MattingResult code is defined in `fastdeploy/vision/common/result.h` and is used to indicate the value of alpha transparency predicted by the model, the predicted outlook, etc.
@@ -70,6 +75,7 @@ API:`fastdeploy.vision.MattingResult`, The MattingResult will return:
 - **contain_foreground**(bool):Indicates whether the predicted outcome includes the foreground.
 - **shape**(list of int): When `contain_foreground` is false, the shape only contains `(H,W)`, when `contain_foreground` is `true,` the shape contains `(H,W,C)`, C is generally 3.
 
+
 ## OCRResult
 The OCRResult code is defined in `fastdeploy/vision/common/result.h` and is used to indicate the text box detected in the image, the text box orientation classification, and the text content recognized inside the text box.
@@ -79,3 +85,17 @@ API:`fastdeploy.vision.OCRResult`, The OCRResult will return:
 - **rec_scores**(list of float):Member variable indicating the confidence level of the text identified in the box, the number of elements is the same as `boxes.size()`.
 - **cls_scores**(list of float):Member variable indicating the confidence level of the classification result of the text box, with the same number of elements as `boxes.size()`.
 - **cls_labels**(list of int):Member variable indicating the orientation category of the text box, the number of elements is the same as `boxes.size()`.
+
+
+## FaceAlignmentResult
+The code of FaceAlignmentResult is defined in `fastdeploy/vision/common/result.h` and is used to indicate the key points of the face.
+
+API: `fastdeploy.vision.FaceAlignmentResult`, The FaceAlignmentResult will return:
+- **landmarks**(list of list(float)):Member variables that represent all the key points detected from a single face image.
+
+
+## HeadPoseResult
+The code of HeadPoseResult is defined in `fastdeploy/vision/common/result.h` and is used to indicate the head pose result.
+
+API: `fastdeploy.vision.HeadPoseResult`, The HeadPoseResult will return:
+- **euler_angles**(list of float):Member variables that represent the Euler angles predicted from a single face image. The storage order is (yaw, pitch, roll); yaw is the horizontal angle, pitch the vertical angle, roll the roll angle, and each value ranges over [-90, +90] degrees.
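To make the two newly documented structures concrete, here is a small C++ consumer (my sketch; the mirror structs below stand in for the FastDeploy types, assuming the field shapes described in the docs above):

    #include <array>
    #include <cstdio>
    #include <vector>

    struct HeadPose { std::vector<float> euler_angles; };
    struct FaceAlignment { std::vector<std::array<float, 2>> landmarks; };

    int main() {
      HeadPose pose{{10.f, -5.f, 2.f}};  // degrees, per the docs above
      FaceAlignment align{{{30.f, 40.f}, {60.f, 42.f}}};
      std::printf("yaw=%.1f pitch=%.1f roll=%.1f\n", pose.euler_angles[0],
                  pose.euler_angles[1], pose.euler_angles[2]);
      for (const auto& p : align.landmarks) {
        std::printf("landmark: (%.1f, %.1f)\n", p[0], p[1]);
      }
      return 0;
    }
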
From 61634caf286e172b0877c8c539c87b2d27e57ede Mon Sep 17 00:00:00 2001 From: WJJ1995 Date: Fri, 4 Nov 2022 15:49:28 +0800 Subject: [PATCH 16/18] [Bug Fix] Fixed for CI (#499) * add paddle_trt in benchmark * update benchmark in device * update benchmark * update result doc * fixed for CI --- examples/CMakeLists.txt | 2 +- examples/vision/facealign/pfld/cpp/CMakeLists.txt | 2 +- examples/vision/headpose/fsanet/cpp/CMakeLists.txt | 2 +- examples/vision/headpose/fsanet/cpp/infer.cc | 6 +++--- fastdeploy/vision/matting/contrib/rvm.cc | 3 ++- 5 files changed, 8 insertions(+), 7 deletions(-) mode change 100644 => 100755 examples/vision/headpose/fsanet/cpp/CMakeLists.txt mode change 100644 => 100755 fastdeploy/vision/matting/contrib/rvm.cc diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 7118460ea6..8aa469b6a5 100755 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -49,7 +49,7 @@ function(add_fastdeploy_executable FIELD CC_FILE) add_executable(${TEMP_TARGET_NAME} ${TEMP_TARGET_FILE}) target_link_libraries(${TEMP_TARGET_NAME} PUBLIC fastdeploy) if(TARGET gflags) - if(UNIX) + if(UNIX AND (NOT APPLE) AND (NOT ANDROID)) target_link_libraries(${TEMP_TARGET_NAME} PRIVATE gflags pthread) else() target_link_libraries(${TEMP_TARGET_NAME} PRIVATE gflags) diff --git a/examples/vision/facealign/pfld/cpp/CMakeLists.txt b/examples/vision/facealign/pfld/cpp/CMakeLists.txt index be329f69ac..c417fcb388 100755 --- a/examples/vision/facealign/pfld/cpp/CMakeLists.txt +++ b/examples/vision/facealign/pfld/cpp/CMakeLists.txt @@ -11,7 +11,7 @@ include_directories(${FASTDEPLOY_INCS}) add_executable(infer_demo ${PROJECT_SOURCE_DIR}/infer.cc) # 添加FastDeploy库依赖 -if(UNIX) +if(UNIX AND (NOT APPLE) AND (NOT ANDROID)) target_link_libraries(infer_demo ${FASTDEPLOY_LIBS} gflags pthread) else() target_link_libraries(infer_demo ${FASTDEPLOY_LIBS} gflags) diff --git a/examples/vision/headpose/fsanet/cpp/CMakeLists.txt b/examples/vision/headpose/fsanet/cpp/CMakeLists.txt old mode 100644 new mode 100755 index be329f69ac..c417fcb388 --- a/examples/vision/headpose/fsanet/cpp/CMakeLists.txt +++ b/examples/vision/headpose/fsanet/cpp/CMakeLists.txt @@ -11,7 +11,7 @@ include_directories(${FASTDEPLOY_INCS}) add_executable(infer_demo ${PROJECT_SOURCE_DIR}/infer.cc) # 添加FastDeploy库依赖 -if(UNIX) +if(UNIX AND (NOT APPLE) AND (NOT ANDROID)) target_link_libraries(infer_demo ${FASTDEPLOY_LIBS} gflags pthread) else() target_link_libraries(infer_demo ${FASTDEPLOY_LIBS} gflags) diff --git a/examples/vision/headpose/fsanet/cpp/infer.cc b/examples/vision/headpose/fsanet/cpp/infer.cc index 332f492606..522ec3d954 100644 --- a/examples/vision/headpose/fsanet/cpp/infer.cc +++ b/examples/vision/headpose/fsanet/cpp/infer.cc @@ -44,7 +44,7 @@ bool CreateRuntimeOption(fastdeploy::RuntimeOption* option) { } else if (FLAGS_backend == "trt" || FLAGS_backend == "paddle_trt") { option->UseTrtBackend(); - option.SetTrtInputShape("images", {1, 3, 64, 64}); + option->SetTrtInputShape("images", {1, 3, 64, 64}); if (FLAGS_backend == "paddle_trt") { option->EnablePaddleToTrt(); } @@ -54,7 +54,7 @@ bool CreateRuntimeOption(fastdeploy::RuntimeOption* option) { } else if (FLAGS_backend == "default") { return true; } else { - std::cout << "While inference with GPU, only support default/ort/paddle/trt/paddle_trt now, " << FLAG_backend << " is not supported." << std::endl; + std::cout << "While inference with GPU, only support default/ort/paddle/trt/paddle_trt now, " << FLAGS_backend << " is not supported." 
              << std::endl;
     return false;
   }
 } else if (FLAGS_device == "cpu") {
   if (FLAGS_backend == "ort") {
     option->UseOrtBackend();
   } else if (FLAGS_backend == "ov") {
     option->UseOpenVINOBackend();
   } else if (FLAGS_backend == "paddle") {
     option->UsePaddleBackend();
   } else if (FLAGS_backend == "default") {
     return true;
   } else {
-    std::cout << "While inference with CPU, only support default/ort/ov/paddle now, " << FLAG_backend << " is not supported." << std::endl;
+    std::cout << "While inference with CPU, only support default/ort/ov/paddle now, " << FLAGS_backend << " is not supported." << std::endl;
     return false;
   }
 } else {
diff --git a/fastdeploy/vision/matting/contrib/rvm.cc b/fastdeploy/vision/matting/contrib/rvm.cc
old mode 100644
new mode 100755
index ec8ed19fc4..6f48a38652
--- a/fastdeploy/vision/matting/contrib/rvm.cc
+++ b/fastdeploy/vision/matting/contrib/rvm.cc
@@ -138,7 +138,8 @@ bool RobustVideoMatting::Postprocess(
 
   result->Clear();
   result->contain_foreground = true;
-  result->shape = {static_cast<int64_t>(in_h), static_cast<int64_t>(in_w)};
+  // if contain_foreground == true, shape must set to (h, w, c)
+  result->shape = {static_cast<int64_t>(in_h), static_cast<int64_t>(in_w), 3};
   int numel = in_h * in_w;
   int nbytes = numel * sizeof(float);
   result->Resize(numel);
From 277bec38c74f38d69933c1d061ac5cbfc7507fe6 Mon Sep 17 00:00:00 2001
From: heliqi <1101791222@qq.com>
Date: Fri, 4 Nov 2022 17:16:40 +0800
Subject: [PATCH 17/18] [Backend & Serving] Serving and Runtime support Clone
 (#464)

* Add Serving and Runtime use Clone

* support TRT, OpenVINO and Paddle Backend

Co-authored-by: Jason
---
 fastdeploy/backends/backend.h                |   6 +
 fastdeploy/backends/openvino/ov_backend.cc   |  45 ++++-
 fastdeploy/backends/openvino/ov_backend.h    |  10 +-
 fastdeploy/backends/paddle/paddle_backend.cc |  24 +++
 fastdeploy/backends/paddle/paddle_backend.h  |   7 +
 fastdeploy/backends/tensorrt/trt_backend.cc  |  55 ++++++-
 fastdeploy/backends/tensorrt/trt_backend.h   |  11 +-
 fastdeploy/core/fd_type.cc                   |  27 +++
 fastdeploy/core/fd_type.h                    |  12 ++
 fastdeploy/runtime.cc                        |  59 ++++---
 fastdeploy/runtime.h                         |  68 ++++----
 serving/docs/zh_CN/model_configuration.md    |   4 +-
 serving/src/fastdeploy_runtime.cc            | 165 ++++++++++---------
 13 files changed, 343 insertions(+), 150 deletions(-)

diff --git a/fastdeploy/backends/backend.h b/fastdeploy/backends/backend.h
index 620aea9f4b..652d94cb88 100644
--- a/fastdeploy/backends/backend.h
+++ b/fastdeploy/backends/backend.h
@@ -21,6 +21,7 @@
 
 #include "fastdeploy/backends/common/multiclass_nms.h"
 #include "fastdeploy/core/fd_tensor.h"
+#include "fastdeploy/core/fd_type.h"
 
 namespace fastdeploy {
 
@@ -63,6 +64,11 @@ class BaseBackend {
   virtual std::vector<TensorInfo> GetOutputInfos() = 0;
   virtual bool Infer(std::vector<FDTensor>& inputs,
                      std::vector<FDTensor>* outputs) = 0;
+  virtual std::unique_ptr<BaseBackend> Clone(void *stream = nullptr,
+                                             int device_id = -1) {
+    FDERROR << "Clone no support" << std::endl;
+    return nullptr;
+  }
 };
 
 }  // namespace fastdeploy
diff --git a/fastdeploy/backends/openvino/ov_backend.cc b/fastdeploy/backends/openvino/ov_backend.cc
index f205b48e2e..5a664fc877 100644
--- a/fastdeploy/backends/openvino/ov_backend.cc
+++ b/fastdeploy/backends/openvino/ov_backend.cc
@@ -74,6 +74,8 @@ ov::element::Type FDDataTypeToOV(const FDDataType& type) {
   return ov::element::f32;
 }
 
+ov::Core OpenVINOBackend::core_;
+
 void OpenVINOBackend::InitTensorInfo(
     const std::vector<ov::Output<ov::Node>>& ov_outputs,
     std::map<std::string, TensorInfo>* tensor_infos) {
@@ -96,10 +98,6 @@ bool OpenVINOBackend::InitFromPaddle(const std::string& model_file,
     return false;
   }
   option_ = option;
-  ov::AnyMap properties;
-  if (option_.cpu_thread_num > 0) {
-    properties["INFERENCE_NUM_THREADS"] = option_.cpu_thread_num;
-  }
 
   std::shared_ptr<ov::Model> model =
      core_.read_model(model_file, params_file);
@@ -149,7 +147,19 @@ bool OpenVINOBackend::InitFromPaddle(const std::string& model_file,
     output_infos_.push_back(iter->second);
   }
 
+  ov::AnyMap properties;
+  if (option_.cpu_thread_num > 0) {
+    properties["INFERENCE_NUM_THREADS"] = option_.cpu_thread_num;
+  }
+  if (option_.ov_num_streams == -1) {
+    properties["NUM_STREAMS"] = ov::streams::AUTO;
+  } else if (option_.ov_num_streams == -2) {
+    properties["NUM_STREAMS"] = ov::streams::NUMA;
+  } else if (option_.ov_num_streams > 0) {
+    properties["NUM_STREAMS"] = option_.ov_num_streams;
+  }
   compiled_model_ = core_.compile_model(model, "CPU", properties);
+
   request_ = compiled_model_.create_infer_request();
   initialized_ = true;
   return true;
@@ -185,10 +195,6 @@ bool OpenVINOBackend::InitFromOnnx(const std::string& model_file,
     return false;
   }
   option_ = option;
-  ov::AnyMap properties;
-  if (option_.cpu_thread_num > 0) {
-    properties["INFERENCE_NUM_THREADS"] = option_.cpu_thread_num;
-  }
 
   std::shared_ptr<ov::Model> model = core_.read_model(model_file);
@@ -238,8 +244,21 @@ bool OpenVINOBackend::InitFromOnnx(const std::string& model_file,
     output_infos_.push_back(iter->second);
   }
 
+  ov::AnyMap properties;
+  if (option_.cpu_thread_num > 0) {
+    properties["INFERENCE_NUM_THREADS"] = option_.cpu_thread_num;
+  }
+  if (option_.ov_num_streams == -1) {
+    properties["NUM_STREAMS"] = ov::streams::AUTO;
+  } else if (option_.ov_num_streams == -2) {
+    properties["NUM_STREAMS"] = ov::streams::NUMA;
+  } else if (option_.ov_num_streams > 0) {
+    properties["NUM_STREAMS"] = option_.ov_num_streams;
+  }
   compiled_model_ = core_.compile_model(model, "CPU", properties);
+
   request_ = compiled_model_.create_infer_request();
+
   initialized_ = true;
   return true;
 }
@@ -281,4 +300,14 @@ bool OpenVINOBackend::Infer(std::vector<FDTensor>& inputs,
   return true;
 }
 
+std::unique_ptr<BaseBackend> OpenVINOBackend::Clone(void *stream, int device_id) {
+  std::unique_ptr<BaseBackend> new_backend = utils::make_unique<OpenVINOBackend>();
+  auto casted_backend = dynamic_cast<OpenVINOBackend*>(new_backend.get());
+  casted_backend->option_ = option_;
+  casted_backend->request_ = compiled_model_.create_infer_request();
+  casted_backend->input_infos_.assign(input_infos_.begin(), input_infos_.end());
+  casted_backend->output_infos_.assign(output_infos_.begin(), output_infos_.end());
+  return new_backend;
+}
+
 }  // namespace fastdeploy
diff --git a/fastdeploy/backends/openvino/ov_backend.h b/fastdeploy/backends/openvino/ov_backend.h
index 5dd362d52a..b7d77e58ff 100644
--- a/fastdeploy/backends/openvino/ov_backend.h
+++ b/fastdeploy/backends/openvino/ov_backend.h
@@ -20,17 +20,20 @@
 #include
 
 #include "fastdeploy/backends/backend.h"
+#include "fastdeploy/utils/unique_ptr.h"
 #include "openvino/openvino.hpp"
 
 namespace fastdeploy {
 
 struct OpenVINOBackendOption {
-  int cpu_thread_num = 8;
+  int cpu_thread_num = -1;
+  int ov_num_streams = 1;
   std::map<std::string, std::vector<int64_t>> shape_infos;
 };
 
 class OpenVINOBackend : public BaseBackend {
  public:
+  static ov::Core core_;
   OpenVINOBackend() {}
   virtual ~OpenVINOBackend() = default;
 
@@ -54,10 +57,13 @@ class OpenVINOBackend : public BaseBackend {
   std::vector<TensorInfo> GetInputInfos() override;
   std::vector<TensorInfo> GetOutputInfos() override;
 
+  std::unique_ptr<BaseBackend> Clone(void *stream = nullptr,
+                                     int device_id = -1) override;
+
  private:
   void InitTensorInfo(const std::vector<ov::Output<ov::Node>>& ov_outputs,
                       std::map<std::string, TensorInfo>* tensor_infos);
-  ov::Core core_;
+
   ov::CompiledModel compiled_model_;
   ov::InferRequest request_;
   OpenVINOBackendOption option_;
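The stream handling just added maps -1 to ov::streams::AUTO, -2 to ov::streams::NUMA, and any positive value to a fixed stream count. From the user side this is driven through the RuntimeOption::SetOpenVINOStreams setter added later in this patch; a usage sketch (mine; the model path is a placeholder and the SetModelPath signature is assumed from the public API):

    #include "fastdeploy/runtime.h"

    int main() {
      fastdeploy::RuntimeOption option;
      option.SetModelPath("model.onnx", "", fastdeploy::ModelFormat::ONNX);
      option.UseCpu();
      option.UseOpenVINOBackend();
      option.SetOpenVINOStreams(-1);  // -1: let OpenVINO pick (AUTO)
      fastdeploy::Runtime runtime;
      return runtime.Init(option) ? 0 : -1;
    }
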
diff --git a/fastdeploy/backends/paddle/paddle_backend.cc b/fastdeploy/backends/paddle/paddle_backend.cc
index 61e5fb414f..70d8305c51 100644
--- a/fastdeploy/backends/paddle/paddle_backend.cc
+++ b/fastdeploy/backends/paddle/paddle_backend.cc
@@ -216,6 +216,30 @@ bool PaddleBackend::Infer(std::vector<FDTensor>& inputs,
   return true;
 }
 
+std::unique_ptr<BaseBackend> PaddleBackend::Clone(void *stream, int device_id) {
+  std::unique_ptr<BaseBackend> new_backend = utils::make_unique<PaddleBackend>();
+  auto casted_backend = dynamic_cast<PaddleBackend*>(new_backend.get());
+  if(device_id > 0 && option_.use_gpu == true && device_id != option_.gpu_id) {
+    auto clone_option = option_;
+    clone_option.gpu_id = device_id;
+    clone_option.external_stream_ = stream;
+    casted_backend->InitFromPaddle(clone_option.model_file,
+                                   clone_option.params_file,
+                                   clone_option);
+    FDWARNING << "The target device id:"
+              << device_id
+              << " is different from current device id:"
+              << option_.gpu_id
+              << ", cannot share memory with current engine."
+              << std::endl;
+    return new_backend;
+  }
+  casted_backend->inputs_desc_.assign(inputs_desc_.begin(), inputs_desc_.end());
+  casted_backend->outputs_desc_.assign(outputs_desc_.begin(), outputs_desc_.end());
+  casted_backend->predictor_ = std::move(predictor_->Clone(stream));
+  return new_backend;
+}
+
 #ifdef ENABLE_TRT_BACKEND
 void PaddleBackend::SetTRTDynamicShapeToConfig(const PaddleBackendOption& option) {
   std::map<std::string, std::vector<int>> max_shape;
diff --git a/fastdeploy/backends/paddle/paddle_backend.h b/fastdeploy/backends/paddle/paddle_backend.h
index 43f8e67e6d..0c674494e9 100755
--- a/fastdeploy/backends/paddle/paddle_backend.h
+++ b/fastdeploy/backends/paddle/paddle_backend.h
@@ -24,6 +24,7 @@
 #include "paddle2onnx/converter.h"
 #endif
 #include "paddle_inference_api.h"  // NOLINT
+#include "fastdeploy/utils/unique_ptr.h"
 
 #ifdef ENABLE_TRT_BACKEND
 #include "fastdeploy/backends/tensorrt/trt_backend.h"
@@ -43,6 +44,9 @@ struct IpuOption {
 };
 
 struct PaddleBackendOption {
+  std::string model_file = "";   // Path of model file
+  std::string params_file = "";  // Path of parameters file, can be empty
+
 #ifdef WITH_GPU
   bool use_gpu = true;
 #else
@@ -110,6 +114,9 @@ class PaddleBackend : public BaseBackend {
 
   int NumOutputs() const override { return outputs_desc_.size(); }
 
+  std::unique_ptr<BaseBackend> Clone(void *stream = nullptr,
+                                     int device_id = -1) override;
+
   TensorInfo GetInputInfo(int index) override;
   TensorInfo GetOutputInfo(int index) override;
   std::vector<TensorInfo> GetInputInfos() override;
diff --git a/fastdeploy/backends/tensorrt/trt_backend.cc b/fastdeploy/backends/tensorrt/trt_backend.cc
index ba6c329512..2306cb2390 100755
--- a/fastdeploy/backends/tensorrt/trt_backend.cc
+++ b/fastdeploy/backends/tensorrt/trt_backend.cc
@@ -285,6 +285,7 @@ bool TrtBackend::Infer(std::vector<FDTensor>& inputs,
     BuildTrtEngine();
   }
 
+  cudaSetDevice(option_.gpu_id);
   SetInputs(inputs);
   AllocateOutputsBuffer(outputs);
 
@@ -356,13 +357,17 @@ void TrtBackend::GetInputOutputInfo() {
       outputs_device_buffer_[name] = FDDeviceBuffer(dtype);
       casted_output_tensors_[name] = FDTensor();
     }
+    io_name_index_[name] = i;
   }
   bindings_.resize(num_binds);
 }
 
 void TrtBackend::SetInputs(const std::vector<FDTensor>& inputs) {
   for (const auto& item : inputs) {
-    auto idx = engine_->getBindingIndex(item.name.c_str());
+    // auto idx = engine_->getBindingIndex(item.name.c_str());
+    auto iter = io_name_index_.find(item.name);
+    FDASSERT(iter != io_name_index_.end(), "TRTBackend SetInputs not find name:%s", item.name.c_str());
+    auto idx = iter->second;
     std::vector<int> shape(item.shape.begin(), item.shape.end());
     auto dims = ToDims(shape);
     context_->setBindingDimensions(idx, dims);
@@ -410,7 +415,10 @@ void TrtBackend::AllocateOutputsBuffer(std::vector<FDTensor>* outputs) {
     outputs->resize(outputs_desc_.size());
   }
   for (size_t i = 0; i < outputs_desc_.size(); ++i) {
-    auto idx = engine_->getBindingIndex(outputs_desc_[i].name.c_str());
+    // auto idx = engine_->getBindingIndex(outputs_desc_[i].name.c_str());
+    auto idx_iter = io_name_index_.find(outputs_desc_[i].name);
+    FDASSERT(idx_iter != io_name_index_.end(), "TRTBackend Outputs not find name:%s", outputs_desc_[i].name.c_str());
+    auto idx = idx_iter->second;
     auto output_dims = context_->getBindingDimensions(idx);
 
     // find the original index of output
@@ -673,4 +681,47 @@ std::vector<TensorInfo> TrtBackend::GetOutputInfos() {
   return infos;
 }
 
+std::unique_ptr<BaseBackend> TrtBackend::Clone(void *stream, int device_id) {
+  std::unique_ptr<BaseBackend> new_backend = utils::make_unique<TrtBackend>();
+  auto casted_backend = dynamic_cast<TrtBackend*>(new_backend.get());
+  if(device_id > 0 && device_id != option_.gpu_id) {
+    auto clone_option = option_;
+    clone_option.gpu_id = device_id;
+    clone_option.external_stream_ = stream;
+    if (option_.model_format == ModelFormat::ONNX) {
+      FDASSERT(casted_backend->InitFromOnnx(option_.model_file, clone_option),
+               "Clone model from ONNX failed while initialize TrtBackend.");
+    } else {
+      FDASSERT(casted_backend->InitFromPaddle(option_.model_file,
+                                              option_.params_file, clone_option),
+               "Clone model from Paddle failed while initialize TrtBackend.");
+    }
+    FDWARNING << "The target device id:"
+              << device_id
+              << " is different from current device id:"
+              << option_.gpu_id
+              << ", cannot share memory with current engine."
+              << std::endl;
+    return new_backend;
+  }
+  cudaSetDevice(option_.gpu_id);
+  casted_backend->option_.gpu_id = option_.gpu_id;
+  if (stream) {
+    casted_backend->stream_ = reinterpret_cast<cudaStream_t>(stream);
+  } else {
+    FDASSERT(cudaStreamCreate(&casted_backend->stream_) == 0,
+             "[ERROR] Error occurs while clone calling cudaStreamCreate().");
+  }
+  casted_backend->inputs_desc_.assign(inputs_desc_.begin(), inputs_desc_.end());
+  casted_backend->outputs_desc_.assign(outputs_desc_.begin(), outputs_desc_.end());
+  casted_backend->outputs_order_.insert(outputs_order_.begin(), outputs_order_.end());
+  casted_backend->shape_range_info_.insert(shape_range_info_.begin(), shape_range_info_.end());
+  casted_backend->engine_ = engine_;
+  casted_backend->context_ = std::shared_ptr<nvinfer1::IExecutionContext>(
+      casted_backend->engine_->createExecutionContext());
+  casted_backend->GetInputOutputInfo();
+  FDINFO << "TRTBackend clone finish." << std::endl;
+  return new_backend;
+}
+
 }  // namespace fastdeploy
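With TrtBackend::Clone in place, the serving-oriented pattern this patch enables is one loaded engine shared across worker threads, each holding a cheap clone. A sketch of the caller side (mine, based on the Runtime::Clone added later in this patch; paths are placeholders and error handling is trimmed):

    #include <memory>
    #include <thread>
    #include <vector>
    #include "fastdeploy/runtime.h"

    int main() {
      fastdeploy::RuntimeOption option;
      option.SetModelPath("model.pdmodel", "model.pdiparams");
      option.UseGpu(0);
      option.UseTrtBackend();
      fastdeploy::Runtime runtime;
      if (!runtime.Init(option)) return -1;

      // Clones share the TensorRT engine (same device), so each worker only
      // pays for its own execution context and I/O buffers.
      std::vector<std::unique_ptr<fastdeploy::Runtime>> clones;
      for (int i = 0; i < 4; ++i) clones.emplace_back(runtime.Clone());
      std::vector<std::thread> workers;
      for (auto& rt : clones) {
        workers.emplace_back([&rt]() {
          // Per thread: fill std::vector<fastdeploy::FDTensor> inputs and
          // call rt->Infer(inputs, &outputs).
        });
      }
      for (auto& t : workers) t.join();
      return 0;
    }
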
diff --git a/fastdeploy/backends/tensorrt/trt_backend.h b/fastdeploy/backends/tensorrt/trt_backend.h
index cb107af490..7ef931f90c 100755
--- a/fastdeploy/backends/tensorrt/trt_backend.h
+++ b/fastdeploy/backends/tensorrt/trt_backend.h
@@ -25,6 +25,7 @@
 #include "NvOnnxParser.h"
 #include "fastdeploy/backends/backend.h"
 #include "fastdeploy/backends/tensorrt/utils.h"
+#include "fastdeploy/utils/unique_ptr.h"
 
 class Int8EntropyCalibrator2 : public nvinfer1::IInt8EntropyCalibrator2 {
  public:
@@ -45,7 +46,7 @@ class Int8EntropyCalibrator2 : public nvinfer1::IInt8EntropyCalibrator2 {
 
   void writeCalibrationCache(const void* cache,
                              size_t length) noexcept override {
-    std::cout << "NOT IMPLEMENT." << std::endl;
+    fastdeploy::FDERROR << "NOT IMPLEMENT." << std::endl;
   }
 
  private:
@@ -62,6 +63,11 @@ struct TrtValueInfo {
 };
 
 struct TrtBackendOption {
+  std::string model_file = "";   // Path of model file
+  std::string params_file = "";  // Path of parameters file, can be empty
+  // format of input model
+  ModelFormat model_format = ModelFormat::AUTOREC;
+
   int gpu_id = 0;
   bool enable_fp16 = false;
   bool enable_int8 = false;
@@ -99,6 +105,8 @@ class TrtBackend : public BaseBackend {
   TensorInfo GetOutputInfo(int index);
   std::vector<TensorInfo> GetInputInfos() override;
   std::vector<TensorInfo> GetOutputInfos() override;
+  std::unique_ptr<BaseBackend> Clone(void *stream = nullptr,
+                                     int device_id = -1) override;
 
   ~TrtBackend() {
     if (parser_) {
@@ -119,6 +127,7 @@ class TrtBackend : public BaseBackend {
   std::vector<TrtValueInfo> outputs_desc_;
   std::map<std::string, FDDeviceBuffer> inputs_device_buffer_;
   std::map<std::string, FDDeviceBuffer> outputs_device_buffer_;
+  std::map<std::string, int> io_name_index_;
 
   std::string calibration_str_;
diff --git a/fastdeploy/core/fd_type.cc b/fastdeploy/core/fd_type.cc
index 45ca90a1b6..5712bb2785 100644
--- a/fastdeploy/core/fd_type.cc
+++ b/fastdeploy/core/fd_type.cc
@@ -182,4 +182,31 @@ const FDDataType TypeToDataType<uint8_t>::dtype = UINT8;
 template <>
 const FDDataType TypeToDataType<int8_t>::dtype = INT8;
 
+std::string Str(const ModelFormat& f) {
+  if (f == ModelFormat::PADDLE) {
+    return "ModelFormat::PADDLE";
+  } else if (f == ModelFormat::ONNX) {
+    return "ModelFormat::ONNX";
+  } else if (f == ModelFormat::RKNN) {
+    return "ModelFormat::RKNN";
+  } else if (f == ModelFormat::TORCHSCRIPT) {
+    return "ModelFormat::TORCHSCRIPT";
+  }
+  return "UNKNOWN-ModelFormat";
+}
+
+std::ostream& operator<<(std::ostream& out, const ModelFormat& format) {
+  if (format == ModelFormat::PADDLE) {
+    out << "ModelFormat::PADDLE";
+  } else if (format == ModelFormat::ONNX) {
+    out << "ModelFormat::ONNX";
+  } else if (format == ModelFormat::RKNN) {
+    out << "ModelFormat::RKNN";
+  } else if (format == ModelFormat::TORCHSCRIPT) {
+    out << "ModelFormat::TORCHSCRIPT";
+  }
+  out << "UNKNOWN-ModelFormat";
+  return out;
+}
+
 }  // namespace fastdeploy
diff --git a/fastdeploy/core/fd_type.h b/fastdeploy/core/fd_type.h
index 5236601b00..131de20d42 100644
--- a/fastdeploy/core/fd_type.h
+++ b/fastdeploy/core/fd_type.h
@@ -65,4 +65,16 @@ struct FASTDEPLOY_DECL TypeToDataType {
   static const FDDataType dtype;
 };
 
+/*! Deep learning model format */
+enum ModelFormat {
+  AUTOREC,      ///< Auto recognize the model format by model file name
+  PADDLE,       ///< Model with paddlepaddle format
+  ONNX,         ///< Model with ONNX format
+  RKNN,         ///< Model with RKNN format
+  TORCHSCRIPT,  ///< Model with TorchScript format
+};
+
+FASTDEPLOY_DECL std::ostream& operator<<(std::ostream& out,
+                                         const ModelFormat& format);
+
 }  // namespace fastdeploy
diff --git a/fastdeploy/runtime.cc b/fastdeploy/runtime.cc
index 0a9dff5358..94ea9de0b0 100755
--- a/fastdeploy/runtime.cc
+++ b/fastdeploy/runtime.cc
@@ -102,19 +102,6 @@ std::string Str(const Backend& b) {
   return "UNKNOWN-Backend";
 }
 
-std::string Str(const ModelFormat& f) {
-  if (f == ModelFormat::PADDLE) {
-    return "ModelFormat::PADDLE";
-  } else if (f == ModelFormat::ONNX) {
-    return "ModelFormat::ONNX";
-  } else if (f == ModelFormat::RKNN) {
-    return "ModelFormat::RKNN";
-  } else if (f == ModelFormat::TORCHSCRIPT) {
-    return "ModelFormat::TORCHSCRIPT";
-  }
-  return "UNKNOWN-ModelFormat";
-}
-
 std::ostream& operator<<(std::ostream& out, const Backend& backend) {
   if (backend == Backend::ORT) {
     out << "Backend::ORT";
@@ -135,20 +122,6 @@ std::ostream& operator<<(std::ostream& out, const Backend& backend) {
   return out;
 }
 
-std::ostream& operator<<(std::ostream& out, const ModelFormat& format) {
-  if (format == ModelFormat::PADDLE) {
-    out << "ModelFormat::PADDLE";
-  } else if (format == ModelFormat::ONNX) {
-    out << "ModelFormat::ONNX";
-  } else if (format == ModelFormat::RKNN) {
-    out << "ModelFormat::RKNN";
-  } else if (format == ModelFormat::TORCHSCRIPT) {
-    out << "ModelFormat::TORCHSCRIPT";
-  }
-  out << "UNKNOWN-ModelFormat";
-  return out;
-}
-
 bool CheckModelFormat(const std::string& model_file,
                       const ModelFormat& model_format) {
   if (model_format == ModelFormat::PADDLE) {
@@ -411,6 +384,10 @@ void RuntimeOption::SetTrtCacheFile(const std::string& cache_file_path) {
   trt_serialize_file = cache_file_path;
 }
 
+void RuntimeOption::SetOpenVINOStreams(int num_streams) {
+  ov_num_streams = num_streams;
+}
+
 bool Runtime::Compile(std::vector<std::vector<FDTensor>>& prewarm_tensors,
                       const RuntimeOption& _option) {
 #ifdef ENABLE_POROS_BACKEND
@@ -582,6 +559,8 @@ bool Runtime::Infer(std::vector<FDTensor>& input_tensors,
 void Runtime::CreatePaddleBackend() {
 #ifdef ENABLE_PADDLE_BACKEND
   auto pd_option = PaddleBackendOption();
+  pd_option.model_file = option.model_file;
+  pd_option.params_file = option.params_file;
   pd_option.enable_mkldnn = option.pd_enable_mkldnn;
   pd_option.enable_log_info = option.pd_enable_log_info;
   pd_option.mkldnn_cache_size = option.pd_mkldnn_cache_size;
@@ -642,6 +621,7 @@ void Runtime::CreateOpenVINOBackend() {
 #ifdef ENABLE_OPENVINO_BACKEND
   auto ov_option = OpenVINOBackendOption();
   ov_option.cpu_thread_num = option.cpu_thread_num;
+  ov_option.ov_num_streams = option.ov_num_streams;
   FDASSERT(option.model_format == ModelFormat::PADDLE ||
                option.model_format == ModelFormat::ONNX,
            "OpenVINOBackend only support model format of ModelFormat::PADDLE /
@@ -699,6 +679,9 @@ void Runtime::CreateOrtBackend()
 void Runtime::CreateTrtBackend() {
 #ifdef ENABLE_TRT_BACKEND
   auto trt_option = TrtBackendOption();
+  trt_option.model_file = option.model_file;
+  trt_option.params_file = option.params_file;
+  trt_option.model_format = option.model_format;
   trt_option.gpu_id = option.device_id;
   trt_option.enable_fp16 = option.trt_enable_fp16;
   trt_option.enable_int8 = option.trt_enable_int8;
@@ -771,4 +754,26 @@ void Runtime::CreateRKNPU2Backend() {
 #endif
 }
 
+Runtime* Runtime::Clone(void* stream, int device_id) {
+  Runtime*
+Runtime* Runtime::Clone(void* stream, int device_id) {
+  Runtime* runtime = new Runtime();
+  if (option.backend != Backend::OPENVINO &&
+      option.backend != Backend::PDINFER &&
+      option.backend != Backend::TRT) {
+    runtime->Init(option);
+    FDWARNING << "Only OpenVINO/Paddle Inference/TensorRT support cloning an "
+              << "engine to reduce CPU/GPU memory usage now. For "
+              << option.backend
+              << ", FastDeploy will create a new engine which will not share "
+              << "memory with the current runtime."
+              << std::endl;
+    return runtime;
+  }
+  FDINFO << "Runtime Clone with Backend: " << Str(option.backend) << " in "
+         << Str(option.device) << "." << std::endl;
+  runtime->option = option;
+  runtime->backend_ = backend_->Clone(stream, device_id);
+  return runtime;
+}
+
 } // namespace fastdeploy
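The point of the new Clone path: initialize one Runtime, then hand out clones that share the underlying OpenVINO/Paddle Inference/TensorRT engine instead of rebuilding it per instance. A rough usage sketch; the model paths are placeholders, and SetModelPath/UseTrtBackend are the existing RuntimeOption helpers:

    #include "fastdeploy/runtime.h"

    int main() {
      fastdeploy::RuntimeOption option;
      option.SetModelPath("model.pdmodel", "model.pdiparams");  // placeholder paths
      option.UseGpu(0);
      option.UseTrtBackend();

      fastdeploy::Runtime runtime;
      if (!runtime.Init(option)) return -1;  // the first instance builds the engine

      // A second instance shares that engine instead of holding a full copy.
      // For unsupported backends, Clone falls back to a fresh Init with a warning.
      fastdeploy::Runtime* cloned = runtime.Clone(/*stream=*/nullptr, /*device_id=*/0);
      delete cloned;
      return 0;
    }
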
diff --git a/fastdeploy/runtime.h b/fastdeploy/runtime.h
index e50e262c2f..7ab6f1fb25 100644
--- a/fastdeploy/runtime.h
+++ b/fastdeploy/runtime.h
@@ -35,38 +35,27 @@ namespace fastdeploy {
 
 /*! Inference backend supported in FastDeploy */
 enum Backend {
-  UNKNOWN,  ///< Unknown inference backend
+  UNKNOWN,   ///< Unknown inference backend
   ORT,  ///< ONNX Runtime, support Paddle/ONNX format model, CPU / Nvidia GPU
-  TRT,  ///< TensorRT, support Paddle/ONNX format model, Nvidia GPU only
-  PDINFER,  ///< Paddle Inference, support Paddle format model, CPU / Nvidia GPU
-  POROS,  ///< Poros, support TorchScript format model, CPU / Nvidia GPU
-  OPENVINO,  ///< Intel OpenVINO, support Paddle/ONNX format, CPU only
+  TRT,       ///< TensorRT, support Paddle/ONNX format model, Nvidia GPU only
+  PDINFER,   ///< Paddle Inference, support Paddle format model, CPU / Nvidia GPU
+  POROS,     ///< Poros, support TorchScript format model, CPU / Nvidia GPU
+  OPENVINO,  ///< Intel OpenVINO, support Paddle/ONNX format, CPU only
   LITE,  ///< Paddle Lite, support Paddle format model, ARM CPU only
   RKNPU2,  ///< RKNPU2, support RKNN format model, Rockchip NPU only
 };
 
-/*! Deep learning model format */
-enum ModelFormat {
-  AUTOREC,      ///< Auto recognize the model format by model file name
-  PADDLE,       ///< Model with paddlepaddle format
-  ONNX,         ///< Model with ONNX format
-  RKNN,         ///< Model with RKNN format
-  TORCHSCRIPT,  ///< Model with TorchScript format
-};
-
 FASTDEPLOY_DECL std::ostream& operator<<(std::ostream& out,
                                          const Backend& backend);
-FASTDEPLOY_DECL std::ostream& operator<<(std::ostream& out,
-                                         const ModelFormat& format);
 
 /*! Paddle Lite power mode for mobile device. */
 enum LitePowerMode {
-  LITE_POWER_HIGH = 0, ///< Use Lite Backend with high power mode
-  LITE_POWER_LOW = 1, ///< Use Lite Backend with low power mode
-  LITE_POWER_FULL = 2, ///< Use Lite Backend with full power mode
-  LITE_POWER_NO_BIND = 3, ///< Use Lite Backend with no bind power mode
-  LITE_POWER_RAND_HIGH = 4, ///< Use Lite Backend with rand high mode
-  LITE_POWER_RAND_LOW = 5 ///< Use Lite Backend with rand low power mode
+  LITE_POWER_HIGH = 0,       ///< Use Lite Backend with high power mode
+  LITE_POWER_LOW = 1,        ///< Use Lite Backend with low power mode
+  LITE_POWER_FULL = 2,       ///< Use Lite Backend with full power mode
+  LITE_POWER_NO_BIND = 3,    ///< Use Lite Backend with no bind power mode
+  LITE_POWER_RAND_HIGH = 4,  ///< Use Lite Backend with rand high mode
+  LITE_POWER_RAND_LOW = 5    ///< Use Lite Backend with rand low power mode
 };
 
 FASTDEPLOY_DECL std::string Str(const Backend& b);
@@ -105,8 +94,10 @@ struct FASTDEPLOY_DECL RuntimeOption {
   /// Use Nvidia GPU to inference
   void UseGpu(int gpu_id = 0);
 
-  void UseRKNPU2(fastdeploy::rknpu2::CpuName rknpu2_name = fastdeploy::rknpu2::CpuName::RK3588,
-                 fastdeploy::rknpu2::CoreMask rknpu2_core = fastdeploy::rknpu2::CoreMask::RKNN_NPU_CORE_0);
+  void UseRKNPU2(fastdeploy::rknpu2::CpuName rknpu2_name
+                     = fastdeploy::rknpu2::CpuName::RK3588,
+                 fastdeploy::rknpu2::CoreMask rknpu2_core
+                     = fastdeploy::rknpu2::CoreMask::RKNN_NPU_CORE_0);
 
   void SetExternalStream(void* external_stream);
 
@@ -242,6 +233,11 @@ struct FASTDEPLOY_DECL RuntimeOption {
    */
   void DisablePaddleTrtCollectShape();
 
+  /**
+   * @brief Set the number of streams for the OpenVINO backend
+   */
+  void SetOpenVINOStreams(int num_streams);
+
   /** \Use Graphcore IPU to inference.
    *
    * \param[in] device_num the number of IPUs.
@@ -331,13 +327,19 @@ struct FASTDEPLOY_DECL RuntimeOption {
   int unconst_ops_thres = -1;
   std::string poros_file = "";
 
+  // ======Only for OpenVINO Backend=======
+  int ov_num_streams = 1;
+
   // ======Only for RKNPU2 Backend=======
-  fastdeploy::rknpu2::CpuName rknpu2_cpu_name_ = fastdeploy::rknpu2::CpuName::RK3588;
-  fastdeploy::rknpu2::CoreMask rknpu2_core_mask_ = fastdeploy::rknpu2::CoreMask::RKNN_NPU_CORE_AUTO;
+  fastdeploy::rknpu2::CpuName rknpu2_cpu_name_
+      = fastdeploy::rknpu2::CpuName::RK3588;
+  fastdeploy::rknpu2::CoreMask rknpu2_core_mask_
+      = fastdeploy::rknpu2::CoreMask::RKNN_NPU_CORE_AUTO;
 
   std::string model_file = "";   // Path of model file
-  std::string params_file = "";  // Path of parameters file, can be empty
-  ModelFormat model_format = ModelFormat::AUTOREC;  // format of input model
+  std::string params_file = "";  // Path of parameters file, can be empty
+  // Format of the input model
+  ModelFormat model_format = ModelFormat::AUTOREC;
 };
 
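How the new ov_num_streams option is meant to be driven from user code; the values are illustrative and the SetModelPath arguments are placeholders:

    #include "fastdeploy/runtime.h"

    int main() {
      fastdeploy::RuntimeOption option;
      option.SetModelPath("model.pdmodel", "model.pdiparams");  // placeholder paths
      option.UseCpu();
      option.UseOpenVINOBackend();
      option.SetCpuThreadNum(4);     // threads shared by all instances
      option.SetOpenVINOStreams(2);  // usually one stream per model instance

      fastdeploy::Runtime runtime;
      return runtime.Init(option) ? 0 : -1;
    }
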
 /*! @brief Runtime object used to inference the loaded model on different devices
  */
 struct FASTDEPLOY_DECL Runtime {
@@ -384,6 +386,14 @@ struct FASTDEPLOY_DECL Runtime {
    */
   std::vector<TensorInfo> GetOutputInfos();
 
+  /** \brief Clone a new Runtime when multiple instances of the same model are needed
+   *
+   * \param[in] stream CUDA stream for the new Runtime, default is nullptr
+   * \return a pointer to the new (cloned) Runtime
+   */
+  Runtime* Clone(void* stream = nullptr,
+                 int device_id = -1);
+
   RuntimeOption option;
 
  private:
@@ -395,4 +405,4 @@ struct FASTDEPLOY_DECL Runtime {
   void CreateRKNPU2Backend();
   std::unique_ptr<BaseBackend> backend_;
 };
-} // namespace fastdeploy
+}  // namespace fastdeploy
diff --git a/serving/docs/zh_CN/model_configuration.md b/serving/docs/zh_CN/model_configuration.md
index 7a19aa8fa9..ce3abc0759 100644
--- a/serving/docs/zh_CN/model_configuration.md
+++ b/serving/docs/zh_CN/model_configuration.md
@@ -142,8 +142,10 @@ optimization {
   cpu_execution_accelerator : [
     {
       name : "openvino"
-      # 设置推理并行计算线程数为4
+      # Set the inference thread count to 4 (total across all instances)
       parameters { key: "cpu_threads" value: "4" }
+      # Set OpenVINO num_streams (usually equal to the instance count)
+      parameters { key: "num_streams" value: "1" }
     }
   ]
 }
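Putting the serving-side knobs together: a model's config.pbtxt can carry the OpenVINO thread/stream settings above plus the is_clone switch parsed in fastdeploy_runtime.cc below. The values and the exact nesting here are illustrative, not taken from a shipped config:

    optimization {
      execution_accelerators {
        cpu_execution_accelerator : [
          {
            name : "openvino"
            parameters { key: "cpu_threads" value: "4" }
            parameters { key: "num_streams" value: "2" }
          }
        ]
      }
    }
    parameters {
      key: "is_clone"
      value: { string_value: "true" }  # follow-up instances clone the first Runtime
    }
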
diff --git a/serving/src/fastdeploy_runtime.cc b/serving/src/fastdeploy_runtime.cc
index 32d0127e0b..2e839b5ac4 100644
--- a/serving/src/fastdeploy_runtime.cc
+++ b/serving/src/fastdeploy_runtime.cc
@@ -91,6 +91,9 @@ class ModelState : public BackendModel {
   // Runtime options used when creating a FastDeploy Runtime.
   std::unique_ptr<fastdeploy::RuntimeOption> runtime_options_;
+  bool model_load_;
+  fastdeploy::Runtime* main_runtime_;
+  bool is_clone_ = true;
 
   // model_outputs is a map that contains unique outputs that the model must
   // provide. In the model configuration, the output in the state configuration
@@ -165,7 +168,7 @@ TRITONSERVER_Error* ModelState::Create(TRITONBACKEND_Model* triton_model,
 }
 
 ModelState::ModelState(TRITONBACKEND_Model* triton_model)
-    : BackendModel(triton_model) {
+    : BackendModel(triton_model), model_load_(false), main_runtime_(nullptr), is_clone_(true) {
   // Create runtime options that will be cloned and used for each
   // instance when creating that instance's runtime.
   runtime_options_.reset(new fastdeploy::RuntimeOption());
@@ -218,19 +221,6 @@ ModelState::ModelState(TRITONBACKEND_Model* triton_model)
           THROW_IF_BACKEND_MODEL_ERROR(
               ParseIntValue(value_string, &cpu_thread_num));
           runtime_options_->SetCpuThreadNum(cpu_thread_num);
-        // } else if (param_key == "graph_level") {
-        //   THROW_IF_BACKEND_MODEL_ERROR(ParseIntValue(
-        //       value_string, &runtime_options_->ort_graph_opt_level));
-        // } else if (param_key == "inter_op_num_threads") {
-        //   THROW_IF_BACKEND_MODEL_ERROR(ParseIntValue(
-        //       value_string,
-        //       &runtime_options_->ort_inter_op_num_threads));
-        // } else if (param_key == "execution_mode") {
-        //   THROW_IF_BACKEND_MODEL_ERROR(ParseIntValue(
-        //       value_string, &runtime_options_->ort_execution_mode));
-        // } else if (param_key == "capacity") {
-        //   THROW_IF_BACKEND_MODEL_ERROR(ParseIntValue(
-        //       value_string, &runtime_options_->pd_mkldnn_cache_size));
         } else if (param_key == "use_mkldnn") {
           bool pd_enable_mkldnn;
           THROW_IF_BACKEND_MODEL_ERROR(
@@ -238,8 +228,16 @@ ModelState::ModelState(TRITONBACKEND_Model* triton_model)
           runtime_options_->SetPaddleMKLDNN(pd_enable_mkldnn);
         } else if (param_key == "use_paddle_log") {
           runtime_options_->EnablePaddleLogInfo();
+        } else if (param_key == "num_streams") {
+          int num_streams;
+          THROW_IF_BACKEND_MODEL_ERROR(
+              ParseIntValue(value_string, &num_streams));
+          runtime_options_->SetOpenVINOStreams(num_streams);
+        } else if (param_key == "is_clone") {
+          THROW_IF_BACKEND_MODEL_ERROR(
+              ParseBoolValue(value_string, &is_clone_));
         } else if (param_key == "use_ipu") {
-          runtime_options_->UseIpu();
+          // runtime_options_->UseIpu();
         }
       }
     }
@@ -290,17 +288,6 @@ ModelState::ModelState(TRITONBACKEND_Model* triton_model)
         std::string value_string;
         THROW_IF_BACKEND_MODEL_ERROR(
             params.MemberAsString(param_key.c_str(), &value_string));
-        // if (param_key == "graph_level") {
-        //   THROW_IF_BACKEND_MODEL_ERROR(ParseIntValue(
-        //       value_string, &runtime_options_->ort_graph_opt_level));
-        // } else if (param_key == "inter_op_num_threads") {
-        //   THROW_IF_BACKEND_MODEL_ERROR(ParseIntValue(
-        //       value_string,
-        //       &runtime_options_->ort_inter_op_num_threads));
-        // } else if (param_key == "execution_mode") {
-        //   THROW_IF_BACKEND_MODEL_ERROR(ParseIntValue(
-        //       value_string, &runtime_options_->ort_execution_mode));
-        // }
         if (param_key == "precision") {
           std::transform(value_string.begin(), value_string.end(),
                          value_string.begin(), ::tolower);
@@ -325,7 +312,10 @@ ModelState::ModelState(TRITONBACKEND_Model* triton_model)
           runtime_options_->EnablePaddleToTrt();
         } else if (param_key == "use_paddle_log") {
           runtime_options_->EnablePaddleLogInfo();
-        }
+        } else if (param_key == "is_clone") {
+          THROW_IF_BACKEND_MODEL_ERROR(
+              ParseBoolValue(value_string, &is_clone_));
+        }
       }
     }
   }
@@ -340,64 +330,79 @@ TRITONSERVER_Error* ModelState::LoadModel(
     const int32_t instance_group_device_id, std::string* model_path,
     std::string* params_path, fastdeploy::Runtime** runtime,
     cudaStream_t stream) {
-  auto dir_path = JoinPath({RepositoryPath(), std::to_string(Version())});
-  {
-    // ONNX Format
-    bool exists;
-    *model_path = JoinPath({dir_path, "model.onnx"});
-    RETURN_IF_ERROR(FileExists(*model_path, &exists));
+
+  // FastDeploy Runtime creation is not thread-safe, so multiple creations
+  // are serialized with a global lock.
+  // The Clone interface can be invoked only after main_runtime_ has been created.
+  static std::mutex global_context_mu;
+  std::lock_guard<std::mutex> glock(global_context_mu);
 
-  // Paddle Formax
-  if (not exists) {
-    *model_path = JoinPath({dir_path, "model.pdmodel"});
-    RETURN_IF_ERROR(FileExists(*model_path, &exists));
-    if (not exists) {
-      return TRITONSERVER_ErrorNew(
-          TRITONSERVER_ERROR_NOT_FOUND,
-          std::string(
-              "Model should be named as 'model.onnx' or 'model.pdmodel'")
-              .c_str());
-    }
-    *params_path = JoinPath({dir_path, "model.pdiparams"});
-    RETURN_IF_ERROR(FileExists(*params_path, &exists));
-    if (not exists) {
-      return TRITONSERVER_ErrorNew(
-          TRITONSERVER_ERROR_NOT_FOUND,
-          std::string("Paddle params should be named as 'model.pdiparams' or "
-                      "not provided.'")
-              .c_str());
-    }
-    runtime_options_->model_format = fastdeploy::ModelFormat::PADDLE;
-    runtime_options_->model_file = *model_path;
-    runtime_options_->params_file = *params_path;
-  } else {
-    runtime_options_->model_format = fastdeploy::ModelFormat::ONNX;
-    runtime_options_->model_file = *model_path;
+  if (model_load_ && is_clone_) {
+    if (main_runtime_ == nullptr) {
+      return TRITONSERVER_ErrorNew(
+          TRITONSERVER_ERROR_NOT_FOUND,
+          std::string("main_runtime is nullptr").c_str());
     }
-  }
-
-  // GPU
-#ifdef TRITON_ENABLE_GPU
-  if ((instance_group_kind == TRITONSERVER_INSTANCEGROUPKIND_GPU) ||
-      (instance_group_kind == TRITONSERVER_INSTANCEGROUPKIND_AUTO)) {
-    runtime_options_->UseGpu(instance_group_device_id);
-    runtime_options_->SetExternalStream((void*)stream);
+    *runtime = main_runtime_->Clone((void*)stream, instance_group_device_id);
+  } else {
-    runtime_options_->UseCpu();
-  }
-#else
-  if (runtime_options_->device != fastdeploy::Device::IPU) {
-    // If Device is set to IPU, just skip CPU setting.
-    runtime_options_->UseCpu();
-  }
-#endif  // TRITON_ENABLE_GPU
+    auto dir_path = JoinPath({RepositoryPath(), std::to_string(Version())});
+    {
+      // ONNX Format
+      bool exists;
+      *model_path = JoinPath({dir_path, "model.onnx"});
+      RETURN_IF_ERROR(FileExists(*model_path, &exists));
 
-  *runtime = new fastdeploy::Runtime();
-  if (!(*runtime)->Init(*runtime_options_)) {
-    return TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_NOT_FOUND,
-                                 std::string("Runtime init error").c_str());
-  }
+      // Paddle Format
+      if (not exists) {
+        *model_path = JoinPath({dir_path, "model.pdmodel"});
+        RETURN_IF_ERROR(FileExists(*model_path, &exists));
+        if (not exists) {
+          return TRITONSERVER_ErrorNew(
+              TRITONSERVER_ERROR_NOT_FOUND,
+              std::string(
+                  "Model should be named as 'model.onnx' or 'model.pdmodel'")
+                  .c_str());
+        }
+        *params_path = JoinPath({dir_path, "model.pdiparams"});
+        RETURN_IF_ERROR(FileExists(*params_path, &exists));
+        if (not exists) {
+          return TRITONSERVER_ErrorNew(
+              TRITONSERVER_ERROR_NOT_FOUND,
+              std::string("Paddle params should be named as 'model.pdiparams' "
+                          "or not provided.")
+                  .c_str());
+        }
+        runtime_options_->model_format = fastdeploy::ModelFormat::PADDLE;
+        runtime_options_->model_file = *model_path;
+        runtime_options_->params_file = *params_path;
+      } else {
+        runtime_options_->model_format = fastdeploy::ModelFormat::ONNX;
+        runtime_options_->model_file = *model_path;
+      }
+    }
+    // GPU
+  #ifdef TRITON_ENABLE_GPU
+    if ((instance_group_kind == TRITONSERVER_INSTANCEGROUPKIND_GPU) ||
+        (instance_group_kind == TRITONSERVER_INSTANCEGROUPKIND_AUTO)) {
+      runtime_options_->UseGpu(instance_group_device_id);
+      runtime_options_->SetExternalStream((void*)stream);
+    } else if (runtime_options_->device != fastdeploy::Device::IPU) {
+      runtime_options_->UseCpu();
+    }
+  #else
+    if (runtime_options_->device != fastdeploy::Device::IPU) {
+      // If Device is set to IPU, just skip CPU setting.
+      runtime_options_->UseCpu();
+    }
+  #endif  // TRITON_ENABLE_GPU
+
+    *runtime = main_runtime_ = new fastdeploy::Runtime();
+    if (!(*runtime)->Init(*runtime_options_)) {
+      return TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_NOT_FOUND,
+                                   std::string("Runtime init error").c_str());
+    }
+    model_load_ = true;
+  }
   return nullptr;  // success
 }

From e453902809b006af0dd14902c1c122bf55789288 Mon Sep 17 00:00:00 2001
From: DefTruth <31974251+DefTruth@users.noreply.github.com>
Date: Fri, 4 Nov 2022 18:36:55 +0800
Subject: [PATCH 18/18] [Other] Add static create methods to Mat (#497)

* [Other] Add static create methods to Mat

* [Other] avoid field name conflicts
---
 fastdeploy/vision/common/processors/base.cc   | 108 +-------------
 fastdeploy/vision/common/processors/base.h    |   9 +-
 fastdeploy/vision/common/processors/mat.cc    |  73 +++++++++
 fastdeploy/vision/common/processors/mat.h     |  19 ++-
 .../vision/common/processors/proc_lib.cc      |  23 +++
 .../vision/common/processors/proc_lib.h       |  32 ++++
 fastdeploy/vision/common/processors/utils.cc  | 139 +++++++++++++++---
 fastdeploy/vision/common/processors/utils.h   |  10 +-
 fastdeploy/vision/matting/contrib/modnet.cc   |   8 +-
 .../vision/matting/ppmatting/ppmatting.cc     |   2 +-
 fastdeploy/vision/segmentation/ppseg/model.cc |   2 +-
 11 files changed, 283 insertions(+), 142 deletions(-)
 create mode 100644 fastdeploy/vision/common/processors/proc_lib.cc
 create mode 100644 fastdeploy/vision/common/processors/proc_lib.h

diff --git a/fastdeploy/vision/common/processors/base.cc b/fastdeploy/vision/common/processors/base.cc
index f7831ae638..9410f6b612 100644
--- a/fastdeploy/vision/common/processors/base.cc
+++ b/fastdeploy/vision/common/processors/base.cc
@@ -13,18 +13,17 @@
 // limitations under the License.
 
 #include "fastdeploy/vision/common/processors/base.h"
+#include "fastdeploy/vision/common/processors/proc_lib.h"
 #include "fastdeploy/utils/utils.h"
 
 namespace fastdeploy {
 namespace vision {
 
-ProcLib Processor::default_lib = ProcLib::DEFAULT;
-
 bool Processor::operator()(Mat* mat, ProcLib lib) {
   ProcLib target = lib;
   if (lib == ProcLib::DEFAULT) {
-    target = default_lib;
+    target = DefaultProcLib::default_lib;
   }
   if (target == ProcLib::FLYCV) {
 #ifdef ENABLE_FLYCV
@@ -39,9 +38,9 @@ bool Processor::operator()(Mat* mat, ProcLib lib) {
 
 void EnableFlyCV() {
 #ifdef ENABLE_FLYCV
-  Processor::default_lib = ProcLib::FLYCV;
+  DefaultProcLib::default_lib = ProcLib::FLYCV;
   FDINFO << "Will change to use image processing library "
-         << Processor::default_lib << std::endl;
+         << DefaultProcLib::default_lib << std::endl;
 #else
   FDWARNING << "FastDeploy didn't compile with FlyCV, "
                "will fallback to use OpenCV instead."
@@ -50,104 +49,9 @@ void EnableFlyCV() {
 }
 
 void DisableFlyCV() {
-  Processor::default_lib = ProcLib::OPENCV;
+  DefaultProcLib::default_lib = ProcLib::OPENCV;
   FDINFO << "Will change to use image processing library "
-         << Processor::default_lib << std::endl;
-}
-
-cv::Mat CreateOpenCVMatFromTensor(const FDTensor& tensor) {
-  FDDataType type = tensor.dtype;
-  FDASSERT(tensor.shape.size() == 3,
-           "When create FD Mat from tensor, tensor shape should be 3-Dim, HWC "
-           "layout");
-  int64_t height = tensor.shape[0];
-  int64_t width = tensor.shape[1];
-  int64_t channel = tensor.shape[2];
-  cv::Mat ocv_mat;
-  // reference to outside FDTensor, zero copy
-  switch (type) {
-    case FDDataType::UINT8:
-      ocv_mat = cv::Mat(height, width, CV_8UC(channel),
-                        const_cast<void*>(tensor.Data()));
-      break;
-    case FDDataType::INT8:
-      ocv_mat = cv::Mat(height, width, CV_8SC(channel),
-                        const_cast<void*>(tensor.Data()));
-      break;
-    case FDDataType::INT16:
-      ocv_mat = cv::Mat(height, width, CV_16SC(channel),
-                        const_cast<void*>(tensor.Data()));
-      break;
-    case FDDataType::INT32:
-      ocv_mat = cv::Mat(height, width, CV_32SC(channel),
-                        const_cast<void*>(tensor.Data()));
-      break;
-    case FDDataType::FP32:
-      ocv_mat = cv::Mat(height, width, CV_32FC(channel),
-                        const_cast<void*>(tensor.Data()));
-      break;
-    case FDDataType::FP64:
-      ocv_mat = cv::Mat(height, width, CV_64FC(channel),
-                        const_cast<void*>(tensor.Data()));
-      break;
-    default:
-      FDASSERT(false,
-               "Tensor type %d is not supported While calling "
-               "CreateFDMatFromTensor.",
-               type);
-      break;
-  }
-  return ocv_mat;
-}
-
-#ifdef ENABLE_FLYCV
-fcv::Mat CreateFlyCVMatFromTensor(const FDTensor& tensor) {
-  FDDataType type = tensor.dtype;
-  FDASSERT(tensor.shape.size() == 3,
-           "When create FD Mat from tensor, tensor shape should be 3-Dim, HWC "
-           "layout");
-  int64_t height = tensor.shape[0];
-  int64_t width = tensor.shape[1];
-  int64_t channel = tensor.shape[2];
-  fcv::Mat fcv_mat;
-  auto fcv_type = CreateFlyCVDataType(type, static_cast<int>(channel));
-  switch (type) {
-    case FDDataType::UINT8:
-      fcv_mat =
-          fcv::Mat(width, height, fcv_type, const_cast<void*>(tensor.Data()));
-      break;
-    case FDDataType::FP32:
-      fcv_mat =
-          fcv::Mat(width, height, fcv_type, const_cast<void*>(tensor.Data()));
-      break;
-    case FDDataType::FP64:
-      fcv_mat =
-          fcv::Mat(width, height, fcv_type, const_cast<void*>(tensor.Data()));
-      break;
-    default:
-      FDASSERT(false,
-               "Tensor type %d is not supported While calling "
-               "CreateFDMatFromTensor.",
-               type);
-      break;
-  }
-  return fcv_mat;
-}
-#endif
-
-Mat CreateFDMatFromTensor(const FDTensor& tensor) {
-  if (Processor::default_lib == ProcLib::FLYCV) {
-#ifdef ENABLE_FLYCV
-    fcv::Mat fcv_mat = CreateFlyCVMatFromTensor(tensor);
-    Mat mat = Mat(fcv_mat);
-    return mat;
-#else
-    FDASSERT(false, "FastDeploy didn't compiled with FlyCV!");
-#endif
-  }
-  cv::Mat ocv_mat = CreateOpenCVMatFromTensor(tensor);
-  Mat mat = Mat(ocv_mat);
-  return mat;
+         << DefaultProcLib::default_lib << std::endl;
 }
 
 } // namespace vision
diff --git a/fastdeploy/vision/common/processors/base.h b/fastdeploy/vision/common/processors/base.h
index bfd1e00856..bb414669af 100644
--- a/fastdeploy/vision/common/processors/base.h
+++ b/fastdeploy/vision/common/processors/base.h
@@ -37,7 +37,7 @@ class FASTDEPLOY_DECL Processor {
   // all the function in `processor` will force to use
   // default_lib if this flag is set.
   // DEFAULT means this flag is not set
-  static ProcLib default_lib;
+  // static ProcLib default_lib;
 
   virtual std::string Name() = 0;
 
@@ -50,12 +50,5 @@ class FASTDEPLOY_DECL Processor {
   virtual bool operator()(Mat* mat, ProcLib lib = ProcLib::DEFAULT);
 };
 
-// Create OpenCV/FlyCV/FD Mat from FD Tensor
-cv::Mat CreateOpenCVMatFromTensor(const FDTensor& tensor);
-#ifdef ENABLE_FLYCV
-fcv::Mat CreateFlyCVMatFromTensor(const FDTensor& tensor);
-#endif
-Mat CreateFDMatFromTensor(const FDTensor& tensor);
-
 } // namespace vision
 } // namespace fastdeploy
diff --git a/fastdeploy/vision/common/processors/mat.cc b/fastdeploy/vision/common/processors/mat.cc
index db380ba740..ba0eddcb1f 100644
--- a/fastdeploy/vision/common/processors/mat.cc
+++ b/fastdeploy/vision/common/processors/mat.cc
@@ -118,5 +118,78 @@ std::ostream& operator<<(std::ostream& out, const ProcLib& p) {
   return out;
 }
 
+Mat Mat::Create(const FDTensor& tensor) {
+  if (DefaultProcLib::default_lib == ProcLib::FLYCV) {
+#ifdef ENABLE_FLYCV
+    fcv::Mat tmp_fcv_mat = CreateZeroCopyFlyCVMatFromTensor(tensor);
+    Mat mat = Mat(tmp_fcv_mat);
+    return mat;
+#else
+    FDASSERT(false, "FastDeploy wasn't compiled with FlyCV!");
+#endif
+  }
+  cv::Mat tmp_ocv_mat = CreateZeroCopyOpenCVMatFromTensor(tensor);
+  Mat mat = Mat(tmp_ocv_mat);
+  return mat;
+}
+
+Mat Mat::Create(const FDTensor& tensor, ProcLib lib) {
+  if (lib == ProcLib::DEFAULT) {
+    return Create(tensor);
+  }
+  if (lib == ProcLib::FLYCV) {
+#ifdef ENABLE_FLYCV
+    fcv::Mat tmp_fcv_mat = CreateZeroCopyFlyCVMatFromTensor(tensor);
+    Mat mat = Mat(tmp_fcv_mat);
+    return mat;
+#else
+    FDASSERT(false, "FastDeploy wasn't compiled with FlyCV!");
+#endif
+  }
+  cv::Mat tmp_ocv_mat = CreateZeroCopyOpenCVMatFromTensor(tensor);
+  Mat mat = Mat(tmp_ocv_mat);
+  return mat;
+}
+
+Mat Mat::Create(int height, int width, int channels,
+                FDDataType type, void* data) {
+  if (DefaultProcLib::default_lib == ProcLib::FLYCV) {
+#ifdef ENABLE_FLYCV
+    fcv::Mat tmp_fcv_mat = CreateZeroCopyFlyCVMatFromBuffer(
+        height, width, channels, type, data);
+    Mat mat = Mat(tmp_fcv_mat);
+    return mat;
+#else
+    FDASSERT(false, "FastDeploy wasn't compiled with FlyCV!");
+#endif
+  }
+  cv::Mat tmp_ocv_mat = CreateZeroCopyOpenCVMatFromBuffer(
+      height, width, channels, type, data);
+  Mat mat = Mat(tmp_ocv_mat);
+  return mat;
+}
+
+Mat Mat::Create(int height, int width, int channels,
+                FDDataType type, void* data,
+                ProcLib lib) {
+  if (lib == ProcLib::DEFAULT) {
+    return Create(height, width, channels, type, data);
+  }
+  if (lib == ProcLib::FLYCV) {
+#ifdef ENABLE_FLYCV
+    fcv::Mat tmp_fcv_mat = CreateZeroCopyFlyCVMatFromBuffer(
+        height, width, channels, type, data);
+    Mat mat = Mat(tmp_fcv_mat);
+    return mat;
+#else
+    FDASSERT(false, "FastDeploy wasn't compiled with FlyCV!");
+#endif
+  }
+  cv::Mat tmp_ocv_mat = CreateZeroCopyOpenCVMatFromBuffer(
+      height, width, channels, type, data);
+  Mat mat = Mat(tmp_ocv_mat);
+  return mat;
+}
+
 } // namespace vision
 } // namespace fastdeploy
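The four factories centralize the zero-copy construction that callers previously hand-rolled (see the modnet.cc and ppmatting.cc hunks further down). A sketch of both overload families; the function and variable names here are illustrative:

    #include "fastdeploy/vision/common/processors/mat.h"

    void WrapOutputs(const fastdeploy::FDTensor& hwc_tensor, float* alpha_ptr,
                     int out_h, int out_w) {
      namespace vis = fastdeploy::vision;
      // Zero-copy wrap of a 3-D HWC tensor; the tensor must outlive the Mat.
      vis::Mat from_tensor = vis::Mat::Create(hwc_tensor);

      // Zero-copy wrap of a raw buffer, pinning the OpenCV backend explicitly.
      vis::Mat from_buffer = vis::Mat::Create(out_h, out_w, /*channels=*/1,
                                              fastdeploy::FDDataType::FP32,
                                              alpha_ptr, vis::ProcLib::OPENCV);
      (void)from_tensor;
      (void)from_buffer;
    }
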
diff --git a/fastdeploy/vision/common/processors/mat.h b/fastdeploy/vision/common/processors/mat.h
index 995d497ccf..cfec2fbd5e 100644
--- a/fastdeploy/vision/common/processors/mat.h
+++ b/fastdeploy/vision/common/processors/mat.h
@@ -14,12 +14,12 @@
 #pragma once
 #include "fastdeploy/core/fd_tensor.h"
 #include "fastdeploy/vision/common/processors/utils.h"
+#include "fastdeploy/vision/common/processors/proc_lib.h"
 #include "opencv2/core/core.hpp"
 
 namespace fastdeploy {
 namespace vision {
 
-enum class FASTDEPLOY_DECL ProcLib { DEFAULT, OPENCV, FLYCV };
 enum Layout { HWC, CHW };
 
 FASTDEPLOY_DECL std::ostream& operator<<(std::ostream& out, const ProcLib& p);
@@ -45,6 +45,9 @@ struct FASTDEPLOY_DECL Mat {
   }
 #endif
 
+  Mat(const Mat& mat) = default;
+  Mat& operator=(const Mat& mat) = default;
+
   // Careful if you use this interface
   // this only used if you don't want to write
   // the original data, and write to a new cv::Mat
@@ -129,6 +132,20 @@ struct FASTDEPLOY_DECL Mat {
 
   ProcLib mat_type = ProcLib::OPENCV;
   Layout layout = Layout::HWC;
+
+  // Create a FD Mat from a FD Tensor. These methods create a new FD Mat
+  // with zero copy: its data pointer references the original memory
+  // buffer of the input FD Tensor. Be careful: any operation on this Mat
+  // may modify the memory the FDTensor points to; that memory is assumed
+  // to be mutable. Without an explicit ProcLib, the Mat is created
+  // according to the current global default ProcLib (OPENCV, FLYCV, ...).
+  static Mat Create(const FDTensor& tensor);
+  static Mat Create(const FDTensor& tensor, ProcLib lib);
+  static Mat Create(int height, int width, int channels,
+                    FDDataType type, void* data);
+  static Mat Create(int height, int width, int channels,
+                    FDDataType type, void* data, ProcLib lib);
 };
 
 } // namespace vision
diff --git a/fastdeploy/vision/common/processors/proc_lib.cc b/fastdeploy/vision/common/processors/proc_lib.cc
new file mode 100644
index 0000000000..e5009d9a63
--- /dev/null
+++ b/fastdeploy/vision/common/processors/proc_lib.cc
@@ -0,0 +1,23 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "fastdeploy/vision/common/processors/proc_lib.h"
+
+namespace fastdeploy {
+namespace vision {
+
+ProcLib DefaultProcLib::default_lib = ProcLib::DEFAULT;
+
+} // namespace vision
+} // namespace fastdeploy
diff --git a/fastdeploy/vision/common/processors/proc_lib.h b/fastdeploy/vision/common/processors/proc_lib.h
new file mode 100644
index 0000000000..6db6b5177e
--- /dev/null
+++ b/fastdeploy/vision/common/processors/proc_lib.h
@@ -0,0 +1,32 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "fastdeploy/utils/utils.h"
+
+namespace fastdeploy {
+namespace vision {
+
+enum class FASTDEPLOY_DECL ProcLib { DEFAULT, OPENCV, FLYCV };
+
+struct FASTDEPLOY_DECL DefaultProcLib {
+  // default_lib has the highest priority:
+  // all processors are forced to use default_lib when it is set;
+  // DEFAULT means the flag has not been set.
+  static ProcLib default_lib;
+};
+
+} // namespace vision
+} // namespace fastdeploy
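With the flag in its own translation unit, switching the processing library is a single global toggle. A sketch; note FlyCV is a compile-time option, so the call degrades to a warning when ENABLE_FLYCV is off:

    #include "fastdeploy/vision/common/processors/base.h"

    int main() {
      // Route all processors through FlyCV when it was compiled in;
      // otherwise FastDeploy logs a warning and stays on OpenCV.
      fastdeploy::vision::EnableFlyCV();

      // ... build models / run preprocessing here ...

      fastdeploy::vision::DisableFlyCV();  // back to OpenCV
      return 0;
    }
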
diff --git a/fastdeploy/vision/common/processors/utils.cc b/fastdeploy/vision/common/processors/utils.cc
index ec3eba4727..e38a1687c8 100644
--- a/fastdeploy/vision/common/processors/utils.cc
+++ b/fastdeploy/vision/common/processors/utils.cc
@@ -45,6 +45,30 @@ FDDataType OpenCVDataTypeToFD(int type) {
   }
 }
 
+int CreateOpenCVDataType(FDDataType type, int channel) {
+  FDASSERT(channel == 1 || channel == 3 || channel == 4,
+           "Only support channel be 1/3/4 in OpenCV.");
+  if (type == FDDataType::UINT8) {
+    if (channel == 1) {
+      return CV_8UC1;
+    } else if (channel == 3) {
+      return CV_8UC3;
+    } else {
+      return CV_8UC4;
+    }
+  } else if (type == FDDataType::FP32) {
+    if (channel == 1) {
+      return CV_32FC1;
+    } else if (channel == 3) {
+      return CV_32FC3;
+    } else {
+      return CV_32FC4;
+    }
+  }
+  FDASSERT(false, "Data type of %s is not supported.", Str(type).c_str());
+  return CV_32FC3;
+}
+
 #ifdef ENABLE_FLYCV
 FDDataType FlyCVDataTypeToFD(fcv::FCVImageType type) {
   if (type == fcv::FCVImageType::GRAY_U8) {
@@ -137,30 +161,6 @@ fcv::FCVImageType CreateFlyCVDataType(FDDataType type, int channel) {
   return fcv::FCVImageType::PACKAGE_BGR_F32;
 }
 
-int CreateOpenCVDataType(FDDataType type, int channel) {
-  FDASSERT(channel == 1 || channel == 3 || channel == 4,
-           "Only support channel be 1/3/4 in OpenCV.");
-  if (type == FDDataType::UINT8) {
-    if (channel == 1) {
-      return CV_8UC1;
-    } else if (channel == 3) {
-      return CV_8UC3;
-    } else {
-      return CV_8UC4;
-    }
-  } else if (type == FDDataType::FP32) {
-    if (channel == 1) {
-      return CV_32FC1;
-    } else if (channel == 3) {
-      return CV_32FC3;
-    } else {
-      return CV_32FC4;
-    }
-  }
-  FDASSERT(false, "Data type of %s is not supported.", Str(type).c_str());
-  return CV_32FC3;
-}
-
 fcv::Mat ConvertOpenCVMatToFlyCV(cv::Mat& im) {
   int type = im.type() % 8;
   // 0: uint8; 5: float32; 6: float64
@@ -186,5 +186,96 @@ cv::Mat ConvertFlyCVMatToOpenCV(fcv::Mat& fim) {
 }
 #endif
 
+cv::Mat CreateZeroCopyOpenCVMatFromBuffer(
+    int height, int width, int channels,
+    FDDataType type, void* data) {
+  cv::Mat ocv_mat;
+  switch (type) {
+    case FDDataType::UINT8:
+      ocv_mat = cv::Mat(height, width, CV_8UC(channels), data);
+      break;
+    case FDDataType::INT8:
+      ocv_mat = cv::Mat(height, width, CV_8SC(channels), data);
+      break;
+    case FDDataType::INT16:
+      ocv_mat = cv::Mat(height, width, CV_16SC(channels), data);
+      break;
+    case FDDataType::INT32:
+      ocv_mat = cv::Mat(height, width, CV_32SC(channels), data);
+      break;
+    case FDDataType::FP32:
+      ocv_mat = cv::Mat(height, width, CV_32FC(channels), data);
+      break;
+    case FDDataType::FP64:
+      ocv_mat = cv::Mat(height, width, CV_64FC(channels), data);
+      break;
+    default:
+      FDASSERT(false,
+               "Tensor type %d is not supported While calling "
+               "CreateZeroCopyOpenCVMat.",
+               type);
+      break;
+  }
+  return ocv_mat;
+}
+
+cv::Mat CreateZeroCopyOpenCVMatFromTensor(const FDTensor& tensor) {
+  // TODO(qiuyanjun): Should add a Layout check. For now, we
+  // assume that the input tensor is already in Layout::HWC.
+  FDASSERT(tensor.shape.size() == 3,
+           "When creating an OpenCV Mat from a tensor, the tensor shape "
+           "should be 3-Dim, HWC layout");
+  FDDataType type = tensor.dtype;
+  int height = static_cast<int>(tensor.shape[0]);
+  int width = static_cast<int>(tensor.shape[1]);
+  int channels = static_cast<int>(tensor.shape[2]);
+  return CreateZeroCopyOpenCVMatFromBuffer(
+      height, width, channels, type,
+      const_cast<void*>(tensor.Data()));
+}
+
+#ifdef ENABLE_FLYCV
+fcv::Mat CreateZeroCopyFlyCVMatFromBuffer(
+    int height, int width, int channels,
+    FDDataType type, void* data) {
+  fcv::Mat fcv_mat;
+  auto fcv_type = CreateFlyCVDataType(type, channels);
+  switch (type) {
+    case FDDataType::UINT8:
+      fcv_mat = fcv::Mat(width, height, fcv_type, data);
+      break;
+    case FDDataType::FP32:
+      fcv_mat = fcv::Mat(width, height, fcv_type, data);
+      break;
+    case FDDataType::FP64:
+      fcv_mat = fcv::Mat(width, height, fcv_type, data);
+      break;
+    default:
+      FDASSERT(false,
+               "Tensor type %d is not supported While calling "
+               "CreateZeroCopyFlyCVMat.",
+               type);
+      break;
+  }
+  return fcv_mat;
+}
+
+fcv::Mat CreateZeroCopyFlyCVMatFromTensor(const FDTensor& tensor) {
+  // TODO(qiuyanjun): Should add a Layout check. For now, we
+  // assume that the input tensor is already in Layout::HWC.
+  FDASSERT(tensor.shape.size() == 3,
+           "When creating a FlyCV Mat from a tensor, the tensor shape "
+           "should be 3-Dim, HWC layout");
+  FDDataType type = tensor.dtype;
+  int height = static_cast<int>(tensor.shape[0]);
+  int width = static_cast<int>(tensor.shape[1]);
+  int channels = static_cast<int>(tensor.shape[2]);
+  return CreateZeroCopyFlyCVMatFromBuffer(
+      height, width, channels, type,
+      const_cast<void*>(tensor.Data()));
+}
+#endif
+
 } // namespace vision
 } // namespace fastdeploy
diff --git a/fastdeploy/vision/common/processors/utils.h b/fastdeploy/vision/common/processors/utils.h
index 3b3cfc40dc..50074c3a0c 100644
--- a/fastdeploy/vision/common/processors/utils.h
+++ b/fastdeploy/vision/common/processors/utils.h
@@ -29,7 +29,6 @@ namespace vision {
 FDDataType OpenCVDataTypeToFD(int type);
 // Create data type of opencv by FDDataType
 int CreateOpenCVDataType(FDDataType type, int channel = 1);
-
 #ifdef ENABLE_FLYCV
 // Convert data type of flycv to FDDataType
 FDDataType FlyCVDataTypeToFD(fcv::FCVImageType type);
@@ -41,5 +40,14 @@ fcv::Mat ConvertOpenCVMatToFlyCV(cv::Mat& im);
 cv::Mat ConvertFlyCVMatToOpenCV(fcv::Mat& fim);
 #endif
 
+// Create zero copy OpenCV/FlyCV Mat from FD Tensor / Buffer
+cv::Mat CreateZeroCopyOpenCVMatFromBuffer(int height, int width,
+                                          int channels, FDDataType type,
+                                          void* data);
+cv::Mat CreateZeroCopyOpenCVMatFromTensor(const FDTensor& tensor);
+#ifdef ENABLE_FLYCV
+fcv::Mat CreateZeroCopyFlyCVMatFromBuffer(int height, int width,
+                                          int channels, FDDataType type,
+                                          void* data);
+fcv::Mat CreateZeroCopyFlyCVMatFromTensor(const FDTensor& tensor);
+#endif
 } // namespace vision
 } // namespace fastdeploy
diff --git a/fastdeploy/vision/matting/contrib/modnet.cc b/fastdeploy/vision/matting/contrib/modnet.cc
index b0bc59c66f..c3a89733dd 100644
--- a/fastdeploy/vision/matting/contrib/modnet.cc
+++ b/fastdeploy/vision/matting/contrib/modnet.cc
@@ -104,11 +104,11 @@ bool MODNet::Postprocess(
   int ipt_w = iter_ipt->second[1];
 
   float* alpha_ptr = static_cast<float*>(alpha_tensor.Data());
-  cv::Mat alpha_zero_copy_ref(out_h, out_w, CV_32FC1, alpha_ptr);
-  Mat alpha_resized(alpha_zero_copy_ref);  // ref-only, zero copy.
+  // cv::Mat alpha_zero_copy_ref(out_h, out_w, CV_32FC1, alpha_ptr);
+  // Mat alpha_resized(alpha_zero_copy_ref);  // ref-only, zero copy.
+  Mat alpha_resized = Mat::Create(out_h, out_w, 1, FDDataType::FP32,
+                                  alpha_ptr);  // ref-only, zero copy.
   if ((out_h != ipt_h) || (out_w != ipt_w)) {
-    // already allocated a new continuous memory after resize.
-    // cv::resize(alpha_resized, alpha_resized, cv::Size(ipt_w, ipt_h));
     Resize::Run(&alpha_resized, ipt_w, ipt_h, -1, -1);
   }
diff --git a/fastdeploy/vision/matting/ppmatting/ppmatting.cc b/fastdeploy/vision/matting/ppmatting/ppmatting.cc
index cde2183075..cf999cd0ed 100644
--- a/fastdeploy/vision/matting/ppmatting/ppmatting.cc
+++ b/fastdeploy/vision/matting/ppmatting/ppmatting.cc
@@ -172,7 +172,7 @@ bool PPMatting::Postprocess(
   std::vector<int64_t> dim{0, 2, 3, 1};
   Transpose(alpha_tensor, &alpha_tensor, dim);
   alpha_tensor.Squeeze(0);
-  Mat mat = CreateFDMatFromTensor(alpha_tensor);
+  Mat mat = Mat::Create(alpha_tensor);
   auto iter_ipt = im_info.find("input_shape");
   auto iter_out = im_info.find("output_shape");
diff --git a/fastdeploy/vision/segmentation/ppseg/model.cc b/fastdeploy/vision/segmentation/ppseg/model.cc
index 58819ef15c..fccdf3688d 100644
--- a/fastdeploy/vision/segmentation/ppseg/model.cc
+++ b/fastdeploy/vision/segmentation/ppseg/model.cc
@@ -263,7 +263,7 @@ bool PaddleSegModel::Postprocess(
         infer_result->shape, FDDataType::FP32,
         static_cast<void*>(fp32_result_buffer->data()));
   }
-  mat = new Mat(CreateFDMatFromTensor(*infer_result));
+  mat = new Mat(Mat::Create(*infer_result));
   Resize::Run(mat, ipt_w, ipt_h, -1.0f, -1.0f, 1);
   mat->ShareWithTensor(&new_infer_result);
   result->shape = new_infer_result.shape;
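Taken together, the last three hunks converge on one postprocessing pattern: wrap the output tensor zero-copy, resize, then share the buffer back out as a tensor. A condensed sketch; the function name is illustrative and the Resize header path is assumed:

    #include "fastdeploy/vision/common/processors/mat.h"
    #include "fastdeploy/vision/common/processors/resize.h"  // assumed path

    bool PostprocessSketch(fastdeploy::FDTensor& infer_result,
                           int ipt_w, int ipt_h, fastdeploy::FDTensor* out) {
      namespace vis = fastdeploy::vision;
      vis::Mat mat = vis::Mat::Create(infer_result);  // zero-copy HWC view
      vis::Resize::Run(&mat, ipt_w, ipt_h);           // may reallocate internally
      mat.ShareWithTensor(out);                       // expose the result buffer
      return true;
    }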