diff --git a/.github/workflows/xpu_ci.yml b/.github/workflows/xpu_ci.yml new file mode 100644 index 0000000000..cac51fafe9 --- /dev/null +++ b/.github/workflows/xpu_ci.yml @@ -0,0 +1,75 @@ +name: CI_XPU + +on: + pull_request: + branches: + - develop + paths-ignore: + - '**.md' + - '**.txt' + workflow_dispatch: + +concurrency: + group: ${{ github.event.pull_request.number }}-xpu-ci + cancel-in-progress: true + +jobs: + CI_XPU: + timeout-minutes: 60 + runs-on: [self-hosted, XPU-P800-2Card] + steps: + - name: Print current runner name + run: | + echo "Current runner name: ${{ runner.name }}" + # Because the system version is lower than 2.23, the checkout cannot be used. + # - name: Checkout code + # uses: actions/checkout@v4 + + - name: Code Checkout + env: + docker_image: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-xpu:2.3.0 + run: | + REPO="https://github.com/${{ github.repository }}.git" + FULL_REPO="${{ github.repository }}" + REPO_NAME="${FULL_REPO##*/}" + BASE_BRANCH="${{ github.base_ref }}" + # Clean the repository directory before starting + docker run --rm --net=host -v $(pwd):/workspace -w /workspace \ + -e "REPO_NAME=${REPO_NAME}" \ + -e "BASE_BRANCH=${BASE_BRANCH}" \ + ${docker_image} /bin/bash -c ' + if [ -d ${REPO_NAME} ]; then + echo "Directory ${REPO_NAME} exists, removing it..." 
+ rm -rf ${REPO_NAME} + fi + ' + git config --global user.name "PaddleCI" + git config --global user.email "paddle_ci@example.com" + git clone ${REPO} ${REPO_NAME} -b ${BASE_BRANCH} + cd PaddleX + if [ "${{ github.event_name }}" = "pull_request" ]; then + git fetch origin pull/${{ github.event.pull_request.number }}/head:pr/${{ github.event.pull_request.number }} + git merge pr/${{ github.event.pull_request.number }} + git log -n 3 --oneline + else + git checkout ${{ github.sha }} + git log -n 3 --oneline + fi + + - name: Run CI unittest + env: + docker_image: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-xpu:2.3.0 + run: | + runner_name="${{ runner.name }}" + PARENT_DIR=$(dirname "$WORKSPACE") + echo "PARENT_DIR:$PARENT_DIR" + docker run --rm --net=host --cap-add=SYS_PTRACE --privileged --shm-size=64G \ + -v $(pwd):/workspace -w /workspace \ + -e "http_proxy=$(git config --global --get http.proxy)" \ + -e "https_proxy=$(git config --global --get https.proxy)" \ + -e "no_proxy=bcebos.com,mirrors.tuna.tsinghua.edu.cn,127.0.0.1,localhost" \ + ${docker_image} /bin/bash -c " + git config --global --add safe.directory /workspace/PaddleX + cd PaddleX + bash tests/run_xpu_ci.sh + " diff --git a/.precommit/check_imports.py b/.precommit/check_imports.py index 905f430532..7f9ec4a349 100644 --- a/.precommit/check_imports.py +++ b/.precommit/check_imports.py @@ -49,6 +49,7 @@ "GPUtil": "GPUtil", "huggingface_hub": "huggingface-hub", "imagesize": "imagesize", + "jieba": "jieba", "jinja2": "Jinja2", "joblib": "joblib", "langchain": "langchain", @@ -60,6 +61,7 @@ "modelscope": "modelscope", "numpy": "numpy", "openai": "openai", + "opencc": "OpenCC", "cv2": "opencv-contrib-python", "openpyxl": "openpyxl", "packaging": "packaging", @@ -73,11 +75,13 @@ "pycocotools": "pycocotools", "pydantic": "pydantic", "pypdfium2": "pypdfium2", + "pypinyin": "pypinyin", "yaml": "PyYAML", "regex": "regex", "requests": "requests", "ruamel.yaml": "ruamel.yaml", "safetensors": 
"safetensors", + "scipy": "scipy", "skimage": "scikit-image", "sklearn": "scikit-learn", "sentencepiece": "sentencepiece", @@ -120,6 +124,7 @@ "paddle_custom_device", "ultra_infer", "fastdeploy", + "onnxruntime", } diff --git a/api_examples/pipelines/test_pp_structure_v3.py b/api_examples/pipelines/test_pp_structure_v3.py index 5e69b7cf91..69da85f7de 100644 --- a/api_examples/pipelines/test_pp_structure_v3.py +++ b/api_examples/pipelines/test_pp_structure_v3.py @@ -21,7 +21,7 @@ use_doc_orientation_classify=False, use_doc_unwarping=False, use_common_ocr=True, - use_seal_recognition=True, + use_seal_recognition=False, use_table_recognition=True, ) diff --git a/api_examples/pipelines/test_text_to_speech.py b/api_examples/pipelines/test_text_to_speech.py new file mode 100644 index 0000000000..7eeb47c453 --- /dev/null +++ b/api_examples/pipelines/test_text_to_speech.py @@ -0,0 +1,27 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from paddlex import create_pipeline + +pipeline = create_pipeline(pipeline="text_to_speech") + +output = pipeline.predict( + "根据您的情况,建议低盐饮食配合轻度活动,已为您推荐了健康的食谱" +) + +for res in output: + print(res) + res.print() + res.save_to_audio("./output/test.wav") + res.save_to_json("./output") diff --git a/deploy/genai_vllm_server_docker/Dockerfile b/deploy/genai_vllm_server_docker/Dockerfile index efad42ebe0..84aa4206fd 100644 --- a/deploy/genai_vllm_server_docker/Dockerfile +++ b/deploy/genai_vllm_server_docker/Dockerfile @@ -8,14 +8,17 @@ ENV PIP_NO_CACHE_DIR=0 ENV PYTHONUNBUFFERED=1 ENV PYTHONDONTWRITEBYTECODE=1 +RUN python -m pip install torch==2.8.0 + ARG PADDLEX_VERSION=">=3.3.6,<3.4" RUN python -m pip install "paddlex${PADDLEX_VERSION}" ARG BUILD_FOR_SM120=false RUN if [ "${BUILD_FOR_SM120}" = 'true' ]; then \ - python -m pip install https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/v0.3.14/flash_attn-2.8.3+cu128torch2.8-cp310-cp310-linux_x86_64.whl \ + python -m pip install https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/v0.4.11/flash_attn-2.8.3%2Bcu128torch2.8-cp310-cp310-linux_x86_64.whl; \ else \ - python -m pip install https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/v0.3.14/flash_attn-2.8.2+cu128torch2.8-cp310-cp310-linux_x86_64.whl \ + python -m pip install https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/v0.3.14/flash_attn-2.8.2+cu128torch2.8-cp310-cp310-linux_x86_64.whl; \ + fi \ && paddlex --install genai-vllm-server EXPOSE 8080 diff --git a/deploy/genai_vllm_server_docker/build.sh b/deploy/genai_vllm_server_docker/build.sh index 74019f5c13..f0c8773a27 100755 --- a/deploy/genai_vllm_server_docker/build.sh +++ b/deploy/genai_vllm_server_docker/build.sh @@ -21,8 +21,8 @@ while [[ $# -gt 0 ]]; do shift ;; *) - echo "Unknown option: $1" - exit 1 + echo "Unknown option: $1" >&2 + exit 2 ;; esac done diff --git a/deploy/hps/sdk/common/server.sh 
b/deploy/hps/sdk/common/server.sh index a14dcb13fb..4d543de823 100755 --- a/deploy/hps/sdk/common/server.sh +++ b/deploy/hps/sdk/common/server.sh @@ -14,8 +14,12 @@ rm -rf "${MODEL_REPO_DIR}" cp -r model_repo "${MODEL_REPO_DIR}" find "${MODEL_REPO_DIR}" -mindepth 1 -maxdepth 1 -type d -print0 | while IFS= read -r -d '' dir_; do - if [ -f "${dir_}/config_${PADDLEX_HPS_DEVICE_TYPE}.pbtxt" ]; then - cp -f "${dir_}/config_${PADDLEX_HPS_DEVICE_TYPE}.pbtxt" "${dir_}/config.pbtxt" + if [ ! -f "${dir_}/config.pbtxt" ]; then + if [ "${PADDLEX_HPS_DEVICE_TYPE:-gpu}" = 'gpu' ]; then + cp -f "${dir_}/config_gpu.pbtxt" "${dir_}/config.pbtxt" + else + cp -f "${dir_}/config_cpu.pbtxt" "${dir_}/config.pbtxt" + fi fi done diff --git a/deploy/hps/sdk/pipelines/3d_bev_detection/version.txt b/deploy/hps/sdk/pipelines/3d_bev_detection/version.txt new file mode 100644 index 0000000000..17e51c385e --- /dev/null +++ b/deploy/hps/sdk/pipelines/3d_bev_detection/version.txt @@ -0,0 +1 @@ +0.1.1 diff --git a/deploy/hps/sdk/pipelines/OCR/server/model_repo/ocr/1/model.py b/deploy/hps/sdk/pipelines/OCR/server/model_repo/ocr/1/model.py index a3e8facab9..7a99bf9829 100644 --- a/deploy/hps/sdk/pipelines/OCR/server/model_repo/ocr/1/model.py +++ b/deploy/hps/sdk/pipelines/OCR/server/model_repo/ocr/1/model.py @@ -20,6 +20,7 @@ from paddlex_hps_server import ( BaseTritonPythonModel, app_common, + logging, protocol, schemas, utils, @@ -167,10 +168,13 @@ def run_batch(self, inputs, log_ids, batch_id): def _group_inputs(self, inputs): def _to_hashable(obj): - if isinstance(obj, list): - return tuple(obj) - elif isinstance(obj, dict): - return tuple(sorted(obj.items())) + if isinstance(obj, dict): + return tuple( + (_to_hashable(k), _to_hashable(v)) + for k, v in sorted(obj.items(), key=lambda x: repr(x[0])) + ) + elif isinstance(obj, list): + return tuple(_to_hashable(x) for x in obj) else: return obj @@ -231,12 +235,20 @@ def _preprocess(self, input, log_id): else self.app_config.visualize ) - file_bytes 
= utils.get_raw_bytes(input.file) - images, data_info = utils.file_to_images( - file_bytes, - file_type, - max_num_imgs=self.context["max_num_input_imgs"], - ) + try: + file_bytes = utils.get_raw_bytes(input.file) + images, data_info = utils.file_to_images( + file_bytes, + file_type, + max_num_imgs=self.context["max_num_input_imgs"], + ) + except Exception as e: + logging.error("Failed to get input file bytes: %s", e) + return protocol.create_aistudio_output_without_result( + 422, + "Input file is invalid", + log_id=log_id, + ) return images, data_info, visualize_enabled diff --git a/deploy/hps/sdk/pipelines/OCR/version.txt b/deploy/hps/sdk/pipelines/OCR/version.txt new file mode 100644 index 0000000000..3a4036fb45 --- /dev/null +++ b/deploy/hps/sdk/pipelines/OCR/version.txt @@ -0,0 +1 @@ +0.2.5 diff --git a/deploy/hps/sdk/pipelines/PP-ChatOCRv3-doc/version.txt b/deploy/hps/sdk/pipelines/PP-ChatOCRv3-doc/version.txt new file mode 100644 index 0000000000..d15723fbe8 --- /dev/null +++ b/deploy/hps/sdk/pipelines/PP-ChatOCRv3-doc/version.txt @@ -0,0 +1 @@ +0.3.2 diff --git a/deploy/hps/sdk/pipelines/PP-ChatOCRv4-doc/version.txt b/deploy/hps/sdk/pipelines/PP-ChatOCRv4-doc/version.txt new file mode 100644 index 0000000000..2b7c5ae018 --- /dev/null +++ b/deploy/hps/sdk/pipelines/PP-ChatOCRv4-doc/version.txt @@ -0,0 +1 @@ +0.4.2 diff --git a/deploy/hps/sdk/pipelines/PP-DocTranslation/version.txt b/deploy/hps/sdk/pipelines/PP-DocTranslation/version.txt new file mode 100644 index 0000000000..d917d3e26a --- /dev/null +++ b/deploy/hps/sdk/pipelines/PP-DocTranslation/version.txt @@ -0,0 +1 @@ +0.1.2 diff --git a/deploy/hps/sdk/pipelines/PP-ShiTuV2/version.txt b/deploy/hps/sdk/pipelines/PP-ShiTuV2/version.txt new file mode 100644 index 0000000000..17e51c385e --- /dev/null +++ b/deploy/hps/sdk/pipelines/PP-ShiTuV2/version.txt @@ -0,0 +1 @@ +0.1.1 diff --git a/deploy/hps/sdk/pipelines/PP-StructureV3/server/model_repo/layout-parsing/1/model.py 
b/deploy/hps/sdk/pipelines/PP-StructureV3/server/model_repo/layout-parsing/1/model.py index 6ccf3c2b2f..f7e8d9b56b 100644 --- a/deploy/hps/sdk/pipelines/PP-StructureV3/server/model_repo/layout-parsing/1/model.py +++ b/deploy/hps/sdk/pipelines/PP-StructureV3/server/model_repo/layout-parsing/1/model.py @@ -19,6 +19,7 @@ from paddlex_hps_server import ( BaseTritonPythonModel, app_common, + logging, protocol, schemas, utils, @@ -163,6 +164,7 @@ def run_batch(self, inputs, log_ids, batch_id): use_e2e_wireless_table_rec_model=inputs_g[ 0 ].useE2eWirelessTableRecModel, + markdown_ignore_labels=inputs_g[0].markdownIgnoreLabels, ) ) @@ -199,10 +201,13 @@ def run_batch(self, inputs, log_ids, batch_id): def _group_inputs(self, inputs): def _to_hashable(obj): - if isinstance(obj, list): - return tuple(obj) - elif isinstance(obj, dict): - return tuple(sorted(obj.items())) + if isinstance(obj, dict): + return tuple( + (_to_hashable(k), _to_hashable(v)) + for k, v in sorted(obj.items(), key=lambda x: repr(x[0])) + ) + elif isinstance(obj, list): + return tuple(_to_hashable(x) for x in obj) else: return obj @@ -243,6 +248,7 @@ def _hash(input): input.useOcrResultsWithTableCells, input.useE2eWiredTableRecModel, input.useE2eWirelessTableRecModel, + input.markdownIgnoreLabels, ), ) ) @@ -284,12 +290,20 @@ def _preprocess(self, input, log_id): else self.app_config.visualize ) - file_bytes = utils.get_raw_bytes(input.file) - images, data_info = utils.file_to_images( - file_bytes, - file_type, - max_num_imgs=self.context["max_num_input_imgs"], - ) + try: + file_bytes = utils.get_raw_bytes(input.file) + images, data_info = utils.file_to_images( + file_bytes, + file_type, + max_num_imgs=self.context["max_num_input_imgs"], + ) + except Exception as e: + logging.error("Failed to get input file bytes: %s", e) + return protocol.create_aistudio_output_without_result( + 422, + "Input file is invalid", + log_id=log_id, + ) return images, data_info, visualize_enabled @@ -297,7 +311,11 @@ def 
_postprocess(self, images, data_info, visualize_enabled, preds, log_id, inpu layout_parsing_results: List[Dict[str, Any]] = [] for i, (img, item) in enumerate(zip(images, preds)): pruned_res = app_common.prune_result(item.json["res"]) - md_data = item.markdown + # XXX + md_data = item._to_markdown( + pretty=input.prettifyMarkdown, + show_formula_number=input.showFormulaNumber, + ) md_text = md_data["markdown_texts"] md_imgs = app_common.postprocess_images( md_data["markdown_images"], diff --git a/deploy/hps/sdk/pipelines/PP-StructureV3/server/pipeline_config.yaml b/deploy/hps/sdk/pipelines/PP-StructureV3/server/pipeline_config.yaml index a93952771a..b45d0d0d89 100644 --- a/deploy/hps/sdk/pipelines/PP-StructureV3/server/pipeline_config.yaml +++ b/deploy/hps/sdk/pipelines/PP-StructureV3/server/pipeline_config.yaml @@ -11,6 +11,15 @@ use_chart_recognition: False use_region_detection: True format_block_content: False +markdown_ignore_labels: + - number + - footnote + - header + - header_image + - footer + - footer_image + - aside_text + SubModules: LayoutDetection: module_name: layout_detection diff --git a/deploy/hps/sdk/pipelines/PP-StructureV3/version.txt b/deploy/hps/sdk/pipelines/PP-StructureV3/version.txt new file mode 100644 index 0000000000..c2c0004f0e --- /dev/null +++ b/deploy/hps/sdk/pipelines/PP-StructureV3/version.txt @@ -0,0 +1 @@ +0.3.5 diff --git a/deploy/hps/sdk/pipelines/PaddleOCR-VL/server/model_repo/layout-parsing/1/model.py b/deploy/hps/sdk/pipelines/PaddleOCR-VL/server/model_repo/layout-parsing/1/model.py index 2e8e008e1e..f0b0a64ac3 100644 --- a/deploy/hps/sdk/pipelines/PaddleOCR-VL/server/model_repo/layout-parsing/1/model.py +++ b/deploy/hps/sdk/pipelines/PaddleOCR-VL/server/model_repo/layout-parsing/1/model.py @@ -19,6 +19,7 @@ from paddlex_hps_server import ( BaseTritonPythonModel, app_common, + logging, protocol, schemas, utils, @@ -124,10 +125,13 @@ def run_batch(self, inputs, log_ids, batch_id): 
use_doc_unwarping=inputs_g[0].useDocUnwarping, use_layout_detection=inputs_g[0].useLayoutDetection, use_chart_recognition=inputs_g[0].useChartRecognition, + use_seal_recognition=inputs_g[0].useSealRecognition, + use_ocr_for_image_block=inputs_g[0].useOcrForImageBlock, layout_threshold=inputs_g[0].layoutThreshold, layout_nms=inputs_g[0].layoutNms, layout_unclip_ratio=inputs_g[0].layoutUnclipRatio, layout_merge_bboxes_mode=inputs_g[0].layoutMergeBboxesMode, + layout_shape_mode=inputs_g[0].layoutShapeMode, prompt_label=inputs_g[0].promptLabel, format_block_content=inputs_g[0].formatBlockContent, repetition_penalty=inputs_g[0].repetitionPenalty, @@ -135,6 +139,10 @@ def run_batch(self, inputs, log_ids, batch_id): top_p=inputs_g[0].topP, min_pixels=inputs_g[0].minPixels, max_pixels=inputs_g[0].maxPixels, + max_new_tokens=inputs_g[0].maxNewTokens, + merge_layout_blocks=inputs_g[0].mergeLayoutBlocks, + markdown_ignore_labels=inputs_g[0].markdownIgnoreLabels, + vlm_extra_args=inputs_g[0].vlmExtraArgs, ) ) @@ -171,10 +179,13 @@ def run_batch(self, inputs, log_ids, batch_id): def _group_inputs(self, inputs): def _to_hashable(obj): - if isinstance(obj, list): - return tuple(obj) - elif isinstance(obj, dict): - return tuple(sorted(obj.items())) + if isinstance(obj, dict): + return tuple( + (_to_hashable(k), _to_hashable(v)) + for k, v in sorted(obj.items(), key=lambda x: repr(x[0])) + ) + elif isinstance(obj, list): + return tuple(_to_hashable(x) for x in obj) else: return obj @@ -189,10 +200,13 @@ def _hash(input): input.useDocUnwarping, input.useLayoutDetection, input.useChartRecognition, + input.useSealRecognition, + input.useOcrForImageBlock, input.layoutThreshold, input.layoutNms, input.layoutUnclipRatio, input.layoutMergeBboxesMode, + input.layoutShapeMode, input.promptLabel, input.formatBlockContent, input.repetitionPenalty, @@ -200,6 +214,10 @@ def _hash(input): input.topP, input.minPixels, input.maxPixels, + input.maxNewTokens, + input.mergeLayoutBlocks, + 
input.markdownIgnoreLabels, + input.vlmExtraArgs, ) ), ) @@ -242,16 +260,32 @@ def _preprocess(self, input, log_id): else self.app_config.visualize ) - file_bytes = utils.get_raw_bytes(input.file) - images, data_info = utils.file_to_images( - file_bytes, - file_type, - max_num_imgs=self.context["max_num_input_imgs"], - ) + try: + file_bytes = utils.get_raw_bytes(input.file) + images, data_info = utils.file_to_images( + file_bytes, + file_type, + max_num_imgs=self.context["max_num_input_imgs"], + ) + except Exception as e: + logging.error("Failed to get input file bytes: %s", e) + return protocol.create_aistudio_output_without_result( + 422, + "Input file is invalid", + log_id=log_id, + ) return images, data_info, visualize_enabled def _postprocess(self, images, data_info, visualize_enabled, preds, log_id, input): + if input.restructurePages: + preds = self.pipeline.restructure_pages( + preds, + merge_tables=input.mergeTables, + relevel_titles=input.relevelTitles, + concatenate_pages=False, + ) + preds = list(preds) layout_parsing_results: List[Dict[str, Any]] = [] for i, (img, item) in enumerate(zip(images, preds)): pruned_res = app_common.prune_result(item.json["res"]) diff --git a/deploy/hps/sdk/pipelines/PaddleOCR-VL/server/model_repo/restructure-pages/1/model.py b/deploy/hps/sdk/pipelines/PaddleOCR-VL/server/model_repo/restructure-pages/1/model.py new file mode 100644 index 0000000000..c36fd76a10 --- /dev/null +++ b/deploy/hps/sdk/pipelines/PaddleOCR-VL/server/model_repo/restructure-pages/1/model.py @@ -0,0 +1,90 @@ +# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddlex_hps_server import ( + BaseTritonPythonModel, + app_common, + schemas, +) + + +class TritonPythonModel(BaseTritonPythonModel): + @property + def pipeline_creation_kwargs(self): + return {"initial_predictor": False} + + def get_input_model_type(self): + return schemas.paddleocr_vl.RestructurePagesRequest + + def get_result_model_type(self): + return schemas.paddleocr_vl.RestructurePagesResult + + def run(self, input, log_id): + def _to_original_result(pruned_res, page_index): + res = {**pruned_res, "input_path": "", "page_index": page_index} + orig_res = {"res": res} + return orig_res + + original_results = [] + markdown_images = {} + for i, page in enumerate(input.pages): + orig_res = _to_original_result(page.prunedResult, i) + original_results.append(orig_res) + if input.concatenatePages: + markdown_images.update(page.markdownImages) + + restructured_results = self.pipeline.restructure_pages( + original_results, + merge_tables=input.mergeTables, + relevel_titles=input.relevelTitles, + concatenate_pages=input.concatenatePages, + ) + restructured_results = list(restructured_results) + + layout_parsing_results = [] + if input.concatenatePages: + layout_parsing_result = {} + layout_parsing_result["prunedResult"] = app_common.prune_result( + restructured_results[0].json["res"] + ) + # XXX + md_data = restructured_results[0]._to_markdown( + pretty=input.prettifyMarkdown, + show_formula_number=input.showFormulaNumber, + ) + layout_parsing_result["markdown"] = dict( + text=md_data["markdown_texts"], + images=markdown_images, + ) + 
layout_parsing_results.append(layout_parsing_result) + else: + for new_res, old_page in zip(restructured_results, input.pages): + layout_parsing_result = {} + layout_parsing_result["prunedResult"] = app_common.prune_result( + new_res.json["res"] + ) + # XXX + md_data = new_res._to_markdown( + pretty=input.prettifyMarkdown, + show_formula_number=input.showFormulaNumber, + ) + layout_parsing_result["markdown"] = dict( + text=md_data["markdown_texts"], + images=old_page.markdownImages, + ) + layout_parsing_results.append(layout_parsing_result) + + return schemas.paddleocr_vl.RestructurePagesResult( + layoutParsingResults=layout_parsing_results, + ) diff --git a/deploy/hps/sdk/pipelines/PaddleOCR-VL/server/model_repo/restructure-pages/config.pbtxt b/deploy/hps/sdk/pipelines/PaddleOCR-VL/server/model_repo/restructure-pages/config.pbtxt new file mode 100644 index 0000000000..b7d640dad0 --- /dev/null +++ b/deploy/hps/sdk/pipelines/PaddleOCR-VL/server/model_repo/restructure-pages/config.pbtxt @@ -0,0 +1,22 @@ +backend: "python" +max_batch_size: 1 +input [ + { + name: "input" + data_type: TYPE_STRING + dims: [ 1 ] + } +] +output [ + { + name: "output" + data_type: TYPE_STRING + dims: [ 1 ] + } +] +instance_group [ + { + count: 1 + kind: KIND_CPU + } +] diff --git a/deploy/hps/sdk/pipelines/PaddleOCR-VL/server/pipeline_config.yaml b/deploy/hps/sdk/pipelines/PaddleOCR-VL/server/pipeline_config.yaml index 280a454604..ebf5804d29 100644 --- a/deploy/hps/sdk/pipelines/PaddleOCR-VL/server/pipeline_config.yaml +++ b/deploy/hps/sdk/pipelines/PaddleOCR-VL/server/pipeline_config.yaml @@ -9,6 +9,15 @@ use_doc_preprocessor: False use_layout_detection: True use_chart_recognition: False format_block_content: False +merge_layout_blocks: True +markdown_ignore_labels: + - number + - footnote + - header + - header_image + - footer + - footer_image + - aside_text SubModules: LayoutDetection: @@ -74,7 +83,7 @@ SubModules: module_name: vl_recognition model_name: PaddleOCR-VL-0.9B model_dir: null 
- batch_size: 4096 + batch_size: -1 genai_config: backend: native diff --git a/deploy/hps/sdk/pipelines/PaddleOCR-VL/version.txt b/deploy/hps/sdk/pipelines/PaddleOCR-VL/version.txt new file mode 100644 index 0000000000..0ea3a944b3 --- /dev/null +++ b/deploy/hps/sdk/pipelines/PaddleOCR-VL/version.txt @@ -0,0 +1 @@ +0.2.0 diff --git a/deploy/hps/sdk/pipelines/anomaly_detection/version.txt b/deploy/hps/sdk/pipelines/anomaly_detection/version.txt new file mode 100644 index 0000000000..17e51c385e --- /dev/null +++ b/deploy/hps/sdk/pipelines/anomaly_detection/version.txt @@ -0,0 +1 @@ +0.1.1 diff --git a/deploy/hps/sdk/pipelines/doc_preprocessor/version.txt b/deploy/hps/sdk/pipelines/doc_preprocessor/version.txt new file mode 100644 index 0000000000..0c62199f16 --- /dev/null +++ b/deploy/hps/sdk/pipelines/doc_preprocessor/version.txt @@ -0,0 +1 @@ +0.2.1 diff --git a/deploy/hps/sdk/pipelines/doc_understanding/version.txt b/deploy/hps/sdk/pipelines/doc_understanding/version.txt new file mode 100644 index 0000000000..d917d3e26a --- /dev/null +++ b/deploy/hps/sdk/pipelines/doc_understanding/version.txt @@ -0,0 +1 @@ +0.1.2 diff --git a/deploy/hps/sdk/pipelines/face_recognition/version.txt b/deploy/hps/sdk/pipelines/face_recognition/version.txt new file mode 100644 index 0000000000..17e51c385e --- /dev/null +++ b/deploy/hps/sdk/pipelines/face_recognition/version.txt @@ -0,0 +1 @@ +0.1.1 diff --git a/deploy/hps/sdk/pipelines/formula_recognition/version.txt b/deploy/hps/sdk/pipelines/formula_recognition/version.txt new file mode 100644 index 0000000000..0c62199f16 --- /dev/null +++ b/deploy/hps/sdk/pipelines/formula_recognition/version.txt @@ -0,0 +1 @@ +0.2.1 diff --git a/deploy/hps/sdk/pipelines/human_keypoint_detection/version.txt b/deploy/hps/sdk/pipelines/human_keypoint_detection/version.txt new file mode 100644 index 0000000000..17e51c385e --- /dev/null +++ b/deploy/hps/sdk/pipelines/human_keypoint_detection/version.txt @@ -0,0 +1 @@ +0.1.1 diff --git 
a/deploy/hps/sdk/pipelines/image_classification/version.txt b/deploy/hps/sdk/pipelines/image_classification/version.txt new file mode 100644 index 0000000000..17e51c385e --- /dev/null +++ b/deploy/hps/sdk/pipelines/image_classification/version.txt @@ -0,0 +1 @@ +0.1.1 diff --git a/deploy/hps/sdk/pipelines/image_multilabel_classification/version.txt b/deploy/hps/sdk/pipelines/image_multilabel_classification/version.txt new file mode 100644 index 0000000000..17e51c385e --- /dev/null +++ b/deploy/hps/sdk/pipelines/image_multilabel_classification/version.txt @@ -0,0 +1 @@ +0.1.1 diff --git a/deploy/hps/sdk/pipelines/instance_segmentation/version.txt b/deploy/hps/sdk/pipelines/instance_segmentation/version.txt new file mode 100644 index 0000000000..17e51c385e --- /dev/null +++ b/deploy/hps/sdk/pipelines/instance_segmentation/version.txt @@ -0,0 +1 @@ +0.1.1 diff --git a/deploy/hps/sdk/pipelines/layout_parsing/version.txt b/deploy/hps/sdk/pipelines/layout_parsing/version.txt new file mode 100644 index 0000000000..9e11b32fca --- /dev/null +++ b/deploy/hps/sdk/pipelines/layout_parsing/version.txt @@ -0,0 +1 @@ +0.3.1 diff --git a/deploy/hps/sdk/pipelines/multilingual_speech_recognition/version.txt b/deploy/hps/sdk/pipelines/multilingual_speech_recognition/version.txt new file mode 100644 index 0000000000..0c62199f16 --- /dev/null +++ b/deploy/hps/sdk/pipelines/multilingual_speech_recognition/version.txt @@ -0,0 +1 @@ +0.2.1 diff --git a/deploy/hps/sdk/pipelines/object_detection/version.txt b/deploy/hps/sdk/pipelines/object_detection/version.txt new file mode 100644 index 0000000000..17e51c385e --- /dev/null +++ b/deploy/hps/sdk/pipelines/object_detection/version.txt @@ -0,0 +1 @@ +0.1.1 diff --git a/deploy/hps/sdk/pipelines/open_vocabulary_detection/version.txt b/deploy/hps/sdk/pipelines/open_vocabulary_detection/version.txt new file mode 100644 index 0000000000..17e51c385e --- /dev/null +++ b/deploy/hps/sdk/pipelines/open_vocabulary_detection/version.txt @@ -0,0 +1 @@ 
+0.1.1 diff --git a/deploy/hps/sdk/pipelines/open_vocabulary_segmentation/version.txt b/deploy/hps/sdk/pipelines/open_vocabulary_segmentation/version.txt new file mode 100644 index 0000000000..17e51c385e --- /dev/null +++ b/deploy/hps/sdk/pipelines/open_vocabulary_segmentation/version.txt @@ -0,0 +1 @@ +0.1.1 diff --git a/deploy/hps/sdk/pipelines/pedestrian_attribute_recognition/version.txt b/deploy/hps/sdk/pipelines/pedestrian_attribute_recognition/version.txt new file mode 100644 index 0000000000..17e51c385e --- /dev/null +++ b/deploy/hps/sdk/pipelines/pedestrian_attribute_recognition/version.txt @@ -0,0 +1 @@ +0.1.1 diff --git a/deploy/hps/sdk/pipelines/rotated_object_detection/version.txt b/deploy/hps/sdk/pipelines/rotated_object_detection/version.txt new file mode 100644 index 0000000000..17e51c385e --- /dev/null +++ b/deploy/hps/sdk/pipelines/rotated_object_detection/version.txt @@ -0,0 +1 @@ +0.1.1 diff --git a/deploy/hps/sdk/pipelines/seal_recognition/version.txt b/deploy/hps/sdk/pipelines/seal_recognition/version.txt new file mode 100644 index 0000000000..ee1372d33a --- /dev/null +++ b/deploy/hps/sdk/pipelines/seal_recognition/version.txt @@ -0,0 +1 @@ +0.2.2 diff --git a/deploy/hps/sdk/pipelines/semantic_segmentation/version.txt b/deploy/hps/sdk/pipelines/semantic_segmentation/version.txt new file mode 100644 index 0000000000..17e51c385e --- /dev/null +++ b/deploy/hps/sdk/pipelines/semantic_segmentation/version.txt @@ -0,0 +1 @@ +0.1.1 diff --git a/deploy/hps/sdk/pipelines/small_object_detection/version.txt b/deploy/hps/sdk/pipelines/small_object_detection/version.txt new file mode 100644 index 0000000000..17e51c385e --- /dev/null +++ b/deploy/hps/sdk/pipelines/small_object_detection/version.txt @@ -0,0 +1 @@ +0.1.1 diff --git a/deploy/hps/sdk/pipelines/table_recognition/version.txt b/deploy/hps/sdk/pipelines/table_recognition/version.txt new file mode 100644 index 0000000000..267577d47e --- /dev/null +++ 
b/deploy/hps/sdk/pipelines/table_recognition/version.txt @@ -0,0 +1 @@ +0.4.1 diff --git a/deploy/hps/sdk/pipelines/table_recognition_v2/version.txt b/deploy/hps/sdk/pipelines/table_recognition_v2/version.txt new file mode 100644 index 0000000000..267577d47e --- /dev/null +++ b/deploy/hps/sdk/pipelines/table_recognition_v2/version.txt @@ -0,0 +1 @@ +0.4.1 diff --git a/deploy/hps/sdk/pipelines/ts_anomaly_detection/version.txt b/deploy/hps/sdk/pipelines/ts_anomaly_detection/version.txt new file mode 100644 index 0000000000..17e51c385e --- /dev/null +++ b/deploy/hps/sdk/pipelines/ts_anomaly_detection/version.txt @@ -0,0 +1 @@ +0.1.1 diff --git a/deploy/hps/sdk/pipelines/ts_classification/version.txt b/deploy/hps/sdk/pipelines/ts_classification/version.txt new file mode 100644 index 0000000000..17e51c385e --- /dev/null +++ b/deploy/hps/sdk/pipelines/ts_classification/version.txt @@ -0,0 +1 @@ +0.1.1 diff --git a/deploy/hps/sdk/pipelines/ts_forecast/version.txt b/deploy/hps/sdk/pipelines/ts_forecast/version.txt new file mode 100644 index 0000000000..17e51c385e --- /dev/null +++ b/deploy/hps/sdk/pipelines/ts_forecast/version.txt @@ -0,0 +1 @@ +0.1.1 diff --git a/deploy/hps/sdk/pipelines/vehicle_attribute_recognition/version.txt b/deploy/hps/sdk/pipelines/vehicle_attribute_recognition/version.txt new file mode 100644 index 0000000000..17e51c385e --- /dev/null +++ b/deploy/hps/sdk/pipelines/vehicle_attribute_recognition/version.txt @@ -0,0 +1 @@ +0.1.1 diff --git a/deploy/hps/sdk/pipelines/video_classification/version.txt b/deploy/hps/sdk/pipelines/video_classification/version.txt new file mode 100644 index 0000000000..0c62199f16 --- /dev/null +++ b/deploy/hps/sdk/pipelines/video_classification/version.txt @@ -0,0 +1 @@ +0.2.1 diff --git a/deploy/hps/sdk/pipelines/video_detection/version.txt b/deploy/hps/sdk/pipelines/video_detection/version.txt new file mode 100644 index 0000000000..0c62199f16 --- /dev/null +++ b/deploy/hps/sdk/pipelines/video_detection/version.txt @@ 
-0,0 +1 @@ +0.2.1 diff --git a/deploy/hps/sdk/scripts/assemble.py b/deploy/hps/sdk/scripts/assemble.py index bf40dab9d8..0354108334 100755 --- a/deploy/hps/sdk/scripts/assemble.py +++ b/deploy/hps/sdk/scripts/assemble.py @@ -15,7 +15,6 @@ # limitations under the License. import argparse -import json import pathlib import shutil import subprocess @@ -30,7 +29,6 @@ PIPELINES_DIR = BASE_DIR / "pipelines" COMMON_DIR = BASE_DIR / "common" CLIENT_LIB_PATH = BASE_DIR / "paddlex-hps-client" -VERSIONS_PATH = BASE_DIR / "versions.json" OUTPUT_DIR = BASE_DIR / "output" @@ -88,17 +86,12 @@ ) client_lib_whl_path = next(OUTPUT_DIR.glob("paddlex_hps_client*.whl")) - with VERSIONS_PATH.open("r", encoding="utf-8") as f: - versions = json.load(f) - for pipeline_name in pipeline_names: print("=" * 30) print(f"Pipeline: {pipeline_name}") pipeline_dir = PIPELINES_DIR / pipeline_name if not pipeline_dir.exists(): sys.exit(f"{pipeline_dir} not found") - if pipeline_name not in versions: - sys.exit(f"Version is missing for {repr(pipeline_name)}") tgt_name = TARGET_NAME_PATTERN.format(pipeline_name=pipeline_name) tgt_dir = OUTPUT_DIR / tgt_name @@ -125,8 +118,7 @@ shutil.copytree(pipeline_dir / "client", tgt_dir / "client") shutil.copy(client_lib_whl_path, tgt_dir / "client") - version = versions[pipeline_name] - (tgt_dir / "version.txt").write_text(version + "\n", encoding="utf-8") + shutil.copy(pipeline_dir / "version.txt", tgt_dir / "version.txt") arch_path = tgt_dir.with_suffix(ARCHIVE_SUFFIX) print(f"Creating archive: {arch_path}") diff --git a/deploy/hps/sdk/scripts/assemble.sh b/deploy/hps/sdk/scripts/assemble.sh index 6c4257443c..9926d45779 100755 --- a/deploy/hps/sdk/scripts/assemble.sh +++ b/deploy/hps/sdk/scripts/assemble.sh @@ -4,7 +4,6 @@ docker run \ -it \ -e OUID="$(id -u)" \ -e OGID="$(id -g)" \ - -e PIP_INDEX_URL=https://pypi.tuna.tsinghua.edu.cn/simple \ -v "$(pwd)":/workspace \ -w /workspace \ --rm \ diff --git a/deploy/hps/sdk/versions.json 
b/deploy/hps/sdk/versions.json deleted file mode 100644 index 1ed22121bf..0000000000 --- a/deploy/hps/sdk/versions.json +++ /dev/null @@ -1,37 +0,0 @@ -{ - "3d_bev_detection": "0.1.0", - "anomaly_detection": "0.1.0", - "doc_preprocessor": "0.2.0", - "doc_understanding": "0.1.1", - "face_recognition": "0.1.0", - "formula_recognition": "0.2.0", - "human_keypoint_detection": "0.1.0", - "image_classification": "0.1.0", - "image_multilabel_classification": "0.1.0", - "instance_segmentation": "0.1.0", - "layout_parsing": "0.3.0", - "PP-StructureV3": "0.3.2", - "multilingual_speech_recognition": "0.2.0", - "object_detection": "0.1.0", - "OCR": "0.2.3", - "open_vocabulary_detection": "0.1.0", - "open_vocabulary_segmentation": "0.1.0", - "pedestrian_attribute_recognition": "0.1.0", - "PP-ChatOCRv3-doc": "0.3.1", - "PP-ChatOCRv4-doc": "0.4.1", - "PP-DocTranslation": "0.1.1", - "PaddleOCR-VL": "0.1.1", - "PP-ShiTuV2": "0.1.0", - "rotated_object_detection": "0.1.0", - "seal_recognition": "0.2.1", - "semantic_segmentation": "0.1.0", - "small_object_detection": "0.1.0", - "table_recognition": "0.4.0", - "table_recognition_v2": "0.4.0", - "ts_anomaly_detection": "0.1.0", - "ts_classification": "0.1.0", - "ts_forecast": "0.1.0", - "vehicle_attribute_recognition": "0.1.0", - "video_classification": "0.2.0", - "video_detection": "0.2.0" -} diff --git a/deploy/hps/server_env/Dockerfile b/deploy/hps/server_env/Dockerfile index 1038a12d7f..28cb85c0a6 100644 --- a/deploy/hps/server_env/Dockerfile +++ b/deploy/hps/server_env/Dockerfile @@ -46,6 +46,7 @@ ENV PYTHONUNBUFFERED=1 ENV PYTHONDONTWRITEBYTECODE=1 ENV PIP_INDEX_URL=${PIP_INDEX_URL} +RUN python -m pip install pip==25.2 # Requirement collection FROM base AS rc @@ -99,8 +100,7 @@ RUN --mount=type=bind,source=deploy/hps/server_env/requirements/${DEVICE_TYPE}.t python -m pip install --requirement /tmp/requirements.txt --requirement /tmp/hpi_requirements.txt \ && if [ "${ENV_TYPE}" = 'dev' ]; then \ python -m pip install --requirement 
/tmp/dev_requirements.txt; \ - fi \ - && python -m pip install https://paddle-whl.bj.bcebos.com/nightly/cu126/safetensors/safetensors-0.6.2.dev0-cp38-abi3-linux_x86_64.whl + fi RUN --mount=type=bind,source=.,target=/tmp/PaddleX,rw \ python -m pip install --no-deps /tmp/PaddleX @@ -140,6 +140,8 @@ RUN groupadd -g 1000 paddlex \ # Not sure if all these deps are necessary RUN apt-get update \ && apt-get install -y --no-install-recommends libre2-5 libssl1.1 libb64-0d libnuma1 libarchive13 python3.10-dev libgl1-mesa-glx libglib2.0-0 libgomp1 ccache binutils build-essential texlive texlive-latex-base texlive-latex-extra \ + && apt-get install -y --no-install-recommends fontconfig fonts-dejavu-core fonts-liberation fonts-noto-cjk fonts-wqy-microhei fonts-freefont-ttf \ + && fc-cache -fv \ && rm -rf /var/lib/apt/lists/* COPY --from=build3 --chown=paddlex:paddlex /paddlex/libs /paddlex/libs diff --git a/deploy/hps/server_env/cpu_version.txt b/deploy/hps/server_env/cpu_version.txt index 940ac09aa6..e4737652ca 100644 --- a/deploy/hps/server_env/cpu_version.txt +++ b/deploy/hps/server_env/cpu_version.txt @@ -1 +1 @@ -0.3.9 +0.3.13 diff --git a/deploy/hps/server_env/gpu_version.txt b/deploy/hps/server_env/gpu_version.txt index 5503126d59..0b69c00c5f 100644 --- a/deploy/hps/server_env/gpu_version.txt +++ b/deploy/hps/server_env/gpu_version.txt @@ -1 +1 @@ -0.3.10 +0.3.14 diff --git a/deploy/hps/server_env/paddlex-hps-server/pyproject.toml b/deploy/hps/server_env/paddlex-hps-server/pyproject.toml index 422e3e19b6..5d85392841 100644 --- a/deploy/hps/server_env/paddlex-hps-server/pyproject.toml +++ b/deploy/hps/server_env/paddlex-hps-server/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "paddlex-hps-server" -version = "0.3.0" +version = "0.4.0" # `paddlex` is not included here dependencies = [ "colorlog >= 6.9", diff --git a/deploy/hps/server_env/paddlex-hps-server/src/paddlex_hps_server/base_model.py 
b/deploy/hps/server_env/paddlex-hps-server/src/paddlex_hps_server/base_model.py index a09262bb07..838701f1e9 100644 --- a/deploy/hps/server_env/paddlex-hps-server/src/paddlex_hps_server/base_model.py +++ b/deploy/hps/server_env/paddlex-hps-server/src/paddlex_hps_server/base_model.py @@ -61,10 +61,14 @@ def initialize(self, args): logging.info("Output names: %s", self.output_names) if args["model_instance_kind"] == "GPU": + if env.DEVICE_TYPE != "gpu": + raise pb_utils.TritonModelException( + f"Expected device type to be 'gpu', but got {repr(env.DEVICE_TYPE)}" + ) self._device_type = "gpu" self._device_id = int(args["model_instance_device_id"]) elif args["model_instance_kind"] == "CPU": - self._device_type = "cpu" + self._device_type = env.DEVICE_TYPE self._device_id = None else: raise pb_utils.TritonModelException( @@ -90,9 +94,6 @@ def execute(self, requests): outputs = {} log_ids = [] for i, request in enumerate(requests): - log_id = protocol.generate_log_id() - logging.info("Request %s received", log_id) - log_ids.append(log_id) input_ = pb_utils.get_input_tensor_by_name( request, constants.INPUT_NAME ) @@ -102,10 +103,19 @@ def execute(self, requests): input_ = protocol.parse_triton_input(input_, input_model_type) inputs[i] = input_ except ValidationError as e: + log_id = protocol.generate_log_id() output = protocol.create_aistudio_output_without_result( 422, str(e), log_id=log_id ) outputs[i] = output + else: + log_id = ( + input_.logId + if hasattr(input_, "logId") and input_.logId + else protocol.generate_log_id() + ) + logging.info("Request %s received", log_id) + log_ids.append(log_id) if inputs: try: diff --git a/deploy/hps/server_env/paddlex-hps-server/src/paddlex_hps_server/env.py b/deploy/hps/server_env/paddlex-hps-server/src/paddlex_hps_server/env.py index aec8b1a821..820019afc3 100644 --- a/deploy/hps/server_env/paddlex-hps-server/src/paddlex_hps_server/env.py +++ b/deploy/hps/server_env/paddlex-hps-server/src/paddlex_hps_server/env.py @@ -14,6 +14,7 
@@ import os +DEVICE_TYPE = os.getenv("PADDLEX_HPS_DEVICE_TYPE", "cpu") PIPELINE_CONFIG_PATH = os.getenv("PADDLEX_HPS_PIPELINE_CONFIG_PATH", "") USE_HPIP = os.getenv("PADDLEX_HPS_USE_HPIP", "") diff --git a/deploy/hps/server_env/requirements/cpu.in b/deploy/hps/server_env/requirements/cpu.in index b265f14dcd..328c4b54b8 100644 --- a/deploy/hps/server_env/requirements/cpu.in +++ b/deploy/hps/server_env/requirements/cpu.in @@ -1 +1 @@ -paddlepaddle @ https://paddle-whl.bj.bcebos.com/stable/cpu/paddlepaddle/paddlepaddle-3.1.1-cp310-cp310-linux_x86_64.whl +paddlepaddle @ https://paddle-whl.bj.bcebos.com/stable/cpu/paddlepaddle/paddlepaddle-3.2.1-cp310-cp310-linux_x86_64.whl diff --git a/deploy/hps/server_env/requirements/cpu.txt b/deploy/hps/server_env/requirements/cpu.txt index bbbeaab8e4..c24a8fc5af 100644 --- a/deploy/hps/server_env/requirements/cpu.txt +++ b/deploy/hps/server_env/requirements/cpu.txt @@ -129,6 +129,8 @@ imageio==2.37.0 # via scikit-image imagesize==1.4.1 # via paddlex (../../../setup.py) +jieba==0.42.1 + # via paddlex (../../../setup.py) jinja2==3.1.5 # via paddlex (../../../setup.py) jiter==0.8.2 @@ -171,6 +173,7 @@ lxml==5.3.1 # via # paddlex (../../../setup.py) # premailer + # python-docx markupsafe==3.0.2 # via jinja2 marshmallow==3.26.1 @@ -218,6 +221,8 @@ openai==1.63.2 # via # langchain-openai # paddlex (../../../setup.py) +opencc==1.1.9 + # via paddlex (../../../setup.py) opencv-contrib-python==4.10.0.84 # via # -r requirements/app.in @@ -238,7 +243,7 @@ packaging==24.2 # matplotlib # paddlex (../../../setup.py) # scikit-image -paddlepaddle @ https://paddle-whl.bj.bcebos.com/stable/cpu/paddlepaddle/paddlepaddle-3.1.1-cp310-cp310-linux_x86_64.whl +paddlepaddle @ https://paddle-whl.bj.bcebos.com/stable/cpu/paddlepaddle/paddlepaddle-3.2.1-cp310-cp310-linux_x86_64.whl # via -r requirements/cpu.in pandas==1.3.5 # via paddlex (../../../setup.py) @@ -289,12 +294,16 @@ pyparsing==3.2.1 # via matplotlib pypdfium2==4.30.1 # via paddlex 
(../../../setup.py) +pypinyin==0.55.0 + # via paddlex (../../../setup.py) python-bidi==0.6.6 # via paddlex (../../../setup.py) python-dateutil==2.9.0.post0 # via # matplotlib # pandas +python-docx==1.2.0 + # via paddlex (../../../setup.py) pytz==2025.1 # via pandas pyyaml==6.0.2 @@ -326,8 +335,10 @@ ruamel-yaml==0.18.10 # via paddlex (../../../setup.py) ruamel-yaml-clib==0.2.12 # via ruamel-yaml -safetensors==0.6.2 - # via paddlex (../../../setup.py) +safetensors==0.7.0 + # via + # paddlepaddle + # paddlex (../../../setup.py) scikit-image==0.24.0 # via paddlex (../../../setup.py) scikit-learn==1.6.1 @@ -396,6 +407,7 @@ typing-extensions==4.12.2 # paddlex (../../../setup.py) # pydantic # pydantic-core + # python-docx # sqlalchemy # typing-inspect # uvicorn diff --git a/deploy/hps/server_env/requirements/cpu_hpi.in b/deploy/hps/server_env/requirements/cpu_hpi.in index 0651ab6e9b..34315f6f41 100644 --- a/deploy/hps/server_env/requirements/cpu_hpi.in +++ b/deploy/hps/server_env/requirements/cpu_hpi.in @@ -1,3 +1,4 @@ +onnx_graphsurgeon == 0.5.6 onnxruntime == 1.22.0 paddle2onnx == 2.0.2rc3 ultra-infer-python @ https://paddle-model-ecology.bj.bcebos.com/paddlex/PaddleX3.0/deploy/hpi/ultra_infer/releases/new_hpi/v1.2.0/ultra_infer_python-1.2.0-cp310-cp310-linux_x86_64.whl diff --git a/deploy/hps/server_env/requirements/cpu_hpi.txt b/deploy/hps/server_env/requirements/cpu_hpi.txt index 61dd0c4bc3..3dbd6c686b 100644 --- a/deploy/hps/server_env/requirements/cpu_hpi.txt +++ b/deploy/hps/server_env/requirements/cpu_hpi.txt @@ -28,13 +28,17 @@ numpy==1.24.4 # via # -c requirements/cpu.txt # onnx + # onnx-graphsurgeon # onnxruntime # opencv-contrib-python # ultra-infer-python onnx==1.17.0 # via + # onnx-graphsurgeon # onnxoptimizer # paddle2onnx +onnx-graphsurgeon==0.5.6 + # via -r requirements/cpu_hpi.in onnxoptimizer==0.3.13 # via paddle2onnx onnxruntime==1.22.0 diff --git a/deploy/hps/server_env/requirements/gpu.in b/deploy/hps/server_env/requirements/gpu.in index 
152945b4c3..f2cbd2c900 100644 --- a/deploy/hps/server_env/requirements/gpu.in +++ b/deploy/hps/server_env/requirements/gpu.in @@ -1 +1 @@ -paddlepaddle-gpu @ https://paddle-model-ecology.bj.bcebos.com/paddlex/PaddleX3.0/deploy/deps/paddlepaddle/paddlepaddle_gpu-3.1.1%2Bfc-cp310-cp310-linux_x86_64.whl +paddlepaddle-gpu @ https://paddle-model-ecology.bj.bcebos.com/paddlex/PaddleX3.0/deploy/deps/paddlepaddle/paddlepaddle_gpu-3.2.1%2Bfc-cp310-cp310-linux_x86_64.whl diff --git a/deploy/hps/server_env/requirements/gpu.txt b/deploy/hps/server_env/requirements/gpu.txt index 3484d72f48..caa9a8fbc0 100644 --- a/deploy/hps/server_env/requirements/gpu.txt +++ b/deploy/hps/server_env/requirements/gpu.txt @@ -129,6 +129,8 @@ imageio==2.37.0 # via scikit-image imagesize==1.4.1 # via paddlex (../../../setup.py) +jieba==0.42.1 + # via paddlex (../../../setup.py) jinja2==3.1.5 # via paddlex (../../../setup.py) jiter==0.8.2 @@ -171,6 +173,7 @@ lxml==5.3.1 # via # paddlex (../../../setup.py) # premailer + # python-docx markupsafe==3.0.2 # via jinja2 marshmallow==3.26.1 @@ -218,6 +221,8 @@ openai==1.63.2 # via # langchain-openai # paddlex (../../../setup.py) +opencc==1.1.9 + # via paddlex (../../../setup.py) opencv-contrib-python==4.10.0.84 # via # -r requirements/app.in @@ -238,7 +243,7 @@ packaging==24.2 # matplotlib # paddlex (../../../setup.py) # scikit-image -paddlepaddle-gpu @ https://paddle-model-ecology.bj.bcebos.com/paddlex/PaddleX3.0/deploy/deps/paddlepaddle/paddlepaddle_gpu-3.1.1%2Bfc-cp310-cp310-linux_x86_64.whl +paddlepaddle-gpu @ https://paddle-model-ecology.bj.bcebos.com/paddlex/PaddleX3.0/deploy/deps/paddlepaddle/paddlepaddle_gpu-3.2.1%2Bfc-cp310-cp310-linux_x86_64.whl # via -r requirements/gpu.in pandas==1.3.5 # via paddlex (../../../setup.py) @@ -289,12 +294,16 @@ pyparsing==3.2.1 # via matplotlib pypdfium2==4.30.1 # via paddlex (../../../setup.py) +pypinyin==0.55.0 + # via paddlex (../../../setup.py) python-bidi==0.6.6 # via paddlex (../../../setup.py) 
python-dateutil==2.9.0.post0 # via # matplotlib # pandas +python-docx==1.2.0 + # via paddlex (../../../setup.py) pytz==2025.1 # via pandas pyyaml==6.0.2 @@ -326,8 +335,10 @@ ruamel-yaml==0.18.10 # via paddlex (../../../setup.py) ruamel-yaml-clib==0.2.12 # via ruamel-yaml -safetensors==0.6.2 - # via paddlex (../../../setup.py) +safetensors==0.7.0 + # via + # paddlepaddle-gpu + # paddlex (../../../setup.py) scikit-image==0.24.0 # via paddlex (../../../setup.py) scikit-learn==1.6.1 @@ -396,6 +407,7 @@ typing-extensions==4.12.2 # paddlex (../../../setup.py) # pydantic # pydantic-core + # python-docx # sqlalchemy # starlette # typing-inspect diff --git a/deploy/hps/server_env/requirements/gpu_hpi.in b/deploy/hps/server_env/requirements/gpu_hpi.in index c0dcaf4928..62f1da3240 100644 --- a/deploy/hps/server_env/requirements/gpu_hpi.in +++ b/deploy/hps/server_env/requirements/gpu_hpi.in @@ -1,2 +1,4 @@ +onnx_graphsurgeon == 0.5.6 +onnxruntime == 1.22.0 paddle2onnx == 2.0.2rc3 ultra-infer-gpu-python @ https://paddle-model-ecology.bj.bcebos.com/paddlex/PaddleX3.0/deploy/hpi/ultra_infer/releases/new_hpi/v1.2.0/ultra_infer_gpu_python-1.2.0-cp310-cp310-linux_x86_64.whl diff --git a/deploy/hps/server_env/requirements/gpu_hpi.txt b/deploy/hps/server_env/requirements/gpu_hpi.txt index de1a847d4e..851f058e2e 100644 --- a/deploy/hps/server_env/requirements/gpu_hpi.txt +++ b/deploy/hps/server_env/requirements/gpu_hpi.txt @@ -12,26 +12,45 @@ charset-normalizer==3.4.1 # via # -c requirements/gpu.txt # requests +coloredlogs==15.0.1 + # via onnxruntime +flatbuffers==25.9.23 + # via onnxruntime +humanfriendly==10.0 + # via coloredlogs idna==3.10 # via # -c requirements/gpu.txt # requests +mpmath==1.3.0 + # via sympy numpy==1.24.4 # via # -c requirements/gpu.txt # onnx + # onnx-graphsurgeon + # onnxruntime # opencv-contrib-python # ultra-infer-gpu-python onnx==1.17.0 # via + # onnx-graphsurgeon # onnxoptimizer # paddle2onnx +onnx-graphsurgeon==0.5.6 + # via -r requirements/gpu_hpi.in 
onnxoptimizer==0.3.13 # via paddle2onnx +onnxruntime==1.22.0 + # via -r requirements/gpu_hpi.in opencv-contrib-python==4.10.0.84 # via # -c requirements/gpu.txt # ultra-infer-gpu-python +packaging==24.2 + # via + # -c requirements/gpu.txt + # onnxruntime paddle2onnx==2.0.2rc3 # via -r requirements/gpu_hpi.in polygraphy==0.49.20 @@ -40,6 +59,7 @@ protobuf==5.29.3 # via # -c requirements/gpu.txt # onnx + # onnxruntime pyyaml==6.0.2 # via # -c requirements/gpu.txt @@ -48,6 +68,8 @@ requests==2.32.3 # via # -c requirements/gpu.txt # ultra-infer-gpu-python +sympy==1.14.0 + # via onnxruntime tqdm==4.67.1 # via # -c requirements/gpu.txt diff --git a/deploy/hps/server_env/scripts/tag_and_push_images.sh b/deploy/hps/server_env/scripts/tag_and_push_images.sh index e7b114565d..fc334a13d5 100755 --- a/deploy/hps/server_env/scripts/tag_and_push_images.sh +++ b/deploy/hps/server_env/scripts/tag_and_push_images.sh @@ -5,7 +5,7 @@ paddlex_version="$(cat ../../../paddlex/.version)" for device_type in 'gpu' 'cpu'; do version="$(cat "${device_type}_version.txt")" docker push "ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/hps:latest-${device_type}" - for tag in "${version}-${device_type}" "paddlex${paddlex_version%.*}-${device_type}"; do + for tag in "${version}-paddlex${paddlex_version}-${device_type}" "paddlex${paddlex_version%.*}-${device_type}"; do docker tag "ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/hps:latest-${device_type}" "ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/hps:${tag}" docker push "ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/hps:${tag}" done diff --git a/docs/FAQ.md b/docs/FAQ.md index 088d500d5d..7416aba52b 100644 --- a/docs/FAQ.md +++ b/docs/FAQ.md @@ -54,6 +54,11 @@ A:可以: 2. 设置全局预训练模型缓存路径,例如:`paddlex.pretrain_dir='/usrname/paddlex'`,已下载模型将不会重复下载。 +## Q:每次导入`paddlex`都会卡住一会,为什么? + +1. 因为每次启动,`paddlex`会默认自动测试模型托管平台的网络联通性(包括huggingface、aistudio、modelscope),以确定后续自动下载模型时选择哪个平台; +2. 
如果确定使用本地模型,不需要测试检查,可以设置环境变量`PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK=1`来禁用; + ## Q:当我在使用PaddleX的过程中遇到问题,应该怎样反馈呢? diff --git a/docs/installation/installation.en.md b/docs/installation/installation.en.md index 795d901dde..267a5e6729 100644 --- a/docs/installation/installation.en.md +++ b/docs/installation/installation.en.md @@ -148,16 +148,16 @@ If your Docker version >= 19.03, please use: ```bash # For CPU -docker run --name paddlex -v $PWD:/paddle --shm-size=8g --network=host -it ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/paddlex:paddlex3.3.4-paddlepaddle3.2.0-cpu /bin/bash +docker run --name paddlex -v $PWD:/paddle --shm-size=8g --network=host -it ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/paddlex:paddlex3.3.11-paddlepaddle3.2.0-cpu /bin/bash # gpu,requires GPU driver version ≥450.80.02 (Linux) or ≥452.39 (Windows) -docker run --gpus all --name paddlex -v $PWD:/paddle --shm-size=8g --network=host -it ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/paddlex:paddlex3.3.4-paddlepaddle3.2.0-gpu-cuda11.8-cudnn8.9-trt8.6 /bin/bash +docker run --gpus all --name paddlex -v $PWD:/paddle --shm-size=8g --network=host -it ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/paddlex:paddlex3.3.11-paddlepaddle3.2.0-gpu-cuda11.8-cudnn8.9-trt8.6 /bin/bash # gpu,requires GPU driver version ≥545.23.06(Linux) or ≥545.84(Windows) -docker run --gpus all --name paddlex -v $PWD:/paddle --shm-size=8g --network=host -it ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/paddlex:paddlex3.3.4-paddlepaddle3.2.0-gpu-cuda12.6-cudnn9.5 /bin/bash +docker run --gpus all --name paddlex -v $PWD:/paddle --shm-size=8g --network=host -it ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/paddlex:paddlex3.3.11-paddlepaddle3.2.0-gpu-cuda12.6-cudnn9.5 /bin/bash # gpu,requires GPU driver version ≥550.xx -docker run --gpus all --name paddlex -v $PWD:/paddle --shm-size=8g --network=host -it ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/paddlex:paddlex3.3.4-paddlepaddle3.2.0-gpu-cuda12.9-cudnn9.9 /bin/bash +docker run --gpus all 
--name paddlex -v $PWD:/paddle --shm-size=8g --network=host -it ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/paddlex:paddlex3.3.11-paddlepaddle3.2.0-gpu-cuda12.9-cudnn9.9 /bin/bash ``` * If your Docker version <= 19.03 and >= 17.06, please use: @@ -165,17 +165,17 @@ docker run --gpus all --name paddlex -v $PWD:/paddle --shm-size=8g --network=hos
Click Here
# For CPU
-docker run --name paddlex -v $PWD:/paddle --shm-size=8g --network=host -it ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/paddlex:paddlex3.3.4-paddlepaddle3.2.0-cpu /bin/bash
+docker run --name paddlex -v $PWD:/paddle --shm-size=8g --network=host -it ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/paddlex:paddlex3.3.11-paddlepaddle3.2.0-cpu /bin/bash
 
 # For GPU
 # gpu,requires GPU driver version ≥450.80.02 (Linux) or ≥452.39 (Windows)
-nvidia-docker run --name paddlex -v $PWD:/paddle --shm-size=8g --network=host -it ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/paddlex:paddlex3.3.4-paddlepaddle3.2.0-gpu-cuda11.8-cudnn8.9-trt8.6 /bin/bash
+nvidia-docker run --name paddlex -v $PWD:/paddle --shm-size=8g --network=host -it ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/paddlex:paddlex3.3.11-paddlepaddle3.2.0-gpu-cuda11.8-cudnn8.9-trt8.6 /bin/bash
 
 # gpu,requires GPU driver version ≥545.23.06(Linux) or ≥545.84(Windows)
-nvidia-docker run --name paddlex -v $PWD:/paddle --shm-size=8g --network=host -it ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/paddlex:paddlex3.3.4-paddlepaddle3.2.0-gpu-cuda12.6-cudnn9.5 /bin/bash
+nvidia-docker run --name paddlex -v $PWD:/paddle --shm-size=8g --network=host -it ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/paddlex:paddlex3.3.11-paddlepaddle3.2.0-gpu-cuda12.6-cudnn9.5 /bin/bash
 
 # gpu,requires GPU driver version ≥550.xx
-nvidia-docker run --name paddlex -v $PWD:/paddle --shm-size=8g --network=host -it ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/paddlex:paddlex3.3.4-paddlepaddle3.2.0-gpu-cuda12.9-cudnn9.9 /bin/bash
+nvidia-docker run --name paddlex -v $PWD:/paddle --shm-size=8g --network=host -it ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/paddlex:paddlex3.3.11-paddlepaddle3.2.0-gpu-cuda12.9-cudnn9.9 /bin/bash
 
 
diff --git a/docs/installation/installation.md b/docs/installation/installation.md index 46ab4195e9..cc6f35bfff 100644 --- a/docs/installation/installation.md +++ b/docs/installation/installation.md @@ -155,17 +155,17 @@ paddlex --install PaddleXXX # 例如PaddleOCR ```bash # 对于 CPU 用户 -docker run --name paddlex -v $PWD:/paddle --shm-size=8g --network=host -it ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/paddlex:paddlex3.3.4-paddlepaddle3.2.0-cpu /bin/bash +docker run --name paddlex -v $PWD:/paddle --shm-size=8g --network=host -it ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/paddlex:paddlex3.3.11-paddlepaddle3.2.0-cpu /bin/bash # 对于 GPU 用户 # GPU 版本,需显卡驱动程序版本 ≥450.80.02(Linux)或 ≥452.39(Windows) -docker run --gpus all --name paddlex -v $PWD:/paddle --shm-size=8g --network=host -it ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/paddlex:paddlex3.3.4-paddlepaddle3.2.0-gpu-cuda11.8-cudnn8.9-trt8.6 /bin/bash +docker run --gpus all --name paddlex -v $PWD:/paddle --shm-size=8g --network=host -it ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/paddlex:paddlex3.3.11-paddlepaddle3.2.0-gpu-cuda11.8-cudnn8.9-trt8.6 /bin/bash # GPU 版本,需显卡驱动程序版本 ≥545.23.06(Linux)或 ≥545.84(Windows) -docker run --gpus all --name paddlex -v $PWD:/paddle --shm-size=8g --network=host -it ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/paddlex:paddlex3.3.4-paddlepaddle3.2.0-gpu-cuda12.6-cudnn9.5 /bin/bash +docker run --gpus all --name paddlex -v $PWD:/paddle --shm-size=8g --network=host -it ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/paddlex:paddlex3.3.11-paddlepaddle3.2.0-gpu-cuda12.6-cudnn9.5 /bin/bash # GPU 版本,需显卡驱动程序版本 ≥550.xx -docker run --gpus all --name paddlex -v $PWD:/paddle --shm-size=8g --network=host -it ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/paddlex:paddlex3.3.4-paddlepaddle3.2.0-gpu-cuda12.9-cudnn9.9 /bin/bash +docker run --gpus all --name paddlex -v $PWD:/paddle --shm-size=8g --network=host -it 
ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/paddlex:paddlex3.3.11-paddlepaddle3.2.0-gpu-cuda12.9-cudnn9.9 /bin/bash ``` @@ -174,16 +174,16 @@ docker run --gpus all --name paddlex -v $PWD:/paddle --shm-size=8g --network=hos
点击展开
# 对于 CPU 用户
-docker run --name paddlex -v $PWD:/paddle --shm-size=8g --network=host -it ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/paddlex:paddlex3.3.4-paddlepaddle3.2.0-cpu /bin/bash
+docker run --name paddlex -v $PWD:/paddle --shm-size=8g --network=host -it ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/paddlex:paddlex3.3.11-paddlepaddle3.2.0-cpu /bin/bash
 
 # GPU 版本,需显卡驱动程序版本 ≥450.80.02(Linux)或 ≥452.39(Windows)
-nvidia-docker run --name paddlex -v $PWD:/paddle --shm-size=8g --network=host -it ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/paddlex:paddlex3.3.4-paddlepaddle3.2.0-gpu-cuda11.8-cudnn8.9-trt8.6 /bin/bash
+nvidia-docker run --name paddlex -v $PWD:/paddle --shm-size=8g --network=host -it ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/paddlex:paddlex3.3.11-paddlepaddle3.2.0-gpu-cuda11.8-cudnn8.9-trt8.6 /bin/bash
 
 # GPU 版本,需显卡驱动程序版本 ≥545.23.06(Linux)或 ≥545.84(Windows)
-nvidia-docker run --name paddlex -v $PWD:/paddle --shm-size=8g --network=host -it ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/paddlex:paddlex3.3.4-paddlepaddle3.2.0-gpu-cuda12.6-cudnn9.5 /bin/bash
+nvidia-docker run --name paddlex -v $PWD:/paddle --shm-size=8g --network=host -it ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/paddlex:paddlex3.3.11-paddlepaddle3.2.0-gpu-cuda12.6-cudnn9.5 /bin/bash
 
 # GPU 版本,需显卡驱动程序版本 ≥550.xx
-nvidia-docker run --name paddlex -v $PWD:/paddle --shm-size=8g --network=host -it ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/paddlex:paddlex3.3.4-paddlepaddle3.2.0-gpu-cuda12.9-cudnn9.9 /bin/bash
+nvidia-docker run --name paddlex -v $PWD:/paddle --shm-size=8g --network=host -it ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/paddlex:paddlex3.3.11-paddlepaddle3.2.0-gpu-cuda12.9-cudnn9.9 /bin/bash
 
 
diff --git a/docs/module_usage/tutorials/speech_modules/text_to_pinyin.en.md b/docs/module_usage/tutorials/speech_modules/text_to_pinyin.en.md new file mode 100644 index 0000000000..2e4c7c3571 --- /dev/null +++ b/docs/module_usage/tutorials/speech_modules/text_to_pinyin.en.md @@ -0,0 +1,182 @@ +--- +comments: true +--- + +# Tutorial for Text To Pinyin Module + +## I. Overview +Text to Pinyin is commonly used in the frontend of TTS to convert input Chinese text into a phonetic sequence with tones, providing pronunciation basis for subsequent acoustic models and audio generation. + +## II. Supported Model List + + + + + + + + + + + + + + +
ModelDownload linkModel sizeIntroduction
G2PWModelG2PWModel606M g2pW is an open-source text to pinyin model, commonly used in the frontend of TTS. It converts input Chinese text into a tonal Pinyin sequence, providing pronunciation basis for subsequent acoustic models and audio generation
+ +## III. Quick Integration +Before quick integration, you need to install the PaddleX wheel package. For the installation method, please refer to the [PaddleX Local Installation Tutorial](../../../installation/installation.en.md). After installing the wheel package, a few lines of code can complete the inference of the text to pinyin module. You can switch models under this module freely, and you can also integrate the model inference of the text to pinyin module into your project. + + +```python +from paddlex import create_model +model = create_model(model_name="G2PWModel") +output = model.predict(input="欢迎使用飞桨", batch_size=1) +for res in output: + res.print() + res.save_to_json(save_path="./output/res.json") +``` + +After running, the result obtained is: + +```bash +{'res': {'input_path': '欢迎使用飞桨', 'result': ['huan1', 'ying2', 'shi3', 'yong4', 'fei1', 'jiang3']}} +``` + +The meanings of the runtime parameters are as follows: +- `input_path`: The storage path of the input text. +- `result`: Pinyin converted from the input text. + +Related methods, parameters, and explanations are as follows: +* `create_model` for text to pinyin model, with specific explanations as follows: + + + + + + + + + + + + + + + + + + + + + + + + +
ParameterDescriptionTypeOptionsDefault Value
model_nameThe name of the modelstrG2PWModelG2PWModel
model_dirThe storage path of the modelstrNoneNone
+ +* The `model_name` must be specified. After specifying `model_name`, the built-in model parameters of PaddleX are used by default. If `model_dir` is specified, the user-defined model is used. + +* The `predict()` method of the text to pinyin model is called for inference and prediction. The parameters of the `predict()` method are `input` and `batch_size`, with specific explanations as follows: + + + + + + + + + + + + + + + + + + + + + + + + + +
ParameterDescriptionTypeOptionsDefault Value
inputData to be predictedstr +
    + Input text, such as: 欢迎使用飞桨 +
+
None
batch_sizeBatch sizeintCurrently only supports 11
+ +* The prediction results are processed as `dict` type for each sample and support the operation of saving as a `json` file: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
MethodDescriptionParameterParameter TypeParameter DescriptionDefault Value
print()Print the result to the terminalformat_jsonboolWhether to format the output content with JSON indentationTrue
indentintSpecify the indentation level to beautify the output JSON data, making it more readable. This is only effective when format_json is True4
ensure_asciiboolControl whether to escape non-ASCII characters to Unicode. When set to True, all non-ASCII characters will be escaped; False retains the original characters. This is only effective when format_json is TrueFalse
save_to_json()Save the result as a file in json formatsave_pathstrThe file path for saving. When it is a directory, the saved file name will match the input file nameNone
indentintSpecify the indentation level to beautify the output JSON data, making it more readable. This is only effective when format_json is True4
ensure_asciiboolControl whether to escape non-ASCII characters to Unicode. When set to True, all non-ASCII characters will be escaped; False retains the original characters. This is only effective when format_json is TrueFalse
+ +* Additionally, the prediction results can also be obtained through attributes, as follows: + + + + + + + + + + + + +
AttributeDescription
jsonGet the prediction result in json format
+ +For more information on using PaddleX's single-model inference APIs, please refer to the [PaddleX Single-Model Python Script Usage Instructions](../../instructions/model_python_API.en.md). diff --git a/docs/module_usage/tutorials/speech_modules/text_to_pinyin.md b/docs/module_usage/tutorials/speech_modules/text_to_pinyin.md new file mode 100644 index 0000000000..2481170342 --- /dev/null +++ b/docs/module_usage/tutorials/speech_modules/text_to_pinyin.md @@ -0,0 +1,179 @@ +--- +comments: true +--- + +# 文本转拼音模块使用教程 + +## 一、概述 +文本到拼音常用于语音合成的前端,将输入的中文文本转换为带声调的拼音序列,为后续的声学模型和模型生成提供发音依据。 + +## 二、支持模型列表 + + + + + + + + + + + + + + +
模型模型下载链接模型大小介绍
G2PWModelG2PWModel606M g2pW 开源的文本到拼音模型,常用于语音合成的前端,将输入的中文文本转换为带声调的拼音序列,为后续的声学模型和音频生成提供发音依据
+ +## 三、快速集成 +在快速集成前,首先需要安装 PaddleX 的 wheel 包,wheel的安装方式请参考[PaddleX本地安装教程](../../../installation/installation.md)。完成 wheel 包的安装后,几行代码即可完成文本转拼音模块的推理,可以任意切换该模块下的模型,您也可以将文本转拼音模块中的模型推理集成到您的项目中。 + +```python +from paddlex import create_model +model = create_model(model_name="G2PWModel") +output = model.predict(input="欢迎使用飞桨", batch_size=1) +for res in output: + res.print() + res.save_to_json(save_path="./output/res.json") +``` +运行后,得到的结果为: +```bash +{'res': {'input_path': '欢迎使用飞桨', 'result': ['huan1', 'ying2', 'shi3', 'yong4', 'fei1', 'jiang3']}} +``` +运行结果参数含义如下: +- `input_path`: 输入文本 +- `result`: 输入文本转换后的拼音 + +相关方法、参数等说明如下: +* `create_model`文本转拼音模型,具体说明如下: + + + + + + + + + + + + + + + + + + + + + + + + +
参数参数说明参数类型可选项默认值
model_name模型名称strG2PWModelG2PWModel
model_dir模型存储路径str
+ +* 其中,`model_name` 必须指定,指定 `model_name` 后,默认使用 PaddleX 内置的模型参数,在此基础上,指定 `model_dir` 时,使用用户自定义的模型。 + +* 调用文本转拼音模型的 `predict()` 方法进行推理预测,`predict()` 方法参数有 `input` 和 `batch_size`,具体说明如下: + + + + + + + + + + + + + + + + + + + + + + + + + +
参数参数说明参数类型可选项默认值
input待预测数据str +
    + 对应文本,如:欢迎使用飞桨 +
+
无
batch_size批大小int目前仅支持11
+ +* 对预测结果进行处理,每个样本的预测结果均为对应的Result对象,支持保存为`json`文件的操作: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
方法方法说明参数参数类型参数说明默认值
print()打印结果到终端format_jsonbool是否对输出内容进行使用 JSON 缩进格式化True
indentint指定缩进级别,以美化输出的 JSON 数据,使其更具可读性,仅当 format_jsonTrue 时有效4
ensure_asciibool控制是否将非 ASCII 字符转义为 Unicode。设置为 True 时,所有非 ASCII 字符将被转义;False 则保留原始字符,仅当format_jsonTrue时有效False
save_to_json()将结果保存为json格式的文件save_pathstr保存的文件路径,当为目录时,保存文件命名与输入文件类型命名一致无
indentint指定缩进级别,以美化输出的 JSON 数据,使其更具可读性,仅当 format_jsonTrue 时有效4
ensure_asciibool控制是否将非 ASCII 字符转义为 Unicode。设置为 True 时,所有非 ASCII 字符将被转义;False 则保留原始字符,仅当format_jsonTrue时有效False
+ +* 此外,也支持通过属性获取预测结果,具体如下: + + + + + + + + + + + + + +
属性属性说明
json获取预测的json格式的结果
+ +关于更多 PaddleX 的单模型推理的 API 的使用方法,可以参考[PaddleX单模型Python脚本使用说明](../../instructions/model_python_API.md)。 diff --git a/docs/module_usage/tutorials/speech_modules/text_to_speech_acoustic.en.md b/docs/module_usage/tutorials/speech_modules/text_to_speech_acoustic.en.md new file mode 100644 index 0000000000..21c3cbda2f --- /dev/null +++ b/docs/module_usage/tutorials/speech_modules/text_to_speech_acoustic.en.md @@ -0,0 +1,206 @@ +--- +comments: true +--- + +# 语音合成声学模型使用教程 + +## I. Overview +The acoustic model for speech synthesis is the core component of speech synthesis technology. Its key characteristic lies in utilizing deep learning and other techniques to transform text into lifelike voice output while enabling fine-grained control over features such as speech rate and prosody. It is primarily applied in fields such as intelligent voice assistants, navigation announcements, and film and television dubbing. + +## Supported Model List + +### Fastspeech Model + + + + + + + + + + + + + + + +
ModelDownload linktraining dataModel Storage Size (MB)Introduction
fastspeech2_csmscfastspeech2_csmsc\157MFastSpeech2 is an end-to-end text-to-speech (TTS) model developed by Microsoft, featuring efficient and stable prosody control. It adopts a non-autoregressive architecture that enables fast and high-quality speech synthesis, suitable for various scenarios such as virtual assistants and audiobooks.
+ +## 3. Quick Integration +Before quick integration, first install the PaddleX wheel package. For wheel installation methods, please refer to [PaddleX Local Installation Tutorial](../../../installation/installation.md). After installing the wheel package, inference for the multilingual speech synthesis acoustic module can be completed with just a few lines of code. You can freely switch models within this module, or integrate model inference from the multilingual speech synthesis module into your project. + + +```python +from paddlex import create_model +model = create_model(model_name="fastspeech2_csmsc") +output = model.predict(input=[151, 120, 182, 82, 182, 82, 174, 75, 262, 51, 37, 186, 38, 233]. , batch_size=1) +for res in output: + res.print() + res.save_to_json(save_path="./output/res.json") +``` +After running, the results are: +```bash +{'result': array([[-2.96321 , ..., -4.552117 ], + ..., + [-2.0465052, ..., -3.695221 ]], dtype=float32)} +``` + +The meanings of the running result parameters are as follows: + +- `input_path`: Input audio storage path +- `result`: Output mel spectrogram result + + +Explanations of related methods and parameters are as follows: + +* `create_model`multilingual recognition model (using `fastspeech2_csmsc` as an example), details are as follows: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
ParameterParameter DescriptionTypeOptionsDefault Value
model_nameThe name of the modelstrfastspeech2_csmscNone
model_dirThe storage path of the modelstrNoneNone
deviceModel inference devicestrSupports specifying specific GPU card numbers (e.g. "gpu:0"), other hardware card numbers (e.g. "npu:0"), or CPU (e.g. "cpu")gpu:0
use_hpipWhether to enable high-performance inference plugin. Currently not supported.boolNoneFalse
hpi_configHigh-performance inference configuration. Currently not supported.dict | NoneNoneNone
+ +* Among these, `model_name` must be specified. After specifying `model_name`, PaddleX's built-in model parameters are used by default. When `model_dir` is specified, the user-defined model is used. +* Call the multilingual speech synthesis model's `predict()` method for inference prediction. The `predict()` method parameters include `input` and `batch_size`, with details as follows: + + + + + + + + + + + + + + + + + + + + + + + + +
ParameterParameter DescriptionTypeOptionsDefault
inputData to be predictedstr +Currently only supports tensor-type input_phone_ids, such as [151, 120, 182, 82, 182, 82, 174, 75, 262, 51, 37, 186], etc. +None
batch_sizebatch sizeintCurrently only 1 is supported1
+ +* Process the prediction results. The prediction result for each sample is a corresponding Result object, which supports saving as `json` files: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
MethodMethod DescriptionParameterTypeParameter DescriptionDefault
print()Print results to terminalformat_jsonboolWhether to format the output content with JSON indentationTrue
indentintSpecifies the indentation level to beautify output JSON data for better readability. Only effective when format_json is True4
ensure_asciiboolControls whether to escape non-ASCII characters to Unicode. When set to True, all non-ASCII characters will be escaped; False preserves original characters. Only effective when format_json is TrueFalse
save_to_json()Save results as json filesave_pathstrFile save path. When it is a directory, the file name follows the input file type naming conventionNone
indentintSpecifies the indentation level to beautify output JSON data for better readability. Only effective when format_json is True</td> +4
ensure_asciiboolControls whether to escape non-ASCII characters to Unicode. When set to True, all non-ASCII characters will be escaped; False preserves original characters. Only effective when format_json is TrueFalse
+ +* Additionally, the prediction results can also be obtained through attributes: + + + + + + + + + + + + + +
AttributeDescription
jsonGet prediction results in json format
+ +For more information on using PaddleX's single-model inference APIs, please refer to the [PaddleX Single-Model Python Script Usage Instructions](../../instructions/model_python_API.en.md). diff --git a/docs/module_usage/tutorials/speech_modules/text_to_speech_acoustic.md b/docs/module_usage/tutorials/speech_modules/text_to_speech_acoustic.md new file mode 100644 index 0000000000..7373d7a06a --- /dev/null +++ b/docs/module_usage/tutorials/speech_modules/text_to_speech_acoustic.md @@ -0,0 +1,204 @@ +--- +comments: true +--- + +# 语音合成声学模型使用教程 + +## 一、概述 +语音合成声学模型是语音合成技术的核心组件,其特点在于通过深度学习等技术将文本转化为逼真的语音输出,并支持细粒度控制语速、韵律等特征。主要应用于智能语音助手、导航播报、影视配音等领域。 + +## 二、支持模型列表 + +### Fastspeech Model + + + + + + + + + + + + + + + +
模型模型下载链接训练数据模型存储大小(MB)介绍
fastspeech2_csmscfastspeech2_csmsc\157MFastSpeech2 是微软开发的端到端文本转语音(TTS)模型,具备高效稳定的韵律控制能力。它采用非自回归架构,能实现快速高质量的语音合成,适用于虚拟助手、有声读物等多种场景。
+ +## 三、快速集成 +在快速集成前,首先需要安装 PaddleX 的 wheel 包,wheel的安装方式请参考[PaddleX本地安装教程](../../../installation/installation.md)。完成 wheel 包的安装后,几行代码即可完成多语种语音合成声学模块的推理,可以任意切换该模块下的模型,您也可以将多语种语音合成模块中的模型推理集成到您的项目中。 + + +```python +from paddlex import create_model +model = create_model(model_name="fastspeech2_csmsc") +output = model.predict(input=[151, 120, 182, 82, 182, 82, 174, 75, 262, 51, 37, 186, 38, 233]. , batch_size=1) +for res in output: + res.print() + res.save_to_json(save_path="./output/res.json") +``` +运行后,得到的结果为: +```bash +{'result': array([[-2.96321 , ..., -4.552117 ], + ..., + [-2.0465052, ..., -3.695221 ]], dtype=float32)} +``` +运行结果参数含义如下: +- `input_path`: 输入音频存放路径 +- `result`: 输出mel谱结果 + +相关方法、参数等说明如下: +* `create_model`多语种识别模型(此处以`fastspeech2_csmsc`为例),具体说明如下: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
参数参数说明参数类型可选项默认值
model_name模型名称strfastspeech2_csmsc
model_dir模型存储路径str
device模型推理设备str支持指定GPU具体卡号,如“gpu:0”,其他硬件具体卡号,如“npu:0”,CPU如“cpu”。gpu:0
use_hpip是否启用高性能推理插件。目前暂不支持。boolFalse
hpi_config高性能推理配置。目前暂不支持。dict | NoneNone
+ +* 其中,`model_name` 必须指定,指定 `model_name` 后,默认使用 PaddleX 内置的模型参数,在此基础上,指定 `model_dir` 时,使用用户自定义的模型。 + +* 调用多语种语音识别模型的 `predict()` 方法进行推理预测,`predict()` 方法参数有 `input` 和 `batch_size`,具体说明如下: + + + + + + + + + + + + + + + + + + + + + + + + + +
参数参数说明参数类型可选项默认值
input待预测数据str + 输入的input_phone_ids, 目前只支持tensor类型,如[151, 120, 182, 82, 182, 82, 174, 75, 262, 51, 37, 186]等 +
batch_size批大小int目前仅支持11
+ +* 对预测结果进行处理,每个样本的预测结果均为对应的Result对象,支持保存为`json`文件的操作: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
方法方法说明参数参数类型参数说明默认值
print()打印结果到终端format_jsonbool是否对输出内容使用 JSON 缩进格式化True
indentint指定缩进级别,以美化输出的 JSON 数据,使其更具可读性,仅当 format_jsonTrue 时有效4
ensure_asciibool控制是否将非 ASCII 字符转义为 Unicode。设置为 True 时,所有非 ASCII 字符将被转义;False 则保留原始字符,仅当format_jsonTrue时有效False
save_to_json()将结果保存为json格式的文件save_pathstr保存的文件路径,当为目录时,保存文件命名与输入文件类型命名一致
indentint指定缩进级别,以美化输出的 JSON 数据,使其更具可读性,仅当 format_jsonTrue 时有效4
ensure_asciibool控制是否将非 ASCII 字符转义为 Unicode。设置为 True 时,所有非 ASCII 字符将被转义;False 则保留原始字符,仅当format_jsonTrue时有效False
+ +* 此外,也支持通过属性获取预测结果,具体如下: + + + + + + + + + + + + + +
属性属性说明
json获取预测的json格式的结果
+ +关于更多 PaddleX 的单模型推理的 API 的使用方法,可以参考[PaddleX单模型Python脚本使用说明](../../instructions/model_python_API.md)。 diff --git a/docs/module_usage/tutorials/speech_modules/text_to_speech_vocoder.en.md b/docs/module_usage/tutorials/speech_modules/text_to_speech_vocoder.en.md new file mode 100644 index 0000000000..18e1177a2d --- /dev/null +++ b/docs/module_usage/tutorials/speech_modules/text_to_speech_vocoder.en.md @@ -0,0 +1,207 @@ +--- +comments: true +--- + +# Tutorial for Text to Speech Vocoder + +## I. Overview +The speech synthesis vocoder is a key component of speech synthesis technology. It utilizes signal processing algorithms to convert spectral parameters generated by acoustic models into playable waveform audio, while supporting adjustments of audio characteristics such as timbre and pitch. It is primarily applied in backend processing of speech synthesis systems, audio effect optimization, and personalized voice generation. + +## II. Supported Model List + +### Fastspeech Model + + + + + + + + + + + + + + + +
ModelDownload linkTraining DataModel Storage Size (MB)Introduction
pwgan_csmscpwgan_csmsc\5.1MParallel WaveGAN (PWGAN) is an end-to-end speech synthesis vocoder developed by Nagoya University, Japan, featuring efficient and stable waveform generation capabilities. It adopts a generative adversarial network architecture that enables fast and high-fidelity speech reconstruction, suitable for various scenarios such as TTS system backend processing and speech enhancement.
+ +## III. Quick Integration +Before quick integration, first install the PaddleX wheel package. For wheel installation methods, please refer to [PaddleX Local Installation Tutorial](../../../installation/installation.md). After installing the wheel package, inference for the multilingual speech synthesis acoustic module can be completed with just a few lines of code. You can freely switch models within this module, or integrate model inference from the multilingual speech synthesis module into your project. +Before running the following code, please download the [sample npy](https://paddlespeech.bj.bcebos.com/demos/paddlex/mel.npy){target="_blank"} to your local machine. + +```python +from paddlex import create_model +model = create_model(model_name="pwgan_csmsc") +output = model.predict(input='./mel.npy'. , batch_size=1) +for res in output: + res.print() + res.save_to_json(save_path="./output/res.json") +``` +After running, the results are +```bash +{'result': array([-9.085755e-04, ..., 8.858787e-05], dtype=float32)} +``` +The meanings of the running result parameters are as follows: +- `result`: Output wav audio data, of type `numpy.ndarray`。 + +Explanations of related methods and parameters are as follows: +* `create_model`speech synthesis vocoder (using `pwgan_csmsc` as an example), details are as follows: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
ParameterParameter DescriptionTypeOptionsDefault Value
model_nameThe name of the modelstrpwgan_csmscNone
model_dirThe storage path of the modelstrNoneNone
deviceModel inference devicestrSupports specifying specific GPU card numbers (e.g. "gpu:0"), other hardware card numbers (e.g. "npu:0"), or CPU (e.g. "cpu")gpu:0
use_hpipWhether to enable high-performance inference plugin. Currently not supported.boolNoneFalse
hpi_configHigh-performance inference configuration. Currently not supported.dict | NoneNoneNone
+ +* Among these, `model_name` must be specified. After specifying `model_name`, PaddleX's built-in model parameters are used by default. When `model_dir` is specified, the user-defined model is used. +* Call the multilingual speech synthesis model's `predict()` method for inference prediction. The `predict()` method parameters include `input` and `batch_size`, with details as follows: + + + + + + + + + + + + + + + + + + + + + + + + + +
ParameterParameter DescriptionTypeOptionsDefault
inputData to be predictedstr +Input mel spectrogram, can be npy file path or tensor type. +None
batch_sizeBatch sizeintCurrently only 1 is supported1
+ +* Process the prediction results. The prediction result for each sample is a corresponding Result object, which supports saving as `json` files: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
MethodMethod DescriptionParameterTypeParameter DescriptionDefault
print()Print results to terminalformat_jsonboolWhether to format the output content with JSON indentationTrue
indentintSpecifies the indentation level to beautify output JSON data for better readability. Only effective when format_json is True4
ensure_asciiboolControls whether to escape non-ASCII characters to Unicode. When set to True, all non-ASCII characters will be escaped; False preserves original characters. Only effective when format_json is TrueFalse
save_to_json()Save results as json filesave_pathstrFile save path. When it is a directory, the file name follows the input file type naming conventionNone
indentintSpecifies the indentation level to beautify output JSON data for better readability. Only effective when format_json is True</td> +4
ensure_asciiboolControls whether to escape non-ASCII characters to Unicode. When set to True, all non-ASCII characters will be escaped; False preserves original characters. Only effective when format_json is TrueFalse
save_to_audio()Save results as wav format filesave_pathstrFile save path. When it is a directory, the file name follows the input file type naming conventionNone
+ +* Additionally, the prediction results can also be obtained through attributes: + + + + + + + + + + + + + +
AttributeAttribute Description
jsonGet prediction results in json format
+ +For more information on using PaddleX's single-model inference APIs, please refer to the [PaddleX Single-Model Python Script Usage Instructions](../../instructions/model_python_API.en.md). diff --git a/docs/module_usage/tutorials/speech_modules/text_to_speech_vocoder.md b/docs/module_usage/tutorials/speech_modules/text_to_speech_vocoder.md new file mode 100644 index 0000000000..ab89d9f269 --- /dev/null +++ b/docs/module_usage/tutorials/speech_modules/text_to_speech_vocoder.md @@ -0,0 +1,189 @@ +--- +comments: true +--- + +# 语音合成声学模型使用教程 + +## 一、概述 +语音合成声码器是语音合成技术的关键组件,其特点在于利用信号处理算法将声学模型生成的频谱参数转化为可播放的波形音频,并支持调整音色、音高等音质特征。主要应用于语音合成系统的后端处理、音效优化及个性化语音生成等领域。 + +## 二、支持模型列表 + +### Fastspeech Model + + + + + + + + + + + + + + + + +
模型模型下载链接训练数据模型存储大小(MB)介绍
pwgan_csmscpwgan_csmsc\5.1MParallel WaveGAN(PWGAN)是日本名古屋大学开发的端到端语音合成声码器,具备高效稳定的波形生成能力。它采用生成对抗网络架构,能实现快速高保真的语音重建,适用于TTS系统后端处理、语音增强等多种场景。
+ +## 三、快速集成 +在快速集成前,首先需要安装 PaddleX 的 wheel 包,wheel的安装方式请参考[PaddleX本地安装教程](../../../installation/installation.md)。完成 wheel 包的安装后,几行代码即可完成多语种语音合成声学模块的推理,可以任意切换该模块下的模型,您也可以将多语种语音合成模块中的模型推理集成到您的项目中。 +运行以下代码前,请您下载[示例npy](https://paddlespeech.bj.bcebos.com/demos/paddlex/mel.npy)到本地。 + +```python +from paddlex import create_model +model = create_model(model_name="pwgan_csmsc") +output = model.predict(input='./mel.npy'. , batch_size=1) +for res in output: + res.print() + res.save_to_json(save_path="./output/res.json") +``` +运行后,得到的结果为: +```bash +{'result': array([-9.085755e-04, ..., 8.858787e-05], dtype=float32)} +``` +运行结果参数含义如下: +- `result`: 输出wav音频数据,类型为`numpy.ndarray`。 + +相关方法、参数等说明如下: +* `create_model`语音合成声码器(此处以`pwgan_csmsc`为例),具体说明如下: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
参数参数说明参数类型可选项默认值
model_name模型名称strpwgan_csmsc
model_dir模型存储路径str
device模型推理设备str支持指定GPU具体卡号,如“gpu:0”,其他硬件具体卡号,如“npu:0”,CPU如“cpu”。gpu:0
use_hpip是否启用高性能推理插件。目前暂不支持。boolFalse
hpi_config高性能推理配置。目前暂不支持。dict | NoneNone
+ +* 其中,`model_name` 必须指定,指定 `model_name` 后,默认使用 PaddleX 内置的模型参数,在此基础上,指定 `model_dir` 时,使用用户自定义的模型。 + +* 调用多语种语音识别模型的 `predict()` 方法进行推理预测,`predict()` 方法参数有 `input` 和 `batch_size`,具体说明如下: + + + + + + + + + + + + + + + + + + + + + + + + + +
参数参数说明参数类型可选项默认值
input待预测数据str + 输入的mel谱, 可以是npy文件路径或者tensor类型。 +
batch_size批大小int目前仅支持11
+ +* 对预测结果进行处理,每个样本的预测结果均为对应的Result对象,支持保存为`json`文件的操作以及保存为`wav`文件的操作: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
方法方法说明参数参数类型参数说明默认值
print()打印结果到终端format_jsonbool是否对输出内容使用 JSON 缩进格式化True
indentint指定缩进级别,以美化输出的 JSON 数据,使其更具可读性,仅当 format_jsonTrue 时有效4
ensure_asciibool控制是否将非 ASCII 字符转义为 Unicode。设置为 True 时,所有非 ASCII 字符将被转义;False 则保留原始字符,仅当format_jsonTrue时有效False
save_to_audio()将结果保存为wav格式的文件save_pathstr保存的文件路径,当为目录时,保存文件命名与输入文件类型命名一致
+ +* 此外,也支持通过属性获取预测结果,具体如下: + + + + + + + + + + + + + +
属性属性说明
json获取预测的json格式的结果
+ +关于更多 PaddleX 的单模型推理的 API 的使用方法,可以参考[PaddleX单模型Python脚本使用说明](../../instructions/model_python_API.md)。 diff --git a/docs/pipeline_deploy/packaging.en.md b/docs/pipeline_deploy/packaging.en.md index 505306af3e..b83e403ec4 100644 --- a/docs/pipeline_deploy/packaging.en.md +++ b/docs/pipeline_deploy/packaging.en.md @@ -37,7 +37,7 @@ args = parser.parse_args() main_file = args.file user_deps = [dist.metadata["Name"] for dist in importlib.metadata.distributions()] -deps_all = list(paddlex.utils.deps.DEP_SPECS.keys()) +deps_all = list(paddlex.utils.deps.BASE_DEP_SPECS.keys()) deps_need = [dep for dep in user_deps if dep in deps_all] cmd = [ diff --git a/docs/pipeline_deploy/packaging.md b/docs/pipeline_deploy/packaging.md index e9bc7081c9..f653c3374d 100644 --- a/docs/pipeline_deploy/packaging.md +++ b/docs/pipeline_deploy/packaging.md @@ -37,7 +37,7 @@ args = parser.parse_args() main_file = args.file user_deps = [dist.metadata["Name"] for dist in importlib.metadata.distributions()] -deps_all = list(paddlex.utils.deps.DEP_SPECS.keys()) +deps_all = list(paddlex.utils.deps.BASE_DEP_SPECS.keys()) deps_need = [dep for dep in user_deps if dep in deps_all] cmd = [ diff --git a/docs/pipeline_deploy/serving.en.md b/docs/pipeline_deploy/serving.en.md index 3038a92872..975cc995db 100644 --- a/docs/pipeline_deploy/serving.en.md +++ b/docs/pipeline_deploy/serving.en.md @@ -128,135 +128,135 @@ Find the high-stability serving SDK corresponding to the pipeline in the table b PP-ChatOCR-doc v3 -paddlex_hps_PP-ChatOCRv3-doc_sdk.tar.gz +paddlex_hps_PP-ChatOCRv3-doc_sdk.tar.gz General image classification -paddlex_hps_image_classification_sdk.tar.gz +paddlex_hps_image_classification_sdk.tar.gz General object detection -paddlex_hps_object_detection_sdk.tar.gz +paddlex_hps_object_detection_sdk.tar.gz General instance segmentation -paddlex_hps_instance_segmentation_sdk.tar.gz +paddlex_hps_instance_segmentation_sdk.tar.gz General semantic segmentation 
-paddlex_hps_semantic_segmentation_sdk.tar.gz +paddlex_hps_semantic_segmentation_sdk.tar.gz Image multi-label classification -paddlex_hps_image_multilabel_classification_sdk.tar.gz +paddlex_hps_image_multilabel_classification_sdk.tar.gz General image recognition -paddlex_hps_PP-ShiTuV2_sdk.tar.gz +paddlex_hps_PP-ShiTuV2_sdk.tar.gz Pedestrian attribute recognition -paddlex_hps_pedestrian_attribute_recognition_sdk.tar.gz +paddlex_hps_pedestrian_attribute_recognition_sdk.tar.gz Vehicle attribute recognition -paddlex_hps_vehicle_attribute_recognition_sdk.tar.gz +paddlex_hps_vehicle_attribute_recognition_sdk.tar.gz Face recognition -paddlex_hps_face_recognition_sdk.tar.gz +paddlex_hps_face_recognition_sdk.tar.gz Small object detection -paddlex_hps_small_object_detection_sdk.tar.gz +paddlex_hps_small_object_detection_sdk.tar.gz Image anomaly detection -paddlex_hps_anomaly_detection_sdk.tar.gz +paddlex_hps_anomaly_detection_sdk.tar.gz Human keypoint detection -paddlex_hps_human_keypoint_detection_sdk.tar.gz +paddlex_hps_human_keypoint_detection_sdk.tar.gz Open vocabulary detection -paddlex_hps_open_vocabulary_detection_sdk.tar.gz +paddlex_hps_open_vocabulary_detection_sdk.tar.gz Open vocabulary segmentation -paddlex_hps_open_vocabulary_segmentation_sdk.tar.gz +paddlex_hps_open_vocabulary_segmentation_sdk.tar.gz Rotated object detection -paddlex_hps_rotated_object_detection_sdk.tar.gz +paddlex_hps_rotated_object_detection_sdk.tar.gz 3D multi-modal fusion detection -paddlex_hps_3d_bev_detection_sdk.tar.gz +paddlex_hps_3d_bev_detection_sdk.tar.gz General OCR -paddlex_hps_OCR_sdk.tar.gz +paddlex_hps_OCR_sdk.tar.gz General table recognition -paddlex_hps_table_recognition_sdk.tar.gz +paddlex_hps_table_recognition_sdk.tar.gz General table recognition v2 -paddlex_hps_table_recognition_v2_sdk.tar.gz +paddlex_hps_table_recognition_v2_sdk.tar.gz General layout parsing -paddlex_hps_layout_parsing_sdk.tar.gz +paddlex_hps_layout_parsing_sdk.tar.gz PP-StructureV3 
-paddlex_hps_PP-StructureV3_sdk.tar.gz +paddlex_hps_PP-StructureV3_sdk.tar.gz Formula recognition -paddlex_hps_formula_recognition_sdk.tar.gz +paddlex_hps_formula_recognition_sdk.tar.gz Seal text recognition -paddlex_hps_seal_recognition_sdk.tar.gz +paddlex_hps_seal_recognition_sdk.tar.gz Document image preprocessing -paddlex_hps_doc_preprocessor_sdk.tar.gz +paddlex_hps_doc_preprocessor_sdk.tar.gz Time series forecasting -paddlex_hps_ts_forecast_sdk.tar.gz +paddlex_hps_ts_forecast_sdk.tar.gz Time series anomaly detection -paddlex_hps_ts_anomaly_detection_sdk.tar.gz +paddlex_hps_ts_anomaly_detection_sdk.tar.gz Time series classification -paddlex_hps_ts_classification_sdk.tar.gz +paddlex_hps_ts_classification_sdk.tar.gz Multilingual speech recognition -paddlex_hps_multilingual_speech_recognition_sdk.tar.gz +paddlex_hps_multilingual_speech_recognition_sdk.tar.gz General video classification -paddlex_hps_video_classification_sdk.tar.gz +paddlex_hps_video_classification_sdk.tar.gz General video detection -paddlex_hps_video_detection_sdk.tar.gz +paddlex_hps_video_detection_sdk.tar.gz Document understanding -paddlex_hps_doc_understanding_sdk.tar.gz +paddlex_hps_doc_understanding_sdk.tar.gz PaddleOCR-VL -paddlex_hps_PaddleOCR-VL_sdk.tar.gz +paddlex_hps_PaddleOCR-VL_sdk.tar.gz @@ -314,13 +314,13 @@ First, pull the Docker image as needed: - Image supporting deployment with NVIDIA GPU (the machine must have NVIDIA drivers that support CUDA 11.8 installed): ```bash - docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/hps:paddlex3.3-gpu + docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/hps:paddlex3.4-gpu ``` - CPU-only Image: ```bash - docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/hps:paddlex3.3-cpu + docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/hps:paddlex3.4-cpu ``` If you need to build the image on your own, please refer to [the `hps` project 
documentation](https://github.com/PaddlePaddle/PaddleX/blob/develop/deploy/hps/README_en.md#1-image-building) diff --git a/docs/pipeline_deploy/serving.md b/docs/pipeline_deploy/serving.md index 9515cfb919..a8e956e1cb 100644 --- a/docs/pipeline_deploy/serving.md +++ b/docs/pipeline_deploy/serving.md @@ -128,135 +128,135 @@ paddlex --serve --pipeline image_classification --use_hpip 文档场景信息抽取 v3 -paddlex_hps_PP-ChatOCRv3-doc_sdk.tar.gz +paddlex_hps_PP-ChatOCRv3-doc_sdk.tar.gz 通用图像分类 -paddlex_hps_image_classification_sdk.tar.gz +paddlex_hps_image_classification_sdk.tar.gz 通用目标检测 -paddlex_hps_object_detection_sdk.tar.gz +paddlex_hps_object_detection_sdk.tar.gz 通用实例分割 -paddlex_hps_instance_segmentation_sdk.tar.gz +paddlex_hps_instance_segmentation_sdk.tar.gz 通用语义分割 -paddlex_hps_semantic_segmentation_sdk.tar.gz +paddlex_hps_semantic_segmentation_sdk.tar.gz 通用图像多标签分类 -paddlex_hps_image_multilabel_classification_sdk.tar.gz +paddlex_hps_image_multilabel_classification_sdk.tar.gz 通用图像识别 -paddlex_hps_PP-ShiTuV2_sdk.tar.gz +paddlex_hps_PP-ShiTuV2_sdk.tar.gz 行人属性识别 -paddlex_hps_pedestrian_attribute_recognition_sdk.tar.gz +paddlex_hps_pedestrian_attribute_recognition_sdk.tar.gz 车辆属性识别 -paddlex_hps_vehicle_attribute_recognition_sdk.tar.gz +paddlex_hps_vehicle_attribute_recognition_sdk.tar.gz 人脸识别 -paddlex_hps_face_recognition_sdk.tar.gz +paddlex_hps_face_recognition_sdk.tar.gz 小目标检测 -paddlex_hps_small_object_detection_sdk.tar.gz +paddlex_hps_small_object_detection_sdk.tar.gz 图像异常检测 -paddlex_hps_anomaly_detection_sdk.tar.gz +paddlex_hps_anomaly_detection_sdk.tar.gz 人体关键点检测 -paddlex_hps_human_keypoint_detection_sdk.tar.gz +paddlex_hps_human_keypoint_detection_sdk.tar.gz 开放词汇检测 -paddlex_hps_open_vocabulary_detection_sdk.tar.gz +paddlex_hps_open_vocabulary_detection_sdk.tar.gz 开放词汇分割 -paddlex_hps_open_vocabulary_segmentation_sdk.tar.gz +paddlex_hps_open_vocabulary_segmentation_sdk.tar.gz 旋转目标检测 -paddlex_hps_rotated_object_detection_sdk.tar.gz 
+paddlex_hps_rotated_object_detection_sdk.tar.gz 3D 多模态融合检测 -paddlex_hps_3d_bev_detection_sdk.tar.gz +paddlex_hps_3d_bev_detection_sdk.tar.gz 通用 OCR -paddlex_hps_OCR_sdk.tar.gz +paddlex_hps_OCR_sdk.tar.gz 通用表格识别 -paddlex_hps_table_recognition_sdk.tar.gz +paddlex_hps_table_recognition_sdk.tar.gz 通用表格识别 v2 -paddlex_hps_table_recognition_v2_sdk.tar.gz +paddlex_hps_table_recognition_v2_sdk.tar.gz 通用版面解析 -paddlex_hps_layout_parsing_sdk.tar.gz +paddlex_hps_layout_parsing_sdk.tar.gz 通用版面解析 v3 -paddlex_hps_PP-StructureV3_sdk.tar.gz +paddlex_hps_PP-StructureV3_sdk.tar.gz 公式识别 -paddlex_hps_formula_recognition_sdk.tar.gz +paddlex_hps_formula_recognition_sdk.tar.gz 印章文本识别 -paddlex_hps_seal_recognition_sdk.tar.gz +paddlex_hps_seal_recognition_sdk.tar.gz 文档图像预处理 -paddlex_hps_doc_preprocessor_sdk.tar.gz +paddlex_hps_doc_preprocessor_sdk.tar.gz 时序预测 -paddlex_hps_ts_forecast_sdk.tar.gz +paddlex_hps_ts_forecast_sdk.tar.gz 时序异常检测 -paddlex_hps_ts_anomaly_detection_sdk.tar.gz +paddlex_hps_ts_anomaly_detection_sdk.tar.gz 时序分类 -paddlex_hps_ts_classification_sdk.tar.gz +paddlex_hps_ts_classification_sdk.tar.gz 多语种语音识别 -paddlex_hps_multilingual_speech_recognition_sdk.tar.gz +paddlex_hps_multilingual_speech_recognition_sdk.tar.gz 通用视频分类 -paddlex_hps_video_classification_sdk.tar.gz +paddlex_hps_video_classification_sdk.tar.gz 通用视频检测 -paddlex_hps_video_detection_sdk.tar.gz +paddlex_hps_video_detection_sdk.tar.gz 文档理解 -paddlex_hps_doc_understanding_sdk.tar.gz +paddlex_hps_doc_understanding_sdk.tar.gz PaddleOCR-VL -paddlex_hps_PaddleOCR-VL_sdk.tar.gz +paddlex_hps_PaddleOCR-VL_sdk.tar.gz @@ -314,13 +314,13 @@ paddlex --serve --pipeline image_classification --use_hpip - 支持使用 NVIDIA GPU 部署的镜像(机器上需要安装有支持 CUDA 11.8 的 NVIDIA 驱动): ```bash - docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/hps:paddlex3.3-gpu + docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/hps:paddlex3.4-gpu ``` - CPU-only 镜像: ```bash - docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/hps:paddlex3.3-cpu + 
docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/hps:paddlex3.4-cpu ``` 如需自定义构建镜像可参考 [`hps` 项目文档](https://github.com/PaddlePaddle/PaddleX/blob/develop/deploy/hps/README.md#1-镜像构建)。 diff --git a/docs/pipeline_usage/tutorials/ocr_pipelines/PP-StructureV3.en.md b/docs/pipeline_usage/tutorials/ocr_pipelines/PP-StructureV3.en.md index 2ff08a0beb..cf4afa0784 100644 --- a/docs/pipeline_usage/tutorials/ocr_pipelines/PP-StructureV3.en.md +++ b/docs/pipeline_usage/tutorials/ocr_pipelines/PP-StructureV3.en.md @@ -1608,7 +1608,7 @@ In the above Python script, the following steps are executed: None @@ -1626,6 +1626,12 @@ In the above Python script, the following steps are executed: None +format_block_content +Whether to format the content in block_content as Markdown. If set to None, the instantiation value is used; otherwise, this parameter takes precedence. +bool|None + + + layout_threshold Layout model score threshold float|dict|None @@ -1888,11 +1894,17 @@ In the above Python script, the following steps are executed: True + +markdown_ignore_labels +Layout labels that need to be ignored in Markdown. If set to None, the initialized default value will be used: ['number','footnote','header','header_image','footer','footer_image','aside_text'] +list|None + + (3) Process the prediction results: The prediction result of each sample is a corresponding Result object, and it supports operations such as printing, saving as an image, and saving as a `json` file: @@ -1987,6 +1999,12 @@ In the above Python script, the following steps are executed: - `page_index`: `(Union[int, None])` If the input is a PDF file, this indicates which page of the PDF it is; otherwise, it is `None`. + - `page_count`: `(Union[int, None])` If the input is a PDF file, it indicates the total number of pages in the PDF; otherwise, it is `None`. + + - `width`: `(int)` The width of the original input image. + + - `height`: `(int)` The height of the original input image. 
+ - `model_settings`: `(Dict[str, bool])` Model parameters required for configuring the pipeline. - `use_doc_preprocessor`: `(bool)` Controls whether to enable the document preprocessor sub-line. @@ -1995,6 +2013,7 @@ In the above Python script, the following steps are executed: - `use_table_recognition`: `(bool)` Controls whether to enable the table recognition sub-line. - `use_formula_recognition`: `(bool)` Controls whether to enable the formula recognition sub-line. - `format_block_content`: `(bool)` Controls whether to format the `block_content` into Markdown format + - `markdown_ignore_labels`: `(List[str])` Labels of layout regions that need to be ignored in Markdown, defaulting to `['number','footnote','header','header_image','footer','footer_image','aside_text']` - `parsing_res_list`: `(List[Dict])` A list of parsing results, where each element is a dictionary. The order of the list is the reading order after parsing. - `block_bbox`: `(np.ndarray)` The bounding box of the layout area. @@ -2436,6 +2455,24 @@ To remove the page limit, please add the following configuration to the pipeline No +markdownIgnoreLabels +array | null +Please refer to the description of the markdown_ignore_labels parameter of the pipeline object's predict method. +No + + +prettifyMarkdown +boolean +Whether to output beautified Markdown text. The default is true. +No + + +showFormulaNumber +boolean +Whether to include formula numbers in the output Markdown text. The default is false. +No + + visualize boolean | null @@ -2544,7 +2581,8 @@ If neither the request body nor the configuration file is set (If visualiz Whether the last element on the current page is the end of a segment. - + +
Multi-language Service Call Example
diff --git a/docs/pipeline_usage/tutorials/ocr_pipelines/PP-StructureV3.md b/docs/pipeline_usage/tutorials/ocr_pipelines/PP-StructureV3.md index 8195f203fc..65f3d1d14f 100644 --- a/docs/pipeline_usage/tutorials/ocr_pipelines/PP-StructureV3.md +++ b/docs/pipeline_usage/tutorials/ocr_pipelines/PP-StructureV3.md @@ -1576,7 +1576,7 @@ for item in markdown_images:
  • boolTrue 或者 False
  • -
  • None:如果设置为None, 将默认使用产线初始化的该参数值,初始化为True
  • +
  • None:如果设置为None, 将默认使用产线初始化的该参数值,初始化为False
None @@ -1594,6 +1594,12 @@ for item in markdown_images: None +format_block_content +是否将block_content中的内容格式化为Markdown格式。设置为None表示使用实例化参数,否则该参数优先级更高。 +bool|None + + + layout_threshold 版面模型得分阈值 float|dict|None @@ -1652,7 +1658,7 @@ for item in markdown_images:
  • int:大于 0 的任意整数;
  • -
  • None:如果设置为 None, 将默认使用产线初始化的该参数值,初始化为 960
  • +
  • None:如果设置为 None, 将默认使用产线初始化的该参数值,初始化为 736
None @@ -1664,7 +1670,7 @@ for item in markdown_images:
  • str:支持 minmaxmin 表示保证图像最短边不小于 det_limit_side_lenmax 表示保证图像最长边不大于 limit_side_len
  • -
  • None:如果设置为 None, 将默认使用产线初始化的该参数值,初始化为 max
  • +
  • None:如果设置为 None, 将默认使用产线初始化的该参数值,初始化为 min
None @@ -1716,7 +1722,7 @@ for item in markdown_images:
  • int:大于 0 的任意整数;
  • -
  • None:如果设置为 None, 将默认使用产线初始化的该参数值,初始化为 960
  • +
  • None:如果设置为 None, 将默认使用产线初始化的该参数值,初始化为 736
None @@ -1728,7 +1734,7 @@ for item in markdown_images:
  • str:支持 minmaxmin 表示保证图像最短边不小于 det_limit_side_lenmax 表示保证图像最长边不大于 limit_side_len
  • -
  • None:如果设置为 None, 将默认使用产线初始化的该参数值,初始化为 max
  • +
  • None:如果设置为 None, 将默认使用产线初始化的该参数值,初始化为 min
None @@ -1840,11 +1846,17 @@ for item in markdown_images:
  • boolTrue 或者 False
  • -
  • None:如果设置为None, 将默认使用产线初始化的该参数值,初始化为False
  • +
  • None:如果设置为None, 将默认使用产线初始化的该参数值,初始化为True
True + +markdown_ignore_labels +需要在Markdown中忽略的版面标签。如果设置为None,将使用初始化的默认值:['number','footnote','header','header_image','footer','footer_image','aside_text'] +list|None + +
@@ -1947,7 +1959,13 @@ for item in markdown_images: - 调用`print()` 方法会将结果打印到终端,打印到终端的内容解释如下: - `input_path`: `(str)` 待预测图像或者PDF的输入路径 - - `page_index`: `(Union[int, None])` 如果输入是PDF文件,则表示当前是PDF的第几页,否则为 `None` + - `page_index`: `(Union[int, None])` 如果输入是PDF文件,表示当前是PDF的第几页,从0开始,否则为 `None` + + - `page_count`: `(Union[int, None])` 如果输入是PDF文件,表示当前是PDF的总页数,否则为 `None` + + - `width`: `(int)` 原始输入图像的宽度。 + + - `height`: `(int)` 原始输入图像的高度。 - `model_settings`: `(Dict[str, bool])` 配置产线所需的模型参数 @@ -1957,6 +1975,7 @@ for item in markdown_images: - `use_table_recognition`: `(bool)` 控制是否启用表格识别子产线 - `use_formula_recognition`: `(bool)` 控制是否启用公式识别子产线 - `format_block_content`: `(bool)` 控制是否将 `block_content` 中的内容格式化为Markdown格式 + - `markdown_ignore_labels`: `(List[str])` 需要在Markdown中忽略的版面标签,默认为`['number','footnote','header','header_image','footer','footer_image','aside_text']` - `doc_preprocessor_res`: `(Dict[str, Union[List[float], str]])` 文档预处理结果字典,仅当`use_doc_preprocessor=True`时存在 - `input_path`: `(str)` 文档预处理子产线接受的图像路径,当输入为`numpy.ndarray`时,保存为`None`,此处为`None` @@ -2393,6 +2412,24 @@ for res in output: 否 +markdownIgnoreLabels +array | null +请参阅产线对象中 predict 方法的 markdown_ignore_labels 参数相关说明。 +否 + + +prettifyMarkdown +boolean +是否输出美化后的 Markdown 文本。默认为 true。 +否 + + +showFormulaNumber +boolean +输出的 Markdown 文本中是否包含公式编号。默认为 false。 +否 + + visualize boolean | null 是否返回可视化结果图以及处理过程中的中间图像等。 @@ -2498,7 +2535,8 @@ for res in output: 当前页面最后一个元素是否为段结束。 -
+ +
多语言调用服务示例
Python @@ -3066,8 +3104,8 @@ SubPipelines: module_name: text_detection model_name: PP-OCRv5_server_det model_dir: null # 替换为微调后的文本测模型权重路径 - limit_side_len: 960 - limit_type: max + limit_side_len: 736 + limit_type: min max_side_limit: 4000 thresh: 0.3 box_thresh: 0.6 diff --git a/docs/pipeline_usage/tutorials/ocr_pipelines/PaddleOCR-VL.en.md b/docs/pipeline_usage/tutorials/ocr_pipelines/PaddleOCR-VL.en.md index 76c02ba8bf..f897703461 100644 --- a/docs/pipeline_usage/tutorials/ocr_pipelines/PaddleOCR-VL.en.md +++ b/docs/pipeline_usage/tutorials/ocr_pipelines/PaddleOCR-VL.en.md @@ -15,7 +15,6 @@ Install PaddlePaddle and PaddleX: ```shell python -m pip install paddlepaddle-gpu==3.2.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/ python -m pip install paddlex -python -m pip install https://paddle-whl.bj.bcebos.com/nightly/cu126/safetensors/safetensors-0.6.2.dev0-cp38-abi3-linux_x86_64.whl ``` > For Windows users, please use WSL or a Docker container. @@ -519,6 +518,18 @@ If not set, the initialized parameter value will be used. +merge_layout_blocks +Control whether to merge the layout detection boxes for cross-column or staggered top and bottom columns. If not set, the initialized default value will be used, which defaults to initialization asTrue. +bool|None + + + +markdown_ignore_labels +Layout labels that need to be ignored in Markdown. If not set, the initialized default value will be used: ['number','footnote','header','header_image','footer','footer_image','aside_text'] +list|None + + + device The device used for inference. Supports specifying specific card numbers:
  • CPU: For example,cpu indicates using the CPU for inference;
  • @@ -650,6 +661,18 @@ If not set, the initialized parameter value will be used. None +merge_layout_blocks +Control whether to merge the layout detection boxes for cross-column or staggered top and bottom columns. If not set, the initialized default value will be used, which defaults to initialization asTrue. +bool + + + +markdown_ignore_labels +Layout labels that need to be ignored in Markdown. If not set, the initialized default value will be used. +str + + + use_queues Used to control whether to enable internal queues. When set to True, data loading (such as rendering PDF pages as images), layout detection model processing, and VLM inference will be executed asynchronously in separate threads, with data passed through queues, thereby improving efficiency. This approach is particularly efficient for PDF documents with many pages or directories containing a large number of images or PDF files. bool|None @@ -697,6 +720,24 @@ If not set, the initialized parameter value will be used. int|None None + +max_new_tokens +The maximum number of tokens generated by the VL model. +int|None +None + + +merge_layout_blocks +Control whether to merge the layout detection boxes for cross-column or staggered top and bottom columns. +bool|None + + + +markdown_ignore_labels +Layout labels that need to be ignored in Markdown. +list|None + +
(3) Process the prediction results: The prediction result for each sample is a corresponding Result object, supporting operations such as printing, saving as an image, and saving as a json file: @@ -805,11 +846,19 @@ If not set, the initialized parameter value will be used. - `page_index`: `(Union[int, None])` If the input is a PDF file, it indicates the current page number of the PDF; otherwise, it is `None`. + - `page_count`: `(Union[int, None])` If the input is a PDF file, it indicates the total number of pages in the PDF; otherwise, it is `None`. + + - `width`: `(int)` The width of the original input image. + + - `height`: `(int)` The height of the original input image. + - `model_settings`: `(Dict[str, bool])` Model parameters required for configuring PaddleOCR-VL. - `use_doc_preprocessor`: `(bool)` Controls whether to enable the document preprocessing sub-pipeline. - `use_layout_detection`: `(bool)` Controls whether to enable the layout detection module. - `use_chart_recognition`: `(bool)` Controls whether to enable the chart recognition function. - `format_block_content`: `(bool)` Controls whether to save the formatted markdown content in `JSON`. + - `merge_layout_blocks`: `(bool)` Controls whether to merge the layout frames of multi-column layouts or top-and-bottom alternating column layouts. + - `markdown_ignore_labels`: `(List[str])` Labels of layout regions that need to be ignored in Markdown, defaulting to `['number','footnote','header','header_image','footer','footer_image','aside_text']` - `doc_preprocessor_res`: `(Dict[str, Union[List[float], str]])` A dictionary of document preprocessing results, which exists only when `use_doc_preprocessor=True`. - `input_path`: `(str)` The image path accepted by the document preprocessing sub-pipeline. When the input is a `numpy.ndarray`, it is saved as `None`; here, it is `None`. 
@@ -1063,7 +1112,10 @@ The following configurations are tailored for scenarios with a one-to-one corres **NVIDIA RTX 3060** - **Server-Side** - - vLLM: `gpu-memory-utilization=0.8` + - vLLM: `gpu-memory-utilization=0.7` + - FastDeploy: + - `gpu-memory-utilization: 0.7` + - `max-concurrency: 2048` ## 4. Serving @@ -1265,6 +1317,18 @@ Below are the API references for basic service-based deployment and examples of No +useSealRecognition +boolean|null +Please refer to the description of the use_seal_recognition parameter in the predict method of the PaddleOCR-VL object. +No + + +useOcrForImageBlock +boolean|null +Please refer to the description of the use_ocr_for_image_block parameter in the predict method of the PaddleOCR-VL object. +No + + layoutThreshold number|object|null Please refer to the description of the layout_threshold parameter in the predict method of the PaddleOCR-VL object. @@ -1289,6 +1353,12 @@ Below are the API references for basic service-based deployment and examples of No +layoutShapeMode +string +Please refer to the description of the layout_shape_mode parameter in the predict method of the PaddleOCR-VL object. +No + + promptLabel string|null Please refer to the description of the prompt_label parameter in the predict method of the PaddleOCR-VL object. @@ -1331,6 +1401,30 @@ Below are the API references for basic service-based deployment and examples of No +maxNewTokens +number|null +Please refer to the description of the max_new_tokens parameter in the predict method of the PaddleOCR-VL object. +No + + +mergeLayoutBlocks +boolean|null +Please refer to the description of the merge_layout_blocks parameter in the predict method of the PaddleOCR-VL object. +No + + +markdownIgnoreLabels +array|null +Please refer to the description of the markdown_ignore_labels parameter in the predict method of the PaddleOCR-VL object. 
+No + + +vlmExtraArgs +object|null +Please refer to the description of the vlm_extra_args parameter in the predict method of the PaddleOCR-VL object. +No + + prettifyMarkdown boolean Whether to output beautified Markdown text. The default is true. @@ -1343,6 +1437,24 @@ Below are the API references for basic service-based deployment and examples of No +restructurePages +boolean +Whether to restructure results across multiple pages. The default is false. +No + + +mergeTables +boolean +Please refer to the description of the merge_tables parameter in the restructure_pages method of the PaddleOCR-VL object. Valid only when restructurePages is true. +No + + +relevelTitles +boolean +Please refer to the description of the relevel_titles parameter in the restructure_pages method of the PaddleOCR-VL object. Valid only when restructurePages is true. +No + + visualize boolean|null Whether to return visualization result images and intermediate images during the processing.
    @@ -1433,18 +1545,111 @@ Below are the API references for basic service-based deployment and examples of object Key-value pairs of relative paths to Markdown images and Base64-encoded images. + + +
      +
    • restructurePages
    • +
    +

    Restructure results across multiple pages.

    +

    POST /restructure-pages

    + +
      +
    • The request body has the following properties:
    • +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    NameTypeDescriptionRequired
    pagesarrayAn array of pages.Yes
    mergeTablesbooleanPlease refer to the description of the merge_tables parameter in the restructure_pages method of the PaddleOCR-VL object.No
    relevelTitlesbooleanPlease refer to the description of the relevel_titles parameter in the restructure_pages method of the PaddleOCR-VL object.No
    concatenatePagesbooleanPlease refer to the description of the concatenate_pages parameter in the restructure_pages method of the PaddleOCR-VL object.No
    prettifyMarkdownbooleanWhether to output beautified Markdown text. The default is true.No
    showFormulaNumberbooleanWhether to include formula numbers in the output Markdown text. The default is false.No
    + +

    Each element in pages is an object with the following properties:

    + + - - - + + + + + - - - + + + + + + + + -
    isStartbooleanWhether the first element on the current page is the start of a paragraph.NameTypeDescription
    isEndbooleanWhether the last element on the current page is the end of a paragraph.prunedResultobjectThe prunedResult object returned by the infer operation.
    markdownImagesobject|nullThe images property of the markdown object returned by the infer operation.
+ + + + + + + + + + + + + + + + + + + +
NameTypeDescription
layoutParsingResultsarrayThe restructured layout parsing results. For the fields that every element contains, please refer to the description of the result returned by the infer operation (excluding visualization result images and intermediate images).
+
Multilingual Service Invocation Example
Python @@ -1454,7 +1659,7 @@ import base64 import requests import pathlib -API_URL = "http://localhost:8080/layout-parsing" # Service URL +BASE_URL = "http://localhost:8080" image_path = "./demo.jpg" @@ -1468,28 +1673,39 @@ payload = { "fileType": 1, # File type, 1 indicates an image file } -# Call the API -response = requests.post(API_URL, json=payload) +response = requests.post(BASE_URL + "/layout-parsing", json=payload) +assert response.status_code == 200, (response.status_code, response.text) -# Process the returned data from the interface -assert response.status_code == 200 result = response.json()["result"] +pages = [] for i, res in enumerate(result["layoutParsingResults"]): - print(res["prunedResult"]) - md_dir = pathlib.Path(f"markdown_{i}") - md_dir.mkdir(exist_ok=True) - (md_dir / "doc.md").write_text(res["markdown"]["text"]) - for img_path, img in res["markdown"]["images"].items(): - img_path = md_dir / img_path - img_path.parent.mkdir(parents=True, exist_ok=True) - img_path.write_bytes(base64.b64decode(img)) - print(f"Markdown document saved at {md_dir / 'doc.md'}") + pages.append({"prunedResult": res["prunedResult"], "markdownImages": res["markdown"].get("images")}) for img_name, img in res["outputImages"].items(): img_path = f"{img_name}_{i}.jpg" pathlib.Path(img_path).parent.mkdir(exist_ok=True) with open(img_path, "wb") as f: f.write(base64.b64decode(img)) print(f"Output image saved at {img_path}") + +payload = { + "pages": pages, + "concatenatePages": True, +} + +response = requests.post(BASE_URL + "/restructure-pages", json=payload) +assert response.status_code == 200, (response.status_code, response.text) + +result = response.json()["result"] +res = result["layoutParsingResults"][0] +print(res["prunedResult"]) +md_dir = pathlib.Path("markdown") +md_dir.mkdir(exist_ok=True) +(md_dir / "doc.md").write_text(res["markdown"]["text"]) +for img_path, img in res["markdown"]["images"].items(): + img_path = md_dir / img_path + 
img_path.parent.mkdir(parents=True, exist_ok=True) + img_path.write_bytes(base64.b64decode(img)) +print(f"Markdown document saved at {md_dir / 'doc.md'}")
C++ diff --git a/docs/pipeline_usage/tutorials/ocr_pipelines/PaddleOCR-VL.md b/docs/pipeline_usage/tutorials/ocr_pipelines/PaddleOCR-VL.md index e3241a1708..051d0e3d00 100644 --- a/docs/pipeline_usage/tutorials/ocr_pipelines/PaddleOCR-VL.md +++ b/docs/pipeline_usage/tutorials/ocr_pipelines/PaddleOCR-VL.md @@ -15,7 +15,6 @@ PaddleOCR-VL 是一款先进、高效的文档解析模型,专为文档中的 ```shell python -m pip install paddlepaddle-gpu==3.2.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/ python -m pip install paddlex -python -m pip install https://paddle-whl.bj.bcebos.com/nightly/cu126/safetensors/safetensors-0.6.2.dev0-cp38-abi3-linux_x86_64.whl ``` > 对于 Windows 用户,请使用 WSL 或者 Docker 进行环境搭建。 @@ -543,6 +542,18 @@ for item in markdown_images: None +merge_layout_blocks +控制是否对跨栏或上下交错分栏的版面检测框进行合并。如果设置为None,将使用初始化的默认值,默认初始化为True。 +bool|None + + + +markdown_ignore_labels +需要在Markdown中忽略的版面标签。如果设置为None,将使用初始化的默认值:['number','footnote','header','header_image','footer','footer_image','aside_text'] +list|None + + + device 用于推理的设备。支持指定具体卡号:
@@ -835,7 +876,13 @@ MKL-DNN 缓存容量。 - 调用`print()` 方法会将结果打印到终端,打印到终端的内容解释如下: - `input_path`: `(str)` 待预测图像或者PDF的输入路径 - - `page_index`: `(Union[int, None])` 如果输入是PDF文件,则表示当前是PDF的第几页,否则为 `None` + - `page_index`: `(Union[int, None])` 如果输入是PDF文件,表示当前是PDF的第几页,从0开始,否则为 `None` + + - `page_count`: `(Union[int, None])` 如果输入是PDF文件,表示当前是PDF的总页数,否则为 `None` + + - `width`: `(int)` 原始输入图像的宽度。 + + - `height`: `(int)` 原始输入图像的高度。 - `model_settings`: `(Dict[str, bool])` 配置 PaddleOCR-VL 所需的模型参数 @@ -843,6 +890,8 @@ MKL-DNN 缓存容量。 - `use_layout_detection`: `(bool)` 控制是否启用版面检测模块 - `use_chart_recognition`: `(bool)` 控制是否开启图表识别功能 - `format_block_content`: `(bool)` 控制是否在`JSON`中保存格式化后的markdown内容 + - `merge_layout_blocks`: `(bool)` 控制是否对多栏布局或上下交错分栏的版面框进行合并 + - `markdown_ignore_labels`: `(List[str])` 需要在Markdown中忽略的版面标签,默认为`['number','footnote','header','header_image','footer','footer_image','aside_text']` - `doc_preprocessor_res`: `(Dict[str, Union[List[float], str]])` 文档预处理结果dict,仅当`use_doc_preprocessor=True`时存在 - `input_path`: `(str)` 文档预处理子接受的图像路径,当输入为`numpy.ndarray`时,保存为`None`,此处为`None` @@ -1104,7 +1153,10 @@ PaddleX 会将来自单张或多张输入图像中的子图分组并对服务器 **NVIDIA RTX 3060** - **服务端** - - vLLM:`gpu-memory-utilization=0.8` + - vLLM:`gpu-memory-utilization: 0.7` + - FastDeploy: + - `gpu-memory-utilization: 0.7` + - `max-concurrency: 2048` ## 4. 
服务化部署 @@ -1309,6 +1361,18 @@ INFO: Uvicorn running on http://0.0.0.0:8080 (Press CTRL+C to quit) 否 +useSealRecognition +boolean | null +请参阅PaddleOCR-VL对象中 predict 方法的 use_seal_recognition 参数相关说明。 +否 + + +useOcrForImageBlock +boolean | null +请参阅PaddleOCR-VL对象中 predict 方法的 use_ocr_for_image_block 参数相关说明。 +否 + + layoutThreshold number | object | null 请参阅PaddleOCR-VL对象中 predict 方法的 layout_threshold 参数相关说明。 @@ -1333,6 +1397,12 @@ INFO: Uvicorn running on http://0.0.0.0:8080 (Press CTRL+C to quit) 否 +layoutShapeMode +string +请参阅PaddleOCR-VL对象中 predict 方法的 layout_shape_mode 参数相关说明。 +否 + + promptLabel string | null 请参阅PaddleOCR-VL对象中 predict 方法的 prompt_label 参数相关说明。 @@ -1375,6 +1445,30 @@ INFO: Uvicorn running on http://0.0.0.0:8080 (Press CTRL+C to quit) 否 +maxNewTokens +number | null +请参阅PaddleOCR-VL对象中 predict 方法的 max_new_tokens 参数相关说明。 +否 + + +mergeLayoutBlocks +boolean | null +请参阅PaddleOCR-VL对象中 predict 方法的 merge_layout_blocks 参数相关说明。 +否 + + +markdownIgnoreLabels +array | null +请参阅PaddleOCR-VL对象中 predict 方法的 mardown_ignore_labels 参数相关说明。 +否 + + +vlmExtraArgs +object | null +请参阅PaddleOCR-VL对象中 predict 方法的 vlm_extra_args 参数相关说明。 +否 + + prettifyMarkdown boolean 是否输出美化后的 Markdown 文本。默认为 true。 @@ -1387,6 +1481,24 @@ INFO: Uvicorn running on http://0.0.0.0:8080 (Press CTRL+C to quit) 否 +restructurePages +boolean +是否重构多页结果。默认为 false。 +否 + + +mergeTables +boolean +请参阅PaddleOCR-VL对象中 restructure_pages 方法的 merge_table 参数相关说明。仅当restructurePagestrue时生效。 +否 + + +relevelTitles +boolean +请参阅PaddleOCR-VL对象中 restructure_pages 方法的 relevel_titles 参数相关说明。仅当restructurePagestrue时生效。 +否 + + visualize boolean | null 是否返回可视化结果图以及处理过程中的中间图像等。 @@ -1481,18 +1593,107 @@ INFO: Uvicorn running on http://0.0.0.0:8080 (Press CTRL+C to quit) object Markdown图片相对路径和Base64编码图像的键值对。 + + + +

重构多页结果。

+

POST /restructure-pages

+ + + + + + + + + + + + + + + + + + + + + + + - + - + + + + + + + + - + - + + + + + + + + + + +
名称类型含义是否必填
pagesarray页面数组。 +
mergeTablesboolean请参阅PaddleOCR-VL对象中 restructure_pages 方法的 merge_tables 参数相关说明。
isStartrelevelTitles boolean当前页面第一个元素是否为段开始。请参阅PaddleOCR-VL对象中 restructure_pages 方法的 relevel_titles 参数相关说明。
concatenatePagesboolean请参阅PaddleOCR-VL对象中 restructure_pages 方法的 concatenate_pages 参数相关说明。
isEndprettifyMarkdown boolean当前页面最后一个元素是否为段结束。是否输出美化后的 Markdown 文本。默认为 true
showFormulaNumberboolean输出的 Markdown 文本中是否包含公式编号。默认为 false
+

pages中的每个元素为一个object,具有如下属性:

+ + + + + + + + + + + + + + + + + + + + +
名称类型含义
prunedResultobject对应infer操作返回的prunedResult对象。
markdownImagesobject|null对应infer操作返回的markdown对象的images属性。
+ + + + + + + + + + + + + + -
名称类型含义
layoutParsingResultsarray重构后的版面解析结果。其中每个元素包含的字段请参见对infer操作返回结果的说明(不含可视化结果图和中间图像)。
+ +
多语言调用服务示例
Python @@ -1502,7 +1703,7 @@ import base64 import requests import pathlib -API_URL = "http://localhost:8080/layout-parsing" # 服务URL +BASE_URL = "http://localhost:8080" image_path = "./demo.jpg" @@ -1516,28 +1717,39 @@ payload = { "fileType": 1, # 文件类型,1表示图像文件 } -# 调用API -response = requests.post(API_URL, json=payload) +response = requests.post(BASE_URL + "/layout-parsing", json=payload) +assert response.status_code == 200, (response.status_code, response.text) -# 处理接口返回数据 -assert response.status_code == 200 result = response.json()["result"] +pages = [] for i, res in enumerate(result["layoutParsingResults"]): - print(res["prunedResult"]) - md_dir = pathlib.Path(f"markdown_{i}") - md_dir.mkdir(exist_ok=True) - (md_dir / "doc.md").write_text(res["markdown"]["text"]) - for img_path, img in res["markdown"]["images"].items(): - img_path = md_dir / img_path - img_path.parent.mkdir(parents=True, exist_ok=True) - img_path.write_bytes(base64.b64decode(img)) - print(f"Markdown document saved at {md_dir / 'doc.md'}") + pages.append({"prunedResult": res["prunedResult"], "markdownImages": res["markdown"].get("images")}) for img_name, img in res["outputImages"].items(): img_path = f"{img_name}_{i}.jpg" pathlib.Path(img_path).parent.mkdir(exist_ok=True) with open(img_path, "wb") as f: f.write(base64.b64decode(img)) print(f"Output image saved at {img_path}") + +payload = { + "pages": pages, + "concatenatePages": True, +} + +response = requests.post(BASE_URL + "/restructure-pages", json=payload) +assert response.status_code == 200, (response.status_code, response.text) + +result = response.json()["result"] +res = result["layoutParsingResults"][0] +print(res["prunedResult"]) +md_dir = pathlib.Path("markdown") +md_dir.mkdir(exist_ok=True) +(md_dir / "doc.md").write_text(res["markdown"]["text"]) +for img_path, img in res["markdown"]["images"].items(): + img_path = md_dir / img_path + img_path.parent.mkdir(parents=True, exist_ok=True) + img_path.write_bytes(base64.b64decode(img)) 
+print(f"Markdown document saved at {md_dir / 'doc.md'}")
C++ diff --git a/docs/pipeline_usage/tutorials/speech_pipelines/text_to_speech.en.md b/docs/pipeline_usage/tutorials/speech_pipelines/text_to_speech.en.md new file mode 100644 index 0000000000..676388aef6 --- /dev/null +++ b/docs/pipeline_usage/tutorials/speech_pipelines/text_to_speech.en.md @@ -0,0 +1,245 @@ +--- +comments: true +--- + +# Text to Speech pipeline Tutorial + +## 1. Introduction to Text to Speech pipeline +Text to Speech is a cutting-edge technology capable of converting computer-generated text information into natural and fluent human speech signals in real-time. This technology has been deeply applied across multiple domains including virtual assistants, accessibility services, navigation announcements, and media entertainment, significantly enhancing human-computer interaction experiences and enabling highly natural voice output in cross-linguistic scenarios. + +

Text to Speech Model:

+ + + + + + + + + + + + + + + +
ModelModel Download LinkTraining DataModel Storage Size (MB)Introduction
fastspeech2_csmsc_pwgan_csmscfastspeech2_csmscpwgan_csmsc/768.1FastSpeech2 is an end-to-end text-to-speech (TTS) model developed by Microsoft, featuring efficient and stable prosody control capabilities. Utilizing a non-autoregressive architecture, it enables fast and high-quality speech synthesis, making it suitable for various applications such as virtual assistants and audiobook production. + +## 2. Quick Start +PaddleX supports experiencing the text to speech pipeline locally using the command line or Python. + +Before using the text to speech pipeline locally, please ensure that you have completed the installation of the PaddleX wheel package according to the [PaddleX Local Installation Guide](../../../installation/installation.en.md). If you wish to selectively install dependencies, please refer to the relevant instructions in the installation guide. The dependency group corresponding to this pipeline is `speech`. + +### 2.1 Local Experience + +#### 2.1.1 Command Line Experience +PaddleX supports experiencing the text to speech pipeline locally using the command line or Python. + +```bash +paddlex --pipeline text_to_speech \ + --input "今天天气真的很好" +``` + +The relevant parameter descriptions can be found in the parameter descriptions in [2.1.2 Integration with Python Script](#212-integration-with-python-script). + +After running, the result will be printed to the terminal, as follows: + +```plaintext +{'res': {'result': array([-8.118157e-04, ..., 6.217696e-05], shape=(38700,), dtype=float32)}} +``` + +The explanation of the result parameters can refer to the result explanation in [2.1.2 Integration with Python Script](#212-integration-with-python-script). + +#### 2.1.2 Integration with Python Script + +The above command line is for quickly experiencing and viewing the effect. Generally speaking, in a project, it is often necessary to integrate through code. You can complete the rapid inference of the pipeline with just a few lines of code.
+ +## 2. Quick Start +PaddleX supports experiencing the multilingual speech recognition pipeline locally using the command line or Python. + +Before using the multilingual speech recognition pipeline locally, please ensure that you have completed the installation of the PaddleX wheel package according to the [PaddleX Local Installation Guide](../../../installation/installation.en.md). If you wish to selectively install dependencies, please refer to the relevant instructions in the installation guide. The dependency group corresponding to this pipeline is `speech`. + +### 2.1 Local Experience + +#### 2.1.1 Command Line Experience +PaddleX supports experiencing the text to speech pipeline locally using the command line or Python. + +```bash +paddlex --pipeline text_to_speech \ + --input "今天天气真的很好" +``` + +The relevant parameter descriptions can be found in the parameter descriptions in [2.1.2 Integration via Python Script](). + +After running, the result will be printed to the terminal, as follows: + +```plaintext +{'res': {'result': array([-8.118157e-04, ..., 6.217696e-05], shape=(38700,), dtype=float32)}} +``` + +The explanation of the result parameters can refer to the result explanation in [2.1.2 Integration with Python Script](#212-integration-with-python-script).。 + +#### 2.1.2 Integration with Python Script + +The above command line is for quickly experiencing and viewing the effect. Generally speaking, in a project, it is often necessary to integrate through code. You can complete the rapid inference of the pipeline with just a few lines of code. 
The inference code is as follows: + +```python +from paddlex import create_pipeline + +pipeline = create_pipeline(pipeline="text_to_speech") + +output = pipeline.predict( + "今天天气真的很好" +) + +for res in output: + print(res) + res.print() + res.save_to_audio("./output/test.wav") + res.save_to_json("./output") +``` + +In the above Python script, the following steps are executed: + +(1)The text to speech pipeline object is instantiated through create_pipeline(). The specific parameter descriptions are as follows: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
ParameterParameter DescriptionParameter TypeDefault
pipelineThe name of the pipeline or the path to the pipeline configuration file. If it is the pipeline name, it must be a pipeline supported by PaddleX.strNone
deviceThe inference device for the pipeline. It supports specifying the specific card number of the GPU, such as "gpu:0", the specific card number of other hardware, such as "npu:0"strgpu:0
use_hpipWhether to enable the high-performance inference plugin. If set to None, the setting from the configuration file or config will be used. Not supported for now.bool | NoneNone
hpi_configHigh-performance inference configuration. Not supported for now.dict | NoneNone
+ +(2)The predict() method of the text to speech pipeline object is called to perform inference and prediction. This method will return a generator. Below are the parameters and their descriptions for the predict() method: + + + + + + + + + + + + + + + + + + + +
ParameterParameter DescriptionParameter TypeOptionsDefault
inputData to be predictedstr +
    +
  • File path, such as the local path of a text file:/root/data/text.txt
  • +
  • Text to be synthesized, such as今天天气真不错
  • +
+
None
+ +(3)Process the prediction results. The prediction result for each sample is of the AudioResult type and supports operations such as printing, saving as an audio, and saving as a `json` file: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
MethodMethod DescritionParameterParameter typeParameter DescriptionDefault
print()Print the result to the terminalformat_jsonboolWhether to format the output content using JSON indentationTrue
indentintSpecify the indentation level to beautify the output JSON data, making it more readable. Effective only when format_json is True4
ensure_asciiboolControl whether to escape non-ASCII characters to Unicode. When set to True, all non-ASCII characters will be escaped; False will retain the original characters. Effective only when format_json is TrueFalse
save_to_json()Save the result as a JSON filesave_pathstrPath to save the file. When it is a directory, the saved file name is consistent with the input file type namingNone
indentintSpecify the indentation level to beautify the output JSON data, making it more readable. Effective only when format_json is True4
ensure_asciiboolControl whether to escape non-ASCII characters to Unicode. When set to True, all non-ASCII characters will be escaped; False will retain the original characters. Effective only when format_json is TrueFalse
save_to_audio()Save the result as a wav filesave_pathstrThe saved file path. When it is a directory, the saved file name is consistent with the input file type name.None
+ +- Calling the `print()` method will print the result to the terminal, with the printed content explained as follows: + +- Calling the `save_to_audio()` method will save the above content to the specified `save_path`. + + + +Once you have the configuration file, you can customize the text_to_speech pipeline configuration by modifying the `pipeline` parameter in the `create_pipeline` method to the path to the pipeline configuration file. An example is as follows: + +For example, if your configuration file is saved at `./my_path/text_to_speech.yaml`, you only need to execute: + +```python +from paddlex import create_pipeline +pipeline = create_pipeline(pipeline="./my_path/text_to_speech.yaml") +output = pipeline.predict(input="今天天气真的很好") +for res in output: + res.print() + res.save_to_json("./output/") + res.save_to_audio("./output/test.wav") +``` + +Note: The parameters in the configuration file are the initialization parameters for the pipeline. If you want to change the initialization parameters of the text to speech pipeline, you can directly modify the parameters in the configuration file and load the configuration file for prediction. Additionally, CLI prediction also supports passing in a configuration file, simply specify the path of the configuration file with --pipeline. + +## 3. Development Integration/Deployment + +If the pipeline meets your requirements for inference speed and accuracy, you can directly proceed with development integration/deployment. + +If you need to apply the pipeline directly in your Python project, you can refer to the example code in [2.1.2 Integration with Python Script](#212-integration-with-python-script).
+ +In addition, PaddleX also provides three other deployment methods, which are detailed as follows: + +🚀 High-Performance Inference: In actual production environments, many applications have strict performance requirements for deployment strategies, especially in terms of response speed, to ensure the efficient operation of the system and the smoothness of the user experience. To this end, PaddleX provides a high-performance inference plugin, which aims to deeply optimize the performance of model inference and pre/post-processing to achieve significant acceleration of the end-to-end process. For detailed high-performance inference procedures, please refer to the [PaddleX High-Performance Inference Guide](../../../pipeline_deploy/high_performance_inference.en.md). + +☁️ Serving Deployment: Serving Deployment is a common deployment form in actual production environments. By encapsulating inference functions as services, clients can access these services through network requests to obtain inference results. PaddleX supports multiple pipeline serving deployment solutions. For detailed pipeline serving deployment procedures, please refer to the [PaddleX Serving Deployment Guide](../../../pipeline_deploy/serving.en.md). + +📱 On-Device Deployment: Edge deployment is a method that places computational and data processing capabilities directly on user devices, allowing them to process data without relying on remote servers. PaddleX supports deploying models on edge devices such as Android. For detailed procedures, please refer to the [PaddleX On-Device Deployment Guide](../../../pipeline_deploy/on_device_deployment.en.md). diff --git a/docs/pipeline_usage/tutorials/speech_pipelines/text_to_speech.md b/docs/pipeline_usage/tutorials/speech_pipelines/text_to_speech.md new file mode 100644 index 0000000000..e36fef0524 --- /dev/null +++ b/docs/pipeline_usage/tutorials/speech_pipelines/text_to_speech.md @@ -0,0 +1,249 @@ +--- +comments: true +--- + +# 语音合成产线使用教程 + +## 1. 
语音合成产线介绍 +语音合成​​是一种前沿技术,能够将计算机生成的文本信息实时转换为自然流畅的人类语音信号。该技术已在智能助手、无障碍服务、导航播报、媒体娱乐等多个领域深度应用,显著提升人机交互体验,实现跨语言场景的高自然度语音输出。 + + +> 推理耗时仅包含模型推理耗时,不包含前后处理耗时。 + +

语音合成模型:

+ + + + + + + + + + + + + + + +
模型模型下载链接训练数据模型存储大小(MB)介绍
fastspeech2_csmsc_pwgan_csmscfastspeech2_csmscpwgan_csmsc/768.1FastSpeech2 是微软开发的端到端文本转语音(TTS)模型,具备高效稳定的韵律控制能力。它采用非自回归架构,能实现快速高质量的语音合成,适用于虚拟助手、有声读物等多种场景。
+ +## 2. 快速开始 +PaddleX 支持在本地使用命令行或 Python 体验语音合成产线的效果。 + +在本地使用语音合成产线前,请确保您已经按照[PaddleX本地安装教程](../../../installation/installation.md)完成了 PaddleX 的 wheel 包安装。如果您希望选择性安装依赖,请参考安装教程中的相关说明。该产线对应的依赖分组为 `speech`。 + +### 2.1 本地体验 + +#### 2.1.1 命令行方式体验 +一行命令即可快速体验语音合成产线效果 + +```bash +paddlex --pipeline text_to_speech \ + --input "今天天气真的很好" +``` + +相关的参数说明可以参考[2.1.2 Python脚本方式集成](#212-python脚本方式集成)中的参数说明。 + +运行后,会将结果打印到终端上,结果如下: + +```plaintext +{'res': {'result': array([-8.118157e-04, ..., 6.217696e-05], shape=(38700,), dtype=float32)}} +``` + +运行结果参数说明可以参考[2.1.2 Python脚本方式集成](#212-python脚本方式集成)中的结果解释。 + +#### 2.1.2 Python脚本方式集成 + +上述命令行是为了快速体验查看效果,一般来说,在项目中,往往需要通过代码集成,您可以通过几行代码即可完成产线的快速推理,推理代码如下: + +```python +from paddlex import create_pipeline + +pipeline = create_pipeline(pipeline="text_to_speech") + +output = pipeline.predict( + "今天天气真的很好" +) + +for res in output: + print(res) + res.print() + res.save_to_audio("./output/test.wav") + res.save_to_json("./output") +``` + +在上述 Python 脚本中,执行了如下几个步骤: + +(1)通过 `create_pipeline()` 实例化 text_to_speech 产线对象:具体参数说明如下: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
参数参数说明参数类型默认值
pipeline产线名称或是产线配置文件路径。如为产线名称,则必须为 PaddleX 所支持的产线。strNone
device产线推理设备。支持指定GPU具体卡号,如“gpu:0”,其他硬件具体卡号,如“npu:0”,CPU如“cpu”。strgpu:0
use_hpip是否启用高性能推理插件。如果为 None,则使用配置文件或 config 中的配置。目前暂不支持。bool | None
hpi_config高性能推理配置。目前暂不支持。dict | None
+ +(2)调用 text_to_speech 产线对象的 `predict()` 方法进行推理预测。该方法将返回一个 `generator`。以下是 `predict()` 方法的参数及其说明: + + + + + + + + + + + + + + + + + + + +
参数参数说明参数类型可选项默认值
input待预测数据str +
    +
  • 文件路径,如语音文件的本地路径:/root/data/text.txt
  • +
  • 合成的文字,如今天天气真不错
  • +
+
None
+ +(3)对预测结果进行处理,每个样本的预测结果均为对应的Result对象,且支持打印、保存为音频、保存为`json`文件的操作: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
方法方法说明参数参数类型参数说明默认值
print()打印结果到终端format_jsonbool是否对输出内容进行使用 JSON 缩进格式化True
indentint指定缩进级别,以美化输出的 JSON 数据,使其更具可读性,仅当 format_jsonTrue 时有效4
ensure_asciibool控制是否将非 ASCII 字符转义为 Unicode。设置为 True 时,所有非 ASCII 字符将被转义;False 则保留原始字符,仅当format_jsonTrue时有效False
save_to_json()将结果保存为json格式的文件save_pathstr保存的文件路径,当为目录时,保存文件命名与输入文件类型命名一致
indentint指定缩进级别,以美化输出的 JSON 数据,使其更具可读性,仅当 format_jsonTrue 时有效4
ensure_asciibool控制是否将非 ASCII 字符转义为 Unicode。设置为 True 时,所有非 ASCII 字符将被转义;False 则保留原始字符,仅当format_jsonTrue时有效False
save_to_audio()将结果保存为wav格式的文件save_pathstr保存的文件路径,当为目录时,保存文件命名与输入文件类型命名一致
+ +- 调用`print()` 方法会将结果打印到终端 + +- 调用`save_to_audio()` 方法会将上述内容保存到指定的`save_path`中 + + + +若您获取了配置文件,即可对 text_to_speech 产线各项配置进行自定义,只需要修改 `create_pipeline` 方法中的 `pipeline` 参数值为产线配置文件路径即可。示例如下: + +例如,若您的配置文件保存在 `./my_path/text_to_speech.yaml` ,则只需执行: + +```python +from paddlex import create_pipeline +pipeline = create_pipeline(pipeline="./my_path/text_to_speech.yaml") +output = pipeline.predict(input="今天天气真的很好") +for res in output: + res.print() + res.save_to_json("./output/") + res.save_to_audio("./output/test.wav") +``` + +注: 配置文件中的参数为产线初始化参数,如果希望更改 text_to_speech 产线初始化参数,可以直接修改配置文件中的参数,并加载配置文件进行预测。同时,CLI 预测也支持传入配置文件,`--pipeline` 指定配置文件的路径即可。 + +## 3. 开发集成/部署 + +如果产线可以达到您对产线推理速度和精度的要求,您可以直接进行开发集成/部署。 + +若您需要将产线直接应用在您的Python项目中,可以参考 [2.2.2 Python脚本方式](#222-python脚本方式集成)中的示例代码。 + +此外,PaddleX 也提供了其他三种部署方式,详细说明如下: + +🚀 高性能推理:在实际生产环境中,许多应用对部署策略的性能指标(尤其是响应速度)有着较严苛的标准,以确保系统的高效运行与用户体验的流畅性。为此,PaddleX 提供高性能推理插件,旨在对模型推理及前后处理进行深度性能优化,实现端到端流程的显著提速,详细的高性能推理流程请参考[PaddleX高性能推理指南](../../../pipeline_deploy/high_performance_inference.md)。 + +☁️ 服务化部署:服务化部署是实际生产环境中常见的一种部署形式。通过将推理功能封装为服务,客户端可以通过网络请求来访问这些服务,以获取推理结果。PaddleX 支持多种产线服务化部署方案,详细的产线服务化部署流程请参考[PaddleX服务化部署指南](../../../pipeline_deploy/serving.md)。 + +📱 端侧部署:端侧部署是一种将计算和数据处理功能放在用户设备本身上的方式,设备可以直接处理数据,而不需要依赖远程的服务器。PaddleX 支持将模型部署在 Android 等端侧设备上,详细的端侧部署流程请参考[PaddleX端侧部署指南](../../../pipeline_deploy/on_device_deployment.md)。 +您可以根据需要选择合适的方式部署模型产线,进而进行后续的 AI 应用集成。 diff --git a/mkdocs.yml b/mkdocs.yml index 907cb87bad..28ef9ea9d4 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -373,6 +373,7 @@ nav: - PaddleX产线命令行使用说明: pipeline_usage/instructions/pipeline_CLI_usage.md - PaddleX产线Python脚本使用说明: pipeline_usage/instructions/pipeline_python_API.md - 产线并行推理: pipeline_usage/instructions/parallel_inference.md + - 产线推理 Benchmark: pipeline_usage/instructions/benchmark.md - 单功能模块使用教程: - OCR: - 文本检测模块: module_usage/tutorials/ocr_modules/text_detection.md @@ -424,7 +425,6 @@ nav: - PaddleX通用模型配置文件参数说明: module_usage/instructions/config_parameters_common.md 
- PaddleX时序任务模型配置文件参数说明: module_usage/instructions/config_parameters_time_series.md - 模型推理 Benchmark: module_usage/instructions/benchmark.md - - 产线推理 Benchmark: pipeline_usage/instructions/benchmark.md - 模型产线部署: - 高性能推理: pipeline_deploy/high_performance_inference.md - 服务化部署: pipeline_deploy/serving.md diff --git a/paddlex/configs/modules/doc_vlm/PaddleOCR-VL-0.9B.yaml b/paddlex/configs/modules/doc_vlm/PaddleOCR-VL-0.9B.yaml new file mode 100644 index 0000000000..3fee85d193 --- /dev/null +++ b/paddlex/configs/modules/doc_vlm/PaddleOCR-VL-0.9B.yaml @@ -0,0 +1,12 @@ +Global: + model: PaddleOCR-VL-0.9B + mode: predict # only support predict + device: gpu:0 + output: "output" + +Predict: + batch_size: 1 + model_dir: "/path/to/PaddleOCR-VL-0.9B" + input: + image: "https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/medal_table.png" + query: "Table Recognition:" diff --git a/paddlex/configs/modules/layout_detection/PP-DocLayoutV2.yaml b/paddlex/configs/modules/layout_detection/PP-DocLayoutV2.yaml new file mode 100644 index 0000000000..67f7da454b --- /dev/null +++ b/paddlex/configs/modules/layout_detection/PP-DocLayoutV2.yaml @@ -0,0 +1,13 @@ +Global: + model: PP-DocLayoutV2 + mode: predict # only support predict + dataset_dir: "/paddle/dataset/paddlex/layout/det_layout_examples" + device: gpu:0 + output: "output" + +Predict: + batch_size: 1 + model_dir: "PP-DocLayoutV2" + input: "https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/layout.jpg" + kernel_option: + run_mode: paddle diff --git a/paddlex/configs/modules/text_to_pinyin/G2PWModel.yml b/paddlex/configs/modules/text_to_pinyin/G2PWModel.yml new file mode 100644 index 0000000000..e05af7b031 --- /dev/null +++ b/paddlex/configs/modules/text_to_pinyin/G2PWModel.yml @@ -0,0 +1,11 @@ +Global: + model: G2PWModel + mode: predict # only support predict + device: gpu:0 + output: "output" + +Predict: + batch_size: 1 + input: "欢迎使用飞桨" + kernel_option: + run_mode: paddle diff --git 
a/paddlex/configs/modules/text_to_speech_acoustic/fastspeech2_csmsc.yaml b/paddlex/configs/modules/text_to_speech_acoustic/fastspeech2_csmsc.yaml new file mode 100644 index 0000000000..5cd6cb9c36 --- /dev/null +++ b/paddlex/configs/modules/text_to_speech_acoustic/fastspeech2_csmsc.yaml @@ -0,0 +1,12 @@ +Global: + model: fastspeech2_csmsc + mode: predict # only support predict + device: gpu:0 + output: "output" + +Predict: + batch_size: 1 + model_dir: "fastspeech2csmsc" + input: [[151, 120, 182, 82, 182, 82, 174, 75, 262, 51, 37, 186, 38, 233]] + kernel_option: + run_mode: paddle diff --git a/paddlex/configs/modules/text_to_speech_vocoder/pwgan_csmsc.yaml b/paddlex/configs/modules/text_to_speech_vocoder/pwgan_csmsc.yaml new file mode 100644 index 0000000000..d5dcc3c85e --- /dev/null +++ b/paddlex/configs/modules/text_to_speech_vocoder/pwgan_csmsc.yaml @@ -0,0 +1,12 @@ +Global: + model: pwgan_csmsc + mode: predict # only support predict + device: gpu:0 + output: "output" + +Predict: + batch_size: 1 + model_dir: "pwgan_csmsc" + input: "https://paddlespeech.bj.bcebos.com/demos/paddlex/mel.npy" + kernel_option: + run_mode: paddle diff --git a/paddlex/configs/pipelines/PP-StructureV3.yaml b/paddlex/configs/pipelines/PP-StructureV3.yaml index a93952771a..62278e988a 100644 --- a/paddlex/configs/pipelines/PP-StructureV3.yaml +++ b/paddlex/configs/pipelines/PP-StructureV3.yaml @@ -10,6 +10,14 @@ use_formula_recognition: True use_chart_recognition: False use_region_detection: True format_block_content: False +markdown_ignore_labels: + - number + - footnote + - header + - header_image + - footer + - footer_image + - aside_text SubModules: LayoutDetection: diff --git a/paddlex/configs/pipelines/PaddleOCR-VL-1.5.yaml b/paddlex/configs/pipelines/PaddleOCR-VL-1.5.yaml new file mode 100644 index 0000000000..17aca18f1d --- /dev/null +++ b/paddlex/configs/pipelines/PaddleOCR-VL-1.5.yaml @@ -0,0 +1,81 @@ + +pipeline_name: PaddleOCR-VL-1.5 + +batch_size: 64 + +use_queues: True + 
+use_doc_preprocessor: False +use_layout_detection: True +use_chart_recognition: False +use_seal_recognition: False +format_block_content: False +merge_layout_blocks: True +markdown_ignore_labels: + - number + - footnote + - header + - header_image + - footer + - footer_image + - aside_text + +SubModules: + LayoutDetection: + module_name: layout_detection + model_name: PP-DocLayoutV3 + model_dir: null + batch_size: 8 + threshold: 0.3 + layout_nms: True + layout_unclip_ratio: [1.0, 1.0] + layout_merge_bboxes_mode: + 0: "union" # abstract + 1: "union" # algorithm + 2: "union" # aside_text + 3: "large" # chart + 4: "union" # content + 5: "large" # display_formula + 6: "large" # doc_title + 7: "union" # figure_title + 8: "union" # footer + 9: "union" # footer + 10: "union" # footnote + 11: "union" # formula_number + 12: "union" # header + 13: "union" # header + 14: "union" # image + 15: "large" # inline_formula + 16: "union" # number + 17: "large" # paragraph_title + 18: "union" # reference + 19: "union" # reference_content + 20: "union" # seal + 21: "union" # table + 22: "union" # text + 23: "union" # text + 24: "union" # vision_footnote + VLRecognition: + module_name: vl_recognition + model_name: PaddleOCR-VL-1.5-0.9B + model_dir: null + batch_size: 4096 + genai_config: + backend: native + +SubPipelines: + DocPreprocessor: + pipeline_name: doc_preprocessor + batch_size: 8 + use_doc_orientation_classify: True + use_doc_unwarping: True + SubModules: + DocOrientationClassify: + module_name: doc_text_orientation + model_name: PP-LCNet_x1_0_doc_ori + model_dir: null + batch_size: 8 + DocUnwarping: + module_name: image_unwarping + model_name: UVDoc + model_dir: null diff --git a/paddlex/configs/pipelines/PaddleOCR-VL.yaml b/paddlex/configs/pipelines/PaddleOCR-VL.yaml index 280a454604..fdb52c7ede 100644 --- a/paddlex/configs/pipelines/PaddleOCR-VL.yaml +++ b/paddlex/configs/pipelines/PaddleOCR-VL.yaml @@ -9,6 +9,15 @@ use_doc_preprocessor: False use_layout_detection: True 
use_chart_recognition: False format_block_content: False +merge_layout_blocks: True +markdown_ignore_labels: + - number + - footnote + - header + - header_image + - footer + - footer_image + - aside_text SubModules: LayoutDetection: diff --git a/paddlex/configs/pipelines/text_to_speech.yaml b/paddlex/configs/pipelines/text_to_speech.yaml new file mode 100644 index 0000000000..f3a5299264 --- /dev/null +++ b/paddlex/configs/pipelines/text_to_speech.yaml @@ -0,0 +1,33 @@ +pipeline_name: text_to_speech + +SubModules: + TextToPinyin: + module_name: text_to_pinyin + model_name: G2PWModel + model_dir: null + batch_size: 1 + device: gpu:0 + use_trt: False + use_mkldnn: False + cpu_threads: 1 + precision: "fp32" + TextToSpeechAcoustic: + module_name: text_to_speech_acoustic + model_name: fastspeech2_csmsc + model_dir: null + batch_size: 1 + device: gpu:0 + use_trt: False + use_mkldnn: False + cpu_threads: 1 + precision: "fp32" + TextToSpeechVocoder: + module_name: text_to_speech_vocoder + model_name: pwgan_csmsc + model_dir: null + batch_size: 1 + device: gpu:0 + use_trt: False + use_mkldnn: False + cpu_threads: 1 + precision: "fp32" diff --git a/paddlex/inference/common/batch_sampler/__init__.py b/paddlex/inference/common/batch_sampler/__init__.py index 4a1ff0b680..ce82686689 100644 --- a/paddlex/inference/common/batch_sampler/__init__.py +++ b/paddlex/inference/common/batch_sampler/__init__.py @@ -18,5 +18,6 @@ from .doc_vlm_batch_sampler import DocVLMBatchSampler from .image_batch_sampler import ImageBatchSampler from .markdown_batch_sampler import MarkDownBatchSampler +from .text_batch_sampler import TextBatchSampler from .ts_batch_sampler import TSBatchSampler from .video_batch_sampler import VideoBatchSampler diff --git a/paddlex/inference/common/batch_sampler/image_batch_sampler.py b/paddlex/inference/common/batch_sampler/image_batch_sampler.py index 3fc43534b7..dd78354fe7 100644 --- a/paddlex/inference/common/batch_sampler/image_batch_sampler.py +++ 
b/paddlex/inference/common/batch_sampler/image_batch_sampler.py @@ -28,14 +28,17 @@ class ImgBatch(Batch): def __init__(self): super().__init__() self.page_indexes = [] + self.page_counts = [] - def append(self, instance, input_path, page_index): + def append(self, instance, input_path, page_index, page_count): super().append(instance, input_path) self.page_indexes.append(page_index) + self.page_counts.append(page_count) def reset(self): super().reset() self.page_indexes = [] + self.page_counts = [] class ImageBatchSampler(BaseBatchSampler): @@ -81,7 +84,7 @@ def sample(self, inputs): batch = ImgBatch() for input in inputs: if isinstance(input, np.ndarray): - batch.append(input, None, None) + batch.append(input, None, None, None) if len(batch) == self.batch_size: yield batch batch = ImgBatch() @@ -93,10 +96,10 @@ def sample(self, inputs): if input.startswith("http") else input ) - for page_idx, page_img in enumerate( - self.pdf_reader.read(file_path) - ): - batch.append(page_img, file_path, page_idx) + doc = self.pdf_reader.load(file_path) + page_count = len(doc) + for page_idx, page_img in enumerate(self.pdf_reader.read(doc)): + batch.append(page_img, file_path, page_idx, page_count) if len(batch) == self.batch_size: yield batch batch = ImgBatch() @@ -106,7 +109,7 @@ def sample(self, inputs): if input.startswith("http") else input ) - batch.append(file_path, file_path, None) + batch.append(file_path, file_path, None, None) if len(batch) == self.batch_size: yield batch batch = ImgBatch() diff --git a/paddlex/inference/common/batch_sampler/text_batch_sampler.py b/paddlex/inference/common/batch_sampler/text_batch_sampler.py new file mode 100644 index 0000000000..b36ed2e66a --- /dev/null +++ b/paddlex/inference/common/batch_sampler/text_batch_sampler.py @@ -0,0 +1,61 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ....utils import logging
+from .base_batch_sampler import BaseBatchSampler
+
+
+class TextBatchSampler(BaseBatchSampler):
+    def __init__(self):
+        """Initializes the TextBatchSampler.
+
+        Args:
+            batch_size (int, optional): The size of each batch. Only support 1.
+        """
+        super().__init__()
+        self.batch_size = 1
+
+    def sample(self, inputs):
+        """Generate list of input file path.
+
+        Args:
+            inputs (str): file path.
+
+        Yields:
+            list: list of file path.
+        """
+        if isinstance(inputs, str):
+            yield [inputs]
+        else:
+            logging.warning(
+                f"Not supported input data type! Only `str` is supported, but got: {inputs}."
+            )
+
+    @BaseBatchSampler.batch_size.setter
+    def batch_size(self, batch_size):
+        """Sets the batch size.
+
+        Args:
+            batch_size (int): The batch size to set.
+
+        Raises:
+            Warning: If the batch size is not equal 1.
+        """
+        # only support batch size 1
+        if batch_size != 1:
+            logging.warning(
+                f"text batch sampler only supports batch size 1, but got {batch_size}."
+            )
+        else:
+            self._batch_size = batch_size
diff --git a/paddlex/inference/common/result/__init__.py b/paddlex/inference/common/result/__init__.py
index 2a5a2521d1..273b919fbd 100644
--- a/paddlex/inference/common/result/__init__.py
+++ b/paddlex/inference/common/result/__init__.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from .base_audio_result import BaseAudioResult
 from .base_cv_result import BaseCVResult
 from .base_result import BaseResult
 from .base_ts_result import BaseTSResult
diff --git a/paddlex/inference/common/result/base_audio_result.py b/paddlex/inference/common/result/base_audio_result.py
new file mode 100644
index 0000000000..71db8e7d43
--- /dev/null
+++ b/paddlex/inference/common/result/base_audio_result.py
@@ -0,0 +1,36 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .base_result import BaseResult
+from .mixin import AudioMixin
+
+
+class BaseAudioResult(BaseResult, AudioMixin):
+    """Base class for audio results."""
+
+    INPUT_AUDIO_KEY = "input_audio"
+
+    def __init__(self, data: dict) -> None:
+        """
+        Initialize the BaseAudioResult.
+
+        Args:
+            data (dict): The initial data.
+
+        Raises:
+            AssertionError: If the required key (`BaseAudioResult.INPUT_AUDIO_KEY`) is not found in the data.
+ """ + + super().__init__(data) + AudioMixin.__init__(self, "wav") diff --git a/paddlex/inference/common/result/base_cv_result.py b/paddlex/inference/common/result/base_cv_result.py index 189e708fb3..13cbadd7ad 100644 --- a/paddlex/inference/common/result/base_cv_result.py +++ b/paddlex/inference/common/result/base_cv_result.py @@ -36,5 +36,7 @@ def _get_input_fn(self): if (page_idx := self.get("page_index", None)) is not None: fp = Path(fn) stem, suffix = fp.stem, fp.suffix + if isinstance(page_idx, list): + page_idx = f"{page_idx[0]}-{page_idx[-1]}" fn = f"{stem}_{page_idx}{suffix}" return fn diff --git a/paddlex/inference/common/result/base_result.py b/paddlex/inference/common/result/base_result.py index 66c62e7164..3c119f39f8 100644 --- a/paddlex/inference/common/result/base_result.py +++ b/paddlex/inference/common/result/base_result.py @@ -15,6 +15,8 @@ import inspect import random import time +import weakref +from collections import UserList from pathlib import Path import numpy as np @@ -23,6 +25,53 @@ from .mixin import JsonMixin, StrMixin +class CopyableWeakMethod(weakref.WeakMethod): + """ + A weak method that can be deep copied. + """ + + def __copy__(self): + return self + + def __deepcopy__(self, memo): + return self.__copy__() + + +class AutoWeakList(UserList): + """ + A list that automatically removes weak references to items. + """ + + def append(self, item): + """ + Append item to list. + If item is a bound method, append a weak reference to the method. + Otherwise, append the item itself. 
+ """ + if inspect.ismethod(item): + super().append(CopyableWeakMethod(item)) + else: + super().append(item) + + def __iter__(self): + """Iterate over items in the list.""" + for item in self.data: + if isinstance(item, CopyableWeakMethod): + func = item() + if func is not None: + yield func + else: + yield item + + def __getitem__(self, index): + """Get item at index.""" + item = super().__getitem__(index) + if isinstance(item, CopyableWeakMethod): + func = item() + return func + return item + + class BaseResult(dict, JsonMixin, StrMixin): """Base class for result objects that can save themselves. @@ -36,7 +85,7 @@ def __init__(self, data: dict) -> None: data (dict): The initial data. """ super().__init__(data) - self._save_funcs = [] + self._save_funcs = AutoWeakList() StrMixin.__init__(self) JsonMixin.__init__(self) np.set_printoptions(threshold=1, edgeitems=1) @@ -68,5 +117,9 @@ def _get_input_fn(self): ) self._rand_fn = Path(fp).name return self._rand_fn - fp = self["input_path"] + if isinstance(self["input_path"], list): + input_path = self["input_path"][0] + else: + input_path = self["input_path"] + fp = input_path return Path(fp).name diff --git a/paddlex/inference/common/result/mixin.py b/paddlex/inference/common/result/mixin.py index 846bf645b3..8c503c4e05 100644 --- a/paddlex/inference/common/result/mixin.py +++ b/paddlex/inference/common/result/mixin.py @@ -28,6 +28,7 @@ from ....utils import logging from ...utils.io import ( + AudioWriter, CSVWriter, HtmlWriter, ImageWriter, @@ -546,8 +547,10 @@ def _format_data(obj): Returns: Any: The formatted object. 
""" - if isinstance(obj, np.float32): + if isinstance(obj, (np.float32, np.float64)): return float(obj) + elif isinstance(obj, (np.int32, np.int64)): + return int(obj) elif isinstance(obj, np.ndarray): return [_format_data(item) for item in obj.tolist()] elif isinstance(obj, pd.DataFrame): @@ -611,6 +614,7 @@ def _is_json_file(file_path): return mime_type is not None and mime_type == "application/json" json_data = self._to_json() + if not _is_json_file(save_path): fn = Path(self._get_input_fn()) stem = fn.stem @@ -630,6 +634,7 @@ def _is_json_file(file_path): logging.warning( f"The result has multiple json files need to be saved. But the `save_path` has been specified as `{save_path}`!" ) + self._json_writer.write( save_path, json_data[list(json_data.keys())[0]], @@ -1061,6 +1066,72 @@ def _is_video_file(file_path): video_writer.write(save_path, video[list(video.keys())[0]], *args, **kwargs) +class AudioMixin: + """Mixin class for adding Audio handling capabilities.""" + + def __init__(self, backend, *args: List, **kwargs: Dict) -> None: + """Initializes AudioMixin. + + Args: + *args: Additional positional arguments to pass to the AudioWriter. + **kwargs: Additional keyword arguments to pass to the AudioWriter. + """ + self._backend = backend + self._save_funcs.append(self.save_to_audio) + self._audio_writer = AudioWriter(backend=self._backend, *args, **kwargs) + + @abstractmethod + def _to_audio(self) -> Dict[str, np.array]: + """Abstract method to convert the result to a audio. + + Returns: + Dict[str, np.array]: The audio representation result. + """ + raise NotImplementedError + + @property + def audio(self) -> Dict[str, np.array]: + """Property to get the audio representation of the result. + + Returns: + Dict[str, np.array]: The audio representation of the result. + """ + return self._to_audio() + + def save_to_audio(self, save_path: str, *args: List, **kwargs: Dict) -> None: + """Saves the audio representation of the result to the specified path. 
+
+        Args:
+            save_path (str): The path to save the audio. If the save path does not have an audio file extension (e.g. .wav), it appends the input path's stem and an audio suffix to the save path.
+            *args: Additional positional arguments that will be passed to the audio writer.
+            **kwargs: Additional keyword arguments that will be passed to the audio writer.
+        """
+
+        def _is_audio_file(file_path):
+            mime_type, _ = mimetypes.guess_type(file_path)
+            return mime_type is not None and mime_type.startswith("audio/")
+
+        audio = self._to_audio()
+        if not _is_audio_file(save_path):
+            fn = Path(self._get_input_fn())
+            stem = fn.stem
+            suffix = fn.suffix if _is_audio_file(fn) else ".wav"
+            base_save_path = Path(save_path)
+            for key in audio:
+                save_path = base_save_path / f"{stem}_{key}{suffix}"
+                self._audio_writer.write(
+                    save_path.as_posix(), audio[key], *args, **kwargs
+                )
+        else:
+            if len(audio) > 1:
+                logging.warning(
+                    f"The result has multiple audio files need to be saved. But the `save_path` has been specified as `{save_path}`!"
+ ) + self._audio_writer.write( + save_path, audio[list(audio.keys())[0]], *args, **kwargs + ) + + class MarkdownMixin: """Mixin class for adding Markdown handling capabilities.""" @@ -1174,9 +1245,10 @@ def _save_data( if isinstance(value, dict): base_save_path = save_path.parent for img_path, img_data in value.items(): - save_img_func( - (base_save_path / img_path).as_posix(), - img_data, - *args, - **kwargs, - ) + if img_data: + save_img_func( + (base_save_path / img_path).as_posix(), + img_data, + *args, + **kwargs, + ) diff --git a/paddlex/inference/genai/backends/sglang.py b/paddlex/inference/genai/backends/sglang.py index 5b28b7831a..2e3c2074ff 100644 --- a/paddlex/inference/genai/backends/sglang.py +++ b/paddlex/inference/genai/backends/sglang.py @@ -47,7 +47,7 @@ def run_sglang_server(host, port, model_name, model_dir, config, chat_template_p set_config_defaults, update_backend_config, ) - from paddlex.inference.genai.models import get_model_components + from paddlex.inference.genai.models import get_model_components, is_integrated_model_available from sglang.srt.configs.model_config import multimodal_model_archs from sglang.srt.entrypoints.http_server import launch_server from sglang.srt.managers.multimodal_processor import PROCESSOR_MAPPING @@ -64,11 +64,12 @@ def run_sglang_server(host, port, model_name, model_dir, config, chat_template_p config = data["config"] chat_template_path = data["chat_template_path"] - network_class, processor_class = get_model_components(model_name, "sglang") + if not is_integrated_model_available(model_name, "sglang"): + network_class, processor_class = get_model_components(model_name, "sglang") - ModelRegistry.models[network_class.__name__] = network_class - multimodal_model_archs.append(network_class.__name__) - PROCESSOR_MAPPING[network_class] = processor_class + ModelRegistry.models[network_class.__name__] = network_class + multimodal_model_archs.append(network_class.__name__) + PROCESSOR_MAPPING[network_class] = 
processor_class set_config_defaults(config, {{"served-model-name": model_name}}) diff --git a/paddlex/inference/genai/backends/vllm.py b/paddlex/inference/genai/backends/vllm.py index 35a1b77eaa..a102b59785 100644 --- a/paddlex/inference/genai/backends/vllm.py +++ b/paddlex/inference/genai/backends/vllm.py @@ -12,21 +12,31 @@ # See the License for the specific language governing permissions and # limitations under the License. -from ....utils.deps import is_genai_engine_plugin_available, require_genai_engine_plugin +from ....utils import logging +from ....utils.deps import ( + get_dep_version, + is_genai_engine_plugin_available, + require_genai_engine_plugin, +) from ..configs.utils import ( backend_config_to_args, set_config_defaults, update_backend_config, ) -from ..models import ALL_MODEL_NAMES, get_model_components +from ..models import ALL_MODEL_INFO, get_model_components, is_integrated_model_available def register_models(): from vllm import ModelRegistry + vllm_version = get_dep_version("vllm") + if is_genai_engine_plugin_available("vllm-server"): - for model_name in ALL_MODEL_NAMES: - if model_name not in ModelRegistry.get_supported_archs(): + for model_name in ALL_MODEL_INFO: + if ( + not is_integrated_model_available(model_name, "vllm") + and model_name not in ModelRegistry.get_supported_archs() + ): net_cls, _ = get_model_components(model_name, "vllm") ModelRegistry.register_model(net_cls.__name__, net_cls) @@ -61,6 +71,16 @@ def run_vllm_server(host, port, model_name, model_dir, config, chat_template_pat }, ) + import torch + + if torch.version.hip is not None and torch.version.cuda is None: + # For DCU + if "api-server-count" in config: + logging.warning( + "Key 'api-server-count' will be popped as it is not supported" + ) + config.pop("api-server-count") + args = backend_config_to_args(config) args = parser.parse_args(args) validate_parsed_serve_args(args) diff --git a/paddlex/inference/genai/chat_templates/PaddleOCR-VL-0.9B.jinja 
b/paddlex/inference/genai/chat_templates/PaddleOCR-VL-0.9B.jinja index 3583ca3e6e..7b55077ebf 100644 --- a/paddlex/inference/genai/chat_templates/PaddleOCR-VL-0.9B.jinja +++ b/paddlex/inference/genai/chat_templates/PaddleOCR-VL-0.9B.jinja @@ -7,16 +7,13 @@ {%- if not eos_token is defined -%} {%- set eos_token = "" -%} {%- endif -%} -{%- if not image_token is defined -%} - {%- set image_token = "<|IMAGE_START|><|IMAGE_PLACEHOLDER|><|IMAGE_END|>" -%} -{%- endif -%} {{- cls_token -}} {%- for message in messages -%} {%- if message["role"] == "user" -%} {{- "User: " -}} {%- for content in message["content"] -%} {%- if content["type"] == "image" -%} - {{ image_token }} + {{ "<|IMAGE_START|><|IMAGE_PLACEHOLDER|><|IMAGE_END|>" }} {%- endif -%} {%- endfor -%} {%- for content in message["content"] -%} diff --git a/paddlex/inference/genai/chat_templates/PaddleOCR-VL-1.5-0.9B.jinja b/paddlex/inference/genai/chat_templates/PaddleOCR-VL-1.5-0.9B.jinja new file mode 100644 index 0000000000..d8b8c271d7 --- /dev/null +++ b/paddlex/inference/genai/chat_templates/PaddleOCR-VL-1.5-0.9B.jinja @@ -0,0 +1,43 @@ +{%- if not add_generation_prompt is defined -%} + {%- set add_generation_prompt = true -%} +{%- endif -%} +{%- if not cls_token is defined -%} + {%- set cls_token = "<|begin_of_sentence|>" -%} +{%- endif -%} +{%- if not eos_token is defined -%} + {%- set eos_token = "" -%} +{%- endif -%} +{{- cls_token -}} +{%- for message in messages -%} + {%- if message["role"] == "user" -%} + {{- "User: " -}} + {%- for content in message["content"] -%} + {%- if content["type"] == "image" -%} + {{ "<|IMAGE_START|><|IMAGE_PLACEHOLDER|><|IMAGE_END|>" }} + {%- endif -%} + {%- endfor -%} + {%- for content in message["content"] -%} + {%- if content["type"] == "text" -%} + {{ content["text"] }} + {%- endif -%} + {%- endfor -%} + {{ "\n" -}} + {%- elif message["role"] == "assistant" -%} + {{- "Assistant:\n" -}} + {%- for content in message["content"] -%} + {%- if content["type"] == "text" -%} + {{ 
content["text"] }} + {%- endif -%} + {%- endfor -%} + {{ eos_token -}} + {%- elif message["role"] == "system" -%} + {%- for content in message["content"] -%} + {%- if content["type"] == "text" -%} + {{ content["text"] + "\n" }} + {%- endif -%} + {%- endfor -%} + {%- endif -%} +{%- endfor -%} +{%- if add_generation_prompt -%} + {{- "Assistant:\n" -}} +{%- endif -%} diff --git a/paddlex/inference/genai/configs/paddleocr_vl_09b.py b/paddlex/inference/genai/configs/paddleocr_vl_09b.py index cee4c76059..b403617e29 100644 --- a/paddlex/inference/genai/configs/paddleocr_vl_09b.py +++ b/paddlex/inference/genai/configs/paddleocr_vl_09b.py @@ -13,16 +13,35 @@ # limitations under the License. +from ....utils.deps import require_deps + +__all__ = ["get_config"] + +# TODO: Allow setting `trust-remote-code` to `False` to use `transformers` processors + + def get_config(backend): if backend == "fastdeploy": - return { + require_deps("paddlepaddle") + + import paddle.device + + cfg = { "gpu-memory-utilization": 0.7, "max-model-len": 16384, "max-num-batched-tokens": 16384, "max-num-seqs": 256, - "workers": 2, - "graph-optimization-config": '{"graph_opt_level":0, "use_cudagraph":true}', + "workers": 4, } + if paddle.device.is_compiled_with_cuda(): + cfg["graph-optimization-config"] = ( + '{"graph_opt_level":0, "use_cudagraph":true}' + ) + elif paddle.device.is_compiled_with_custom_device("iluvatar_gpu"): + cfg["block-size"] = 16 + cfg["max-num-seqs"] = 32 + cfg["max-concurrency"] = 2048 + return cfg elif backend == "vllm": return { "trust-remote-code": True, diff --git a/paddlex/inference/genai/configs/paddleocr_vl_15_09b.py b/paddlex/inference/genai/configs/paddleocr_vl_15_09b.py new file mode 100644 index 0000000000..4b2ef3c104 --- /dev/null +++ b/paddlex/inference/genai/configs/paddleocr_vl_15_09b.py @@ -0,0 +1,18 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from .paddleocr_vl_09b import get_config + +__all__ = ["get_config"] diff --git a/paddlex/inference/genai/configs/utils.py b/paddlex/inference/genai/configs/utils.py index 9df90aa91a..7f551759b1 100644 --- a/paddlex/inference/genai/configs/utils.py +++ b/paddlex/inference/genai/configs/utils.py @@ -32,10 +32,12 @@ def set_config_defaults(config, defaults): config[k] = v -def backend_config_to_args(config): +def backend_config_to_args(config, convert_underscores_to_dashes=True): # Limited support args = [] for k, v in config.items(): + if convert_underscores_to_dashes: + k = k.replace("_", "-") opt = "--" + k args.append(opt) if not isinstance(v, bool): diff --git a/paddlex/inference/genai/models/__init__.py b/paddlex/inference/genai/models/__init__.py index 93b194c364..232e530bea 100644 --- a/paddlex/inference/genai/models/__init__.py +++ b/paddlex/inference/genai/models/__init__.py @@ -17,6 +17,7 @@ from pathlib import Path from typing import Any, Dict, Optional, Type +from packaging.version import Version from pydantic import BaseModel from ....utils import logging @@ -29,16 +30,48 @@ CHAT_TEMPLATE_PATH_GETTER_KEY = "get_chat_template_path" DEFAULT_CHAT_TEMPLATE_FILENAME = "chat_template.jinja" -ALL_MODEL_NAMES = {"PaddleOCR-VL-0.9B"} +ALL_MODEL_INFO = { + "PaddleOCR-VL-0.9B": { + "min_vllm_version": "0.11.1", + "min_sglang_version": "0.5.7", + }, + "PaddleOCR-VL-1.5-0.9B": { + "min_vllm_version": "0.11.1", + 
"min_sglang_version": "0.5.7", + }, +} def _check_model_name_and_backend(model_name, backend): - if model_name not in ALL_MODEL_NAMES: + if model_name not in ALL_MODEL_INFO: raise ValueError(f"Unknown model: {model_name}") check_backend(backend) +def is_integrated_model_available(model_name, backend): + _check_model_name_and_backend(model_name, backend) + + model_info = ALL_MODEL_INFO[model_name] + + if f"min_{backend}_version" in model_info: + if backend == "vllm": + import vllm + + backend_lib_version = vllm.__version__ + elif backend == "sglang": + import sglang + + backend_lib_version = sglang.__version__ + else: + backend_lib_version = "0.0.0" + return Version(backend_lib_version) >= Version( + model_info[f"min_{backend}_version"] + ) + + return False + + def get_model_dir(model_name, backend): _check_model_name_and_backend(model_name, backend) diff --git a/paddlex/inference/genai/models/paddleocr_vl_09b/__init__.py b/paddlex/inference/genai/models/paddleocr_vl_09b/__init__.py index 266924f2be..d2afcfdaf1 100644 --- a/paddlex/inference/genai/models/paddleocr_vl_09b/__init__.py +++ b/paddlex/inference/genai/models/paddleocr_vl_09b/__init__.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+__all__ = ["get_network_class", "get_processor_class"] + def get_network_class(backend): if backend == "vllm": diff --git a/paddlex/inference/genai/models/paddleocr_vl_09b/_vllm.py b/paddlex/inference/genai/models/paddleocr_vl_09b/_vllm.py index b42c395df9..a570b4d979 100644 --- a/paddlex/inference/genai/models/paddleocr_vl_09b/_vllm.py +++ b/paddlex/inference/genai/models/paddleocr_vl_09b/_vllm.py @@ -21,9 +21,7 @@ from .....utils.deps import is_dep_available -if all( - map(is_dep_available, ("einops", "torch", "transformers", "vllm", "flash-attn")) -): +if all(map(is_dep_available, ("einops", "torch", "transformers", "vllm"))): import torch import torch.nn as nn from einops import rearrange, repeat diff --git a/paddlex/inference/genai/models/paddleocr_vl_15_09b/__init__.py b/paddlex/inference/genai/models/paddleocr_vl_15_09b/__init__.py new file mode 100644 index 0000000000..ccc42c169e --- /dev/null +++ b/paddlex/inference/genai/models/paddleocr_vl_15_09b/__init__.py @@ -0,0 +1,18 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from ..paddleocr_vl_09b import get_network_class, get_processor_class + +__all__ = ["get_network_class", "get_processor_class"] diff --git a/paddlex/inference/models/__init__.py b/paddlex/inference/models/__init__.py index 449edaeb95..154cef49e8 100644 --- a/paddlex/inference/models/__init__.py +++ b/paddlex/inference/models/__init__.py @@ -20,9 +20,6 @@ from ...utils import errors from ..utils.hpi import HPIConfig from ..utils.official_models import official_models - -# from .table_recognition import TablePredictor -# from .general_recognition import ShiTuRecPredictor from .anomaly_detection import UadPredictor from .base import BasePredictor from .common.genai import GenAIConfig, need_local_model @@ -35,9 +32,8 @@ from .image_unwarping import WarpPredictor from .instance_segmentation import InstanceSegPredictor from .keypoint_detection import KptPredictor +from .layout_analysis import LayoutAnalysisPredictor from .m_3d_bev_detection import BEVDet3DPredictor - -# from .face_recognition import FaceRecPredictor from .multilingual_speech_recognition import WhisperPredictor from .object_detection import DetPredictor from .open_vocabulary_detection import OVDetPredictor @@ -46,6 +42,9 @@ from .table_structure_recognition import TablePredictor from .text_detection import TextDetPredictor from .text_recognition import TextRecPredictor +from .text_to_pinyin import TextToPinyinPredictor +from .text_to_speech_acoustic import Fastspeech2Predictor +from .text_to_speech_vocoder import PwganPredictor from .ts_anomaly_detection import TSAdPredictor from .ts_classification import TSClsPredictor from .ts_forecasting import TSFcPredictor diff --git a/paddlex/inference/models/base/predictor/base_predictor.py b/paddlex/inference/models/base/predictor/base_predictor.py index 8f08d05bc6..139661fa2f 100644 --- a/paddlex/inference/models/base/predictor/base_predictor.py +++ b/paddlex/inference/models/base/predictor/base_predictor.py @@ -35,6 +35,7 @@ from ....utils.benchmark import 
ENTRY_POINT_NAME, benchmark from ....utils.hpi import HPIConfig, HPIInfo from ....utils.io import YAMLReader +from ....utils.model_paths import get_model_paths from ....utils.pp_option import PaddlePredictorOption from ...common import HPInfer, PaddleInfer from ...common.genai import GenAIClient, GenAIConfig, need_local_model @@ -132,12 +133,13 @@ def __init__( self.config = config self._genai_config = genai_config assert genai_config.server_url is not None + client_kwargs = {"model_name": model_name} + client_kwargs.update(genai_config.client_kwargs or {}) self._genai_client = GenAIClient( backend=genai_config.backend, base_url=genai_config.server_url, max_concurrency=genai_config.max_concurrency, - model_name=model_name, - **(genai_config.client_kwargs or {}), + **client_kwargs, ) self._use_local_model = False @@ -157,13 +159,20 @@ def __init__( if self._use_local_model: self._use_hpip = use_hpip - if not use_hpip: - self._pp_option = self._prepare_pp_option(pp_option, device) + model_paths = get_model_paths(self.model_dir, self.MODEL_FILE_PREFIX) + if "paddle_dyn" in model_paths or "safetensors" in model_paths: + self._use_static_model = False else: - require_hpip() - self._hpi_config = self._prepare_hpi_config(hpi_config, device) + self._use_static_model = True + if self._use_static_model: + if not use_hpip: + self._pp_option = self._prepare_pp_option(pp_option, device) + else: + require_hpip() + self._hpi_config = self._prepare_hpi_config(hpi_config, device) else: self._use_hpip = False + self._use_static_model = False logging.debug(f"{self.__class__.__name__}: {self.model_dir}") diff --git a/paddlex/inference/models/common/static_infer.py b/paddlex/inference/models/common/static_infer.py index 9009cdad59..b68375a1f4 100644 --- a/paddlex/inference/models/common/static_infer.py +++ b/paddlex/inference/models/common/static_infer.py @@ -29,7 +29,7 @@ DISABLE_TRT_MODEL_BL, USE_PIR_TRT, ) -from ...utils.benchmark import benchmark, set_inference_operations +from 
...utils.benchmark import add_inference_operations, benchmark from ...utils.hpi import ( HPIConfig, OMConfig, @@ -50,7 +50,7 @@ "PaddleInferChainLegacy", "MultiBackendInfer", ] -set_inference_operations(INFERENCE_OPERATIONS) +add_inference_operations(*INFERENCE_OPERATIONS) # XXX: Better use Paddle Inference API to do this @@ -91,6 +91,7 @@ def _collect_trt_shape_range_info( config.collect_shape_range_info(shape_range_info_path) # TODO: Add other needed options config.disable_glog_info() + config.delete_pass("matmul_add_act_fuse_pass") predictor = paddle.inference.create_predictor(config) input_names = predictor.get_input_names() @@ -358,7 +359,8 @@ def _create( logging.debug("`device_id` has been set to None") if ( - self._option.device_type in ("gpu", "dcu", "npu", "mlu", "gcu", "xpu", "iluvatar_gpu") + self._option.device_type + in ("gpu", "dcu", "npu", "mlu", "gcu", "xpu", "iluvatar_gpu", "metax_gpu") and self._option.device_id is None ): self._option.device_id = 0 @@ -399,6 +401,11 @@ def _create( if hasattr(config, "enable_new_executor"): config.enable_new_executor() config.set_optimization_level(3) + config.delete_pass("matmul_add_act_fuse_pass") + # ROCm does not support fused_conv2d_add_act kernel, delete the fuse passes + if paddle.is_compiled_with_rocm(): + config.delete_pass("conv2d_add_act_fuse_pass") + config.delete_pass("conv2d_add_fuse_pass") elif self._option.device_type == "npu": config.enable_custom_device("npu", self._option.device_id) if hasattr(config, "enable_new_ir"): @@ -420,6 +427,12 @@ def _create( config.enable_new_ir(self._option.enable_new_ir) if hasattr(config, "enable_new_executor"): config.enable_new_executor() + elif self._option.device_type == "metax_gpu": + config.enable_custom_device("metax_gpu", self._option.device_id) + if hasattr(config, "enable_new_ir"): + config.enable_new_ir(self._option.enable_new_ir) + if hasattr(config, "enable_new_executor"): + config.enable_new_executor() elif self._option.device_type == "gcu": from 
paddle_custom_device.gcu import passes as gcu_passes @@ -480,6 +493,10 @@ def _create( if not DEBUG: config.disable_glog_info() + # ROCm does not support fused_conv2d_add_act kernel + config.delete_pass("conv2d_add_act_fuse_pass") + config.delete_pass("conv2d_add_fuse_pass") + predictor = paddle.inference.create_predictor(config) return predictor diff --git a/paddlex/inference/models/common/tokenizer/tokenizer_utils.py b/paddlex/inference/models/common/tokenizer/tokenizer_utils.py index a34f3380bb..80d9e73792 100644 --- a/paddlex/inference/models/common/tokenizer/tokenizer_utils.py +++ b/paddlex/inference/models/common/tokenizer/tokenizer_utils.py @@ -943,8 +943,6 @@ def init_chat_template(self, chat_template: Union[str, dict]): raise ValueError("Receive error chat_template data: ", chat_template) def save_resources(self, save_directory): - super().save_resources(save_directory) - if isinstance( self.chat_template, ChatTemplate ): # Future remove if ChatTemplate is deprecated diff --git a/paddlex/inference/models/common/tokenizer/tokenizer_utils_base.py b/paddlex/inference/models/common/tokenizer/tokenizer_utils_base.py index 126d44dcff..1dd69c22eb 100644 --- a/paddlex/inference/models/common/tokenizer/tokenizer_utils_base.py +++ b/paddlex/inference/models/common/tokenizer/tokenizer_utils_base.py @@ -36,6 +36,8 @@ import numpy as np from .....utils import logging +from .....utils.cache import CACHE_DIR +from .....utils.download import download __all__ = [ "AddedToken", @@ -1661,7 +1663,15 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): resolved_vocab_files = {} for file_id, file_path in vocab_files.items(): # adapt to PaddleX - resolved_vocab_files[file_id] = file_path + if file_path is None or os.path.isfile(file_path): + resolved_vocab_files[file_id] = file_path + continue + else: + download_path = os.path.join( + CACHE_DIR, "official_models", pretrained_model_name_or_path, file_id + ) + download(file_path, download_path) + 
resolved_vocab_files[file_id] = download_path for file_id, file_path in resolved_vocab_files.items(): if resolved_vocab_files[file_id] is not None: diff --git a/paddlex/inference/models/common/vlm/__init__.py b/paddlex/inference/models/common/transformers/__init__.py similarity index 100% rename from paddlex/inference/models/common/vlm/__init__.py rename to paddlex/inference/models/common/transformers/__init__.py diff --git a/paddlex/inference/models/common/vlm/activations.py b/paddlex/inference/models/common/transformers/activations.py similarity index 100% rename from paddlex/inference/models/common/vlm/activations.py rename to paddlex/inference/models/common/transformers/activations.py diff --git a/paddlex/inference/models/common/vlm/bert_padding.py b/paddlex/inference/models/common/transformers/bert_padding.py similarity index 100% rename from paddlex/inference/models/common/vlm/bert_padding.py rename to paddlex/inference/models/common/transformers/bert_padding.py diff --git a/paddlex/inference/models/common/vlm/conversion_utils.py b/paddlex/inference/models/common/transformers/conversion_utils.py similarity index 100% rename from paddlex/inference/models/common/vlm/conversion_utils.py rename to paddlex/inference/models/common/transformers/conversion_utils.py diff --git a/paddlex/inference/models/common/vlm/distributed.py b/paddlex/inference/models/common/transformers/distributed.py similarity index 100% rename from paddlex/inference/models/common/vlm/distributed.py rename to paddlex/inference/models/common/transformers/distributed.py diff --git a/paddlex/inference/models/common/vlm/flash_attn_utils.py b/paddlex/inference/models/common/transformers/flash_attn_utils.py similarity index 100% rename from paddlex/inference/models/common/vlm/flash_attn_utils.py rename to paddlex/inference/models/common/transformers/flash_attn_utils.py diff --git a/paddlex/inference/models/common/vlm/fusion_ops.py b/paddlex/inference/models/common/transformers/fusion_ops.py 
similarity index 98% rename from paddlex/inference/models/common/vlm/fusion_ops.py rename to paddlex/inference/models/common/transformers/fusion_ops.py index b692223956..1a34f5b9fd 100644 --- a/paddlex/inference/models/common/vlm/fusion_ops.py +++ b/paddlex/inference/models/common/transformers/fusion_ops.py @@ -47,6 +47,8 @@ def get_env_device(): return "mlu" elif "gcu" in paddle.device.get_all_custom_device_type(): return "gcu" + elif "metax_gpu" in paddle.device.get_all_custom_device_type(): + return "metax_gpu" elif "intel_hpu" in paddle.device.get_all_custom_device_type(): return "intel_hpu" elif "iluvatar_gpu" in paddle.device.get_all_custom_device_type(): diff --git a/paddlex/inference/models/common/vlm/generation/__init__.py b/paddlex/inference/models/common/transformers/generation/__init__.py similarity index 100% rename from paddlex/inference/models/common/vlm/generation/__init__.py rename to paddlex/inference/models/common/transformers/generation/__init__.py diff --git a/paddlex/inference/models/common/vlm/generation/configuration_utils.py b/paddlex/inference/models/common/transformers/generation/configuration_utils.py similarity index 100% rename from paddlex/inference/models/common/vlm/generation/configuration_utils.py rename to paddlex/inference/models/common/transformers/generation/configuration_utils.py diff --git a/paddlex/inference/models/common/vlm/generation/logits_process.py b/paddlex/inference/models/common/transformers/generation/logits_process.py similarity index 100% rename from paddlex/inference/models/common/vlm/generation/logits_process.py rename to paddlex/inference/models/common/transformers/generation/logits_process.py diff --git a/paddlex/inference/models/common/vlm/generation/stopping_criteria.py b/paddlex/inference/models/common/transformers/generation/stopping_criteria.py similarity index 100% rename from paddlex/inference/models/common/vlm/generation/stopping_criteria.py rename to 
paddlex/inference/models/common/transformers/generation/stopping_criteria.py diff --git a/paddlex/inference/models/common/vlm/generation/utils.py b/paddlex/inference/models/common/transformers/generation/utils.py similarity index 100% rename from paddlex/inference/models/common/vlm/generation/utils.py rename to paddlex/inference/models/common/transformers/generation/utils.py diff --git a/paddlex/inference/models/common/transformers/transformers/__init__.py b/paddlex/inference/models/common/transformers/transformers/__init__.py new file mode 100644 index 0000000000..29cfe096eb --- /dev/null +++ b/paddlex/inference/models/common/transformers/transformers/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from .configuration_utils import PretrainedConfig +from .hf_state_dict_utils import BatchNormHFStateDictMixin +from .model_utils import PretrainedModel diff --git a/paddlex/inference/models/common/vlm/transformers/configuration_utils.py b/paddlex/inference/models/common/transformers/transformers/configuration_utils.py similarity index 100% rename from paddlex/inference/models/common/vlm/transformers/configuration_utils.py rename to paddlex/inference/models/common/transformers/transformers/configuration_utils.py diff --git a/paddlex/inference/models/common/vlm/transformers/conversion_utils.py b/paddlex/inference/models/common/transformers/transformers/conversion_utils.py similarity index 100% rename from paddlex/inference/models/common/vlm/transformers/conversion_utils.py rename to paddlex/inference/models/common/transformers/transformers/conversion_utils.py diff --git a/paddlex/inference/models/common/transformers/transformers/hf_state_dict_utils.py b/paddlex/inference/models/common/transformers/transformers/hf_state_dict_utils.py new file mode 100644 index 0000000000..e293e4361b --- /dev/null +++ b/paddlex/inference/models/common/transformers/transformers/hf_state_dict_utils.py @@ -0,0 +1,64 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +class BatchNormHFStateDictMixin: + def _get_forward_key_rules(self): + return [ + ("_mean", "_mean", "running_mean"), + ("_variance", "_variance", "running_var"), + ] + + def _get_reverse_key_rules(self): + return [ + ("running_mean", "running_mean", "_mean"), + ("running_var", "running_var", "_variance"), + ] + + def get_hf_state_dict(self, *args, **kwargs): + + try: + super().get_hf_state_dict(*args, **kwargs) + except NotImplementedError: + pass + + model_state_dict = self.state_dict(*args, **kwargs) + hf_state_dict = {} + rules = self._get_forward_key_rules() + for old_key, value in model_state_dict.items(): + new_key = old_key + for match_key, old_sub, new_sub in rules: + if match_key in old_key: + new_key = old_key.replace(old_sub, new_sub) + break + hf_state_dict[new_key] = value + return hf_state_dict + + def set_hf_state_dict(self, state_dict, *args, **kwargs): + + try: + super().set_hf_state_dict(state_dict, *args, **kwargs) + except NotImplementedError: + pass + + key_mapping = {} + rules = self._get_reverse_key_rules() + for old_key in list(state_dict.keys()): + for match_key, old_sub, new_sub in rules: + if match_key in old_key: + key_mapping[old_key] = old_key.replace(old_sub, new_sub) + break + for old_key, new_key in key_mapping.items(): + state_dict[new_key] = state_dict.pop(old_key) + return self.set_state_dict(state_dict, *args, **kwargs) diff --git a/paddlex/inference/models/common/vlm/transformers/model_outputs.py b/paddlex/inference/models/common/transformers/transformers/model_outputs.py similarity index 100% rename from paddlex/inference/models/common/vlm/transformers/model_outputs.py rename to paddlex/inference/models/common/transformers/transformers/model_outputs.py diff --git a/paddlex/inference/models/common/vlm/transformers/model_utils.py b/paddlex/inference/models/common/transformers/transformers/model_utils.py similarity index 98% rename from paddlex/inference/models/common/vlm/transformers/model_utils.py rename to 
paddlex/inference/models/common/transformers/transformers/model_utils.py index e55877e296..0df07bddc3 100644 --- a/paddlex/inference/models/common/vlm/transformers/model_utils.py +++ b/paddlex/inference/models/common/transformers/transformers/model_utils.py @@ -203,11 +203,16 @@ def _transpose_hf_weight(key, weight): else: weight = tp_fn(py_safe_slice_) else: - weight = py_safe_slice_[:] + # HACK + if len(py_safe_slice_.get_shape()) == 0: + logging.debug("Ignore empty shape this moment") + else: + weight = py_safe_slice_[:] if not return_numpy and device == "expected": weight = weight._copy_to( - paddle.framework._current_expected_place(), False + paddle.framework._current_expected_place(), + True, ) weight = _transpose_hf_weight(key, weight) if return_numpy: @@ -252,17 +257,18 @@ def load_state_dict( from safetensors import safe_open with safe_open(checkpoint_file, framework="paddle") as f: - state_dict, scale_dict = _load_part_state_dict_from_safetensors( - list(f.keys()), - checkpoint_file, - tensor_parallel_split_mapping, - fliter_dict_keys, - "expected", - dtype=None, - return_numpy=False, - convert_from_hf=convert_from_hf, - transpose_weight_keys=transpose_weight_keys, - ) + keys = list(f.keys()) + state_dict, scale_dict = _load_part_state_dict_from_safetensors( + keys, + checkpoint_file, + tensor_parallel_split_mapping, + fliter_dict_keys, + "expected", + dtype=None, + return_numpy=False, + convert_from_hf=convert_from_hf, + transpose_weight_keys=transpose_weight_keys, + ) else: state_dict = paddlenlp_load(checkpoint_file, map_location="cpu") return state_dict @@ -1841,13 +1847,12 @@ def from_pretrained( ): raise NotImplementedError else: - try: - transpose_weight_keys = model.get_transpose_weight_keys() - except NotImplementedError: - if convert_from_hf: - raise ValueError("`convert_from_hf=True` is not supported") - else: - transpose_weight_keys = None + transpose_weight_keys = None + if convert_from_hf: + try: + transpose_weight_keys = 
model.get_transpose_weight_keys() + except NotImplementedError: + pass state_dict = load_state_dict( resolved_archive_file, convert_from_hf=convert_from_hf, diff --git a/paddlex/inference/models/common/vlm/transformers/utils.py b/paddlex/inference/models/common/transformers/transformers/utils.py similarity index 100% rename from paddlex/inference/models/common/vlm/transformers/utils.py rename to paddlex/inference/models/common/transformers/transformers/utils.py diff --git a/paddlex/inference/models/common/vlm/utils.py b/paddlex/inference/models/common/transformers/utils.py similarity index 98% rename from paddlex/inference/models/common/vlm/utils.py rename to paddlex/inference/models/common/transformers/utils.py index 43cb7e0106..52c2f5b11f 100644 --- a/paddlex/inference/models/common/vlm/utils.py +++ b/paddlex/inference/models/common/transformers/utils.py @@ -106,4 +106,6 @@ def get_env_device(): return "rocm" elif paddle.is_compiled_with_xpu(): return "xpu" + elif paddle.is_compiled_with_maca(): + return "metax_gpu" return "cpu" diff --git a/paddlex/inference/models/doc_vlm/modeling/GOT_ocr_2_0.py b/paddlex/inference/models/doc_vlm/modeling/GOT_ocr_2_0.py index 6231fca77a..2c6c3f5b9d 100644 --- a/paddlex/inference/models/doc_vlm/modeling/GOT_ocr_2_0.py +++ b/paddlex/inference/models/doc_vlm/modeling/GOT_ocr_2_0.py @@ -19,7 +19,8 @@ import paddle.nn as nn import paddle.nn.functional as F -from ...common.vlm.transformers.model_outputs import CausalLMOutputWithPast +from ...common.transformers.transformers import BatchNormHFStateDictMixin +from ...common.transformers.transformers.model_outputs import CausalLMOutputWithPast from .qwen2 import Qwen2Config, Qwen2ForCausalLM, Qwen2Model @@ -811,7 +812,30 @@ def prepare_inputs_for_generation( return model_inputs -class PPChart2TableInference(GOTQwenForCausalLM): +class PPChart2TableInference(BatchNormHFStateDictMixin, GOTQwenForCausalLM): + + def get_transpose_weight_keys(self): + t_layers = [ + "q_proj", + "k_proj", + 
"v_proj", + "gate_proj", + "up_proj", + "down_proj", + "o_proj", + "lm_head", + "attn.qkv", + "mlp.lin1", + "mlp.lin2", + "attn.proj", + "mm_projector_vary", + ] + keys = [] + for key, _ in self.get_hf_state_dict().items(): + for t_layer in t_layers: + if t_layer in key and key.endswith("weight"): + keys.append(key) + return keys def generate(self, inputs, **kwargs): max_new_tokens = kwargs.get("max_new_tokens", 1024) diff --git a/paddlex/inference/models/doc_vlm/modeling/paddleocr_vl/_config.py b/paddlex/inference/models/doc_vlm/modeling/paddleocr_vl/_config.py index 28971e626c..678b338d45 100644 --- a/paddlex/inference/models/doc_vlm/modeling/paddleocr_vl/_config.py +++ b/paddlex/inference/models/doc_vlm/modeling/paddleocr_vl/_config.py @@ -28,7 +28,7 @@ from ......utils.device import parse_device from ......utils.env import get_paddle_cuda_version -from ....common.vlm.transformers import PretrainedConfig +from ....common.transformers.transformers import PretrainedConfig class PaddleOCRVisionConfig(PretrainedConfig): @@ -176,6 +176,8 @@ def __init__( cuda_version = get_paddle_cuda_version() if cuda_version and cuda_version[0] > 11: self.fuse_rms_norm = True + if device_type == "metax_gpu": + self.fuse_rms_norm = True self.use_sparse_flash_attn = True self.use_var_len_flash_attn = False self.scale_qk_coeff = 1.0 diff --git a/paddlex/inference/models/doc_vlm/modeling/paddleocr_vl/_ernie.py b/paddlex/inference/models/doc_vlm/modeling/paddleocr_vl/_ernie.py index 7e43b3cff0..1d82d68dc2 100644 --- a/paddlex/inference/models/doc_vlm/modeling/paddleocr_vl/_ernie.py +++ b/paddlex/inference/models/doc_vlm/modeling/paddleocr_vl/_ernie.py @@ -39,8 +39,8 @@ from paddle.distributed.fleet.utils import recompute from ......utils import logging -from ....common.vlm.transformers import PretrainedModel -from ....common.vlm.transformers.model_outputs import ( +from ....common.transformers.transformers import PretrainedModel +from ....common.transformers.transformers.model_outputs 
import ( BaseModelOutputWithPastAndCrossAttentions, ) from ._config import PaddleOCRVLConfig @@ -296,7 +296,7 @@ def forward(self, hidden_states): 3. Scale by learned weight parameter - Maintains original dtype for numerical stability during computation """ - if self.config.fuse_rms_norm: + if hidden_states.dtype != paddle.float16 and self.config.fuse_rms_norm: return fused_rms_norm_ext( hidden_states, self.weight, self.variance_epsilon )[0].astype(self.weight.dtype) @@ -854,8 +854,15 @@ def core_attn( v = tensor.transpose(x=v, perm=perm) replicate = self.config.num_attention_heads // self.config.num_key_value_heads + is_float16 = k.dtype == paddle.float16 + if is_float16: + k = k.cast(paddle.float32) + v = v.cast(paddle.float32) k = paddle.repeat_interleave(k, replicate, axis=1) v = paddle.repeat_interleave(v, replicate, axis=1) + if is_float16: + k = k.cast(paddle.float16) + v = v.cast(paddle.float16) scale_qk_coeff = self.config.scale_qk_coeff * self.head_dim**0.5 product = paddle.matmul(x=q.scale(1.0 / scale_qk_coeff), y=k, transpose_y=True) diff --git a/paddlex/inference/models/doc_vlm/modeling/paddleocr_vl/_paddleocr_vl.py b/paddlex/inference/models/doc_vlm/modeling/paddleocr_vl/_paddleocr_vl.py index b8385c8f75..93b61b6cc4 100644 --- a/paddlex/inference/models/doc_vlm/modeling/paddleocr_vl/_paddleocr_vl.py +++ b/paddlex/inference/models/doc_vlm/modeling/paddleocr_vl/_paddleocr_vl.py @@ -41,8 +41,7 @@ import paddle import paddle.nn as nn -from ....common.vlm.generation import GenerationMixin -from ....common.vlm.transformers.model_outputs import ( +from ....common.transformers.transformers.model_outputs import ( CausalLMOutputWithCrossAttentions, ModelOutput, ) @@ -62,7 +61,7 @@ class PaddleOCRVLCausalLMOutputWithPast(ModelOutput): rope_deltas: Optional[paddle.Tensor] = None -class PaddleOCRVLForConditionalGeneration(Ernie4_5PretrainedModel, GenerationMixin): +class PaddleOCRVLForConditionalGeneration(Ernie4_5PretrainedModel): _tied_weights_keys = 
["lm_head.weight"] config_class = PaddleOCRVLConfig _no_split_modules = ["Ernie4_5DecoderLayer", "SiglipEncoderLayer"] diff --git a/paddlex/inference/models/doc_vlm/modeling/paddleocr_vl/_siglip.py b/paddlex/inference/models/doc_vlm/modeling/paddleocr_vl/_siglip.py index a4a3c4a0c1..3b77910ba9 100644 --- a/paddlex/inference/models/doc_vlm/modeling/paddleocr_vl/_siglip.py +++ b/paddlex/inference/models/doc_vlm/modeling/paddleocr_vl/_siglip.py @@ -35,6 +35,7 @@ # TODO: Weight initialization +import platform from typing import List, Optional, Tuple, Union import numpy as np @@ -42,9 +43,14 @@ import paddle.nn as nn import paddle.nn.functional as F -from ....common.vlm.activations import ACT2FN -from ....common.vlm.transformers import PretrainedModel -from ....common.vlm.transformers.model_outputs import ( +from ......utils.env import ( + get_device_type, + get_gpu_compute_capability, + get_paddle_cuda_version, +) +from ....common.transformers.activations import ACT2FN +from ....common.transformers.transformers import PretrainedModel +from ....common.transformers.transformers.model_outputs import ( BaseModelOutput, BaseModelOutputWithPooling, ) @@ -100,15 +106,22 @@ def eager_attention_forward( dropout: float = 0.0, **kwargs, ): - attn_weights = paddle.matmul(query, key.transpose((0, 1, 3, 2))) * scaling + origin_dtype = query.dtype + + attn_weights = paddle.matmul(x=query.scale(scaling), y=key, transpose_y=True) + attn_weights = attn_weights.cast(paddle.float32) + if attention_mask is not None: + attnetion_mask = attention_mask.cast(paddle.float32) attn_weights = attn_weights + attention_mask - attn_weights = F.softmax(attn_weights, axis=-1, dtype="float32").astype(query.dtype) + attn_weights = F.softmax(attn_weights, axis=-1) + attn_weights = attn_weights.cast(origin_dtype) + attn_weights = F.dropout(attn_weights, p=dropout, training=module.training) attn_output = paddle.matmul(attn_weights, value) - attn_output = attn_output.transpose((0, 2, 1, 3)).contiguous() + 
attn_output = attn_output.transpose((0, 2, 1, 3)) return attn_output, attn_weights @@ -130,6 +143,20 @@ def __init__(self, config): self.q_proj = nn.Linear(self.embed_dim, self.embed_dim) self.out_proj = nn.Linear(self.embed_dim, self.embed_dim) + cap = get_gpu_compute_capability() + cuda_ver = get_paddle_cuda_version() + self._supports_sdpa = False + if ( + cap is not None + and cap >= (8, 0) + and cuda_ver is not None + and cuda_ver >= (11, 4) + and platform.system() == "Linux" + ): + self._supports_sdpa = True + if get_device_type() == "iluvatar_gpu": + self._supports_sdpa = True + def forward( self, hidden_states: paddle.Tensor, # [B, L, D] @@ -138,6 +165,9 @@ def forward( cu_seqlens: Optional[List[paddle.Tensor]] = None, rope_emb: Optional[Tuple[paddle.Tensor, paddle.Tensor]] = None, # (cos, sin) ): + if output_attentions: + raise NotImplementedError + B, L, D = hidden_states.shape q = self.q_proj(hidden_states) @@ -145,7 +175,6 @@ def forward( v = self.v_proj(hidden_states) # [B, L, H, Dh] - q = q.reshape([B, L, self.num_heads, self.head_dim]) k = k.reshape([B, L, self.num_heads, self.head_dim]) v = v.reshape([B, L, self.num_heads, self.head_dim]) @@ -153,29 +182,38 @@ def forward( cos, sin = rope_emb q, k = apply_rotary_pos_emb_vision(q, k, cos, sin) - # → [B, H, L, Dh] - q = q.transpose([0, 2, 1, 3]) - k = k.transpose([0, 2, 1, 3]) - v = v.transpose([0, 2, 1, 3]) - - attn_output, attn_weights = eager_attention_forward( - self, - q, - k, - v, - attention_mask, - is_causal=self.is_causal, - scaling=self.scale, - dropout=0.0 if not self.training else self.dropout, - ) - attn_output = attn_output.reshape([B, L, D]).contiguous() + if not self._supports_sdpa or q.dtype == paddle.float32: + # → [B, H, L, Dh] + q = q.transpose([0, 2, 1, 3]) + k = k.transpose([0, 2, 1, 3]) + v = v.transpose([0, 2, 1, 3]) + + attn_output, _ = eager_attention_forward( + self, + q, + k, + v, + attention_mask, + is_causal=self.is_causal, + scaling=self.scale, + dropout=0.0 if not 
self.training else self.dropout, + ) + attn_output = attn_output.reshape([B, L, D]) + else: + attn_output = paddle.nn.functional.scaled_dot_product_attention( + q, + k, + v, + attention_mask, + dropout_p=self.dropout, + is_causal=self.is_causal, + training=self.training, + ) + attn_output = attn_output.reshape([B, L, D]) attn_output = self.out_proj(attn_output) - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights + return attn_output, None class SiglipVisionEmbeddings(nn.Layer): diff --git a/paddlex/inference/models/doc_vlm/modeling/qwen2.py b/paddlex/inference/models/doc_vlm/modeling/qwen2.py index 60a3a80191..434d380c3d 100644 --- a/paddlex/inference/models/doc_vlm/modeling/qwen2.py +++ b/paddlex/inference/models/doc_vlm/modeling/qwen2.py @@ -26,10 +26,10 @@ from .....utils import logging from .....utils.env import get_device_type -from ...common.vlm import fusion_ops -from ...common.vlm.activations import ACT2FN -from ...common.vlm.transformers import PretrainedConfig, PretrainedModel -from ...common.vlm.transformers.model_outputs import ( +from ...common.transformers import fusion_ops +from ...common.transformers.activations import ACT2FN +from ...common.transformers.transformers import PretrainedConfig, PretrainedModel +from ...common.transformers.transformers.model_outputs import ( BaseModelOutputWithPast, CausalLMOutputWithPast, ) @@ -929,7 +929,7 @@ class Qwen2PretrainedModel(PretrainedModel): @classmethod def _get_fuse_or_split_param_mappings(cls, config: Qwen2Config, is_fuse=False): # return parameter fuse utils - from ...common.vlm.conversion_utils import split_or_fuse_func + from ...common.transformers.conversion_utils import split_or_fuse_func fn = split_or_fuse_func(is_fuse=is_fuse) diff --git a/paddlex/inference/models/doc_vlm/modeling/qwen2_5_vl.py b/paddlex/inference/models/doc_vlm/modeling/qwen2_5_vl.py index 1198c89370..74de82c46a 100644 --- a/paddlex/inference/models/doc_vlm/modeling/qwen2_5_vl.py +++ 
b/paddlex/inference/models/doc_vlm/modeling/qwen2_5_vl.py @@ -27,11 +27,11 @@ from .....utils import logging from .....utils.env import get_device_type -from ...common.vlm.activations import ACT2FN -from ...common.vlm.bert_padding import index_first_axis, pad_input, unpad_input -from ...common.vlm.flash_attn_utils import has_flash_attn_func -from ...common.vlm.transformers import PretrainedConfig, PretrainedModel -from ...common.vlm.transformers.model_outputs import ( +from ...common.transformers.activations import ACT2FN +from ...common.transformers.bert_padding import index_first_axis, pad_input, unpad_input +from ...common.transformers.flash_attn_utils import has_flash_attn_func +from ...common.transformers.transformers import PretrainedConfig, PretrainedModel +from ...common.transformers.transformers.model_outputs import ( BaseModelOutputWithPast, ModelOutput, ) diff --git a/paddlex/inference/models/doc_vlm/modeling/qwen2_vl.py b/paddlex/inference/models/doc_vlm/modeling/qwen2_vl.py index 5fb58f9c61..93d8ebe688 100644 --- a/paddlex/inference/models/doc_vlm/modeling/qwen2_vl.py +++ b/paddlex/inference/models/doc_vlm/modeling/qwen2_vl.py @@ -32,11 +32,11 @@ get_inference_operations, set_inference_operations, ) -from ...common.vlm.activations import ACT2FN -from ...common.vlm.bert_padding import index_first_axis, pad_input, unpad_input -from ...common.vlm.flash_attn_utils import has_flash_attn_func -from ...common.vlm.transformers import PretrainedConfig, PretrainedModel -from ...common.vlm.transformers.model_outputs import ( +from ...common.transformers.activations import ACT2FN +from ...common.transformers.bert_padding import index_first_axis, pad_input, unpad_input +from ...common.transformers.flash_attn_utils import has_flash_attn_func +from ...common.transformers.transformers import PretrainedConfig, PretrainedModel +from ...common.transformers.transformers.model_outputs import ( BaseModelOutputWithPast, ModelOutput, ) diff --git 
a/paddlex/inference/models/doc_vlm/predictor.py b/paddlex/inference/models/doc_vlm/predictor.py index d2eb1d9a55..c5352fe846 100644 --- a/paddlex/inference/models/doc_vlm/predictor.py +++ b/paddlex/inference/models/doc_vlm/predictor.py @@ -17,9 +17,7 @@ import io import os import warnings -from concurrent.futures import ThreadPoolExecutor from pathlib import Path -from threading import Lock from typing import List, Optional import numpy as np @@ -30,6 +28,7 @@ from ....utils.device import TemporaryDeviceChanger from ...common.batch_sampler import DocVLMBatchSampler from ...utils.misc import is_bfloat16_available +from ...utils.model_paths import get_model_paths from ..base import BasePredictor from .result import DocVLMResult @@ -41,7 +40,7 @@ class DocVLMPredictor(BasePredictor): "PP-DocBee": {"PP-DocBee-2B", "PP-DocBee-7B"}, "PP-DocBee2": {"PP-DocBee2-3B"}, "PP-Chart2Table": {"PP-Chart2Table"}, - "PaddleOCR-VL": {"PaddleOCR-VL-0.9B"}, + "PaddleOCR-VL": {"PaddleOCR-VL-0.9B", "PaddleOCR-VL-1.5-0.9B"}, } def __init__(self, *args, **kwargs): @@ -52,25 +51,28 @@ def __init__(self, *args, **kwargs): """ super().__init__(*args, **kwargs) + if self.batch_sampler.batch_size == -1: + self.batch_sampler.batch_size = self._determine_batch_size() + if self._use_local_model: + if self._use_static_model: + raise RuntimeError("Static graph models are not supported") self.device = kwargs.get("device", None) - self.dtype = "bfloat16" if is_bfloat16_available(self.device) else "float32" + if is_bfloat16_available(self.device): + self.dtype = "bfloat16" + else: + self.dtype = "float32" self.infer, self.processor = self._build(**kwargs) if ( - self.model_name == "PaddleOCR-VL-0.9B" + self.model_name in self.model_group["PaddleOCR-VL"] and self.batch_sampler.batch_size > 1 ): logging.warning( - "Currently, the PaddleOCR-VL-0.9B local model only supports batch size of 1. The batch size will be updated to 1." 
+ f"Currently, the {repr(self.model_name)} local model only supports batch size of 1. The batch size will be updated to 1." ) self.batch_sampler.batch_size = 1 - else: - if self.batch_sampler.batch_size > 1: - self._thread_pool = ThreadPoolExecutor( - max_workers=min(self.batch_sampler.batch_size, os.cpu_count() or 1) - ) def _build_batch_sampler(self): """Builds and returns an DocVLMBatchSampler instance. @@ -121,11 +123,23 @@ def _build(self, **kwargs): "The PP-Chart2Table series does not support `use_hpip=True` for now." ) with TemporaryDeviceChanger(self.device): - model = PPChart2TableInference.from_pretrained( - self.model_dir, - dtype=self.dtype, - pad_token_id=processor.tokenizer.eos_token_id, - ) + model_path = get_model_paths(self.model_dir) + + if "safetensors" in model_path: + model = PPChart2TableInference.from_pretrained( + self.model_dir, + dtype=self.dtype, + pad_token_id=processor.tokenizer.eos_token_id, + use_safetensors=True, + convert_from_hf=True, + ) + else: + model = PPChart2TableInference.from_pretrained( + self.model_dir, + dtype=self.dtype, + pad_token_id=processor.tokenizer.eos_token_id, + ) + elif self.model_name in self.model_group["PP-DocBee2"]: if kwargs.get("use_hpip", False): warnings.warn( @@ -152,6 +166,18 @@ def _build(self, **kwargs): return model, processor + def _determine_batch_size(self): + if self._model_name == "PaddleOCR-VL-0.9B": + batch_size = 1 + if not self._use_local_model: + batch_size = 4096 + logging.debug( + f"The batch size of {self._model_name} is determined to be {batch_size}." 
+ ) + return batch_size + else: + raise RuntimeError(f"Could not determine batch size for {self._model_name}") + def process( self, data: List[dict], @@ -182,7 +208,21 @@ def process( if self._use_local_model: src_data = copy.copy(data) # preprocess - data = self.processor.preprocess(data) + if self.model_name in self.model_group["PaddleOCR-VL"]: + data = self.processor.preprocess( + data, min_pixels=min_pixels, max_pixels=max_pixels + ) + else: + data = self.processor.preprocess(data) + if min_pixels is not None: + warnings.warn( + f"`min_pixels` is currently not supported by the {repr(self.model_name)} model and will be ignored." + ) + if max_pixels is not None: + warnings.warn( + f"`max_pixels` is currently not supported by the {repr(self.model_name)} model and will be ignored." + ) + data = self._switch_inputs_to_device(data) # do infer @@ -203,14 +243,6 @@ def process( warnings.warn( "`top_p` is currently not supported by the local model and will be ignored." ) - if min_pixels is not None: - warnings.warn( - "`min_pixels` is currently not supported by the local model and will be ignored." - ) - if max_pixels is not None: - warnings.warn( - "`max_pixels` is currently not supported by the local model and will be ignored." 
- ) if use_cache is not None: generate_kwargs["use_cache"] = use_cache with TemporaryDeviceChanger(self.device): @@ -298,25 +330,26 @@ def build_processor(self, **kwargs): else: raise NotImplementedError - def close(self): - super().close() - if hasattr(self, "_thread_pool"): - self._thread_pool.shutdown() - def _format_result_dict(self, model_preds, src_data): if not isinstance(model_preds, list): model_preds = [model_preds] if not isinstance(src_data, list): src_data = [src_data] - if len(model_preds) != len(src_data): + input_info = [] + for data in src_data: + image = data.get("image", None) + if isinstance(image, str): + data["input_path"] = image + input_info.append(data) + if len(model_preds) != len(input_info): raise ValueError( - f"Model predicts {len(model_preds)} results while src data has {len(src_data)} samples." + f"Model predicts {len(model_preds)} results while src data has {len(input_info)} samples." ) - rst_format_dict = {k: [] for k in src_data[0].keys()} + rst_format_dict = {k: [] for k in input_info[0].keys()} rst_format_dict["result"] = [] - for data_sample, model_pred in zip(src_data, model_preds): + for data_sample, model_pred in zip(input_info, model_preds): for k in data_sample.keys(): rst_format_dict[k].append(data_sample[k]) rst_format_dict["result"].append(model_pred) @@ -381,9 +414,8 @@ def _genai_client_process( min_pixels, max_pixels, ): - lock = Lock() - - def _process(item): + futures = [] + for item in data: image = item["image"] if isinstance(image, str): if image.startswith("http://") or image.startswith("https://"): @@ -469,27 +501,22 @@ def _process(item): f"{repr(self._genai_client.backend)} does not support `max_pixels`." 
) - with lock: - future = self._genai_client.create_chat_completion( - [ - { - "role": "user", - "content": [ - {"type": "image_url", "image_url": {"url": image_url}}, - {"type": "text", "text": item["query"]}, - ], - } - ], - return_future=True, - timeout=600, - **kwargs, - ) - return future + future = self._genai_client.create_chat_completion( + [ + { + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": image_url}}, + {"type": "text", "text": item["query"]}, + ], + } + ], + return_future=True, + timeout=600, + **kwargs, + ) - if len(data) > 1: - futures = list(self._thread_pool.map(_process, data)) - else: - futures = [_process(data[0])] + futures.append(future) results = [] for future in futures: diff --git a/paddlex/inference/models/doc_vlm/processors/paddleocr_vl/_paddleocr_vl.py b/paddlex/inference/models/doc_vlm/processors/paddleocr_vl/_paddleocr_vl.py index ae99330e49..1978950a31 100644 --- a/paddlex/inference/models/doc_vlm/processors/paddleocr_vl/_paddleocr_vl.py +++ b/paddlex/inference/models/doc_vlm/processors/paddleocr_vl/_paddleocr_vl.py @@ -74,6 +74,8 @@ def __init__( def preprocess( self, input_dicts, + min_pixels=None, + max_pixels=None, ): images = [fetch_image(input_dict["image"]) for input_dict in input_dicts] @@ -98,8 +100,18 @@ def preprocess( "video_kwargs": copy.deepcopy(self._DEFAULT_VIDEO_KWARGS), } + if min_pixels is not None or max_pixels is not None: + size = { + "min_pixels": min_pixels or self.image_processor.min_pixels, + "max_pixels": max_pixels or self.image_processor.max_pixels, + } + else: + size = None + if images is not None: - image_inputs = self.image_processor(images=images, return_tensors="pd") + image_inputs = self.image_processor( + images=images, size=size, return_tensors="pd" + ) image_inputs["pixel_values"] = image_inputs["pixel_values"] image_grid_thw = image_inputs["image_grid_thw"] diff --git a/paddlex/inference/models/doc_vlm/processors/paddleocr_vl/_siglip.py 
b/paddlex/inference/models/doc_vlm/processors/paddleocr_vl/_siglip.py index 393db7764a..ccd65e0bbc 100644 --- a/paddlex/inference/models/doc_vlm/processors/paddleocr_vl/_siglip.py +++ b/paddlex/inference/models/doc_vlm/processors/paddleocr_vl/_siglip.py @@ -146,7 +146,7 @@ def __init__( self.patch_size = patch_size self.temporal_patch_size = temporal_patch_size self.merge_size = merge_size - self.size = {"min_pixels": min_pixels, "max_pixels": max_pixels} # not used + self.size = {"min_pixels": min_pixels, "max_pixels": max_pixels} self.do_convert_rgb = do_convert_rgb @classmethod @@ -160,6 +160,7 @@ def from_pretrained(cls, pretrained_model_dir): def _preprocess( self, images, + size: Optional[Dict[str, int]] = None, do_resize: Optional[bool] = None, do_rescale: Optional[bool] = None, rescale_factor: Optional[float] = None, @@ -183,8 +184,8 @@ def _preprocess( height, width, factor=self.patch_size * self.merge_size, - min_pixels=self.min_pixels, - max_pixels=self.max_pixels, + min_pixels=size["min_pixels"], + max_pixels=size["max_pixels"], ) image = image.resize( @@ -267,6 +268,7 @@ def __call__( for image in images: patches, image_grid_thw = self._preprocess( image, + size=size, do_resize=do_resize, do_rescale=do_rescale, rescale_factor=rescale_factor, diff --git a/paddlex/inference/models/formula_recognition/result.py b/paddlex/inference/models/formula_recognition/result.py index a318299aba..c3935adcd9 100644 --- a/paddlex/inference/models/formula_recognition/result.py +++ b/paddlex/inference/models/formula_recognition/result.py @@ -34,6 +34,7 @@ import cv2 if is_dep_available("pypdfium2"): import pypdfium2 as pdfium + from ...utils.pdfium_lock import pdfium_lock class FormulaRecResult(BaseCVResult): @@ -276,26 +277,28 @@ def pdf2img(pdf_path: str, img_path: str, is_padding: bool = False): Returns: np.ndarray: The resulting image as a NumPy array, or None if the PDF is not single-page. 
""" - pdfDoc = pdfium.PdfDocument(pdf_path) - try: - if len(pdfDoc) != 1: - return None - for page in pdfDoc: - rotate = int(0) - zoom = 2 - img = page.render(scale=zoom, rotation=rotate).to_numpy() - xywh = crop_white_area(img) - - if xywh is not None: - x, y, w, h = xywh - img = img[y : y + h, x : x + w] - if is_padding: - img = cv2.copyMakeBorder( - img, 30, 30, 30, 30, cv2.BORDER_CONSTANT, value=(255, 255, 255) - ) - return img - finally: - pdfDoc.close() + with pdfium_lock: + pdfDoc = pdfium.PdfDocument(pdf_path) + try: + if len(pdfDoc) != 1: + return None + for page in pdfDoc: + rotate = int(0) + zoom = 2 + img = page.render(scale=zoom, rotation=rotate).to_numpy() + page.close() + xywh = crop_white_area(img) + + if xywh is not None: + x, y, w, h = xywh + img = img[y : y + h, x : x + w] + if is_padding: + img = cv2.copyMakeBorder( + img, 30, 30, 30, 30, cv2.BORDER_CONSTANT, value=(255, 255, 255) + ) + return img + finally: + pdfDoc.close() return None diff --git a/paddlex/inference/models/image_classification/modeling/__init__.py b/paddlex/inference/models/image_classification/modeling/__init__.py new file mode 100644 index 0000000000..6326ac9c87 --- /dev/null +++ b/paddlex/inference/models/image_classification/modeling/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from .pplcnet import PPLCNet diff --git a/paddlex/inference/models/image_classification/modeling/_config.py b/paddlex/inference/models/image_classification/modeling/_config.py new file mode 100644 index 0000000000..d476043877 --- /dev/null +++ b/paddlex/inference/models/image_classification/modeling/_config.py @@ -0,0 +1,63 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ...common.transformers.transformers import PretrainedConfig + +DEFAULT_CONFIG = { + "model_name": "PP-LCNet_x1_0_doc_ori", + "scale": 1.0, + "class_num": 4, + "stride_list": [2, 2, 2, 2, 2], + "dropout_prob": 0.2, + "class_expand": 1280, + "use_last_conv": True, + "act": "hardswish", + "reduction": 4, + "lr_mult_list": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0], + "net_config": { + "blocks2": [[3, 16, 32, 1, False]], + "blocks3": [[3, 32, 64, 2, False], [3, 64, 64, 1, False]], + "blocks4": [[3, 64, 128, 2, False], [3, 128, 128, 1, False]], + "blocks5": [ + [3, 128, 256, 2, False], + [5, 256, 256, 1, False], + [5, 256, 256, 1, False], + [5, 256, 256, 1, False], + [5, 256, 256, 1, False], + [5, 256, 256, 1, False], + ], + "blocks6": [[5, 256, 512, 2, True], [5, 512, 512, 1, True]], + }, +} + + +class PPLCNetConfig(PretrainedConfig): + model_type = "cls" + + def __init__(self, **kwargs): + super().__init__(**kwargs) + + self.model_name = kwargs.get("model_name", DEFAULT_CONFIG["model_name"]) + self.scale = kwargs.get("scale", 
DEFAULT_CONFIG["scale"]) + self.class_num = kwargs.get("class_num", DEFAULT_CONFIG["class_num"]) + self.stride_list = kwargs.get("stride_list", DEFAULT_CONFIG["stride_list"]) + self.reduction = kwargs.get("reduction", DEFAULT_CONFIG["reduction"]) + self.dropout_prob = kwargs.get("dropout_prob", DEFAULT_CONFIG["dropout_prob"]) + self.class_expand = kwargs.get("class_expand", DEFAULT_CONFIG["class_expand"]) + self.use_last_conv = kwargs.get( + "use_last_conv", DEFAULT_CONFIG["use_last_conv"] + ) + self.act = kwargs.get("act", DEFAULT_CONFIG["act"]) + self.lr_mult_list = kwargs.get("lr_mult_list", DEFAULT_CONFIG["lr_mult_list"]) + self.net_config = kwargs.get("net_config", DEFAULT_CONFIG["net_config"]) diff --git a/paddlex/inference/models/image_classification/modeling/pplcnet.py b/paddlex/inference/models/image_classification/modeling/pplcnet.py new file mode 100644 index 0000000000..ef6ddd077d --- /dev/null +++ b/paddlex/inference/models/image_classification/modeling/pplcnet.py @@ -0,0 +1,425 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import Any, List, Optional + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import ParamAttr +from paddle.nn.initializer import KaimingNormal +from paddle.regularizer import L2Decay + +from ....utils.benchmark import add_inference_operations, benchmark +from ...common.transformers.transformers import ( + BatchNormHFStateDictMixin, + PretrainedModel, +) +from ._config import PPLCNetConfig + + +def make_divisible(v: float, divisor: int = 8, min_value: Optional[int] = None) -> int: + """ + Ensure the number of channels is a multiple of the specified divisor (common optimization for mobile networks) + + Args: + v: Original number of channels + divisor: Divisor, default 8 + min_value: Minimum number of channels, default None (takes divisor) + + Returns: + Adjusted number of channels (integer) + """ + + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + if new_v < 0.9 * v: + new_v += divisor + return new_v + + +def _create_act(act: str) -> nn.Layer: + """ + Create activation function layer + + Args: + act: Activation function name, supports "hardswish" / "relu" / "relu6" + + Returns: + Activation function layer instance + + Raises: + RuntimeError: Unsupported activation function type + """ + if act == "hardswish": + return nn.Hardswish() + elif act == "relu": + return nn.ReLU() + elif act == "relu6": + return nn.ReLU6() + else: + raise RuntimeError("The activation function is not supported: {}".format(act)) + + +class AdaptiveAvgPool2D(nn.AdaptiveAvgPool2D): + """ + AdaptiveAvgPool2D: : Adaptive average pooling layer optimized + + Args: + *args: Positional arguments passed to parent class nn.AdaptiveAvgPool2D + **kwargs: Keyword arguments passed to parent class nn.AdaptiveAvgPool2D + + Returns: + paddle.Tensor: Pooled tensor with shape [N, C, 1, 1] (global pooling) or specified output size + """ + + def __init__(self, *args: Any, **kwargs: Any) -> None: + 
super().__init__(*args, **kwargs) + + if paddle.device.get_device().startswith("npu"): + self.device = "npu" + else: + self.device = None + + if isinstance(self._output_size, int) and self._output_size == 1: + self._gap = True + elif ( + isinstance(self._output_size, tuple) + and self._output_size[0] == 1 + and self._output_size[1] == 1 + ): + self._gap = True + else: + self._gap = False + + def forward(self, x: paddle.Tensor) -> paddle.Tensor: + if self.device == "npu" and self._gap: + # Global Average Pooling + N, C, _, _ = x.shape + x_mean = paddle.mean(x, axis=[2, 3]) + x_mean = paddle.reshape(x_mean, [N, C, 1, 1]) + return x_mean + else: + return F.adaptive_avg_pool2d( + x, + output_size=self._output_size, + data_format=self._data_format, + name=self._name, + ) + + +class ConvBNLayer(nn.Layer): + """ + ConvBNLayer: Combination layer of convolution, batch normalization and activation function + + Args: + num_channels (int): Number of input channels + filter_size (int): Kernel size of convolution layer + num_filters (int): Number of output channels + stride (int): Stride of convolution layer + num_groups (int): Number of groups for grouped convolution, default 1 + lr_mult (float): Learning rate multiplier for layer parameters, default 1.0 + act (str): Activation function type, default "hardswish" + + Returns: + paddle.Tensor: Output tensor after convolution + batch normalization + activation + """ + + def __init__( + self, + num_channels: int, + filter_size: int, + num_filters: int, + stride: int, + num_groups: int = 1, + lr_mult: float = 1.0, + act: str = "hardswish", + ) -> None: + super().__init__() + + self.conv = nn.Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=num_groups, + weight_attr=ParamAttr(initializer=KaimingNormal(), learning_rate=lr_mult), + bias_attr=False, + ) + + self.bn = nn.BatchNorm2D( + num_filters, + 
weight_attr=ParamAttr(regularizer=L2Decay(0.0), learning_rate=lr_mult), + bias_attr=ParamAttr(regularizer=L2Decay(0.0), learning_rate=lr_mult), + ) + self.act = _create_act(act) + + def forward(self, x: paddle.Tensor) -> paddle.Tensor: + x = self.conv(x) + x = self.bn(x) + x = self.act(x) + return x + + +class DepthwiseSeparable(nn.Layer): + """ + DepthwiseSeparable: Depthwise separable convolution layer with optional SE attention module + + Args: + num_channels (int): Number of input channels + num_filters (int): Number of output channels + stride (int): Stride of depthwise convolution layer + dw_size (int): Kernel size of depthwise convolution, default 3 + use_se (bool): Whether to use SE attention module, default False + lr_mult (float): Learning rate multiplier for layer parameters, default 1.0 + act (str): Activation function type, default "hardswish" + + Returns: + paddle.Tensor: Output tensor after depthwise separable convolution + """ + + def __init__( + self, + num_channels: int, + num_filters: int, + stride: int, + reduction: int, + dw_size: int, + use_se: bool, + lr_mult: float, + act: str, + ) -> None: + super().__init__() + self.use_se = use_se + self.dw_conv = ConvBNLayer( + num_channels=num_channels, + num_filters=num_channels, + filter_size=dw_size, + stride=stride, + num_groups=num_channels, + lr_mult=lr_mult, + act=act, + ) + self.se = ( + SEModule(num_channels, reduction, lr_mult) if use_se else nn.Identity() + ) + self.pw_conv = ConvBNLayer( + num_channels=num_channels, + filter_size=1, + num_filters=num_filters, + stride=1, + lr_mult=lr_mult, + act=act, + ) + + def forward(self, x: paddle.Tensor) -> paddle.Tensor: + x = self.dw_conv(x) + x = self.se(x) + x = self.pw_conv(x) + return x + + +class SEModule(nn.Layer): + """ + SEModule: Squeeze-and-Excitation attention module for channel-wise feature recalibration + + Args: + channel (int): Number of input channels + reduction (int): Channel reduction ratio for SE module, default 4 + lr_mult 
(float): Learning rate multiplier for module parameters, default 1.0 + + Returns: + paddle.Tensor: Attention-weighted tensor after SE module processing + """ + + def __init__(self, channel: int, reduction: int, lr_mult: float = 1.0) -> None: + super().__init__() + self.avg_pool = AdaptiveAvgPool2D(1) + conv_kwargs = { + "kernel_size": 1, + "stride": 1, + "padding": 0, + "weight_attr": ParamAttr(learning_rate=lr_mult), + "bias_attr": ParamAttr(learning_rate=lr_mult), + } + self.conv1 = nn.Conv2D(channel, channel // reduction, **conv_kwargs) + self.conv2 = nn.Conv2D(channel // reduction, channel, **conv_kwargs) + self.relu = nn.ReLU() + self.hardsigmoid = nn.Hardsigmoid() + + def forward(self, x: paddle.Tensor) -> paddle.Tensor: + identity = x + x = self.avg_pool(x) + x = self.conv1(x) + x = self.relu(x) + x = self.conv2(x) + x = self.hardsigmoid(x) + x = paddle.multiply(x=identity, y=x) + return x + + +class PPLCNet(BatchNormHFStateDictMixin, PretrainedModel): + """ + PPLCNet: Lightweight convolutional neural network for image classification tasks + + Args: + config (PPLCNetConfig): Configuration instance containing model hyperparameters + - scale (float): Channel scale factor for network width adjustment + - class_num (int): Number of classification categories + - dropout_prob (float): Dropout probability for last convolution layer + - class_expand (int): Expansion channel number for last convolution layer + - stride_list (List[int]): Stride list for different blocks, length must be 5 + - use_last_conv (bool): Whether to use last convolution layer before fc + - act (str): Activation function type used in network + - lr_mult_list (List[float]): Learning rate multipliers for different layers, length must be 6 + - net_config (Dict[str, Any]): Network configuration dict containing block parameters + + Returns: + List[numpy.ndarray]: List containing classification probability numpy array + """ + + config_class = PPLCNetConfig + + def __init__(self, config: 
PPLCNetConfig) -> None: + super().__init__(config) + + self.scale = config.scale + self.class_num = config.class_num + self.dropout_prob = config.dropout_prob + self.class_expand = config.class_expand + self.stride_list = config.stride_list + self.reduction = config.reduction + self.use_last_conv = config.use_last_conv + self.act = config.act + self.lr_mult_list = ( + eval(config.lr_mult_list) + if isinstance(config.lr_mult_list, str) + else config.lr_mult_list + ) + self.net_config = config.net_config + + assert isinstance( + self.lr_mult_list, (list, tuple) + ), f"lr_mult_list should be in (list, tuple) but got {type(self.lr_mult_list)}" + assert ( + len(self.lr_mult_list) == 6 + ), f"lr_mult_list length should be 6 but got {len(self.lr_mult_list)}" + assert isinstance( + self.stride_list, (list, tuple) + ), f"stride_list should be in (list, tuple) but got {type(self.stride_list)}" + assert ( + len(self.stride_list) == 5 + ), f"stride_list length should be 5 but got {len(self.stride_list)}" + + for i, stride in enumerate(self.stride_list[1:]): + self.net_config["blocks{}".format(i + 3)][0][3] = stride + + self.conv1 = ConvBNLayer( + num_channels=3, + filter_size=3, + num_filters=make_divisible(16 * self.scale), + stride=self.stride_list[0], + lr_mult=self.lr_mult_list[0], + act=self.act, + ) + + def _build_block(block_name, lr_idx): + return nn.Sequential( + *[ + DepthwiseSeparable( + num_channels=make_divisible(in_c * self.scale), + num_filters=make_divisible(out_c * self.scale), + dw_size=k, + stride=s, + reduction=self.reduction, + use_se=se, + lr_mult=self.lr_mult_list[lr_idx], + act=self.act, + ) + for i, (k, in_c, out_c, s, se) in enumerate( + self.net_config[block_name] + ) + ] + ) + + self.blocks2 = _build_block("blocks2", 1) + self.blocks3 = _build_block("blocks3", 2) + self.blocks4 = _build_block("blocks4", 3) + self.blocks5 = _build_block("blocks5", 4) + self.blocks6 = _build_block("blocks6", 5) + + self.avg_pool = AdaptiveAvgPool2D(1) + self.last_conv 
= None + if self.use_last_conv: + self.last_conv = nn.Conv2D( + in_channels=make_divisible( + self.net_config["blocks6"][-1][2] * self.scale + ), + out_channels=self.class_expand, + kernel_size=1, + stride=1, + padding=0, + bias_attr=False, + ) + self.act = _create_act(self.act) + self.dropout = nn.Dropout(p=self.dropout_prob, mode="downscale_in_infer") + + self.flatten = nn.Flatten(start_axis=1, stop_axis=-1) + if self.use_last_conv: + fc_in_channels = self.class_expand + else: + fc_in_channels = make_divisible( + self.net_config["blocks6"][-1][2] * self.scale + ) + self.fc = nn.Linear(fc_in_channels, self.class_num) + self.out_act = nn.Softmax(axis=-1) + + add_inference_operations("pplcnet_forward") + + @benchmark.timeit_with_options(name="pplcnet_forward") + def forward(self, x: List) -> List: + + x = paddle.to_tensor(x[0]) + + x = self.conv1(x) + + x = self.blocks2(x) + x = self.blocks3(x) + x = self.blocks4(x) + x = self.blocks5(x) + x = self.blocks6(x) + + x = self.avg_pool(x) + + if self.last_conv is not None: + x = self.last_conv(x) + x = self.act(x) + x = self.dropout(x) + x = self.flatten(x) + x = self.fc(x) + + x = self.out_act(x) + + return [x.cpu().numpy()] + + def get_transpose_weight_keys(self): + t_layers = ["fc"] + keys = [] + for key, _ in self.get_hf_state_dict().items(): + for t_layer in t_layers: + if t_layer in key and key.endswith("weight"): + keys.append(key) + return keys diff --git a/paddlex/inference/models/image_classification/predictor.py b/paddlex/inference/models/image_classification/predictor.py index 68a9f93082..7e31158ee3 100644 --- a/paddlex/inference/models/image_classification/predictor.py +++ b/paddlex/inference/models/image_classification/predictor.py @@ -17,6 +17,7 @@ import numpy as np from ....modules.image_classification.model_list import MODELS +from ....utils.device import TemporaryDeviceChanger from ....utils.func_register import FuncRegister from ...common.batch_sampler import ImageBatchSampler from ...common.reader 
import ReadImage @@ -46,6 +47,7 @@ def __init__( """ super().__init__(*args, **kwargs) self.topk = topk + self.device = kwargs.get("device", None) self.preprocessors, self.infer, self.postprocessors = self._build() def _build_batch_sampler(self) -> ImageBatchSampler: @@ -79,8 +81,26 @@ def _build(self) -> Tuple: preprocessors[name] = op preprocessors["ToBatch"] = ToBatch() - infer = self.create_static_infer() - + if self._use_static_model: + infer = self.create_static_infer() + else: + from .modeling import PPLCNet + + if self.model_name in [ + "PP-LCNet_x1_0_doc_ori", + "PP-LCNet_x1_0_table_cls", + "PP-LCNet_x0_25_textline_ori", + ]: + with TemporaryDeviceChanger(self.device): + infer = PPLCNet.from_pretrained( + self.model_dir, use_safetensors=True, convert_from_hf=True + ) + infer.eval() + + else: + raise RuntimeError( + f"There is no dynamic graph implementation for model {repr(self.model_name)}." + ) postprocessors = {} for key in self.config["PostProcess"]: func = self._FUNC_MAP.get(key) @@ -109,7 +129,11 @@ def process( batch_imgs = self.preprocessors["Normalize"](imgs=batch_imgs) batch_imgs = self.preprocessors["ToCHW"](imgs=batch_imgs) x = self.preprocessors["ToBatch"](imgs=batch_imgs) - batch_preds = self.infer(x=x) + if self._use_static_model: + batch_preds = self.infer(x=x) + else: + with TemporaryDeviceChanger(self.device): + batch_preds = self.infer(x=x) batch_class_ids, batch_scores, batch_label_names = self.postprocessors["Topk"]( batch_preds, topk=topk or self.topk ) diff --git a/paddlex/inference/models/image_unwarping/modeling/__init__.py b/paddlex/inference/models/image_unwarping/modeling/__init__.py new file mode 100644 index 0000000000..6f006b11e6 --- /dev/null +++ b/paddlex/inference/models/image_unwarping/modeling/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .uvdoc import UVDocNet diff --git a/paddlex/inference/models/image_unwarping/modeling/_config.py b/paddlex/inference/models/image_unwarping/modeling/_config.py new file mode 100644 index 0000000000..0b497e277e --- /dev/null +++ b/paddlex/inference/models/image_unwarping/modeling/_config.py @@ -0,0 +1,61 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from ...common.transformers.transformers import PretrainedConfig + +DEFAULT_CONFIG = { + "model_name": "UVDoc", + "num_filter": 32, + "in_channels": 3, + "kernel_size": 5, + "stride": [1, 2, 2, 2], + "map_num": [1, 2, 4, 8, 16], + "block_nums": [3, 4, 6, 3], + "dilation_values": { + "bridge_1": 1, + "bridge_2": 2, + "bridge_3": 5, + "bridge_4": [8, 3, 2], + "bridge_5": [12, 7, 4], + "bridge_6": [18, 12, 6], + }, + "padding_mode": "reflect", + "upsample_size": [712, 488], + "upsample_mode": "bilinear", +} + + +class UVDocNetConfig(PretrainedConfig): + model_type = "uvdoc" + + def __init__(self, **kwargs): + super().__init__(**kwargs) + + self.model_name = kwargs.get("model_name", DEFAULT_CONFIG["model_name"]) + self.num_filter = kwargs.get("num_filter", DEFAULT_CONFIG["num_filter"]) + self.in_channels = kwargs.get("in_channels", DEFAULT_CONFIG["in_channels"]) + self.kernel_size = kwargs.get("kernel_size", DEFAULT_CONFIG["kernel_size"]) + self.stride = kwargs.get("stride", DEFAULT_CONFIG["stride"]) + self.map_num = kwargs.get("map_num", DEFAULT_CONFIG["map_num"]) + self.block_nums = kwargs.get("block_nums", DEFAULT_CONFIG["block_nums"]) + self.dilation_values = kwargs.get( + "dilation_values", DEFAULT_CONFIG["dilation_values"] + ) + self.padding_mode = kwargs.get("padding_mode", DEFAULT_CONFIG["padding_mode"]) + self.upsample_size = kwargs.get( + "upsample_size", DEFAULT_CONFIG["upsample_size"] + ) + self.upsample_mode = kwargs.get( + "upsample_mode", DEFAULT_CONFIG["upsample_mode"] + ) diff --git a/paddlex/inference/models/image_unwarping/modeling/uvdoc.py b/paddlex/inference/models/image_unwarping/modeling/uvdoc.py new file mode 100644 index 0000000000..5c2f49633d --- /dev/null +++ b/paddlex/inference/models/image_unwarping/modeling/uvdoc.py @@ -0,0 +1,421 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Any, List, Optional + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from ....utils.benchmark import add_inference_operations, benchmark +from ...common.transformers.transformers import ( + BatchNormHFStateDictMixin, + PretrainedModel, +) +from ._config import UVDocNetConfig + + +def conv3x3( + in_channels: int, out_channels: int, kernel_size: int, stride: int = 1 +) -> nn.Conv2D: + """ + conv3x3: 3x3 convolution layer with same padding + + Args: + in_channels (int): Number of input channels + out_channels (int): Number of output channels + kernel_size (int): Kernel size of convolution layer + stride (int, optional): Convolution stride, default is 1 + + Returns: + nn.Conv2D: Convolutional layer with same padding (padding = kernel_size // 2) + """ + + return nn.Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=kernel_size // 2, + ) + + +def dilated_conv_bn_act( + in_channels: int, out_channels: int, dilation: int +) -> nn.Sequential: + """ + dilated_conv_bn_act: Dilated convolution block with BN and ReLU activation + + Args: + in_channels (int): Number of input channels + out_channels (int): Number of output channels + dilation (int): Dilation rate for dilated convolution + + Returns: + nn.Sequential: Sequential block containing dilated conv, BN and ReLU layers + """ + + model = nn.Sequential( + nn.Conv2D( + in_channels=in_channels, + out_channels=out_channels, + bias_attr=False, + kernel_size=3, + stride=1, + padding=dilation, + 
dilation=dilation, + ), + nn.BatchNorm2D(out_channels), + nn.ReLU(), + ) + return model + + +def dilated_conv( + in_channels: int, + out_channels: int, + kernel_size: int, + dilation: int, + stride: int = 1, +) -> nn.Sequential: + """ + dilated_conv: Pure dilated convolution layer for multi-scale feature learning + + Args: + in_channels (int): Number of input channels + out_channels (int): Number of output channels + kernel_size (int): Kernel size of dilated convolution + dilation (int): Dilation rate for dilated convolution + stride (int, optional): Convolution stride, default is 1 + + Returns: + nn.Sequential: Sequential block containing only the dilated convolution layer + """ + model = nn.Sequential( + nn.Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=dilation * (kernel_size // 2), + dilation=dilation, + ) + ) + return model + + +class ResidualBlockWithDilation(nn.Layer): + """ + ResidualBlockWithDilation: Residual block with optional dilated convolution and downsampling + + Args: + in_channels (int): Number of input channels + out_channels (int): Number of output channels + kernel_size (int): Kernel size of convolution layers + stride (int, optional): Convolution stride for first conv layer, default is 1 + downsample (Optional[nn.Layer]): Downsampling layer for residual connection, default is None + is_activation (bool, optional): Whether to apply activation (unused in current implementation), default is True + is_top (bool, optional): Whether it is the top block (uses standard conv instead of dilated conv), default is False + + Returns: + paddle.Tensor: Output tensor after residual block with conv/dilated conv and ReLU activation + """ + + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: int, + stride: int = 1, + downsample: Optional[nn.Layer] = None, + is_activation: bool = True, + is_top: bool = False, + ): + super(ResidualBlockWithDilation, self).__init__() 
+ self.stride = stride + self.downsample = downsample + self.is_activation = is_activation + self.is_top = is_top + if self.stride != 1 or self.is_top: + self.conv1 = conv3x3(in_channels, out_channels, kernel_size, self.stride) + self.conv2 = conv3x3(out_channels, out_channels, kernel_size) + else: + self.conv1 = dilated_conv( + in_channels, out_channels, kernel_size, dilation=3 + ) + self.conv2 = dilated_conv( + out_channels, out_channels, kernel_size, dilation=3 + ) + self.bn1 = nn.BatchNorm2D(out_channels) + self.relu = nn.ReLU() + self.bn2 = nn.BatchNorm2D(out_channels) + + def forward(self, x: paddle.Tensor) -> paddle.Tensor: + residual = x + if self.downsample is not None: + residual = self.downsample(x) + out1 = self.relu(self.bn1(self.conv1(x))) + out2 = self.bn2(self.conv2(out1)) + out2 += residual + out = self.relu(out2) + return out + + +class ResnetStraight(nn.Layer): + """ + ResnetStraight: Straightforward ResNet architecture with residual blocks and optional dilated convolution + + Args: + num_filter (int): Base number of filters/channels for the network + map_num (List[int]): List of channel scaling factors for each layer + block_nums (List[int]): List of residual block numbers for each layer + kernel_size (int): Kernel size of convolution layers in residual blocks + stride (List[int]): List of stride values for each layer's first residual block + + Returns: + paddle.Tensor: Output tensor from the third residual layer of the ResNet + """ + + def __init__( + self, + num_filter: int, + map_num: List[int], + block_nums: List[int], + kernel_size: int, + stride: List[int], + ): + super(ResnetStraight, self).__init__() + self.in_channels = num_filter * map_num[0] + self.stride = stride + self.block_nums = block_nums + self.kernel_size = kernel_size + + for layer_idx, (map_num_val, block_num, stride_val) in enumerate( + zip(map_num[:3], block_nums[:3], stride[:3]) + ): + layer = self.blocklayer( + num_filter * map_num_val, + block_num, + 
kernel_size=self.kernel_size, + stride=stride_val, + ) + setattr(self, f"layer{layer_idx + 1}", layer) + + def blocklayer( + self, out_channels: int, block_nums: int, kernel_size: int, stride: int = 1 + ) -> nn.Sequential: + downsample = None + if stride != 1 or self.in_channels != out_channels: + downsample = nn.Sequential( + conv3x3( + self.in_channels, + out_channels, + kernel_size=kernel_size, + stride=stride, + ), + nn.BatchNorm2D(out_channels), + ) + + layers = [] + for i in range(block_nums): + layers.append( + ResidualBlockWithDilation( + in_channels=self.in_channels if i == 0 else out_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride if i == 0 else 1, + downsample=downsample if i == 0 else None, + is_top=True if i == 0 else False, + ) + ) + self.in_channels = out_channels + return nn.Sequential(*layers) + + def forward(self, x: paddle.Tensor) -> paddle.Tensor: + out1 = self.layer1(x) + out2 = self.layer2(out1) + out3 = self.layer3(out2) + return out3 + + +class UVDocNet(BatchNormHFStateDictMixin, PretrainedModel): + """ + UVDocNet: Image rectification network based on ResNet and multi-scale dilated convolution + + Args: + config (UVDocNetConfig): Configuration object containing network hyperparameters + + Returns: + List: List containing the transformed document image tensor (converted to numpy array on CPU) + """ + + config_class = UVDocNetConfig + + def __init__(self, config: UVDocNetConfig): + super(UVDocNet, self).__init__(config) + + self.num_filter = config.num_filter + self.in_channels = config.in_channels + self.kernel_size = config.kernel_size + self.stride = config.stride + self.map_num = config.map_num + self.block_nums = config.block_nums + self.dilation_values = config.dilation_values + self.padding_mode = config.padding_mode + self.upsample_size = config.upsample_size + self.upsample_mode = config.upsample_mode + + self.resnet_head = nn.Sequential( + nn.Conv2D( + in_channels=self.in_channels, + 
out_channels=self.num_filter * self.map_num[0], + bias_attr=False, + kernel_size=self.kernel_size, + stride=2, + padding=self.kernel_size // 2, + ), + nn.BatchNorm2D(self.num_filter * self.map_num[0]), + nn.ReLU(), + nn.Conv2D( + in_channels=self.num_filter * self.map_num[0], + out_channels=self.num_filter * self.map_num[0], + bias_attr=False, + kernel_size=self.kernel_size, + stride=2, + padding=self.kernel_size // 2, + ), + nn.BatchNorm2D(self.num_filter * self.map_num[0]), + nn.ReLU(), + ) + + self.resnet_down = ResnetStraight( + self.num_filter, + self.map_num, + block_nums=self.block_nums, + kernel_size=self.kernel_size, + stride=self.stride, + ) + + bridge_in_channels = self.num_filter * self.map_num[2] + + def _build_bridge(bridge_key: str) -> nn.Sequential: + dilation = self.dilation_values[bridge_key] + if isinstance(dilation, int): + return nn.Sequential( + dilated_conv_bn_act( + bridge_in_channels, bridge_in_channels, dilation=dilation + ) + ) + else: + return nn.Sequential( + *[ + dilated_conv_bn_act( + bridge_in_channels, bridge_in_channels, dilation=d + ) + for d in dilation + ] + ) + + self.bridge_1 = _build_bridge("bridge_1") + self.bridge_2 = _build_bridge("bridge_2") + self.bridge_3 = _build_bridge("bridge_3") + self.bridge_4 = _build_bridge("bridge_4") + self.bridge_5 = _build_bridge("bridge_5") + self.bridge_6 = _build_bridge("bridge_6") + + self.bridge_concat = nn.Sequential( + nn.Conv2D( + in_channels=self.num_filter * self.map_num[2] * 6, + out_channels=self.num_filter * self.map_num[2], + bias_attr=False, + kernel_size=1, + stride=1, + padding=0, + ), + nn.BatchNorm2D(self.num_filter * self.map_num[2]), + nn.ReLU(), + ) + + self.out_point_positions2D = nn.Sequential( + nn.Conv2D( + in_channels=self.num_filter * self.map_num[2], + out_channels=self.num_filter * self.map_num[0], + bias_attr=False, + kernel_size=self.kernel_size, + stride=1, + padding=self.kernel_size // 2, + padding_mode=self.padding_mode, + ), + nn.BatchNorm2D(self.num_filter 
* self.map_num[0]), + nn.PReLU(), + nn.Conv2D( + in_channels=self.num_filter * self.map_num[0], + out_channels=2, + kernel_size=self.kernel_size, + stride=1, + padding=self.kernel_size // 2, + padding_mode=self.padding_mode, + ), + ) + + add_inference_operations("uvdoc_forward") + + @benchmark.timeit_with_options(name="uvdoc_forward") + def forward(self, x: Any) -> List[paddle.Tensor]: + x = paddle.to_tensor(x[0]) + + image = x + h_ori, w_ori = x.shape[2:] + x = F.upsample( + x, + size=(self.upsample_size[0], self.upsample_size[1]), + mode=self.upsample_mode, + align_corners=True, + ) + resnet_head = self.resnet_head(x) + resnet_down = self.resnet_down(resnet_head) + + bridge_1 = self.bridge_1(resnet_down) + bridge_2 = self.bridge_2(resnet_down) + bridge_3 = self.bridge_3(resnet_down) + bridge_4 = self.bridge_4(resnet_down) + bridge_5 = self.bridge_5(resnet_down) + bridge_6 = self.bridge_6(resnet_down) + + bridge_concat = paddle.concat( + x=[bridge_1, bridge_2, bridge_3, bridge_4, bridge_5, bridge_6], axis=1 + ) + bridge = self.bridge_concat(bridge_concat) + out_point_positions2D = self.out_point_positions2D(bridge) + + bm_up = F.upsample( + out_point_positions2D, + size=(h_ori, w_ori), + mode=self.upsample_mode, + align_corners=True, + ) + bm = bm_up.transpose([0, 2, 3, 1]) + out = F.grid_sample(image, bm, align_corners=True) + + return [out.cpu().numpy()] + + def _get_forward_key_rules(self): + default_rules = super()._get_forward_key_rules() + custom_rules = [("out_point_positions2D.2._weight", "_weight", "weight")] + return default_rules + custom_rules + + def _get_reverse_key_rules(self): + default_rules = super()._get_reverse_key_rules() + custom_rules = [("out_point_positions2D.2.weight", "weight", "_weight")] + return default_rules + custom_rules diff --git a/paddlex/inference/models/image_unwarping/predictor.py b/paddlex/inference/models/image_unwarping/predictor.py index fea09dc8a6..20c8b22214 100644 --- 
a/paddlex/inference/models/image_unwarping/predictor.py +++ b/paddlex/inference/models/image_unwarping/predictor.py @@ -17,6 +17,7 @@ import numpy as np from ....modules.image_unwarping.model_list import MODELS +from ....utils.device import TemporaryDeviceChanger from ...common.batch_sampler import ImageBatchSampler from ...common.reader import ReadImage from ..base import BasePredictor @@ -38,6 +39,7 @@ def __init__(self, *args: List, **kwargs: Dict) -> None: **kwargs: Arbitrary keyword arguments passed to the superclass. """ super().__init__(*args, **kwargs) + self.device = kwargs.get("device", None) self.preprocessors, self.infer, self.postprocessors = self._build() def _build_batch_sampler(self) -> ImageBatchSampler: @@ -66,8 +68,16 @@ def _build(self) -> Tuple: preprocessors["Normalize"] = Normalize(mean=0.0, std=1.0, scale=1.0 / 255) preprocessors["ToCHW"] = ToCHWImage() preprocessors["ToBatch"] = ToBatch() + if self._use_static_model: + infer = self.create_static_infer() + else: + from .modeling import UVDocNet - infer = self.create_static_infer() + with TemporaryDeviceChanger(self.device): + infer = UVDocNet.from_pretrained( + self.model_dir, use_safetensors=True, convert_from_hf=True + ) + infer.eval() postprocessors = {"DocTrPostProcess": DocTrPostProcess()} return preprocessors, infer, postprocessors @@ -86,7 +96,11 @@ def process(self, batch_data: List[Union[str, np.ndarray]]) -> Dict[str, Any]: batch_imgs = self.preprocessors["Normalize"](imgs=batch_raw_imgs) batch_imgs = self.preprocessors["ToCHW"](imgs=batch_imgs) x = self.preprocessors["ToBatch"](imgs=batch_imgs) - batch_preds = self.infer(x=x) + if self._use_static_model: + batch_preds = self.infer(x=x) + else: + with TemporaryDeviceChanger(self.device): + batch_preds = self.infer(x=x) batch_warp_preds = self.postprocessors["DocTrPostProcess"](batch_preds) return { diff --git a/paddlex/inference/models/common/vlm/transformers/__init__.py b/paddlex/inference/models/layout_analysis/__init__.py 
similarity index 87% rename from paddlex/inference/models/common/vlm/transformers/__init__.py rename to paddlex/inference/models/layout_analysis/__init__.py index 4badddda5f..c664053a49 100644 --- a/paddlex/inference/models/common/vlm/transformers/__init__.py +++ b/paddlex/inference/models/layout_analysis/__init__.py @@ -12,5 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .configuration_utils import PretrainedConfig -from .model_utils import PretrainedModel +from .predictor import LayoutAnalysisPredictor diff --git a/paddlex/inference/models/layout_analysis/predictor.py b/paddlex/inference/models/layout_analysis/predictor.py new file mode 100644 index 0000000000..4334cda112 --- /dev/null +++ b/paddlex/inference/models/layout_analysis/predictor.py @@ -0,0 +1,174 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import Any, List, Optional, Tuple, Union + +from ....modules.object_detection.model_list import LAYOUTANALYSIS_MODELS +from ..object_detection import DetPredictor +from ..object_detection.processors import Resize, ToBatch +from .processors import LayoutAnalysisProcess +from .result import LayoutAnalysisResult +from .utils import STATIC_SHAPE_MODEL_LIST + + +class LayoutAnalysisPredictor(DetPredictor): + + entities = LAYOUTANALYSIS_MODELS + + def __init__( + self, + *args, + img_size: Optional[Union[int, Tuple[int, int]]] = None, + **kwargs, + ): + """Initializes LayoutAnalysisPredictor. + Args: + *args: Arbitrary positional arguments passed to the superclass. + img_size (Optional[Union[int, Tuple[int, int]]], optional): The input image size (w, h). Defaults to None. + threshold (Optional[float], optional): The threshold for filtering out low-confidence predictions. + Defaults to None. + layout_nms (bool, optional): Whether to use layout-aware NMS. Defaults to False. + layout_unclip_ratio (Optional[Union[float, Tuple[float, float]]], optional): The ratio of unclipping the bounding box. + Defaults to None. + If it's a single number, then both width and height are used. + If it's a tuple of two numbers, then they are used separately for width and height respectively. + If it's None, then no unclipping will be performed. + layout_merge_bboxes_mode (Optional[Union[str, dict]], optional): The mode for merging bounding boxes. Defaults to None. + **kwargs: Arbitrary keyword arguments passed to the superclass. + """ + if img_size is not None: + assert ( + self.model_name not in STATIC_SHAPE_MODEL_LIST + ), f"The model {self.model_name} is not supported set input shape" + if isinstance(img_size, int): + img_size = (img_size, img_size) + elif isinstance(img_size, (tuple, list)): + assert len(img_size) == 2, f"The length of `img_size` should be 2." + else: + raise ValueError( + f"The type of `img_size` must be int or Tuple[int, int], but got {type(img_size)}." 
+ ) + super().__init__(*args, **kwargs) + + def _get_result_class(self): + return LayoutAnalysisResult + + def process( + self, + batch_data: List[Any], + threshold: Optional[Union[float, dict]] = None, + layout_nms: bool = False, + layout_unclip_ratio: Optional[Union[float, Tuple[float, float], dict]] = None, + layout_merge_bboxes_mode: Optional[Union[str, dict]] = None, + layout_shape_mode: Optional[str] = "auto", + filter_overlap_boxes: Optional[bool] = True, + skip_order_labels: Optional[List[str]] = None, + ): + """ + Process a batch of data through the preprocessing, inference, and postprocessing. + + Args: + batch_data (List[Union[str, np.ndarray], ...]): A batch of input data (e.g., image file paths). + threshold (Optional[float, dict], optional): The threshold for filtering out low-confidence predictions. + layout_nms (bool, optional): Whether to use layout-aware NMS. Defaults to None. + layout_unclip_ratio (Optional[Union[float, Tuple[float, float]]], optional): The ratio of unclipping the bounding box. + layout_merge_bboxes_mode (Optional[Union[str, dict]], optional): The mode for merging bounding boxes. Defaults to None. + layout_shape_mode (Optional[str], optional): The mode for layout shape. Defaults to "auto", [ "rect", "quad","poly", "auto"]. are supported. + filter_overlap_boxes (Optional[bool], optional): Whether to filter out overlap boxes. Defaults to True. + skip_order_labels (Optional[List[str]], optional): The labels to skip order. Defaults to None. + + Returns: + dict: A dictionary containing the input path, raw image, class IDs, scores, and label names + for every instance of the batch. Keys include 'input_path', 'input_img', 'class_ids', 'scores', and 'label_names'. 
+ """ + datas = batch_data.instances + # preprocess + for pre_op in self.pre_ops[:-1]: + datas = pre_op(datas) + + # use `ToBatch` format batch inputs + batch_inputs = self.pre_ops[-1](datas) + + # do infer + batch_preds = self.infer(batch_inputs) + + # process a batch of predictions into a list of single image result + preds_list = self._format_output(batch_preds) + # postprocess + boxes = self.post_op( + preds_list, + datas, + threshold=threshold if threshold is not None else self.threshold, + layout_nms=layout_nms or self.layout_nms, + layout_unclip_ratio=layout_unclip_ratio or self.layout_unclip_ratio, + layout_merge_bboxes_mode=layout_merge_bboxes_mode + or self.layout_merge_bboxes_mode, + layout_shape_mode=layout_shape_mode, + filter_overlap_boxes=filter_overlap_boxes, + skip_order_labels=skip_order_labels, + ) + + return { + "input_path": batch_data.input_paths, + "page_index": batch_data.page_indexes, + "input_img": [data["ori_img"] for data in datas], + "boxes": boxes, + } + + @DetPredictor.register("Resize") + def build_resize(self, target_size, keep_ratio=False, interp=2): + assert target_size + self.target_size = target_size + if isinstance(interp, int): + interp = { + 0: "NEAREST", + 1: "LINEAR", + 2: "BICUBIC", + 3: "AREA", + 4: "LANCZOS4", + }[interp] + op = Resize(target_size=target_size[::-1], keep_ratio=keep_ratio, interp=interp) + return op + + def build_to_batch(self): + models_required_imgsize = [ + "PP-DocLayoutV2", + "PP-DocLayoutV3", + ] + if any(name in self.model_name for name in models_required_imgsize): + ordered_required_keys = ( + "img_size", + "img", + "scale_factors", + ) + else: + ordered_required_keys = ("img", "scale_factors") + + return ToBatch(ordered_required_keys=ordered_required_keys) + + def build_postprocess(self): + if self.threshold is None: + self.threshold = self.config.get("draw_threshold", 0.5) + if not self.layout_nms: + self.layout_nms = self.config.get("layout_nms", None) + if self.layout_unclip_ratio is None: + 
self.layout_unclip_ratio = self.config.get("layout_unclip_ratio", None) + if self.layout_merge_bboxes_mode is None: + self.layout_merge_bboxes_mode = self.config.get( + "layout_merge_bboxes_mode", None + ) + scale_size = getattr(self, "target_size", [800, 800]) + return LayoutAnalysisProcess( + labels=self.config["label_list"], scale_size=scale_size + ) diff --git a/paddlex/inference/models/layout_analysis/processors.py b/paddlex/inference/models/layout_analysis/processors.py new file mode 100644 index 0000000000..d1672a2000 --- /dev/null +++ b/paddlex/inference/models/layout_analysis/processors.py @@ -0,0 +1,949 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Dict, List, Optional, Tuple, Union + +import numpy as np +from numpy import ndarray + +from ....utils import logging +from ....utils.deps import function_requires_deps, is_dep_available +from ...utils.benchmark import benchmark +from ..object_detection.processors import check_containment, nms + +if is_dep_available("opencv-contrib-python"): + import cv2 + +Boxes = List[dict] +Number = Union[int, float] + + +SKIP_ORDER_LABELS = [ + "figure_title", + "vision_footnote", + "image", + "chart", + "table", + "header", + "header_image", + "footer", + "footer_image", + "footnote", + "aside_text", +] + + +def is_convex(p_prev, p_curr, p_next): + """ + Calculate if the polygon is convex. 
+ """ + v1 = p_curr - p_prev + v2 = p_next - p_curr + cross = v1[0] * v2[1] - v1[1] * v2[0] + return cross < 0 + + +def angle_between_vectors(v1, v2): + """ + Calculate the angle between two vectors. + """ + + unit_v1 = v1 / np.linalg.norm(v1) + unit_v2 = v2 / np.linalg.norm(v2) + dot_prod = np.clip(np.dot(unit_v1, unit_v2), -1.0, 1.0) + angle_rad = np.arccos(dot_prod) + return np.degrees(angle_rad) + + +def calc_new_point(p_curr, v1, v2, distance=20): + """ + Calculate the new point based on the direction of two vectors. + """ + dir_vec = v1 / np.linalg.norm(v1) + v2 / np.linalg.norm(v2) + dir_vec = dir_vec / np.linalg.norm(dir_vec) + p_new = p_curr + dir_vec * distance + return p_new + + +def extract_custom_vertices( + polygon, max_allowed_dist, sharp_angle_thresh=45, max_dist_ratio=0.3 +): + poly = np.array(polygon) + n = len(poly) + max_allowed_dist *= max_dist_ratio + + point_info = [] + for i in range(n): + p_prev, p_curr, p_next = poly[(i - 1) % n], poly[i], poly[(i + 1) % n] + v1, v2 = p_prev - p_curr, p_next - p_curr + is_convex_point = is_convex(p_prev, p_curr, p_next) + angle = angle_between_vectors(v1, v2) + point_info.append( + { + "index": i, + "is_convex": is_convex_point, + "angle": angle, + "v1": v1, + "v2": v2, + } + ) + + concave_indices = [i for i, info in enumerate(point_info) if not info["is_convex"]] + preserve_concave = set() + + if concave_indices: + groups = [] + current_group = [concave_indices[0]] + + for i in range(1, len(concave_indices)): + if concave_indices[i] - concave_indices[i - 1] == 1 or ( + concave_indices[i - 1] == n - 1 and concave_indices[i] == 0 + ): + current_group.append(concave_indices[i]) + else: + if len(current_group) >= 2: + groups.extend(current_group) + current_group = [concave_indices[i]] + + if len(current_group) >= 2: + groups.extend(current_group) + + if ( + len(concave_indices) >= 2 + and concave_indices[0] == 0 + and concave_indices[-1] == n - 1 + ): + if 0 in groups and n - 1 in groups: + 
preserve_concave.update(groups) + else: + preserve_concave.update(groups) + + kept_points = [ + i + for i, info in enumerate(point_info) + if info["is_convex"] or (i in preserve_concave and info["angle"] >= 120) + ] + + final_points = [] + for idx in range(len(kept_points)): + current_idx = kept_points[idx] + next_idx = kept_points[(idx + 1) % len(kept_points)] + final_points.append(current_idx) + + dist = np.linalg.norm(poly[current_idx] - poly[next_idx]) + if dist > max_allowed_dist: + intermediate = ( + list(range(current_idx + 1, next_idx)) + if next_idx > current_idx + else list(range(current_idx + 1, n)) + list(range(0, next_idx)) + ) + + if intermediate: + num_needed = int(np.ceil(dist / max_allowed_dist)) - 1 + if len(intermediate) <= num_needed: + final_points.extend(intermediate) + else: + step = len(intermediate) / num_needed + final_points.extend( + [intermediate[int(i * step)] for i in range(num_needed)] + ) + + final_points = sorted(set(final_points)) + res = [] + + for i in final_points: + info = point_info[i] + p_curr = poly[i] + + if info["is_convex"] and abs(info["angle"] - sharp_angle_thresh) < 1: + v1_norm = info["v1"] / np.linalg.norm(info["v1"]) + v2_norm = info["v2"] / np.linalg.norm(info["v2"]) + dir_vec = v1_norm + v2_norm + dir_vec /= np.linalg.norm(dir_vec) + d = (np.linalg.norm(info["v1"]) + np.linalg.norm(info["v2"])) / 2 + res.append(tuple(p_curr + dir_vec * d)) + else: + res.append(tuple(p_curr)) + + return res + + +@function_requires_deps("opencv-contrib-python") +def mask2polygon(mask, max_allowed_dist, epsilon_ratio=0.004, extract_custom=True): + """ + Postprocess mask by removing small noise. + Args: + mask (ndarray): The input mask of shape [H, W]. + epsilon_ratio (float): The ratio of epsilon. + Returns: + ndarray: The output mask after postprocessing. 
+ """ + cnts, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + + if not cnts: + return None + + cnt = max(cnts, key=cv2.contourArea) + epsilon = epsilon_ratio * cv2.arcLength(cnt, True) + approx_cnt = cv2.approxPolyDP(cnt, epsilon, True) + polygon_points = approx_cnt.squeeze() + polygon_points = np.atleast_2d(polygon_points) + if extract_custom: + polygon_points = extract_custom_vertices(polygon_points, max_allowed_dist) + + return polygon_points + + +def extract_polygon_points_by_masks(boxes, masks, scale_ratio, layout_shape_mode): + """ + 修改后的提取函数:auto 模式下信任几何决策 + """ + scale_w, scale_h = scale_ratio[0] / 4, scale_ratio[1] / 4 + h_m, w_m = masks.shape[1:] + polygon_points = [] + iou_threshold = 0.95 + + max_box_w = max(boxes[:, 4] - boxes[:, 3]) + + for i in range(len(boxes)): + x_min, y_min, x_max, y_max = boxes[i, 2:6].astype(np.int32) + box_w, box_h = x_max - x_min, y_max - y_min + + # default rect + rect = np.array( + [[x_min, y_min], [x_max, y_min], [x_max, y_max], [x_min, y_max]], + dtype=np.float32, + ) + + if box_w <= 0 or box_h <= 0: + polygon_points.append(rect) + continue + + # crop mask + x_s = np.clip( + [int(round(x_min * scale_w)), int(round(x_max * scale_w))], 0, w_m + ) + y_s = np.clip( + [int(round(y_min * scale_h)), int(round(y_max * scale_h))], 0, h_m + ) + + cropped = masks[i, y_s[0] : y_s[1], x_s[0] : x_s[1]] + if cropped.size == 0 or np.sum(cropped) == 0: + polygon_points.append(rect) + continue + + if layout_shape_mode == "rect": + polygon_points.append(rect) + continue + + # resize mask to match box size + resized_mask = cv2.resize( + cropped.astype(np.uint8), (box_w, box_h), interpolation=cv2.INTER_NEAREST + ) + + if box_w > max_box_w * 0.6: + max_allowed_dist = box_w + else: + max_allowed_dist = max_box_w + + polygon = mask2polygon(resized_mask, max_allowed_dist) + if polygon is not None and len(polygon) < 4: + polygon_points.append(rect) + continue + if polygon is not None and len(polygon) > 0: + polygon = 
polygon + np.array([x_min, y_min]) + if layout_shape_mode == "poly": + polygon_points.append(polygon) + elif layout_shape_mode == "quad": + # convert polygon to quadrilateral + quad = convert_polygon_to_quad(polygon) + polygon_points.append(quad if quad is not None else rect) + elif layout_shape_mode == "auto": + iou_threshold = 0.8 + + rect_list = rect.tolist() + quad = convert_polygon_to_quad(polygon) + if quad is not None: + quad_list = quad.tolist() + + iou_quad = calculate_polygon_overlap_ratio( + rect_list, + quad_list, + mode="union", + ) + if iou_quad >= 0.95: + # if quad is very similar to rect, use rect instead + quad = rect + + poly_list = ( + polygon.tolist() if isinstance(polygon, np.ndarray) else polygon + ) + + iou_quad = calculate_polygon_overlap_ratio( + poly_list, quad_list, mode="union" + ) + + pre_poly = polygon_points[-1] if len(polygon_points) > 0 else None + iou_pre = 0 + if pre_poly is not None: + iou_pre = calculate_polygon_overlap_ratio( + pre_poly.tolist(), + rect_list, + mode="small", + ) + + if iou_quad >= iou_threshold and iou_pre < 0.01: + # if quad is similar to polygon, use quad + polygon_points.append(quad) + continue + + # if all ious are less than threshold, use polygon + polygon_points.append(polygon) + else: + raise ValueError( + "layout_shape_mode must be one of ['rect', 'poly', 'quad', 'auto']" + ) + + return polygon_points + + +def convert_polygon_to_quad(polygon): + """ + Convert polygon to minimum bounding rectangle (quad). + Args: + polygon (ndarray): The polygon points of shape [N, 2]. + Returns: + quad (ndarray): The 4-point quad, clockwise from top-left, or None if invalid. 
+ """ + if polygon is None or len(polygon) < 3: + return None + + points = np.array(polygon, dtype=np.float32) + if len(points.shape) == 1: + points = points.reshape(-1, 2) + + min_rect = cv2.minAreaRect(points) + quad = cv2.boxPoints(min_rect) + + center = quad.mean(axis=0) + angles = np.arctan2(quad[:, 1] - center[1], quad[:, 0] - center[0]) + sorted_indices = np.argsort(angles) + quad = quad[sorted_indices] + sums = quad[:, 0] + quad[:, 1] + top_left_idx = np.argmin(sums) + quad = np.roll(quad, -top_left_idx, axis=0) + + return quad + + +def restructured_boxes( + boxes: ndarray, + labels: List[str], + img_size: Tuple[int, int], + polygon_points: ndarray = None, +) -> Boxes: + """ + Restructure the given bounding boxes and labels based on the image size. + + Args: + boxes (ndarray): A 2D array of bounding boxes with each box represented as [cls_id, score, xmin, ymin, xmax, ymax]. + labels (List[str]): A list of class labels corresponding to the class ids. + img_size (Tuple[int, int]): A tuple representing the width and height of the image. + polygon_points (ndarray): A 2D array of polygon points with each point represented as [x, y]. + Returns: + Boxes: A list of dictionaries, each containing 'cls_id', 'label', 'score', and 'coordinate' keys. 
+ """ + box_list = [] + w, h = img_size + + for idx, box in enumerate(boxes): + xmin, ymin, xmax, ymax = box[2:] + xmin = int(max(0, xmin)) + ymin = int(max(0, ymin)) + xmax = int(min(w, xmax)) + ymax = int(min(h, ymax)) + if xmax <= xmin or ymax <= ymin: + continue + res = { + "cls_id": int(box[0]), + "label": labels[int(box[0])], + "score": float(box[1]), + "coordinate": [xmin, ymin, xmax, ymax], + "order": idx + 1, + } + if polygon_points is not None: + polygon_point = polygon_points[idx] + if polygon_point is None: + continue + res["polygon_points"] = polygon_point + box_list.append(res) + + return box_list + + +def unclip_boxes(boxes, unclip_ratio=None): + """ + Expand bounding boxes from (x1, y1, x2, y2) format using an unclipping ratio. + + Parameters: + - boxes: np.ndarray of shape (N, 4), where each row is (x1, y1, x2, y2). + - unclip_ratio: tuple of (width_ratio, height_ratio), optional. + + Returns: + - expanded_boxes: np.ndarray of shape (N, 4), where each row is (x1, y1, x2, y2). 
+ """ + if unclip_ratio is None: + return boxes + + if isinstance(unclip_ratio, dict): + expanded_boxes = [] + for box in boxes: + class_id, score, x1, y1, x2, y2 = box + if class_id in unclip_ratio: + width_ratio, height_ratio = unclip_ratio[class_id] + + width = x2 - x1 + height = y2 - y1 + + new_w = width * width_ratio + new_h = height * height_ratio + center_x = x1 + width / 2 + center_y = y1 + height / 2 + + new_x1 = center_x - new_w / 2 + new_y1 = center_y - new_h / 2 + new_x2 = center_x + new_w / 2 + new_y2 = center_y + new_h / 2 + + expanded_boxes.append([class_id, score, new_x1, new_y1, new_x2, new_y2]) + else: + expanded_boxes.append(box) + return np.array(expanded_boxes) + + else: + widths = boxes[:, 4] - boxes[:, 2] + heights = boxes[:, 5] - boxes[:, 3] + + new_w = widths * unclip_ratio[0] + new_h = heights * unclip_ratio[1] + center_x = boxes[:, 2] + widths / 2 + center_y = boxes[:, 3] + heights / 2 + + new_x1 = center_x - new_w / 2 + new_y1 = center_y - new_h / 2 + new_x2 = center_x + new_w / 2 + new_y2 = center_y + new_h / 2 + expanded_boxes = np.column_stack( + (boxes[:, 0], boxes[:, 1], new_x1, new_y1, new_x2, new_y2) + ) + return expanded_boxes + + +def make_valid(poly): + if not poly.is_valid: + poly = poly.buffer(0) + return poly + + +def calculate_polygon_overlap_ratio( + polygon1: List[Tuple[int, int]], + polygon2: List[Tuple[int, int]], + mode: str = "union", +) -> float: + """ + Calculate the overlap ratio between two polygons. + + Args: + polygon1 (List[Tuple[int, int]]): First polygon represented as a list of points. + polygon2 (List[Tuple[int, int]]): Second polygon represented as a list of points. + mode (str, optional): Overlap calculation mode. Defaults to "union". + + Returns: + float: Overlap ratio value between 0 and 1. 
+ """ + try: + from shapely.geometry import Polygon + except ImportError: + raise ImportError("Please install Shapely library.") + poly1 = Polygon(polygon1) + poly2 = Polygon(polygon2) + poly1 = make_valid(poly1) + poly2 = make_valid(poly2) + intersection = poly1.intersection(poly2).area + union = poly1.union(poly2).area + if mode == "union": + return intersection / union + elif mode == "small": + small_area = min(poly1.area, poly2.area) + return intersection / small_area + elif mode == "large": + large_area = max(poly1.area, poly2.area) + return intersection / large_area + else: + raise ValueError(f"Unknown mode: {mode}") + + +def calculate_bbox_area(bbox): + """Calculate bounding box area""" + x1, y1, x2, y2 = map(float, bbox) + area = abs((x2 - x1) * (y2 - y1)) + return area + + +def calculate_overlap_ratio( + bbox1: Union[np.ndarray, list, tuple], + bbox2: Union[np.ndarray, list, tuple], + mode="union", +) -> float: + """ + Calculate the overlap ratio between two bounding boxes using NumPy. + + Args: + bbox1 (np.ndarray, list or tuple): The first bounding box, format [x_min, y_min, x_max, y_max] + bbox2 (np.ndarray, list or tuple): The second bounding box, format [x_min, y_min, x_max, y_max] + mode (str): The mode of calculation, either 'union', 'small', or 'large'. 
+ + Returns: + float: The overlap ratio value between the two bounding boxes + """ + bbox1 = np.array(bbox1) + bbox2 = np.array(bbox2) + + x_min_inter = np.maximum(bbox1[0], bbox2[0]) + y_min_inter = np.maximum(bbox1[1], bbox2[1]) + x_max_inter = np.minimum(bbox1[2], bbox2[2]) + y_max_inter = np.minimum(bbox1[3], bbox2[3]) + + inter_width = np.maximum(0, x_max_inter - x_min_inter) + inter_height = np.maximum(0, y_max_inter - y_min_inter) + + inter_area = inter_width * inter_height + + bbox1_area = calculate_bbox_area(bbox1) + bbox2_area = calculate_bbox_area(bbox2) + + if mode == "union": + ref_area = bbox1_area + bbox2_area - inter_area + elif mode == "small": + ref_area = np.minimum(bbox1_area, bbox2_area) + elif mode == "large": + ref_area = np.maximum(bbox1_area, bbox2_area) + else: + raise ValueError( + f"Invalid mode {mode}, must be one of ['union', 'small', 'large']." + ) + + if ref_area == 0: + return 0.0 + + return inter_area / ref_area + + +def filter_boxes( + src_boxes: Dict[str, List[Dict]], layout_shape_mode: str +) -> Dict[str, List[Dict]]: + """ + Remove overlapping boxes from layout detection results based on a given overlap ratio. + + Args: + boxes (Dict[str, List[Dict]]): Layout detection result dict containing a 'boxes' list. + + Returns: + Dict[str, List[Dict]]: Filtered dict with overlapping boxes removed. 
+ """ + boxes = [box for box in src_boxes if box["label"] != "reference"] + dropped_indexes = set() + + for i in range(len(boxes)): + x1, y1, x2, y2 = boxes[i]["coordinate"] + w, h = x2 - x1, y2 - y1 + if w < 6 or h < 6: + dropped_indexes.add(i) + for j in range(i + 1, len(boxes)): + if i in dropped_indexes or j in dropped_indexes: + continue + overlap_ratio = calculate_overlap_ratio( + boxes[i]["coordinate"], boxes[j]["coordinate"], "small" + ) + if ( + boxes[i]["label"] == "inline_formula" + or boxes[j]["label"] == "inline_formula" + ): + if overlap_ratio > 0.5: + if boxes[i]["label"] == "inline_formula": + dropped_indexes.add(i) + if boxes[j]["label"] == "inline_formula": + dropped_indexes.add(j) + continue + if overlap_ratio > 0.7: + if layout_shape_mode != "rect" and "polygon_points" in boxes[i]: + poly_overlap_ratio = calculate_polygon_overlap_ratio( + boxes[i]["polygon_points"], boxes[j]["polygon_points"], "small" + ) + if poly_overlap_ratio < 0.7: + continue + box_area_i = calculate_bbox_area(boxes[i]["coordinate"]) + box_area_j = calculate_bbox_area(boxes[j]["coordinate"]) + if ( + boxes[i]["label"] == "image" or boxes[j]["label"] == "image" + ) and boxes[i]["label"] != boxes[j]["label"]: + continue + if box_area_i >= box_area_j: + dropped_indexes.add(j) + else: + dropped_indexes.add(i) + out_boxes = [box for idx, box in enumerate(boxes) if idx not in dropped_indexes] + return out_boxes + + +def update_order_index(boxes: List[Dict], skip_order_labels: List[str]): + """ + Update the 'order_index' field of each box in the provided list of boxes. + + Args: + boxes (List[Dict]): A list of boxes, where each box is represented as a dictionary with an 'order_index' field. + + Returns: + None. The function updates the 'order_index' field of each box in the input list. 
+ """ + order_index = 1 + for box in boxes: + label = box["label"] + if label not in skip_order_labels: + box["order"] = order_index + order_index += 1 + else: + box["order"] = None + return boxes + + +def find_label_position(box, polygon_points, text_w, text_h, max_shift=50): + try: + from shapely.geometry import Polygon + except ImportError: + raise ImportError("Please install Shapely library.") + poly = Polygon(polygon_points) + min_x = min([p[0] for p in polygon_points]) + min_y = min([p[1] for p in polygon_points]) + for dy in range(max_shift): + x1, y1 = min_x, min_y + dy + x2, y2 = x1 + text_w, y1 + text_h + label_rect = box(x1, y1, x2, y2) + if poly.intersects(label_rect): + return int(x1), int(y1) + + return int(min_x), int(min_y) + + +@benchmark.timeit +class LayoutAnalysisProcess: + """Save Result Transform + + This class is responsible for post-processing detection results, including + thresholding, non-maximum suppression (NMS), and restructuring the boxes + based on the input type (normal or rotated object detection). + """ + + def __init__( + self, labels: Optional[List[str]] = None, scale_size: Optional[List[int]] = None + ) -> None: + """Initialize the DetPostProcess class. + + Args: + threshold (float, optional): The threshold to apply to the detection scores. Defaults to 0.5. + labels (Optional[List[str]], optional): The list of labels for the detection categories. Defaults to None. + layout_postprocess (bool, optional): Whether to apply layout post-processing. Defaults to False. 
+ """ + super().__init__() + self.labels = labels + self.scale_size = scale_size + + def apply( + self, + boxes: ndarray, + img_size: Tuple[int, int], + threshold: Union[float, dict], + layout_nms: Optional[bool], + layout_unclip_ratio: Optional[Union[float, Tuple[float, float], dict]], + layout_merge_bboxes_mode: Optional[Union[str, dict]], + masks: Optional[ndarray] = None, + layout_shape_mode: Optional[str] = "auto", + ) -> Boxes: + """Apply post-processing to the detection boxes. + + Args: + boxes (ndarray): The input detection boxes with scores. + img_size (tuple): The original image size. + + Returns: + Boxes: The post-processed detection boxes. + """ + if layout_shape_mode == "rect": + masks = None + boxes[:, 2:6] = np.round(boxes[:, 2:6]).astype(int) + if isinstance(threshold, float): + expect_boxes = (boxes[:, 1] > threshold) & (boxes[:, 0] > -1) + boxes = boxes[expect_boxes, :] + if masks is not None: + masks = masks[expect_boxes, ...] + elif isinstance(threshold, dict): + category_filtered_boxes = [] + if masks is not None: + category_filtered_masks = [] + for cat_id in np.unique(boxes[:, 0]): + category_boxes = boxes[boxes[:, 0] == cat_id] + if masks is not None: + category_masks = masks[boxes[:, 0] == cat_id] + category_threshold = threshold.get(int(cat_id), 0.5) + selected_indices = (category_boxes[:, 1] > category_threshold) & ( + category_boxes[:, 0] > -1 + ) + if masks is not None: + category_masks = category_masks[selected_indices] + category_filtered_masks.append(category_masks) + category_filtered_boxes.append(category_boxes[selected_indices]) + boxes = ( + np.vstack(category_filtered_boxes) + if category_filtered_boxes + else np.array([]) + ) + if masks is not None: + masks = ( + np.concatenate(category_filtered_masks) + if category_filtered_masks + else np.array([]) + ) + + if layout_nms: + selected_indices = nms(boxes[:, :6], iou_same=0.6, iou_diff=0.98) + boxes = np.array(boxes[selected_indices]) + if masks is not None: + masks = [masks[i] 
for i in selected_indices] + + filter_large_image = True + # boxes.shape[1] == 6 is object detection, 7 is new ordered object detection, 8 is ordered object detection + if filter_large_image and len(boxes) > 1 and boxes.shape[1] in [6, 7, 8]: + if img_size[0] > img_size[1]: + area_thres = 0.82 + else: + area_thres = 0.93 + image_index = self.labels.index("image") if "image" in self.labels else None + img_area = img_size[0] * img_size[1] + filtered_boxes = [] + filtered_masks = [] + for idx, box in enumerate(boxes): + ( + label_index, + score, + xmin, + ymin, + xmax, + ymax, + ) = box[:6] + if label_index == image_index: + xmin = max(0, xmin) + ymin = max(0, ymin) + xmax = min(img_size[0], xmax) + ymax = min(img_size[1], ymax) + box_area = (xmax - xmin) * (ymax - ymin) + if box_area <= area_thres * img_area: + filtered_boxes.append(box) + if masks is not None: + filtered_masks.append(masks[idx]) + else: + filtered_boxes.append(box) + if masks is not None: + filtered_masks.append(masks[idx]) + if len(filtered_boxes) == 0: + filtered_boxes = boxes + if masks is not None: + filtered_masks = masks + boxes = np.array(filtered_boxes) + if masks is not None: + masks = filtered_masks + + if layout_merge_bboxes_mode: + formula_index = ( + self.labels.index("formula") if "formula" in self.labels else None + ) + if isinstance(layout_merge_bboxes_mode, str): + assert layout_merge_bboxes_mode in [ + "union", + "large", + "small", + ], f"The value of `layout_merge_bboxes_mode` must be one of ['union', 'large', 'small'], but got {layout_merge_bboxes_mode}" + + if layout_merge_bboxes_mode == "union": + pass + else: + contains_other, contained_by_other = check_containment( + boxes[:, :6], formula_index + ) + if layout_merge_bboxes_mode == "large": + boxes = boxes[contained_by_other == 0] + if masks is not None: + masks = [ + mask + for i, mask in enumerate(masks) + if contained_by_other[i] == 0 + ] + elif layout_merge_bboxes_mode == "small": + boxes = boxes[(contains_other == 0) | 
(contained_by_other == 1)] + if masks is not None: + masks = [ + mask + for i, mask in enumerate(masks) + if (contains_other[i] == 0) + | (contained_by_other[i] == 1) + ] + elif isinstance(layout_merge_bboxes_mode, dict): + keep_mask = np.ones(len(boxes), dtype=bool) + for category_index, layout_mode in layout_merge_bboxes_mode.items(): + assert layout_mode in [ + "union", + "large", + "small", + ], f"The value of `layout_merge_bboxes_mode` must be one of ['union', 'large', 'small'], but got {layout_mode}" + if layout_mode == "union": + pass + else: + if layout_mode == "large": + contains_other, contained_by_other = check_containment( + boxes[:, :6], + formula_index, + category_index, + mode=layout_mode, + ) + # Remove boxes that are contained by other boxes + keep_mask &= contained_by_other == 0 + elif layout_mode == "small": + contains_other, contained_by_other = check_containment( + boxes[:, :6], + formula_index, + category_index, + mode=layout_mode, + ) + # Keep boxes that do not contain others or are contained by others + keep_mask &= (contains_other == 0) | ( + contained_by_other == 1 + ) + boxes = boxes[keep_mask] + if masks is not None: + masks = [mask for i, mask in enumerate(masks) if keep_mask[i]] + + if boxes.size == 0: + return np.array([]) + + if boxes.shape[1] == 8: + # Sort boxes by their order + sorted_idx = np.lexsort((-boxes[:, 7], boxes[:, 6])) + sorted_boxes = boxes[sorted_idx] + boxes = sorted_boxes[:, :6] + if masks is not None: + sorted_masks = [masks[i] for i in sorted_idx] + masks = sorted_masks + + if boxes.shape[1] == 7: + # Sort boxes by their order + sorted_idx = np.argsort(boxes[:, 6]) + sorted_boxes = boxes[sorted_idx] + boxes = sorted_boxes[:, :6] + if masks is not None: + sorted_masks = [masks[i] for i in sorted_idx] + masks = sorted_masks + + polygon_points = None + if masks is not None: + scale_ratio = [h / s for h, s in zip(self.scale_size, img_size)] + polygon_points = extract_polygon_points_by_masks( + boxes, np.array(masks), 
scale_ratio, layout_shape_mode + ) + + if layout_unclip_ratio: + if isinstance(layout_unclip_ratio, float): + layout_unclip_ratio = (layout_unclip_ratio, layout_unclip_ratio) + elif isinstance(layout_unclip_ratio, (tuple, list)): + assert ( + len(layout_unclip_ratio) == 2 + ), f"The length of `layout_unclip_ratio` should be 2." + elif isinstance(layout_unclip_ratio, dict): + pass + else: + raise ValueError( + f"The type of `layout_unclip_ratio` must be float, Tuple[float, float] or Dict[int, Tuple[float, float]], but got {type(layout_unclip_ratio)}." + ) + boxes = unclip_boxes(boxes, layout_unclip_ratio) + + if boxes.shape[1] == 6: + """For Normal Object Detection""" + boxes = restructured_boxes(boxes, self.labels, img_size, polygon_points) + else: + """Unexpected Input Box Shape""" + raise ValueError( + f"The shape of boxes should be 6 or 10, instead of {boxes.shape[1]}" + ) + return boxes + + def __call__( + self, + batch_outputs: List[dict], + datas: List[dict], + threshold: Optional[Union[float, dict]] = None, + layout_nms: Optional[bool] = None, + layout_unclip_ratio: Optional[Union[float, Tuple[float, float]]] = None, + layout_merge_bboxes_mode: Optional[str] = None, + layout_shape_mode: Optional[str] = None, + filter_overlap_boxes: Optional[bool] = None, + skip_order_labels: Optional[List[str]] = None, + ) -> List[Boxes]: + """Apply the post-processing to a batch of outputs. + + Args: + batch_outputs (List[dict]): The list of detection outputs. + datas (List[dict]): The list of input data. + + Returns: + List[Boxes]: The list of post-processed detection boxes. 
+ """ + outputs = [] + for idx, (data, output) in enumerate(zip(datas, batch_outputs)): + if "masks" in output: + masks = output["masks"] + else: + layout_shape_mode = "rect" + if idx == 0 and layout_shape_mode not in ["rect", "auto"]: + logging.warning( + f"The model you are using does not support polygon output, but the layout_shape_mode is specified as {layout_shape_mode}, which will be set to 'rect'" + ) + masks = None + boxes = self.apply( + output["boxes"], + data["ori_img_size"], + threshold, + layout_nms, + layout_unclip_ratio, + layout_merge_bboxes_mode, + masks, + layout_shape_mode, + ) + if filter_overlap_boxes: + boxes = filter_boxes(boxes, layout_shape_mode) + skip_order_labels = ( + skip_order_labels + if skip_order_labels is not None + else SKIP_ORDER_LABELS + ) + boxes = update_order_index(boxes, skip_order_labels) + outputs.append(boxes) + return outputs diff --git a/paddlex/inference/models/layout_analysis/result.py b/paddlex/inference/models/layout_analysis/result.py new file mode 100644 index 0000000000..cabb9ff5cb --- /dev/null +++ b/paddlex/inference/models/layout_analysis/result.py @@ -0,0 +1,238 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import copy +from typing import List + +import numpy as np +import PIL +from PIL import Image, ImageDraw, ImageFont + +from ....utils.deps import function_requires_deps, is_dep_available +from ....utils.fonts import PINGFANG_FONT +from ...common.result import BaseCVResult, JsonMixin +from ...utils.color_map import font_colormap, get_colormap + +if is_dep_available("opencv-contrib-python"): + import cv2 + + +def draw_box(img: Image.Image, boxes: List[dict]) -> Image.Image: + """ + Args: + img (PIL.Image.Image): PIL image + boxes (list): a list of dictionaries representing detection box information. + Returns: + img (PIL.Image.Image): visualized image + """ + font_size = int(0.018 * int(img.width)) + 2 + font = ImageFont.truetype(PINGFANG_FONT.path, font_size, encoding="utf-8") + + draw_thickness = int(max(img.size) * 0.002) + draw = ImageDraw.Draw(img) + label2color = {} + catid2fontcolor = {} + color_list = get_colormap(rgb=True) + + for i, dt in enumerate(boxes): + label, bbox, score = dt["label"], dt["coordinate"], dt["score"] + if label not in label2color: + color_index = i % len(color_list) + label2color[label] = color_list[color_index] + catid2fontcolor[label] = font_colormap(color_index) + color = tuple(label2color[label]) + font_color = tuple(catid2fontcolor[label]) + + if len(bbox) == 4: + # draw bbox of normal object detection + xmin, ymin, xmax, ymax = bbox + rectangle = [ + (xmin, ymin), + (xmin, ymax), + (xmax, ymax), + (xmax, ymin), + (xmin, ymin), + ] + else: + raise ValueError( + f"Only support bbox format of [xmin,ymin,xmax,ymax] or [x1,y1,x2,y2,x3,y3,x4,y4], got bbox of shape {len(bbox)}." 
+ ) + + # draw bbox + draw.line( + rectangle, + width=draw_thickness, + fill=color, + ) + + # draw label + text = "{} {:.2f}".format(dt["label"], score) + if tuple(map(int, PIL.__version__.split("."))) <= (10, 0, 0): + tw, th = draw.textsize(text, font=font) + else: + left, top, right, bottom = draw.textbbox((0, 0), text, font) + tw, th = right - left, bottom - top + 4 + if ymin < th: + draw.rectangle([(xmin, ymin), (xmin + tw + 4, ymin + th + 1)], fill=color) + draw.text((xmin + 2, ymin - 2), text, fill=font_color, font=font) + else: + draw.rectangle([(xmin, ymin - th), (xmin + tw + 4, ymin + 1)], fill=color) + draw.text((xmin + 2, ymin - th - 2), text, fill=font_color, font=font) + + text_position = (bbox[2] + 2, bbox[1] - font_size // 2) + if int(img.width) - bbox[2] < font_size: + text_position = ( + int(bbox[2] - font_size * 1.1), + bbox[1] - font_size // 2, + ) + draw.text(text_position, str(i + 1), font=font, fill="red") + + return img + + +@function_requires_deps("opencv-contrib-python") +def restore_to_draw_masks(img_size, boxes): + """ + Restores extracted masks to the original shape and draws them on a blank image. + + """ + restored_masks = [] + + for i, box_info in enumerate(boxes): + restored_mask = np.zeros(img_size, dtype=np.uint8) + polygon = np.array(box_info["polygon_points"], dtype=np.int32) + polygon = polygon.reshape((-1, 1, 2)) # shape: (N, 1, 2) + cv2.fillPoly(restored_mask, [polygon], 1) + restored_masks.append(restored_mask) + + return np.array(restored_masks) + + +def draw_mask(im, boxes, img_size): + """ + Args: + im (PIL.Image.Image): PIL image + boxes (list): a list of dicts representing detection box information. 
+ Returns: + img (PIL.Image.Image): visualized image + """ + color_list = get_colormap(rgb=True) + alpha = 0.5 + + im = np.array(im).astype("float32") + clsid2color = {} + + np_masks = restore_to_draw_masks(img_size, boxes) + im_h, im_w = im.shape[:2] + np_masks = np_masks[:, :im_h, :im_w] + + # draw mask + for i, mask in enumerate(np_masks): + clsid = int(boxes[i]["cls_id"]) + if clsid not in clsid2color: + color_index = i % len(color_list) + clsid2color[clsid] = np.array(color_list[color_index]) + color_mask = clsid2color[clsid] + idx = np.nonzero(mask) + im[idx[0], idx[1], :] = (1.0 - alpha) * im[ + idx[0], idx[1], : + ] + alpha * color_mask + + img = Image.fromarray(np.uint8(im)) + font_size = int(0.018 * img.width) + 2 + font = ImageFont.truetype(PINGFANG_FONT.path, font_size, encoding="utf-8") + draw = ImageDraw.Draw(img) + label2color = {} + catid2fontcolor = {} + + for i, box_info in enumerate(boxes): + label = box_info["label"] + score = box_info["score"] + if label not in label2color: + color_index = i % len(color_list) + label2color[label] = color_list[color_index] + catid2fontcolor[label] = font_colormap(color_index) + color = tuple(label2color[label]) + font_color = tuple(catid2fontcolor[label]) + + polygon_points = box_info["polygon_points"] + + image_left_top = (0, 0) + image_right_top = (img.width, 0) + left_top = min( + polygon_points, + key=lambda p: ( + (p[0] - image_left_top[0]) ** 2 + (p[1] - image_left_top[1]) ** 2 + ), + ) + right_top = min( + polygon_points, + key=lambda p: ( + (p[0] - image_right_top[0]) ** 2 + (p[1] - image_right_top[1]) ** 2 + ), + ) + + # label + text = "{} {:.2f}".format(label, score) + if tuple(map(int, PIL.__version__.split("."))) <= (10, 0, 0): + tw, th = draw.textsize(text, font=font) + else: + left, top, right, bottom = draw.textbbox((0, 0), text, font) + tw, th = right - left, bottom - top + 4 + lx, ly = left_top + if ly < th: + draw.rectangle([(lx, ly), (lx + tw + 4, ly + th + 1)], fill=color) + draw.text((lx + 
2, ly - 2), text, fill=font_color, font=font) + else: + draw.rectangle([(lx, ly - th), (lx + tw + 4, ly + 1)], fill=color) + draw.text((lx + 2, ly - th - 2), text, fill=font_color, font=font) + + # order + order = box_info.get("order", None) + if order: + order_text = str(order) + rx, ry = right_top + text_position = (rx + 2, ry - font_size // 2) + if int(img.width) - rx < font_size: + text_position = ( + int(rx - font_size * 1.1), + ry - font_size // 2, + ) + draw.text(text_position, order_text, font=font, fill="red") + + return img + + +class LayoutAnalysisResult(BaseCVResult): + + def _to_img(self) -> Image.Image: + """apply""" + boxes = self["boxes"] + image = Image.fromarray(self["input_img"][..., ::-1]) + ori_img_size = list(image.size)[::-1] + if len(boxes) > 0 and "polygon_points" in boxes[0]: + image = draw_mask(image, boxes, ori_img_size) + else: + image = draw_box(image, boxes) + return {"res": image} + + def _to_str(self, *args, **kwargs): + data = copy.deepcopy(self) + data.pop("input_img") + return JsonMixin._to_str(data, *args, **kwargs) + + def _to_json(self, *args, **kwargs): + data = copy.deepcopy(self) + data.pop("input_img") + return JsonMixin._to_json(data, *args, **kwargs) diff --git a/paddlex/inference/models/layout_analysis/utils.py b/paddlex/inference/models/layout_analysis/utils.py new file mode 100644 index 0000000000..b04831c32b --- /dev/null +++ b/paddlex/inference/models/layout_analysis/utils.py @@ -0,0 +1,18 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +STATIC_SHAPE_MODEL_LIST = [ + "PP-DocLayoutV2", + "PP-DocLayoutV3", +] diff --git a/paddlex/inference/models/multilingual_speech_recognition/processors.py b/paddlex/inference/models/multilingual_speech_recognition/processors.py index ae5c7acade..28bdbe5688 100644 --- a/paddlex/inference/models/multilingual_speech_recognition/processors.py +++ b/paddlex/inference/models/multilingual_speech_recognition/processors.py @@ -22,11 +22,7 @@ import paddle from ....utils.deps import function_requires_deps, is_dep_available -from ...utils.benchmark import ( - benchmark, - get_inference_operations, - set_inference_operations, -) +from ...utils.benchmark import add_inference_operations, benchmark from ..common.tokenizer import GPTTokenizer if is_dep_available("soundfile"): @@ -1836,7 +1832,7 @@ def install_hooks(layer: paddle.nn.Layer): return cache, hooks detect_language = detect_language - set_inference_operations(get_inference_operations() + ["speech_transcribe"]) + add_inference_operations("speech_transcribe") transcribe = benchmark.timeit_with_options(name="speech_transcribe")(transcribe) decode = decode diff --git a/paddlex/inference/models/object_detection/modeling/__init__.py b/paddlex/inference/models/object_detection/modeling/__init__.py new file mode 100644 index 0000000000..acc3858aaf --- /dev/null +++ b/paddlex/inference/models/object_detection/modeling/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .rt_detr import RTDETR diff --git a/paddlex/inference/models/object_detection/modeling/rt_detr.py b/paddlex/inference/models/object_detection/modeling/rt_detr.py new file mode 100644 index 0000000000..9f15589214 --- /dev/null +++ b/paddlex/inference/models/object_detection/modeling/rt_detr.py @@ -0,0 +1,319 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import absolute_import, division, print_function + +import paddle +import paddle.nn.functional as F + +from ...common.transformers.transformers import ( + BatchNormHFStateDictMixin, + PretrainedConfig, + PretrainedModel, +) +from .rtdetrl_modules.detr_head import DINOHead +from .rtdetrl_modules.hgnet_v2 import PPHGNetV2 +from .rtdetrl_modules.hybrid_encoder import HybridEncoder, TransformerLayer +from .rtdetrl_modules.modules.detr_loss import DINOLoss +from .rtdetrl_modules.modules.matchers import HungarianMatcher +from .rtdetrl_modules.modules.utils import bbox_cxcywh_to_xyxy +from .rtdetrl_modules.rtdetr_transformer import RTDETRTransformer + +__all__ = ["RTDETR"] + + +class DETRPostProcess(object): + __shared__ = ["num_classes", "use_focal_loss", "with_mask"] + __inject__ = [] + + def __init__( + self, + num_classes=80, + num_top_queries=100, + dual_queries=False, + dual_groups=0, + use_focal_loss=False, + with_mask=False, + mask_stride=4, + mask_threshold=0.5, + use_avg_mask_score=False, + bbox_decode_type="origin", + ): + super(DETRPostProcess, self).__init__() + assert bbox_decode_type in ["origin", "pad"] + + self.num_classes = num_classes + self.num_top_queries = num_top_queries + self.dual_queries = dual_queries + self.dual_groups = dual_groups + self.use_focal_loss = use_focal_loss + self.with_mask = with_mask + self.mask_stride = mask_stride + self.mask_threshold = mask_threshold + self.use_avg_mask_score = use_avg_mask_score + self.bbox_decode_type = bbox_decode_type + + def _mask_postprocess(self, mask_pred, score_pred): + mask_score = F.sigmoid(mask_pred) + mask_pred = (mask_score > self.mask_threshold).astype(mask_score.dtype) + if self.use_avg_mask_score: + avg_mask_score = (mask_pred * mask_score).sum([-2, -1]) / ( + mask_pred.sum([-2, -1]) + 1e-6 + ) + score_pred *= avg_mask_score + + return mask_pred.flatten(0, 1).astype("int32"), score_pred + + def __call__(self, head_out, im_shape, scale_factor, pad_shape): + """ + Decode the 
bbox and mask. + + Args: + head_out (tuple): bbox_pred, cls_logit and masks of bbox_head output. + im_shape (Tensor): The shape of the input image without padding. + scale_factor (Tensor): The scale factor of the input image. + pad_shape (Tensor): The shape of the input image with padding. + Returns: + bbox_pred (Tensor): The output prediction with shape [N, 6], including + labels, scores and bboxes. The size of bboxes are corresponding + to the input image, the bboxes may be used in other branch. + bbox_num (Tensor): The number of prediction boxes of each batch with + shape [bs], and is N. + """ + bboxes, logits, masks = head_out + if self.dual_queries: + num_queries = logits.shape[1] + logits, bboxes = ( + logits[:, : int(num_queries // (self.dual_groups + 1)), :], + bboxes[:, : int(num_queries // (self.dual_groups + 1)), :], + ) + + bbox_pred = bbox_cxcywh_to_xyxy(bboxes) + # calculate the original shape of the image + origin_shape = paddle.floor(im_shape / scale_factor + 0.5) + img_h, img_w = paddle.split(origin_shape, 2, axis=-1) + if self.bbox_decode_type == "pad": + # calculate the shape of the image with padding + out_shape = pad_shape / im_shape * origin_shape + out_shape = out_shape.flip(1).tile([1, 2]).unsqueeze(1) + elif self.bbox_decode_type == "origin": + out_shape = origin_shape.flip(1).tile([1, 2]).unsqueeze(1) + else: + raise Exception(f"Wrong `bbox_decode_type`: {self.bbox_decode_type}.") + bbox_pred *= out_shape + + scores = ( + F.sigmoid(logits) if self.use_focal_loss else F.softmax(logits)[:, :, :-1] + ) + + if not self.use_focal_loss: + scores, labels = scores.max(-1), scores.argmax(-1) + if scores.shape[1] > self.num_top_queries: + scores, index = paddle.topk(scores, self.num_top_queries, axis=-1) + batch_ind = ( + paddle.arange(end=scores.shape[0]) + .unsqueeze(-1) + .tile([1, self.num_top_queries]) + ) + index = paddle.stack([batch_ind, index], axis=-1) + labels = paddle.gather_nd(labels, index) + bbox_pred = paddle.gather_nd(bbox_pred, 
index) + else: + scores, index = paddle.topk( + scores.flatten(1), self.num_top_queries, axis=-1 + ) + labels = index % self.num_classes + index = index // self.num_classes + batch_ind = ( + paddle.arange(end=scores.shape[0]) + .unsqueeze(-1) + .tile([1, self.num_top_queries]) + ) + index = paddle.stack([batch_ind, index], axis=-1) + bbox_pred = paddle.gather_nd(bbox_pred, index) + + mask_pred = None + if self.with_mask: + assert masks is not None + assert masks.shape[0] == 1 + masks = paddle.gather_nd(masks, index) + if self.bbox_decode_type == "pad": + masks = F.interpolate( + masks, + scale_factor=self.mask_stride, + mode="bilinear", + align_corners=False, + ) + # TODO: Support prediction with bs>1. + # remove padding for input image + h, w = im_shape.astype("int32")[0] + masks = masks[..., :h, :w] + # get pred_mask in the original resolution. + img_h = img_h[0].astype("int32") + img_w = img_w[0].astype("int32") + masks = F.interpolate( + masks, size=[img_h, img_w], mode="bilinear", align_corners=False + ) + mask_pred, scores = self._mask_postprocess(masks, scores) + + bbox_pred = paddle.concat( + [labels.unsqueeze(-1).astype("float32"), scores.unsqueeze(-1), bbox_pred], + axis=-1, + ) + bbox_num = paddle.to_tensor(self.num_top_queries, dtype="int32").tile( + [bbox_pred.shape[0]] + ) + bbox_pred = bbox_pred.reshape([-1, 6]) + return bbox_pred, bbox_num, mask_pred + + +class RTDETRConfig(PretrainedConfig): + def __init__( + self, + backbone, + HybridEncoder, + RTDETRTransformer, + DINOHead, + DETRPostProcess, + ): + if backbone["name"] == "PPHGNetV2": + self.arch = backbone["arch"] + self.return_idx = backbone["return_idx"] + self.freeze_stem_only = backbone["freeze_stem_only"] + self.freeze_at = backbone["freeze_at"] + self.freeze_norm = backbone["freeze_norm"] + self.lr_mult_list = backbone["lr_mult_list"] + else: + raise RuntimeError( + f"There is no dynamic graph implementation for backbone {backbone['name']}." 
+ ) + self.hidden_dim = HybridEncoder["hidden_dim"] + self.use_encoder_idx = HybridEncoder["use_encoder_idx"] + self.num_encoder_layers = HybridEncoder["num_encoder_layers"] + self.el_d_model = HybridEncoder["encoder_layer"]["d_model"] + self.el_nhead = HybridEncoder["encoder_layer"]["nhead"] + self.el_dim_feedforward = HybridEncoder["encoder_layer"]["dim_feedforward"] + self.el_dropout = HybridEncoder["encoder_layer"]["dropout"] + self.el_activation = HybridEncoder["encoder_layer"]["activation"] + self.expansion = HybridEncoder["expansion"] + self.tf_num_queries = RTDETRTransformer["num_queries"] + self.tf_position_embed_type = RTDETRTransformer["position_embed_type"] + self.tf_feat_strides = RTDETRTransformer["feat_strides"] + self.tf_num_levels = RTDETRTransformer["num_levels"] + self.tf_nhead = RTDETRTransformer["nhead"] + self.tf_num_decoder_layers = RTDETRTransformer["num_decoder_layers"] + self.tf_backbone_feat_channels = RTDETRTransformer["backbone_feat_channels"] + self.tf_dim_feedforward = RTDETRTransformer["dim_feedforward"] + self.tf_dropout = RTDETRTransformer["dropout"] + self.tf_activation = RTDETRTransformer["activation"] + self.tf_num_denoising = RTDETRTransformer["num_denoising"] + self.tf_label_noise_ratio = RTDETRTransformer["label_noise_ratio"] + self.tf_box_noise_scale = RTDETRTransformer["box_noise_scale"] + self.tf_learnt_init_query = RTDETRTransformer["learnt_init_query"] + self.loss_coeff = DINOHead["loss"]["loss_coeff"] + self.aux_loss = DINOHead["loss"]["aux_loss"] + self.use_vfl = DINOHead["loss"]["use_vfl"] + self.matcher_coeff = DINOHead["loss"]["matcher"]["matcher_coeff"] + self.num_top_queries = DETRPostProcess["num_top_queries"] + self.use_focal_loss = DETRPostProcess["use_focal_loss"] + self.tensor_parallel_degree = 1 + + +class RTDETR(BatchNormHFStateDictMixin, PretrainedModel): + + config_class = RTDETRConfig + + def __init__(self, config: RTDETRConfig): + super().__init__(config) + + self.backbone = PPHGNetV2( + 
arch=self.config.arch, + lr_mult_list=self.config.lr_mult_list, + return_idx=self.config.return_idx, + freeze_stem_only=self.config.freeze_stem_only, + freeze_at=self.config.freeze_at, + freeze_norm=self.config.freeze_norm, + ) + self.neck = HybridEncoder( + hidden_dim=self.config.hidden_dim, + use_encoder_idx=self.config.use_encoder_idx, + num_encoder_layers=self.config.num_encoder_layers, + encoder_layer=TransformerLayer( + d_model=self.config.el_d_model, + nhead=self.config.el_nhead, + dim_feedforward=self.config.el_dim_feedforward, + dropout=self.config.el_dropout, + activation=self.config.el_activation, + ), + expansion=self.config.expansion, + ) + self.transformer = RTDETRTransformer( + num_queries=self.config.tf_num_queries, + position_embed_type=self.config.tf_position_embed_type, + feat_strides=self.config.tf_feat_strides, + backbone_feat_channels=self.config.tf_backbone_feat_channels, + num_levels=self.config.tf_num_levels, + nhead=self.config.tf_nhead, + num_decoder_layers=self.config.tf_num_decoder_layers, + dim_feedforward=self.config.tf_dim_feedforward, + dropout=self.config.tf_dropout, + activation=self.config.tf_activation, + num_denoising=self.config.tf_num_denoising, + label_noise_ratio=self.config.tf_label_noise_ratio, + box_noise_scale=self.config.tf_box_noise_scale, + learnt_init_query=self.config.tf_learnt_init_query, + ) + self.head = DINOHead( + loss=DINOLoss( + loss_coeff=self.config.loss_coeff, + aux_loss=self.config.aux_loss, + use_vfl=self.config.use_vfl, + matcher=HungarianMatcher( + matcher_coeff=self.config.matcher_coeff, + ), + ) + ) + self.post_process = DETRPostProcess( + num_top_queries=self.config.num_top_queries, + use_focal_loss=self.config.use_focal_loss, + ) + + def forward(self, inputs): + x = paddle.to_tensor(inputs[1]) + x = self.backbone(x) + x_neck = self.neck(x) + x = self.transformer(x_neck) + preds = self.head(x, x_neck) + bbox, bbox_num, mask = self.post_process( + preds, + paddle.to_tensor(inputs[0]), + 
paddle.to_tensor(inputs[2]), + inputs[1][2:].shape, + ) + output = [bbox, bbox_num] + return output + + def get_transpose_weight_keys(self): + need_to_transpose = [] + all_weight_keys = [] + for name, param in self.neck.named_parameters(): + all_weight_keys.append("neck." + name) + for name, param in self.transformer.named_parameters(): + all_weight_keys.append("transformer." + name) + for i in range(len(all_weight_keys)): + if ("out_proj" in all_weight_keys[i]) and ( + "bias" not in all_weight_keys[i] + ): + need_to_transpose.append(all_weight_keys[i]) + return need_to_transpose diff --git a/paddlex/inference/models/object_detection/modeling/rtdetrl_modules/detr_head.py b/paddlex/inference/models/object_detection/modeling/rtdetrl_modules/detr_head.py new file mode 100644 index 0000000000..7a7ee374df --- /dev/null +++ b/paddlex/inference/models/object_detection/modeling/rtdetrl_modules/detr_head.py @@ -0,0 +1,729 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import absolute_import, division, print_function + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from .modules.detr_ops import inverse_sigmoid +from .modules.initializer import constant_, linear_init_ + +__all__ = ["DETRHead", "DeformableDETRHead", "DINOHead", "MaskDINOHead"] + + +def get_activation(name="LeakyReLU"): + if name == "silu": + module = nn.Silu() + elif name == "relu": + module = nn.ReLU() + elif name in ["LeakyReLU", "leakyrelu", "lrelu"]: + module = nn.LeakyReLU(0.1) + elif name is None: + module = nn.Identity() + else: + raise AttributeError("Unsupported act type: {}".format(name)) + return module + + +class MLP(nn.Layer): + """This code is based on + https://github.com/facebookresearch/detr/blob/main/models/detr.py + """ + + def __init__(self, input_dim, hidden_dim, output_dim, num_layers, act="relu"): + super().__init__() + self.num_layers = num_layers + h = [hidden_dim] * (num_layers - 1) + self.layers = nn.LayerList( + nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]) + ) + self.act = get_activation(act) + self._reset_parameters() + + def _reset_parameters(self): + for l in self.layers: + linear_init_(l) + + def forward(self, x): + for i, layer in enumerate(self.layers): + x = self.act(layer(x)) if i < self.num_layers - 1 else layer(x) + return x + + +class MultiHeadAttentionMap(nn.Layer): + """This code is based on + https://github.com/facebookresearch/detr/blob/main/models/segmentation.py + + This is a 2D attention module, which only returns the attention softmax (no multiplication by value) + """ + + def __init__(self, query_dim, hidden_dim, num_heads, dropout=0.0, bias=True): + super().__init__() + self.num_heads = num_heads + self.hidden_dim = hidden_dim + self.dropout = nn.Dropout(dropout) + + weight_attr = paddle.ParamAttr( + initializer=paddle.nn.initializer.XavierUniform() + ) + bias_attr = ( + paddle.framework.ParamAttr(initializer=paddle.nn.initializer.Constant()) + if 
bias
+            else False
+        )
+
+        self.q_proj = nn.Linear(query_dim, hidden_dim, weight_attr, bias_attr)
+        self.k_proj = nn.Conv2D(
+            query_dim, hidden_dim, 1, weight_attr=weight_attr, bias_attr=bias_attr
+        )
+
+        # 1 / sqrt(head_dim) scaling applied to the queries before the matmul.
+        self.normalize_fact = float(hidden_dim / self.num_heads) ** -0.5
+
+    def forward(self, q, k, mask=None):
+        # Project queries (linear) and keys (1x1 conv over the feature map).
+        q = self.q_proj(q)
+        k = self.k_proj(k)
+        bs, num_queries, n, c, h, w = (
+            q.shape[0],
+            q.shape[1],
+            self.num_heads,
+            self.hidden_dim // self.num_heads,
+            k.shape[-2],
+            k.shape[-1],
+        )
+        qh = q.reshape([bs, num_queries, n, c])
+        kh = k.reshape([bs, n, c, h, w])
+        # weights = paddle.einsum("bqnc,bnchw->bqnhw", qh * self.normalize_fact, kh)
+        # Batched-matmul form of the einsum above, one matmul per batch*head.
+        qh = qh.transpose([0, 2, 1, 3]).reshape([-1, num_queries, c])
+        kh = kh.reshape([-1, c, h * w])
+        weights = (
+            paddle.bmm(qh * self.normalize_fact, kh)
+            .reshape([bs, n, num_queries, h, w])
+            .transpose([0, 2, 1, 3, 4])
+        )
+
+        if mask is not None:
+            weights += mask
+        # fix a potential bug: https://github.com/facebookresearch/detr/issues/247
+        weights = F.softmax(weights.flatten(3), axis=-1).reshape(weights.shape)
+        weights = self.dropout(weights)
+        return weights
+
+
+class MaskHeadFPNConv(nn.Layer):
+    """This code is based on
+    https://github.com/facebookresearch/detr/blob/main/models/segmentation.py
+
+    Simple convolutional head, using group norm.
+ Upsampling is done using a FPN approach + """ + + def __init__(self, input_dim, fpn_dims, context_dim, num_groups=8): + super().__init__() + + inter_dims = [ + input_dim, + ] + [context_dim // (2**i) for i in range(1, 5)] + weight_attr = paddle.ParamAttr( + initializer=paddle.nn.initializer.KaimingUniform() + ) + bias_attr = paddle.framework.ParamAttr( + initializer=paddle.nn.initializer.Constant() + ) + + self.conv0 = self._make_layers( + input_dim, input_dim, 3, num_groups, weight_attr, bias_attr + ) + self.conv_inter = nn.LayerList() + for in_dims, out_dims in zip(inter_dims[:-1], inter_dims[1:]): + self.conv_inter.append( + self._make_layers( + in_dims, out_dims, 3, num_groups, weight_attr, bias_attr + ) + ) + + self.conv_out = nn.Conv2D( + inter_dims[-1], + 1, + 3, + padding=1, + weight_attr=weight_attr, + bias_attr=bias_attr, + ) + + self.adapter = nn.LayerList() + for i in range(len(fpn_dims)): + self.adapter.append( + nn.Conv2D( + fpn_dims[i], + inter_dims[i + 1], + 1, + weight_attr=weight_attr, + bias_attr=bias_attr, + ) + ) + + def _make_layers( + self, + in_dims, + out_dims, + kernel_size, + num_groups, + weight_attr=None, + bias_attr=None, + ): + return nn.Sequential( + nn.Conv2D( + in_dims, + out_dims, + kernel_size, + padding=kernel_size // 2, + weight_attr=weight_attr, + bias_attr=bias_attr, + ), + nn.GroupNorm(num_groups, out_dims), + nn.ReLU(), + ) + + def forward(self, x, bbox_attention_map, fpns): + x = paddle.concat( + [ + x.tile([bbox_attention_map.shape[1], 1, 1, 1]), + bbox_attention_map.flatten(0, 1), + ], + 1, + ) + x = self.conv0(x) + for inter_layer, adapter_layer, feat in zip( + self.conv_inter[:-1], self.adapter, fpns + ): + feat = adapter_layer(feat).tile([bbox_attention_map.shape[1], 1, 1, 1]) + x = inter_layer(x) + x = feat + F.interpolate(x, size=feat.shape[-2:]) + + x = self.conv_inter[-1](x) + x = self.conv_out(x) + return x + + +class DETRHead(nn.Layer): + __shared__ = ["num_classes", "hidden_dim", "use_focal_loss"] + 
__inject__ = ["loss"] + + def __init__( + self, + num_classes=80, + hidden_dim=256, + nhead=8, + num_mlp_layers=3, + loss="DETRLoss", + fpn_dims=[1024, 512, 256], + with_mask_head=False, + use_focal_loss=False, + ): + super(DETRHead, self).__init__() + # add background class + self.num_classes = num_classes if use_focal_loss else num_classes + 1 + self.hidden_dim = hidden_dim + self.loss = loss + self.with_mask_head = with_mask_head + self.use_focal_loss = use_focal_loss + + self.score_head = nn.Linear(hidden_dim, self.num_classes) + self.bbox_head = MLP( + hidden_dim, hidden_dim, output_dim=4, num_layers=num_mlp_layers + ) + if self.with_mask_head: + self.bbox_attention = MultiHeadAttentionMap(hidden_dim, hidden_dim, nhead) + self.mask_head = MaskHeadFPNConv(hidden_dim + nhead, fpn_dims, hidden_dim) + self._reset_parameters() + + def _reset_parameters(self): + linear_init_(self.score_head) + + @classmethod + def from_config(cls, cfg, hidden_dim, nhead, input_shape): + + return { + "hidden_dim": hidden_dim, + "nhead": nhead, + "fpn_dims": [i.channels for i in input_shape[::-1]][1:], + } + + @staticmethod + def get_gt_mask_from_polygons(gt_poly, pad_mask): + import pycocotools.mask as mask_util + + out_gt_mask = [] + for polygons, padding in zip(gt_poly, pad_mask): + height, width = int(padding[:, 0].sum()), int(padding[0, :].sum()) + masks = [] + for obj_poly in polygons: + rles = mask_util.frPyObjects(obj_poly, height, width) + rle = mask_util.merge(rles) + masks.append(paddle.to_tensor(mask_util.decode(rle)).astype("float32")) + masks = paddle.stack(masks) + masks_pad = paddle.zeros( + [masks.shape[0], pad_mask.shape[1], pad_mask.shape[2]] + ) + masks_pad[:, :height, :width] = masks + out_gt_mask.append(masks_pad) + return out_gt_mask + + def forward(self, out_transformer, body_feats, inputs=None): + r""" + Args: + out_transformer (Tuple): (feats: [num_levels, batch_size, + num_queries, hidden_dim], + memory: [batch_size, hidden_dim, h, w], + src_proj: 
[batch_size, h*w, hidden_dim], + src_mask: [batch_size, 1, 1, h, w]) + body_feats (List(Tensor)): list[[B, C, H, W]] + inputs (dict): dict(inputs) + """ + feats, memory, src_proj, src_mask = out_transformer + outputs_logit = self.score_head(feats) + outputs_bbox = F.sigmoid(self.bbox_head(feats)) + outputs_seg = None + if self.with_mask_head: + bbox_attention_map = self.bbox_attention(feats[-1], memory, src_mask) + fpn_feats = [a for a in body_feats[::-1]][1:] + outputs_seg = self.mask_head(src_proj, bbox_attention_map, fpn_feats) + outputs_seg = outputs_seg.reshape( + [ + feats.shape[1], + feats.shape[2], + outputs_seg.shape[-2], + outputs_seg.shape[-1], + ] + ) + + if self.training: + assert inputs is not None + assert "gt_bbox" in inputs and "gt_class" in inputs + gt_mask = ( + self.get_gt_mask_from_polygons(inputs["gt_poly"], inputs["pad_mask"]) + if "gt_poly" in inputs + else None + ) + return self.loss( + outputs_bbox, + outputs_logit, + inputs["gt_bbox"], + inputs["gt_class"], + masks=outputs_seg, + gt_mask=gt_mask, + ) + else: + return (outputs_bbox[-1], outputs_logit[-1], outputs_seg) + + +class DeformableDETRHead(nn.Layer): + __shared__ = ["num_classes", "hidden_dim"] + __inject__ = ["loss"] + + def __init__( + self, num_classes=80, hidden_dim=512, nhead=8, num_mlp_layers=3, loss="DETRLoss" + ): + super(DeformableDETRHead, self).__init__() + self.num_classes = num_classes + self.hidden_dim = hidden_dim + self.nhead = nhead + self.loss = loss + + self.score_head = nn.Linear(hidden_dim, self.num_classes) + self.bbox_head = MLP( + hidden_dim, hidden_dim, output_dim=4, num_layers=num_mlp_layers + ) + + self._reset_parameters() + + def _reset_parameters(self): + linear_init_(self.score_head) + constant_(self.score_head.bias, -4.595) + constant_(self.bbox_head.layers[-1].weight) + + with paddle.no_grad(): + bias = paddle.zeros_like(self.bbox_head.layers[-1].bias) + bias[2:] = -2.0 + self.bbox_head.layers[-1].bias.set_value(bias) + + @classmethod + def 
from_config(cls, cfg, hidden_dim, nhead, input_shape): + return {"hidden_dim": hidden_dim, "nhead": nhead} + + def forward(self, out_transformer, body_feats, inputs=None): + r""" + Args: + out_transformer (Tuple): (feats: [num_levels, batch_size, + num_queries, hidden_dim], + memory: [batch_size, + \sum_{l=0}^{L-1} H_l \cdot W_l, hidden_dim], + reference_points: [batch_size, num_queries, 2]) + body_feats (List(Tensor)): list[[B, C, H, W]] + inputs (dict): dict(inputs) + """ + feats, memory, reference_points = out_transformer + reference_points = inverse_sigmoid(reference_points.unsqueeze(0)) + outputs_bbox = self.bbox_head(feats) + + # It's equivalent to "outputs_bbox[:, :, :, :2] += reference_points", + # but the gradient is wrong in paddle. + outputs_bbox = paddle.concat( + [outputs_bbox[:, :, :, :2] + reference_points, outputs_bbox[:, :, :, 2:]], + axis=-1, + ) + + outputs_bbox = F.sigmoid(outputs_bbox) + outputs_logit = self.score_head(feats) + + if self.training: + assert inputs is not None + assert "gt_bbox" in inputs and "gt_class" in inputs + + return self.loss( + outputs_bbox, outputs_logit, inputs["gt_bbox"], inputs["gt_class"] + ) + else: + return (outputs_bbox[-1], outputs_logit[-1], None) + + +class DINOHead(nn.Layer): + __inject__ = ["loss"] + + def __init__(self, loss="DINOLoss", eval_idx=-1): + super(DINOHead, self).__init__() + self.loss = loss + self.eval_idx = eval_idx + + def forward(self, out_transformer, body_feats, inputs=None): + (dec_out_bboxes, dec_out_logits, enc_topk_bboxes, enc_topk_logits, dn_meta) = ( + out_transformer + ) + if self.training: + assert inputs is not None + assert "gt_bbox" in inputs and "gt_class" in inputs + + if dn_meta is not None: + if isinstance(dn_meta, list): + dual_groups = len(dn_meta) - 1 + dec_out_bboxes = paddle.split( + dec_out_bboxes, dual_groups + 1, axis=2 + ) + dec_out_logits = paddle.split( + dec_out_logits, dual_groups + 1, axis=2 + ) + enc_topk_bboxes = paddle.split( + enc_topk_bboxes, dual_groups + 
1, axis=1 + ) + enc_topk_logits = paddle.split( + enc_topk_logits, dual_groups + 1, axis=1 + ) + + dec_out_bboxes_list = [] + dec_out_logits_list = [] + dn_out_bboxes_list = [] + dn_out_logits_list = [] + loss = {} + for g_id in range(dual_groups + 1): + if dn_meta[g_id] is not None: + dn_out_bboxes_gid, dec_out_bboxes_gid = paddle.split( + dec_out_bboxes[g_id], + dn_meta[g_id]["dn_num_split"], + axis=2, + ) + dn_out_logits_gid, dec_out_logits_gid = paddle.split( + dec_out_logits[g_id], + dn_meta[g_id]["dn_num_split"], + axis=2, + ) + else: + dn_out_bboxes_gid, dn_out_logits_gid = None, None + dec_out_bboxes_gid = dec_out_bboxes[g_id] + dec_out_logits_gid = dec_out_logits[g_id] + out_bboxes_gid = paddle.concat( + [enc_topk_bboxes[g_id].unsqueeze(0), dec_out_bboxes_gid] + ) + out_logits_gid = paddle.concat( + [enc_topk_logits[g_id].unsqueeze(0), dec_out_logits_gid] + ) + loss_gid = self.loss( + out_bboxes_gid, + out_logits_gid, + inputs["gt_bbox"], + inputs["gt_class"], + dn_out_bboxes=dn_out_bboxes_gid, + dn_out_logits=dn_out_logits_gid, + dn_meta=dn_meta[g_id], + ) + # sum loss + for key, value in loss_gid.items(): + loss.update({key: loss.get(key, paddle.zeros([1])) + value}) + + # average across (dual_groups + 1) + for key, value in loss.items(): + loss.update({key: value / (dual_groups + 1)}) + return loss + else: + dn_out_bboxes, dec_out_bboxes = paddle.split( + dec_out_bboxes, dn_meta["dn_num_split"], axis=2 + ) + dn_out_logits, dec_out_logits = paddle.split( + dec_out_logits, dn_meta["dn_num_split"], axis=2 + ) + else: + dn_out_bboxes, dn_out_logits = None, None + + out_bboxes = paddle.concat([enc_topk_bboxes.unsqueeze(0), dec_out_bboxes]) + out_logits = paddle.concat([enc_topk_logits.unsqueeze(0), dec_out_logits]) + + return self.loss( + out_bboxes, + out_logits, + inputs["gt_bbox"], + inputs["gt_class"], + dn_out_bboxes=dn_out_bboxes, + dn_out_logits=dn_out_logits, + dn_meta=dn_meta, + gt_score=inputs.get("gt_score", None), + ) + else: + return 
(dec_out_bboxes[self.eval_idx], dec_out_logits[self.eval_idx], None) + + +class MaskDINOHead(nn.Layer): + __inject__ = ["loss"] + + def __init__(self, loss="DINOLoss"): + super(MaskDINOHead, self).__init__() + self.loss = loss + + def forward(self, out_transformer, body_feats, inputs=None): + (dec_out_logits, dec_out_bboxes, dec_out_masks, enc_out, init_out, dn_meta) = ( + out_transformer + ) + if self.training: + assert inputs is not None + assert "gt_bbox" in inputs and "gt_class" in inputs + assert "gt_segm" in inputs + + if dn_meta is not None: + dn_out_logits, dec_out_logits = paddle.split( + dec_out_logits, dn_meta["dn_num_split"], axis=2 + ) + dn_out_bboxes, dec_out_bboxes = paddle.split( + dec_out_bboxes, dn_meta["dn_num_split"], axis=2 + ) + dn_out_masks, dec_out_masks = paddle.split( + dec_out_masks, dn_meta["dn_num_split"], axis=2 + ) + if init_out is not None: + init_out_logits, init_out_bboxes, init_out_masks = init_out + init_out_logits_dn, init_out_logits = paddle.split( + init_out_logits, dn_meta["dn_num_split"], axis=1 + ) + init_out_bboxes_dn, init_out_bboxes = paddle.split( + init_out_bboxes, dn_meta["dn_num_split"], axis=1 + ) + init_out_masks_dn, init_out_masks = paddle.split( + init_out_masks, dn_meta["dn_num_split"], axis=1 + ) + + dec_out_logits = paddle.concat( + [init_out_logits.unsqueeze(0), dec_out_logits] + ) + dec_out_bboxes = paddle.concat( + [init_out_bboxes.unsqueeze(0), dec_out_bboxes] + ) + dec_out_masks = paddle.concat( + [init_out_masks.unsqueeze(0), dec_out_masks] + ) + + dn_out_logits = paddle.concat( + [init_out_logits_dn.unsqueeze(0), dn_out_logits] + ) + dn_out_bboxes = paddle.concat( + [init_out_bboxes_dn.unsqueeze(0), dn_out_bboxes] + ) + dn_out_masks = paddle.concat( + [init_out_masks_dn.unsqueeze(0), dn_out_masks] + ) + else: + dn_out_bboxes, dn_out_logits = None, None + dn_out_masks = None + + enc_out_logits, enc_out_bboxes, enc_out_masks = enc_out + out_logits = paddle.concat([enc_out_logits.unsqueeze(0), 
dec_out_logits]) + out_bboxes = paddle.concat([enc_out_bboxes.unsqueeze(0), dec_out_bboxes]) + out_masks = paddle.concat([enc_out_masks.unsqueeze(0), dec_out_masks]) + + inputs["gt_segm"] = [ + gt_segm.astype(out_masks.dtype) for gt_segm in inputs["gt_segm"] + ] + + return self.loss( + out_bboxes, + out_logits, + inputs["gt_bbox"], + inputs["gt_class"], + masks=out_masks, + gt_mask=inputs["gt_segm"], + dn_out_logits=dn_out_logits, + dn_out_bboxes=dn_out_bboxes, + dn_out_masks=dn_out_masks, + dn_meta=dn_meta, + ) + else: + return (dec_out_bboxes[-1], dec_out_logits[-1], dec_out_masks[-1]) + + +class RTDETRv3Head(nn.Layer): + __inject__ = ["loss"] + __shared__ = ["o2m_branch", "num_queries_o2m"] + + def __init__( + self, loss="DINOLoss", eval_idx=-1, o2m=4, o2m_branch=False, num_queries_o2m=450 + ): + super(RTDETRv3Head, self).__init__() + self.loss = loss + self.eval_idx = eval_idx + self.o2m = o2m + self.o2m_branch = o2m_branch + self.num_queries_o2m = num_queries_o2m + + def forward(self, out_transformer, body_feats, inputs=None): + (dec_out_bboxes, dec_out_logits, enc_topk_bboxes, enc_topk_logits, dn_meta) = ( + out_transformer + ) + if self.training: + assert inputs is not None + assert "gt_bbox" in inputs and "gt_class" in inputs + + if dn_meta is not None: + num_groups = len(dn_meta) + total_dec_queries = dec_out_bboxes.shape[2] + total_enc_queries = enc_topk_bboxes.shape[1] + loss = {} + if self.o2m_branch: + dec_out_bboxes, dec_out_bboxes_o2m = paddle.split( + dec_out_bboxes, + [ + total_dec_queries - self.num_queries_o2m, + self.num_queries_o2m, + ], + axis=2, + ) + dec_out_logits, dec_out_logits_o2m = paddle.split( + dec_out_logits, + [ + total_dec_queries - self.num_queries_o2m, + self.num_queries_o2m, + ], + axis=2, + ) + enc_topk_bboxes, enc_topk_bboxes_o2m = paddle.split( + enc_topk_bboxes, + [ + total_enc_queries - self.num_queries_o2m, + self.num_queries_o2m, + ], + axis=1, + ) + enc_topk_logits, enc_topk_logits_o2m = paddle.split( + enc_topk_logits, 
+ [ + total_enc_queries - self.num_queries_o2m, + self.num_queries_o2m, + ], + axis=1, + ) + + out_bboxes_o2m = paddle.concat( + [enc_topk_bboxes_o2m.unsqueeze(0), dec_out_bboxes_o2m] + ) + out_logits_o2m = paddle.concat( + [enc_topk_logits_o2m.unsqueeze(0), dec_out_logits_o2m] + ) + loss_o2m = self.loss( + out_bboxes_o2m, + out_logits_o2m, + inputs["gt_bbox"], + inputs["gt_class"], + dn_out_bboxes=None, + dn_out_logits=None, + dn_meta=None, + o2m=self.o2m, + ) + for key, value in loss_o2m.items(): + key = key + "_o2m_branch" + loss.update({key: loss.get(key, paddle.zeros([1])) + value}) + + split_dec_num = [sum(dn["dn_num_split"]) for dn in dn_meta] + split_enc_num = [dn["dn_num_split"][1] for dn in dn_meta] + dec_out_bboxes = paddle.split(dec_out_bboxes, split_dec_num, axis=2) + dec_out_logits = paddle.split(dec_out_logits, split_dec_num, axis=2) + enc_topk_bboxes = paddle.split(enc_topk_bboxes, split_enc_num, axis=1) + enc_topk_logits = paddle.split(enc_topk_logits, split_enc_num, axis=1) + + for g_id in range(num_groups): + dn_out_bboxes_gid, dec_out_bboxes_gid = paddle.split( + dec_out_bboxes[g_id], dn_meta[g_id]["dn_num_split"], axis=2 + ) + dn_out_logits_gid, dec_out_logits_gid = paddle.split( + dec_out_logits[g_id], dn_meta[g_id]["dn_num_split"], axis=2 + ) + out_bboxes_gid = paddle.concat( + [enc_topk_bboxes[g_id].unsqueeze(0), dec_out_bboxes_gid] + ) + out_logits_gid = paddle.concat( + [enc_topk_logits[g_id].unsqueeze(0), dec_out_logits_gid] + ) + + loss_gid = self.loss( + out_bboxes_gid, + out_logits_gid, + inputs["gt_bbox"], + inputs["gt_class"], + dn_out_bboxes=dn_out_bboxes_gid, + dn_out_logits=dn_out_logits_gid, + dn_meta=dn_meta[g_id], + ) + # sum loss + for key, value in loss_gid.items(): + loss.update({key: loss.get(key, paddle.zeros([1])) + value}) + + # average across (dual_groups + 1) + for key, value in loss.items(): + if "_o2m_branch" not in key: + loss.update({key: value / num_groups}) + return loss + else: + dn_out_bboxes, dn_out_logits = 
None, None + + out_bboxes = paddle.concat([enc_topk_bboxes.unsqueeze(0), dec_out_bboxes]) + out_logits = paddle.concat([enc_topk_logits.unsqueeze(0), dec_out_logits]) + + return self.loss( + out_bboxes, + out_logits, + inputs["gt_bbox"], + inputs["gt_class"], + dn_out_bboxes=dn_out_bboxes, + dn_out_logits=dn_out_logits, + dn_meta=dn_meta, + gt_score=inputs.get("gt_score", None), + ) + else: + return (dec_out_bboxes[self.eval_idx], dec_out_logits[self.eval_idx], None) diff --git a/paddlex/inference/models/object_detection/modeling/rtdetrl_modules/detr_transformer.py b/paddlex/inference/models/object_detection/modeling/rtdetrl_modules/detr_transformer.py new file mode 100644 index 0000000000..7489219b2f --- /dev/null +++ b/paddlex/inference/models/object_detection/modeling/rtdetrl_modules/detr_transformer.py @@ -0,0 +1,394 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Modified from DETR (https://github.com/facebookresearch/detr) +# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved + +from __future__ import absolute_import, division, print_function + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from .modules.detr_ops import _get_clones +from .modules.initializer import conv_init_, linear_init_, normal_, xavier_uniform_ +from .modules.layers import MultiHeadAttention, _convert_attention_mask +from .modules.position_encoding import PositionEmbedding + +__all__ = ["TransformerEncoderLayer"] + + +class TransformerEncoderLayer(nn.Layer): + def __init__( + self, + d_model, + nhead, + dim_feedforward=2048, + dropout=0.1, + activation="relu", + attn_dropout=None, + act_dropout=None, + normalize_before=False, + ): + super(TransformerEncoderLayer, self).__init__() + attn_dropout = dropout if attn_dropout is None else attn_dropout + act_dropout = dropout if act_dropout is None else act_dropout + self.normalize_before = normalize_before + + self.self_attn = MultiHeadAttention(d_model, nhead, attn_dropout) + # Implementation of Feedforward model + self.linear1 = nn.Linear(d_model, dim_feedforward) + self.dropout = nn.Dropout(act_dropout, mode="upscale_in_train") + self.linear2 = nn.Linear(dim_feedforward, d_model) + + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train") + self.dropout2 = nn.Dropout(dropout, mode="upscale_in_train") + self.activation = getattr(F, activation) + self._reset_parameters() + + def _reset_parameters(self): + linear_init_(self.linear1) + linear_init_(self.linear2) + + @staticmethod + def with_pos_embed(tensor, pos_embed): + return tensor if pos_embed is None else tensor + pos_embed + + def forward(self, src, src_mask=None, pos_embed=None): + residual = src + if self.normalize_before: + src = self.norm1(src) + q = k = self.with_pos_embed(src, pos_embed) + src = self.self_attn(q, k, value=src, attn_mask=src_mask) + + src = residual + self.dropout1(src) + if not self.normalize_before: + src = 
self.norm1(src) + + residual = src + if self.normalize_before: + src = self.norm2(src) + src = self.linear2(self.dropout(self.activation(self.linear1(src)))) + src = residual + self.dropout2(src) + if not self.normalize_before: + src = self.norm2(src) + return src + + +class TransformerEncoder(nn.Layer): + def __init__(self, encoder_layer, num_layers, norm=None, with_rp=-1): + super(TransformerEncoder, self).__init__() + self.layers = _get_clones(encoder_layer, num_layers) + self.num_layers = num_layers + self.norm = norm + assert with_rp <= num_layers + self.with_rp = with_rp + + def forward(self, src, src_mask=None, pos_embed=None): + output = src + for i, layer in enumerate(self.layers): + if self.training and i < self.with_rp: + output = recompute( + layer, + output, + src_mask=src_mask, + pos_embed=pos_embed, + **{"preserve_rng_state": True, "use_reentrant": False}, + ) + else: + output = layer(output, src_mask=src_mask, pos_embed=pos_embed) + + if self.norm is not None: + output = self.norm(output) + + return output + + +class TransformerDecoderLayer(nn.Layer): + def __init__( + self, + d_model, + nhead, + dim_feedforward=2048, + dropout=0.1, + activation="relu", + attn_dropout=None, + act_dropout=None, + normalize_before=False, + ): + super(TransformerDecoderLayer, self).__init__() + attn_dropout = dropout if attn_dropout is None else attn_dropout + act_dropout = dropout if act_dropout is None else act_dropout + self.normalize_before = normalize_before + + self.self_attn = MultiHeadAttention(d_model, nhead, attn_dropout) + self.cross_attn = MultiHeadAttention(d_model, nhead, attn_dropout) + # Implementation of Feedforward model + self.linear1 = nn.Linear(d_model, dim_feedforward) + self.dropout = nn.Dropout(act_dropout, mode="upscale_in_train") + self.linear2 = nn.Linear(dim_feedforward, d_model) + + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + self.norm3 = nn.LayerNorm(d_model) + self.dropout1 = nn.Dropout(dropout, 
mode="upscale_in_train") + self.dropout2 = nn.Dropout(dropout, mode="upscale_in_train") + self.dropout3 = nn.Dropout(dropout, mode="upscale_in_train") + self.activation = getattr(F, activation) + self._reset_parameters() + + def _reset_parameters(self): + linear_init_(self.linear1) + linear_init_(self.linear2) + + @staticmethod + def with_pos_embed(tensor, pos_embed): + return tensor if pos_embed is None else tensor + pos_embed + + def forward( + self, + tgt, + memory, + tgt_mask=None, + memory_mask=None, + pos_embed=None, + query_pos_embed=None, + ): + tgt_mask = _convert_attention_mask(tgt_mask, tgt.dtype) + + residual = tgt + if self.normalize_before: + tgt = self.norm1(tgt) + q = k = self.with_pos_embed(tgt, query_pos_embed) + tgt = self.self_attn(q, k, value=tgt, attn_mask=tgt_mask) + tgt = residual + self.dropout1(tgt) + if not self.normalize_before: + tgt = self.norm1(tgt) + + residual = tgt + if self.normalize_before: + tgt = self.norm2(tgt) + q = self.with_pos_embed(tgt, query_pos_embed) + k = self.with_pos_embed(memory, pos_embed) + tgt = self.cross_attn(q, k, value=memory, attn_mask=memory_mask) + tgt = residual + self.dropout2(tgt) + if not self.normalize_before: + tgt = self.norm2(tgt) + + residual = tgt + if self.normalize_before: + tgt = self.norm3(tgt) + tgt = self.linear2(self.dropout(self.activation(self.linear1(tgt)))) + tgt = residual + self.dropout3(tgt) + if not self.normalize_before: + tgt = self.norm3(tgt) + return tgt + + +class TransformerDecoder(nn.Layer): + def __init__(self, decoder_layer, num_layers, norm=None, return_intermediate=False): + super(TransformerDecoder, self).__init__() + self.layers = _get_clones(decoder_layer, num_layers) + self.num_layers = num_layers + self.norm = norm + self.return_intermediate = return_intermediate + + def forward( + self, + tgt, + memory, + tgt_mask=None, + memory_mask=None, + pos_embed=None, + query_pos_embed=None, + ): + tgt_mask = _convert_attention_mask(tgt_mask, tgt.dtype) + + output = tgt + 
intermediate = [] + for layer in self.layers: + output = layer( + output, + memory, + tgt_mask=tgt_mask, + memory_mask=memory_mask, + pos_embed=pos_embed, + query_pos_embed=query_pos_embed, + ) + if self.return_intermediate: + intermediate.append(self.norm(output)) + + if self.norm is not None: + output = self.norm(output) + + if self.return_intermediate: + return paddle.stack(intermediate) + + return output.unsqueeze(0) + + +class DETRTransformer(nn.Layer): + __shared__ = ["hidden_dim"] + + def __init__( + self, + num_queries=100, + position_embed_type="sine", + return_intermediate_dec=True, + backbone_num_channels=2048, + hidden_dim=256, + nhead=8, + num_encoder_layers=6, + num_decoder_layers=6, + dim_feedforward=2048, + dropout=0.1, + activation="relu", + pe_temperature=10000, + pe_offset=0.0, + attn_dropout=None, + act_dropout=None, + normalize_before=False, + ): + super(DETRTransformer, self).__init__() + assert position_embed_type in [ + "sine", + "learned", + ], f"ValueError: position_embed_type not supported {position_embed_type}!" 
+ self.hidden_dim = hidden_dim + self.nhead = nhead + + encoder_layer = TransformerEncoderLayer( + hidden_dim, + nhead, + dim_feedforward, + dropout, + activation, + attn_dropout, + act_dropout, + normalize_before, + ) + encoder_norm = nn.LayerNorm(hidden_dim) if normalize_before else None + self.encoder = TransformerEncoder( + encoder_layer, num_encoder_layers, encoder_norm + ) + + decoder_layer = TransformerDecoderLayer( + hidden_dim, + nhead, + dim_feedforward, + dropout, + activation, + attn_dropout, + act_dropout, + normalize_before, + ) + decoder_norm = nn.LayerNorm(hidden_dim) + self.decoder = TransformerDecoder( + decoder_layer, + num_decoder_layers, + decoder_norm, + return_intermediate=return_intermediate_dec, + ) + + self.input_proj = nn.Conv2D(backbone_num_channels, hidden_dim, kernel_size=1) + self.query_pos_embed = nn.Embedding(num_queries, hidden_dim) + self.position_embedding = PositionEmbedding( + hidden_dim // 2, + temperature=pe_temperature, + normalize=True if position_embed_type == "sine" else False, + embed_type=position_embed_type, + offset=pe_offset, + ) + + self._reset_parameters() + + def _reset_parameters(self): + for p in self.parameters(): + if p.dim() > 1: + xavier_uniform_(p) + conv_init_(self.input_proj) + normal_(self.query_pos_embed.weight) + + @classmethod + def from_config(cls, cfg, input_shape): + return { + "backbone_num_channels": [i.channels for i in input_shape][-1], + } + + def _convert_attention_mask(self, mask): + return (mask - 1.0) * 1e9 + + def forward(self, src, src_mask=None, *args, **kwargs): + r""" + Applies a Transformer model on the inputs. + + Parameters: + src (List(Tensor)): Backbone feature maps with shape [[bs, c, h, w]]. + src_mask (Tensor, optional): A tensor used in multi-head attention + to prevents attention to some unwanted positions, usually the + paddings or the subsequent positions. It is a tensor with shape + [bs, H, W]`. 
When the data type is bool, the unwanted positions + have `False` values and the others have `True` values. When the + data type is int, the unwanted positions have 0 values and the + others have 1 values. When the data type is float, the unwanted + positions have `-INF` values and the others have 0 values. It + can be None when nothing wanted or needed to be prevented + attention to. Default None. + + Returns: + output (Tensor): [num_levels, batch_size, num_queries, hidden_dim] + memory (Tensor): [batch_size, hidden_dim, h, w] + """ + # use last level feature map + src_proj = self.input_proj(src[-1]) + bs, c, h, w = src_proj.shape + # flatten [B, C, H, W] to [B, HxW, C] + src_flatten = src_proj.flatten(2).transpose([0, 2, 1]) + if src_mask is not None: + src_mask = F.interpolate(src_mask.unsqueeze(0), size=(h, w))[0] + else: + src_mask = paddle.ones([bs, h, w]) + pos_embed = self.position_embedding(src_mask).flatten(1, 2) + + if self.training: + src_mask = self._convert_attention_mask(src_mask) + src_mask = src_mask.reshape([bs, 1, 1, h * w]) + else: + src_mask = None + + memory = self.encoder(src_flatten, src_mask=src_mask, pos_embed=pos_embed) + + query_pos_embed = self.query_pos_embed.weight.unsqueeze(0).tile([bs, 1, 1]) + tgt = paddle.zeros_like(query_pos_embed) + output = self.decoder( + tgt, + memory, + memory_mask=src_mask, + pos_embed=pos_embed, + query_pos_embed=query_pos_embed, + ) + + if self.training: + src_mask = src_mask.reshape([bs, 1, 1, h, w]) + else: + src_mask = None + + return ( + output, + memory.transpose([0, 2, 1]).reshape([bs, c, h, w]), + src_proj, + src_mask, + ) diff --git a/paddlex/inference/models/object_detection/modeling/rtdetrl_modules/hgnet_v2.py b/paddlex/inference/models/object_detection/modeling/rtdetrl_modules/hgnet_v2.py new file mode 100644 index 0000000000..3c60cd6831 --- /dev/null +++ b/paddlex/inference/models/object_detection/modeling/rtdetrl_modules/hgnet_v2.py @@ -0,0 +1,513 @@ +# Copyright (c) 2025 PaddlePaddle 
Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +from paddle import ParamAttr +from paddle.nn import BatchNorm2D, Conv2D, ReLU +from paddle.nn.initializer import Constant, KaimingNormal +from paddle.regularizer import L2Decay + +from .modules.detr_ops import ShapeSpec + +__all__ = ["PPHGNetV2"] + +kaiming_normal_ = KaimingNormal() +zeros_ = Constant(value=0.0) +ones_ = Constant(value=1.0) + + +class LearnableAffineBlock(nn.Layer): + def __init__(self, scale_value=1.0, bias_value=0.0, lr_mult=1.0, lab_lr=0.01): + super().__init__() + self.scale = self.create_parameter( + shape=[ + 1, + ], + default_initializer=Constant(value=scale_value), + attr=ParamAttr(learning_rate=lr_mult * lab_lr), + ) + self.add_parameter("scale", self.scale) + self.bias = self.create_parameter( + shape=[ + 1, + ], + default_initializer=Constant(value=bias_value), + attr=ParamAttr(learning_rate=lr_mult * lab_lr), + ) + self.add_parameter("bias", self.bias) + + def forward(self, x): + return self.scale * x + self.bias + + +class ConvBNAct(nn.Layer): + def __init__( + self, + in_channels, + out_channels, + kernel_size=3, + stride=1, + padding=1, + groups=1, + use_act=True, + use_lab=False, + lr_mult=1.0, + ): + super().__init__() + self.use_act = use_act + self.use_lab = use_lab + self.conv = Conv2D( + in_channels, + out_channels, + kernel_size, + stride, + padding=padding if isinstance(padding, str) else (kernel_size - 1) // 2, 
+ groups=groups, + weight_attr=ParamAttr(learning_rate=lr_mult), + bias_attr=False, + ) + self.bn = BatchNorm2D( + out_channels, + weight_attr=ParamAttr(regularizer=L2Decay(0.0), learning_rate=lr_mult), + bias_attr=ParamAttr(regularizer=L2Decay(0.0), learning_rate=lr_mult), + ) + if self.use_act: + self.act = ReLU() + if self.use_lab: + self.lab = LearnableAffineBlock(lr_mult=lr_mult) + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + if self.use_act: + x = self.act(x) + if self.use_lab: + x = self.lab(x) + return x + + +class LightConvBNAct(nn.Layer): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride, + groups=1, + use_lab=False, + lr_mult=1.0, + ): + super().__init__() + self.conv1 = ConvBNAct( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + use_act=False, + use_lab=use_lab, + lr_mult=lr_mult, + ) + self.conv2 = ConvBNAct( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=kernel_size, + groups=out_channels, + use_act=True, + use_lab=use_lab, + lr_mult=lr_mult, + ) + + def forward(self, x): + x = self.conv1(x) + x = self.conv2(x) + return x + + +class StemBlock(nn.Layer): + def __init__( + self, in_channels, mid_channels, out_channels, use_lab=False, lr_mult=1.0 + ): + super().__init__() + self.stem1 = ConvBNAct( + in_channels=in_channels, + out_channels=mid_channels, + kernel_size=3, + stride=2, + use_lab=use_lab, + lr_mult=lr_mult, + ) + self.stem2a = ConvBNAct( + in_channels=mid_channels, + out_channels=mid_channels // 2, + kernel_size=2, + stride=1, + padding="SAME", + use_lab=use_lab, + lr_mult=lr_mult, + ) + self.stem2b = ConvBNAct( + in_channels=mid_channels // 2, + out_channels=mid_channels, + kernel_size=2, + stride=1, + padding="SAME", + use_lab=use_lab, + lr_mult=lr_mult, + ) + self.stem3 = ConvBNAct( + in_channels=mid_channels * 2, + out_channels=mid_channels, + kernel_size=3, + stride=2, + use_lab=use_lab, + lr_mult=lr_mult, + ) + self.stem4 = ConvBNAct( + 
in_channels=mid_channels, + out_channels=out_channels, + kernel_size=1, + stride=1, + use_lab=use_lab, + lr_mult=lr_mult, + ) + self.pool = nn.MaxPool2D( + kernel_size=2, stride=1, ceil_mode=True, padding="SAME" + ) + + def forward(self, x): + x = self.stem1(x) + x2 = self.stem2a(x) + x2 = self.stem2b(x2) + x1 = self.pool(x) + x = paddle.concat([x1, x2], 1) + x = self.stem3(x) + x = self.stem4(x) + + return x + + +class HG_Block(nn.Layer): + def __init__( + self, + in_channels, + mid_channels, + out_channels, + kernel_size=3, + layer_num=6, + identity=False, + light_block=True, + use_lab=False, + lr_mult=1.0, + ): + super().__init__() + self.identity = identity + + self.layers = nn.LayerList() + block_type = "LightConvBNAct" if light_block else "ConvBNAct" + for i in range(layer_num): + self.layers.append( + eval(block_type)( + in_channels=in_channels if i == 0 else mid_channels, + out_channels=mid_channels, + stride=1, + kernel_size=kernel_size, + use_lab=use_lab, + lr_mult=lr_mult, + ) + ) + # feature aggregation + total_channels = in_channels + layer_num * mid_channels + self.aggregation_squeeze_conv = ConvBNAct( + in_channels=total_channels, + out_channels=out_channels // 2, + kernel_size=1, + stride=1, + use_lab=use_lab, + lr_mult=lr_mult, + ) + self.aggregation_excitation_conv = ConvBNAct( + in_channels=out_channels // 2, + out_channels=out_channels, + kernel_size=1, + stride=1, + use_lab=use_lab, + lr_mult=lr_mult, + ) + + def forward(self, x): + identity = x + output = [] + output.append(x) + for layer in self.layers: + x = layer(x) + output.append(x) + x = paddle.concat(output, axis=1) + x = self.aggregation_squeeze_conv(x) + x = self.aggregation_excitation_conv(x) + if self.identity: + x += identity + return x + + +class HG_Stage(nn.Layer): + def __init__( + self, + in_channels, + mid_channels, + out_channels, + block_num, + layer_num=6, + downsample=True, + light_block=True, + kernel_size=3, + use_lab=False, + lr_mult=1.0, + ): + super().__init__() + 
self.downsample = downsample + if downsample: + self.downsample = ConvBNAct( + in_channels=in_channels, + out_channels=in_channels, + kernel_size=3, + stride=2, + groups=in_channels, + use_act=False, + use_lab=use_lab, + lr_mult=lr_mult, + ) + + blocks_list = [] + for i in range(block_num): + blocks_list.append( + HG_Block( + in_channels=in_channels if i == 0 else out_channels, + mid_channels=mid_channels, + out_channels=out_channels, + kernel_size=kernel_size, + layer_num=layer_num, + identity=False if i == 0 else True, + light_block=light_block, + use_lab=use_lab, + lr_mult=lr_mult, + ) + ) + self.blocks = nn.Sequential(*blocks_list) + + def forward(self, x): + if self.downsample: + x = self.downsample(x) + x = self.blocks(x) + return x + + +def _freeze_norm(m: nn.BatchNorm2D): + param_attr = ParamAttr(learning_rate=0.0, regularizer=L2Decay(0.0), trainable=False) + bias_attr = ParamAttr(learning_rate=0.0, regularizer=L2Decay(0.0), trainable=False) + global_stats = True + norm = nn.BatchNorm2D( + m._num_features, + weight_attr=param_attr, + bias_attr=bias_attr, + use_global_stats=global_stats, + ) + for param in norm.parameters(): + param.stop_gradient = True + return norm + + +def reset_bn(model: nn.Layer, reset_func=_freeze_norm): + if isinstance(model, nn.BatchNorm2D): + model = reset_func(model) + else: + for name, child in model.named_children(): + _child = reset_bn(child, reset_func) + if _child is not child: + setattr(model, name, _child) + return model + + +class PPHGNetV2(nn.Layer): + """ + PPHGNetV2 + Args: + stem_channels: list. Number of channels for the stem block. + stage_type: str. The stage configuration of PPHGNet. such as the number of channels, stride, etc. + use_lab: boolean. Whether to use LearnableAffineBlock in network. + lr_mult_list: list. Control the learning rate of different stages. + Returns: + model: nn.Layer. Specific PPHGNetV2 model depends on args. 
+ """ + + arch_configs = { + "N": { + "stem_channels": [3, 16, 16], + "stage_config": { + # in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num + "stage1": [16, 16, 64, 1, False, False, 3, 3], + "stage2": [64, 32, 256, 1, True, False, 3, 3], + "stage3": [256, 64, 512, 2, True, True, 5, 3], + "stage4": [512, 128, 1024, 1, True, True, 5, 3], + }, + }, + "S": { + "stem_channels": [3, 24, 32], + "stage_config": { + # in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num + "stage1": [32, 32, 64, 1, False, False, 3, 3], + "stage2": [64, 48, 256, 1, True, False, 3, 3], + "stage3": [256, 96, 512, 2, True, True, 5, 3], + "stage4": [512, 192, 1024, 1, True, True, 5, 3], + }, + }, + "M": { + "stem_channels": [3, 24, 32], + "stage_config": { + # in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num + "stage1": [32, 32, 96, 1, False, False, 3, 4], + "stage2": [96, 64, 384, 1, True, False, 3, 4], + "stage3": [384, 128, 768, 3, True, True, 5, 4], + "stage4": [768, 256, 1536, 1, True, True, 5, 4], + }, + }, + "L": { + "stem_channels": [3, 32, 48], + "stage_config": { + # in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num + "stage1": [48, 48, 128, 1, False, False, 3, 6], + "stage2": [128, 96, 512, 1, True, False, 3, 6], + "stage3": [512, 192, 1024, 3, True, True, 5, 6], + "stage4": [1024, 384, 2048, 1, True, True, 5, 6], + }, + }, + "X": { + "stem_channels": [3, 32, 64], + "stage_config": { + # in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num + "stage1": [64, 64, 128, 1, False, False, 3, 6], + "stage2": [128, 128, 512, 2, True, False, 3, 6], + "stage3": [512, 256, 1024, 5, True, True, 5, 6], + "stage4": [1024, 512, 2048, 2, True, True, 5, 6], + }, + }, + "H": { + "stem_channels": [3, 48, 96], + "stage_config": { + # in_channels, 
mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num + "stage1": [96, 96, 192, 2, False, False, 3, 6], + "stage2": [192, 192, 512, 3, True, False, 3, 6], + "stage3": [512, 384, 1024, 6, True, True, 5, 6], + "stage4": [1024, 768, 2048, 3, True, True, 5, 6], + }, + }, + } + + def __init__( + self, + arch, + use_lab=False, + lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0], + return_idx=[1, 2, 3], + freeze_stem_only=True, + freeze_at=0, + freeze_norm=True, + ): + super().__init__() + self.use_lab = use_lab + self.return_idx = return_idx + + stem_channels = self.arch_configs[arch]["stem_channels"] + stage_config = self.arch_configs[arch]["stage_config"] + + self._out_strides = [4, 8, 16, 32] + self._out_channels = [stage_config[k][2] for k in stage_config] + + # stem + self.stem = StemBlock( + in_channels=stem_channels[0], + mid_channels=stem_channels[1], + out_channels=stem_channels[2], + use_lab=use_lab, + lr_mult=lr_mult_list[0], + ) + + # stages + self.stages = nn.LayerList() + for i, k in enumerate(stage_config): + ( + in_channels, + mid_channels, + out_channels, + block_num, + downsample, + light_block, + kernel_size, + layer_num, + ) = stage_config[k] + self.stages.append( + HG_Stage( + in_channels, + mid_channels, + out_channels, + block_num, + layer_num, + downsample, + light_block, + kernel_size, + use_lab, + lr_mult=lr_mult_list[i + 1], + ) + ) + + if freeze_at >= 0: + self._freeze_parameters(self.stem) + if not freeze_stem_only: + for i in range(min(freeze_at + 1, len(self.stages))): + self._freeze_parameters(self.stages[i]) + + if freeze_norm: + reset_bn(self, reset_func=_freeze_norm) + + self._init_weights() + + def _freeze_parameters(self, m): + for p in m.parameters(): + p.stop_gradient = True + + def _init_weights(self): + for m in self.sublayers(): + if isinstance(m, nn.Conv2D): + kaiming_normal_(m.weight) + elif isinstance(m, (nn.BatchNorm2D)): + ones_(m.weight) + zeros_(m.bias) + elif isinstance(m, nn.Linear): + 
zeros_(m.bias) + + @property + def out_shape(self): + return [ + ShapeSpec(channels=self._out_channels[i], stride=self._out_strides[i]) + for i in self.return_idx + ] + + def forward(self, inputs): + x = inputs + x = self.stem(x) + outs = [] + for idx, stage in enumerate(self.stages): + x = stage(x) + if idx in self.return_idx: + outs.append(x) + return outs diff --git a/paddlex/inference/models/object_detection/modeling/rtdetrl_modules/hybrid_encoder.py b/paddlex/inference/models/object_detection/modeling/rtdetrl_modules/hybrid_encoder.py new file mode 100644 index 0000000000..257f583b3d --- /dev/null +++ b/paddlex/inference/models/object_detection/modeling/rtdetrl_modules/hybrid_encoder.py @@ -0,0 +1,314 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import ParamAttr +from paddle.regularizer import L2Decay + +from .detr_transformer import TransformerEncoder +from .modules.csp_darknet import BaseConv +from .modules.cspresnet import RepVggBlock +from .modules.detr_ops import ShapeSpec +from .modules.initializer import linear_init_ +from .modules.layers import MultiHeadAttention +from .modules.ops import get_act_fn + +__all__ = ["HybridEncoder"] + + +class CSPRepLayer(nn.Layer): + def __init__( + self, + in_channels, + out_channels, + num_blocks=3, + expansion=1.0, + bias=False, + act="silu", + ): + super(CSPRepLayer, self).__init__() + hidden_channels = int(out_channels * expansion) + self.conv1 = BaseConv( + in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act + ) + self.conv2 = BaseConv( + in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act + ) + self.bottlenecks = nn.Sequential( + *[ + RepVggBlock(hidden_channels, hidden_channels, act=act) + for _ in range(num_blocks) + ] + ) + if hidden_channels != out_channels: + self.conv3 = BaseConv( + hidden_channels, out_channels, ksize=1, stride=1, bias=bias, act=act + ) + else: + self.conv3 = nn.Identity() + + def forward(self, x): + x_1 = self.conv1(x) + x_1 = self.bottlenecks(x_1) + x_2 = self.conv2(x) + return self.conv3(x_1 + x_2) + + +class TransformerLayer(nn.Layer): + def __init__( + self, + d_model, + nhead, + dim_feedforward=1024, + dropout=0.0, + activation="relu", + attn_dropout=None, + act_dropout=None, + normalize_before=False, + ): + super(TransformerLayer, self).__init__() + attn_dropout = dropout if attn_dropout is None else attn_dropout + act_dropout = dropout if act_dropout is None else act_dropout + self.normalize_before = normalize_before + + self.self_attn = MultiHeadAttention(d_model, nhead, attn_dropout) + # Implementation of Feedforward model + self.linear1 = nn.Linear(d_model, dim_feedforward) + self.dropout = 
nn.Dropout(act_dropout, mode="upscale_in_train") + self.linear2 = nn.Linear(dim_feedforward, d_model) + + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train") + self.dropout2 = nn.Dropout(dropout, mode="upscale_in_train") + self.activation = getattr(F, activation) + self._reset_parameters() + + def _reset_parameters(self): + linear_init_(self.linear1) + linear_init_(self.linear2) + + @staticmethod + def with_pos_embed(tensor, pos_embed): + return tensor if pos_embed is None else tensor + pos_embed + + def forward(self, src, src_mask=None, pos_embed=None): + residual = src + if self.normalize_before: + src = self.norm1(src) + q = k = self.with_pos_embed(src, pos_embed) + src = self.self_attn(q, k, value=src, attn_mask=src_mask) + + src = residual + self.dropout1(src) + if not self.normalize_before: + src = self.norm1(src) + + residual = src + if self.normalize_before: + src = self.norm2(src) + src = self.linear2(self.dropout(self.activation(self.linear1(src)))) + src = residual + self.dropout2(src) + if not self.normalize_before: + src = self.norm2(src) + return src + + +class HybridEncoder(nn.Layer): + __shared__ = ["depth_mult", "act", "trt", "eval_size"] + __inject__ = ["encoder_layer"] + + def __init__( + self, + in_channels=[512, 1024, 2048], + feat_strides=[8, 16, 32], + hidden_dim=256, + use_encoder_idx=[2], + num_encoder_layers=1, + encoder_layer="TransformerLayer", + pe_temperature=10000, + expansion=1.0, + depth_mult=1.0, + act="silu", + trt=False, + eval_size=None, + with_rp=-1, + ): + super(HybridEncoder, self).__init__() + self.in_channels = in_channels + self.feat_strides = feat_strides + self.hidden_dim = hidden_dim + self.use_encoder_idx = use_encoder_idx + self.num_encoder_layers = num_encoder_layers + self.pe_temperature = pe_temperature + self.eval_size = eval_size + + # channel projection + self.input_proj = nn.LayerList() + for in_channel in in_channels: + 
self.input_proj.append( + nn.Sequential( + nn.Conv2D(in_channel, hidden_dim, kernel_size=1, bias_attr=False), + nn.BatchNorm2D( + hidden_dim, + weight_attr=ParamAttr(regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(regularizer=L2Decay(0.0)), + ), + ) + ) + # encoder transformer + self.encoder = nn.LayerList( + [ + TransformerEncoder(encoder_layer, num_encoder_layers, with_rp=with_rp) + for _ in range(len(use_encoder_idx)) + ] + ) + + act = ( + get_act_fn(act, trt=trt) + if act is None or isinstance(act, (str, dict)) + else act + ) + # top-down fpn + self.lateral_convs = nn.LayerList() + self.fpn_blocks = nn.LayerList() + for idx in range(len(in_channels) - 1, 0, -1): + self.lateral_convs.append(BaseConv(hidden_dim, hidden_dim, 1, 1, act=act)) + self.fpn_blocks.append( + CSPRepLayer( + hidden_dim * 2, + hidden_dim, + round(3 * depth_mult), + act=act, + expansion=expansion, + ) + ) + + # bottom-up pan + self.downsample_convs = nn.LayerList() + self.pan_blocks = nn.LayerList() + for idx in range(len(in_channels) - 1): + self.downsample_convs.append( + BaseConv(hidden_dim, hidden_dim, 3, stride=2, act=act) + ) + self.pan_blocks.append( + CSPRepLayer( + hidden_dim * 2, + hidden_dim, + round(3 * depth_mult), + act=act, + expansion=expansion, + ) + ) + + self._reset_parameters() + + def _reset_parameters(self): + if self.eval_size: + for idx in self.use_encoder_idx: + stride = self.feat_strides[idx] + pos_embed = self.build_2d_sincos_position_embedding( + self.eval_size[1] // stride, + self.eval_size[0] // stride, + self.hidden_dim, + self.pe_temperature, + ) + setattr(self, f"pos_embed{idx}", pos_embed) + + @staticmethod + def build_2d_sincos_position_embedding(w, h, embed_dim=256, temperature=10000.0): + grid_w = paddle.arange(int(w), dtype=paddle.float32) + grid_h = paddle.arange(int(h), dtype=paddle.float32) + grid_w, grid_h = paddle.meshgrid(grid_w, grid_h) + assert ( + embed_dim % 4 == 0 + ), "Embed dimension must be divisible by 4 for 2D sin-cos position embedding" 
+ pos_dim = embed_dim // 4 + omega = paddle.arange(pos_dim, dtype=paddle.float32) / pos_dim + omega = 1.0 / (temperature**omega) + + out_w = grid_w.flatten()[..., None] @ omega[None] + out_h = grid_h.flatten()[..., None] @ omega[None] + + return paddle.concat( + [ + paddle.sin(out_w), + paddle.cos(out_w), + paddle.sin(out_h), + paddle.cos(out_h), + ], + axis=1, + )[None, :, :] + + def forward(self, feats, for_mot=False, is_teacher=False): + assert len(feats) == len(self.in_channels) + # get projection features + proj_feats = [self.input_proj[i](feat) for i, feat in enumerate(feats)] + # encoder + if self.num_encoder_layers > 0: + for i, enc_ind in enumerate(self.use_encoder_idx): + h, w = proj_feats[enc_ind].shape[2:] + # flatten [B, C, H, W] to [B, HxW, C] + src_flatten = proj_feats[enc_ind].flatten(2).transpose([0, 2, 1]) + if self.training or self.eval_size is None or is_teacher: + pos_embed = self.build_2d_sincos_position_embedding( + w, h, self.hidden_dim, self.pe_temperature + ) + else: + pos_embed = getattr(self, f"pos_embed{enc_ind}", None) + memory = self.encoder[i](src_flatten, pos_embed=pos_embed) + proj_feats[enc_ind] = memory.transpose([0, 2, 1]).reshape( + [-1, self.hidden_dim, h, w] + ) + + # top-down fpn + inner_outs = [proj_feats[-1]] + for idx in range(len(self.in_channels) - 1, 0, -1): + feat_heigh = inner_outs[0] + feat_low = proj_feats[idx - 1] + feat_heigh = self.lateral_convs[len(self.in_channels) - 1 - idx](feat_heigh) + inner_outs[0] = feat_heigh + upsample_feat = F.interpolate(feat_heigh, scale_factor=2.0, mode="nearest") + inner_out = self.fpn_blocks[len(self.in_channels) - 1 - idx]( + paddle.concat([upsample_feat, feat_low], axis=1) + ) + inner_outs.insert(0, inner_out) + + # bottom-up pan + outs = [inner_outs[0]] + for idx in range(len(self.in_channels) - 1): + feat_low = outs[-1] + feat_height = inner_outs[idx + 1] + downsample_feat = self.downsample_convs[idx](feat_low) + out = self.pan_blocks[idx]( + paddle.concat([downsample_feat, 
feat_height], axis=1) + ) + outs.append(out) + + return outs + + @classmethod + def from_config(cls, cfg, input_shape): + return { + "in_channels": [i.channels for i in input_shape], + "feat_strides": [i.stride for i in input_shape], + } + + @property + def out_shape(self): + return [ + ShapeSpec(channels=self.hidden_dim, stride=self.feat_strides[idx]) + for idx in range(len(self.in_channels)) + ] diff --git a/paddlex/inference/models/object_detection/modeling/rtdetrl_modules/modules/csp_darknet.py b/paddlex/inference/models/object_detection/modeling/rtdetrl_modules/modules/csp_darknet.py new file mode 100644 index 0000000000..8a8531aeb4 --- /dev/null +++ b/paddlex/inference/models/object_detection/modeling/rtdetrl_modules/modules/csp_darknet.py @@ -0,0 +1,396 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import ParamAttr +from paddle.regularizer import L2Decay + +from .detr_ops import ShapeSpec +from .initializer import conv_init_ + +__all__ = ["CSPDarkNet", "BaseConv", "DWConv", "BottleNeck", "SPPLayer", "SPPFLayer"] + + +class BaseConv(nn.Layer): + def __init__( + self, in_channels, out_channels, ksize, stride, groups=1, bias=False, act="silu" + ): + super(BaseConv, self).__init__() + self.conv = nn.Conv2D( + in_channels, + out_channels, + kernel_size=ksize, + stride=stride, + padding=(ksize - 1) // 2, + groups=groups, + bias_attr=bias, + ) + self.bn = nn.BatchNorm2D( + out_channels, + weight_attr=ParamAttr(regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(regularizer=L2Decay(0.0)), + ) + + self._init_weights() + + def _init_weights(self): + conv_init_(self.conv) + + def forward(self, x): + # use 'x * F.sigmoid(x)' replace 'silu' + x = self.bn(self.conv(x)) + y = x * F.sigmoid(x) + return y + + +class DWConv(nn.Layer): + """Depthwise Conv""" + + def __init__( + self, in_channels, out_channels, ksize, stride=1, bias=False, act="silu" + ): + super(DWConv, self).__init__() + self.dw_conv = BaseConv( + in_channels, + in_channels, + ksize=ksize, + stride=stride, + groups=in_channels, + bias=bias, + act=act, + ) + self.pw_conv = BaseConv( + in_channels, out_channels, ksize=1, stride=1, groups=1, bias=bias, act=act + ) + + def forward(self, x): + return self.pw_conv(self.dw_conv(x)) + + +class Focus(nn.Layer): + """Focus width and height information into channel space, used in YOLOX.""" + + def __init__( + self, in_channels, out_channels, ksize=3, stride=1, bias=False, act="silu" + ): + super(Focus, self).__init__() + self.conv = BaseConv( + in_channels * 4, + out_channels, + ksize=ksize, + stride=stride, + bias=bias, + act=act, + ) + + def forward(self, inputs): + # inputs [bs, C, H, W] -> outputs [bs, 4C, W/2, H/2] + top_left = inputs[:, :, 0::2, 0::2] + top_right = inputs[:, :, 
0::2, 1::2] + bottom_left = inputs[:, :, 1::2, 0::2] + bottom_right = inputs[:, :, 1::2, 1::2] + outputs = paddle.concat([top_left, bottom_left, top_right, bottom_right], 1) + return self.conv(outputs) + + +class BottleNeck(nn.Layer): + def __init__( + self, + in_channels, + out_channels, + shortcut=True, + expansion=0.5, + depthwise=False, + bias=False, + act="silu", + ): + super(BottleNeck, self).__init__() + hidden_channels = int(out_channels * expansion) + Conv = DWConv if depthwise else BaseConv + self.conv1 = BaseConv( + in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act + ) + self.conv2 = Conv( + hidden_channels, out_channels, ksize=3, stride=1, bias=bias, act=act + ) + self.add_shortcut = shortcut and in_channels == out_channels + + def forward(self, x): + y = self.conv2(self.conv1(x)) + if self.add_shortcut: + y = y + x + return y + + +class SPPLayer(nn.Layer): + """Spatial Pyramid Pooling (SPP) layer used in YOLOv3-SPP and YOLOX""" + + def __init__( + self, in_channels, out_channels, kernel_sizes=(5, 9, 13), bias=False, act="silu" + ): + super(SPPLayer, self).__init__() + hidden_channels = in_channels // 2 + self.conv1 = BaseConv( + in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act + ) + self.maxpoolings = nn.LayerList( + [ + nn.MaxPool2D(kernel_size=ks, stride=1, padding=ks // 2) + for ks in kernel_sizes + ] + ) + conv2_channels = hidden_channels * (len(kernel_sizes) + 1) + self.conv2 = BaseConv( + conv2_channels, out_channels, ksize=1, stride=1, bias=bias, act=act + ) + + def forward(self, x): + x = self.conv1(x) + x = paddle.concat([x] + [mp(x) for mp in self.maxpoolings], axis=1) + x = self.conv2(x) + return x + + +class SPPFLayer(nn.Layer): + """Spatial Pyramid Pooling - Fast (SPPF) layer used in YOLOv5 by Glenn Jocher, + equivalent to SPP(k=(5, 9, 13)) + """ + + def __init__(self, in_channels, out_channels, ksize=5, bias=False, act="silu"): + super(SPPFLayer, self).__init__() + hidden_channels = in_channels // 2 + 
self.conv1 = BaseConv( + in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act + ) + self.maxpooling = nn.MaxPool2D(kernel_size=ksize, stride=1, padding=ksize // 2) + conv2_channels = hidden_channels * 4 + self.conv2 = BaseConv( + conv2_channels, out_channels, ksize=1, stride=1, bias=bias, act=act + ) + + def forward(self, x): + x = self.conv1(x) + y1 = self.maxpooling(x) + y2 = self.maxpooling(y1) + y3 = self.maxpooling(y2) + concats = paddle.concat([x, y1, y2, y3], axis=1) + out = self.conv2(concats) + return out + + +class CSPLayer(nn.Layer): + """CSP (Cross Stage Partial) layer with 3 convs, named C3 in YOLOv5""" + + def __init__( + self, + in_channels, + out_channels, + num_blocks=1, + shortcut=True, + expansion=0.5, + depthwise=False, + bias=False, + act="silu", + ): + super(CSPLayer, self).__init__() + hidden_channels = int(out_channels * expansion) + self.conv1 = BaseConv( + in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act + ) + self.conv2 = BaseConv( + in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act + ) + self.bottlenecks = nn.Sequential( + *[ + BottleNeck( + hidden_channels, + hidden_channels, + shortcut=shortcut, + expansion=1.0, + depthwise=depthwise, + bias=bias, + act=act, + ) + for _ in range(num_blocks) + ] + ) + self.conv3 = BaseConv( + hidden_channels * 2, out_channels, ksize=1, stride=1, bias=bias, act=act + ) + + def forward(self, x): + x_1 = self.conv1(x) + x_1 = self.bottlenecks(x_1) + x_2 = self.conv2(x) + x = paddle.concat([x_1, x_2], axis=1) + x = self.conv3(x) + return x + + +class CSPDarkNet(nn.Layer): + """ + CSPDarkNet backbone. + Args: + arch (str): Architecture of CSPDarkNet, from {P5, P6, X}, default as X, + and 'X' means used in YOLOX, 'P5/P6' means used in YOLOv5. + depth_mult (float): Depth multiplier, multiply number of channels in + each layer, default as 1.0. + width_mult (float): Width multiplier, multiply number of blocks in + CSPLayer, default as 1.0. 
+ depthwise (bool): Whether to use depth-wise conv layer. + act (str): Activation function type, default as 'silu'. + return_idx (list): Index of stages whose feature maps are returned. + """ + + __shared__ = ["depth_mult", "width_mult", "act", "trt"] + + # in_channels, out_channels, num_blocks, add_shortcut, use_spp(use_sppf) + # 'X' means setting used in YOLOX, 'P5/P6' means setting used in YOLOv5. + arch_settings = { + "X": [ + [64, 128, 3, True, False], + [128, 256, 9, True, False], + [256, 512, 9, True, False], + [512, 1024, 3, False, True], + ], + "P5": [ + [64, 128, 3, True, False], + [128, 256, 6, True, False], + [256, 512, 9, True, False], + [512, 1024, 3, True, True], + ], + "P6": [ + [64, 128, 3, True, False], + [128, 256, 6, True, False], + [256, 512, 9, True, False], + [512, 768, 3, True, False], + [768, 1024, 3, True, True], + ], + } + + def __init__( + self, + arch="X", + depth_mult=1.0, + width_mult=1.0, + depthwise=False, + act="silu", + trt=False, + return_idx=[2, 3, 4], + ): + super(CSPDarkNet, self).__init__() + self.arch = arch + self.return_idx = return_idx + Conv = DWConv if depthwise else BaseConv + arch_setting = self.arch_settings[arch] + base_channels = int(arch_setting[0][0] * width_mult) + + # Note: differences between the latest YOLOv5 and the original YOLOX + # 1. self.stem, use SPPF(in YOLOv5) or SPP(in YOLOX) + # 2. use SPPF(in YOLOv5) or SPP(in YOLOX) + # 3. put SPPF before(YOLOv5) or SPP after(YOLOX) the last cspdark block's CSPLayer + # 4. 
whether SPPF(SPP)'CSPLayer add shortcut, True in YOLOv5, False in YOLOX + if arch in ["P5", "P6"]: + # in the latest YOLOv5, use Conv stem, and SPPF (fast, only single spp kernal size) + self.stem = Conv(3, base_channels, ksize=6, stride=2, bias=False, act=act) + spp_kernal_sizes = 5 + elif arch in ["X"]: + # in the original YOLOX, use Focus stem, and SPP (three spp kernal sizes) + self.stem = Focus(3, base_channels, ksize=3, stride=1, bias=False, act=act) + spp_kernal_sizes = (5, 9, 13) + else: + raise AttributeError("Unsupported arch type: {}".format(arch)) + + _out_channels = [base_channels] + layers_num = 1 + self.csp_dark_blocks = [] + + for i, (in_channels, out_channels, num_blocks, shortcut, use_spp) in enumerate( + arch_setting + ): + in_channels = int(in_channels * width_mult) + out_channels = int(out_channels * width_mult) + _out_channels.append(out_channels) + num_blocks = max(round(num_blocks * depth_mult), 1) + stage = [] + + conv_layer = self.add_sublayer( + "layers{}.stage{}.conv_layer".format(layers_num, i + 1), + Conv(in_channels, out_channels, 3, 2, bias=False, act=act), + ) + stage.append(conv_layer) + layers_num += 1 + + if use_spp and arch in ["X"]: + # in YOLOX use SPPLayer + spp_layer = self.add_sublayer( + "layers{}.stage{}.spp_layer".format(layers_num, i + 1), + SPPLayer( + out_channels, + out_channels, + kernel_sizes=spp_kernal_sizes, + bias=False, + act=act, + ), + ) + stage.append(spp_layer) + layers_num += 1 + + csp_layer = self.add_sublayer( + "layers{}.stage{}.csp_layer".format(layers_num, i + 1), + CSPLayer( + out_channels, + out_channels, + num_blocks=num_blocks, + shortcut=shortcut, + depthwise=depthwise, + bias=False, + act=act, + ), + ) + stage.append(csp_layer) + layers_num += 1 + + if use_spp and arch in ["P5", "P6"]: + # in latest YOLOv5 use SPPFLayer instead of SPPLayer + sppf_layer = self.add_sublayer( + "layers{}.stage{}.sppf_layer".format(layers_num, i + 1), + SPPFLayer(out_channels, out_channels, ksize=5, bias=False, 
act=act), + ) + stage.append(sppf_layer) + layers_num += 1 + + self.csp_dark_blocks.append(nn.Sequential(*stage)) + + self._out_channels = [_out_channels[i] for i in self.return_idx] + self.strides = [[2, 4, 8, 16, 32, 64][i] for i in self.return_idx] + + def forward(self, inputs): + x = inputs["image"] + outputs = [] + x = self.stem(x) + for i, layer in enumerate(self.csp_dark_blocks): + x = layer(x) + if i + 1 in self.return_idx: + outputs.append(x) + return outputs + + @property + def out_shape(self): + return [ + ShapeSpec(channels=c, stride=s) + for c, s in zip(self._out_channels, self.strides) + ] diff --git a/paddlex/inference/models/object_detection/modeling/rtdetrl_modules/modules/cspresnet.py b/paddlex/inference/models/object_detection/modeling/rtdetrl_modules/modules/cspresnet.py new file mode 100644 index 0000000000..7ac992aebc --- /dev/null +++ b/paddlex/inference/models/object_detection/modeling/rtdetrl_modules/modules/cspresnet.py @@ -0,0 +1,338 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import absolute_import, division, print_function + +import paddle +import paddle.nn as nn +from paddle import ParamAttr +from paddle.nn.initializer import Constant +from paddle.regularizer import L2Decay + +from .detr_ops import ShapeSpec +from .ops import get_act_fn + +__all__ = ["CSPResNet", "BasicBlock", "EffectiveSELayer", "ConvBNLayer"] + + +class ConvBNLayer(nn.Layer): + def __init__( + self, ch_in, ch_out, filter_size=3, stride=1, groups=1, padding=0, act=None + ): + super(ConvBNLayer, self).__init__() + + self.conv = nn.Conv2D( + in_channels=ch_in, + out_channels=ch_out, + kernel_size=filter_size, + stride=stride, + padding=padding, + groups=groups, + bias_attr=False, + ) + + self.bn = nn.BatchNorm2D( + ch_out, + weight_attr=ParamAttr(regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(regularizer=L2Decay(0.0)), + ) + self.act = ( + get_act_fn(act) if act is None or isinstance(act, (str, dict)) else act + ) + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + x = self.act(x) + + return x + + +class RepVggBlock(nn.Layer): + def __init__(self, ch_in, ch_out, act="relu", alpha=False): + super(RepVggBlock, self).__init__() + self.ch_in = ch_in + self.ch_out = ch_out + self.conv1 = ConvBNLayer(ch_in, ch_out, 3, stride=1, padding=1, act=None) + self.conv2 = ConvBNLayer(ch_in, ch_out, 1, stride=1, padding=0, act=None) + self.act = ( + get_act_fn(act) if act is None or isinstance(act, (str, dict)) else act + ) + if alpha: + self.alpha = self.create_parameter( + shape=[1], + attr=ParamAttr(initializer=Constant(value=1.0)), + dtype="float32", + ) + else: + self.alpha = None + + def forward(self, x): + if hasattr(self, "conv"): + y = self.conv(x) + else: + if self.alpha is not None: + y = self.conv1(x) + self.alpha * self.conv2(x) + else: + y = self.conv1(x) + self.conv2(x) + y = self.act(y) + return y + + def convert_to_deploy(self): + if not hasattr(self, "conv"): + self.conv = nn.Conv2D( + in_channels=self.ch_in, + 
out_channels=self.ch_out, + kernel_size=3, + stride=1, + padding=1, + groups=1, + ) + kernel, bias = self.get_equivalent_kernel_bias() + self.conv.weight.set_value(kernel) + self.conv.bias.set_value(bias) + self.__delattr__("conv1") + self.__delattr__("conv2") + + def get_equivalent_kernel_bias(self): + kernel3x3, bias3x3 = self._fuse_bn_tensor(self.conv1) + kernel1x1, bias1x1 = self._fuse_bn_tensor(self.conv2) + if self.alpha is not None: + return ( + kernel3x3 + self.alpha * self._pad_1x1_to_3x3_tensor(kernel1x1), + bias3x3 + self.alpha * bias1x1, + ) + else: + return kernel3x3 + self._pad_1x1_to_3x3_tensor(kernel1x1), bias3x3 + bias1x1 + + def _pad_1x1_to_3x3_tensor(self, kernel1x1): + if kernel1x1 is None: + return 0 + else: + return nn.functional.pad(kernel1x1, [1, 1, 1, 1]) + + def _fuse_bn_tensor(self, branch): + if branch is None: + return 0, 0 + kernel = branch.conv.weight + running_mean = branch.bn._mean + running_var = branch.bn._variance + gamma = branch.bn.weight + beta = branch.bn.bias + eps = branch.bn._epsilon + std = (running_var + eps).sqrt() + t = (gamma / std).reshape((-1, 1, 1, 1)) + return kernel * t, beta - running_mean * gamma / std + + +class BasicBlock(nn.Layer): + def __init__(self, ch_in, ch_out, act="relu", shortcut=True, use_alpha=False): + super(BasicBlock, self).__init__() + assert ch_in == ch_out + self.conv1 = ConvBNLayer(ch_in, ch_out, 3, stride=1, padding=1, act=act) + self.conv2 = RepVggBlock(ch_out, ch_out, act=act, alpha=use_alpha) + self.shortcut = shortcut + + def forward(self, x): + y = self.conv1(x) + y = self.conv2(y) + if self.shortcut: + return paddle.add(x, y) + else: + return y + + +class EffectiveSELayer(nn.Layer): + """Effective Squeeze-Excitation + From `CenterMask : Real-Time Anchor-Free Instance Segmentation` - https://arxiv.org/abs/1911.06667 + """ + + def __init__(self, channels, act="hardsigmoid"): + super(EffectiveSELayer, self).__init__() + self.fc = nn.Conv2D(channels, channels, kernel_size=1, padding=0) + 
self.act = ( + get_act_fn(act) if act is None or isinstance(act, (str, dict)) else act + ) + + def forward(self, x): + x_se = x.mean((2, 3), keepdim=True) + x_se = self.fc(x_se) + return x * self.act(x_se) + + +class CSPResStage(nn.Layer): + def __init__( + self, + block_fn, + ch_in, + ch_out, + n, + stride, + act="relu", + attn="eca", + use_alpha=False, + ): + super(CSPResStage, self).__init__() + + ch_mid = (ch_in + ch_out) // 2 + if stride == 2: + self.conv_down = ConvBNLayer(ch_in, ch_mid, 3, stride=2, padding=1, act=act) + else: + self.conv_down = None + self.conv1 = ConvBNLayer(ch_mid, ch_mid // 2, 1, act=act) + self.conv2 = ConvBNLayer(ch_mid, ch_mid // 2, 1, act=act) + self.blocks = nn.Sequential( + *[ + block_fn( + ch_mid // 2, + ch_mid // 2, + act=act, + shortcut=True, + use_alpha=use_alpha, + ) + for i in range(n) + ] + ) + if attn: + self.attn = EffectiveSELayer(ch_mid, act="hardsigmoid") + else: + self.attn = None + + self.conv3 = ConvBNLayer(ch_mid, ch_out, 1, act=act) + + def forward(self, x): + if self.conv_down is not None: + x = self.conv_down(x) + y1 = self.conv1(x) + y2 = self.blocks(self.conv2(x)) + y = paddle.concat([y1, y2], axis=1) + if self.attn is not None: + y = self.attn(y) + y = self.conv3(y) + return y + + +class CSPResNet(nn.Layer): + __shared__ = ["width_mult", "depth_mult", "trt"] + + def __init__( + self, + layers=[3, 6, 6, 3], + channels=[64, 128, 256, 512, 1024], + act="swish", + return_idx=[1, 2, 3], + depth_wise=False, + use_large_stem=False, + width_mult=1.0, + depth_mult=1.0, + trt=False, + use_checkpoint=False, + use_alpha=False, + **args + ): + super(CSPResNet, self).__init__() + self.use_checkpoint = use_checkpoint + channels = [max(round(c * width_mult), 1) for c in channels] + layers = [max(round(l * depth_mult), 1) for l in layers] + act = ( + get_act_fn(act, trt=trt) + if act is None or isinstance(act, (str, dict)) + else act + ) + + if use_large_stem: + self.stem = nn.Sequential( + ( + "conv1", + ConvBNLayer(3, 
channels[0] // 2, 3, stride=2, padding=1, act=act), + ), + ( + "conv2", + ConvBNLayer( + channels[0] // 2, + channels[0] // 2, + 3, + stride=1, + padding=1, + act=act, + ), + ), + ( + "conv3", + ConvBNLayer( + channels[0] // 2, channels[0], 3, stride=1, padding=1, act=act + ), + ), + ) + else: + self.stem = nn.Sequential( + ( + "conv1", + ConvBNLayer(3, channels[0] // 2, 3, stride=2, padding=1, act=act), + ), + ( + "conv2", + ConvBNLayer( + channels[0] // 2, channels[0], 3, stride=1, padding=1, act=act + ), + ), + ) + + n = len(channels) - 1 + self.stages = nn.Sequential( + *[ + ( + str(i), + CSPResStage( + BasicBlock, + channels[i], + channels[i + 1], + layers[i], + 2, + act=act, + use_alpha=use_alpha, + ), + ) + for i in range(n) + ] + ) + + self._out_channels = channels[1:] + self._out_strides = [4 * 2**i for i in range(n)] + self.return_idx = return_idx + if use_checkpoint: + paddle.seed(0) + + def forward(self, inputs): + x = inputs["image"] + x = self.stem(x) + outs = [] + for idx, stage in enumerate(self.stages): + if self.use_checkpoint and self.training: + x = paddle.distributed.fleet.utils.recompute( + stage, x, **{"preserve_rng_state": True} + ) + else: + x = stage(x) + if idx in self.return_idx: + outs.append(x) + + return outs + + @property + def out_shape(self): + return [ + ShapeSpec(channels=self._out_channels[i], stride=self._out_strides[i]) + for i in self.return_idx + ] diff --git a/paddlex/inference/models/object_detection/modeling/rtdetrl_modules/modules/deformable_transformer.py b/paddlex/inference/models/object_detection/modeling/rtdetrl_modules/modules/deformable_transformer.py new file mode 100644 index 0000000000..8d5045e153 --- /dev/null +++ b/paddlex/inference/models/object_detection/modeling/rtdetrl_modules/modules/deformable_transformer.py @@ -0,0 +1,758 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Modified from Deformable-DETR (https://github.com/fundamentalvision/Deformable-DETR) +# Copyright (c) 2020 SenseTime. All Rights Reserved. + +from __future__ import absolute_import, division, print_function + +import math + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import ParamAttr +from paddle.distributed.fleet.utils import recompute + +from .detr_ops import _get_clones, get_valid_ratio +from .initializer import constant_, linear_init_, normal_, xavier_uniform_ +from .layers import MultiHeadAttention +from .position_encoding import PositionEmbedding +from .utils import deformable_attention_core_func as ms_deformable_attn + +__all__ = ["DeformableTransformer"] + + +class MSDeformableAttention(nn.Layer): + def __init__( + self, embed_dim=256, num_heads=8, num_levels=4, num_points=4, lr_mult=0.1 + ): + """ + Multi-Scale Deformable Attention Module + """ + super(MSDeformableAttention, self).__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + self.num_levels = num_levels + self.num_points = num_points + self.total_points = num_heads * num_levels * num_points + + self.head_dim = embed_dim // num_heads + assert ( + self.head_dim * num_heads == self.embed_dim + ), "embed_dim must be divisible by num_heads" + + self.sampling_offsets = nn.Linear( + embed_dim, + self.total_points * 2, + weight_attr=ParamAttr(learning_rate=lr_mult), + 
bias_attr=ParamAttr(learning_rate=lr_mult), + ) + + self.attention_weights = nn.Linear(embed_dim, self.total_points) + self.value_proj = nn.Linear(embed_dim, embed_dim) + self.output_proj = nn.Linear(embed_dim, embed_dim) + self.ms_deformable_attn_core = ms_deformable_attn + + self._reset_parameters() + + def _reset_parameters(self): + # sampling_offsets + constant_(self.sampling_offsets.weight) + thetas = paddle.arange(self.num_heads, dtype=paddle.float32) * ( + 2.0 * math.pi / self.num_heads + ) + grid_init = paddle.stack([thetas.cos(), thetas.sin()], -1) + grid_init = grid_init / grid_init.abs().max(-1, keepdim=True) + grid_init = grid_init.reshape([self.num_heads, 1, 1, 2]).tile( + [1, self.num_levels, self.num_points, 1] + ) + scaling = paddle.arange(1, self.num_points + 1, dtype=paddle.float32).reshape( + [1, 1, -1, 1] + ) + grid_init *= scaling + self.sampling_offsets.bias.set_value(grid_init.flatten()) + # attention_weights + constant_(self.attention_weights.weight) + constant_(self.attention_weights.bias) + # proj + xavier_uniform_(self.value_proj.weight) + constant_(self.value_proj.bias) + xavier_uniform_(self.output_proj.weight) + constant_(self.output_proj.bias) + + def forward( + self, + query, + reference_points, + value, + value_spatial_shapes, + value_level_start_index, + value_mask=None, + ): + """ + Args: + query (Tensor): [bs, query_length, C] + reference_points (Tensor): [bs, query_length, n_levels, 2], range in [0, 1], top-left (0,0), + bottom-right (1, 1), including padding area + value (Tensor): [bs, value_length, C] + value_spatial_shapes (Tensor): [n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})] + value_level_start_index (Tensor(int64)): [n_levels], [0, H_0*W_0, H_0*W_0+H_1*W_1, ...] 
+ value_mask (Tensor): [bs, value_length], True for non-padding elements, False for padding elements + + Returns: + output (Tensor): [bs, Length_{query}, C] + """ + bs, Len_q = query.shape[:2] + Len_v = value.shape[1] + assert int(value_spatial_shapes.prod(1).sum()) == Len_v + + value = self.value_proj(value) + if value_mask is not None: + value_mask = value_mask.astype(value.dtype).unsqueeze(-1) + value *= value_mask + value = value.reshape([bs, Len_v, self.num_heads, self.head_dim]) + + sampling_offsets = self.sampling_offsets(query).reshape( + [bs, Len_q, self.num_heads, self.num_levels, self.num_points, 2] + ) + attention_weights = self.attention_weights(query).reshape( + [bs, Len_q, self.num_heads, self.num_levels * self.num_points] + ) + attention_weights = F.softmax(attention_weights).reshape( + [bs, Len_q, self.num_heads, self.num_levels, self.num_points] + ) + + if reference_points.shape[-1] == 2: + offset_normalizer = value_spatial_shapes.flip([1]).reshape( + [1, 1, 1, self.num_levels, 1, 2] + ) + sampling_locations = reference_points.reshape( + [bs, Len_q, 1, self.num_levels, 1, 2] + ) + sampling_offsets / offset_normalizer.astype(sampling_offsets.dtype) + elif reference_points.shape[-1] == 4: + sampling_locations = ( + reference_points[:, :, None, :, None, :2] + + sampling_offsets + / self.num_points + * reference_points[:, :, None, :, None, 2:] + * 0.5 + ) + else: + raise ValueError( + "Last dim of reference_points must be 2 or 4, but get {} instead.".format( + reference_points.shape[-1] + ) + ) + + output = self.ms_deformable_attn_core( + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + ) + output = self.output_proj(output) + + return output + + +class DeformableTransformerEncoderLayer(nn.Layer): + def __init__( + self, + d_model=256, + n_head=8, + dim_feedforward=1024, + dropout=0.1, + activation="relu", + n_levels=4, + n_points=4, + lr_mult=0.1, + weight_attr=None, + bias_attr=None, + ): + 
super(DeformableTransformerEncoderLayer, self).__init__() + # self attention + self.self_attn = MSDeformableAttention( + d_model, n_head, n_levels, n_points, lr_mult + ) + self.dropout1 = nn.Dropout(dropout) + self.norm1 = nn.LayerNorm(d_model, weight_attr=weight_attr, bias_attr=bias_attr) + # ffn + self.linear1 = nn.Linear(d_model, dim_feedforward) + self.activation = getattr(F, activation) + self.dropout2 = nn.Dropout(dropout) + self.linear2 = nn.Linear(dim_feedforward, d_model) + self.dropout3 = nn.Dropout(dropout) + self.norm2 = nn.LayerNorm(d_model, weight_attr=weight_attr, bias_attr=bias_attr) + self._reset_parameters() + + def _reset_parameters(self): + linear_init_(self.linear1) + linear_init_(self.linear2) + xavier_uniform_(self.linear1.weight) + xavier_uniform_(self.linear2.weight) + + def with_pos_embed(self, tensor, pos): + return tensor if pos is None else tensor + pos + + def forward_ffn(self, src): + src2 = self.linear2(self.dropout2(self.activation(self.linear1(src)))) + src = src + self.dropout3(src2) + src = self.norm2(src) + return src + + def forward( + self, + src, + reference_points, + spatial_shapes, + level_start_index, + src_mask=None, + query_pos_embed=None, + ): + # self attention + src2 = self.self_attn( + self.with_pos_embed(src, query_pos_embed), + reference_points, + src, + spatial_shapes, + level_start_index, + src_mask, + ) + src = src + self.dropout1(src2) + src = self.norm1(src) + # ffn + src = self.forward_ffn(src) + + return src + + +class DeformableTransformerEncoder(nn.Layer): + __inject__ = ["encoder_layer"] + + def __init__(self, encoder_layer, num_layers, with_rp=-1): + super(DeformableTransformerEncoder, self).__init__() + self.layers = _get_clones(encoder_layer, num_layers) + self.num_layers = num_layers + assert with_rp <= num_layers + self.with_rp = with_rp + + @staticmethod + def get_reference_points(spatial_shapes, valid_ratios, offset=0.5): + valid_ratios = valid_ratios.unsqueeze(1) + reference_points = [] + for i, 
(H, W) in enumerate(spatial_shapes): + ref_y, ref_x = paddle.meshgrid( + paddle.arange(end=H) + offset, paddle.arange(end=W) + offset + ) + ref_y = ref_y.flatten().unsqueeze(0) / (valid_ratios[:, :, i, 1] * H) + ref_x = ref_x.flatten().unsqueeze(0) / (valid_ratios[:, :, i, 0] * W) + reference_points.append(paddle.stack((ref_x, ref_y), axis=-1)) + reference_points = paddle.concat(reference_points, 1).unsqueeze(2) + reference_points = reference_points * valid_ratios + return reference_points + + def forward( + self, + feat, + spatial_shapes, + level_start_index, + feat_mask=None, + query_pos_embed=None, + valid_ratios=None, + ): + if valid_ratios is None: + valid_ratios = paddle.ones([feat.shape[0], spatial_shapes.shape[0], 2]) + reference_points = self.get_reference_points(spatial_shapes, valid_ratios) + for i, layer in enumerate(self.layers): + if self.training and i < self.with_rp: + feat = recompute( + layer, + feat, + reference_points, + spatial_shapes, + level_start_index, + feat_mask, + query_pos_embed, + **{"preserve_rng_state": True, "use_reentrant": False}, + ) + else: + feat = layer( + feat, + reference_points, + spatial_shapes, + level_start_index, + feat_mask, + query_pos_embed, + ) + + return feat + + +class DeformableTransformerDecoderLayer(nn.Layer): + def __init__( + self, + d_model=256, + n_head=8, + dim_feedforward=1024, + dropout=0.1, + activation="relu", + n_levels=4, + n_points=4, + lr_mult=0.1, + weight_attr=None, + bias_attr=None, + ): + super(DeformableTransformerDecoderLayer, self).__init__() + + # self attention + self.self_attn = MultiHeadAttention(d_model, n_head, dropout=dropout) + self.dropout1 = nn.Dropout(dropout) + self.norm1 = nn.LayerNorm(d_model, weight_attr=weight_attr, bias_attr=bias_attr) + + # cross attention + self.cross_attn = MSDeformableAttention( + d_model, n_head, n_levels, n_points, lr_mult + ) + self.dropout2 = nn.Dropout(dropout) + self.norm2 = nn.LayerNorm(d_model, weight_attr=weight_attr, bias_attr=bias_attr) + + # 
ffn + self.linear1 = nn.Linear(d_model, dim_feedforward) + self.activation = getattr(F, activation) + self.dropout3 = nn.Dropout(dropout) + self.linear2 = nn.Linear(dim_feedforward, d_model) + self.dropout4 = nn.Dropout(dropout) + self.norm3 = nn.LayerNorm(d_model, weight_attr=weight_attr, bias_attr=bias_attr) + self._reset_parameters() + + def _reset_parameters(self): + linear_init_(self.linear1) + linear_init_(self.linear2) + xavier_uniform_(self.linear1.weight) + xavier_uniform_(self.linear2.weight) + + def with_pos_embed(self, tensor, pos): + return tensor if pos is None else tensor + pos + + def forward_ffn(self, tgt): + tgt2 = self.linear2(self.dropout3(self.activation(self.linear1(tgt)))) + tgt = tgt + self.dropout4(tgt2) + tgt = self.norm3(tgt) + return tgt + + def forward( + self, + tgt, + reference_points, + memory, + memory_spatial_shapes, + memory_level_start_index, + memory_mask=None, + query_pos_embed=None, + ): + # self attention + q = k = self.with_pos_embed(tgt, query_pos_embed) + tgt2 = self.self_attn(q, k, value=tgt) + tgt = tgt + self.dropout1(tgt2) + tgt = self.norm1(tgt) + + # cross attention + tgt2 = self.cross_attn( + self.with_pos_embed(tgt, query_pos_embed), + reference_points, + memory, + memory_spatial_shapes, + memory_level_start_index, + memory_mask, + ) + tgt = tgt + self.dropout2(tgt2) + tgt = self.norm2(tgt) + + # ffn + tgt = self.forward_ffn(tgt) + + return tgt + + +class DeformableTransformerDecoder(nn.Layer): + def __init__(self, decoder_layer, num_layers, return_intermediate=False): + super(DeformableTransformerDecoder, self).__init__() + self.layers = _get_clones(decoder_layer, num_layers) + self.num_layers = num_layers + self.return_intermediate = return_intermediate + + def forward( + self, + tgt, + reference_points, + memory, + memory_spatial_shapes, + memory_level_start_index, + memory_mask=None, + query_pos_embed=None, + ): + output = tgt + intermediate = [] + for lid, layer in enumerate(self.layers): + output = layer( + 
output, + reference_points, + memory, + memory_spatial_shapes, + memory_level_start_index, + memory_mask, + query_pos_embed, + ) + + if self.return_intermediate: + intermediate.append(output) + + if self.return_intermediate: + return paddle.stack(intermediate) + + return output.unsqueeze(0) + + +class DeformableTransformer(nn.Layer): + __shared__ = ["hidden_dim"] + + def __init__( + self, + num_queries=300, + position_embed_type="sine", + return_intermediate_dec=True, + in_feats_channel=[512, 1024, 2048], + num_feature_levels=4, + num_encoder_points=4, + num_decoder_points=4, + hidden_dim=256, + nhead=8, + num_encoder_layers=6, + num_decoder_layers=6, + dim_feedforward=1024, + dropout=0.1, + activation="relu", + lr_mult=0.1, + pe_temperature=10000, + pe_offset=-0.5, + ): + super(DeformableTransformer, self).__init__() + assert position_embed_type in [ + "sine", + "learned", + ], f"ValueError: position_embed_type not supported {position_embed_type}!" + assert len(in_feats_channel) <= num_feature_levels + + self.hidden_dim = hidden_dim + self.nhead = nhead + self.num_feature_levels = num_feature_levels + + encoder_layer = DeformableTransformerEncoderLayer( + hidden_dim, + nhead, + dim_feedforward, + dropout, + activation, + num_feature_levels, + num_encoder_points, + lr_mult, + ) + self.encoder = DeformableTransformerEncoder(encoder_layer, num_encoder_layers) + + decoder_layer = DeformableTransformerDecoderLayer( + hidden_dim, + nhead, + dim_feedforward, + dropout, + activation, + num_feature_levels, + num_decoder_points, + ) + self.decoder = DeformableTransformerDecoder( + decoder_layer, num_decoder_layers, return_intermediate_dec + ) + + self.level_embed = nn.Embedding(num_feature_levels, hidden_dim) + self.tgt_embed = nn.Embedding(num_queries, hidden_dim) + self.query_pos_embed = nn.Embedding(num_queries, hidden_dim) + + self.reference_points = nn.Linear( + hidden_dim, + 2, + weight_attr=ParamAttr(learning_rate=lr_mult), + 
bias_attr=ParamAttr(learning_rate=lr_mult), + ) + + self.input_proj = nn.LayerList() + for in_channels in in_feats_channel: + self.input_proj.append( + nn.Sequential( + nn.Conv2D(in_channels, hidden_dim, kernel_size=1), + nn.GroupNorm(32, hidden_dim), + ) + ) + in_channels = in_feats_channel[-1] + for _ in range(num_feature_levels - len(in_feats_channel)): + self.input_proj.append( + nn.Sequential( + nn.Conv2D( + in_channels, hidden_dim, kernel_size=3, stride=2, padding=1 + ), + nn.GroupNorm(32, hidden_dim), + ) + ) + in_channels = hidden_dim + + self.position_embedding = PositionEmbedding( + hidden_dim // 2, + temperature=pe_temperature, + normalize=True if position_embed_type == "sine" else False, + embed_type=position_embed_type, + offset=pe_offset, + eps=1e-4, + ) + + self._reset_parameters() + + def _reset_parameters(self): + normal_(self.level_embed.weight) + normal_(self.tgt_embed.weight) + normal_(self.query_pos_embed.weight) + xavier_uniform_(self.reference_points.weight) + constant_(self.reference_points.bias) + for l in self.input_proj: + xavier_uniform_(l[0].weight) + constant_(l[0].bias) + + @classmethod + def from_config(cls, cfg, input_shape): + return { + "in_feats_channel": [i.channels for i in input_shape], + } + + def forward(self, src_feats, src_mask=None, *args, **kwargs): + srcs = [] + for i in range(len(src_feats)): + srcs.append(self.input_proj[i](src_feats[i])) + if self.num_feature_levels > len(srcs): + len_srcs = len(srcs) + for i in range(len_srcs, self.num_feature_levels): + if i == len_srcs: + srcs.append(self.input_proj[i](src_feats[-1])) + else: + srcs.append(self.input_proj[i](srcs[-1])) + src_flatten = [] + mask_flatten = [] + lvl_pos_embed_flatten = [] + spatial_shapes = [] + valid_ratios = [] + for level, src in enumerate(srcs): + src_shape = paddle.shape(src) + bs = src_shape[0:1] + h = src_shape[2:3] + w = src_shape[3:4] + spatial_shapes.append(paddle.concat([h, w])) + src = src.flatten(2).transpose([0, 2, 1]) + 
src_flatten.append(src) + if src_mask is not None: + mask = F.interpolate(src_mask.unsqueeze(0), size=(h, w))[0] + else: + mask = paddle.ones([bs, h, w]) + valid_ratios.append(get_valid_ratio(mask)) + pos_embed = self.position_embedding(mask).flatten(1, 2) + lvl_pos_embed = pos_embed + self.level_embed.weight[level] + lvl_pos_embed_flatten.append(lvl_pos_embed) + mask = mask.flatten(1) + mask_flatten.append(mask) + src_flatten = paddle.concat(src_flatten, 1) + mask_flatten = None if src_mask is None else paddle.concat(mask_flatten, 1) + lvl_pos_embed_flatten = paddle.concat(lvl_pos_embed_flatten, 1) + # [l, 2] + spatial_shapes = paddle.to_tensor(paddle.stack(spatial_shapes).astype("int64")) + # [l], 每一个level的起始index + level_start_index = paddle.concat( + [paddle.zeros([1], dtype="int64"), spatial_shapes.prod(1).cumsum(0)[:-1]] + ) + # [b, l, 2] + valid_ratios = paddle.stack(valid_ratios, 1) + + # encoder + memory = self.encoder( + src_flatten, + spatial_shapes, + level_start_index, + mask_flatten, + lvl_pos_embed_flatten, + valid_ratios, + ) + + # prepare input for decoder + bs, _, c = memory.shape + query_embed = self.query_pos_embed.weight.unsqueeze(0).tile([bs, 1, 1]) + tgt = self.tgt_embed.weight.unsqueeze(0).tile([bs, 1, 1]) + reference_points = F.sigmoid(self.reference_points(query_embed)) + reference_points_input = reference_points.unsqueeze(2) * valid_ratios.unsqueeze( + 1 + ) + + # decoder + hs = self.decoder( + tgt, + reference_points_input, + memory, + spatial_shapes, + level_start_index, + mask_flatten, + query_embed, + ) + + return (hs, memory, reference_points) + + +class QRDeformableTransformerDecoder(DeformableTransformerDecoder): + def __init__( + self, + decoder_layer, + num_layers, + start_q=None, + end_q=None, + return_intermediate=False, + ): + super(QRDeformableTransformerDecoder, self).__init__( + decoder_layer, num_layers, return_intermediate=return_intermediate + ) + self.start_q = start_q + self.end_q = end_q + + def forward( + self, + 
tgt, + reference_points, + memory, + memory_spatial_shapes, + memory_level_start_index, + memory_mask=None, + query_pos_embed=None, + ): + + if not self.training: + return super(QRDeformableTransformerDecoder, self).forward( + tgt, + reference_points, + memory, + memory_spatial_shapes, + memory_level_start_index, + memory_mask=memory_mask, + query_pos_embed=query_pos_embed, + ) + + batchsize = tgt.shape[0] + query_list_reserve = [tgt] + intermediate = [] + for lid, layer in enumerate(self.layers): + + start_q = self.start_q[lid] + end_q = self.end_q[lid] + query_list = query_list_reserve.copy()[start_q:end_q] + + # prepare for parallel process + output = paddle.concat(query_list, axis=0) + fakesetsize = int(output.shape[0] / batchsize) + reference_points_tiled = reference_points.tile([fakesetsize, 1, 1, 1]) + + memory_tiled = memory.tile([fakesetsize, 1, 1]) + query_pos_embed_tiled = query_pos_embed.tile([fakesetsize, 1, 1]) + memory_mask_tiled = memory_mask.tile([fakesetsize, 1]) + + output = layer( + output, + reference_points_tiled, + memory_tiled, + memory_spatial_shapes, + memory_level_start_index, + memory_mask_tiled, + query_pos_embed_tiled, + ) + + for i in range(fakesetsize): + query_list_reserve.append(output[batchsize * i : batchsize * (i + 1)]) + + if self.return_intermediate: + for i in range(fakesetsize): + intermediate.append(output[batchsize * i : batchsize * (i + 1)]) + + if self.return_intermediate: + return paddle.stack(intermediate) + + return output.unsqueeze(0) + + +class QRDeformableTransformer(DeformableTransformer): + + def __init__( + self, + num_queries=300, + position_embed_type="sine", + return_intermediate_dec=True, + in_feats_channel=[512, 1024, 2048], + num_feature_levels=4, + num_encoder_points=4, + num_decoder_points=4, + hidden_dim=256, + nhead=8, + num_encoder_layers=6, + num_decoder_layers=6, + dim_feedforward=1024, + dropout=0.1, + activation="relu", + lr_mult=0.1, + pe_temperature=10000, + pe_offset=-0.5, + start_q=None, + 
end_q=None, + ): + super(QRDeformableTransformer, self).__init__( + num_queries=num_queries, + position_embed_type=position_embed_type, + return_intermediate_dec=return_intermediate_dec, + in_feats_channel=in_feats_channel, + num_feature_levels=num_feature_levels, + num_encoder_points=num_encoder_points, + num_decoder_points=num_decoder_points, + hidden_dim=hidden_dim, + nhead=nhead, + num_encoder_layers=num_encoder_layers, + num_decoder_layers=num_decoder_layers, + dim_feedforward=dim_feedforward, + dropout=dropout, + activation=activation, + lr_mult=lr_mult, + pe_temperature=pe_temperature, + pe_offset=pe_offset, + ) + + decoder_layer = DeformableTransformerDecoderLayer( + hidden_dim, + nhead, + dim_feedforward, + dropout, + activation, + num_feature_levels, + num_decoder_points, + ) + self.decoder = QRDeformableTransformerDecoder( + decoder_layer, num_decoder_layers, start_q, end_q, return_intermediate_dec + ) diff --git a/paddlex/inference/models/object_detection/modeling/rtdetrl_modules/modules/detr_loss.py b/paddlex/inference/models/object_detection/modeling/rtdetrl_modules/modules/detr_loss.py new file mode 100644 index 0000000000..c18b59a4eb --- /dev/null +++ b/paddlex/inference/models/object_detection/modeling/rtdetrl_modules/modules/detr_loss.py @@ -0,0 +1,944 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import absolute_import, division, print_function + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from .detr_ops import bbox_iou +from .utils import ( + bbox_cxcywh_to_xyxy, + mal_loss_with_logits, + sigmoid_focal_loss, + varifocal_loss_with_logits, +) + +__all__ = ["DETRLoss", "DINOLoss", "RTDETRv3Loss"] + + +class GIoULoss(object): + """ + Generalized Intersection over Union, see https://arxiv.org/abs/1902.09630 + Args: + loss_weight (float): giou loss weight, default as 1 + eps (float): epsilon to avoid divide by zero, default as 1e-10 + reduction (string): Options are "none", "mean" and "sum". default as none + """ + + def __init__(self, loss_weight=1.0, eps=1e-10, reduction="none"): + self.loss_weight = loss_weight + self.eps = eps + assert reduction in ("none", "mean", "sum") + self.reduction = reduction + + def bbox_overlap(self, box1, box2, eps=1e-10): + """calculate the iou of box1 and box2 + Args: + box1 (Tensor): box1 with the shape (..., 4) + box2 (Tensor): box1 with the shape (..., 4) + eps (float): epsilon to avoid divide by zero + Return: + iou (Tensor): iou of box1 and box2 + overlap (Tensor): overlap of box1 and box2 + union (Tensor): union of box1 and box2 + """ + x1, y1, x2, y2 = box1 + x1g, y1g, x2g, y2g = box2 + + xkis1 = paddle.maximum(x1, x1g) + ykis1 = paddle.maximum(y1, y1g) + xkis2 = paddle.minimum(x2, x2g) + ykis2 = paddle.minimum(y2, y2g) + w_inter = (xkis2 - xkis1).clip(0) + h_inter = (ykis2 - ykis1).clip(0) + overlap = w_inter * h_inter + + area1 = (x2 - x1) * (y2 - y1) + area2 = (x2g - x1g) * (y2g - y1g) + union = area1 + area2 - overlap + eps + iou = overlap / union + + return iou, overlap, union + + def __call__(self, pbox, gbox, iou_weight=1.0, loc_reweight=None): + x1, y1, x2, y2 = paddle.split(pbox, num_or_sections=4, axis=-1) + x1g, y1g, x2g, y2g = paddle.split(gbox, num_or_sections=4, axis=-1) + box1 = [x1, y1, x2, y2] + box2 = [x1g, y1g, x2g, y2g] + iou, overlap, union = 
self.bbox_overlap(box1, box2, self.eps) + xc1 = paddle.minimum(x1, x1g) + yc1 = paddle.minimum(y1, y1g) + xc2 = paddle.maximum(x2, x2g) + yc2 = paddle.maximum(y2, y2g) + + area_c = (xc2 - xc1) * (yc2 - yc1) + self.eps + miou = iou - ((area_c - union) / area_c) + if loc_reweight is not None: + loc_reweight = paddle.reshape(loc_reweight, shape=(-1, 1)) + loc_thresh = 0.9 + giou = 1 - (1 - loc_thresh) * miou - loc_thresh * miou * loc_reweight + else: + giou = 1 - miou + if self.reduction == "none": + loss = giou + elif self.reduction == "sum": + loss = paddle.sum(giou * iou_weight) + else: + loss = paddle.mean(giou * iou_weight) + return loss * self.loss_weight + + +class DETRLoss(nn.Layer): + __shared__ = ["num_classes", "use_focal_loss"] + __inject__ = ["matcher"] + + def __init__( + self, + num_classes=80, + matcher="HungarianMatcher", + loss_coeff={ + "class": 1, + "bbox": 5, + "giou": 2, + "no_object": 0.1, + "mask": 1, + "dice": 1, + }, + aux_loss=True, + use_focal_loss=True, + use_mal=False, + use_vfl=False, + vfl_iou_type="bbox", + use_uni_match=False, + uni_match_ind=0, + ): + r""" + Args: + num_classes (int): The number of classes. + matcher (HungarianMatcher): It computes an assignment between the targets + and the predictions of the network. + loss_coeff (dict): The coefficient of loss. + aux_loss (bool): If 'aux_loss = True', loss at each decoder layer are to be used. + use_focal_loss (bool): Use focal loss or not. 
+ """ + super(DETRLoss, self).__init__() + + self.num_classes = num_classes + self.matcher = matcher + self.loss_coeff = loss_coeff + self.aux_loss = aux_loss + self.use_focal_loss = use_focal_loss + self.use_mal = use_mal + self.use_vfl = use_vfl + self.vfl_iou_type = vfl_iou_type + self.use_uni_match = use_uni_match + self.uni_match_ind = uni_match_ind + + if not self.use_focal_loss: + self.loss_coeff["class"] = paddle.full( + [num_classes + 1], loss_coeff["class"] + ) + self.loss_coeff["class"][-1] = loss_coeff["no_object"] + self.giou_loss = GIoULoss() + + def _get_loss_class( + self, + logits, + gt_class, + match_indices, + bg_index, + num_gts, + postfix="", + iou_score=None, + gt_score=None, + ): + # logits: [b, query, num_classes], gt_class: list[[n, 1]] + name_class = "loss_class" + postfix + + target_label = paddle.full(logits.shape[:2], bg_index, dtype="int64") + bs, num_query_objects = target_label.shape + num_gt = sum(len(a) for a in gt_class) + if num_gt > 0: + index, updates = self._get_index_updates( + num_query_objects, gt_class, match_indices + ) + target_label = paddle.scatter( + target_label.reshape([-1, 1]), index, updates.astype("int64") + ) + target_label = target_label.reshape([bs, num_query_objects]) + if self.use_focal_loss: + target_label = F.one_hot(target_label, self.num_classes + 1)[..., :-1] + if iou_score is not None and (self.use_vfl or self.use_mal): + if gt_score is not None: + target_score = paddle.zeros([bs, num_query_objects]) + target_score = paddle.scatter( + target_score.reshape([-1, 1]), index, gt_score + ) + target_score = ( + target_score.reshape([bs, num_query_objects, 1]) * target_label + ) + + target_score_iou = paddle.zeros([bs, num_query_objects]) + target_score_iou = paddle.scatter( + target_score_iou.reshape([-1, 1]), index, iou_score + ) + target_score_iou = ( + target_score_iou.reshape([bs, num_query_objects, 1]) + * target_label + ) + target_score = paddle.multiply(target_score, target_score_iou) + if 
self.use_mal: + loss_ = self.loss_coeff["class"] * mal_loss_with_logits( + logits, + target_score, + target_label, + num_gts / num_query_objects, + ) + else: + loss_ = self.loss_coeff["class"] * varifocal_loss_with_logits( + logits, + target_score, + target_label, + num_gts / num_query_objects, + ) + else: + target_score = paddle.zeros([bs, num_query_objects]) + if num_gt > 0: + target_score = paddle.scatter( + target_score.reshape([-1, 1]), index, iou_score + ) + target_score = ( + target_score.reshape([bs, num_query_objects, 1]) * target_label + ) + if self.use_mal: + loss_ = self.loss_coeff["class"] * mal_loss_with_logits( + logits, + target_score, + target_label, + num_gts / num_query_objects, + ) + else: + loss_ = self.loss_coeff["class"] * varifocal_loss_with_logits( + logits, + target_score, + target_label, + num_gts / num_query_objects, + ) + else: + loss_ = self.loss_coeff["class"] * sigmoid_focal_loss( + logits, target_label, num_gts / num_query_objects + ) + else: + loss_ = F.cross_entropy( + logits, target_label, weight=self.loss_coeff["class"] + ) + return {name_class: loss_} + + def _get_loss_bbox(self, boxes, gt_bbox, match_indices, num_gts, postfix=""): + # boxes: [b, query, 4], gt_bbox: list[[n, 4]] + name_bbox = "loss_bbox" + postfix + name_giou = "loss_giou" + postfix + + loss = dict() + if sum(len(a) for a in gt_bbox) == 0: + loss[name_bbox] = paddle.to_tensor([0.0]) + loss[name_giou] = paddle.to_tensor([0.0]) + return loss + + src_bbox, target_bbox = self._get_src_target_assign( + boxes, gt_bbox, match_indices + ) + loss[name_bbox] = ( + self.loss_coeff["bbox"] + * F.l1_loss(src_bbox, target_bbox, reduction="sum") + / num_gts + ) + loss[name_giou] = self.giou_loss( + bbox_cxcywh_to_xyxy(src_bbox), bbox_cxcywh_to_xyxy(target_bbox) + ) + loss[name_giou] = loss[name_giou].sum() / num_gts + loss[name_giou] = self.loss_coeff["giou"] * loss[name_giou] + return loss + + def _get_loss_mask(self, masks, gt_mask, match_indices, num_gts, postfix=""): + # 
masks: [b, query, h, w], gt_mask: list[[n, H, W]] + name_mask = "loss_mask" + postfix + name_dice = "loss_dice" + postfix + + loss = dict() + if sum(len(a) for a in gt_mask) == 0: + loss[name_mask] = paddle.to_tensor([0.0]) + loss[name_dice] = paddle.to_tensor([0.0]) + return loss + + src_masks, target_masks = self._get_src_target_assign( + masks, gt_mask, match_indices + ) + src_masks = F.interpolate( + src_masks.unsqueeze(0), size=target_masks.shape[-2:], mode="bilinear" + )[0] + loss[name_mask] = self.loss_coeff["mask"] * F.sigmoid_focal_loss( + src_masks, target_masks, paddle.to_tensor([num_gts], dtype="float32") + ) + loss[name_dice] = self.loss_coeff["dice"] * self._dice_loss( + src_masks, target_masks, num_gts + ) + return loss + + def _dice_loss(self, inputs, targets, num_gts): + inputs = F.sigmoid(inputs) + inputs = inputs.flatten(1) + targets = targets.flatten(1) + numerator = 2 * (inputs * targets).sum(1) + denominator = inputs.sum(-1) + targets.sum(-1) + loss = 1 - (numerator + 1) / (denominator + 1) + return loss.sum() / num_gts + + def _get_loss_aux( + self, + boxes, + logits, + gt_bbox, + gt_class, + bg_index, + num_gts, + dn_match_indices=None, + postfix="", + masks=None, + gt_mask=None, + gt_score=None, + ): + loss_class = [] + loss_bbox, loss_giou = [], [] + loss_mask, loss_dice = [], [] + if dn_match_indices is not None: + match_indices = dn_match_indices + elif self.use_uni_match: + match_indices = self.matcher( + boxes[self.uni_match_ind], + logits[self.uni_match_ind], + gt_bbox, + gt_class, + masks=masks[self.uni_match_ind] if masks is not None else None, + gt_mask=gt_mask, + ) + for i, (aux_boxes, aux_logits) in enumerate(zip(boxes, logits)): + aux_masks = masks[i] if masks is not None else None + if not self.use_uni_match and dn_match_indices is None: + match_indices = self.matcher( + aux_boxes, + aux_logits, + gt_bbox, + gt_class, + masks=aux_masks, + gt_mask=gt_mask, + ) + if self.use_vfl or self.use_mal: + if sum(len(a) for a in gt_bbox) 
> 0: + src_bbox, target_bbox = self._get_src_target_assign( + aux_boxes.detach(), gt_bbox, match_indices + ) + iou_score = bbox_iou( + bbox_cxcywh_to_xyxy(src_bbox).split(4, -1), + bbox_cxcywh_to_xyxy(target_bbox).split(4, -1), + ) + else: + iou_score = None + if gt_score is not None: + _, target_score = self._get_src_target_assign( + logits[-1].detach(), gt_score, match_indices + ) + else: + iou_score = None + loss_class.append( + self._get_loss_class( + aux_logits, + gt_class, + match_indices, + bg_index, + num_gts, + postfix, + iou_score, + gt_score=target_score if gt_score is not None else None, + )["loss_class" + postfix] + ) + loss_ = self._get_loss_bbox( + aux_boxes, gt_bbox, match_indices, num_gts, postfix + ) + loss_bbox.append(loss_["loss_bbox" + postfix]) + loss_giou.append(loss_["loss_giou" + postfix]) + if masks is not None and gt_mask is not None: + loss_ = self._get_loss_mask( + aux_masks, gt_mask, match_indices, num_gts, postfix + ) + loss_mask.append(loss_["loss_mask" + postfix]) + loss_dice.append(loss_["loss_dice" + postfix]) + loss = { + "loss_class_aux" + postfix: paddle.add_n(loss_class), + "loss_bbox_aux" + postfix: paddle.add_n(loss_bbox), + "loss_giou_aux" + postfix: paddle.add_n(loss_giou), + } + if masks is not None and gt_mask is not None: + loss["loss_mask_aux" + postfix] = paddle.add_n(loss_mask) + loss["loss_dice_aux" + postfix] = paddle.add_n(loss_dice) + return loss + + def _get_index_updates(self, num_query_objects, target, match_indices): + batch_idx = paddle.concat( + [paddle.full_like(src, i) for i, (src, _) in enumerate(match_indices)] + ) + src_idx = paddle.concat([src for (src, _) in match_indices]) + src_idx += batch_idx * num_query_objects + if "npu" in paddle.device.get_device(): + target_assign = paddle.concat( + [ + paddle.gather(t.to(paddle.int32), dst.to(paddle.int32), axis=0) + for t, (_, dst) in zip(target, match_indices) + ] + ) + else: + target_assign = paddle.concat( + [ + paddle.gather(t, dst, axis=0) + for t, 
(_, dst) in zip(target, match_indices) + ] + ) + return src_idx, target_assign + + def _get_src_target_assign(self, src, target, match_indices): + src_assign = paddle.concat( + [ + ( + paddle.gather(t, I, axis=0) + if len(I) > 0 + else paddle.zeros([0, t.shape[-1]]) + ) + for t, (I, _) in zip(src, match_indices) + ] + ) + target_assign = paddle.concat( + [ + ( + paddle.gather(t, J, axis=0) + if len(J) > 0 + else paddle.zeros([0, t.shape[-1]]) + ) + for t, (_, J) in zip(target, match_indices) + ] + ) + return src_assign, target_assign + + def _get_num_gts(self, targets, dtype="float32"): + num_gts = sum(len(a) for a in targets) + num_gts = paddle.to_tensor([num_gts], dtype=dtype) + if paddle.distributed.get_world_size() > 1: + paddle.distributed.all_reduce(num_gts) + num_gts /= paddle.distributed.get_world_size() + num_gts = paddle.clip(num_gts, min=1.0) + return num_gts + + def _get_prediction_loss( + self, + boxes, + logits, + gt_bbox, + gt_class, + masks=None, + gt_mask=None, + postfix="", + dn_match_indices=None, + num_gts=1, + gt_score=None, + ): + if dn_match_indices is None: + match_indices = self.matcher( + boxes, logits, gt_bbox, gt_class, masks=masks, gt_mask=gt_mask + ) + else: + match_indices = dn_match_indices + + if self.use_vfl or self.use_mal: + if gt_score is not None: # ssod + _, target_score = self._get_src_target_assign( + logits[-1].detach(), gt_score, match_indices + ) + elif sum(len(a) for a in gt_bbox) > 0: + if self.vfl_iou_type == "bbox": + src_bbox, target_bbox = self._get_src_target_assign( + boxes.detach(), gt_bbox, match_indices + ) + iou_score = bbox_iou( + bbox_cxcywh_to_xyxy(src_bbox).split(4, -1), + bbox_cxcywh_to_xyxy(target_bbox).split(4, -1), + ) + elif self.vfl_iou_type == "mask": + assert ( + masks is not None and gt_mask is not None, + "Make sure the input has `mask` and `gt_mask`", + ) + assert sum(len(a) for a in gt_mask) > 0 + src_mask, target_mask = self._get_src_target_assign( + masks.detach(), gt_mask, match_indices + ) 
+ src_mask = F.interpolate( + src_mask.unsqueeze(0), + scale_factor=2, + mode="bilinear", + align_corners=False, + ).squeeze(0) + target_mask = F.interpolate( + target_mask.unsqueeze(0), + size=src_mask.shape[-2:], + mode="bilinear", + align_corners=False, + ).squeeze(0) + src_mask = src_mask.flatten(1) + src_mask = F.sigmoid(src_mask) + src_mask = paddle.where(src_mask > 0.5, 1.0, 0.0).astype( + masks.dtype + ) + target_mask = target_mask.flatten(1) + target_mask = paddle.where(target_mask > 0.5, 1.0, 0.0).astype( + masks.dtype + ) + inter = (src_mask * target_mask).sum(1) + union = src_mask.sum(1) + target_mask.sum(1) - inter + iou_score = (inter + 1e-2) / (union + 1e-2) + iou_score = iou_score.unsqueeze(-1) + else: + iou_score = None + else: + iou_score = None + else: + iou_score = None + + loss = dict() + loss.update( + self._get_loss_class( + logits, + gt_class, + match_indices, + self.num_classes, + num_gts, + postfix, + iou_score, + gt_score=target_score if gt_score is not None else None, + ) + ) + loss.update( + self._get_loss_bbox(boxes, gt_bbox, match_indices, num_gts, postfix) + ) + if masks is not None and gt_mask is not None: + loss.update( + self._get_loss_mask(masks, gt_mask, match_indices, num_gts, postfix) + ) + return loss + + def forward( + self, + boxes, + logits, + gt_bbox, + gt_class, + masks=None, + gt_mask=None, + postfix="", + gt_score=None, + **kwargs + ): + r""" + Args: + boxes (Tensor): [l, b, query, 4] + logits (Tensor): [l, b, query, num_classes] + gt_bbox (List(Tensor)): list[[n, 4]] + gt_class (List(Tensor)): list[[n, 1]] + masks (Tensor, optional): [l, b, query, h, w] + gt_mask (List(Tensor), optional): list[[n, H, W]] + postfix (str): postfix of loss name + """ + + dn_match_indices = kwargs.get("dn_match_indices", None) + num_gts = kwargs.get("num_gts", None) + if num_gts is None: + num_gts = self._get_num_gts(gt_class) + + total_loss = self._get_prediction_loss( + boxes[-1], + logits[-1], + gt_bbox, + gt_class, + masks=masks[-1] 
if masks is not None else None, + gt_mask=gt_mask, + postfix=postfix, + dn_match_indices=dn_match_indices, + num_gts=num_gts, + gt_score=gt_score if gt_score is not None else None, + ) + + if self.aux_loss: + total_loss.update( + self._get_loss_aux( + boxes[:-1], + logits[:-1], + gt_bbox, + gt_class, + self.num_classes, + num_gts, + dn_match_indices, + postfix, + masks=masks[:-1] if masks is not None else None, + gt_mask=gt_mask, + gt_score=gt_score if gt_score is not None else None, + ) + ) + + return total_loss + + +class DINOLoss(DETRLoss): + def forward( + self, + boxes, + logits, + gt_bbox, + gt_class, + masks=None, + gt_mask=None, + postfix="", + dn_out_bboxes=None, + dn_out_logits=None, + dn_meta=None, + gt_score=None, + **kwargs + ): + num_gts = self._get_num_gts(gt_class) + total_loss = super(DINOLoss, self).forward( + boxes, logits, gt_bbox, gt_class, num_gts=num_gts, gt_score=gt_score + ) + + if dn_meta is not None: + dn_positive_idx, dn_num_group = ( + dn_meta["dn_positive_idx"], + dn_meta["dn_num_group"], + ) + assert len(gt_class) == len(dn_positive_idx) + + # denoising match indices + dn_match_indices = self.get_dn_match_indices( + gt_class, dn_positive_idx, dn_num_group + ) + + # compute denoising training loss + num_gts *= dn_num_group + dn_loss = super(DINOLoss, self).forward( + dn_out_bboxes, + dn_out_logits, + gt_bbox, + gt_class, + postfix="_dn", + dn_match_indices=dn_match_indices, + num_gts=num_gts, + gt_score=gt_score, + ) + total_loss.update(dn_loss) + else: + total_loss.update( + {k + "_dn": paddle.to_tensor([0.0]) for k in total_loss.keys()} + ) + + return total_loss + + @staticmethod + def get_dn_match_indices(labels, dn_positive_idx, dn_num_group): + dn_match_indices = [] + for i in range(len(labels)): + num_gt = len(labels[i]) + if num_gt > 0: + gt_idx = paddle.arange(end=num_gt, dtype="int64") + gt_idx = gt_idx.tile([dn_num_group]) + assert len(dn_positive_idx[i]) == len(gt_idx) + dn_match_indices.append((dn_positive_idx[i], gt_idx)) 
+ else: + dn_match_indices.append( + (paddle.zeros([0], dtype="int64"), paddle.zeros([0], dtype="int64")) + ) + return dn_match_indices + + +class RTDETRv3Loss(DETRLoss): + def forward( + self, + boxes, + logits, + gt_bbox, + gt_class, + masks=None, + gt_mask=None, + postfix="", + dn_out_bboxes=None, + dn_out_logits=None, + dn_meta=None, + gt_score=None, + o2m=1, + **kwargs + ): + if o2m != 1: + gt_boxes_copy = [box.tile([o2m, 1]) for box in gt_bbox] + gt_class_copy = [label.tile([o2m, 1]) for label in gt_class] + else: + gt_boxes_copy = gt_bbox + gt_class_copy = gt_class + num_gts_copy = self._get_num_gts(gt_class_copy) + total_loss = self._get_prediction_loss( + boxes[-1], + logits[-1], + gt_boxes_copy, + gt_class_copy, + masks=masks[-1] if masks is not None else None, + gt_mask=gt_mask, + postfix=postfix, + dn_match_indices=None, + num_gts=num_gts_copy, + gt_score=gt_score if gt_score is not None else None, + ) + + if self.aux_loss: + total_loss.update( + self._get_loss_aux( + boxes[:-1], + logits[:-1], + gt_boxes_copy, + gt_class_copy, + self.num_classes, + num_gts_copy, + dn_match_indices=None, + postfix=postfix, + masks=masks[:-1] if masks is not None else None, + gt_mask=gt_mask, + gt_score=gt_score if gt_score is not None else None, + ) + ) + + if dn_meta is not None: + num_gts = self._get_num_gts(gt_class) + dn_positive_idx, dn_num_group = ( + dn_meta["dn_positive_idx"], + dn_meta["dn_num_group"], + ) + assert len(gt_class) == len(dn_positive_idx) + + # denoising match indices + dn_match_indices = self.get_dn_match_indices( + gt_class, dn_positive_idx, dn_num_group + ) + + # compute denoising training loss + num_gts *= dn_num_group + dn_loss = super(RTDETRv3Loss, self).forward( + dn_out_bboxes, + dn_out_logits, + gt_bbox, + gt_class, + postfix="_dn", + dn_match_indices=dn_match_indices, + num_gts=num_gts, + gt_score=gt_score, + ) + total_loss.update(dn_loss) + else: + total_loss.update( + {k + "_dn": paddle.to_tensor([0.0]) for k in total_loss.keys()} + ) 
+ + return total_loss + + @staticmethod + def get_dn_match_indices(labels, dn_positive_idx, dn_num_group): + dn_match_indices = [] + for i in range(len(labels)): + num_gt = len(labels[i]) + if num_gt > 0: + gt_idx = paddle.arange(end=num_gt, dtype="int64") + gt_idx = gt_idx.tile([dn_num_group]) + assert len(dn_positive_idx[i]) == len(gt_idx) + dn_match_indices.append((dn_positive_idx[i], gt_idx)) + else: + dn_match_indices.append( + (paddle.zeros([0], dtype="int64"), paddle.zeros([0], dtype="int64")) + ) + return dn_match_indices + + +class MaskDINOLoss(DETRLoss): + __shared__ = ["num_classes", "use_focal_loss", "num_sample_points"] + __inject__ = ["matcher"] + + def __init__( + self, + num_classes=80, + matcher="HungarianMatcher", + loss_coeff={"class": 4, "bbox": 5, "giou": 2, "mask": 5, "dice": 5}, + aux_loss=True, + use_focal_loss=False, + use_vfl=False, + vfl_iou_type="bbox", + num_sample_points=12544, + oversample_ratio=3.0, + important_sample_ratio=0.75, + ): + super(MaskDINOLoss, self).__init__( + num_classes, + matcher, + loss_coeff, + aux_loss, + use_focal_loss, + use_vfl, + vfl_iou_type, + ) + assert oversample_ratio >= 1 + assert important_sample_ratio <= 1 and important_sample_ratio >= 0 + + self.num_sample_points = num_sample_points + self.oversample_ratio = oversample_ratio + self.important_sample_ratio = important_sample_ratio + self.num_oversample_points = int(num_sample_points * oversample_ratio) + self.num_important_points = int(num_sample_points * important_sample_ratio) + self.num_random_points = num_sample_points - self.num_important_points + + def forward( + self, + boxes, + logits, + gt_bbox, + gt_class, + masks=None, + gt_mask=None, + postfix="", + dn_out_bboxes=None, + dn_out_logits=None, + dn_out_masks=None, + dn_meta=None, + **kwargs + ): + num_gts = self._get_num_gts(gt_class) + total_loss = super(MaskDINOLoss, self).forward( + boxes, + logits, + gt_bbox, + gt_class, + masks=masks, + gt_mask=gt_mask, + num_gts=num_gts, + ) + + if 
dn_meta is not None: + dn_positive_idx, dn_num_group = ( + dn_meta["dn_positive_idx"], + dn_meta["dn_num_group"], + ) + assert len(gt_class) == len(dn_positive_idx) + + # denoising match indices + dn_match_indices = DINOLoss.get_dn_match_indices( + gt_class, dn_positive_idx, dn_num_group + ) + + # compute denoising training loss + num_gts *= dn_num_group + dn_loss = super(MaskDINOLoss, self).forward( + dn_out_bboxes, + dn_out_logits, + gt_bbox, + gt_class, + masks=dn_out_masks, + gt_mask=gt_mask, + postfix="_dn", + dn_match_indices=dn_match_indices, + num_gts=num_gts, + ) + total_loss.update(dn_loss) + else: + total_loss.update( + {k + "_dn": paddle.to_tensor([0.0]) for k in total_loss.keys()} + ) + + return total_loss + + def _get_loss_mask(self, masks, gt_mask, match_indices, num_gts, postfix=""): + # masks: [b, query, h, w], gt_mask: list[[n, H, W]] + name_mask = "loss_mask" + postfix + name_dice = "loss_dice" + postfix + + loss = dict() + if sum(len(a) for a in gt_mask) == 0: + loss[name_mask] = paddle.to_tensor([0.0]) + loss[name_dice] = paddle.to_tensor([0.0]) + return loss + + src_masks, target_masks = self._get_src_target_assign( + masks, gt_mask, match_indices + ) + # sample points + sample_points = self._get_point_coords_by_uncertainty(src_masks) + sample_points = 2.0 * sample_points.unsqueeze(1) - 1.0 + + src_masks = F.grid_sample( + src_masks.unsqueeze(1), sample_points, align_corners=False + ).squeeze([1, 2]) + + target_masks = ( + F.grid_sample(target_masks.unsqueeze(1), sample_points, align_corners=False) + .squeeze([1, 2]) + .detach() + ) + + loss[name_mask] = ( + self.loss_coeff["mask"] + * F.binary_cross_entropy_with_logits( + src_masks, target_masks, reduction="none" + ) + .mean(1) + .sum() + / num_gts + ) + loss[name_dice] = self.loss_coeff["dice"] * self._dice_loss( + src_masks, target_masks, num_gts + ) + return loss + + def _get_point_coords_by_uncertainty(self, masks): + # Sample points based on their uncertainty. 
+ masks = masks.detach() + num_masks = masks.shape[0] + sample_points = paddle.rand([num_masks, 1, self.num_oversample_points, 2]) + + out_mask = F.grid_sample( + masks.unsqueeze(1), 2.0 * sample_points - 1.0, align_corners=False + ).squeeze([1, 2]) + out_mask = -paddle.abs(out_mask) + + _, topk_ind = paddle.topk(out_mask, self.num_important_points, axis=1) + batch_ind = paddle.arange(end=num_masks, dtype=topk_ind.dtype) + batch_ind = batch_ind.unsqueeze(-1).tile([1, self.num_important_points]) + topk_ind = paddle.stack([batch_ind, topk_ind], axis=-1) + + sample_points = paddle.gather_nd(sample_points.squeeze(1), topk_ind) + if self.num_random_points > 0: + sample_points = paddle.concat( + [sample_points, paddle.rand([num_masks, self.num_random_points, 2])], + axis=1, + ) + return sample_points diff --git a/paddlex/inference/models/object_detection/modeling/rtdetrl_modules/modules/detr_ops.py b/paddlex/inference/models/object_detection/modeling/rtdetrl_modules/modules/detr_ops.py new file mode 100644 index 0000000000..9b9e73b496 --- /dev/null +++ b/paddlex/inference/models/object_detection/modeling/rtdetrl_modules/modules/detr_ops.py @@ -0,0 +1,174 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import copy +import math +from collections import namedtuple + +import paddle +import paddle.nn as nn + + +class ShapeSpec(namedtuple("_ShapeSpec", ["channels", "height", "width", "stride"])): + def __new__(cls, channels=None, height=None, width=None, stride=None): + return super(ShapeSpec, cls).__new__(cls, channels, height, width, stride) + + +def delta2bbox(deltas, boxes, weights=[1.0, 1.0, 1.0, 1.0], max_shape=None): + """Decode deltas to boxes. Used in RCNNBox,CascadeHead,RCNNHead,RetinaHead. + Note: return tensor shape [n,1,4] + If you want to add a reshape, please add after the calling code instead of here. + """ + clip_scale = math.log(1000.0 / 16) + + widths = boxes[:, 2] - boxes[:, 0] + heights = boxes[:, 3] - boxes[:, 1] + ctr_x = boxes[:, 0] + 0.5 * widths + ctr_y = boxes[:, 1] + 0.5 * heights + + wx, wy, ww, wh = weights + dx = deltas[:, 0::4] / wx + dy = deltas[:, 1::4] / wy + dw = deltas[:, 2::4] / ww + dh = deltas[:, 3::4] / wh + # Prevent sending too large values into paddle.exp() + dw = paddle.clip(dw, max=clip_scale) + dh = paddle.clip(dh, max=clip_scale) + + pred_ctr_x = dx * widths.unsqueeze(1) + ctr_x.unsqueeze(1) + pred_ctr_y = dy * heights.unsqueeze(1) + ctr_y.unsqueeze(1) + pred_w = paddle.exp(dw) * widths.unsqueeze(1) + pred_h = paddle.exp(dh) * heights.unsqueeze(1) + + pred_boxes = [] + pred_boxes.append(pred_ctr_x - 0.5 * pred_w) + pred_boxes.append(pred_ctr_y - 0.5 * pred_h) + pred_boxes.append(pred_ctr_x + 0.5 * pred_w) + pred_boxes.append(pred_ctr_y + 0.5 * pred_h) + pred_boxes = paddle.stack(pred_boxes, axis=-1) + + if max_shape is not None: + pred_boxes[..., 0::2] = pred_boxes[..., 0::2].clip(min=0, max=max_shape[1]) + pred_boxes[..., 1::2] = pred_boxes[..., 1::2].clip(min=0, max=max_shape[0]) + return pred_boxes + + +def _get_clones(module, N): + return nn.LayerList([copy.deepcopy(module) for _ in range(N)]) + + +def inverse_sigmoid(x, eps=1e-5): + x = x.clip(min=0.0, max=1.0) + return paddle.log(x.clip(min=eps) / (1 - 
x).clip(min=eps)) + + +def get_valid_ratio(mask): + _, H, W = mask.shape + valid_ratio_h = paddle.sum(mask[:, :, 0], 1) / H + valid_ratio_w = paddle.sum(mask[:, 0, :], 1) / W + # [b, 2] + return paddle.stack([valid_ratio_w, valid_ratio_h], -1) + + +def get_sine_pos_embed( + pos_tensor, num_pos_feats=128, temperature=10000, exchange_xy=True +): + """generate sine position embedding from a position tensor + + Args: + pos_tensor (Tensor): Shape as `(None, n)`. + num_pos_feats (int): projected shape for each float in the tensor. Default: 128 + temperature (int): The temperature used for scaling + the position embedding. Default: 10000. + exchange_xy (bool, optional): exchange pos x and pos y. \ + For example, input tensor is `[x, y]`, the results will # noqa + be `[pos(y), pos(x)]`. Defaults: True. + + Returns: + Tensor: Returned position embedding # noqa + with shape `(None, n * num_pos_feats)`. + """ + scale = 2.0 * math.pi + dim_t = 2.0 * paddle.floor_divide(paddle.arange(num_pos_feats), paddle.to_tensor(2)) + dim_t = scale / temperature ** (dim_t / num_pos_feats) + + def sine_func(x): + x *= dim_t + return paddle.stack((x[:, :, 0::2].sin(), x[:, :, 1::2].cos()), axis=3).flatten( + 2 + ) + + pos_res = [sine_func(x) for x in pos_tensor.split(pos_tensor.shape[-1], -1)] + if exchange_xy: + pos_res[0], pos_res[1] = pos_res[1], pos_res[0] + pos_res = paddle.concat(pos_res, axis=2) + return pos_res + + +def bbox_iou(box1, box2, giou=False, diou=False, ciou=False, eps=1e-9): + """calculate the iou of box1 and box2 + + Args: + box1 (list): [x, y, w, h], all have the shape [b, na, h, w, 1] + box2 (list): [x, y, w, h], all have the shape [b, na, h, w, 1] + giou (bool): whether use giou or not, default False + diou (bool): whether use diou or not, default False + ciou (bool): whether use ciou or not, default False + eps (float): epsilon to avoid divide by zero + + Return: + iou (Tensor): iou of box1 and box1, with the shape [b, na, h, w, 1] + """ + px1, py1, px2, py2 = box1 + 
gx1, gy1, gx2, gy2 = box2 + x1 = paddle.maximum(px1, gx1) + y1 = paddle.maximum(py1, gy1) + x2 = paddle.minimum(px2, gx2) + y2 = paddle.minimum(py2, gy2) + + overlap = ((x2 - x1).clip(0)) * ((y2 - y1).clip(0)) + + area1 = (px2 - px1) * (py2 - py1) + area1 = area1.clip(0) + + area2 = (gx2 - gx1) * (gy2 - gy1) + area2 = area2.clip(0) + + union = area1 + area2 - overlap + eps + iou = overlap / union + + if giou or ciou or diou: + # convex w, h + cw = paddle.maximum(px2, gx2) - paddle.minimum(px1, gx1) + ch = paddle.maximum(py2, gy2) - paddle.minimum(py1, gy1) + if giou: + c_area = cw * ch + eps + return iou - (c_area - union) / c_area + else: + # convex diagonal squared + c2 = cw**2 + ch**2 + eps + # center distance + rho2 = ((px1 + px2 - gx1 - gx2) ** 2 + (py1 + py2 - gy1 - gy2) ** 2) / 4 + if diou: + return iou - rho2 / c2 + else: + w1, h1 = px2 - px1, py2 - py1 + eps + w2, h2 = gx2 - gx1, gy2 - gy1 + eps + delta = paddle.atan(w1 / h1) - paddle.atan(w2 / h2) + v = (4 / math.pi**2) * paddle.pow(delta, 2) + alpha = v / (1 + eps - iou + v) + alpha.stop_gradient = True + return iou - (rho2 / c2 + v * alpha) + else: + return iou diff --git a/paddlex/inference/models/object_detection/modeling/rtdetrl_modules/modules/initializer.py b/paddlex/inference/models/object_detection/modeling/rtdetrl_modules/modules/initializer.py new file mode 100644 index 0000000000..edc3642096 --- /dev/null +++ b/paddlex/inference/models/object_detection/modeling/rtdetrl_modules/modules/initializer.py @@ -0,0 +1,330 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This code is based on https://github.com/pytorch/pytorch/blob/master/torch/nn/init.py
The copyright of pytorch/pytorch is a BSD-style license, as found in the LICENSE file.
"""

import math

import numpy as np
import paddle
import paddle.nn as nn

__all__ = [
    "uniform_",
    "normal_",
    "constant_",
    "ones_",
    "zeros_",
    "xavier_uniform_",
    "xavier_normal_",
    "kaiming_uniform_",
    "kaiming_normal_",
    "linear_init_",
    "conv_init_",
    "reset_initialized_parameter",
]


def _no_grad_uniform_(tensor, a, b):
    # Fill with U(a, b) samples without tracking gradients.
    with paddle.no_grad():
        tensor.set_value(
            paddle.uniform(shape=tensor.shape, dtype=tensor.dtype, min=a, max=b)
        )
    return tensor


def _no_grad_normal_(tensor, mean=0.0, std=1.0):
    # Fill with N(mean, std) samples without tracking gradients.
    with paddle.no_grad():
        tensor.set_value(paddle.normal(mean=mean, std=std, shape=tensor.shape))
    return tensor


def _no_grad_fill_(tensor, value=0.0):
    # Fill every element with a constant without tracking gradients.
    with paddle.no_grad():
        tensor.set_value(paddle.full_like(tensor, value, dtype=tensor.dtype))
    return tensor


def uniform_(tensor, a, b):
    """
    Modify tensor in place using uniform_
    Args:
        tensor (paddle.Tensor): paddle Tensor
        a (float|int): min value.
        b (float|int): max value.
    Return:
        tensor
    """
    return _no_grad_uniform_(tensor, a, b)


def normal_(tensor, mean=0.0, std=1.0):
    """
    Modify tensor in place using normal_
    Args:
        tensor (paddle.Tensor): paddle Tensor
        mean (float|int): mean value.
        std (float|int): std value.
    Return:
        tensor
    """
    return _no_grad_normal_(tensor, mean, std)


def constant_(tensor, value=0.0):
    """
    Modify tensor in place using constant_
    Args:
        tensor (paddle.Tensor): paddle Tensor
        value (float|int): value to fill tensor.
    Return:
        tensor
    """
    return _no_grad_fill_(tensor, value)


def ones_(tensor):
    """
    Modify tensor in place using ones_
    Args:
        tensor (paddle.Tensor): paddle Tensor
    Return:
        tensor
    """
    return _no_grad_fill_(tensor, 1)


def zeros_(tensor):
    """
    Modify tensor in place using zeros_
    Args:
        tensor (paddle.Tensor): paddle Tensor
    Return:
        tensor
    """
    return _no_grad_fill_(tensor, 0)


def vector_(tensor, vector):
    """Copy the values of ``vector`` into ``tensor`` in place (no gradient)."""
    with paddle.no_grad():
        tensor.set_value(paddle.to_tensor(vector, dtype=tensor.dtype))
    return tensor


def _calculate_fan_in_and_fan_out(tensor, reverse=False):
    """
    Calculate (fan_in, fan_out) for tensor

    Args:
        tensor (Tensor): paddle.Tensor
        reverse (bool: False): tensor data format order, False by default as [fout, fin, ...]. e.g. : conv.weight [cout, cin, kh, kw] is False; linear.weight [cin, cout] is True

    Return:
        Tuple[fan_in, fan_out]
    """
    if tensor.ndim < 2:
        raise ValueError(
            "Fan in and fan out can not be computed for tensor with fewer than 2 dimensions"
        )

    if reverse:
        num_input_fmaps, num_output_fmaps = tensor.shape[0], tensor.shape[1]
    else:
        num_input_fmaps, num_output_fmaps = tensor.shape[1], tensor.shape[0]

    # For conv-like weights, every remaining dim contributes to the fan.
    receptive_field_size = 1
    if tensor.ndim > 2:
        receptive_field_size = np.prod(tensor.shape[2:])

    fan_in = num_input_fmaps * receptive_field_size
    fan_out = num_output_fmaps * receptive_field_size

    return fan_in, fan_out


def xavier_uniform_(tensor, gain=1.0, reverse=False):
    """
    Modify tensor in place using xavier_uniform_
    Args:
        tensor (paddle.Tensor): paddle Tensor
        gain (float): super parameter, 1. default.
        reverse (bool): reverse (bool: False): tensor data format order, False by default as [fout, fin, ...].
    Return:
        tensor
    """
    fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse=reverse)
    std = gain * math.sqrt(2.0 / float(fan_in + fan_out))
    # U(-k, k) with k chosen so the distribution has standard deviation `std`.
    k = math.sqrt(3.0) * std
    return _no_grad_uniform_(tensor, -k, k)


def xavier_normal_(tensor, gain=1.0, reverse=False):
    """
    Modify tensor in place using xavier_normal_
    Args:
        tensor (paddle.Tensor): paddle Tensor
        gain (float): super parameter, 1. default.
        reverse (bool): reverse (bool: False): tensor data format order, False by default as [fout, fin, ...].
    Return:
        tensor
    """
    fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse=reverse)
    std = gain * math.sqrt(2.0 / float(fan_in + fan_out))
    return _no_grad_normal_(tensor, 0, std)


# reference: https://pytorch.org/docs/stable/_modules/torch/nn/init.html
def _calculate_correct_fan(tensor, mode, reverse=False):
    # Pick fan_in or fan_out according to `mode` ('fan_in' | 'fan_out').
    mode = mode.lower()
    valid_modes = ["fan_in", "fan_out"]
    if mode not in valid_modes:
        raise ValueError(
            "Mode {} not supported, please use one of {}".format(mode, valid_modes)
        )

    fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse)

    return fan_in if mode == "fan_in" else fan_out


def _calculate_gain(nonlinearity, param=None):
    # Recommended gain value for the given nonlinearity (mirrors torch.nn.init).
    linear_fns = [
        "linear",
        "conv1d",
        "conv2d",
        "conv3d",
        "conv_transpose1d",
        "conv_transpose2d",
        "conv_transpose3d",
    ]
    if nonlinearity in linear_fns or nonlinearity == "sigmoid":
        return 1
    elif nonlinearity == "tanh":
        return 5.0 / 3
    elif nonlinearity == "relu":
        return math.sqrt(2.0)
    elif nonlinearity == "leaky_relu":
        if param is None:
            negative_slope = 0.01
        elif (
            not isinstance(param, bool)
            and isinstance(param, int)
            or isinstance(param, float)
        ):
            # True/False are instances of int, hence check above
            negative_slope = param
        else:
            raise ValueError("negative_slope {} not a valid number".format(param))
        return math.sqrt(2.0 / (1 + negative_slope**2))
    elif nonlinearity == "selu":
        return 3.0 / 4
    else:
        raise ValueError("Unsupported nonlinearity {}".format(nonlinearity))


def kaiming_uniform_(
    tensor, a=0, mode="fan_in", nonlinearity="leaky_relu", reverse=False
):
    """
    Modify tensor in place using kaiming_uniform method
    Args:
        tensor (paddle.Tensor): paddle Tensor
        a (float): negative slope of the rectifier (only used with 'leaky_relu').
        mode (str): ['fan_in', 'fan_out'], 'fan_in' default
        nonlinearity (str): nonlinearity method name
        reverse (bool): reverse (bool: False): tensor data format order, False by default as [fout, fin, ...].
    Return:
        tensor
    """
    fan = _calculate_correct_fan(tensor, mode, reverse)
    gain = _calculate_gain(nonlinearity, a)
    std = gain / math.sqrt(fan)
    # U(-k, k) with k chosen so the distribution has standard deviation `std`.
    k = math.sqrt(3.0) * std
    return _no_grad_uniform_(tensor, -k, k)


def kaiming_normal_(
    tensor, a=0, mode="fan_in", nonlinearity="leaky_relu", reverse=False
):
    """
    Modify tensor in place using kaiming_normal_
    Args:
        tensor (paddle.Tensor): paddle Tensor
        a (float): negative slope of the rectifier (only used with 'leaky_relu').
        mode (str): ['fan_in', 'fan_out'], 'fan_in' default
        nonlinearity (str): nonlinearity method name
        reverse (bool): reverse (bool: False): tensor data format order, False by default as [fout, fin, ...].
    Return:
        tensor
    """
    fan = _calculate_correct_fan(tensor, mode, reverse)
    gain = _calculate_gain(nonlinearity, a)
    std = gain / math.sqrt(fan)
    return _no_grad_normal_(tensor, 0, std)


def linear_init_(module):
    """Initialize a linear layer's weight (and bias) with U(-bound, bound),
    where bound = 1 / sqrt(fan_in) and fan_in is weight.shape[0]."""
    bound = 1 / math.sqrt(module.weight.shape[0])
    uniform_(module.weight, -bound, bound)
    if hasattr(module, "bias") and module.bias is not None:
        uniform_(module.bias, -bound, bound)


def conv_init_(module):
    """Initialize a conv layer's weight (and bias) with U(-bound, bound),
    where bound = 1 / sqrt(prod(weight.shape[1:]))."""
    bound = 1 / np.sqrt(np.prod(module.weight.shape[1:]))
    uniform_(module.weight, -bound, bound)
    if module.bias is not None:
        uniform_(module.bias, -bound, bound)


def bias_init_with_prob(prior_prob=0.01):
    """initialize conv/fc bias value according to a given probability value."""
    bias_init = float(-np.log((1 - prior_prob) / prior_prob))
    return bias_init


@paddle.no_grad()
def reset_initialized_parameter(model, include_self=True):
    """
    Reset initialized parameter using following method for [conv, linear, embedding, bn]

    Args:
        model (paddle.Layer): paddle Layer
        include_self (bool: True): include_self for Layer.named_sublayers method. Indicate whether including itself
    Return:
        None
    """
    for _, m in model.named_sublayers(include_self=include_self):
        if isinstance(m, nn.Conv2D):
            # k = sqrt(groups / (cin * kh * kw)), the default Conv2D fan bound.
            k = float(m._groups) / (
                m._in_channels * m._kernel_size[0] * m._kernel_size[1]
            )
            k = math.sqrt(k)
            _no_grad_uniform_(m.weight, -k, k)
            if hasattr(m, "bias") and getattr(m, "bias") is not None:
                _no_grad_uniform_(m.bias, -k, k)

        elif isinstance(m, nn.Linear):
            k = math.sqrt(1.0 / m.weight.shape[0])
            _no_grad_uniform_(m.weight, -k, k)
            if hasattr(m, "bias") and getattr(m, "bias") is not None:
                _no_grad_uniform_(m.bias, -k, k)

        elif isinstance(m, nn.Embedding):
            _no_grad_normal_(m.weight, mean=0.0, std=1.0)

        elif isinstance(m, (nn.BatchNorm2D, nn.LayerNorm)):
            # Norm layers: scale to 1, shift to 0.
            _no_grad_fill_(m.weight, 1.0)
            if hasattr(m, "bias") and getattr(m, "bias") is not None:
                _no_grad_fill_(m.bias, 0)
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+ +import math + +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import ParamAttr +from paddle.nn.initializer import Constant, Normal, XavierUniform +from paddle.regularizer import L2Decay +from paddle.vision.ops import DeformConv2D + +from . import ops +from .detr_ops import delta2bbox +from .initializer import constant_, xavier_uniform_ + + +def _to_list(l): + if isinstance(l, (list, tuple)): + return list(l) + return [l] + + +class AlignConv(nn.Layer): + def __init__(self, in_channels, out_channels, kernel_size=3, groups=1): + super(AlignConv, self).__init__() + self.kernel_size = kernel_size + self.align_conv = paddle.vision.ops.DeformConv2D( + in_channels, + out_channels, + kernel_size=self.kernel_size, + padding=(self.kernel_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(initializer=Normal(0, 0.01)), + bias_attr=None, + ) + + @paddle.no_grad() + def get_offset(self, anchors, featmap_size, stride): + """ + Args: + anchors: [B, L, 5] xc,yc,w,h,angle + featmap_size: (feat_h, feat_w) + stride: 8 + Returns: + + """ + batch = anchors.shape[0] + dtype = anchors.dtype + feat_h, feat_w = featmap_size + pad = (self.kernel_size - 1) // 2 + idx = paddle.arange(-pad, pad + 1, dtype=dtype) + + yy, xx = paddle.meshgrid(idx, idx) + xx = paddle.reshape(xx, [-1]) + yy = paddle.reshape(yy, [-1]) + + # get sampling locations of default conv + xc = paddle.arange(0, feat_w, dtype=dtype) + yc = paddle.arange(0, feat_h, dtype=dtype) + yc, xc = paddle.meshgrid(yc, xc) + + xc = paddle.reshape(xc, [-1, 1]) + yc = paddle.reshape(yc, [-1, 1]) + x_conv = xc + xx + y_conv = yc + yy + + # get sampling locations of anchors + x_ctr, y_ctr, w, h, a = paddle.split(anchors, 5, axis=-1) + x_ctr = x_ctr / stride + y_ctr = y_ctr / stride + w_s = w / stride + h_s = h / stride + cos, sin = paddle.cos(a), paddle.sin(a) + dw, dh = w_s / self.kernel_size, h_s / self.kernel_size + x, y = dw * xx, dh * yy + xr = cos * x - sin * y + yr = 
sin * x + cos * y + x_anchor, y_anchor = xr + x_ctr, yr + y_ctr + # get offset filed + offset_x = x_anchor - x_conv + offset_y = y_anchor - y_conv + offset = paddle.stack([offset_y, offset_x], axis=-1) + offset = offset.reshape( + [batch, feat_h, feat_w, self.kernel_size * self.kernel_size * 2] + ) + offset = offset.transpose([0, 3, 1, 2]) + + return offset + + def forward(self, x, refine_anchors, featmap_size, stride): + batch = x.shape[0].numpy() + offset = self.get_offset(refine_anchors, featmap_size, stride) + if self.training: + x = F.relu(self.align_conv(x, offset.detach())) + else: + x = F.relu(self.align_conv(x, offset)) + return x + + +class DeformableConvV2(nn.Layer): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + weight_attr=None, + bias_attr=None, + lr_scale=1, + regularizer=None, + skip_quant=False, + dcn_bias_regularizer=L2Decay(0.0), + dcn_bias_lr_scale=2.0, + ): + super(DeformableConvV2, self).__init__() + self.offset_channel = 2 * kernel_size**2 + self.mask_channel = kernel_size**2 + + if lr_scale == 1 and regularizer is None: + offset_bias_attr = ParamAttr(initializer=Constant(0.0)) + else: + offset_bias_attr = ParamAttr( + initializer=Constant(0.0), + learning_rate=lr_scale, + regularizer=regularizer, + ) + self.conv_offset = nn.Conv2D( + in_channels, + 3 * kernel_size**2, + kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2, + weight_attr=ParamAttr(initializer=Constant(0.0)), + bias_attr=offset_bias_attr, + ) + if skip_quant: + self.conv_offset.skip_quant = True + + if bias_attr: + # in FCOS-DCN head, specifically need learning_rate and regularizer + dcn_bias_attr = ParamAttr( + initializer=Constant(value=0), + regularizer=dcn_bias_regularizer, + learning_rate=dcn_bias_lr_scale, + ) + else: + # in ResNet backbone, do not need bias + dcn_bias_attr = False + self.conv_dcn = DeformConv2D( + in_channels, + out_channels, + kernel_size, + stride=stride, + 
padding=(kernel_size - 1) // 2 * dilation, + dilation=dilation, + groups=groups, + weight_attr=weight_attr, + bias_attr=dcn_bias_attr, + ) + + def forward(self, x): + offset_mask = self.conv_offset(x) + offset, mask = paddle.split( + offset_mask, + num_or_sections=[self.offset_channel, self.mask_channel], + axis=1, + ) + mask = F.sigmoid(mask) + y = self.conv_dcn(x, offset, mask=mask) + return y + + +class ConvNormLayer(nn.Layer): + def __init__( + self, + ch_in, + ch_out, + filter_size, + stride, + groups=1, + norm_type="bn", + norm_decay=0.0, + norm_groups=32, + use_dcn=False, + bias_on=False, + lr_scale=1.0, + freeze_norm=False, + initializer=Normal(mean=0.0, std=0.01), + skip_quant=False, + dcn_lr_scale=2.0, + dcn_regularizer=L2Decay(0.0), + ): + super(ConvNormLayer, self).__init__() + assert norm_type in ["bn", "sync_bn", "gn", None] + + if bias_on: + bias_attr = ParamAttr( + initializer=Constant(value=0.0), learning_rate=lr_scale + ) + else: + bias_attr = False + + if not use_dcn: + self.conv = nn.Conv2D( + in_channels=ch_in, + out_channels=ch_out, + kernel_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(initializer=initializer, learning_rate=1.0), + bias_attr=bias_attr, + ) + if skip_quant: + self.conv.skip_quant = True + else: + # in FCOS-DCN head, specifically need learning_rate and regularizer + self.conv = DeformableConvV2( + in_channels=ch_in, + out_channels=ch_out, + kernel_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(initializer=initializer, learning_rate=1.0), + bias_attr=True, + lr_scale=dcn_lr_scale, + regularizer=dcn_regularizer, + dcn_bias_regularizer=dcn_regularizer, + dcn_bias_lr_scale=dcn_lr_scale, + skip_quant=skip_quant, + ) + + norm_lr = 0.0 if freeze_norm else 1.0 + param_attr = ParamAttr( + learning_rate=norm_lr, + regularizer=L2Decay(norm_decay) if norm_decay is not None else None, + ) + bias_attr = ParamAttr( + 
learning_rate=norm_lr, + regularizer=L2Decay(norm_decay) if norm_decay is not None else None, + ) + if norm_type in ["bn", "sync_bn"]: + self.norm = nn.BatchNorm2D( + ch_out, weight_attr=param_attr, bias_attr=bias_attr + ) + elif norm_type == "gn": + self.norm = nn.GroupNorm( + num_groups=norm_groups, + num_channels=ch_out, + weight_attr=param_attr, + bias_attr=bias_attr, + ) + else: + self.norm = None + + def forward(self, inputs): + out = self.conv(inputs) + if self.norm is not None: + out = self.norm(out) + return out + + +class LiteConv(nn.Layer): + def __init__( + self, + in_channels, + out_channels, + stride=1, + with_act=True, + norm_type="sync_bn", + name=None, + ): + super(LiteConv, self).__init__() + self.lite_conv = nn.Sequential() + conv1 = ConvNormLayer( + in_channels, + in_channels, + filter_size=5, + stride=stride, + groups=in_channels, + norm_type=norm_type, + initializer=XavierUniform(), + ) + conv2 = ConvNormLayer( + in_channels, + out_channels, + filter_size=1, + stride=stride, + norm_type=norm_type, + initializer=XavierUniform(), + ) + conv3 = ConvNormLayer( + out_channels, + out_channels, + filter_size=1, + stride=stride, + norm_type=norm_type, + initializer=XavierUniform(), + ) + conv4 = ConvNormLayer( + out_channels, + out_channels, + filter_size=5, + stride=stride, + groups=out_channels, + norm_type=norm_type, + initializer=XavierUniform(), + ) + conv_list = [conv1, conv2, conv3, conv4] + self.lite_conv.add_sublayer("conv1", conv1) + self.lite_conv.add_sublayer("relu6_1", nn.ReLU6()) + self.lite_conv.add_sublayer("conv2", conv2) + if with_act: + self.lite_conv.add_sublayer("relu6_2", nn.ReLU6()) + self.lite_conv.add_sublayer("conv3", conv3) + self.lite_conv.add_sublayer("relu6_3", nn.ReLU6()) + self.lite_conv.add_sublayer("conv4", conv4) + if with_act: + self.lite_conv.add_sublayer("relu6_4", nn.ReLU6()) + + def forward(self, inputs): + out = self.lite_conv(inputs) + return out + + +class DropBlock(nn.Layer): + def __init__(self, block_size, 
keep_prob, name=None, data_format="NCHW"): + """ + DropBlock layer, see https://arxiv.org/abs/1810.12890 + + Args: + block_size (int): block size + keep_prob (int): keep probability + name (str): layer name + data_format (str): data format, NCHW or NHWC + """ + super(DropBlock, self).__init__() + self.block_size = block_size + self.keep_prob = keep_prob + self.name = name + self.data_format = data_format + + def forward(self, x): + if not self.training or self.keep_prob == 1: + return x + else: + gamma = (1.0 - self.keep_prob) / (self.block_size**2) + if self.data_format == "NCHW": + shape = x.shape[2:] + else: + shape = x.shape[1:3] + for s in shape: + gamma *= s / (s - self.block_size + 1) + + matrix = paddle.cast(paddle.rand(x.shape) < gamma, x.dtype) + mask_inv = F.max_pool2d( + matrix, + self.block_size, + stride=1, + padding=self.block_size // 2, + data_format=self.data_format, + ) + mask = 1.0 - mask_inv + mask = mask.astype("float32") + x = x.astype("float32") + y = x * mask * (mask.numel() / mask.sum()) + return y + + +class AnchorGeneratorSSD(object): + def __init__( + self, + steps=[8, 16, 32, 64, 100, 300], + aspect_ratios=[[2.0], [2.0, 3.0], [2.0, 3.0], [2.0, 3.0], [2.0], [2.0]], + min_ratio=15, + max_ratio=90, + base_size=300, + min_sizes=[30.0, 60.0, 111.0, 162.0, 213.0, 264.0], + max_sizes=[60.0, 111.0, 162.0, 213.0, 264.0, 315.0], + offset=0.5, + flip=True, + clip=False, + min_max_aspect_ratios_order=False, + ): + self.steps = steps + self.aspect_ratios = aspect_ratios + self.min_ratio = min_ratio + self.max_ratio = max_ratio + self.base_size = base_size + self.min_sizes = min_sizes + self.max_sizes = max_sizes + self.offset = offset + self.flip = flip + self.clip = clip + self.min_max_aspect_ratios_order = min_max_aspect_ratios_order + + if self.min_sizes == [] and self.max_sizes == []: + num_layer = len(aspect_ratios) + step = int( + math.floor(((self.max_ratio - self.min_ratio)) / (num_layer - 2)) + ) + for ratio in range(self.min_ratio, 
self.max_ratio + 1, step): + self.min_sizes.append(self.base_size * ratio / 100.0) + self.max_sizes.append(self.base_size * (ratio + step) / 100.0) + self.min_sizes = [self.base_size * 0.10] + self.min_sizes + self.max_sizes = [self.base_size * 0.20] + self.max_sizes + + self.num_priors = [] + for aspect_ratio, min_size, max_size in zip( + aspect_ratios, self.min_sizes, self.max_sizes + ): + if isinstance(min_size, (list, tuple)): + self.num_priors.append( + len(_to_list(min_size)) + len(_to_list(max_size)) + ) + else: + self.num_priors.append( + (len(aspect_ratio) * 2 + 1) * len(_to_list(min_size)) + + len(_to_list(max_size)) + ) + + def __call__(self, inputs, image): + boxes = [] + for input, min_size, max_size, aspect_ratio, step in zip( + inputs, self.min_sizes, self.max_sizes, self.aspect_ratios, self.steps + ): + box, _ = ops.prior_box( + input=input, + image=image, + min_sizes=_to_list(min_size), + max_sizes=_to_list(max_size), + aspect_ratios=aspect_ratio, + flip=self.flip, + clip=self.clip, + steps=[step, step], + offset=self.offset, + min_max_aspect_ratios_order=self.min_max_aspect_ratios_order, + ) + boxes.append(paddle.reshape(box, [-1, 4])) + return boxes + + +class RCNNBox(object): + __shared__ = ["num_classes", "export_onnx"] + + def __init__( + self, + prior_box_var=[10.0, 10.0, 5.0, 5.0], + code_type="decode_center_size", + box_normalized=False, + num_classes=80, + export_onnx=False, + ): + super(RCNNBox, self).__init__() + self.prior_box_var = prior_box_var + self.code_type = code_type + self.box_normalized = box_normalized + self.num_classes = num_classes + self.export_onnx = export_onnx + + def __call__(self, bbox_head_out, rois, im_shape, scale_factor): + bbox_pred = bbox_head_out[0] + cls_prob = bbox_head_out[1] + roi = rois[0] + rois_num = rois[1] + + if self.export_onnx: + onnx_rois_num_per_im = rois_num[0] + origin_shape = paddle.expand(im_shape[0, :], [onnx_rois_num_per_im, 2]) + + else: + origin_shape_list = [] + if isinstance(roi, list): 
+ batch_size = len(roi) + else: + batch_size = paddle.slice(paddle.shape(im_shape), [0], [0], [1]) + + # bbox_pred.shape: [N, C*4] + for idx in range(batch_size): + rois_num_per_im = rois_num[idx] + expand_im_shape = paddle.expand(im_shape[idx, :], [rois_num_per_im, 2]) + origin_shape_list.append(expand_im_shape) + + origin_shape = paddle.concat(origin_shape_list) + + # bbox_pred.shape: [N, C*4] + # C=num_classes in faster/mask rcnn(bbox_head), C=1 in cascade rcnn(cascade_head) + bbox = paddle.concat(roi) + bbox = delta2bbox(bbox_pred, bbox, self.prior_box_var) + scores = cls_prob[:, :-1] + + # bbox.shape: [N, C, 4] + # bbox.shape[1] must be equal to scores.shape[1] + total_num = bbox.shape[0] + bbox_dim = bbox.shape[-1] + bbox = paddle.expand(bbox, [total_num, self.num_classes, bbox_dim]) + + origin_h = paddle.unsqueeze(origin_shape[:, 0], axis=1) + origin_w = paddle.unsqueeze(origin_shape[:, 1], axis=1) + zeros = paddle.zeros_like(origin_h) + x1 = paddle.maximum(paddle.minimum(bbox[:, :, 0], origin_w), zeros) + y1 = paddle.maximum(paddle.minimum(bbox[:, :, 1], origin_h), zeros) + x2 = paddle.maximum(paddle.minimum(bbox[:, :, 2], origin_w), zeros) + y2 = paddle.maximum(paddle.minimum(bbox[:, :, 3], origin_h), zeros) + bbox = paddle.stack([x1, y1, x2, y2], axis=-1) + bboxes = (bbox, rois_num) + return bboxes, scores + + +class MultiClassNMS(object): + def __init__( + self, + score_threshold=0.05, + nms_top_k=-1, + keep_top_k=100, + nms_threshold=0.5, + normalized=True, + nms_eta=1.0, + return_index=False, + return_rois_num=True, + trt=False, + cpu=False, + ): + super(MultiClassNMS, self).__init__() + self.score_threshold = score_threshold + self.nms_top_k = nms_top_k + self.keep_top_k = keep_top_k + self.nms_threshold = nms_threshold + self.normalized = normalized + self.nms_eta = nms_eta + self.return_index = return_index + self.return_rois_num = return_rois_num + self.trt = trt + self.cpu = cpu + + def __call__(self, bboxes, score, background_label=-1): + """ + 
bboxes (Tensor|List[Tensor]): 1. (Tensor) Predicted bboxes with shape + [N, M, 4], N is the batch size and M + is the number of bboxes + 2. (List[Tensor]) bboxes and bbox_num, + bboxes have shape of [M, C, 4], C + is the class number and bbox_num means + the number of bboxes of each batch with + shape [N,] + score (Tensor): Predicted scores with shape [N, C, M] or [M, C] + background_label (int): Ignore the background label; For example, RCNN + is num_classes and YOLO is -1. + """ + kwargs = self.__dict__.copy() + if isinstance(bboxes, tuple): + bboxes, bbox_num = bboxes + kwargs.update({"rois_num": bbox_num}) + if background_label > -1: + kwargs.update({"background_label": background_label}) + kwargs.pop("trt") + kwargs.pop("cpu") + + # TODO(wangxinxin08): paddle version should be develop or 2.3 and above to run nms on tensorrt + if self.trt and ( + int(paddle.version.major) == 0 + or (int(paddle.version.major) >= 2 and int(paddle.version.minor) >= 3) + ): + # TODO(wangxinxin08): tricky switch to run nms on tensorrt + kwargs.update({"nms_eta": 1.1}) + bbox, bbox_num, _ = ops.multiclass_nms(bboxes, score, **kwargs) + bbox = bbox.reshape([1, -1, 6]) + idx = paddle.nonzero(bbox[..., 0] != -1) + bbox = paddle.gather_nd(bbox, idx) + return bbox, bbox_num, None + else: + if self.cpu: + device = paddle.device.get_device() + paddle.set_device("cpu") + outputs = ops.multiclass_nms(bboxes, score, **kwargs) + paddle.set_device(device) + return outputs + else: + return ops.multiclass_nms(bboxes, score, **kwargs) + + +class MatrixNMS(object): + __append_doc__ = True + + def __init__( + self, + score_threshold=0.05, + post_threshold=0.05, + nms_top_k=-1, + keep_top_k=100, + use_gaussian=False, + gaussian_sigma=2.0, + normalized=False, + background_label=0, + ): + super(MatrixNMS, self).__init__() + self.score_threshold = score_threshold + self.post_threshold = post_threshold + self.nms_top_k = nms_top_k + self.keep_top_k = keep_top_k + self.normalized = normalized + 
self.use_gaussian = use_gaussian + self.gaussian_sigma = gaussian_sigma + self.background_label = background_label + + def __call__(self, bbox, score, *args): + return ops.matrix_nms( + bboxes=bbox, + scores=score, + score_threshold=self.score_threshold, + post_threshold=self.post_threshold, + nms_top_k=self.nms_top_k, + keep_top_k=self.keep_top_k, + use_gaussian=self.use_gaussian, + gaussian_sigma=self.gaussian_sigma, + background_label=self.background_label, + normalized=self.normalized, + ) + + +class YOLOBox(object): + __shared__ = ["num_classes"] + + def __init__( + self, + num_classes=80, + conf_thresh=0.005, + downsample_ratio=32, + clip_bbox=True, + scale_x_y=1.0, + ): + self.num_classes = num_classes + self.conf_thresh = conf_thresh + self.downsample_ratio = downsample_ratio + self.clip_bbox = clip_bbox + self.scale_x_y = scale_x_y + + def __call__(self, yolo_head_out, anchors, im_shape, scale_factor, var_weight=None): + boxes_list = [] + scores_list = [] + origin_shape = im_shape / scale_factor + origin_shape = paddle.cast(origin_shape, "int32") + for i, head_out in enumerate(yolo_head_out): + boxes, scores = paddle.vision.ops.yolo_box( + head_out, + origin_shape, + anchors[i], + self.num_classes, + self.conf_thresh, + self.downsample_ratio // 2**i, + self.clip_bbox, + scale_x_y=self.scale_x_y, + ) + boxes_list.append(boxes) + scores_list.append(paddle.transpose(scores, perm=[0, 2, 1])) + yolo_boxes = paddle.concat(boxes_list, axis=1) + yolo_scores = paddle.concat(scores_list, axis=2) + return yolo_boxes, yolo_scores + + +class SSDBox(object): + def __init__( + self, + is_normalized=True, + prior_box_var=[0.1, 0.1, 0.2, 0.2], + use_fuse_decode=False, + ): + self.is_normalized = is_normalized + self.norm_delta = float(not self.is_normalized) + self.prior_box_var = prior_box_var + self.use_fuse_decode = use_fuse_decode + + def __call__(self, preds, prior_boxes, im_shape, scale_factor, var_weight=None): + boxes, scores = preds + boxes = paddle.concat(boxes, 
axis=1) + prior_boxes = paddle.concat(prior_boxes) + if self.use_fuse_decode: + output_boxes = ops.box_coder( + prior_boxes, + self.prior_box_var, + boxes, + code_type="decode_center_size", + box_normalized=self.is_normalized, + ) + else: + pb_w = prior_boxes[:, 2] - prior_boxes[:, 0] + self.norm_delta + pb_h = prior_boxes[:, 3] - prior_boxes[:, 1] + self.norm_delta + pb_x = prior_boxes[:, 0] + pb_w * 0.5 + pb_y = prior_boxes[:, 1] + pb_h * 0.5 + out_x = pb_x + boxes[:, :, 0] * pb_w * self.prior_box_var[0] + out_y = pb_y + boxes[:, :, 1] * pb_h * self.prior_box_var[1] + out_w = paddle.exp(boxes[:, :, 2] * self.prior_box_var[2]) * pb_w + out_h = paddle.exp(boxes[:, :, 3] * self.prior_box_var[3]) * pb_h + output_boxes = paddle.stack( + [ + out_x - out_w / 2.0, + out_y - out_h / 2.0, + out_x + out_w / 2.0, + out_y + out_h / 2.0, + ], + axis=-1, + ) + + if self.is_normalized: + h = (im_shape[:, 0] / scale_factor[:, 0]).unsqueeze(-1) + w = (im_shape[:, 1] / scale_factor[:, 1]).unsqueeze(-1) + im_shape = paddle.stack([w, h, w, h], axis=-1) + output_boxes *= im_shape + else: + output_boxes[..., -2:] -= 1.0 + output_scores = F.softmax(paddle.concat(scores, axis=1)).transpose([0, 2, 1]) + + return output_boxes, output_scores + + +class TTFBox(object): + __shared__ = ["down_ratio"] + + def __init__(self, max_per_img=100, score_thresh=0.01, down_ratio=4): + super(TTFBox, self).__init__() + self.max_per_img = max_per_img + self.score_thresh = score_thresh + self.down_ratio = down_ratio + + def _simple_nms(self, heat, kernel=3): + """ + Use maxpool to filter the max score, get local peaks. + """ + pad = (kernel - 1) // 2 + hmax = F.max_pool2d(heat, kernel, stride=1, padding=pad) + keep = paddle.cast(hmax == heat, "float32") + return heat * keep + + def _topk(self, scores): + """ + Select top k scores and decode to get xy coordinates. 
+ """ + k = self.max_per_img + shape_fm = paddle.shape(scores) + shape_fm.stop_gradient = True + cat, height, width = shape_fm[1], shape_fm[2], shape_fm[3] + # batch size is 1 + scores_r = paddle.reshape(scores, [cat, -1]) + topk_scores, topk_inds = paddle.topk(scores_r, k) + topk_ys = topk_inds // width + topk_xs = topk_inds % width + + topk_score_r = paddle.reshape(topk_scores, [-1]) + topk_score, topk_ind = paddle.topk(topk_score_r, k) + k_t = paddle.full(topk_ind.shape, k, dtype="int64") + topk_clses = paddle.cast(paddle.floor_divide(topk_ind, k_t), "float32") + + topk_inds = paddle.reshape(topk_inds, [-1]) + topk_ys = paddle.reshape(topk_ys, [-1, 1]) + topk_xs = paddle.reshape(topk_xs, [-1, 1]) + topk_inds = paddle.gather(topk_inds, topk_ind) + topk_ys = paddle.gather(topk_ys, topk_ind) + topk_xs = paddle.gather(topk_xs, topk_ind) + + return topk_score, topk_inds, topk_clses, topk_ys, topk_xs + + def _decode(self, hm, wh, im_shape, scale_factor): + heatmap = F.sigmoid(hm) + heat = self._simple_nms(heatmap) + scores, inds, clses, ys, xs = self._topk(heat) + ys = paddle.cast(ys, "float32") * self.down_ratio + xs = paddle.cast(xs, "float32") * self.down_ratio + scores = paddle.tensor.unsqueeze(scores, [1]) + clses = paddle.tensor.unsqueeze(clses, [1]) + + wh_t = paddle.transpose(wh, [0, 2, 3, 1]) + wh = paddle.reshape(wh_t, [-1, wh_t.shape[-1]]) + wh = paddle.gather(wh, inds) + + x1 = xs - wh[:, 0:1] + y1 = ys - wh[:, 1:2] + x2 = xs + wh[:, 2:3] + y2 = ys + wh[:, 3:4] + + bboxes = paddle.concat([x1, y1, x2, y2], axis=1) + + scale_y = scale_factor[:, 0:1] + scale_x = scale_factor[:, 1:2] + scale_expand = paddle.concat([scale_x, scale_y, scale_x, scale_y], axis=1) + boxes_shape = paddle.shape(bboxes) + boxes_shape.stop_gradient = True + scale_expand = paddle.expand(scale_expand, shape=boxes_shape) + bboxes = paddle.divide(bboxes, scale_expand) + results = paddle.concat([clses, scores, bboxes], axis=1) + # hack: append result with cls=-1 and score=1. 
to avoid all scores + # are less than score_thresh which may cause error in gather. + fill_r = paddle.to_tensor(np.array([[-1, 1, 0, 0, 0, 0]])) + fill_r = paddle.cast(fill_r, results.dtype) + results = paddle.concat([results, fill_r]) + scores = results[:, 1] + valid_ind = paddle.nonzero(scores > self.score_thresh) + results = paddle.gather(results, valid_ind) + return results, results.shape[0:1] + + def __call__(self, hm, wh, im_shape, scale_factor): + results = [] + results_num = [] + for i in range(scale_factor.shape[0]): + result, num = self._decode( + hm[i : i + 1,], + wh[i : i + 1,], + im_shape[i : i + 1,], + scale_factor[i : i + 1,], + ) + results.append(result) + results_num.append(num) + results = paddle.concat(results, axis=0) + results_num = paddle.concat(results_num, axis=0) + return results, results_num + + +class JDEBox(object): + __shared__ = ["num_classes"] + + def __init__(self, num_classes=1, conf_thresh=0.3, downsample_ratio=32): + self.num_classes = num_classes + self.conf_thresh = conf_thresh + self.downsample_ratio = downsample_ratio + + def generate_anchor(self, nGh, nGw, anchor_wh): + nA = len(anchor_wh) + yv, xv = paddle.meshgrid([paddle.arange(nGh), paddle.arange(nGw)]) + mesh = paddle.stack((xv, yv), axis=0).cast(dtype="float32") # 2 x nGh x nGw + meshs = paddle.tile(mesh, [nA, 1, 1, 1]) + + anchor_offset_mesh = ( + anchor_wh[:, :, None][:, :, :, None] + .repeat(int(nGh), axis=-2) + .repeat(int(nGw), axis=-1) + ) + anchor_offset_mesh = paddle.to_tensor(anchor_offset_mesh.astype(np.float32)) + # nA x 2 x nGh x nGw + + anchor_mesh = paddle.concat([meshs, anchor_offset_mesh], axis=1) + anchor_mesh = paddle.transpose( + anchor_mesh, [0, 2, 3, 1] + ) # (nA x nGh x nGw) x 4 + return anchor_mesh + + def decode_delta(self, delta, fg_anchor_list): + px, py, pw, ph = ( + fg_anchor_list[:, 0], + fg_anchor_list[:, 1], + fg_anchor_list[:, 2], + fg_anchor_list[:, 3], + ) + dx, dy, dw, dh = delta[:, 0], delta[:, 1], delta[:, 2], delta[:, 3] + gx = pw * 
dx + px + gy = ph * dy + py + gw = pw * paddle.exp(dw) + gh = ph * paddle.exp(dh) + gx1 = gx - gw * 0.5 + gy1 = gy - gh * 0.5 + gx2 = gx + gw * 0.5 + gy2 = gy + gh * 0.5 + return paddle.stack([gx1, gy1, gx2, gy2], axis=1) + + def decode_delta_map(self, nA, nGh, nGw, delta_map, anchor_vec): + anchor_mesh = self.generate_anchor(nGh, nGw, anchor_vec) + anchor_mesh = paddle.unsqueeze(anchor_mesh, 0) + pred_list = self.decode_delta( + paddle.reshape(delta_map, shape=[-1, 4]), + paddle.reshape(anchor_mesh, shape=[-1, 4]), + ) + pred_map = paddle.reshape(pred_list, shape=[nA * nGh * nGw, 4]) + return pred_map + + def _postprocessing_by_level(self, nA, stride, head_out, anchor_vec): + boxes_shape = head_out.shape # [nB, nA*6, nGh, nGw] + nGh, nGw = boxes_shape[-2], boxes_shape[-1] + nB = 1 # TODO: only support bs=1 now + boxes_list, scores_list = [], [] + for idx in range(nB): + p = paddle.reshape( + head_out[idx], shape=[nA, self.num_classes + 5, nGh, nGw] + ) + p = paddle.transpose(p, perm=[0, 2, 3, 1]) # [nA, nGh, nGw, 6] + delta_map = p[:, :, :, :4] + boxes = self.decode_delta_map(nA, nGh, nGw, delta_map, anchor_vec) + # [nA * nGh * nGw, 4] + boxes_list.append(boxes * stride) + + p_conf = paddle.transpose( + p[:, :, :, 4:6], perm=[3, 0, 1, 2] + ) # [2, nA, nGh, nGw] + p_conf = F.softmax(p_conf, axis=0)[1, :, :, :].unsqueeze( + -1 + ) # [nA, nGh, nGw, 1] + scores = paddle.reshape(p_conf, shape=[nA * nGh * nGw, 1]) + scores_list.append(scores) + + boxes_results = paddle.stack(boxes_list) + scores_results = paddle.stack(scores_list) + return boxes_results, scores_results + + def __call__(self, yolo_head_out, anchors): + bbox_pred_list = [] + for i, head_out in enumerate(yolo_head_out): + stride = self.downsample_ratio // 2**i + anc_w, anc_h = anchors[i][0::2], anchors[i][1::2] + anchor_vec = np.stack((anc_w, anc_h), axis=1) / stride + nA = len(anc_w) + boxes, scores = self._postprocessing_by_level( + nA, stride, head_out, anchor_vec + ) + 
bbox_pred_list.append(paddle.concat([boxes, scores], axis=-1)) + + yolo_boxes_scores = paddle.concat(bbox_pred_list, axis=1) + boxes_idx_over_conf_thr = paddle.nonzero( + yolo_boxes_scores[:, :, -1] > self.conf_thresh + ) + boxes_idx_over_conf_thr.stop_gradient = True + + return boxes_idx_over_conf_thr, yolo_boxes_scores + + +class MaskMatrixNMS(object): + """ + Matrix NMS for multi-class masks. + Args: + update_threshold (float): Updated threshold of categroy score in second time. + pre_nms_top_n (int): Number of total instance to be kept per image before NMS + post_nms_top_n (int): Number of total instance to be kept per image after NMS. + kernel (str): 'linear' or 'gaussian'. + sigma (float): std in gaussian method. + Input: + seg_preds (Variable): shape (n, h, w), segmentation feature maps + seg_masks (Variable): shape (n, h, w), segmentation feature maps + cate_labels (Variable): shape (n), mask labels in descending order + cate_scores (Variable): shape (n), mask scores in descending order + sum_masks (Variable): a float tensor of the sum of seg_masks + Returns: + Variable: cate_scores, tensors of shape (n) + """ + + def __init__( + self, + update_threshold=0.05, + pre_nms_top_n=500, + post_nms_top_n=100, + kernel="gaussian", + sigma=2.0, + ): + super(MaskMatrixNMS, self).__init__() + self.update_threshold = update_threshold + self.pre_nms_top_n = pre_nms_top_n + self.post_nms_top_n = post_nms_top_n + self.kernel = kernel + self.sigma = sigma + + def _sort_score(self, scores, top_num): + if scores.shape[0] > top_num: + return paddle.topk(scores, top_num)[1] + else: + return paddle.argsort(scores, descending=True) + + def __call__(self, seg_preds, seg_masks, cate_labels, cate_scores, sum_masks=None): + # sort and keep top nms_pre + sort_inds = self._sort_score(cate_scores, self.pre_nms_top_n) + seg_masks = paddle.gather(seg_masks, index=sort_inds) + seg_preds = paddle.gather(seg_preds, index=sort_inds) + sum_masks = paddle.gather(sum_masks, index=sort_inds) + 
cate_scores = paddle.gather(cate_scores, index=sort_inds) + cate_labels = paddle.gather(cate_labels, index=sort_inds) + + seg_masks = paddle.flatten(seg_masks, start_axis=1, stop_axis=-1) + # inter. + inter_matrix = paddle.mm(seg_masks, paddle.transpose(seg_masks, [1, 0])) + n_samples = cate_labels.shape + n_samples = paddle.to_tensor(n_samples, dtype="int32") + # union. + sum_masks_x = paddle.expand(sum_masks, shape=[n_samples, n_samples]) + # iou. + iou_matrix = inter_matrix / ( + sum_masks_x + paddle.transpose(sum_masks_x, [1, 0]) - inter_matrix + ) + iou_matrix = paddle.triu(iou_matrix, diagonal=1) + # label_specific matrix. + cate_labels_x = paddle.expand(cate_labels, shape=[n_samples, n_samples]) + label_matrix = paddle.cast( + (cate_labels_x == paddle.transpose(cate_labels_x, [1, 0])), "float32" + ) + label_matrix = paddle.triu(label_matrix, diagonal=1) + + # IoU compensation + compensate_iou = paddle.max((iou_matrix * label_matrix), axis=0) + compensate_iou = paddle.expand(compensate_iou, shape=[n_samples, n_samples]) + compensate_iou = paddle.transpose(compensate_iou, [1, 0]) + + # IoU decay + decay_iou = iou_matrix * label_matrix + + # matrix nms + if self.kernel == "gaussian": + decay_matrix = paddle.exp(-1 * self.sigma * (decay_iou**2)) + compensate_matrix = paddle.exp(-1 * self.sigma * (compensate_iou**2)) + decay_coefficient = paddle.min(decay_matrix / compensate_matrix, axis=0) + elif self.kernel == "linear": + decay_matrix = (1 - decay_iou) / (1 - compensate_iou) + decay_coefficient = paddle.min(decay_matrix, axis=0) + else: + raise NotImplementedError + + # update the score. 
+ cate_scores = cate_scores * decay_coefficient + y = paddle.zeros(shape=cate_scores.shape, dtype="float32") + keep = paddle.where(cate_scores >= self.update_threshold, cate_scores, y) + keep = paddle.nonzero(keep) + keep = paddle.squeeze(keep, axis=[1]) + # Prevent empty and increase fake data + keep = paddle.concat( + [keep, paddle.cast(paddle.shape(cate_scores)[0:1] - 1, "int64")] + ) + + seg_preds = paddle.gather(seg_preds, index=keep) + cate_scores = paddle.gather(cate_scores, index=keep) + cate_labels = paddle.gather(cate_labels, index=keep) + + # sort and keep top_k + sort_inds = self._sort_score(cate_scores, self.post_nms_top_n) + seg_preds = paddle.gather(seg_preds, index=sort_inds) + cate_scores = paddle.gather(cate_scores, index=sort_inds) + cate_labels = paddle.gather(cate_labels, index=sort_inds) + return seg_preds, cate_scores, cate_labels + + +def Conv2d( + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=True, + weight_init=Normal(std=0.001), + bias_init=Constant(0.0), +): + weight_attr = paddle.framework.ParamAttr(initializer=weight_init) + if bias: + bias_attr = paddle.framework.ParamAttr(initializer=bias_init) + else: + bias_attr = False + conv = nn.Conv2D( + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + groups, + weight_attr=weight_attr, + bias_attr=bias_attr, + ) + return conv + + +def ConvTranspose2d( + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + output_padding=0, + groups=1, + bias=True, + dilation=1, + weight_init=Normal(std=0.001), + bias_init=Constant(0.0), +): + weight_attr = paddle.framework.ParamAttr(initializer=weight_init) + if bias: + bias_attr = paddle.framework.ParamAttr(initializer=bias_init) + else: + bias_attr = False + conv = nn.Conv2DTranspose( + in_channels, + out_channels, + kernel_size, + stride, + padding, + output_padding, + dilation, + groups, + weight_attr=weight_attr, + bias_attr=bias_attr, + ) + return conv 
+ + +def BatchNorm2d(num_features, eps=1e-05, momentum=0.9, affine=True): + if not affine: + weight_attr = False + bias_attr = False + else: + weight_attr = None + bias_attr = None + batchnorm = nn.BatchNorm2D( + num_features, momentum, eps, weight_attr=weight_attr, bias_attr=bias_attr + ) + return batchnorm + + +def ReLU(): + return nn.ReLU() + + +def Upsample(scale_factor=None, mode="nearest", align_corners=False): + return nn.Upsample(None, scale_factor, mode, align_corners) + + +def MaxPool(kernel_size, stride, padding, ceil_mode=False): + return nn.MaxPool2D(kernel_size, stride, padding, ceil_mode=ceil_mode) + + +class Concat(nn.Layer): + def __init__(self, dim=0): + super(Concat, self).__init__() + self.dim = dim + + def forward(self, inputs): + return paddle.concat(inputs, axis=self.dim) + + def extra_repr(self): + return "dim={}".format(self.dim) + + +def _convert_attention_mask(attn_mask, dtype): + """ + Convert the attention mask to the target dtype we expect. + Parameters: + attn_mask (Tensor, optional): A tensor used in multi-head attention + to prevents attention to some unwanted positions, usually the + paddings or the subsequent positions. It is a tensor with shape + broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`. + When the data type is bool, the unwanted positions have `False` + values and the others have `True` values. When the data type is + int, the unwanted positions have 0 values and the others have 1 + values. When the data type is float, the unwanted positions have + `-INF` values and the others have 0 values. It can be None when + nothing wanted or needed to be prevented attention to. Default None. + dtype (VarType): The target type of `attn_mask` we expect. + Returns: + Tensor: A Tensor with shape same as input `attn_mask`, with data type `dtype`. 
+ """ + return nn.layer.transformer._convert_attention_mask(attn_mask, dtype) + + +class MultiHeadAttention(nn.Layer): + """ + Attention mapps queries and a set of key-value pairs to outputs, and + Multi-Head Attention performs multiple parallel attention to jointly attending + to information from different representation subspaces. + + Please refer to `Attention Is All You Need `_ + for more details. + + Parameters: + embed_dim (int): The expected feature size in the input and output. + num_heads (int): The number of heads in multi-head attention. + dropout (float, optional): The dropout probability used on attention + weights to drop some attention targets. 0 for no dropout. Default 0 + kdim (int, optional): The feature size in key. If None, assumed equal to + `embed_dim`. Default None. + vdim (int, optional): The feature size in value. If None, assumed equal to + `embed_dim`. Default None. + need_weights (bool, optional): Indicate whether to return the attention + weights. Default False. + + Examples: + + .. 
code-block:: python + + import paddle + + # encoder input: [batch_size, sequence_length, d_model] + query = paddle.rand((2, 4, 128)) + # self attention mask: [batch_size, num_heads, query_len, query_len] + attn_mask = paddle.rand((2, 2, 4, 4)) + multi_head_attn = paddle.nn.MultiHeadAttention(128, 2) + output = multi_head_attn(query, None, None, attn_mask=attn_mask) # [2, 4, 128] + """ + + def __init__( + self, + embed_dim, + num_heads, + dropout=0.0, + kdim=None, + vdim=None, + need_weights=False, + ): + super(MultiHeadAttention, self).__init__() + self.embed_dim = embed_dim + self.kdim = kdim if kdim is not None else embed_dim + self.vdim = vdim if vdim is not None else embed_dim + self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim + + self.num_heads = num_heads + self.dropout = dropout + self.need_weights = need_weights + + self.head_dim = embed_dim // num_heads + assert ( + self.head_dim * num_heads == self.embed_dim + ), "embed_dim must be divisible by num_heads" + + if self._qkv_same_embed_dim: + self.in_proj_weight = self.create_parameter( + shape=[embed_dim, 3 * embed_dim], + attr=None, + dtype=self._dtype, + is_bias=False, + ) + self.in_proj_bias = self.create_parameter( + shape=[3 * embed_dim], attr=None, dtype=self._dtype, is_bias=True + ) + else: + self.q_proj = nn.Linear(embed_dim, embed_dim) + self.k_proj = nn.Linear(self.kdim, embed_dim) + self.v_proj = nn.Linear(self.vdim, embed_dim) + + self.out_proj = nn.Linear(embed_dim, embed_dim) + self._type_list = ("q_proj", "k_proj", "v_proj") + + self._reset_parameters() + + def _reset_parameters(self): + for p in self.parameters(): + if p.dim() > 1: + xavier_uniform_(p) + else: + constant_(p) + + def compute_qkv(self, tensor, index): + if self._qkv_same_embed_dim: + tensor = F.linear( + x=tensor, + weight=self.in_proj_weight[ + :, index * self.embed_dim : (index + 1) * self.embed_dim + ], + bias=( + self.in_proj_bias[ + index * self.embed_dim : (index + 1) * self.embed_dim + ] + if 
self.in_proj_bias is not None + else None + ), + ) + else: + tensor = getattr(self, self._type_list[index])(tensor) + tensor = tensor.reshape([0, 0, self.num_heads, self.head_dim]).transpose( + [0, 2, 1, 3] + ) + return tensor + + def forward(self, query, key=None, value=None, attn_mask=None): + r""" + Applies multi-head attention to map queries and a set of key-value pairs + to outputs. + + Parameters: + query (Tensor): The queries for multi-head attention. It is a + tensor with shape `[batch_size, query_length, embed_dim]`. The + data type should be float32 or float64. + key (Tensor, optional): The keys for multi-head attention. It is + a tensor with shape `[batch_size, key_length, kdim]`. The + data type should be float32 or float64. If None, use `query` as + `key`. Default None. + value (Tensor, optional): The values for multi-head attention. It + is a tensor with shape `[batch_size, value_length, vdim]`. + The data type should be float32 or float64. If None, use `query` as + `value`. Default None. + attn_mask (Tensor, optional): A tensor used in multi-head attention + to prevents attention to some unwanted positions, usually the + paddings or the subsequent positions. It is a tensor with shape + broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`. + When the data type is bool, the unwanted positions have `False` + values and the others have `True` values. When the data type is + int, the unwanted positions have 0 values and the others have 1 + values. When the data type is float, the unwanted positions have + `-INF` values and the others have 0 values. It can be None when + nothing wanted or needed to be prevented attention to. Default None. + + Returns: + Tensor|tuple: It is a tensor that has the same shape and data type \ + as `query`, representing attention output. Or a tuple if \ + `need_weights` is True or `cache` is not None. 
If `need_weights` \ + is True, except for attention output, the tuple also includes \ + the attention weights tensor shaped `[batch_size, num_heads, query_length, key_length]`. \ + If `cache` is not None, the tuple then includes the new cache \ + having the same type as `cache`, and if it is `StaticCache`, it \ + is same as the input `cache`, if it is `Cache`, the new cache \ + reserves tensors concatanating raw tensors with intermediate \ + results of current query. + """ + key = query if key is None else key + value = query if value is None else value + # compute q ,k ,v + q, k, v = (self.compute_qkv(t, i) for i, t in enumerate([query, key, value])) + + # scale dot product attention + product = paddle.matmul(x=q, y=k, transpose_y=True) + scaling = float(self.head_dim) ** -0.5 + product = product * scaling + + if attn_mask is not None: + # Support bool or int mask + attn_mask = _convert_attention_mask(attn_mask, product.dtype) + product = product + attn_mask + weights = F.softmax(product) + if self.dropout: + weights = F.dropout( + weights, self.dropout, training=self.training, mode="upscale_in_train" + ) + out = paddle.matmul(weights, v) + + # combine heads + out = paddle.transpose(out, perm=[0, 2, 1, 3]) + out = paddle.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]]) + + # project to output + out = self.out_proj(out) + + outs = [out] + if self.need_weights: + outs.append(weights) + return out if len(outs) == 1 else tuple(outs) + + +class ConvMixer(nn.Layer): + def __init__( + self, + dim, + depth, + kernel_size=3, + ): + super().__init__() + self.dim = dim + self.depth = depth + self.kernel_size = kernel_size + + self.mixer = self.conv_mixer(dim, depth, kernel_size) + + def forward(self, x): + return self.mixer(x) + + @staticmethod + def conv_mixer( + dim, + depth, + kernel_size, + ): + Seq, ActBn = nn.Sequential, lambda x: Seq(x, nn.GELU(), nn.BatchNorm2D(dim)) + Residual = type("Residual", (Seq,), {"forward": lambda self, x: self[0](x) + x}) + return 
Seq( + *[ + Seq( + Residual( + ActBn( + nn.Conv2D(dim, dim, kernel_size, groups=dim, padding="same") + ) + ), + ActBn(nn.Conv2D(dim, dim, 1)), + ) + for i in range(depth) + ] + ) diff --git a/paddlex/inference/models/object_detection/modeling/rtdetrl_modules/modules/matchers.py b/paddlex/inference/models/object_detection/modeling/rtdetrl_modules/modules/matchers.py new file mode 100644 index 0000000000..7ded61b064 --- /dev/null +++ b/paddlex/inference/models/object_detection/modeling/rtdetrl_modules/modules/matchers.py @@ -0,0 +1,275 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Modified from DETR (https://github.com/facebookresearch/detr) +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +from __future__ import absolute_import, division, print_function + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from scipy.optimize import linear_sum_assignment + +from .utils import bbox_cxcywh_to_xyxy + +__all__ = ["HungarianMatcher"] + + +class GIoULoss(object): + """ + Generalized Intersection over Union, see https://arxiv.org/abs/1902.09630 + Args: + loss_weight (float): giou loss weight, default as 1 + eps (float): epsilon to avoid divide by zero, default as 1e-10 + reduction (string): Options are "none", "mean" and "sum". 
default as none + """ + + def __init__(self, loss_weight=1.0, eps=1e-10, reduction="none"): + self.loss_weight = loss_weight + self.eps = eps + assert reduction in ("none", "mean", "sum") + self.reduction = reduction + + def bbox_overlap(self, box1, box2, eps=1e-10): + """calculate the iou of box1 and box2 + Args: + box1 (Tensor): box1 with the shape (..., 4) + box2 (Tensor): box1 with the shape (..., 4) + eps (float): epsilon to avoid divide by zero + Return: + iou (Tensor): iou of box1 and box2 + overlap (Tensor): overlap of box1 and box2 + union (Tensor): union of box1 and box2 + """ + x1, y1, x2, y2 = box1 + x1g, y1g, x2g, y2g = box2 + + xkis1 = paddle.maximum(x1, x1g) + ykis1 = paddle.maximum(y1, y1g) + xkis2 = paddle.minimum(x2, x2g) + ykis2 = paddle.minimum(y2, y2g) + w_inter = (xkis2 - xkis1).clip(0) + h_inter = (ykis2 - ykis1).clip(0) + overlap = w_inter * h_inter + + area1 = (x2 - x1) * (y2 - y1) + area2 = (x2g - x1g) * (y2g - y1g) + union = area1 + area2 - overlap + eps + iou = overlap / union + + return iou, overlap, union + + def __call__(self, pbox, gbox, iou_weight=1.0, loc_reweight=None): + x1, y1, x2, y2 = paddle.split(pbox, num_or_sections=4, axis=-1) + x1g, y1g, x2g, y2g = paddle.split(gbox, num_or_sections=4, axis=-1) + box1 = [x1, y1, x2, y2] + box2 = [x1g, y1g, x2g, y2g] + iou, overlap, union = self.bbox_overlap(box1, box2, self.eps) + xc1 = paddle.minimum(x1, x1g) + yc1 = paddle.minimum(y1, y1g) + xc2 = paddle.maximum(x2, x2g) + yc2 = paddle.maximum(y2, y2g) + + area_c = (xc2 - xc1) * (yc2 - yc1) + self.eps + miou = iou - ((area_c - union) / area_c) + if loc_reweight is not None: + loc_reweight = paddle.reshape(loc_reweight, shape=(-1, 1)) + loc_thresh = 0.9 + giou = 1 - (1 - loc_thresh) * miou - loc_thresh * miou * loc_reweight + else: + giou = 1 - miou + if self.reduction == "none": + loss = giou + elif self.reduction == "sum": + loss = paddle.sum(giou * iou_weight) + else: + loss = paddle.mean(giou * iou_weight) + return loss * 
self.loss_weight + + +class HungarianMatcher(nn.Layer): + __shared__ = ["use_focal_loss", "with_mask", "num_sample_points"] + + def __init__( + self, + matcher_coeff={"class": 1, "bbox": 5, "giou": 2, "mask": 1, "dice": 1}, + use_focal_loss=False, + with_mask=False, + num_sample_points=12544, + alpha=0.25, + gamma=2.0, + ): + r""" + Args: + matcher_coeff (dict): The coefficient of hungarian matcher cost. + """ + super(HungarianMatcher, self).__init__() + self.matcher_coeff = matcher_coeff + self.use_focal_loss = use_focal_loss + self.with_mask = with_mask + self.num_sample_points = num_sample_points + self.alpha = alpha + self.gamma = gamma + + self.giou_loss = GIoULoss() + + def forward(self, boxes, logits, gt_bbox, gt_class, masks=None, gt_mask=None): + r""" + Args: + boxes (Tensor): [b, query, 4] + logits (Tensor): [b, query, num_classes] + gt_bbox (List(Tensor)): list[[n, 4]] + gt_class (List(Tensor)): list[[n, 1]] + masks (Tensor|None): [b, query, h, w] + gt_mask (List(Tensor)): list[[n, H, W]] + + Returns: + A list of size batch_size, containing tuples of (index_i, index_j) where: + - index_i is the indices of the selected predictions (in order) + - index_j is the indices of the corresponding selected targets (in order) + For each batch element, it holds: + len(index_i) = len(index_j) = min(num_queries, num_target_boxes) + """ + bs, num_queries = boxes.shape[:2] + + num_gts = [len(a) for a in gt_class] + if sum(num_gts) == 0: + return [ + ( + paddle.to_tensor([], dtype=paddle.int64), + paddle.to_tensor([], dtype=paddle.int64), + ) + for _ in range(bs) + ] + + # We flatten to compute the cost matrices in a batch + # [batch_size * num_queries, num_classes] + logits = logits.detach() + out_prob = ( + F.sigmoid(logits.flatten(0, 1)) + if self.use_focal_loss + else F.softmax(logits.flatten(0, 1)) + ) + # [batch_size * num_queries, 4] + out_bbox = boxes.detach().flatten(0, 1) + + # Also concat the target labels and boxes + if "npu" in paddle.device.get_device(): + 
gt_class = [tensor.to(paddle.int32) for tensor in gt_class]
+
+        tgt_ids = paddle.concat(gt_class).flatten()
+        tgt_bbox = paddle.concat(gt_bbox)
+
+        # Compute the classification cost
+        out_prob = paddle.gather(out_prob, tgt_ids, axis=1)
+        if self.use_focal_loss:
+            neg_cost_class = (
+                (1 - self.alpha)
+                * (out_prob**self.gamma)
+                * (-(1 - out_prob + 1e-8).log())
+            )
+            pos_cost_class = (
+                self.alpha * ((1 - out_prob) ** self.gamma) * (-(out_prob + 1e-8).log())
+            )
+            cost_class = pos_cost_class - neg_cost_class
+        else:
+            cost_class = -out_prob
+
+        # Compute the L1 cost between boxes
+        cost_bbox = (out_bbox.unsqueeze(1) - tgt_bbox.unsqueeze(0)).abs().sum(-1)
+
+        # Compute the giou cost between boxes
+        giou_loss = self.giou_loss(
+            bbox_cxcywh_to_xyxy(out_bbox.unsqueeze(1)),
+            bbox_cxcywh_to_xyxy(tgt_bbox.unsqueeze(0)),
+        ).squeeze(-1)
+        cost_giou = giou_loss - 1
+
+        # Final cost matrix
+        C = (
+            self.matcher_coeff["class"] * cost_class
+            + self.matcher_coeff["bbox"] * cost_bbox
+            + self.matcher_coeff["giou"] * cost_giou
+        )
+        # Compute the mask cost and dice cost
+        if self.with_mask:
+            assert (
+                masks is not None
+                and gt_mask is not None
+            ), "Make sure the input has `mask` and `gt_mask`"
+            # all masks share the same set of points for efficient matching
+            sample_points = paddle.rand([bs, 1, self.num_sample_points, 2])
+            sample_points = 2.0 * sample_points - 1.0
+
+            out_mask = F.grid_sample(
+                masks.detach(), sample_points, align_corners=False
+            ).squeeze(-2)
+            out_mask = out_mask.flatten(0, 1)
+
+            tgt_mask = paddle.concat(gt_mask).unsqueeze(1)
+            sample_points = paddle.concat(
+                [a.tile([b, 1, 1, 1]) for a, b in zip(sample_points, num_gts) if b > 0]
+            )
+            tgt_mask = F.grid_sample(
+                tgt_mask, sample_points, align_corners=False
+            ).squeeze([1, 2])
+
+            with paddle.amp.auto_cast(enable=False):
+                # binary cross entropy cost
+                pos_cost_mask = F.binary_cross_entropy_with_logits(
+                    out_mask, paddle.ones_like(out_mask), reduction="none"
+                )
+                neg_cost_mask = 
F.binary_cross_entropy_with_logits( + out_mask, paddle.zeros_like(out_mask), reduction="none" + ) + cost_mask = paddle.matmul( + pos_cost_mask, tgt_mask, transpose_y=True + ) + paddle.matmul(neg_cost_mask, 1 - tgt_mask, transpose_y=True) + cost_mask /= self.num_sample_points + + # dice cost + out_mask = F.sigmoid(out_mask) + numerator = 2 * paddle.matmul(out_mask, tgt_mask, transpose_y=True) + denominator = out_mask.sum(-1, keepdim=True) + tgt_mask.sum( + -1 + ).unsqueeze(0) + cost_dice = 1 - (numerator + 1) / (denominator + 1) + + C = ( + C + + self.matcher_coeff["mask"] * cost_mask + + self.matcher_coeff["dice"] * cost_dice + ) + + C = C.reshape([bs, num_queries, -1]) + C = [a.squeeze(0) for a in C.chunk(bs)] + sizes = [a.shape[0] for a in gt_bbox] + if hasattr(paddle.Tensor, "contiguous"): + indices = [ + linear_sum_assignment(c.split(sizes, -1)[i].contiguous().numpy()) + for i, c in enumerate(C) + ] + else: + indices = [ + linear_sum_assignment(c.split(sizes, -1)[i].numpy()) + for i, c in enumerate(C) + ] + return [ + ( + paddle.to_tensor(i, dtype=paddle.int64), + paddle.to_tensor(j, dtype=paddle.int64), + ) + for i, j in indices + ] diff --git a/paddlex/inference/models/object_detection/modeling/rtdetrl_modules/modules/ops.py b/paddlex/inference/models/object_detection/modeling/rtdetrl_modules/modules/ops.py new file mode 100644 index 0000000000..c58834943c --- /dev/null +++ b/paddlex/inference/models/object_detection/modeling/rtdetrl_modules/modules/ops.py @@ -0,0 +1,1193 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddle import ParamAttr, in_dynamic_mode
+from paddle.common_ops_import import (
+    LayerHelper,
+    Variable,
+    check_type,
+    check_variable_and_dtype,
+)
+from paddle.regularizer import L2Decay
+
+try:
+    import paddle._legacy_C_ops as C_ops
+except:
+    import paddle._C_ops as C_ops
+
+try:
+    from paddle.framework import in_dynamic_or_pir_mode
+
+    HAVE_PIR = True
+except:
+    HAVE_PIR = False
+
+
+__all__ = [
+    "prior_box",
+    "generate_proposals",
+    "box_coder",
+    "multiclass_nms",
+    "distribute_fpn_proposals",
+    "matrix_nms",
+    "batch_norm",
+    "mish",
+    "silu",
+    "swish",
+    "identity",
+    "anchor_generator",
+]
+
+
+def identity(x):
+    return x
+
+
+def mish(x):
+    return F.mish(x) if hasattr(F, "mish") else x * F.tanh(F.softplus(x))
+
+
+def silu(x):
+    return F.silu(x)
+
+
+def swish(x):
+    return x * F.sigmoid(x)
+
+
+TRT_ACT_SPEC = {"swish": swish, "silu": swish}
+
+ACT_SPEC = {"mish": mish, "silu": silu}
+
+
+def get_act_fn(act=None, trt=False):
+    assert act is None or isinstance(
+        act, (str, dict)
+    ), "name of activation should be str, dict or None"
+    if not act:
+        return identity
+
+    if isinstance(act, dict):
+        name = act["name"]
+        act.pop("name")
+        kwargs = act
+    else:
+        name = act
+        kwargs = dict()
+
+    if trt and name in TRT_ACT_SPEC:
+        fn = TRT_ACT_SPEC[name]
+    elif name in ACT_SPEC:
+        fn = ACT_SPEC[name]
+    else:
+        fn = getattr(F, name)
+
+    return lambda x: fn(x, **kwargs)
+
+
+def batch_norm(
+    ch,
+    norm_type="bn",
+    norm_decay=0.0,
+    freeze_norm=False,
+    initializer=None,
+ data_format="NCHW", +): + + norm_lr = 0.0 if freeze_norm else 1.0 + weight_attr = ParamAttr( + initializer=initializer, + learning_rate=norm_lr, + regularizer=L2Decay(norm_decay), + trainable=False if freeze_norm else True, + ) + bias_attr = ParamAttr( + learning_rate=norm_lr, + regularizer=L2Decay(norm_decay), + trainable=False if freeze_norm else True, + ) + + if norm_type in ["sync_bn", "bn"]: + norm_layer = nn.BatchNorm2D( + ch, weight_attr=weight_attr, bias_attr=bias_attr, data_format=data_format + ) + + norm_params = norm_layer.parameters() + if freeze_norm: + for param in norm_params: + param.stop_gradient = True + + return norm_layer + + +@paddle.jit.not_to_static +def anchor_generator( + input, + anchor_sizes=None, + aspect_ratios=None, + variance=[0.1, 0.1, 0.2, 0.2], + stride=None, + offset=0.5, +): + """ + **Anchor generator operator** + Generate anchors for Faster RCNN algorithm. + Each position of the input produce N anchors, N = + size(anchor_sizes) * size(aspect_ratios). The order of generated anchors + is firstly aspect_ratios loop then anchor_sizes loop. + Args: + input(Variable): 4-D Tensor with shape [N,C,H,W]. The input feature map. + anchor_sizes(float32|list|tuple, optional): The anchor sizes of generated + anchors, given in absolute pixels e.g. [64., 128., 256., 512.]. + For instance, the anchor size of 64 means the area of this anchor + equals to 64**2. None by default. + aspect_ratios(float32|list|tuple, optional): The height / width ratios + of generated anchors, e.g. [0.5, 1.0, 2.0]. None by default. + variance(list|tuple, optional): The variances to be used in box + regression deltas. The data type is float32, [0.1, 0.1, 0.2, 0.2] by + default. + stride(list|tuple, optional): The anchors stride across width and height. + The data type is float32. e.g. [16.0, 16.0]. None by default. + offset(float32, optional): Prior boxes center offset. 0.5 by default. 
+ Returns: + Tuple: + Anchors(Variable): The output anchors with a layout of [H, W, num_anchors, 4]. + H is the height of input, W is the width of input, + num_anchors is the box count of each position. + Each anchor is in (xmin, ymin, xmax, ymax) format an unnormalized. + + Variances(Variable): The expanded variances of anchors + with a layout of [H, W, num_priors, 4]. + H is the height of input, W is the width of input + num_anchors is the box count of each position. + Each variance is in (xcenter, ycenter, w, h) format. + Examples: + .. code-block:: python + import paddle.fluid as fluid + conv1 = fluid.data(name='conv1', shape=[None, 48, 16, 16], dtype='float32') + anchor, var = fluid.layers.anchor_generator( + input=conv1, + anchor_sizes=[64, 128, 256, 512], + aspect_ratios=[0.5, 1.0, 2.0], + variance=[0.1, 0.1, 0.2, 0.2], + stride=[16.0, 16.0], + offset=0.5) + """ + + def _is_list_or_tuple_(data): + return isinstance(data, list) or isinstance(data, tuple) + + if not _is_list_or_tuple_(anchor_sizes): + anchor_sizes = [anchor_sizes] + if not _is_list_or_tuple_(aspect_ratios): + aspect_ratios = [aspect_ratios] + if not (_is_list_or_tuple_(stride) and len(stride) == 2): + raise ValueError( + "stride should be a list or tuple ", + "with length 2, (stride_width, stride_height).", + ) + + anchor_sizes = list(map(float, anchor_sizes)) + aspect_ratios = list(map(float, aspect_ratios)) + stride = list(map(float, stride)) + + if in_dynamic_mode(): + attrs = ( + "anchor_sizes", + anchor_sizes, + "aspect_ratios", + aspect_ratios, + "variances", + variance, + "stride", + stride, + "offset", + offset, + ) + anchor, var = C_ops.anchor_generator(input, *attrs) + return anchor, var + + helper = LayerHelper("anchor_generator", **locals()) + dtype = helper.input_dtype() + attrs = { + "anchor_sizes": anchor_sizes, + "aspect_ratios": aspect_ratios, + "variances": variance, + "stride": stride, + "offset": offset, + } + + anchor = helper.create_variable_for_type_inference(dtype) + 
var = helper.create_variable_for_type_inference(dtype) + helper.append_op( + type="anchor_generator", + inputs={"Input": input}, + outputs={"Anchors": anchor, "Variances": var}, + attrs=attrs, + ) + anchor.stop_gradient = True + var.stop_gradient = True + return anchor, var + + +@paddle.jit.not_to_static +def distribute_fpn_proposals( + fpn_rois, + min_level, + max_level, + refer_level, + refer_scale, + pixel_offset=False, + rois_num=None, + name=None, +): + r""" + + **This op only takes LoDTensor as input.** In Feature Pyramid Networks + (FPN) models, it is needed to distribute all proposals into different FPN + level, with respect to scale of the proposals, the referring scale and the + referring level. Besides, to restore the order of proposals, we return an + array which indicates the original index of rois in current proposals. + To compute FPN level for each roi, the formula is given as follows: + + .. math:: + + roi\_scale &= \sqrt{BBoxArea(fpn\_roi)} + + level = floor(&\log(\\frac{roi\_scale}{refer\_scale}) + refer\_level) + + where BBoxArea is a function to compute the area of each roi. + + Args: + + fpn_rois(Variable): 2-D Tensor with shape [N, 4] and data type is + float32 or float64. The input fpn_rois. + min_level(int32): The lowest level of FPN layer where the proposals come + from. + max_level(int32): The highest level of FPN layer where the proposals + come from. + refer_level(int32): The referring level of FPN layer with specified scale. + refer_scale(int32): The referring scale of FPN layer with specified level. + rois_num(Tensor): 1-D Tensor contains the number of RoIs in each image. + The shape is [B] and data type is int32. B is the number of images. + If it is not None then return a list of 1-D Tensor. Each element + is the output RoIs' number of each image on the corresponding level + and the shape is [B]. None by default. + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. 
Usually name is no need to set and + None by default. + + Returns: + Tuple: + + multi_rois(List) : A list of 2-D LoDTensor with shape [M, 4] + and data type of float32 and float64. The length is + max_level-min_level+1. The proposals in each FPN level. + + restore_ind(Variable): A 2-D Tensor with shape [N, 1], N is + the number of total rois. The data type is int32. It is + used to restore the order of fpn_rois. + + rois_num_per_level(List): A list of 1-D Tensor and each Tensor is + the RoIs' number in each image on the corresponding level. The shape + is [B] and data type of int32. B is the number of images + + + Examples: + .. code-block:: python + + import paddle + from ppdet.modeling import ops + paddle.enable_static() + fpn_rois = paddle.static.data( + name='data', shape=[None, 4], dtype='float32', lod_level=1) + multi_rois, restore_ind = ops.distribute_fpn_proposals( + fpn_rois=fpn_rois, + min_level=2, + max_level=5, + refer_level=4, + refer_scale=224) + """ + num_lvl = max_level - min_level + 1 + + if in_dynamic_mode(): + assert rois_num is not None, "rois_num should not be None in dygraph mode." 
+ attrs = ( + "min_level", + min_level, + "max_level", + max_level, + "refer_level", + refer_level, + "refer_scale", + refer_scale, + "pixel_offset", + pixel_offset, + ) + multi_rois, restore_ind, rois_num_per_level = C_ops.distribute_fpn_proposals( + fpn_rois, rois_num, num_lvl, num_lvl, *attrs + ) + + return multi_rois, restore_ind, rois_num_per_level + + else: + check_variable_and_dtype( + fpn_rois, "fpn_rois", ["float32", "float64"], "distribute_fpn_proposals" + ) + helper = LayerHelper("distribute_fpn_proposals", **locals()) + dtype = helper.input_dtype("fpn_rois") + multi_rois = [ + helper.create_variable_for_type_inference(dtype) for i in range(num_lvl) + ] + + restore_ind = helper.create_variable_for_type_inference(dtype="int32") + + inputs = {"FpnRois": fpn_rois} + outputs = { + "MultiFpnRois": multi_rois, + "RestoreIndex": restore_ind, + } + + if rois_num is not None: + inputs["RoisNum"] = rois_num + rois_num_per_level = [ + helper.create_variable_for_type_inference(dtype="int32") + for i in range(num_lvl) + ] + outputs["MultiLevelRoIsNum"] = rois_num_per_level + else: + rois_num_per_level = None + + helper.append_op( + type="distribute_fpn_proposals", + inputs=inputs, + outputs=outputs, + attrs={ + "min_level": min_level, + "max_level": max_level, + "refer_level": refer_level, + "refer_scale": refer_scale, + "pixel_offset": pixel_offset, + }, + ) + return multi_rois, restore_ind, rois_num_per_level + + +@paddle.jit.not_to_static +def prior_box( + input, + image, + min_sizes, + max_sizes=None, + aspect_ratios=[1.0], + variance=[0.1, 0.1, 0.2, 0.2], + flip=False, + clip=False, + steps=[0.0, 0.0], + offset=0.5, + min_max_aspect_ratios_order=False, + name=None, +): + """ + + This op generates prior boxes for SSD(Single Shot MultiBox Detector) algorithm. 
+ Each position of the input produce N prior boxes, N is determined by + the count of min_sizes, max_sizes and aspect_ratios, The size of the + box is in range(min_size, max_size) interval, which is generated in + sequence according to the aspect_ratios. + + Parameters: + input(Tensor): 4-D tensor(NCHW), the data type should be float32 or float64. + image(Tensor): 4-D tensor(NCHW), the input image data of PriorBoxOp, + the data type should be float32 or float64. + min_sizes(list|tuple|float): the min sizes of generated prior boxes. + max_sizes(list|tuple|None): the max sizes of generated prior boxes. + Default: None. + aspect_ratios(list|tuple|float): the aspect ratios of generated + prior boxes. Default: [1.]. + variance(list|tuple): the variances to be encoded in prior boxes. + Default:[0.1, 0.1, 0.2, 0.2]. + flip(bool): Whether to flip aspect ratios. Default:False. + clip(bool): Whether to clip out-of-boundary boxes. Default: False. + step(list|tuple): Prior boxes step across width and height, If + step[0] equals to 0.0 or step[1] equals to 0.0, the prior boxes step across + height or weight of the input will be automatically calculated. + Default: [0., 0.] + offset(float): Prior boxes center offset. Default: 0.5 + min_max_aspect_ratios_order(bool): If set True, the output prior box is + in order of [min, max, aspect_ratios], which is consistent with + Caffe. Please note, this order affects the weights order of + convolution layer followed by and does not affect the final + detection results. Default: False. + name(str, optional): The default value is None. Normally there is no need for + user to set this property. For more information, please refer to :ref:`api_guide_Name` + + Returns: + Tuple: A tuple with two Variable (boxes, variances) + + boxes(Tensor): the output prior boxes of PriorBox. + 4-D tensor, the layout is [H, W, num_priors, 4]. + H is the height of input, W is the width of input, + num_priors is the total box count of each position of input. 
+ + variances(Tensor): the expanded variances of PriorBox. + 4-D tensor, the layput is [H, W, num_priors, 4]. + H is the height of input, W is the width of input + num_priors is the total box count of each position of input + + Examples: + .. code-block:: python + + import paddle + from ppdet.modeling import ops + + paddle.enable_static() + input = paddle.static.data(name="input", shape=[None,3,6,9]) + image = paddle.static.data(name="image", shape=[None,3,9,12]) + box, var = ops.prior_box( + input=input, + image=image, + min_sizes=[100.], + clip=True, + flip=True) + """ + return paddle.vision.ops.prior_box( + input, + image, + min_sizes, + max_sizes, + aspect_ratios, + variance, + flip, + clip, + steps, + offset, + min_max_aspect_ratios_order, + name, + ) + + +@paddle.jit.not_to_static +def multiclass_nms( + bboxes, + scores, + score_threshold, + nms_top_k, + keep_top_k, + nms_threshold=0.3, + normalized=True, + nms_eta=1.0, + background_label=-1, + return_index=False, + return_rois_num=True, + rois_num=None, + name=None, +): + """ + This operator is to do multi-class non maximum suppression (NMS) on + boxes and scores. + In the NMS step, this operator greedily selects a subset of detection bounding + boxes that have high scores larger than score_threshold, if providing this + threshold, then selects the largest nms_top_k confidences scores if nms_top_k + is larger than -1. Then this operator pruns away boxes that have high IOU + (intersection over union) overlap with already selected boxes by adaptive + threshold NMS based on parameters of nms_threshold and nms_eta. + Aftern NMS step, at most keep_top_k number of total bboxes are to be kept + per image if keep_top_k is larger than -1. + Args: + bboxes (Tensor): Two types of bboxes are supported: + 1. (Tensor) A 3-D Tensor with shape + [N, M, 4 or 8 16 24 32] represents the + predicted locations of M bounding bboxes, + N is the batch size. 
Each bounding box has four + coordinate values and the layout is + [xmin, ymin, xmax, ymax], when box size equals to 4. + 2. (LoDTensor) A 3-D Tensor with shape [M, C, 4] + M is the number of bounding boxes, C is the + class number + scores (Tensor): Two types of scores are supported: + 1. (Tensor) A 3-D Tensor with shape [N, C, M] + represents the predicted confidence predictions. + N is the batch size, C is the class number, M is + number of bounding boxes. For each category there + are total M scores which corresponding M bounding + boxes. Please note, M is equal to the 2nd dimension + of BBoxes. + 2. (LoDTensor) A 2-D LoDTensor with shape [M, C]. + M is the number of bbox, C is the class number. + In this case, input BBoxes should be the second + case with shape [M, C, 4]. + background_label (int): The index of background label, the background + label will be ignored. If set to -1, then all + categories will be considered. Default: 0 + score_threshold (float): Threshold to filter out bounding boxes with + low confidence score. If not provided, + consider all boxes. + nms_top_k (int): Maximum number of detections to be kept according to + the confidences after the filtering detections based + on score_threshold. + nms_threshold (float): The threshold to be used in NMS. Default: 0.3 + nms_eta (float): The threshold to be used in NMS. Default: 1.0 + keep_top_k (int): Number of total bboxes to be kept per image after NMS + step. -1 means keeping all bboxes after NMS step. + normalized (bool): Whether detections are normalized. Default: True + return_index(bool): Whether return selected index. Default: False + rois_num(Tensor): 1-D Tensor contains the number of RoIs in each image. + The shape is [B] and data type is int32. B is the number of images. + If it is not None then return a list of 1-D Tensor. Each element + is the output RoIs' number of each image on the corresponding level + and the shape is [B]. None by default. 
+ name(str): Name of the multiclass nms op. Default: None. + Returns: + A tuple with two Variables: (Out, Index) if return_index is True, + otherwise, a tuple with one Variable(Out) is returned. + Out: A 2-D LoDTensor with shape [No, 6] represents the detections. + Each row has 6 values: [label, confidence, xmin, ymin, xmax, ymax] + or A 2-D LoDTensor with shape [No, 10] represents the detections. + Each row has 10 values: [label, confidence, x1, y1, x2, y2, x3, y3, + x4, y4]. No is the total number of detections. + If all images have not detected results, all elements in LoD will be + 0, and output tensor is empty (None). + Index: Only return when return_index is True. A 2-D LoDTensor with + shape [No, 1] represents the selected index which type is Integer. + The index is the absolute value cross batches. No is the same number + as Out. If the index is used to gather other attribute such as age, + one needs to reshape the input(N, M, 1) to (N * M, 1) as first, where + N is the batch size and M is the number of boxes. + Examples: + .. 
code-block:: python + + import paddle + from ppdet.modeling import ops + boxes = paddle.static.data(name='bboxes', shape=[81, 4], + dtype='float32', lod_level=1) + scores = paddle.static.data(name='scores', shape=[81], + dtype='float32', lod_level=1) + out, index = ops.multiclass_nms(bboxes=boxes, + scores=scores, + background_label=0, + score_threshold=0.5, + nms_top_k=400, + nms_threshold=0.3, + keep_top_k=200, + normalized=False, + return_index=True) + """ + helper = LayerHelper("multiclass_nms3", **locals()) + + if HAVE_PIR and in_dynamic_or_pir_mode(): + # https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/phi/ops/yaml/ops.yaml#L3175 + attrs = ( + score_threshold, + nms_top_k, + keep_top_k, + nms_threshold, + normalized, + nms_eta, + background_label, + ) + output, index, nms_rois_num = paddle._C_ops.multiclass_nms3( + bboxes, scores, rois_num, *attrs + ) + + if not return_index: + index = None + return output, nms_rois_num, index + + elif in_dynamic_mode(): + attrs = ( + "background_label", + background_label, + "score_threshold", + score_threshold, + "nms_top_k", + nms_top_k, + "nms_threshold", + nms_threshold, + "keep_top_k", + keep_top_k, + "nms_eta", + nms_eta, + "normalized", + normalized, + ) + output, index, nms_rois_num = C_ops.multiclass_nms3( + bboxes, scores, rois_num, *attrs + ) + if not return_index: + index = None + return output, nms_rois_num, index + + else: + output = helper.create_variable_for_type_inference(dtype=bboxes.dtype) + index = helper.create_variable_for_type_inference(dtype="int32") + + inputs = {"BBoxes": bboxes, "Scores": scores} + outputs = {"Out": output, "Index": index} + + if rois_num is not None: + inputs["RoisNum"] = rois_num + + if return_rois_num: + nms_rois_num = helper.create_variable_for_type_inference(dtype="int32") + outputs["NmsRoisNum"] = nms_rois_num + + helper.append_op( + type="multiclass_nms3", + inputs=inputs, + attrs={ + "background_label": background_label, + "score_threshold": score_threshold, + 
"nms_top_k": nms_top_k, + "nms_threshold": nms_threshold, + "keep_top_k": keep_top_k, + "nms_eta": nms_eta, + "normalized": normalized, + }, + outputs=outputs, + ) + output.stop_gradient = True + index.stop_gradient = True + if not return_index: + index = None + if not return_rois_num: + nms_rois_num = None + + return output, nms_rois_num, index + + +@paddle.jit.not_to_static +def matrix_nms( + bboxes, + scores, + score_threshold, + post_threshold, + nms_top_k, + keep_top_k, + use_gaussian=False, + gaussian_sigma=2.0, + background_label=0, + normalized=True, + return_index=False, + return_rois_num=True, + name=None, +): + """ + **Matrix NMS** + This operator does matrix non maximum suppression (NMS). + First selects a subset of candidate bounding boxes that have higher scores + than score_threshold (if provided), then the top k candidate is selected if + nms_top_k is larger than -1. Score of the remaining candidate are then + decayed according to the Matrix NMS scheme. + Aftern NMS step, at most keep_top_k number of total bboxes are to be kept + per image if keep_top_k is larger than -1. + Args: + bboxes (Tensor): A 3-D Tensor with shape [N, M, 4] represents the + predicted locations of M bounding bboxes, + N is the batch size. Each bounding box has four + coordinate values and the layout is + [xmin, ymin, xmax, ymax], when box size equals to 4. + The data type is float32 or float64. + scores (Tensor): A 3-D Tensor with shape [N, C, M] + represents the predicted confidence predictions. + N is the batch size, C is the class number, M is + number of bounding boxes. For each category there + are total M scores which corresponding M bounding + boxes. Please note, M is equal to the 2nd dimension + of BBoxes. The data type is float32 or float64. + score_threshold (float): Threshold to filter out bounding boxes with + low confidence score. + post_threshold (float): Threshold to filter out bounding boxes with + low confidence score AFTER decaying. 
+ nms_top_k (int): Maximum number of detections to be kept according to + the confidences after the filtering detections based + on score_threshold. + keep_top_k (int): Number of total bboxes to be kept per image after NMS + step. -1 means keeping all bboxes after NMS step. + use_gaussian (bool): Use Gaussian as the decay function. Default: False + gaussian_sigma (float): Sigma for Gaussian decay function. Default: 2.0 + background_label (int): The index of background label, the background + label will be ignored. If set to -1, then all + categories will be considered. Default: 0 + normalized (bool): Whether detections are normalized. Default: True + return_index(bool): Whether return selected index. Default: False + return_rois_num(bool): whether return rois_num. Default: True + name(str): Name of the matrix nms op. Default: None. + Returns: + A tuple with three Tensor: (Out, Index, RoisNum) if return_index is True, + otherwise, a tuple with two Tensor (Out, RoisNum) is returned. + Out (Tensor): A 2-D Tensor with shape [No, 6] containing the + detection results. + Each row has 6 values: [label, confidence, xmin, ymin, xmax, ymax] + (After version 1.3, when no boxes detected, the lod is changed + from {0} to {1}) + Index (Tensor): A 2-D Tensor with shape [No, 1] containing the + selected indices, which are absolute values cross batches. + rois_num (Tensor): A 1-D Tensor with shape [N] containing + the number of detected boxes in each image. + Examples: + .. 
code-block:: python + import paddle + from ppdet.modeling import ops + boxes = paddle.static.data(name='bboxes', shape=[None,81, 4], + dtype='float32', lod_level=1) + scores = paddle.static.data(name='scores', shape=[None,81], + dtype='float32', lod_level=1) + out = ops.matrix_nms(bboxes=boxes, scores=scores, background_label=0, + score_threshold=0.5, post_threshold=0.1, + nms_top_k=400, keep_top_k=200, normalized=False) + """ + check_variable_and_dtype(bboxes, "BBoxes", ["float32", "float64"], "matrix_nms") + check_variable_and_dtype(scores, "Scores", ["float32", "float64"], "matrix_nms") + check_type(score_threshold, "score_threshold", float, "matrix_nms") + check_type(post_threshold, "post_threshold", float, "matrix_nms") + check_type(nms_top_k, "nums_top_k", int, "matrix_nms") + check_type(keep_top_k, "keep_top_k", int, "matrix_nms") + check_type(normalized, "normalized", bool, "matrix_nms") + check_type(use_gaussian, "use_gaussian", bool, "matrix_nms") + check_type(gaussian_sigma, "gaussian_sigma", float, "matrix_nms") + check_type(background_label, "background_label", int, "matrix_nms") + + if in_dynamic_mode(): + attrs = ( + "background_label", + background_label, + "score_threshold", + score_threshold, + "post_threshold", + post_threshold, + "nms_top_k", + nms_top_k, + "gaussian_sigma", + gaussian_sigma, + "use_gaussian", + use_gaussian, + "keep_top_k", + keep_top_k, + "normalized", + normalized, + ) + out, index, rois_num = C_ops.matrix_nms(bboxes, scores, *attrs) + if not return_index: + index = None + if not return_rois_num: + rois_num = None + return out, rois_num, index + else: + helper = LayerHelper("matrix_nms", **locals()) + output = helper.create_variable_for_type_inference(dtype=bboxes.dtype) + index = helper.create_variable_for_type_inference(dtype="int32") + outputs = {"Out": output, "Index": index} + if return_rois_num: + rois_num = helper.create_variable_for_type_inference(dtype="int32") + outputs["RoisNum"] = rois_num + + helper.append_op( + 
type="matrix_nms", + inputs={"BBoxes": bboxes, "Scores": scores}, + attrs={ + "background_label": background_label, + "score_threshold": score_threshold, + "post_threshold": post_threshold, + "nms_top_k": nms_top_k, + "gaussian_sigma": gaussian_sigma, + "use_gaussian": use_gaussian, + "keep_top_k": keep_top_k, + "normalized": normalized, + }, + outputs=outputs, + ) + output.stop_gradient = True + + if not return_index: + index = None + if not return_rois_num: + rois_num = None + return output, rois_num, index + + +@paddle.jit.not_to_static +def box_coder( + prior_box, + prior_box_var, + target_box, + code_type="encode_center_size", + box_normalized=True, + axis=0, + name=None, +): + r""" + **Box Coder Layer** + Encode/Decode the target bounding box with the priorbox information. + + The Encoding schema described below: + .. math:: + ox = (tx - px) / pw / pxv + oy = (ty - py) / ph / pyv + ow = \log(\abs(tw / pw)) / pwv + oh = \log(\abs(th / ph)) / phv + The Decoding schema described below: + + .. math:: + + ox = (pw * pxv * tx * + px) - tw / 2 + oy = (ph * pyv * ty * + py) - th / 2 + ow = \exp(pwv * tw) * pw + tw / 2 + oh = \exp(phv * th) * ph + th / 2 + where `tx`, `ty`, `tw`, `th` denote the target box's center coordinates, + width and height respectively. Similarly, `px`, `py`, `pw`, `ph` denote + the priorbox's (anchor) center coordinates, width and height. `pxv`, + `pyv`, `pwv`, `phv` denote the variance of the priorbox and `ox`, `oy`, + `ow`, `oh` denote the encoded/decoded coordinates, width and height. + During Box Decoding, two modes for broadcast are supported. Say target + box has shape [N, M, 4], and the shape of prior box can be [N, 4] or + [M, 4]. Then prior box will broadcast to target box along the + assigned axis. + + Args: + prior_box(Tensor): Box list prior_box is a 2-D Tensor with shape + [M, 4] holds M boxes and data type is float32 or float64. 
Each box + is represented as [xmin, ymin, xmax, ymax], [xmin, ymin] is the + left top coordinate of the anchor box, if the input is image feature + map, they are close to the origin of the coordinate system. + [xmax, ymax] is the right bottom coordinate of the anchor box. + prior_box_var(List|Tensor|None): prior_box_var supports three types + of input. One is Tensor with shape [M, 4] which holds M group and + data type is float32 or float64. The second is list consist of + 4 elements shared by all boxes and data type is float32 or float64. + Other is None and not involved in calculation. + target_box(Tensor): This input can be a 2-D LoDTensor with shape + [N, 4] when code_type is 'encode_center_size'. This input also can + be a 3-D Tensor with shape [N, M, 4] when code_type is + 'decode_center_size'. Each box is represented as + [xmin, ymin, xmax, ymax]. The data type is float32 or float64. + code_type(str): The code type used with the target box. It can be + `encode_center_size` or `decode_center_size`. `encode_center_size` + by default. + box_normalized(bool): Whether treat the priorbox as a normalized box. + Set true by default. + axis(int): Which axis in PriorBox to broadcast for box decode, + for example, if axis is 0 and TargetBox has shape [N, M, 4] and + PriorBox has shape [M, 4], then PriorBox will broadcast to [N, M, 4] + for decoding. It is only valid when code type is + `decode_center_size`. Set 0 by default. + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. + + Returns: + Tensor: + output_box(Tensor): When code_type is 'encode_center_size', the + output tensor of box_coder_op with shape [N, M, 4] representing the + result of N target boxes encoded with M Prior boxes and variances. + When code_type is 'decode_center_size', N represents the batch size + and M represents the number of decoded boxes. + + Examples: + + .. 
code-block:: python + + import paddle + from ppdet.modeling import ops + paddle.enable_static() + # For encode + prior_box_encode = paddle.static.data(name='prior_box_encode', + shape=[512, 4], + dtype='float32') + target_box_encode = paddle.static.data(name='target_box_encode', + shape=[81, 4], + dtype='float32') + output_encode = ops.box_coder(prior_box=prior_box_encode, + prior_box_var=[0.1,0.1,0.2,0.2], + target_box=target_box_encode, + code_type="encode_center_size") + # For decode + prior_box_decode = paddle.static.data(name='prior_box_decode', + shape=[512, 4], + dtype='float32') + target_box_decode = paddle.static.data(name='target_box_decode', + shape=[512, 81, 4], + dtype='float32') + output_decode = ops.box_coder(prior_box=prior_box_decode, + prior_box_var=[0.1,0.1,0.2,0.2], + target_box=target_box_decode, + code_type="decode_center_size", + box_normalized=False, + axis=1) + """ + check_variable_and_dtype( + prior_box, "prior_box", ["float32", "float64"], "box_coder" + ) + check_variable_and_dtype( + target_box, "target_box", ["float32", "float64"], "box_coder" + ) + + if in_dynamic_mode(): + if isinstance(prior_box_var, Variable): + output_box = C_ops.box_coder( + prior_box, + prior_box_var, + target_box, + "code_type", + code_type, + "box_normalized", + box_normalized, + "axis", + axis, + ) + + elif isinstance(prior_box_var, list): + output_box = C_ops.box_coder( + prior_box, + None, + target_box, + "code_type", + code_type, + "box_normalized", + box_normalized, + "axis", + axis, + "variance", + prior_box_var, + ) + else: + raise TypeError("Input variance of box_coder must be Variable or list") + return output_box + else: + helper = LayerHelper("box_coder", **locals()) + + output_box = helper.create_variable_for_type_inference(dtype=prior_box.dtype) + + inputs = {"PriorBox": prior_box, "TargetBox": target_box} + attrs = {"code_type": code_type, "box_normalized": box_normalized, "axis": axis} + if isinstance(prior_box_var, Variable): + 
inputs["PriorBoxVar"] = prior_box_var + elif isinstance(prior_box_var, list): + attrs["variance"] = prior_box_var + else: + raise TypeError("Input variance of box_coder must be Variable or list") + helper.append_op( + type="box_coder", + inputs=inputs, + attrs=attrs, + outputs={"OutputBox": output_box}, + ) + return output_box + + +@paddle.jit.not_to_static +def generate_proposals( + scores, + bbox_deltas, + im_shape, + anchors, + variances, + pre_nms_top_n=6000, + post_nms_top_n=1000, + nms_thresh=0.5, + min_size=0.1, + eta=1.0, + pixel_offset=False, + return_rois_num=False, + name=None, +): + """ + **Generate proposal Faster-RCNN** + This operation proposes RoIs according to each box with their + probability to be a foreground object and + the box can be calculated by anchors. Bbox_deltais and scores + to be an object are the output of RPN. Final proposals + could be used to train detection net. + For generating proposals, this operation performs following steps: + 1. Transposes and resizes scores and bbox_deltas in size of + (H*W*A, 1) and (H*W*A, 4) + 2. Calculate box locations as proposals candidates. + 3. Clip boxes to image + 4. Remove predicted boxes with small area. + 5. Apply NMS to get final proposals as output. + Args: + scores(Tensor): A 4-D Tensor with shape [N, A, H, W] represents + the probability for each box to be an object. + N is batch size, A is number of anchors, H and W are height and + width of the feature map. The data type must be float32. + bbox_deltas(Tensor): A 4-D Tensor with shape [N, 4*A, H, W] + represents the difference between predicted box location and + anchor location. The data type must be float32. + im_shape(Tensor): A 2-D Tensor with shape [N, 2] represents H, W, the + origin image size or input size. The data type can be float32 or + float64. + anchors(Tensor): A 4-D Tensor represents the anchors with a layout + of [H, W, A, 4]. 
H and W are height and width of the feature map, + num_anchors is the box count of each position. Each anchor is + in (xmin, ymin, xmax, ymax) format an unnormalized. The data type must be float32. + variances(Tensor): A 4-D Tensor. The expanded variances of anchors with a layout of + [H, W, num_priors, 4]. Each variance is in + (xcenter, ycenter, w, h) format. The data type must be float32. + pre_nms_top_n(float): Number of total bboxes to be kept per + image before NMS. The data type must be float32. `6000` by default. + post_nms_top_n(float): Number of total bboxes to be kept per + image after NMS. The data type must be float32. `1000` by default. + nms_thresh(float): Threshold in NMS. The data type must be float32. `0.5` by default. + min_size(float): Remove predicted boxes with either height or + width < min_size. The data type must be float32. `0.1` by default. + eta(float): Apply in adaptive NMS, if adaptive `threshold > 0.5`, + `adaptive_threshold = adaptive_threshold * eta` in each iteration. + return_rois_num(bool): When setting True, it will return a 1D Tensor with shape [N, ] that includes Rois's + num of each image in one batch. The N is the image's num. For example, the tensor has values [4,5] that represents + the first image has 4 Rois, the second image has 5 Rois. It only used in rcnn model. + 'False' by default. + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. + + Returns: + tuple: + A tuple with format ``(rpn_rois, rpn_roi_probs)``. + - **rpn_rois**: The generated RoIs. 2-D Tensor with shape ``[N, 4]`` while ``N`` is the number of RoIs. The data type is the same as ``scores``. + - **rpn_roi_probs**: The scores of generated RoIs. 2-D Tensor with shape ``[N, 1]`` while ``N`` is the number of RoIs. The data type is the same as ``scores``. + + Examples: + .. 
code-block:: python + + import paddle + from ppdet.modeling import ops + paddle.enable_static() + scores = paddle.static.data(name='scores', shape=[None, 4, 5, 5], dtype='float32') + bbox_deltas = paddle.static.data(name='bbox_deltas', shape=[None, 16, 5, 5], dtype='float32') + im_shape = paddle.static.data(name='im_shape', shape=[None, 2], dtype='float32') + anchors = paddle.static.data(name='anchors', shape=[None, 5, 4, 4], dtype='float32') + variances = paddle.static.data(name='variances', shape=[None, 5, 10, 4], dtype='float32') + rois, roi_probs = ops.generate_proposals(scores, bbox_deltas, + im_shape, anchors, variances) + """ + if in_dynamic_mode(): + assert return_rois_num, "return_rois_num should be True in dygraph mode." + attrs = ( + "pre_nms_topN", + pre_nms_top_n, + "post_nms_topN", + post_nms_top_n, + "nms_thresh", + nms_thresh, + "min_size", + min_size, + "eta", + eta, + "pixel_offset", + pixel_offset, + ) + rpn_rois, rpn_roi_probs, rpn_rois_num = C_ops.generate_proposals_v2( + scores, bbox_deltas, im_shape, anchors, variances, *attrs + ) + if not return_rois_num: + rpn_rois_num = None + return rpn_rois, rpn_roi_probs, rpn_rois_num + + else: + helper = LayerHelper("generate_proposals_v2", **locals()) + + check_variable_and_dtype(scores, "scores", ["float32"], "generate_proposals_v2") + check_variable_and_dtype( + bbox_deltas, "bbox_deltas", ["float32"], "generate_proposals_v2" + ) + check_variable_and_dtype( + im_shape, "im_shape", ["float32", "float64"], "generate_proposals_v2" + ) + check_variable_and_dtype( + anchors, "anchors", ["float32"], "generate_proposals_v2" + ) + check_variable_and_dtype( + variances, "variances", ["float32"], "generate_proposals_v2" + ) + + rpn_rois = helper.create_variable_for_type_inference(dtype=bbox_deltas.dtype) + rpn_roi_probs = helper.create_variable_for_type_inference(dtype=scores.dtype) + outputs = { + "RpnRois": rpn_rois, + "RpnRoiProbs": rpn_roi_probs, + } + if return_rois_num: + rpn_rois_num = 
helper.create_variable_for_type_inference(dtype="int32") + rpn_rois_num.stop_gradient = True + outputs["RpnRoisNum"] = rpn_rois_num + + helper.append_op( + type="generate_proposals_v2", + inputs={ + "Scores": scores, + "BboxDeltas": bbox_deltas, + "ImShape": im_shape, + "Anchors": anchors, + "Variances": variances, + }, + attrs={ + "pre_nms_topN": pre_nms_top_n, + "post_nms_topN": post_nms_top_n, + "nms_thresh": nms_thresh, + "min_size": min_size, + "eta": eta, + "pixel_offset": pixel_offset, + }, + outputs=outputs, + ) + rpn_rois.stop_gradient = True + rpn_roi_probs.stop_gradient = True + if not return_rois_num: + rpn_rois_num = None + + return rpn_rois, rpn_roi_probs, rpn_rois_num + + +def sigmoid_cross_entropy_with_logits(input, label, ignore_index=-100, normalize=False): + output = F.binary_cross_entropy_with_logits(input, label, reduction="none") + mask_tensor = paddle.cast(label != ignore_index, "float32") + output = paddle.multiply(output, mask_tensor) + if normalize: + sum_valid_mask = paddle.sum(mask_tensor) + output = output / sum_valid_mask + return output + + +def smooth_l1(input, label, inside_weight=None, outside_weight=None, sigma=None): + input_new = paddle.multiply(input, inside_weight) + label_new = paddle.multiply(label, inside_weight) + delta = 1 / (sigma * sigma) + out = F.smooth_l1_loss(input_new, label_new, reduction="none", delta=delta) + out = paddle.multiply(out, outside_weight) + out = out / delta + out = paddle.reshape(out, shape=[out.shape[0], -1]) + out = paddle.sum(out, axis=1) + return out + + +def channel_shuffle(x, groups): + batch_size, num_channels, height, width = x.shape[0:4] + assert num_channels % groups == 0, "num_channels should be divisible by groups" + channels_per_group = num_channels // groups + x = paddle.reshape( + x=x, shape=[batch_size, groups, channels_per_group, height, width] + ) + x = paddle.transpose(x=x, perm=[0, 2, 1, 3, 4]) + x = paddle.reshape(x=x, shape=[batch_size, num_channels, height, width]) + return x 
+ + +def get_static_shape(tensor): + shape = paddle.shape(tensor) + shape.stop_gradient = True + return shape diff --git a/paddlex/inference/models/object_detection/modeling/rtdetrl_modules/modules/position_encoding.py b/paddlex/inference/models/object_detection/modeling/rtdetrl_modules/modules/position_encoding.py new file mode 100644 index 0000000000..442b1b0cf2 --- /dev/null +++ b/paddlex/inference/models/object_detection/modeling/rtdetrl_modules/modules/position_encoding.py @@ -0,0 +1,103 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Modified from DETR (https://github.com/facebookresearch/detr) +# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved + +from __future__ import absolute_import, division, print_function + +import math + +import paddle +import paddle.nn as nn + + +class PositionEmbedding(nn.Layer): + def __init__( + self, + num_pos_feats=128, + temperature=10000, + normalize=True, + scale=2 * math.pi, + embed_type="sine", + num_embeddings=50, + offset=0.0, + eps=1e-6, + ): + super(PositionEmbedding, self).__init__() + assert embed_type in ["sine", "learned"] + + self.embed_type = embed_type + self.offset = offset + self.eps = eps + if self.embed_type == "sine": + self.num_pos_feats = num_pos_feats + self.temperature = temperature + self.normalize = normalize + self.scale = scale + elif self.embed_type == "learned": + self.row_embed = nn.Embedding(num_embeddings, num_pos_feats) + self.col_embed = nn.Embedding(num_embeddings, num_pos_feats) + else: + raise ValueError(f"{self.embed_type} is not supported.") + + def forward(self, mask): + """ + Args: + mask (Tensor): [B, H, W] + Returns: + pos (Tensor): [B, H, W, C] + """ + if self.embed_type == "sine": + y_embed = mask.cumsum(1) + x_embed = mask.cumsum(2) + if self.normalize: + y_embed = ( + (y_embed + self.offset) + / (y_embed[:, -1:, :] + self.eps) + * self.scale + ) + x_embed = ( + (x_embed + self.offset) + / (x_embed[:, :, -1:] + self.eps) + * self.scale + ) + + dim_t = 2 * (paddle.arange(self.num_pos_feats) // 2).astype("float32") + dim_t = self.temperature ** (dim_t / self.num_pos_feats) + + pos_x = x_embed.unsqueeze(-1) / dim_t + pos_y = y_embed.unsqueeze(-1) / dim_t + pos_x = paddle.stack( + (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), axis=4 + ).flatten(3) + pos_y = paddle.stack( + (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), axis=4 + ).flatten(3) + return paddle.concat((pos_y, pos_x), axis=3) + elif self.embed_type == "learned": + h, w = mask.shape[-2:] + i = paddle.arange(w) + j = paddle.arange(h) + x_emb = self.col_embed(i) + y_emb = self.row_embed(j) + return paddle.concat( + [ + 
x_emb.unsqueeze(0).tile([h, 1, 1]), + y_emb.unsqueeze(1).tile([1, w, 1]), + ], + axis=-1, + ).unsqueeze(0) + else: + raise ValueError(f"not supported {self.embed_type}") diff --git a/paddlex/inference/models/object_detection/modeling/rtdetrl_modules/modules/utils.py b/paddlex/inference/models/object_detection/modeling/rtdetrl_modules/modules/utils.py new file mode 100644 index 0000000000..b16db76adc --- /dev/null +++ b/paddlex/inference/models/object_detection/modeling/rtdetrl_modules/modules/utils.py @@ -0,0 +1,544 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Modified from DETR (https://github.com/facebookresearch/detr) +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +# Modified from detrex (https://github.com/IDEA-Research/detrex) +# Copyright 2022 The IDEA Authors. All rights reserved. 
+ +from __future__ import absolute_import, division, print_function + +import copy +import math + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +__all__ = [ + "_get_clones", + "bbox_cxcywh_to_xyxy", + "bbox_xyxy_to_cxcywh", + "sigmoid_focal_loss", + "inverse_sigmoid", + "deformable_attention_core_func", + "varifocal_loss_with_logits", + "mal_loss_with_logits", +] + + +def _get_clones(module, N): + return nn.LayerList([copy.deepcopy(module) for _ in range(N)]) + + +def bbox_cxcywh_to_xyxy(x): + cxcy, wh = paddle.split(x, 2, axis=-1) + return paddle.concat([cxcy - 0.5 * wh, cxcy + 0.5 * wh], axis=-1) + + +def bbox_xyxy_to_cxcywh(x): + x1, y1, x2, y2 = x.split(4, axis=-1) + return paddle.concat([(x1 + x2) / 2, (y1 + y2) / 2, (x2 - x1), (y2 - y1)], axis=-1) + + +def sigmoid_focal_loss(logit, label, normalizer=1.0, alpha=0.25, gamma=2.0): + prob = F.sigmoid(logit) + ce_loss = F.binary_cross_entropy_with_logits(logit, label, reduction="none") + p_t = prob * label + (1 - prob) * (1 - label) + loss = ce_loss * ((1 - p_t) ** gamma) + + if alpha >= 0: + alpha_t = alpha * label + (1 - alpha) * (1 - label) + loss = alpha_t * loss + return loss.mean(1).sum() / normalizer + + +def inverse_sigmoid(x, eps=1e-5): + x = x.clip(min=0.0, max=1.0) + return paddle.log(x.clip(min=eps) / (1 - x).clip(min=eps)) + + +def deformable_attention_core_func( + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, +): + """ + Args: + value (Tensor): [bs, value_length, n_head, c] + value_spatial_shapes (Tensor|List): [n_levels, 2] + value_level_start_index (Tensor|List): [n_levels] + sampling_locations (Tensor): [bs, query_length, n_head, n_levels, n_points, 2] + attention_weights (Tensor): [bs, query_length, n_head, n_levels, n_points] + + Returns: + output (Tensor): [bs, Length_{query}, C] + """ + bs, _, n_head, c = value.shape + _, Len_q, _, n_levels, n_points, _ = sampling_locations.shape + + split_shape = [h * w for h, 
w in value_spatial_shapes] + value_list = value.split(split_shape, axis=1) + sampling_grids = 2 * sampling_locations - 1 + sampling_value_list = [] + for level, (h, w) in enumerate(value_spatial_shapes): + # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_ + value_l_ = ( + value_list[level] + .flatten(2) + .transpose([0, 2, 1]) + .reshape([bs * n_head, c, h, w]) + ) + # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2 + sampling_grid_l_ = ( + sampling_grids[:, :, :, level].transpose([0, 2, 1, 3, 4]).flatten(0, 1) + ) + # N_*M_, D_, Lq_, P_ + sampling_value_l_ = F.grid_sample( + value_l_, + sampling_grid_l_, + mode="bilinear", + padding_mode="zeros", + align_corners=False, + ) + sampling_value_list.append(sampling_value_l_) + # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_*M_, 1, Lq_, L_*P_) + attention_weights = attention_weights.transpose([0, 2, 1, 3, 4]).reshape( + [bs * n_head, 1, Len_q, n_levels * n_points] + ) + output = ( + (paddle.stack(sampling_value_list, axis=-2).flatten(-2) * attention_weights) + .sum(-1) + .reshape([bs, n_head * c, Len_q]) + ) + + return output.transpose([0, 2, 1]) + + +def discrete_sample(x, grid): + """ + Args: + x (Tensor): [N, C, H, W] + grid (Tensor): [N, grid_H, grid_W, 2] + Returns: + output (Tensor): [N, C, grid_H, grid_W] + """ + N, C, H, W = x.shape + _, grid_H, grid_W, _ = grid.shape + spatial_shape = paddle.to_tensor([[W, H]], dtype=paddle.float32) + index = (grid * spatial_shape + 0.5).astype(paddle.int64).flatten(1, 2) + h_index = index[:, :, 1].clip(0, H - 1) + w_index = index[:, :, 0].clip(0, W - 1) + batch_index = paddle.arange(N).unsqueeze(-1).tile([1, grid_H * grid_W]) + output = x[batch_index, :, h_index, w_index] + output = output.transpose([0, 2, 1]).reshape([N, C, grid_H, grid_W]) + return output + + +def deformable_attention_core_func_v2( + value, + value_spatial_shapes, + sampling_locations, + attention_weights, + num_points_list, + sampling_method="default", 
+): + """ + Args: + value (Tensor): [batch_num, value_len, num_heads, head_dim] + value_spatial_shapes (Tensor|List): [n_levels, 2] + sampling_locations (Tensor): [batch_num, query_len, num_heads, total_num_points, 2] + attention_weights (Tensor): [batch_num, query_len, num_heads, total_num_points] + num_points_list (List): The number of sampling point corresponding to each level + sampling_method (str): default(grid_sample) or discrete(discrete_sample) + + Returns: + output (Tensor): [batch_num, query_len, num_heads * head_dim] + """ + assert sampling_method in ["default", "discrete"], NotImplementedError + batch_num, _, num_heads, head_dim = value.shape + query_len = sampling_locations.shape[1] + num_levels = len(num_points_list) + + value = value.transpose([0, 2, 3, 1]).flatten(0, 1) + split_shape = [h * w for h, w in value_spatial_shapes] + value_list = value.split(split_shape, axis=-1) + value_list = [ + value.reshape([-1, head_dim, h, w]) + for value, (h, w) in zip(value_list, value_spatial_shapes) + ] + + if sampling_method == "default": + sampling_grids = 2 * sampling_locations - 1 + else: + sampling_grids = sampling_locations + + sampling_grids = sampling_grids.transpose([0, 2, 1, 3, 4]).flatten(0, 1) + sampling_grids_list = sampling_grids.split(num_points_list, axis=-2) + + sampling_value_list = [] + for idx in range(num_levels): + # value_list[idx]: [batch_num * num_heads, head_dim, h, w] + # sampling_grids_list[idx]: [batch_num * num_heads, query_len, num_points, 2] + # _sampling_value: [batch_num * num_heads, head_dim, query_len, num_points] + if sampling_method == "default": + _sampling_value = F.grid_sample( + value_list[idx], + sampling_grids_list[idx], + mode="bilinear", + padding_mode="zeros", + align_corners=False, + ) + else: + _sampling_value = discrete_sample(value_list[idx], sampling_grids_list[idx]) + sampling_value_list.append(_sampling_value) + + attn_weights = attention_weights.transpose([0, 2, 1, 3]) + attn_weights = 
attn_weights.flatten(0, 1).unsqueeze(1) + sampling_value = paddle.concat(sampling_value_list, axis=-1) + # attn_weights: [batch_num * num_heads, 1, query_len, total_num_points] + # sampling_value: [batch_num * num_heads, head_dim, query_len, total_num_points] + # output: [batch_num * num_heads, head_dim, query_len] + output = (sampling_value * attn_weights).sum(-1) + output = output.reshape([batch_num, num_heads * head_dim, query_len]) + return output.transpose([0, 2, 1]) + + +def get_valid_ratio(mask): + _, H, W = mask.shape + valid_ratio_h = paddle.sum(mask[:, :, 0], 1) / H + valid_ratio_w = paddle.sum(mask[:, 0, :], 1) / W + # [b, 2] + return paddle.stack([valid_ratio_w, valid_ratio_h], -1) + + +def get_denoising_training_group( + targets, + num_classes, + num_queries, + class_embed, + num_denoising=100, + label_noise_ratio=0.5, + box_noise_scale=1.0, +): + if num_denoising <= 0: + return None, None, None, None + num_gts = [len(t) for t in targets["gt_class"]] + max_gt_num = max(num_gts) + if max_gt_num == 0: + return None, None, None, None + + num_group = num_denoising // max_gt_num + num_group = 1 if num_group == 0 else num_group + # pad gt to max_num of a batch + bs = len(targets["gt_class"]) + input_query_class = paddle.full([bs, max_gt_num], num_classes, dtype="int32") + input_query_bbox = paddle.zeros([bs, max_gt_num, 4]) + pad_gt_mask = paddle.zeros([bs, max_gt_num]) + for i in range(bs): + num_gt = num_gts[i] + if num_gt > 0: + input_query_class[i, :num_gt] = targets["gt_class"][i].squeeze(-1) + input_query_bbox[i, :num_gt] = targets["gt_bbox"][i] + pad_gt_mask[i, :num_gt] = 1 + + input_query_class = input_query_class.tile([1, num_group]) + input_query_bbox = input_query_bbox.tile([1, num_group, 1]) + pad_gt_mask = pad_gt_mask.tile([1, num_group]) + + dn_positive_idx = paddle.nonzero(pad_gt_mask)[:, 1] + dn_positive_idx = paddle.split(dn_positive_idx, [n * num_group for n in num_gts]) + # total denoising queries + num_denoising = int(max_gt_num * 
num_group) + + if label_noise_ratio > 0: + input_query_class = paddle.assign(input_query_class.flatten()) + pad_gt_mask = paddle.assign(pad_gt_mask.flatten()) + # half of bbox prob, cast mask from bool to float bacause dtype promotaion + # between bool and float is not supported in static mode. + mask = paddle.cast( + paddle.rand(input_query_class.shape) < (label_noise_ratio * 0.5), + paddle.float32, + ) + chosen_idx = paddle.nonzero(mask * pad_gt_mask).squeeze(-1) + # randomly put a new one here + new_label = paddle.randint_like( + chosen_idx, 0, num_classes, dtype=input_query_class.dtype + ) + input_query_class.scatter_(chosen_idx, new_label) + input_query_class.reshape_([bs, num_denoising]) + pad_gt_mask.reshape_([bs, num_denoising]) + + if box_noise_scale > 0: + diff = ( + paddle.concat( + [input_query_bbox[..., 2:] * 0.5, input_query_bbox[..., 2:]], axis=-1 + ) + * box_noise_scale + ) + diff *= paddle.rand(input_query_bbox.shape) * 2.0 - 1.0 + input_query_bbox += diff + input_query_bbox = inverse_sigmoid(input_query_bbox) + + class_embed = paddle.concat([class_embed, paddle.zeros([1, class_embed.shape[-1]])]) + input_query_class = paddle.gather( + class_embed, input_query_class.flatten(), axis=0 + ).reshape([bs, num_denoising, -1]) + + tgt_size = num_denoising + num_queries + attn_mask = paddle.ones([tgt_size, tgt_size]) < 0 + # match query cannot see the reconstruction + attn_mask[num_denoising:, :num_denoising] = True + # reconstruct cannot see each other + for i in range(num_group): + if i == 0: + attn_mask[ + max_gt_num * i : max_gt_num * (i + 1), + max_gt_num * (i + 1) : num_denoising, + ] = True + if i == num_group - 1: + attn_mask[max_gt_num * i : max_gt_num * (i + 1), : max_gt_num * i] = True + else: + attn_mask[ + max_gt_num * i : max_gt_num * (i + 1), + max_gt_num * (i + 1) : num_denoising, + ] = True + attn_mask[max_gt_num * i : max_gt_num * (i + 1), : max_gt_num * i] = True + attn_mask = ~attn_mask + dn_meta = { + "dn_positive_idx": 
dn_positive_idx, + "dn_num_group": num_group, + "dn_num_split": [num_denoising, num_queries], + } + + return input_query_class, input_query_bbox, attn_mask, dn_meta + + +def get_contrastive_denoising_training_group( + targets, + num_classes, + num_queries, + class_embed, + num_denoising=100, + label_noise_ratio=0.5, + box_noise_scale=1.0, +): + if num_denoising <= 0: + return None, None, None, None + # listcomp is not well-supported in SOT mode for now. + num_gts = [] + for t in targets["gt_class"]: + num_gts.append(len(t)) + max_gt_num = max(num_gts) + if max_gt_num == 0: + return None, None, None, None + + num_group = num_denoising // max_gt_num + num_group = 1 if num_group == 0 else num_group + # pad gt to max_num of a batch + bs = len(targets["gt_class"]) + input_query_class = paddle.full([bs, max_gt_num], num_classes, dtype="int32") + input_query_bbox = paddle.zeros([bs, max_gt_num, 4]) + pad_gt_mask = paddle.zeros([bs, max_gt_num]) + for i in range(bs): + num_gt = num_gts[i] + if num_gt > 0: + input_query_class[i, :num_gt] = targets["gt_class"][i].squeeze(-1) + input_query_bbox[i, :num_gt] = targets["gt_bbox"][i] + pad_gt_mask[i, :num_gt] = 1 + # each group has positive and negative queries. 
+ input_query_class = input_query_class.tile([1, 2 * num_group]) + input_query_bbox = input_query_bbox.tile([1, 2 * num_group, 1]) + pad_gt_mask = pad_gt_mask.tile([1, 2 * num_group]) + # positive and negative mask + negative_gt_mask = paddle.zeros([bs, max_gt_num * 2, 1]) + negative_gt_mask[:, max_gt_num:] = 1 + negative_gt_mask = negative_gt_mask.tile([1, num_group, 1]) + positive_gt_mask = 1 - negative_gt_mask + # contrastive denoising training positive index + positive_gt_mask = positive_gt_mask.squeeze(-1) * pad_gt_mask + dn_positive_idx = paddle.nonzero(positive_gt_mask)[:, 1] + dn_positive_idx = paddle.split(dn_positive_idx, [n * num_group for n in num_gts]) + # total denoising queries + num_denoising = int(max_gt_num * 2 * num_group) + + if label_noise_ratio > 0: + input_query_class = paddle.assign(input_query_class.flatten()) + pad_gt_mask = paddle.assign(pad_gt_mask.flatten()) + # half of bbox prob + mask = paddle.rand(input_query_class.shape) < (label_noise_ratio * 0.5) + chosen_idx = paddle.nonzero(mask.cast(pad_gt_mask.dtype) * pad_gt_mask).squeeze( + -1 + ) + # randomly put a new one here + new_label = paddle.randint_like( + chosen_idx, 0, num_classes, dtype=input_query_class.dtype + ) + input_query_class.scatter_(chosen_idx, new_label) + input_query_class.reshape_([bs, num_denoising]) + pad_gt_mask.reshape_([bs, num_denoising]) + + if box_noise_scale > 0: + known_bbox = bbox_cxcywh_to_xyxy(input_query_bbox) + + diff = paddle.tile(input_query_bbox[..., 2:] * 0.5, [1, 1, 2]) * box_noise_scale + + rand_sign = paddle.randint_like(input_query_bbox, 0, 2) * 2.0 - 1.0 + rand_part = paddle.rand(input_query_bbox.shape) + rand_part = (rand_part + 1.0) * negative_gt_mask + rand_part * ( + 1 - negative_gt_mask + ) + rand_part *= rand_sign + known_bbox += rand_part * diff + known_bbox.clip_(min=0.0, max=1.0) + input_query_bbox = bbox_xyxy_to_cxcywh(known_bbox) + input_query_bbox = inverse_sigmoid(input_query_bbox) + + class_embed = paddle.concat([class_embed, 
paddle.zeros([1, class_embed.shape[-1]])]) + input_query_class = paddle.gather( + class_embed, input_query_class.flatten(), axis=0 + ).reshape([bs, num_denoising, -1]) + + tgt_size = num_denoising + num_queries + attn_mask = paddle.ones([tgt_size, tgt_size]) < 0 + # match query cannot see the reconstruction + attn_mask[num_denoising:, :num_denoising] = True + # reconstruct cannot see each other + for i in range(num_group): + if i == 0: + attn_mask[ + max_gt_num * 2 * i : max_gt_num * 2 * (i + 1), + max_gt_num * 2 * (i + 1) : num_denoising, + ] = True + if i == num_group - 1: + attn_mask[ + max_gt_num * 2 * i : max_gt_num * 2 * (i + 1), : max_gt_num * i * 2 + ] = True + else: + attn_mask[ + max_gt_num * 2 * i : max_gt_num * 2 * (i + 1), + max_gt_num * 2 * (i + 1) : num_denoising, + ] = True + attn_mask[ + max_gt_num * 2 * i : max_gt_num * 2 * (i + 1), : max_gt_num * 2 * i + ] = True + attn_mask = ~attn_mask + dn_meta = { + "dn_positive_idx": dn_positive_idx, + "dn_num_group": num_group, + "dn_num_split": [num_denoising, num_queries], + } + + return input_query_class, input_query_bbox, attn_mask, dn_meta + + +def get_sine_pos_embed( + pos_tensor, num_pos_feats=128, temperature=10000, exchange_xy=True +): + """generate sine position embedding from a position tensor + + Args: + pos_tensor (Tensor): Shape as `(None, n)`. + num_pos_feats (int): projected shape for each float in the tensor. Default: 128 + temperature (int): The temperature used for scaling + the position embedding. Default: 10000. + exchange_xy (bool, optional): exchange pos x and pos y. \ + For example, input tensor is `[x, y]`, the results will # noqa + be `[pos(y), pos(x)]`. Defaults: True. + + Returns: + Tensor: Returned position embedding # noqa + with shape `(None, n * num_pos_feats)`. 
+ """ + scale = 2.0 * math.pi + dim_t = 2.0 * paddle.floor_divide(paddle.arange(num_pos_feats), paddle.to_tensor(2)) + dim_t = scale / temperature ** (dim_t / num_pos_feats) + + def sine_func(x): + x *= dim_t + return paddle.stack((x[:, :, 0::2].sin(), x[:, :, 1::2].cos()), axis=3).flatten( + 2 + ) + + pos_res = [sine_func(x) for x in pos_tensor.split(pos_tensor.shape[-1], -1)] + if exchange_xy: + pos_res[0], pos_res[1] = pos_res[1], pos_res[0] + pos_res = paddle.concat(pos_res, axis=2) + return pos_res + + +def mask_to_box_coordinate(mask, normalize=False, format="xyxy", dtype="float32"): + """ + Compute the bounding boxes around the provided mask. + Args: + mask (Tensor:bool): [b, c, h, w] + + Returns: + bbox (Tensor): [b, c, 4] + """ + assert mask.ndim == 4 + assert format in ["xyxy", "xywh"] + + h, w = mask.shape[-2:] + y, x = paddle.meshgrid( + paddle.arange(end=h, dtype=dtype), paddle.arange(end=w, dtype=dtype) + ) + + x_mask = x * mask.astype(x.dtype) + x_max = x_mask.flatten(-2).max(-1) + 1 + x_min = ( + paddle.where(mask.astype(bool), x_mask, paddle.to_tensor(1e8)) + .flatten(-2) + .min(-1) + ) + + y_mask = y * mask.astype(y.dtype) + y_max = y_mask.flatten(-2).max(-1) + 1 + y_min = ( + paddle.where(mask.astype(bool), y_mask, paddle.to_tensor(1e8)) + .flatten(-2) + .min(-1) + ) + out_bbox = paddle.stack([x_min, y_min, x_max, y_max], axis=-1) + mask = mask.any(axis=[2, 3]).unsqueeze(2) + out_bbox = out_bbox * mask.astype(out_bbox.dtype) + if normalize: + out_bbox /= paddle.to_tensor([w, h, w, h]).astype(dtype) + + return out_bbox if format == "xyxy" else bbox_xyxy_to_cxcywh(out_bbox) + + +def varifocal_loss_with_logits( + pred_logits, gt_score, label, normalizer=1.0, alpha=0.75, gamma=2.0 +): + pred_score = F.sigmoid(pred_logits) + weight = alpha * pred_score.pow(gamma) * (1 - label) + gt_score * label + loss = F.binary_cross_entropy_with_logits( + pred_logits, gt_score, weight=weight, reduction="none" + ) + return loss.mean(1).sum() / normalizer + + +def 
mal_loss_with_logits( + pred_logits, gt_score, label, normalizer=1.0, alpha=1.0, gamma=1.5 +): + pred_score = F.sigmoid(pred_logits) + gt_score = gt_score.pow(gamma) + weight = alpha * pred_score.pow(gamma) * (1 - label) + label + loss = F.binary_cross_entropy_with_logits( + pred_logits, gt_score, weight=weight, reduction="none" + ) + return loss.mean(1).sum() / normalizer diff --git a/paddlex/inference/models/object_detection/modeling/rtdetrl_modules/rtdetr_transformer.py b/paddlex/inference/models/object_detection/modeling/rtdetrl_modules/rtdetr_transformer.py new file mode 100644 index 0000000000..fd9fc6ddbf --- /dev/null +++ b/paddlex/inference/models/object_detection/modeling/rtdetrl_modules/rtdetr_transformer.py @@ -0,0 +1,646 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Modified from Deformable-DETR (https://github.com/fundamentalvision/Deformable-DETR) +# Copyright (c) 2020 SenseTime. All Rights Reserved. +# Modified from detrex (https://github.com/IDEA-Research/detrex) +# Copyright 2022 The IDEA Authors. All rights reserved. 
from __future__ import absolute_import, division, print_function

import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle import ParamAttr
from paddle.regularizer import L2Decay

from .detr_head import MLP
from .modules.deformable_transformer import MSDeformableAttention
from .modules.detr_ops import _get_clones, inverse_sigmoid
from .modules.initializer import (
    bias_init_with_prob,
    constant_,
    linear_init_,
    xavier_uniform_,
)
from .modules.layers import MultiHeadAttention
from .modules.utils import get_contrastive_denoising_training_group

__all__ = ["RTDETRTransformer"]


class PPMSDeformableAttention(MSDeformableAttention):
    """Multi-scale deformable attention with Paddle-specific dispatch."""

    def forward(
        self,
        query,
        reference_points,
        value,
        value_spatial_shapes,
        value_level_start_index,
        value_mask=None,
    ):
        """
        Args:
            query (Tensor): [bs, query_length, C]
            reference_points (Tensor): [bs, query_length, n_levels, 2], range in [0, 1],
                top-left (0, 0), bottom-right (1, 1), including padding area
            value (Tensor): [bs, value_length, C]
            value_spatial_shapes (List): [n_levels, 2],
                [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})]
            value_level_start_index (List): [n_levels], [0, H_0*W_0, H_0*W_0+H_1*W_1, ...]
            value_mask (Tensor): [bs, value_length], True for non-padding elements,
                False for padding elements

        Returns:
            output (Tensor): [bs, Length_{query}, C]
        """
        bs, Len_q = query.shape[:2]
        Len_v = value.shape[1]

        value = self.value_proj(value)
        if value_mask is not None:
            # Zero out padded positions before splitting into heads.
            value_mask = value_mask.astype(value.dtype).unsqueeze(-1)
            value *= value_mask
        value = value.reshape([bs, Len_v, self.num_heads, self.head_dim])

        sampling_offsets = self.sampling_offsets(query).reshape(
            [bs, Len_q, self.num_heads, self.num_levels, self.num_points, 2]
        )
        attention_weights = self.attention_weights(query).reshape(
            [bs, Len_q, self.num_heads, self.num_levels * self.num_points]
        )
        # Softmax across all (level, point) pairs jointly, then restore shape.
        attention_weights = F.softmax(attention_weights).reshape(
            [bs, Len_q, self.num_heads, self.num_levels, self.num_points]
        )

        if reference_points.shape[-1] == 2:
            # Point references: offsets are normalized per level by (W, H).
            offset_normalizer = paddle.to_tensor(value_spatial_shapes)
            offset_normalizer = offset_normalizer.flip([1]).reshape(
                [1, 1, 1, self.num_levels, 1, 2]
            )
            sampling_locations = (
                reference_points.reshape([bs, Len_q, 1, self.num_levels, 1, 2])
                + sampling_offsets / offset_normalizer
            )
        elif reference_points.shape[-1] == 4:
            # Box references (cx, cy, w, h): offsets are scaled by half the box size.
            sampling_locations = (
                reference_points[:, :, None, :, None, :2]
                + sampling_offsets
                / self.num_points
                * reference_points[:, :, None, :, None, 2:]
                * 0.5
            )
        else:
            raise ValueError(
                "Last dim of reference_points must be 2 or 4, but get {} instead.".format(
                    reference_points.shape[-1]
                )
            )

        # NOTE(review): the non-Tensor query branch presumably serves static-graph
        # export where `query` is not a dynamic `paddle.Tensor` — confirm.
        if not isinstance(query, paddle.Tensor):
            from .modules.utils import deformable_attention_core_func

            output = deformable_attention_core_func(
                value,
                value_spatial_shapes,
                value_level_start_index,
                sampling_locations,
                attention_weights,
            )
        else:
            value_spatial_shapes = paddle.to_tensor(value_spatial_shapes)
            value_level_start_index = paddle.to_tensor(value_level_start_index)
            output = self.ms_deformable_attn_core(
                value,
                value_spatial_shapes,
                value_level_start_index,
                sampling_locations,
                attention_weights,
            )
        output = self.output_proj(output)

        return output


class TransformerDecoderLayer(nn.Layer):
    """One RT-DETR decoder layer: self-attention, deformable cross-attention, FFN."""

    def __init__(
        self,
        d_model=256,
        n_head=8,
        dim_feedforward=1024,
        dropout=0.0,
        activation="relu",
        n_levels=4,
        n_points=4,
        weight_attr=None,
        bias_attr=None,
    ):
        super(TransformerDecoderLayer, self).__init__()

        # Self-attention over queries. LayerNorm params are excluded from
        # weight decay via L2Decay(0.0).
        self.self_attn = MultiHeadAttention(d_model, n_head, dropout=dropout)
        self.dropout1 = nn.Dropout(dropout)
        self.norm1 = nn.LayerNorm(
            d_model,
            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
            bias_attr=ParamAttr(regularizer=L2Decay(0.0)),
        )

        # Deformable cross-attention into the encoder memory.
        self.cross_attn = PPMSDeformableAttention(
            d_model, n_head, n_levels, n_points, 1.0
        )
        self.dropout2 = nn.Dropout(dropout)
        self.norm2 = nn.LayerNorm(
            d_model,
            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
            bias_attr=ParamAttr(regularizer=L2Decay(0.0)),
        )

        # Position-wise feed-forward network.
        self.linear1 = nn.Linear(d_model, dim_feedforward, weight_attr, bias_attr)
        self.activation = getattr(F, activation)
        self.dropout3 = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model, weight_attr, bias_attr)
        self.dropout4 = nn.Dropout(dropout)
        self.norm3 = nn.LayerNorm(
            d_model,
            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
            bias_attr=ParamAttr(regularizer=L2Decay(0.0)),
        )
        self._reset_parameters()

    def _reset_parameters(self):
        # FFN weights: linear init then Xavier-uniform re-init of the weights.
        linear_init_(self.linear1)
        linear_init_(self.linear2)
        xavier_uniform_(self.linear1.weight)
        xavier_uniform_(self.linear2.weight)

    def with_pos_embed(self, tensor, pos):
        # Add a positional embedding when one is provided.
        return tensor if pos is None else tensor + pos

    def forward_ffn(self, tgt):
        return self.linear2(self.dropout3(self.activation(self.linear1(tgt))))

    def forward(
        self,
        tgt,
        reference_points,
        memory,
        memory_spatial_shapes,
        memory_level_start_index,
        attn_mask=None,
        memory_mask=None,
        query_pos_embed=None,
    ):
        # --- self attention ---
        q = k = self.with_pos_embed(tgt, query_pos_embed)
        if attn_mask is not None:
            # Convert a boolean mask (True = attend) into an additive mask
            # (0 for allowed, -inf for blocked).
            attn_mask = paddle.where(
                attn_mask.astype("bool"),
                paddle.zeros(attn_mask.shape, tgt.dtype),
                paddle.full(attn_mask.shape, float("-inf"), tgt.dtype),
            )
        tgt2 = self.self_attn(q, k, value=tgt, attn_mask=attn_mask)
        tgt = tgt + self.dropout1(tgt2)
        tgt = self.norm1(tgt)

        # --- cross attention ---
        tgt2 = self.cross_attn(
            self.with_pos_embed(tgt, query_pos_embed),
            reference_points,
            memory,
            memory_spatial_shapes,
            memory_level_start_index,
            memory_mask,
        )
        tgt = tgt + self.dropout2(tgt2)
        tgt = self.norm2(tgt)

        # --- ffn ---
        tgt2 = self.forward_ffn(tgt)
        tgt = tgt + self.dropout4(tgt2)
        tgt = self.norm3(tgt)

        return tgt


class TransformerDecoder(nn.Layer):
    """Stack of decoder layers with iterative box refinement."""

    def __init__(self, hidden_dim, decoder_layer, num_layers, eval_idx=-1):
        super(TransformerDecoder, self).__init__()
        self.layers = _get_clones(decoder_layer, num_layers)
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        # Negative eval_idx counts from the end (e.g. -1 -> last layer).
        self.eval_idx = eval_idx if eval_idx >= 0 else num_layers + eval_idx

    def forward(
        self,
        tgt,
        ref_points_unact,
        memory,
        memory_spatial_shapes,
        memory_level_start_index,
        bbox_head,
        score_head,
        query_pos_head,
        attn_mask=None,
        memory_mask=None,
        query_pos_head_inv_sig=False,
    ):
        output = tgt
        dec_out_bboxes = []
        dec_out_logits = []
        ref_points_detach = F.sigmoid(ref_points_unact)
        for i, layer in enumerate(self.layers):
            ref_points_input = ref_points_detach.unsqueeze(2)
            # Query position embedding is derived from the (optionally
            # inverse-sigmoided) reference points.
            if not query_pos_head_inv_sig:
                query_pos_embed = query_pos_head(ref_points_detach)
            else:
                query_pos_embed = query_pos_head(inverse_sigmoid(ref_points_detach))

            output = layer(
                output,
                ref_points_input,
                memory,
                memory_spatial_shapes,
                memory_level_start_index,
                attn_mask,
                memory_mask,
                query_pos_embed,
            )

            # Refine the boxes relative to the detached references.
            inter_ref_bbox = F.sigmoid(
                bbox_head[i](output) + inverse_sigmoid(ref_points_detach)
            )

            if self.training:
                dec_out_logits.append(score_head[i](output))
                if i == 0:
                    dec_out_bboxes.append(inter_ref_bbox)
                else:
                    # Gradients flow through the non-detached previous refs.
                    dec_out_bboxes.append(
                        F.sigmoid(bbox_head[i](output) + inverse_sigmoid(ref_points))
                    )
            elif i == self.eval_idx:
                # Inference: emit only the selected layer's outputs and stop.
                dec_out_logits.append(score_head[i](output))
                dec_out_bboxes.append(inter_ref_bbox)
                break

            ref_points = inter_ref_bbox
            ref_points_detach = (
                inter_ref_bbox.detach() if self.training else inter_ref_bbox
            )

        return paddle.stack(dec_out_bboxes), paddle.stack(dec_out_logits)


class RTDETRTransformer(nn.Layer):
    """RT-DETR transformer decoder with encoder-side query selection and
    optional contrastive denoising training."""

    __shared__ = ["num_classes", "hidden_dim", "eval_size"]

    def __init__(
        self,
        num_classes=80,
        hidden_dim=256,
        num_queries=300,
        position_embed_type="sine",
        backbone_feat_channels=[512, 1024, 2048],
        feat_strides=[8, 16, 32],
        num_levels=3,
        num_decoder_points=4,
        nhead=8,
        num_decoder_layers=6,
        dim_feedforward=1024,
        dropout=0.0,
        activation="relu",
        num_denoising=100,
        label_noise_ratio=0.5,
        box_noise_scale=1.0,
        learnt_init_query=True,
        query_pos_head_inv_sig=False,
        eval_size=None,
        eval_idx=-1,
        eps=1e-2,
    ):
        super(RTDETRTransformer, self).__init__()
        assert position_embed_type in [
            "sine",
            "learned",
        ], f"ValueError: position_embed_type not supported {position_embed_type}!"
        assert len(backbone_feat_channels) <= num_levels
        assert len(feat_strides) == len(backbone_feat_channels)
        # Extend strides for extra levels produced by downsampling conv.
        for _ in range(num_levels - len(feat_strides)):
            feat_strides.append(feat_strides[-1] * 2)

        self.hidden_dim = hidden_dim
        self.nhead = nhead
        self.feat_strides = feat_strides
        self.num_levels = num_levels
        self.num_classes = num_classes
        self.num_queries = num_queries
        self.eps = eps
        self.num_decoder_layers = num_decoder_layers
        self.eval_size = eval_size

        # backbone feature projection
        self._build_input_proj_layer(backbone_feat_channels)

        # Transformer module
        decoder_layer = TransformerDecoderLayer(
            hidden_dim,
            nhead,
            dim_feedforward,
            dropout,
            activation,
            num_levels,
            num_decoder_points,
        )
        self.decoder = TransformerDecoder(
            hidden_dim, decoder_layer, num_decoder_layers, eval_idx
        )

        # denoising part
        self.denoising_class_embed = nn.Embedding(
            num_classes,
            hidden_dim,
            weight_attr=ParamAttr(initializer=nn.initializer.Normal()),
        )
        self.num_denoising = num_denoising
        self.label_noise_ratio = label_noise_ratio
        self.box_noise_scale = box_noise_scale

        # decoder embedding
        self.learnt_init_query = learnt_init_query
        if learnt_init_query:
            self.tgt_embed = nn.Embedding(num_queries, hidden_dim)
        self.query_pos_head = MLP(4, 2 * hidden_dim, hidden_dim, num_layers=2)
        self.query_pos_head_inv_sig = query_pos_head_inv_sig

        # encoder head
        self.enc_output = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.LayerNorm(
                hidden_dim,
                weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
                bias_attr=ParamAttr(regularizer=L2Decay(0.0)),
            ),
        )
        self.enc_score_head = nn.Linear(hidden_dim, num_classes)
        self.enc_bbox_head = MLP(hidden_dim, hidden_dim, 4, num_layers=3)

        # decoder head
        self.dec_score_head = nn.LayerList(
            [nn.Linear(hidden_dim, num_classes) for _ in range(num_decoder_layers)]
        )
        self.dec_bbox_head = nn.LayerList(
            [
                MLP(hidden_dim, hidden_dim, 4, num_layers=3)
                for _ in range(num_decoder_layers)
            ]
        )

        self._reset_parameters()

    def _reset_parameters(self):
        # Class heads get a prior-probability bias; bbox head tails are zeroed
        # so initial refinements are identity.
        bias_cls = bias_init_with_prob(0.01)
        linear_init_(self.enc_score_head)
        constant_(self.enc_score_head.bias, bias_cls)
        constant_(self.enc_bbox_head.layers[-1].weight)
        constant_(self.enc_bbox_head.layers[-1].bias)
        for cls_, reg_ in zip(self.dec_score_head, self.dec_bbox_head):
            linear_init_(cls_)
            constant_(cls_.bias, bias_cls)
            constant_(reg_.layers[-1].weight)
            constant_(reg_.layers[-1].bias)

        linear_init_(self.enc_output[0])
        xavier_uniform_(self.enc_output[0].weight)
        if self.learnt_init_query:
            xavier_uniform_(self.tgt_embed.weight)
        xavier_uniform_(self.query_pos_head.layers[0].weight)
        xavier_uniform_(self.query_pos_head.layers[1].weight)
        for l in self.input_proj:
            xavier_uniform_(l[0].weight)

        # Pre-compute anchors and valid mask for a fixed evaluation size.
        if self.eval_size:
            self.anchors, self.valid_mask = self._generate_anchors()

    @classmethod
    def from_config(cls, cfg, input_shape):
        return {"backbone_feat_channels": [i.channels for i in input_shape]}

    def _build_input_proj_layer(self, backbone_feat_channels):
        # 1x1 conv + BN per backbone level, then stride-2 3x3 convs for any
        # extra pyramid levels beyond what the backbone provides.
        self.input_proj = nn.LayerList()
        for in_channels in backbone_feat_channels:
            self.input_proj.append(
                nn.Sequential(
                    (
                        "conv",
                        nn.Conv2D(
                            in_channels, self.hidden_dim, kernel_size=1, bias_attr=False
                        ),
                    ),
                    (
                        "norm",
                        nn.BatchNorm2D(
                            self.hidden_dim,
                            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
                            bias_attr=ParamAttr(regularizer=L2Decay(0.0)),
                        ),
                    ),
                )
            )
        in_channels = backbone_feat_channels[-1]
        for _ in range(self.num_levels - len(backbone_feat_channels)):
            self.input_proj.append(
                nn.Sequential(
                    (
                        "conv",
                        nn.Conv2D(
                            in_channels,
                            self.hidden_dim,
                            kernel_size=3,
                            stride=2,
                            padding=1,
                            bias_attr=False,
                        ),
                    ),
                    (
                        "norm",
                        nn.BatchNorm2D(
                            self.hidden_dim,
                            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
                            bias_attr=ParamAttr(regularizer=L2Decay(0.0)),
                        ),
                    ),
                )
            )
            in_channels = self.hidden_dim

    def _get_encoder_input(self, feats):
        # Project each backbone level; synthesize extra levels if needed.
        proj_feats = [self.input_proj[i](feat) for i, feat in enumerate(feats)]
        if self.num_levels > len(proj_feats):
            len_srcs = len(proj_feats)
            for i in range(len_srcs, self.num_levels):
                if i == len_srcs:
                    proj_feats.append(self.input_proj[i](feats[-1]))
                else:
                    proj_feats.append(self.input_proj[i](proj_feats[-1]))

        # Flatten all levels into one [b, sum(h*w), c] sequence and record
        # each level's spatial shape and start offset.
        feat_flatten = []
        spatial_shapes = []
        level_start_index = [
            0,
        ]
        for i, feat in enumerate(proj_feats):
            _, _, h, w = feat.shape
            # [b, c, h, w] -> [b, h*w, c]
            feat_flatten.append(feat.flatten(2).transpose([0, 2, 1]))
            # [num_levels, 2]
            spatial_shapes.append([h, w])
            # [l], start index of each level
            level_start_index.append(h * w + level_start_index[-1])

        # [b, l, c]
        feat_flatten = paddle.concat(feat_flatten, 1)
        level_start_index.pop()
        return (feat_flatten, spatial_shapes, level_start_index)

    def forward(self, feats, pad_mask=None, gt_meta=None, is_teacher=False):
        # input projection and embedding
        (memory, spatial_shapes, level_start_index) = self._get_encoder_input(feats)

        # prepare denoising training
        if self.training:
            denoising_class, denoising_bbox_unact, attn_mask, dn_meta = (
                get_contrastive_denoising_training_group(
                    gt_meta,
                    self.num_classes,
                    self.num_queries,
                    self.denoising_class_embed.weight,
                    self.num_denoising,
                    self.label_noise_ratio,
                    self.box_noise_scale,
                )
            )
        else:
            denoising_class, denoising_bbox_unact, attn_mask, dn_meta = (
                None,
                None,
                None,
                None,
            )

        target, init_ref_points_unact, enc_topk_bboxes, enc_topk_logits = (
            self._get_decoder_input(
                memory,
                spatial_shapes,
                denoising_class,
                denoising_bbox_unact,
                is_teacher,
            )
        )

        # decoder
        out_bboxes, out_logits = self.decoder(
            target,
            init_ref_points_unact,
            memory,
            spatial_shapes,
            level_start_index,
            self.dec_bbox_head,
            self.dec_score_head,
            self.query_pos_head,
            attn_mask=attn_mask,
            memory_mask=None,
            query_pos_head_inv_sig=self.query_pos_head_inv_sig,
        )
        return (out_bboxes, out_logits, enc_topk_bboxes, enc_topk_logits, dn_meta)

    def _generate_anchors(self, spatial_shapes=None, grid_size=0.05, dtype="float32"):
        # Build per-level anchor boxes on a normalized grid; anchors too close
        # to the border (within eps) are marked invalid and set to +inf after
        # the logit transform.
        if spatial_shapes is None:
            spatial_shapes = [
                [int(self.eval_size[0] / s), int(self.eval_size[1] / s)]
                for s in self.feat_strides
            ]
        anchors = []
        for lvl, (h, w) in enumerate(spatial_shapes):
            grid_y, grid_x = paddle.meshgrid(
                paddle.arange(end=h, dtype=dtype), paddle.arange(end=w, dtype=dtype)
            )
            grid_xy = paddle.stack([grid_x, grid_y], -1)

            # NOTE(review): grid_xy is ordered (x, y) but the normalizer is
            # [h, w]; for non-square levels this divides x by h. Matches the
            # original implementation (harmless when h == w) — confirm against
            # trained weights before changing.
            valid_WH = paddle.to_tensor([h, w]).astype(dtype)
            grid_xy = (grid_xy.unsqueeze(0) + 0.5) / valid_WH
            wh = paddle.ones_like(grid_xy) * grid_size * (2.0**lvl)
            anchors.append(paddle.concat([grid_xy, wh], -1).reshape([-1, h * w, 4]))

        anchors = paddle.concat(anchors, 1)
        valid_mask = ((anchors > self.eps) * (anchors < 1 - self.eps)).all(
            -1, keepdim=True
        )
        # Inverse sigmoid (logit) so downstream refinement works in unact space.
        anchors = paddle.log(anchors / (1 - anchors))
        anchors = paddle.where(valid_mask, anchors, paddle.to_tensor(float("inf")))
        return anchors, valid_mask

    def _get_decoder_input(
        self,
        memory,
        spatial_shapes,
        denoising_class=None,
        denoising_bbox_unact=None,
        is_teacher=False,
    ):
        bs, _, _ = memory.shape
        # prepare input for decoder: regenerate anchors unless a fixed eval
        # size allows reusing the precomputed ones.
        if self.training or self.eval_size is None or is_teacher:
            anchors, valid_mask = self._generate_anchors(spatial_shapes)
        else:
            anchors, valid_mask = self.anchors, self.valid_mask
        memory = paddle.where(valid_mask, memory, paddle.to_tensor(0.0))
        output_memory = self.enc_output(memory)

        enc_outputs_class = self.enc_score_head(output_memory)
        enc_outputs_coord_unact = self.enc_bbox_head(output_memory) + anchors

        # Select top-k encoder positions by best class score.
        _, topk_ind = paddle.topk(enc_outputs_class.max(-1), self.num_queries, axis=1)
        # extract region proposal boxes
        batch_ind = paddle.arange(end=bs, dtype=topk_ind.dtype)
        batch_ind = batch_ind.unsqueeze(-1).tile([1, self.num_queries])
        topk_ind = paddle.stack([batch_ind, topk_ind], axis=-1)

        reference_points_unact = paddle.gather_nd(
            enc_outputs_coord_unact, topk_ind
        )  # unsigmoided.
        enc_topk_bboxes = F.sigmoid(reference_points_unact)
        if denoising_bbox_unact is not None:
            # Prepend denoising boxes ahead of the selected proposals.
            reference_points_unact = paddle.concat(
                [denoising_bbox_unact, reference_points_unact], 1
            )
        if self.training:
            reference_points_unact = reference_points_unact.detach()
        enc_topk_logits = paddle.gather_nd(enc_outputs_class, topk_ind)

        # extract region features
        if self.learnt_init_query:
            target = self.tgt_embed.weight.unsqueeze(0).tile([bs, 1, 1])
        else:
            target = paddle.gather_nd(output_memory, topk_ind)
            if self.training:
                target = target.detach()
        if denoising_class is not None:
            target = paddle.concat([denoising_class, target], 1)

        return target, reference_points_unact, enc_topk_bboxes, enc_topk_logits
@@ -17,6 +17,7 @@ import numpy as np from ....modules.object_detection.model_list import MODELS +from ....utils.device import TemporaryDeviceChanger from ....utils.func_register import FuncRegister from ...common.batch_sampler import ImageBatchSampler from ..base import BasePredictor @@ -104,6 +105,7 @@ def __init__( "small", ], f"The value of `layout_merge_bboxes_mode` must be one of ['union', 'large', 'small'] or a dict, but got {layout_merge_bboxes_mode}" + self.device = kwargs.get("device", None) self.img_size = img_size self.threshold = threshold self.layout_nms = layout_nms @@ -140,7 +142,24 @@ def _build(self) -> Tuple: pre_ops.insert(1, self.build_resize(self.img_size, False, 2)) # build infer - infer = self.create_static_infer() + if self._use_static_model: + infer = self.create_static_infer() + else: + if self.model_name == "RT-DETR-L": + from .modeling import RTDETR + + with TemporaryDeviceChanger(self.device): + infer = RTDETR.from_pretrained( + self.model_dir, + use_safetensors=True, + convert_from_hf=True, + dtype="float32", + ) + infer.eval() + else: + raise RuntimeError( + f"There is no dynamic graph implementation for model {repr(self.model_name)}." 
+ ) # build postprocess op post_op = self.build_postprocess() @@ -194,7 +213,7 @@ def _format_output(self, pred: Sequence[Any]) -> List[dict]: if len(pred) == 3: return [ - {"boxes": np.array(pred_box[i]), "masks": np.array(pred_mask[i])} + {"boxes": np.asarray(pred_box[i]), "masks": np.asarray(pred_mask[i])} for i in range(len(pred_box)) ] else: @@ -231,7 +250,11 @@ def process( batch_inputs = self.pre_ops[-1](datas) # do infer - batch_preds = self.infer(batch_inputs) + if self._use_static_model: + batch_preds = self.infer(batch_inputs) + else: + with TemporaryDeviceChanger(self.device): + batch_preds = self.infer(batch_inputs) # process a batch of predictions into a list of single image result preds_list = self._format_output(batch_preds) diff --git a/paddlex/inference/models/object_detection/processors.py b/paddlex/inference/models/object_detection/processors.py index cd3969e2f8..9d898d2ea7 100644 --- a/paddlex/inference/models/object_detection/processors.py +++ b/paddlex/inference/models/object_detection/processors.py @@ -750,8 +750,8 @@ def apply( boxes = np.array(boxes[selected_indices]) filter_large_image = True - # boxes.shape[1] == 6 is object detection, 8 is ordered object detection - if filter_large_image and len(boxes) > 1 and boxes.shape[1] in [6, 8]: + # boxes.shape[1] == 6 is object detection, 7 is new ordered object detection, 8 is ordered object detection + if filter_large_image and len(boxes) > 1 and boxes.shape[1] in [6, 7, 8]: if img_size[0] > img_size[1]: area_thres = 0.82 else: @@ -837,7 +837,7 @@ def apply( boxes = boxes[keep_mask] if boxes.size == 0: - return np.array([]) + return [] if boxes.shape[1] == 8: # Sort boxes by their order @@ -845,6 +845,12 @@ def apply( sorted_boxes = boxes[sorted_idx] boxes = sorted_boxes[:, :6] + if boxes.shape[1] == 7: + # Sort boxes by their order + sorted_idx = np.argsort(boxes[:, 6]) + sorted_boxes = boxes[sorted_idx] + boxes = sorted_boxes[:, :6] + if layout_unclip_ratio: if 
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import paddle

from ....utils.benchmark import add_inference_operations, benchmark
from ...common.transformers.transformers import PretrainedConfig, PretrainedModel
from .slanext_modules.rec_vary_vit import Vary_VIT_B
from .slanext_modules.table_att_head import SLAHead

__all__ = ["SLANeXt"]


class SLANeXtConfig(PretrainedConfig):
    """Configuration holder for SLANeXt built from backbone and head dicts."""

    def __init__(
        self,
        backbone,
        SLAHead,
    ):
        # NOTE: the `SLAHead` parameter name intentionally mirrors the config
        # key and shadows the imported class within this method.
        if backbone["name"] == "Vary_VIT_B":
            self.image_size = backbone["image_size"]
            self.encoder_embed_dim = backbone["encoder_embed_dim"]
            self.encoder_depth = backbone["encoder_depth"]
            self.encoder_num_heads = backbone["encoder_num_heads"]
            self.encoder_global_attn_indexes = backbone["encoder_global_attn_indexes"]
        else:
            raise RuntimeError(
                f"There is no dynamic graph implementation for backbone {backbone['name']}."
            )
        self.out_channels = SLAHead["out_channels"]
        self.hidden_size = SLAHead["hidden_size"]
        self.max_text_length = SLAHead["max_text_length"]
        self.loc_reg_num = SLAHead["loc_reg_num"]
        self.tensor_parallel_degree = 1


class SLANeXt(PretrainedModel):
    """SLANeXt table-structure model: Vary-ViT-B backbone + SLA head."""

    config_class = SLANeXtConfig

    def __init__(self, config: SLANeXtConfig):
        super().__init__(config)
        self.backbone = Vary_VIT_B(
            image_size=self.config.image_size,
            encoder_embed_dim=self.config.encoder_embed_dim,
            encoder_depth=self.config.encoder_depth,
            encoder_num_heads=self.config.encoder_num_heads,
            encoder_global_attn_indexes=self.config.encoder_global_attn_indexes,
        )
        self.head = SLAHead(
            in_channels=self.backbone.out_channels,
            out_channels=self.config.out_channels,
            hidden_size=self.config.hidden_size,
            max_text_length=self.config.max_text_length,
            loc_reg_num=self.config.loc_reg_num,
        )

        # Register this forward pass with the benchmark bookkeeping.
        add_inference_operations("slanext_forward")

    @benchmark.timeit_with_options(name="slanext_forward")
    def forward(self, x):
        """Run backbone + head on the first element of the input batch list.

        Returns:
            list: [location predictions, structure probabilities].
        """
        x = paddle.to_tensor(x[0])
        x = self.backbone(x)
        x = self.head(x)
        return [x["loc_preds"], x["structure_probs"]]

    def get_transpose_weight_keys(self):
        """Return backbone parameter names whose weights must be transposed
        when converting from the HF layout (linear layers, not biases)."""
        transpose_markers = ["mlp.lin2", "attn.qkv", "mlp.lin1"]
        backbone_keys = [
            "backbone." + name for name, _ in self.backbone.named_parameters()
        ]
        need_to_transpose = []
        for key in backbone_keys:
            for marker in transpose_markers:
                if marker in key and "bias" not in key:
                    need_to_transpose.append(key)
        return need_to_transpose

    def get_hf_state_dict(self, *args, **kwargs):
        """Export the state dict with Paddle BN stat names mapped to the
        HF/torch convention (_mean -> running_mean, _variance -> running_var)."""
        model_state_dict = self.state_dict(*args, **kwargs)

        hf_state_dict = {}
        for old_key, value in model_state_dict.items():
            if "_mean" in old_key:
                new_key = old_key.replace("_mean", "running_mean")
            elif "_variance" in old_key:
                new_key = old_key.replace("_variance", "running_var")
            else:
                new_key = old_key
            hf_state_dict[new_key] = value

        return hf_state_dict

    def set_hf_state_dict(self, state_dict, *args, **kwargs):
        """Load an HF-style state dict, mapping BN stat names back to the
        Paddle convention before delegating to set_state_dict."""
        key_mapping = {}
        for old_key in list(state_dict.keys()):
            if "running_mean" in old_key:
                key_mapping[old_key] = old_key.replace("running_mean", "_mean")
            elif "running_var" in old_key:
                key_mapping[old_key] = old_key.replace("running_var", "_variance")

        for old_key, new_key in key_mapping.items():
            state_dict[new_key] = state_dict.pop(old_key)

        return self.set_state_dict(state_dict, *args, **kwargs)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from collections import OrderedDict
from functools import partial
from typing import Optional, Tuple, Type

import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle.nn.initializer import (
    Constant,
    KaimingUniform,
    TruncatedNormal,
    XavierUniform,
)

# Shared initializer instances used throughout this module.
zeros_ = Constant(value=0.0)
ones_ = Constant(value=1.0)
kaiming_normal_ = KaimingUniform(nonlinearity="relu")
trunc_normal_ = TruncatedNormal(std=0.02)
xavier_uniform_ = XavierUniform()


class DonutSwinModelOutput(OrderedDict):
    """Ordered-dict model output that also exposes entries as attributes."""

    last_hidden_state = None
    pooler_output = None
    hidden_states = None
    attentions = None
    reshaped_hidden_states = None

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def __getitem__(self, k):
        # String keys go through dict lookup; integer keys index the tuple view.
        if isinstance(k, str):
            inner_dict = dict(self.items())
            return inner_dict[k]
        else:
            return self.to_tuple()[k]

    def __setattr__(self, name, value):
        # Keep attribute writes and dict entries in sync for known keys.
        if name in self.keys() and value is not None:
            super().__setitem__(name, value)
        super().__setattr__(name, value)

    def __setitem__(self, key, value):
        super().__setitem__(key, value)
        super().__setattr__(key, value)

    def to_tuple(self):
        """
        Convert self to a tuple containing all the attributes/keys that are not `None`.
        """
        return tuple(self[k] for k in self.keys())


class MLPBlock(nn.Layer):
    """Two-layer MLP with a configurable activation in between."""

    def __init__(
        self,
        embedding_dim: int,
        mlp_dim: int,
        act: Type[nn.Layer] = nn.GELU,
    ) -> None:
        super().__init__()
        self.lin1 = nn.Linear(embedding_dim, mlp_dim)
        self.lin2 = nn.Linear(mlp_dim, embedding_dim)
        self.act = act()

    def forward(self, x):
        return self.lin2(self.act(self.lin1(x)))


# From https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/batch_norm.py # noqa
# Itself from https://github.com/facebookresearch/ConvNeXt/blob/d1fa8f6fef0a165b27399986cc2bdacc92777e40/models/convnext.py#L119 # noqa
class LayerNorm2d(nn.Layer):
    """LayerNorm over the channel dimension of NCHW tensors."""

    def __init__(self, num_channels: int, epsilon: float = 1e-6) -> None:
        super().__init__()
        self.weight = paddle.create_parameter([num_channels], dtype="float32")
        ones_(self.weight)
        self.bias = paddle.create_parameter([num_channels], dtype="float32")
        zeros_(self.bias)
        self.epsilon = epsilon

    def forward(self, x):
        # Normalize across channels (axis 1), then apply affine parameters.
        u = x.mean(1, keepdim=True)
        s = (x - u).pow(2).mean(1, keepdim=True)
        x = (x - u) / paddle.sqrt(s + self.epsilon)
        x = self.weight[:, None, None] * x + self.bias[:, None, None]
        return x


# This class and its supporting functions below lightly adapted from the ViTDet backbone available at: https://github.com/facebookresearch/detectron2/blob/main/detectron2/modeling/backbone/vit.py # noqa
class ImageEncoderViT(nn.Layer):
    def __init__(
        self,
        img_size: int = 1024,
        patch_size: int = 16,
        in_chans: int = 3,
        embed_dim: int = 768,
        depth: int = 12,
        num_heads: int = 12,
        mlp_ratio: float = 4.0,
        out_chans: int = 256,
        qkv_bias: bool = True,
        norm_layer: Type[nn.Layer] = nn.LayerNorm,
        act_layer: Type[nn.Layer] = nn.GELU,
        use_abs_pos: bool = True,
        use_rel_pos: bool = False,
        rel_pos_zero_init: bool = True,
        window_size: int = 0,
        global_attn_indexes: Tuple[int, ...] = (),
        is_formula: bool = False,
    ) -> None:
        """
        Args:
            img_size (int): Input image size.
            patch_size (int): Patch size.
            in_chans (int): Number of input image channels.
            embed_dim (int): Patch embedding dimension.
            depth (int): Depth of ViT.
            num_heads (int): Number of attention heads in each ViT block.
            mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
            qkv_bias (bool): If True, add a learnable bias to query, key, value.
            norm_layer (nn.Layer): Normalization layer.
            act_layer (nn.Layer): Activation layer.
            use_abs_pos (bool): If True, use absolute positional embeddings.
            use_rel_pos (bool): If True, add relative positional embeddings to the attention map.
            rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
            window_size (int): Window size for window attention blocks.
            global_attn_indexes (list): Indexes for blocks using global attention.
            is_formula (bool): If True, apply the extra stride-2 conv (net_3).
        """
        super().__init__()
        self.img_size = img_size

        self.patch_embed = PatchEmbed(
            kernel_size=(patch_size, patch_size),
            stride=(patch_size, patch_size),
            in_chans=in_chans,
            embed_dim=embed_dim,
        )

        self.pos_embed = None
        if use_abs_pos:
            # Initialize absolute positional embedding with pretrain image size.
            self.pos_embed = paddle.create_parameter(
                shape=(1, img_size // patch_size, img_size // patch_size, embed_dim),
                dtype="float32",
            )
            zeros_(self.pos_embed)

        self.blocks = nn.LayerList()
        for i in range(depth):
            # Blocks listed in global_attn_indexes use global attention
            # (window_size 0); all others use windowed attention.
            block = Block(
                dim=embed_dim,
                num_heads=num_heads,
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                norm_layer=norm_layer,
                act_layer=act_layer,
                use_rel_pos=use_rel_pos,
                rel_pos_zero_init=rel_pos_zero_init,
                window_size=window_size if i not in global_attn_indexes else 0,
                input_size=(img_size // patch_size, img_size // patch_size),
            )
            self.blocks.append(block)

        # Channel-reduction neck followed by additional stride-2 convs.
        self.neck = nn.Sequential(
            nn.Conv2D(
                embed_dim,
                out_chans,
                kernel_size=1,
                bias_attr=False,
            ),
            LayerNorm2d(out_chans),
            nn.Conv2D(
                out_chans,
                out_chans,
                kernel_size=3,
                padding=1,
                bias_attr=False,
            ),
            LayerNorm2d(out_chans),
        )

        self.net_2 = nn.Conv2D(
            256, 512, kernel_size=3, stride=2, padding=1, bias_attr=False
        )
        self.net_3 = nn.Conv2D(
            512, 1024, kernel_size=3, stride=2, padding=1, bias_attr=False
        )
        self.is_formula = is_formula

    def forward(self, x):
        x = self.patch_embed(x)
        if self.pos_embed is not None:
            x = x + self.pos_embed
        for blk in self.blocks:
            x = blk(x)
        # Blocks keep NHWC layout; the neck expects NCHW.
        x = self.neck(x.transpose([0, 3, 1, 2]))
        x = self.net_2(x)
        if self.is_formula:
            x = self.net_3(x)
        return x


class Block(nn.Layer):
    """Transformer blocks with support of window attention and residual propagation blocks"""

    def __init__(
        self,
        dim: int,
        num_heads: int,
        mlp_ratio: float = 4.0,
        qkv_bias: bool = True,
        norm_layer: Type[nn.Layer] = nn.LayerNorm,
        act_layer: Type[nn.Layer] = nn.GELU,
        use_rel_pos: bool = False,
        rel_pos_zero_init: bool = True,
        window_size: int = 0,
        input_size: Optional[Tuple[int, int]] = None,
    ) -> None:
        """
        Args:
            dim (int): Number of input channels.
            num_heads (int): Number of attention heads in each ViT block.
            mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
            qkv_bias (bool): If True, add a learnable bias to query, key, value.
            norm_layer (nn.Layer): Normalization layer.
            act_layer (nn.Layer): Activation layer.
            use_rel_pos (bool): If True, add relative positional embeddings to the attention map.
            rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
            window_size (int): Window size for window attention blocks. If it equals 0, then
                use global attention.
            input_size (tuple(int, int) or None): Input resolution for calculating the relative
                positional parameter size.
        """
        super().__init__()
        self.norm1 = norm_layer(dim)
        self.attn = Attention(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            use_rel_pos=use_rel_pos,
            rel_pos_zero_init=rel_pos_zero_init,
            input_size=input_size if window_size == 0 else (window_size, window_size),
        )

        self.norm2 = norm_layer(dim)
        self.mlp = MLPBlock(
            embedding_dim=dim, mlp_dim=int(dim * mlp_ratio), act=act_layer
        )

        self.window_size = window_size

    def forward(self, x):
        shortcut = x

        x = self.norm1(x)
        # Window partition (only for windowed-attention blocks).
        if self.window_size > 0:
            H, W = x.shape[1], x.shape[2]
            x, pad_hw = window_partition(x, self.window_size)
        x = self.attn(x)
        # Reverse window partition.
        if self.window_size > 0:
            x = window_unpartition(x, self.window_size, pad_hw, (H, W))
        x = shortcut + x
        x = x + self.mlp(self.norm2(x))

        return x


class Attention(nn.Layer):
    """Multi-head Attention block with relative position embeddings."""

    def __init__(
        self,
        dim: int,
        num_heads: int = 8,
        qkv_bias: bool = True,
        use_rel_pos: bool = False,
        rel_pos_zero_init: bool = True,
        input_size: Optional[Tuple[int, int]] = None,
    ) -> None:
        """
        Args:
            dim (int): Number of input channels.
            num_heads (int): Number of attention heads.
            qkv_bias (bool): If True, add a learnable bias to query, key, value.
            use_rel_pos (bool): If True, add relative positional embeddings to the attention map.
            rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
            input_size (tuple(int, int) or None): Input resolution for calculating the relative
                positional parameter size.
        """
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = head_dim**-0.5

        self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias)
        self.proj = nn.Linear(dim, dim)

        self.use_rel_pos = use_rel_pos
        if self.use_rel_pos:
            assert (
                input_size is not None
            ), "Input size must be provided if using relative positional encoding."
            # Relative positional embeddings, one table per spatial axis.
            self.rel_pos_h = paddle.create_parameter(
                [2 * input_size[0] - 1, head_dim], dtype="float32"
            )
            zeros_(self.rel_pos_h)
            self.rel_pos_w = paddle.create_parameter(
                [2 * input_size[1] - 1, head_dim], dtype="float32"
            )
            zeros_(self.rel_pos_w)

    def forward(self, x):

        B, H, W, _ = x.shape
        # Project to qkv and split heads: [3, B*heads, H*W, head_dim].
        qkv = (
            self.qkv(x)
            .reshape([B, H * W, 3, self.num_heads, -1])
            .transpose([2, 0, 3, 1, 4])
        )
        q, k, v = qkv.reshape([3, B * self.num_heads, H * W, -1]).unbind(0)
        attn = (q * self.scale) @ k.transpose([0, 2, 1])

        if self.use_rel_pos:
            attn = add_decomposed_rel_pos(
                attn, q, self.rel_pos_h, self.rel_pos_w, (H, W), (H, W)
            )
        attn = F.softmax(attn, axis=-1)
        x = (
            (attn @ v)
            .reshape([B, self.num_heads, H, W, -1])
            .transpose([0, 2, 3, 1, 4])
            .reshape([B, H, W, -1])
        )
        x = self.proj(x)

        return x
+ (Hp, Wp): padded height and width before partition + """ + B, H, W, C = x.shape + + pad_h = (window_size - H % window_size) % window_size + pad_w = (window_size - W % window_size) % window_size + if pad_h > 0 or pad_w > 0: + x = F.pad(x, (0, 0, 0, pad_w, 0, pad_h, 0, 0)) + Hp, Wp = H + pad_h, W + pad_w + + x = x.reshape( + [B, Hp // window_size, window_size, Wp // window_size, window_size, C] + ) + windows = x.transpose([0, 1, 3, 2, 4, 5]).reshape([-1, window_size, window_size, C]) + return windows, (Hp, Wp) + + +def window_unpartition( + windows, window_size: int, pad_hw: Tuple[int, int], hw: Tuple[int, int] +): + """ + Window unpartition into original sequences and removing padding. + Args: + windows (tensor): input tokens with [B * num_windows, window_size, window_size, C]. + window_size (int): window size. + pad_hw (Tuple): padded height and width (Hp, Wp). + hw (Tuple): original height and width (H, W) before padding. + + Returns: + x: unpartitioned sequences with [B, H, W, C]. + """ + Hp, Wp = pad_hw + H, W = hw + B = windows.shape[0] // (Hp * Wp // window_size // window_size) + x = windows.reshape( + [B, Hp // window_size, Wp // window_size, window_size, window_size, -1] + ) + x = x.transpose([0, 1, 3, 2, 4, 5]).contiguous().reshape([B, Hp, Wp, -1]) + + if Hp > H or Wp > W: + x = x[:, :H, :W, :].contiguous() + return x + + +def get_rel_pos(q_size: int, k_size: int, rel_pos): + """ + Get relative positional embeddings according to the relative positions of + query and key sizes. + Args: + q_size (int): size of query q. + k_size (int): size of key k. + rel_pos (Tensor): relative position embeddings (L, C). + + Returns: + Extracted positional embeddings according to relative positions. + """ + max_rel_dist = int(2 * max(q_size, k_size) - 1) + # Interpolate rel pos if needed. + if rel_pos.shape[0] != max_rel_dist: + # Interpolate rel pos. 
+ rel_pos_resized = F.interpolate( + rel_pos.reshape(1, rel_pos.shape[0], -1).transpose(0, 2, 1), + size=max_rel_dist, + mode="linear", + ) + rel_pos_resized = rel_pos_resized.reshape(-1, max_rel_dist).transpose(1, 0) + else: + rel_pos_resized = rel_pos + + # Scale the coords with short length if shapes for q and k are different. + q_coords = paddle.arange(q_size)[:, None] * max(k_size / q_size, 1.0) + k_coords = paddle.arange(k_size)[None, :] * max(q_size / k_size, 1.0) + relative_coords = (q_coords - k_coords) + (k_size - 1) * max(q_size / k_size, 1.0) + + return rel_pos_resized[relative_coords.cast(paddle.int64)] + + +def add_decomposed_rel_pos( + attn, + q, + rel_pos_h, + rel_pos_w, + q_size: Tuple[int, int], + k_size: Tuple[int, int], +): + """ + Calculate decomposed Relative Positional Embeddings from :paper:`mvitv2`. + https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py # noqa B950 + Args: + attn (Tensor): attention map. + q (Tensor): query q in the attention layer with shape (B, q_h * q_w, C). + rel_pos_h (Tensor): relative position embeddings (Lh, C) for height axis. + rel_pos_w (Tensor): relative position embeddings (Lw, C) for width axis. + q_size (Tuple): spatial sequence size of query q with (q_h, q_w). + k_size (Tuple): spatial sequence size of key k with (k_h, k_w). + + Returns: + attn (Tensor): attention map with added relative positional embeddings. + """ + q_h, q_w = q_size + k_h, k_w = k_size + Rh = get_rel_pos(q_h, k_h, rel_pos_h) + Rw = get_rel_pos(q_w, k_w, rel_pos_w) + + B, _, dim = q.shape + r_q = q.reshape([B, q_h, q_w, dim]) + rel_h = paddle.einsum("bhwc,hkc->bhwk", r_q, Rh) + rel_w = paddle.einsum("bhwc,wkc->bhwk", r_q, Rw) + + attn = ( + attn.reshape([B, q_h, q_w, k_h, k_w]) + + rel_h[:, :, :, :, None] + + rel_w[:, :, :, None, :] + ).reshape([B, q_h * q_w, k_h * k_w]) + + return attn + + +class PatchEmbed(nn.Layer): + """ + Image to Patch Embedding. 
+ """ + + def __init__( + self, + kernel_size: Tuple[int, int] = (16, 16), + stride: Tuple[int, int] = (16, 16), + padding: Tuple[int, int] = (0, 0), + in_chans: int = 3, + embed_dim: int = 768, + ) -> None: + """ + Args: + kernel_size (Tuple): kernel size of the projection layer. + stride (Tuple): stride of the projection layer. + padding (Tuple): padding size of the projection layer. + in_chans (int): Number of input image channels. + embed_dim (int): Patch embedding dimension. + """ + super().__init__() + + self.proj = nn.Conv2D( + in_chans, + embed_dim, + kernel_size=kernel_size, + stride=stride, + padding=padding, + weight_attr=True, + bias_attr=True, + ) + + def forward(self, x): + x = self.proj(x) + # B C H W -> B H W C + x = x.transpose([0, 2, 3, 1]) + return x + + +def _build_vary( + encoder_embed_dim, + encoder_depth, + encoder_num_heads, + encoder_global_attn_indexes, + image_size, + is_formula=False, +): + prompt_embed_dim = 256 + vit_patch_size = 16 + image_embedding_size = image_size // vit_patch_size + image_encoder = ImageEncoderViT( + depth=encoder_depth, + embed_dim=encoder_embed_dim, + img_size=image_size, + mlp_ratio=4, + norm_layer=partial(paddle.nn.LayerNorm, epsilon=1e-6), + num_heads=encoder_num_heads, + patch_size=vit_patch_size, + qkv_bias=True, + use_rel_pos=True, + global_attn_indexes=encoder_global_attn_indexes, + window_size=14, + out_chans=prompt_embed_dim, + is_formula=is_formula, + ) + return image_encoder + + +class Vary_VIT_B(nn.Layer): + def __init__( + self, + in_channels=3, + image_size=768, + encoder_embed_dim=768, + encoder_depth=12, + encoder_num_heads=12, + encoder_global_attn_indexes=[2, 5, 8, 11], + ): + super().__init__() + + self.vision_tower_high = _build_vary( + encoder_embed_dim=768, + encoder_depth=12, + encoder_num_heads=12, + encoder_global_attn_indexes=[2, 5, 8, 11], + image_size=image_size, + ) + + self.out_channels = 1024 + + def forward(self, input_data): + pixel_values = input_data + num_channels = 
pixel_values.shape[1] + if num_channels == 1: + pixel_values = paddle.repeat_interleave(pixel_values, repeats=3, axis=1) + cnn_feature = self.vision_tower_high(pixel_values) + cnn_feature = cnn_feature.flatten(2).transpose([0, 2, 1]) + return cnn_feature diff --git a/paddlex/inference/models/table_structure_recognition/modeling/slanext_modules/table_att_head.py b/paddlex/inference/models/table_structure_recognition/modeling/slanext_modules/table_att_head.py new file mode 100644 index 0000000000..210c4600cb --- /dev/null +++ b/paddlex/inference/models/table_structure_recognition/modeling/slanext_modules/table_att_head.py @@ -0,0 +1,508 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import absolute_import, division, print_function + +import math + +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import ParamAttr + + +def get_para_bias_attr(l2_decay, k): + if l2_decay > 0: + regularizer = paddle.regularizer.L2Decay(l2_decay) + stdv = 1.0 / math.sqrt(k * 1.0) + initializer = nn.initializer.Uniform(-stdv, stdv) + else: + regularizer = None + initializer = None + weight_attr = ParamAttr(regularizer=regularizer, initializer=initializer) + bias_attr = ParamAttr(regularizer=regularizer, initializer=initializer) + return [weight_attr, bias_attr] + + +def drop_path(x, drop_prob=0.0, training=False): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... + """ + if drop_prob == 0.0 or not training: + return x + keep_prob = paddle.to_tensor(1 - drop_prob, dtype=x.dtype) + shape = (x.shape[0],) + (1,) * (x.ndim - 1) + random_tensor = keep_prob + paddle.rand(shape, dtype=x.dtype) + random_tensor = paddle.floor(random_tensor) # binarize + output = x.divide(keep_prob) * random_tensor + return output + + +class AttentionGRUCell(nn.Layer): + def __init__(self, input_size, hidden_size, num_embeddings, use_gru=False): + super(AttentionGRUCell, self).__init__() + self.i2h = nn.Linear(input_size, hidden_size, bias_attr=False) + self.h2h = nn.Linear(hidden_size, hidden_size) + self.score = nn.Linear(hidden_size, 1, bias_attr=False) + + self.rnn = nn.GRUCell( + input_size=input_size + num_embeddings, hidden_size=hidden_size + ) + + self.hidden_size = hidden_size + + def forward(self, prev_hidden, batch_H, char_onehots): + batch_H_proj = self.i2h(batch_H) + prev_hidden_proj = paddle.unsqueeze(self.h2h(prev_hidden), axis=1) + + res = paddle.add(batch_H_proj, 
prev_hidden_proj) + res = paddle.tanh(res) + e = self.score(res) + + alpha = F.softmax(e, axis=1) + alpha = paddle.transpose(alpha, [0, 2, 1]) + context = paddle.squeeze(paddle.mm(alpha, batch_H), axis=1) + concat_context = paddle.concat([context, char_onehots], 1) + + cur_hidden = self.rnn(concat_context, prev_hidden) + + return cur_hidden, alpha + + +class DropPath(nn.Layer): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" + + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) + + +class Identity(nn.Layer): + def __init__(self): + super(Identity, self).__init__() + + def forward(self, input): + return input + + +class Mlp(nn.Layer): + def __init__( + self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.0, + ): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class TableAttentionHead(nn.Layer): + def __init__( + self, + in_channels, + hidden_size, + in_max_len=488, + max_text_length=800, + out_channels=30, + loc_reg_num=4, + **kwargs, + ): + super(TableAttentionHead, self).__init__() + self.input_size = in_channels[-1] + self.hidden_size = hidden_size + self.out_channels = out_channels + self.max_text_length = max_text_length + + self.structure_attention_cell = AttentionGRUCell( + self.input_size, hidden_size, self.out_channels, use_gru=False + ) + self.structure_generator = nn.Linear(hidden_size, self.out_channels) + self.in_max_len = in_max_len + + if self.in_max_len == 640: + 
self.loc_fea_trans = nn.Linear(400, self.max_text_length + 1) + elif self.in_max_len == 800: + self.loc_fea_trans = nn.Linear(625, self.max_text_length + 1) + else: + self.loc_fea_trans = nn.Linear(256, self.max_text_length + 1) + self.loc_generator = nn.Linear(self.input_size + hidden_size, loc_reg_num) + + def _char_to_onehot(self, input_char, onehot_dim): + input_ont_hot = F.one_hot(input_char, onehot_dim) + return input_ont_hot + + def forward(self, inputs, targets=None): + # if and else branch are both needed when you want to assign a variable + # if you modify the var in just one branch, then the modification will not work. + fea = inputs[-1] + last_shape = int(np.prod(fea.shape[2:])) # gry added + fea = paddle.reshape(fea, [fea.shape[0], fea.shape[1], last_shape]) + fea = fea.transpose([0, 2, 1]) # (NTC)(batch, width, channels) + batch_size = fea.shape[0] + + hidden = paddle.zeros((batch_size, self.hidden_size)) + output_hiddens = paddle.zeros( + (batch_size, self.max_text_length + 1, self.hidden_size) + ) + if self.training and targets is not None: + structure = targets[0] + for i in range(self.max_text_length + 1): + elem_onehots = self._char_to_onehot( + structure[:, i], onehot_dim=self.out_channels + ) + (outputs, hidden), alpha = self.structure_attention_cell( + hidden, fea, elem_onehots + ) + output_hiddens[:, i, :] = outputs + structure_probs = self.structure_generator(output_hiddens) + loc_fea = fea.transpose([0, 2, 1]) + loc_fea = self.loc_fea_trans(loc_fea) + loc_fea = loc_fea.transpose([0, 2, 1]) + loc_concat = paddle.concat([output_hiddens, loc_fea], axis=2) + loc_preds = self.loc_generator(loc_concat) + loc_preds = F.sigmoid(loc_preds) + else: + temp_elem = paddle.zeros(shape=[batch_size], dtype="int32") + structure_probs = None + loc_preds = None + elem_onehots = None + outputs = None + alpha = None + max_text_length = paddle.to_tensor(self.max_text_length) + for i in range(max_text_length + 1): + elem_onehots = self._char_to_onehot( + 
temp_elem, onehot_dim=self.out_channels + ) + (outputs, hidden), alpha = self.structure_attention_cell( + hidden, fea, elem_onehots + ) + output_hiddens[:, i, :] = outputs + structure_probs_step = self.structure_generator(outputs) + temp_elem = structure_probs_step.argmax(axis=1, dtype="int32") + + structure_probs = self.structure_generator(output_hiddens) + structure_probs = F.softmax(structure_probs) + loc_fea = fea.transpose([0, 2, 1]) + loc_fea = self.loc_fea_trans(loc_fea) + loc_fea = loc_fea.transpose([0, 2, 1]) + loc_concat = paddle.concat([output_hiddens, loc_fea], axis=2) + loc_preds = self.loc_generator(loc_concat) + loc_preds = F.sigmoid(loc_preds) + return {"structure_probs": structure_probs, "loc_preds": loc_preds} + + +class HWAttention(nn.Layer): + def __init__( + self, + head_dim=32, + qk_scale=None, + attn_drop=0.0, + ): + super().__init__() + self.head_dim = head_dim + self.scale = qk_scale or self.head_dim**-0.5 + self.attn_drop = nn.Dropout(attn_drop) + + def forward(self, x): + B, N, C = x.shape + C = C // 3 + qkv = x.reshape([B, N, 3, C // self.head_dim, self.head_dim]).transpose( + [2, 0, 3, 1, 4] + ) + q, k, v = qkv.unbind(0) + attn = q @ k.transpose([0, 1, 3, 2]) * self.scale + attn = F.softmax(attn, -1) + attn = self.attn_drop(attn) + x = attn @ v + x = x.transpose([0, 2, 1]).reshape([B, N, C]) + return x + + +def img2windows(img, H_sp, W_sp): + """ + img: B C H W + """ + B, H, W, C = img.shape + img_reshape = img.reshape([B, H // H_sp, H_sp, W // W_sp, W_sp, C]) + img_perm = img_reshape.transpose([0, 1, 3, 2, 4, 5]).reshape([-1, H_sp * W_sp, C]) + return img_perm + + +def windows2img(img_splits_hw, H_sp, W_sp, H, W): + """ + img_splits_hw: B' H W C + """ + B = int(img_splits_hw.shape[0] / (H * W / H_sp / W_sp)) + + img = img_splits_hw.reshape([B, H // H_sp, W // W_sp, H_sp, W_sp, -1]) + img = img.transpose([0, 1, 3, 2, 4, 5]).flatten(1, 4) + return img + + +class Block(nn.Layer): + def __init__( + self, + dim, + num_heads, + split_h=4, + 
split_w=4, + h_num_heads=None, + w_num_heads=None, + mlp_ratio=4.0, + qkv_bias=False, + qk_scale=None, + drop=0.0, + attn_drop=0.0, + drop_path=0.0, + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + eps=1e-6, + ): + super().__init__() + self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias) + self.proj = nn.Linear(dim, dim) + self.split_h = split_h + self.split_w = split_w + mlp_hidden_dim = int(dim * mlp_ratio) + self.norm1 = norm_layer(dim, epsilon=eps) + self.h_num_heads = h_num_heads if h_num_heads is not None else num_heads // 2 + self.w_num_heads = w_num_heads if w_num_heads is not None else num_heads // 2 + self.head_dim = dim // num_heads + self.mixer = HWAttention( + head_dim=dim // num_heads, + qk_scale=qk_scale, + attn_drop=attn_drop, + ) + self.drop_path = DropPath(drop_path) if drop_path > 0.0 else Identity() + self.norm2 = norm_layer(dim, epsilon=eps) + self.mlp = Mlp( + in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop, + ) + + def forward(self, x): + B, C, H, W = x.shape + x = x.flatten(2).transpose([0, 2, 1]) + + qkv = self.qkv(x).reshape([B, H, W, 3 * C]) + + x1 = qkv[:, :, :, : 3 * self.h_num_heads * self.head_dim] # b, h, w, 3ch + x2 = qkv[:, :, :, 3 * self.h_num_heads * self.head_dim :] # b, h, w, 3cw + + x1 = self.mixer(img2windows(x1, self.split_h, W)) # b*splith, W, 3ch + x2 = self.mixer(img2windows(x2, H, self.split_w)) # b*splitw, h, 3ch + x1 = windows2img(x1, self.split_h, W, H, W) + x2 = windows2img(x2, H, self.split_w, H, W) + + attened_x = paddle.concat([x1, x2], 2) + attened_x = self.proj(attened_x) + + x = self.norm1(x + self.drop_path(attened_x)) + x = self.norm2(x + self.drop_path(self.mlp(x))) + x = x.transpose([0, 2, 1]).reshape([-1, C, H, W]) + return x + + +class SLAHead(nn.Layer): + def __init__( + self, + in_channels, + hidden_size, + out_channels=30, + max_text_length=500, + loc_reg_num=4, + fc_decay=0.0, + use_attn=False, + **kwargs, + ): + """ + @param in_channels: input shape + @param 
hidden_size: hidden_size for RNN and Embedding + @param out_channels: num_classes to rec + @param max_text_length: max text pred + """ + super().__init__() + + if isinstance(in_channels, int): + self.is_next = True + in_channels = 512 + else: + self.is_next = False + in_channels = in_channels[-1] + self.hidden_size = hidden_size + self.max_text_length = max_text_length + self.emb = self._char_to_onehot + self.num_embeddings = out_channels + self.loc_reg_num = loc_reg_num + self.eos = self.num_embeddings - 1 + + # structure + self.structure_attention_cell = AttentionGRUCell( + in_channels, hidden_size, self.num_embeddings + ) + weight_attr, bias_attr = get_para_bias_attr(l2_decay=fc_decay, k=hidden_size) + weight_attr1_1, bias_attr1_1 = get_para_bias_attr( + l2_decay=fc_decay, k=hidden_size + ) + weight_attr1_2, bias_attr1_2 = get_para_bias_attr( + l2_decay=fc_decay, k=hidden_size + ) + self.structure_generator = nn.Sequential( + nn.Linear( + self.hidden_size, + self.hidden_size, + weight_attr=weight_attr1_2, + bias_attr=bias_attr1_2, + ), + nn.Linear( + hidden_size, out_channels, weight_attr=weight_attr, bias_attr=bias_attr + ), + ) + dpr = np.linspace(0, 0.1, 2) + + self.use_attn = use_attn + if use_attn: + layer_list = [ + Block( + in_channels, + num_heads=2, + mlp_ratio=4.0, + qkv_bias=True, + drop_path=dpr[i], + ) + for i in range(2) + ] + self.cross_atten = nn.Sequential(*layer_list) + # loc + weight_attr1, bias_attr1 = get_para_bias_attr( + l2_decay=fc_decay, k=self.hidden_size + ) + weight_attr2, bias_attr2 = get_para_bias_attr( + l2_decay=fc_decay, k=self.hidden_size + ) + self.loc_generator = nn.Sequential( + nn.Linear( + self.hidden_size, + self.hidden_size, + weight_attr=weight_attr1, + bias_attr=bias_attr1, + ), + nn.Linear( + self.hidden_size, + loc_reg_num, + weight_attr=weight_attr2, + bias_attr=bias_attr2, + ), + nn.Sigmoid(), + ) + + def forward(self, inputs, targets=None): + if self.is_next == True: + fea = inputs + batch_size = fea.shape[0] + 
else: + fea = inputs[-1] + batch_size = fea.shape[0] + if self.use_attn: + fea = fea + self.cross_atten(fea) + # reshape + fea = paddle.reshape(fea, [fea.shape[0], fea.shape[1], -1]) + fea = fea.transpose([0, 2, 1]) # (NTC)(batch, width, channels) + + hidden = paddle.zeros((batch_size, self.hidden_size)) + structure_preds = paddle.zeros( + (batch_size, self.max_text_length + 1, self.num_embeddings) + ) + loc_preds = paddle.zeros( + (batch_size, self.max_text_length + 1, self.loc_reg_num) + ) + structure_preds.stop_gradient = True + loc_preds.stop_gradient = True + + if self.training and targets is not None: + structure = targets[0] + max_len = targets[-2].max().astype("int32") + for i in range(max_len + 1): + hidden, structure_step, loc_step = self._decode( + structure[:, i], fea, hidden + ) + structure_preds[:, i, :] = structure_step + loc_preds[:, i, :] = loc_step + structure_preds = structure_preds[:, : max_len + 1] + loc_preds = loc_preds[:, : max_len + 1] + else: + structure_ids = paddle.zeros( + (batch_size, self.max_text_length + 1), dtype="int32" + ) + pre_chars = paddle.zeros(shape=[batch_size], dtype="int32") + max_text_length = paddle.to_tensor(self.max_text_length) + for i in range(max_text_length + 1): + hidden, structure_step, loc_step = self._decode(pre_chars, fea, hidden) + pre_chars = structure_step.argmax(axis=1, dtype="int32") + structure_preds[:, i, :] = structure_step + loc_preds[:, i, :] = loc_step + + structure_ids[:, i] = pre_chars + if (structure_ids == self.eos).any(-1).all(): + break + if not self.training: + structure_preds = F.softmax(structure_preds[:, : i + 1]) + loc_preds = loc_preds[:, : i + 1] + return {"structure_probs": structure_preds, "loc_preds": loc_preds} + + def _decode(self, pre_chars, features, hidden): + """ + Predict table label and coordinates for each step + @param pre_chars: Table label in previous step + @param features: + @param hidden: hidden status in previous step + @return: + """ + emb_feature = 
self.emb(pre_chars) + # output shape is b * self.hidden_size + (output, hidden), alpha = self.structure_attention_cell( + hidden, features, emb_feature + ) + + # structure + structure_step = self.structure_generator(output) + # loc + loc_step = self.loc_generator(output) + return hidden, structure_step, loc_step + + def _char_to_onehot(self, input_char): + input_ont_hot = F.one_hot(input_char, self.num_embeddings) + return input_ont_hot diff --git a/paddlex/inference/models/table_structure_recognition/predictor.py b/paddlex/inference/models/table_structure_recognition/predictor.py index cecc655769..f94c76ddaf 100644 --- a/paddlex/inference/models/table_structure_recognition/predictor.py +++ b/paddlex/inference/models/table_structure_recognition/predictor.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -17,6 +17,7 @@ import numpy as np from ....modules.table_recognition.model_list import MODELS +from ....utils.device import TemporaryDeviceChanger from ....utils.func_register import FuncRegister from ...common.batch_sampler import ImageBatchSampler from ...common.reader import ReadImage @@ -34,6 +35,7 @@ class TablePredictor(BasePredictor): def __init__(self, *args: List, **kwargs: Dict) -> None: super().__init__(*args, **kwargs) + self.device = kwargs.get("device", None) self.preprocessors, self.infer, self.postprocessors = self._build() def _build_batch_sampler(self) -> ImageBatchSampler: @@ -53,7 +55,24 @@ def _build(self) -> Tuple: preprocessors.append(op) preprocessors.append(ToBatch()) - infer = self.create_static_infer() + if self._use_static_model: + infer = self.create_static_infer() + else: + if self.model_name in ["SLANeXt_wired", "SLANeXt_wireless"]: + from .modeling import SLANeXt + + with TemporaryDeviceChanger(self.device): + infer = SLANeXt.from_pretrained( + self.model_dir, + use_safetensors=True, + convert_from_hf=True, + dtype="float32", + ) + infer.eval() + else: + raise RuntimeError( + f"There is no dynamic graph implementation for model {repr(self.model_name)}." 
+ ) postprocessors = TableLabelDecode( model_name=self.config["Global"]["model_name"], @@ -89,7 +108,11 @@ def process(self, batch_data: List[Union[str, np.ndarray]]) -> Dict[str, Any]: batch_imgs = self.preprocessors[4](imgs=pad_imgs) # ToCHWImage x = self.preprocessors[5](imgs=batch_imgs) # ToBatch - batch_preds = self.infer(x=x) + if self._use_static_model: + batch_preds = self.infer(x=x) + else: + with TemporaryDeviceChanger(self.device): + batch_preds = self.infer(x=x) table_result = self.postprocessors( pred=batch_preds, diff --git a/paddlex/inference/models/text_detection/modeling/__init__.py b/paddlex/inference/models/text_detection/modeling/__init__.py new file mode 100644 index 0000000000..6bc7a4fe06 --- /dev/null +++ b/paddlex/inference/models/text_detection/modeling/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .pp_ocrv5_mobile_det import PPOCRV5MobileDet +from .pp_ocrv5_server_det import PPOCRV5ServerDet diff --git a/paddlex/inference/models/text_detection/modeling/_config_pp_ocrv5_mobile.py b/paddlex/inference/models/text_detection/modeling/_config_pp_ocrv5_mobile.py new file mode 100644 index 0000000000..8f70289ddd --- /dev/null +++ b/paddlex/inference/models/text_detection/modeling/_config_pp_ocrv5_mobile.py @@ -0,0 +1,107 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ...common.transformers.transformers import PretrainedConfig + +DEFAULT_CONFIG = { + "model_name": "PP-OCRv5_mobile_det", + "algorithm": "DB", + "backbone": { + "name": "PPLCNetV3", + "scale": 1.0, + "det": True, + "conv_kxk_num": 4, + "reduction": 4, + "lr_mult_list": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0], + "lab_lr": 0.1, + "out_channels": 512, + "act": "hswish", + "net_config": { + "blocks2": [[3, 16, 24, 1, False]], + "blocks3": [[3, 24, 48, 2, False], [3, 48, 48, 1, False]], + "blocks4": [[3, 48, 96, 2, False], [3, 96, 96, 1, False]], + "blocks5": [ + [3, 96, 192, 2, False], + [5, 192, 192, 1, False], + [5, 192, 192, 1, False], + [5, 192, 192, 1, False], + [5, 192, 192, 1, False], + ], + "blocks6": [ + [5, 192, 384, 2, True], + [5, 384, 384, 1, True], + [5, 384, 384, 1, False], + [5, 384, 384, 1, False], + ], + "layer_list_out_channels": [12, 18, 42, 360], + }, + }, + "neck": {"name": "RSEFPN", "out_channels": 96, "shortcut": True}, + "head": {"name": "DBHead", "k": 50, "kernel_list": [3, 2, 2], "fix_nan": False}, +} + + +class PPOCRV5MobileDetConfig(PretrainedConfig): + model_type = "det" + + def __init__(self, **kwargs): + super().__init__(**kwargs) + + self.model_name = kwargs.get("model_name", DEFAULT_CONFIG["model_name"]) + self.algorithm = kwargs.get("algorithm", DEFAULT_CONFIG["algorithm"]) + + backbone_cfg = kwargs.get("backbone", DEFAULT_CONFIG["backbone"]) + self.backbone_name = backbone_cfg.get( + "name", 
DEFAULT_CONFIG["backbone"]["name"] + ) + self.backbone_scale = backbone_cfg.get( + "scale", DEFAULT_CONFIG["backbone"]["scale"] + ) + self.backbone_det = backbone_cfg.get("det", DEFAULT_CONFIG["backbone"]["det"]) + self.backbone_conv_kxk_num = backbone_cfg.get( + "conv_kxk_num", DEFAULT_CONFIG["backbone"]["conv_kxk_num"] + ) + self.backbone_reduction = backbone_cfg.get( + "reduction", DEFAULT_CONFIG["backbone"]["reduction"] + ) + self.backbone_lr_mult_list = backbone_cfg.get( + "lr_mult_list", DEFAULT_CONFIG["backbone"]["lr_mult_list"] + ) + self.backbone_lab_lr = backbone_cfg.get( + "lab_lr", DEFAULT_CONFIG["backbone"]["lab_lr"] + ) + self.backbone_net_config = backbone_cfg.get( + "net_config", DEFAULT_CONFIG["backbone"]["net_config"] + ) + self.backbone_out_channels = backbone_cfg.get( + "out_channels", DEFAULT_CONFIG["backbone"]["out_channels"] + ) + self.backbone_act = backbone_cfg.get("act", DEFAULT_CONFIG["backbone"]["act"]) + + neck_cfg = kwargs.get("neck", DEFAULT_CONFIG["neck"]) + self.neck_name = neck_cfg.get("name", DEFAULT_CONFIG["neck"]["name"]) + self.neck_out_channels = neck_cfg.get( + "out_channels", DEFAULT_CONFIG["neck"]["out_channels"] + ) + self.neck_shortcut = neck_cfg.get( + "shortcut", DEFAULT_CONFIG["neck"]["shortcut"] + ) + + head_cfg = kwargs.get("head", DEFAULT_CONFIG["head"]) + self.head_name = head_cfg.get("name", DEFAULT_CONFIG["head"]["name"]) + self.head_k = head_cfg.get("k", DEFAULT_CONFIG["head"]["k"]) + self.head_kernel_list = head_cfg.get( + "kernel_list", DEFAULT_CONFIG["head"]["kernel_list"] + ) + self.head_fix_nan = head_cfg.get("fix_nan", DEFAULT_CONFIG["head"]["fix_nan"]) diff --git a/paddlex/inference/models/text_detection/modeling/_config_pp_ocrv5_server.py b/paddlex/inference/models/text_detection/modeling/_config_pp_ocrv5_server.py new file mode 100644 index 0000000000..a17540a068 --- /dev/null +++ b/paddlex/inference/models/text_detection/modeling/_config_pp_ocrv5_server.py @@ -0,0 +1,145 @@ +# Copyright (c) 2025 
PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ...common.transformers.transformers import PretrainedConfig + +DEFAULT_CONFIG = { + "model_type": "det", + "model_name": "PP-OCRv5_server_det", + "algorithm": "DB", + "upsample_mode": "nearest", + "upsample_align_mode": 1, + "backbone": { + "name": "PPHGNetV2", + "stem_channels": [3, 32, 48], + "stage_config": { + "stage1": [48, 48, 128, 1, False, False, 3, 6, 2], + "stage2": [128, 96, 512, 1, True, False, 3, 6, 2], + "stage3": [512, 192, 1024, 3, True, True, 5, 6, 2], + "stage4": [1024, 384, 2048, 1, True, True, 5, 6, 2], + }, + "use_lab": False, + "use_last_conv": True, + "class_expand": 2048, + "dropout_prob": 0.0, + "class_num": 1000, + "lr_mult_list": [1.0, 1.0, 1.0, 1.0, 1.0], + "det": True, + "out_indices": [0, 1, 2, 3], + }, + "neck": { + "name": "LKPAN", + "out_channels": 256, + "mode": "large", + "reduce_factor": 2, + "intraclblock_config": { + "reduce_channel": [1, 1, 0], + "return_channel": [1, 1, 0], + "v_layer_7x1": [[7, 1], [1, 1], [3, 0]], + "v_layer_5x1": [[5, 1], [1, 1], [2, 0]], + "v_layer_3x1": [[3, 1], [1, 1], [1, 0]], + "q_layer_1x7": [[1, 7], [1, 1], [0, 3]], + "q_layer_1x5": [[1, 5], [1, 1], [0, 2]], + "q_layer_1x3": [[1, 3], [1, 1], [0, 1]], + "c_layer_7x7": [[7, 7], [1, 1], [3, 3]], + "c_layer_5x5": [[5, 5], [1, 1], [2, 2]], + "c_layer_3x3": [[3, 3], [1, 1], [1, 1]], + }, + }, + "head": { + "name": "PFHeadLocal", + "in_channels": 1024, + "k": 50, 
+ "mode": "large", + "scale_factor": 2, + "act": "relu", + "kernel_list": [3, 2, 2], + "fix_nan": False, + }, +} + + +class PPOCRV5ServerDetConfig(PretrainedConfig): + model_type = "det" + + def __init__(self, **kwargs): + super().__init__(**kwargs) + + self.model_name = kwargs.get("model_name", DEFAULT_CONFIG["model_name"]) + self.algorithm = kwargs.get("algorithm", DEFAULT_CONFIG["algorithm"]) + self.upsample_mode = kwargs.get( + "upsample_mode", DEFAULT_CONFIG["upsample_mode"] + ) + self.upsample_align_mode = kwargs.get( + "upsample_align_mode", DEFAULT_CONFIG["upsample_align_mode"] + ) + + backbone_cfg = kwargs.get("backbone", DEFAULT_CONFIG["backbone"]) + self.backbone_name = backbone_cfg.get( + "name", DEFAULT_CONFIG["backbone"]["name"] + ) + self.backbone_stem_channels = backbone_cfg.get( + "stem_channels", DEFAULT_CONFIG["backbone"]["stem_channels"] + ) + self.backbone_stage_config = backbone_cfg.get( + "stage_config", DEFAULT_CONFIG["backbone"]["stage_config"] + ) + self.backbone_use_lab = backbone_cfg.get( + "use_lab", DEFAULT_CONFIG["backbone"]["use_lab"] + ) + self.backbone_use_last_conv = backbone_cfg.get( + "use_last_conv", DEFAULT_CONFIG["backbone"]["use_last_conv"] + ) + self.backbone_class_expand = backbone_cfg.get( + "class_expand", DEFAULT_CONFIG["backbone"]["class_expand"] + ) + self.backbone_class_num = backbone_cfg.get( + "class_num", DEFAULT_CONFIG["backbone"]["class_num"] + ) + self.backbone_lr_mult_list = backbone_cfg.get( + "lr_mult_list", DEFAULT_CONFIG["backbone"]["lr_mult_list"] + ) + self.backbone_det = backbone_cfg.get("det", DEFAULT_CONFIG["backbone"]["det"]) + self.backbone_out_indices = backbone_cfg.get( + "out_indices", DEFAULT_CONFIG["backbone"]["out_indices"] + ) + + neck_cfg = kwargs.get("neck", DEFAULT_CONFIG["neck"]) + self.neck_name = neck_cfg.get("name", DEFAULT_CONFIG["neck"]["name"]) + self.neck_out_channels = neck_cfg.get( + "out_channels", DEFAULT_CONFIG["neck"]["out_channels"] + ) + self.neck_mode = 
neck_cfg.get("mode", DEFAULT_CONFIG["neck"]["mode"]) + self.neck_reduce_factor = neck_cfg.get( + "reduce_factor", DEFAULT_CONFIG["neck"]["reduce_factor"] + ) + self.neck_intraclblock_config = neck_cfg.get( + "intraclblock_config", DEFAULT_CONFIG["neck"]["intraclblock_config"] + ) + + head_cfg = kwargs.get("head", DEFAULT_CONFIG["head"]) + self.head_name = head_cfg.get("name", DEFAULT_CONFIG["head"]["name"]) + self.head_in_channels = head_cfg.get( + "in_channels", DEFAULT_CONFIG["head"]["in_channels"] + ) + self.head_k = head_cfg.get("k", DEFAULT_CONFIG["head"]["k"]) + self.head_mode = head_cfg.get("mode", DEFAULT_CONFIG["head"]["mode"]) + self.head_scale_factor = head_cfg.get( + "scale_factor", DEFAULT_CONFIG["head"]["scale_factor"] + ) + self.head_act = head_cfg.get("act", DEFAULT_CONFIG["head"]["act"]) + self.head_kernel_list = head_cfg.get( + "kernel_list", DEFAULT_CONFIG["head"]["kernel_list"] + ) + self.head_fix_nan = head_cfg.get("fix_nan", DEFAULT_CONFIG["head"]["fix_nan"]) diff --git a/paddlex/inference/models/text_detection/modeling/pp_ocrv5_mobile_det.py b/paddlex/inference/models/text_detection/modeling/pp_ocrv5_mobile_det.py new file mode 100644 index 0000000000..da473439df --- /dev/null +++ b/paddlex/inference/models/text_detection/modeling/pp_ocrv5_mobile_det.py @@ -0,0 +1,709 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import Any, Dict, List, Optional, Union + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import ParamAttr +from paddle.nn.initializer import KaimingNormal +from paddle.regularizer import L2Decay + +from ....utils.benchmark import add_inference_operations, benchmark +from ...common.transformers.transformers import ( + BatchNormHFStateDictMixin, + PretrainedModel, +) +from ._config_pp_ocrv5_mobile import PPOCRV5MobileDetConfig +from .pp_ocrv5_modules import DBHead, LearnableAffineBlock + + +def make_divisible( + v: Union[int, float], divisor: int = 16, min_value: Optional[int] = None +) -> int: + """ + make_divisible: Adjust channel number to be divisible by specified divisor (network width optimization) + + Args: + v (Union[int, float]): Original channel number + divisor (int, optional): Divisor for channel adjustment, default 16 + min_value (Optional[int], optional): Minimum channel number after adjustment, default None + + Returns: + int: Adjusted channel number (integer) + """ + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + if new_v < 0.9 * v: + new_v += divisor + return new_v + + +class Act(nn.Layer): + """ + Act: Activation layer with Learnable Affine Block (LAB) + + Args: + act (str): Activation type, "relu" or "hswish" + lr_mult (float): Learning rate multiplier for LAB alpha + lab_lr (float): Learning rate multiplier for LAB beta + + Returns: + paddle.Tensor: Output tensor after activation and LAB + """ + + def __init__(self, act: str, lr_mult: float, lab_lr: float): + super().__init__() + if act == "hswish": + self.act = nn.Hardswish() + else: + assert act == "relu" + self.act = nn.ReLU() + self.lab = LearnableAffineBlock(lr_mult=lr_mult, lab_lr=lab_lr) + + def forward(self, x: paddle.Tensor) -> paddle.Tensor: + return self.lab(self.act(x)) + + +class ConvBNLayer(nn.Layer): + """ + ConvBNLayer: Convolution + Batch Normalization 
combination layer + + Args: + in_channels (int): Input channel number + out_channels (int): Output channel number + kernel_size (int): Convolution kernel size + stride (int): Convolution stride + lr_mult (float): Learning rate multiplier for conv/BN params + groups (int, optional): Group convolution number, default 1 + + Returns: + paddle.Tensor: Output tensor after conv and BN + """ + + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: int, + stride: int, + lr_mult: float, + groups: int = 1, + ): + super().__init__() + self.conv = nn.Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(initializer=KaimingNormal(), learning_rate=lr_mult), + bias_attr=False, + ) + + self.bn = nn.BatchNorm2D( + out_channels, + weight_attr=ParamAttr(regularizer=L2Decay(0.0), learning_rate=lr_mult), + bias_attr=ParamAttr(regularizer=L2Decay(0.0), learning_rate=lr_mult), + ) + + def forward(self, x: paddle.Tensor) -> paddle.Tensor: + x = self.conv(x) + x = self.bn(x) + return x + + +class LearnableRepLayer(nn.Layer): + """ + LearnableRepLayer: Learnable representation layer with multi-branch convolution fusion + + Args: + in_channels (int): Input channel number + out_channels (int): Output channel number + kernel_size (int): Convolution kernel size + act (str): Activation type, "relu" or "hswish" + stride (int): Convolution stride + lr_mult (float): Learning rate multiplier for conv/BN/LAB params + lab_lr (float): Learning rate multiplier for LAB beta + num_conv_branches (int): Number of kxk conv branches + groups (int, optional): Group convolution number, default 1 + + Returns: + paddle.Tensor: Output tensor after rep branches fusion and activation + """ + + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: int, + act: str, + stride: int, + lr_mult: float, + lab_lr: float, + num_conv_branches: int, + 
groups: int = 1, + ): + super().__init__() + self.groups = groups + self.stride = stride + self.kernel_size = kernel_size + self.in_channels = in_channels + self.out_channels = out_channels + self.num_conv_branches = num_conv_branches + self.padding = (kernel_size - 1) // 2 + + self.identity = ( + nn.BatchNorm2D( + num_features=in_channels, + weight_attr=ParamAttr(learning_rate=lr_mult), + bias_attr=ParamAttr(learning_rate=lr_mult), + ) + if out_channels == in_channels and stride == 1 + else None + ) + + self.conv_kxk = nn.LayerList( + [ + ConvBNLayer( + in_channels, + out_channels, + kernel_size, + stride, + groups=groups, + lr_mult=lr_mult, + ) + for _ in range(self.num_conv_branches) + ] + ) + + self.conv_1x1 = ( + ConvBNLayer( + in_channels, out_channels, 1, stride, groups=groups, lr_mult=lr_mult + ) + if kernel_size > 1 + else None + ) + + self.lab = LearnableAffineBlock(lr_mult=lr_mult, lab_lr=lab_lr) + self.act = Act(act=act, lr_mult=lr_mult, lab_lr=lab_lr) + + def forward(self, x: paddle.Tensor) -> paddle.Tensor: + + out = 0 + if self.identity is not None: + out += self.identity(x) + + if self.conv_1x1 is not None: + out += self.conv_1x1(x) + + for conv in self.conv_kxk: + out += conv(x) + + out = self.lab(out) + if self.stride != 2: + out = self.act(out) + return out + + +class SELayer(nn.Layer): + """ + SELayer: Squeeze-and-Excitation channel attention layer + + Args: + channel (int): Input/output channel number + reduction (int): Channel reduction ratio for excitation + lr_mult (float): Learning rate multiplier for conv params + + Returns: + paddle.Tensor: Output tensor after channel attention weighting + """ + + def __init__(self, channel: int, reduction: int, lr_mult: float): + super().__init__() + if "npu" in paddle.device.get_device(): + self.avg_pool = nn.MeanPool2D(1, 1) + else: + self.avg_pool = nn.AdaptiveAvgPool2D(1) + self.conv1 = nn.Conv2D( + in_channels=channel, + out_channels=channel // reduction, + kernel_size=1, + stride=1, + padding=0, + 
weight_attr=ParamAttr(learning_rate=lr_mult), + bias_attr=ParamAttr(learning_rate=lr_mult), + ) + self.relu = nn.ReLU() + self.conv2 = nn.Conv2D( + in_channels=channel // reduction, + out_channels=channel, + kernel_size=1, + stride=1, + padding=0, + weight_attr=ParamAttr(learning_rate=lr_mult), + bias_attr=ParamAttr(learning_rate=lr_mult), + ) + self.hardsigmoid = nn.Hardsigmoid() + + def forward(self, x: paddle.Tensor) -> paddle.Tensor: + identity = x + x = self.avg_pool(x) + x = self.conv1(x) + x = self.relu(x) + x = self.conv2(x) + x = self.hardsigmoid(x) + x = paddle.multiply(x=identity, y=x) + return x + + +class LCNetV3Block(nn.Layer): + """ + LCNetV3Block: Depthwise separable convolution block with SE attention (LCNetV3) + + Args: + in_channels (int): Number of input channels + out_channels (int): Number of output channels + act (str): Activation function type + stride (int): Convolution stride for depthwise conv + dw_size (int): Kernel size of depthwise convolution + use_se (bool): Whether to use SE channel attention module + conv_kxk_num (int): Number of conv branches in LearnableRepLayer + reduction (int): Channel reduction ratio for SE module + lr_mult (float): Learning rate multiplier for convolution parameters + lab_lr (float): Learning rate multiplier for learnable representation layer + + Returns: + paddle.Tensor: Output tensor after depthwise conv, SE (optional) and pointwise conv + """ + + def __init__( + self, + in_channels: int, + out_channels: int, + act: str, + stride: int, + dw_size: int, + use_se: bool, + conv_kxk_num: int, + reduction: int, + lr_mult: float, + lab_lr: float, + ): + super().__init__() + self.use_se = use_se + self.dw_conv = LearnableRepLayer( + in_channels=in_channels, + out_channels=in_channels, + kernel_size=dw_size, + act=act, + stride=stride, + groups=in_channels, + num_conv_branches=conv_kxk_num, + lr_mult=lr_mult, + lab_lr=lab_lr, + ) + if use_se: + self.se = SELayer(in_channels, reduction=reduction, lr_mult=lr_mult) + 
self.pw_conv = LearnableRepLayer( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + act=act, + stride=1, + num_conv_branches=conv_kxk_num, + lr_mult=lr_mult, + lab_lr=lab_lr, + ) + + def forward(self, x: paddle.Tensor) -> paddle.Tensor: + x = self.dw_conv(x) + if self.use_se: + x = self.se(x) + x = self.pw_conv(x) + return x + + +class PPLCNetV3(nn.Layer): + """ + PPLCNetV3: Lightweight convolutional network with learnable representation layers + + Args: + scale (float): Channel scale factor for network width adjustment + conv_kxk_num (int): Number of conv branches in LearnableRepLayer of LCNetV3Block + reduction (int): Channel reduction ratio for SE module in LCNetV3Block + act (str): Activation function type used in LCNetV3Block + lr_mult_list (List[float]): Learning rate multipliers for different layers, length must be 6 + lab_lr (float): Learning rate multiplier for learnable representation layer in LCNetV3Block + net_config (Dict[str, Any]): Network configuration dict containing block parameters and output channels + out_channels (int): Base number of output channels before scale adjustment + **kwargs: Additional keyword arguments + + Returns: + List[paddle.Tensor]: List of 4 feature tensors from different stages after 1x1 conv projection + """ + + def __init__( + self, + scale: float, + conv_kxk_num: int, + reduction: int, + act: str, + lr_mult_list: List[float], + lab_lr: float, + net_config: Dict[str, Any], + out_channels: int, + **kwargs, + ): + super().__init__() + self.scale = scale + self.lr_mult_list = lr_mult_list + + self.net_config = net_config + self.out_channels = make_divisible(out_channels * scale) + + assert isinstance( + self.lr_mult_list, (list, tuple) + ), "lr_mult_list should be in (list, tuple) but got {}".format( + type(self.lr_mult_list) + ) + assert ( + len(self.lr_mult_list) == 6 + ), "lr_mult_list length should be 6 but got {}".format(len(self.lr_mult_list)) + + self.conv1 = ConvBNLayer( + in_channels=3, + 
out_channels=make_divisible(16 * scale), + kernel_size=3, + stride=2, + lr_mult=self.lr_mult_list[0], + ) + + def _build_blocks(block_key, lr_mult_idx): + return nn.Sequential( + *[ + LCNetV3Block( + in_channels=make_divisible(in_c * scale), + out_channels=make_divisible(out_c * scale), + act=act, + dw_size=k, + stride=s, + use_se=se, + conv_kxk_num=conv_kxk_num, + reduction=reduction, + lr_mult=self.lr_mult_list[lr_mult_idx], + lab_lr=lab_lr, + ) + for i, (k, in_c, out_c, s, se) in enumerate( + self.net_config[block_key] + ) + ] + ) + + self.blocks2 = _build_blocks("blocks2", 1) + self.blocks3 = _build_blocks("blocks3", 2) + self.blocks4 = _build_blocks("blocks4", 3) + self.blocks5 = _build_blocks("blocks5", 4) + self.blocks6 = _build_blocks("blocks6", 5) + + mv_c = self.net_config["layer_list_out_channels"] + + self.out_channels = [ + make_divisible(self.net_config["blocks3"][-1][2] * scale), + make_divisible(self.net_config["blocks4"][-1][2] * scale), + make_divisible(self.net_config["blocks5"][-1][2] * scale), + make_divisible(self.net_config["blocks6"][-1][2] * scale), + ] + + self.layer_list = nn.LayerList( + [ + nn.Conv2D(self.out_channels[0], int(mv_c[0] * scale), 1, 1, 0), + nn.Conv2D(self.out_channels[1], int(mv_c[1] * scale), 1, 1, 0), + nn.Conv2D(self.out_channels[2], int(mv_c[2] * scale), 1, 1, 0), + nn.Conv2D(self.out_channels[3], int(mv_c[3] * scale), 1, 1, 0), + ] + ) + self.out_channels = [ + int(mv_c[0] * scale), + int(mv_c[1] * scale), + int(mv_c[2] * scale), + int(mv_c[3] * scale), + ] + + def forward(self, x: paddle.Tensor) -> List[paddle.Tensor]: + out_list = [] + x = self.conv1(x) + + x = self.blocks2(x) + x = self.blocks3(x) + out_list.append(x) + x = self.blocks4(x) + out_list.append(x) + x = self.blocks5(x) + out_list.append(x) + x = self.blocks6(x) + out_list.append(x) + + out_list[0] = self.layer_list[0](out_list[0]) + out_list[1] = self.layer_list[1](out_list[1]) + out_list[2] = self.layer_list[2](out_list[2]) + out_list[3] = 
self.layer_list[3](out_list[3]) + return out_list + + +class SEModule(nn.Layer): + """ + SEModule: Simplified Squeeze-and-Excitation channel attention module + + Args: + in_channels (int): Number of input channels + reduction (int): Channel reduction ratio for excitation layer + + Returns: + paddle.Tensor: Output tensor after channel attention weighting + """ + + def __init__(self, in_channels: int, reduction: int): + super(SEModule, self).__init__() + if "npu" in paddle.device.get_device(): + self.avg_pool = nn.MeanPool2D(1, 1) + else: + self.avg_pool = nn.AdaptiveAvgPool2D(1) + self.conv1 = nn.Conv2D( + in_channels=in_channels, + out_channels=in_channels // reduction, + kernel_size=1, + stride=1, + padding=0, + ) + self.conv2 = nn.Conv2D( + in_channels=in_channels // reduction, + out_channels=in_channels, + kernel_size=1, + stride=1, + padding=0, + ) + + def forward(self, inputs: paddle.Tensor) -> paddle.Tensor: + outputs = self.avg_pool(inputs) + outputs = self.conv1(outputs) + outputs = F.relu(outputs) + outputs = self.conv2(outputs) + outputs = F.hardsigmoid(outputs, slope=0.2, offset=0.5) + return inputs * outputs + + +class RSELayer(nn.Layer): + """ + RSELayer: Residual SE layer with convolution and shortcut connection + + Args: + in_channels (int): Number of input channels + out_channels (int): Number of output channels + kernel_size (int): Kernel size of convolution layer + shortcut (bool): Whether to add shortcut connection (residual) with SE output + reduction (int): Channel reduction ratio for SE module + + Returns: + paddle.Tensor: Output tensor after convolution, SE attention and optional shortcut + """ + + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: int, + shortcut: bool, + reduction: int, + ): + super(RSELayer, self).__init__() + weight_attr = paddle.nn.initializer.KaimingUniform() + self.out_channels = out_channels + self.in_conv = nn.Conv2D( + in_channels=in_channels, + out_channels=self.out_channels, + 
kernel_size=kernel_size, + padding=int(kernel_size // 2), + weight_attr=ParamAttr(initializer=weight_attr), + bias_attr=False, + ) + self.se_block = SEModule(self.out_channels, reduction=reduction) + self.shortcut = shortcut + + def forward(self, ins: paddle.Tensor) -> paddle.Tensor: + x = self.in_conv(ins) + if self.shortcut: + out = x + self.se_block(x) + else: + out = self.se_block(x) + return out + + +class RSEFPN(nn.Layer): + """ + RSEFPN: Feature Pyramid Network with Residual SE attention + + Args: + in_channels (List[int]): List of input channel numbers for multi-scale feature maps + out_channels (int): Number of output channels for RSELayer convolution + shortcut (bool): Whether to use shortcut connection in RSELayer + reduction (int): Channel reduction ratio for SE module in RSELayer + **kwargs: Additional keyword arguments + + Returns: + paddle.Tensor: Fused feature tensor after multi-scale feature aggregation and concatenation + """ + + def __init__( + self, + in_channels: List[int], + out_channels: int, + shortcut: bool, + reduction: int, + **kwargs, + ): + super(RSEFPN, self).__init__() + self.out_channels = out_channels + self.ins_conv = nn.LayerList() + self.inp_conv = nn.LayerList() + + for i in range(len(in_channels)): + self.ins_conv.append( + RSELayer( + in_channels[i], + out_channels, + kernel_size=1, + shortcut=shortcut, + reduction=reduction, + ) + ) + self.inp_conv.append( + RSELayer( + out_channels, + out_channels // 4, + kernel_size=3, + shortcut=shortcut, + reduction=reduction, + ) + ) + + def forward(self, x: List[paddle.Tensor]) -> paddle.Tensor: + c2, c3, c4, c5 = x + + in5 = self.ins_conv[3](c5) + in4 = self.ins_conv[2](c4) + in3 = self.ins_conv[1](c3) + in2 = self.ins_conv[0](c2) + + out4 = in4 + F.upsample( + in5, scale_factor=2, mode="nearest", align_mode=1 + ) # 1/16 + out3 = in3 + F.upsample( + out4, scale_factor=2, mode="nearest", align_mode=1 + ) # 1/8 + out2 = in2 + F.upsample( + out3, scale_factor=2, mode="nearest", 
align_mode=1 + ) # 1/4 + + p5 = self.inp_conv[3](in5) + p4 = self.inp_conv[2](out4) + p3 = self.inp_conv[1](out3) + p2 = self.inp_conv[0](out2) + + p5 = F.upsample(p5, scale_factor=8, mode="nearest", align_mode=1) + p4 = F.upsample(p4, scale_factor=4, mode="nearest", align_mode=1) + p3 = F.upsample(p3, scale_factor=2, mode="nearest", align_mode=1) + + fuse = paddle.concat([p5, p4, p3, p2], axis=1) + return fuse + + +class PPOCRV5MobileDet(BatchNormHFStateDictMixin, PretrainedModel): + """ + PPOCRV5MobileDet: Lightweight OCR detection model based on PPLCNetV3, RSEFPN and DBHead + + Args: + config (PPOCRV5MobileDetConfig): Configuration object containing model hyperparameters + + Returns: + List: List containing the detection output tensor (converted to numpy array on CPU) + """ + + config_class = PPOCRV5MobileDetConfig + + def __init__(self, config: PPOCRV5MobileDetConfig): + super().__init__(config) + + self.backbone_scale = config.backbone_scale + self.backbone_det = config.backbone_det + self.backbone_conv_kxk_num = config.backbone_conv_kxk_num + self.backbone_reduction = config.backbone_reduction + self.backbone_act = config.backbone_act + self.backbone_lr_mult_list = config.backbone_lr_mult_list + self.backbone_lab_lr = config.backbone_lab_lr + self.backbone_net_config = config.backbone_net_config + self.backbone_out_channels = config.backbone_out_channels + + self.neck_out_channels = config.neck_out_channels + self.neck_shortcut = config.neck_shortcut + + self.head_k = config.head_k + self.head_kernel_list = config.head_kernel_list + self.head_fix_nan = config.head_fix_nan + + self.backbone = PPLCNetV3( + scale=self.backbone_scale, + conv_kxk_num=self.backbone_conv_kxk_num, + reduction=self.backbone_reduction, + act=self.backbone_act, + lr_mult_list=self.backbone_lr_mult_list, + lab_lr=self.backbone_lab_lr, + net_config=self.backbone_net_config, + out_channels=self.backbone_out_channels, + ) + + neck_in_channels = self.backbone.out_channels + self.neck = 
RSEFPN( + in_channels=neck_in_channels, + out_channels=self.neck_out_channels, + shortcut=self.neck_shortcut, + reduction=self.backbone_reduction, + ) + + head_in_channels = self.neck_out_channels + self.head = DBHead( + in_channels=head_in_channels, + k=self.head_k, + kernel_list=self.head_kernel_list, + fix_nan=self.head_fix_nan, + ) + + add_inference_operations("pp_ocrv5_mobile_det_forward") + + @benchmark.timeit_with_options(name="pp_ocrv5_mobile_det_forward") + def forward(self, x: List) -> List: + + if isinstance(x, (list, tuple)): + x = x[0] + if not isinstance(x, paddle.Tensor): + x = paddle.to_tensor(x) + + x = self.backbone(x) + x = self.neck(x) + x = self.head(x) + + return [x.cpu().numpy()] diff --git a/paddlex/inference/models/text_detection/modeling/pp_ocrv5_modules.py b/paddlex/inference/models/text_detection/modeling/pp_ocrv5_modules.py new file mode 100644 index 0000000000..0e83ec91c4 --- /dev/null +++ b/paddlex/inference/models/text_detection/modeling/pp_ocrv5_modules.py @@ -0,0 +1,180 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import math +from typing import Any, List, Tuple, Union + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import ParamAttr +from paddle.nn.initializer import Constant + + +def get_bias_attr(k: float) -> ParamAttr: + """ + get_bias_attr + + Args: + k (float): Scaling factor for standard deviation calculation + + Returns: + ParamAttr: Parameter attribute with uniform initializer + """ + stdv = 1.0 / math.sqrt(k * 1.0) + initializer = paddle.nn.initializer.Uniform(-stdv, stdv) + bias_attr = ParamAttr(initializer=initializer) + return bias_attr + + +class LearnableAffineBlock(nn.Layer): + """ + LearnableAffineBlock + + Args: + scale_value (float, optional): Initial value for scale parameter, default is 1.0 + bias_value (float, optional): Initial value for bias parameter, default is 0.0 + lr_mult (float, optional): Learning rate multiplier for base learning rate, default is 1.0 + lab_lr (float, optional): Additional learning rate multiplier for affine parameters, default is 0.01 + + Returns: + paddle.Tensor: Output tensor after affine transformation (scale * x + bias) + """ + + def __init__( + self, + scale_value: float = 1.0, + bias_value: float = 0.0, + lr_mult: float = 1.0, + lab_lr: float = 0.01, + ) -> None: + super().__init__() + self.scale = self.create_parameter( + shape=[ + 1, + ], + default_initializer=Constant(value=scale_value), + attr=ParamAttr(learning_rate=lr_mult * lab_lr), + ) + self.add_parameter("scale", self.scale) + self.bias = self.create_parameter( + shape=[ + 1, + ], + default_initializer=Constant(value=bias_value), + attr=ParamAttr(learning_rate=lr_mult * lab_lr), + ) + self.add_parameter("bias", self.bias) + + def forward(self, x: paddle.Tensor) -> paddle.Tensor: + return self.scale * x + self.bias + + +class Head(nn.Layer): + """ + Head + + Args: + in_channels (int): Number of input channels + kernel_list (List[int]): List of kernel sizes for conv/transposed conv layers + fix_nan (bool): Whether to fix 
NaN issues (unused in current implementation) + **kwargs: Additional keyword arguments + + Returns: + paddle.Tensor: 1-channel sigmoid output tensor (or tuple with feature tensor if return_f=True) + """ + + def __init__( + self, in_channels: int, kernel_list: List[int], fix_nan: bool, **kwargs: Any + ) -> None: + super(Head, self).__init__() + + self.conv1 = nn.Conv2D( + in_channels=in_channels, + out_channels=in_channels // 4, + kernel_size=kernel_list[0], + padding=int(kernel_list[0] // 2), + weight_attr=ParamAttr(), + bias_attr=False, + ) + self.conv_bn1 = nn.BatchNorm( + num_channels=in_channels // 4, + param_attr=ParamAttr(initializer=paddle.nn.initializer.Constant(value=1.0)), + bias_attr=ParamAttr(initializer=paddle.nn.initializer.Constant(value=1e-4)), + act="relu", + ) + + self.conv2 = nn.Conv2DTranspose( + in_channels=in_channels // 4, + out_channels=in_channels // 4, + kernel_size=kernel_list[1], + stride=2, + weight_attr=ParamAttr(initializer=paddle.nn.initializer.KaimingUniform()), + bias_attr=get_bias_attr(in_channels // 4), + ) + self.conv_bn2 = nn.BatchNorm( + num_channels=in_channels // 4, + param_attr=ParamAttr(initializer=paddle.nn.initializer.Constant(value=1.0)), + bias_attr=ParamAttr(initializer=paddle.nn.initializer.Constant(value=1e-4)), + act="relu", + ) + self.conv3 = nn.Conv2DTranspose( + in_channels=in_channels // 4, + out_channels=1, + kernel_size=kernel_list[2], + stride=2, + weight_attr=ParamAttr(initializer=paddle.nn.initializer.KaimingUniform()), + bias_attr=get_bias_attr(in_channels // 4), + ) + + self.fix_nan = fix_nan + + def forward( + self, x: paddle.Tensor, return_f: bool = False + ) -> Union[paddle.Tensor, Tuple]: + x = self.conv1(x) + x = self.conv_bn1(x) + x = self.conv2(x) + x = self.conv_bn2(x) + if return_f is True: + f = x + x = self.conv3(x) + x = F.sigmoid(x) + if return_f is True: + return x, f + return x + + +class DBHead(nn.Layer): + """ + DBHead + + Paper: https://arxiv.org/abs/1911.08947 + + Args: + in_channels 
(int): Number of input channels + k (int): DB head hyperparameter (kernel factor) + **kwargs: Additional keyword arguments for Head class (kernel_list, fix_nan) + + Returns: + paddle.Tensor: Shrinkage map tensor after DB binarization + """ + + def __init__(self, in_channels: int, k: int, **kwargs) -> None: + super(DBHead, self).__init__() + self.k = k + self.binarize = Head(in_channels, **kwargs) + + def forward(self, x: paddle.Tensor) -> paddle.Tensor: + shrink_maps = self.binarize(x) + return shrink_maps diff --git a/paddlex/inference/models/text_detection/modeling/pp_ocrv5_server_det.py b/paddlex/inference/models/text_detection/modeling/pp_ocrv5_server_det.py new file mode 100644 index 0000000000..9cfc4a0d4d --- /dev/null +++ b/paddlex/inference/models/text_detection/modeling/pp_ocrv5_server_det.py @@ -0,0 +1,1101 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 

from typing import Any, Dict, List, Optional, Tuple, Union

import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle import ParamAttr
from paddle.nn.initializer import Constant, KaimingNormal
from paddle.regularizer import L2Decay

from ....utils.benchmark import add_inference_operations, benchmark
from ...common.transformers.transformers import (
    BatchNormHFStateDictMixin,
    PretrainedModel,
)
from ._config_pp_ocrv5_server import PPOCRV5ServerDetConfig
from .pp_ocrv5_modules import DBHead, LearnableAffineBlock

# Module-level weight initializers shared by PPHGNetV2._init_weights.
kaiming_normal_ = KaimingNormal()
zeros_ = Constant(value=0.0)
ones_ = Constant(value=1.0)


class ConvBNAct(nn.Layer):
    """
    ConvBNAct: Convolution + Batch Normalization + Activation (optional) with Learnable Affine Block

    Args:
        in_channels (int): Number of input channels
        out_channels (int): Number of output channels
        kernel_size (int, optional): Convolution kernel size, default is 3
        stride (int, optional): Convolution stride, default is 1
        padding (Union[int, str], optional): Padding value or mode (e.g. 'same'), default is 1
        groups (int, optional): Number of grouped convolution groups, default is 1
        use_act (bool, optional): Whether to apply ReLU activation, default is True
        use_lab (bool, optional): Whether to use LearnableAffineBlock after activation, default is False
        lr_mult (float, optional): Learning rate multiplier for conv/bn parameters, default is 1.0

    Returns:
        paddle.Tensor: Output tensor after convolution, BN and optional activation/affine transform
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int = 3,
        stride: int = 1,
        padding: Union[int, str] = 1,
        groups: int = 1,
        use_act: bool = True,
        use_lab: bool = False,
        lr_mult: float = 1.0,
    ) -> None:
        super().__init__()
        self.use_act = use_act
        self.use_lab = use_lab
        # NOTE(review): an *integer* `padding` argument is effectively ignored —
        # padding is recomputed as (kernel_size - 1) // 2 unless a string mode
        # (e.g. "SAME") is passed. TODO confirm this matches callers' intent.
        padding_val = padding if isinstance(padding, str) else (kernel_size - 1) // 2
        self.conv = nn.Conv2D(
            in_channels,
            out_channels,
            kernel_size,
            stride,
            padding=padding_val,
            groups=groups,
            weight_attr=ParamAttr(learning_rate=lr_mult),
            bias_attr=False,
        )
        # BN scale/offset carry no weight decay (L2Decay(0.0)).
        self.bn = nn.BatchNorm2D(
            out_channels,
            weight_attr=ParamAttr(regularizer=L2Decay(0.0), learning_rate=lr_mult),
            bias_attr=ParamAttr(regularizer=L2Decay(0.0), learning_rate=lr_mult),
        )
        if self.use_act:
            self.act = nn.ReLU()
        if self.use_lab:
            self.lab = LearnableAffineBlock(lr_mult=lr_mult)

    def forward(self, x: paddle.Tensor) -> paddle.Tensor:
        x = self.conv(x)
        x = self.bn(x)
        if self.use_act:
            x = self.act(x)
        if self.use_lab:
            x = self.lab(x)
        return x


class LightConvBNAct(nn.Layer):
    """
    LightConvBNAct: Lightweight depthwise separable convolution block with BN and activation

    Args:
        in_channels (int): Number of input channels
        out_channels (int): Number of output channels
        kernel_size (int): Kernel size of depthwise convolution
        use_lab (bool, optional): Whether to use LearnableAffineBlock in ConvBNAct, default is False
        lr_mult (float, optional): Learning rate multiplier for conv/bn parameters, default is 1.0
        **kwargs: Additional keyword arguments

    Returns:
        paddle.Tensor: Output tensor after pointwise conv (no act) + depthwise conv (with act)
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int,
        use_lab: bool = False,
        lr_mult: float = 1.0,
        **kwargs,
    ) -> None:
        super().__init__()
        # 1x1 pointwise conv first (no activation)...
        self.conv1 = ConvBNAct(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=1,
            use_act=False,
            use_lab=use_lab,
            lr_mult=lr_mult,
        )
        # ...then depthwise conv (groups == channels) with activation.
        self.conv2 = ConvBNAct(
            in_channels=out_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            groups=out_channels,
            use_act=True,
            use_lab=use_lab,
            lr_mult=lr_mult,
        )

    def forward(self, x: paddle.Tensor) -> paddle.Tensor:
        x = self.conv1(x)
        x = self.conv2(x)
        return x


class StemBlock(nn.Layer):
    """
    StemBlock: Multi-stage convolution stem block with pooling and concatenation

    Args:
        in_channels (int): Number of input channels
        mid_channels (int): Number of intermediate channels for stem layers
        out_channels (int): Number of output channels
        use_lab (bool, optional): Whether to use LearnableAffineBlock in ConvBNAct, default is False
        lr_mult (float, optional): Learning rate multiplier for conv/bn parameters, default is 1.0

    Returns:
        paddle.Tensor: Output tensor after multi-stage convolution, pooling and concatenation
    """

    def __init__(
        self,
        in_channels: int,
        mid_channels: int,
        out_channels: int,
        use_lab: bool = False,
        lr_mult: float = 1.0,
    ) -> None:
        super().__init__()
        # Stride-2 entry conv (first 2x spatial reduction).
        self.stem1 = ConvBNAct(
            in_channels=in_channels,
            out_channels=mid_channels,
            kernel_size=3,
            stride=2,
            use_lab=use_lab,
            lr_mult=lr_mult,
        )
        # Bottleneck branch: squeeze to mid/2 then expand back to mid.
        self.stem2a = ConvBNAct(
            in_channels=mid_channels,
            out_channels=mid_channels // 2,
            kernel_size=2,
            stride=1,
            padding="SAME",
            use_lab=use_lab,
            lr_mult=lr_mult,
        )
        self.stem2b = ConvBNAct(
            in_channels=mid_channels // 2,
            out_channels=mid_channels,
            kernel_size=2,
            stride=1,
            padding="SAME",
            use_lab=use_lab,
            lr_mult=lr_mult,
        )
        # Consumes the concat of pooled branch + conv branch (2 * mid channels).
        self.stem3 = ConvBNAct(
            in_channels=mid_channels * 2,
            out_channels=mid_channels,
            kernel_size=3,
            stride=2,
            use_lab=use_lab,
            lr_mult=lr_mult,
        )
        self.stem4 = ConvBNAct(
            in_channels=mid_channels,
            out_channels=out_channels,
            kernel_size=1,
            stride=1,
            use_lab=use_lab,
            lr_mult=lr_mult,
        )
        # stride=1 + "SAME" padding keeps the pooled branch the same size as
        # the conv branch so they can be concatenated.
        self.pool = nn.MaxPool2D(
            kernel_size=2, stride=1, ceil_mode=True, padding="SAME"
        )

    def forward(self, x: paddle.Tensor) -> paddle.Tensor:
        x = self.stem1(x)
        x2 = self.stem2a(x)
        x2 = self.stem2b(x2)
        x1 = self.pool(x)
        x = paddle.concat([x1, x2], 1)
        x = self.stem3(x)
        x = self.stem4(x)

        return x


class HGV2_Block(nn.Layer):
    """
    HGV2_Block: Multi-layer convolution block with feature aggregation and residual connection

    Args:
        in_channels (int): Number of input channels
        mid_channels (int): Number of intermediate channels for each layer
        out_channels (int): Number of output channels after aggregation
        kernel_size (int, optional): Kernel size of convolution layers, default is 3
        layer_num (int, optional): Number of convolution layers in the block, default is 6
        identity (bool, optional): Whether to add identity residual connection, default is False
        light_block (bool, optional): Whether to use LightConvBNAct (True) or ConvBNAct (False), default is True
        use_lab (bool, optional): Whether to use LearnableAffineBlock in conv layers, default is False
        lr_mult (float, optional): Learning rate multiplier for conv/bn parameters, default is 1.0

    Returns:
        paddle.Tensor: Output tensor after multi-layer conv, feature concatenation and aggregation
    """

    def __init__(
        self,
        in_channels: int,
        mid_channels: int,
        out_channels: int,
        kernel_size: int = 3,
        layer_num: int = 6,
        identity: bool = False,
        light_block: bool = True,
        use_lab: bool = False,
        lr_mult: float = 1.0,
    ) -> None:
super().__init__() + self.identity = identity + + self.layers = nn.LayerList() + block_type = "LightConvBNAct" if light_block else "ConvBNAct" + for i in range(layer_num): + self.layers.append( + eval(block_type)( + in_channels=in_channels if i == 0 else mid_channels, + out_channels=mid_channels, + stride=1, + kernel_size=kernel_size, + use_lab=use_lab, + lr_mult=lr_mult, + ) + ) + # feature aggregation + total_channels = in_channels + layer_num * mid_channels + self.aggregation_squeeze_conv = ConvBNAct( + in_channels=total_channels, + out_channels=out_channels // 2, + kernel_size=1, + stride=1, + use_lab=use_lab, + lr_mult=lr_mult, + ) + self.aggregation_excitation_conv = ConvBNAct( + in_channels=out_channels // 2, + out_channels=out_channels, + kernel_size=1, + stride=1, + use_lab=use_lab, + lr_mult=lr_mult, + ) + + def forward(self, x: paddle.Tensor) -> paddle.Tensor: + identity = x + output = [] + output.append(x) + for layer in self.layers: + x = layer(x) + output.append(x) + x = paddle.concat(output, axis=1) + x = self.aggregation_squeeze_conv(x) + x = self.aggregation_excitation_conv(x) + if self.identity: + x += identity + return x + + +class HGV2_Stage(nn.Layer): + """ + HGV2_Stage: Sequential HGV2_Block layers with optional depthwise downsampling + + Args: + in_channels (int): Number of input channels + mid_channels (int): Number of intermediate channels for HGV2_Block layers + out_channels (int): Number of output channels for each HGV2_Block + block_num (int): Number of HGV2_Block in the stage + layer_num (int, optional): Number of convolution layers in each HGV2_Block, default is 6 + is_downsample (bool, optional): Whether to apply depthwise downsampling at stage start, default is True + light_block (bool, optional): Whether to use LightConvBNAct in HGV2_Block, default is True + kernel_size (int, optional): Kernel size of convolution layers in HGV2_Block, default is 3 + use_lab (bool, optional): Whether to use LearnableAffineBlock in conv layers, 
default is False + stride (int, optional): Stride for downsampling convolution, default is 2 + lr_mult (float, optional): Learning rate multiplier for conv/bn parameters, default is 1.0 + + Returns: + paddle.Tensor: Output tensor after optional downsampling and sequential HGV2_Block processing + """ + + def __init__( + self, + in_channels: int, + mid_channels: int, + out_channels: int, + block_num: int, + layer_num: int = 6, + is_downsample: bool = True, + light_block: bool = True, + kernel_size: int = 3, + use_lab: bool = False, + stride: int = 2, + lr_mult: float = 1.0, + ) -> None: + + super().__init__() + self.is_downsample = is_downsample + if self.is_downsample: + self.downsample = ConvBNAct( + in_channels=in_channels, + out_channels=in_channels, + kernel_size=3, + stride=stride, + groups=in_channels, + use_act=False, + use_lab=use_lab, + lr_mult=lr_mult, + ) + + blocks_list = [] + for i in range(block_num): + blocks_list.append( + HGV2_Block( + in_channels=in_channels if i == 0 else out_channels, + mid_channels=mid_channels, + out_channels=out_channels, + kernel_size=kernel_size, + layer_num=layer_num, + identity=False if i == 0 else True, + light_block=light_block, + use_lab=use_lab, + lr_mult=lr_mult, + ) + ) + self.blocks = nn.Sequential(*blocks_list) + + def forward(self, x: paddle.Tensor) -> paddle.Tensor: + if self.is_downsample: + x = self.downsample(x) + x = self.blocks(x) + return x + + +class PPHGNetV2(nn.Layer): + """ + PPHGNetV2: Hierarchical feature extraction network with stem block and multi-stage HGV2 blocks + + Args: + stage_config (Dict[str, Tuple]): Dictionary of stage configurations, each tuple contains (in_channels, mid_channels, out_channels, block_num, is_downsample, light_block, kernel_size, layer_num, stride) + stem_channels (Tuple[int, int, int]): Stem block channels (in, mid, out) + use_lab (bool): Whether to use LearnableAffineBlock in ConvBNAct layers + use_last_conv (bool): Whether to use last convolution layer (unused in 
current implementation) + class_expand (float): Expansion factor for classification head channels + class_num (int): Number of classification classes + lr_mult_list (List[float]): Learning rate multipliers for stem and each stage + det (bool): Whether the network is used for detection (controls output indices) + out_indices (List[int]): Indices of stages to output features for detection + **kwargs: Additional keyword arguments + + Returns: + List[paddle.Tensor]: List of feature tensors from specified stages (only when det=True) + """ + + def __init__( + self, + stage_config: Dict[str, Tuple], + stem_channels: Tuple[int, int, int], + use_lab: bool, + use_last_conv: bool, + class_expand: float, + class_num: int, + lr_mult_list: List[float], + det: bool, + out_indices: List[int], + **kwargs, + ) -> None: + super().__init__() + self.det = det + self.use_lab = use_lab + self.use_last_conv = use_last_conv + self.class_expand = class_expand + self.class_num = class_num + self.out_indices = out_indices + self.out_channels = [] + + # stem + self.stem = StemBlock( + in_channels=stem_channels[0], + mid_channels=stem_channels[1], + out_channels=stem_channels[2], + use_lab=use_lab, + lr_mult=lr_mult_list[0], + ) + + # stages + self.stages = nn.LayerList() + for i, k in enumerate(stage_config): + ( + in_channels, + mid_channels, + out_channels, + block_num, + is_downsample, + light_block, + kernel_size, + layer_num, + stride, + ) = stage_config[k] + self.stages.append( + HGV2_Stage( + in_channels, + mid_channels, + out_channels, + block_num, + layer_num, + is_downsample, + light_block, + kernel_size, + use_lab, + stride, + lr_mult=lr_mult_list[i + 1], + ) + ) + if i in self.out_indices: + self.out_channels.append(out_channels) + + self.avg_pool = nn.AdaptiveAvgPool2D(1) + + self._init_weights() + + def _init_weights(self) -> None: + for m in self.sublayers(): + if isinstance(m, nn.Conv2D): + kaiming_normal_(m.weight) + elif isinstance(m, (nn.BatchNorm2D)): + ones_(m.weight) + 
zeros_(m.bias) + elif isinstance(m, nn.Linear): + zeros_(m.bias) + + def forward(self, x: paddle.Tensor) -> List[paddle.Tensor]: + x = self.stem(x) + out = [] + for i, stage in enumerate(self.stages): + x = stage(x) + if self.det and i in self.out_indices: + out.append(x) + return out + + +class DSConv(nn.Layer): + """ + DSConv: Depthwise separable convolution with bottleneck and residual connection + + Args: + in_channels (int): Number of input channels + out_channels (int): Number of output channels + kernel_size (int): Kernel size of depthwise convolution + padding (Union[int, str]): Padding value or mode (e.g. 'SAME') for depthwise conv + stride (int, optional): Stride for depthwise convolution, default is 1 + groups (Optional[int], optional): Number of groups for depthwise conv (default: in_channels) + if_act (bool, optional): Whether to apply activation after second BN, default is True + act (str, optional): Activation function type ('relu' or 'hardswish'), default is 'relu' + **kwargs: Additional keyword arguments + + Returns: + paddle.Tensor: Output tensor after depthwise separable convolution and optional residual connection + """ + + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: int, + padding: Union[int, str], + stride: int = 1, + groups: Optional[int] = None, + if_act: bool = True, + act: str = "relu", + **kwargs, + ) -> None: + super(DSConv, self).__init__() + if groups == None: + groups = in_channels + self.if_act = if_act + self.act = act + self.conv1 = nn.Conv2D( + in_channels=in_channels, + out_channels=in_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=groups, + bias_attr=False, + ) + + self.bn1 = nn.BatchNorm(num_channels=in_channels, act=None) + + self.conv2 = nn.Conv2D( + in_channels=in_channels, + out_channels=int(in_channels * 4), + kernel_size=1, + stride=1, + bias_attr=False, + ) + + self.bn2 = nn.BatchNorm(num_channels=int(in_channels * 4), act=None) + + self.conv3 = 
nn.Conv2D( + in_channels=int(in_channels * 4), + out_channels=out_channels, + kernel_size=1, + stride=1, + bias_attr=False, + ) + self._c = [in_channels, out_channels] + if in_channels != out_channels: + self.conv_end = nn.Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + stride=1, + bias_attr=False, + ) + + def forward(self, inputs: paddle.Tensor) -> paddle.Tensor: + x = self.conv1(inputs) + x = self.bn1(x) + + x = self.conv2(x) + x = self.bn2(x) + if self.if_act: + if self.act == "relu": + x = F.relu(x) + elif self.act == "hardswish": + x = F.hardswish(x) + else: + print( + "The activation function({}) is selected incorrectly.".format( + self.act + ) + ) + exit() + + x = self.conv3(x) + if self._c[0] != self._c[1]: + x = x + self.conv_end(inputs) + return x + + +class IntraCLBlock(nn.Layer): + """ + IntraCLBlock: Multi-scale convolution block with vertical/horizontal kernel fusion + + Args: + in_channels (int): Number of input channels + reduce_factor (int): Channel reduction ratio for 1x1 convolution + intraclblock_config (dict): Configuration dict for convolution layers, includes: + - reduce_channel: (kernel_size, stride, padding) for channel reduction 1x1 conv + - return_channel: (kernel_size, stride, padding) for channel recovery 1x1 conv + - v_layer_7x1/5x1/3x1: (kernel_size, stride, padding) for vertical (Hx1) conv + - q_layer_1x7/1x5/1x3: (kernel_size, stride, padding) for horizontal (1xW) conv + - c_layer_7x7/5x5/3x3: (kernel_size, stride, padding) for cross (HxW) conv + + Returns: + paddle.Tensor: Output tensor after multi-scale conv fusion and residual connection + """ + + def __init__( + self, in_channels: int, reduce_factor: int, intraclblock_config: dict + ) -> None: + super(IntraCLBlock, self).__init__() + + self.channels = in_channels + self.reduce_factor = reduce_factor + self.intraclblock_config = intraclblock_config + + reduced_ch = self.channels // self.reduce_factor + + self.conv1x1_reduce_channel = nn.Conv2d( 
+ self.channels, reduced_ch, *self.intraclblock_config["reduce_channel"] + ) + self.conv1x1_return_channel = nn.Conv2d( + reduced_ch, self.channels, *self.intraclblock_config["return_channel"] + ) + + self.v_layer_7x1 = nn.Conv2d( + reduced_ch, reduced_ch, *self.intraclblock_config["v_layer_7x1"] + ) + self.v_layer_5x1 = nn.Conv2d( + reduced_ch, reduced_ch, *self.intraclblock_config["v_layer_5x1"] + ) + self.v_layer_3x1 = nn.Conv2d( + reduced_ch, reduced_ch, *self.intraclblock_config["v_layer_3x1"] + ) + + self.q_layer_1x7 = nn.Conv2d( + reduced_ch, reduced_ch, *self.intraclblock_config["q_layer_1x7"] + ) + self.q_layer_1x5 = nn.Conv2d( + reduced_ch, reduced_ch, *self.intraclblock_config["q_layer_1x5"] + ) + self.q_layer_1x3 = nn.Conv2d( + reduced_ch, reduced_ch, *self.intraclblock_config["q_layer_1x3"] + ) + + self.c_layer_7x7 = nn.Conv2d( + reduced_ch, reduced_ch, *self.intraclblock_config["c_layer_7x7"] + ) + self.c_layer_5x5 = nn.Conv2d( + reduced_ch, reduced_ch, *self.intraclblock_config["c_layer_5x5"] + ) + self.c_layer_3x3 = nn.Conv2d( + reduced_ch, reduced_ch, *self.intraclblock_config["c_layer_3x3"] + ) + + self.bn = nn.BatchNorm2D(self.channels) + self.relu = nn.ReLU() + + def forward(self, x: paddle.Tensor) -> paddle.Tensor: + x_new = self.conv1x1_reduce_channel(x) + + x_7 = ( + self.c_layer_7x7(x_new) + self.v_layer_7x1(x_new) + self.q_layer_1x7(x_new) + ) + x_5 = self.c_layer_5x5(x_7) + self.v_layer_5x1(x_7) + self.q_layer_1x5(x_7) + x_3 = self.c_layer_3x3(x_5) + self.v_layer_3x1(x_5) + self.q_layer_1x3(x_5) + + x_relation = self.conv1x1_return_channel(x_3) + x_relation = self.bn(x_relation) + x_relation = self.relu(x_relation) + + return x + x_relation + + +class LKPAN(nn.Layer): + """ + LKPAN: Feature pyramid network with multi-scale aggregation and IntraCL enhancement + + Args: + in_channels (List[int]): List of input channel numbers for multi-scale feature maps + out_channels (int): Number of output channels for 1x1 convolution layers + mode (str): 
Network mode ('lite' for DSConv, 'large' for standard Conv2D)
        reduce_factor (int): Channel reduction ratio for IntraCLBlock modules
        intraclblock_config (dict): Configuration dict for convolution layers, includes:
            - reduce_channel: (kernel_size, stride, padding) for channel reduction 1x1 conv
            - return_channel: (kernel_size, stride, padding) for channel recovery 1x1 conv
            - v_layer_7x1/5x1/3x1: (kernel_size, stride, padding) for vertical (Hx1) conv
            - q_layer_1x7/1x5/1x3: (kernel_size, stride, padding) for horizontal (1xW) conv
            - c_layer_7x7/5x5/3x3: (kernel_size, stride, padding) for cross (HxW) conv
        upsample_mode (str): Interpolation mode for upsample operation
        upsample_align_mode (int): Align mode for upsample operation
        **kwargs: Additional keyword arguments

    Returns:
        paddle.Tensor: Fused feature tensor after multi-scale feature aggregation, IntraCLBlock enhancement and concatenation
    """

    def __init__(
        self,
        in_channels: List[int],
        out_channels: int,
        mode: str,
        reduce_factor: int,
        intraclblock_config: dict,
        upsample_mode: str,
        upsample_align_mode: int,
        **kwargs,
    ) -> None:
        super(LKPAN, self).__init__()
        self.out_channels = out_channels
        self.upsample_mode = upsample_mode
        self.upsample_align_mode = upsample_align_mode
        weight_attr = nn.initializer.KaimingUniform()

        # 'lite' uses depthwise-separable convs, 'large' plain convs, for the
        # 9x9 lateral/path layers below.
        if mode.lower() == "lite":
            p_layer = DSConv
        elif mode.lower() == "large":
            p_layer = nn.Conv2D
        else:
            raise ValueError(
                "mode can only be one of ['lite', 'large'], but received {}".format(
                    mode
                )
            )

        # Index convention throughout: index 0 is the shallowest (highest
        # resolution) level c2, index 3 the deepest level c5.
        self.ins_conv = nn.LayerList()
        self.inp_conv = nn.LayerList()
        self.pan_head_conv = nn.LayerList()
        self.pan_lat_conv = nn.LayerList()

        for i in range(len(in_channels)):
            # 1x1 lateral projection of each backbone level to out_channels.
            self.ins_conv.append(
                nn.Conv2D(
                    in_channels=in_channels[i],
                    out_channels=self.out_channels,
                    kernel_size=1,
                    weight_attr=ParamAttr(initializer=weight_attr),
                    bias_attr=False,
                )
            )

            # Large-kernel (9x9) smoothing conv, reducing to out_channels // 4.
            self.inp_conv.append(
                p_layer(
                    in_channels=self.out_channels,
                    out_channels=self.out_channels // 4,
                    kernel_size=9,
                    padding=4,
                    weight_attr=ParamAttr(initializer=weight_attr),
                    bias_attr=False,
                )
            )

            if i > 0:
                # Bottom-up (PAN) stride-2 convs; only needed between levels,
                # hence len(in_channels) - 1 of them.
                self.pan_head_conv.append(
                    nn.Conv2D(
                        in_channels=self.out_channels // 4,
                        out_channels=self.out_channels // 4,
                        kernel_size=3,
                        padding=1,
                        stride=2,
                        weight_attr=ParamAttr(initializer=weight_attr),
                        bias_attr=False,
                    )
                )
            self.pan_lat_conv.append(
                p_layer(
                    in_channels=self.out_channels // 4,
                    out_channels=self.out_channels // 4,
                    kernel_size=9,
                    padding=4,
                    weight_attr=ParamAttr(initializer=weight_attr),
                    bias_attr=False,
                )
            )

        # One IntraCLBlock per pyramid level (incl1 ↔ p2 ... incl4 ↔ p5).
        self.incl1 = IntraCLBlock(
            self.out_channels // 4,
            reduce_factor=reduce_factor,
            intraclblock_config=intraclblock_config,
        )
        self.incl2 = IntraCLBlock(
            self.out_channels // 4,
            reduce_factor=reduce_factor,
            intraclblock_config=intraclblock_config,
        )
        self.incl3 = IntraCLBlock(
            self.out_channels // 4,
            reduce_factor=reduce_factor,
            intraclblock_config=intraclblock_config,
        )
        self.incl4 = IntraCLBlock(
            self.out_channels // 4,
            reduce_factor=reduce_factor,
            intraclblock_config=intraclblock_config,
        )

    def forward(self, x: List[paddle.Tensor]) -> paddle.Tensor:
        # Inputs ordered shallow -> deep.
        c2, c3, c4, c5 = x

        in5 = self.ins_conv[3](c5)
        in4 = self.ins_conv[2](c4)
        in3 = self.ins_conv[1](c3)
        in2 = self.ins_conv[0](c2)

        # Top-down (FPN) path: upsample deeper level and add.
        out4 = in4 + F.upsample(
            in5,
            scale_factor=2,
            mode=self.upsample_mode,
            align_mode=self.upsample_align_mode,
        )
        out3 = in3 + F.upsample(
            out4,
            scale_factor=2,
            mode=self.upsample_mode,
            align_mode=self.upsample_align_mode,
        )
        out2 = in2 + F.upsample(
            out3,
            scale_factor=2,
            mode=self.upsample_mode,
            align_mode=self.upsample_align_mode,
        )

        f5 = self.inp_conv[3](in5)
        f4 = self.inp_conv[2](out4)
        f3 = self.inp_conv[1](out3)
        f2 = self.inp_conv[0](out2)

        # Bottom-up (PAN) path: stride-2 conv of shallower level and add.
        pan3 = f3 + self.pan_head_conv[0](f2)
        pan4 = f4 + self.pan_head_conv[1](pan3)
        pan5 = f5 + self.pan_head_conv[2](pan4)

        p2 = self.pan_lat_conv[0](f2)
        p3 = self.pan_lat_conv[1](pan3)
        p4 = self.pan_lat_conv[2](pan4)
        p5 = self.pan_lat_conv[3](pan5)

        # Intra-level context enhancement.
        p5 = self.incl4(p5)
        p4 = self.incl3(p4)
        p3 = self.incl2(p3)
        p2 = self.incl1(p2)

        # Bring every level to p2's resolution and concatenate channel-wise.
        p5 = F.upsample(
            p5,
            scale_factor=8,
            mode=self.upsample_mode,
            align_mode=self.upsample_align_mode,
        )
        p4 = F.upsample(
            p4,
            scale_factor=4,
            mode=self.upsample_mode,
            align_mode=self.upsample_align_mode,
        )
        p3 = F.upsample(
            p3,
            scale_factor=2,
            mode=self.upsample_mode,
            align_mode=self.upsample_align_mode,
        )

        fuse = paddle.concat([p5, p4, p3, p2], axis=1)
        return fuse


class ConvBNLayer(nn.Layer):
    """
    ConvBNLayer: Basic convolution + batch normalization + optional activation block

    Args:
        in_channels (int): Number of input channels
        out_channels (int): Number of output channels
        kernel_size (int): Kernel size of convolution layer
        stride (int): Convolution stride
        padding (Union[int, str]): Padding value or mode (e.g.
'SAME') for convolution + groups (int, optional): Number of grouped convolution groups, default is 1 + if_act (bool, optional): Whether to apply activation function, default is True + act (Optional[str], optional): Activation function type ('relu' or 'hardswish'), default is None + + Returns: + paddle.Tensor: Output tensor after convolution, BN and optional activation + """ + + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: int, + stride: int, + padding: Union[int, str], + groups: int = 1, + if_act: bool = True, + act: Optional[str] = None, + ) -> None: + super(ConvBNLayer, self).__init__() + self.if_act = if_act + self.act = act + self.conv = nn.Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=groups, + bias_attr=False, + ) + + self.bn = nn.BatchNorm(num_channels=out_channels, act=None) + + def forward(self, x: paddle.Tensor) -> paddle.Tensor: + x = self.conv(x) + x = self.bn(x) + if self.if_act: + if self.act == "relu": + x = F.relu(x) + elif self.act == "hardswish": + x = F.hardswish(x) + else: + print( + "The activation function({}) is selected incorrectly.".format( + self.act + ) + ) + exit() + return x + + +class LocalModule(nn.Layer): + """ + LocalModule: Feature enhancement module with concatenation and 1x1 projection + + Args: + in_c (int): Number of input channels (before concatenation) + mid_c (int): Number of intermediate channels for 3x3 ConvBNLayer + act (str): Activation function type for ConvBNLayer + + Returns: + paddle.Tensor: 1-channel output tensor after concatenation, conv and projection + """ + + def __init__(self, in_c: int, mid_c: int, act: str) -> None: + super(self.__class__, self).__init__() + self.last_3 = ConvBNLayer(in_c + 1, mid_c, 3, 1, 1, act=act) + self.last_1 = nn.Conv2D(mid_c, 1, 1, 1, 0) + + def forward(self, x: paddle.Tensor, init_map: paddle.Tensor) -> paddle.Tensor: + outf = paddle.concat([init_map, x], 
axis=1) + out = self.last_1(self.last_3(outf)) + return out + + +class PFHeadLocal(DBHead): + """ + PFHeadLocal: Enhanced DB head with local feature refinement for detection + + Args: + in_channels (int): Number of input channels + k (int): DB head hyperparameter (kernel factor) + mode (str): Module size mode ('large' or 'small') to control intermediate channels + scale_factor (int): Upsampling scale factor for feature maps + act (str): Activation function type for LocalModule + upsample_mode (str): Interpolation mode for upsample operation + upsample_align_mode (int): Align mode for upsample operation + **kwargs: Additional keyword arguments for parent DBHead class + + Returns: + paddle.Tensor: Fused binarization map (average of base map and enhanced local map) + """ + + def __init__( + self, + in_channels: int, + k: int, + mode: str, + scale_factor: int, + act: str, + upsample_mode: str, + upsample_align_mode: int, + **kwargs: Any, + ) -> None: + super(PFHeadLocal, self).__init__(in_channels, k, **kwargs) + self.mode = mode + + self.up_conv = nn.Upsample( + scale_factor=scale_factor, + mode=upsample_mode, + align_mode=upsample_align_mode, + ) + + if mode == "large": + mid_ch = in_channels // 4 + elif mode == "small": + mid_ch = in_channels // 8 + else: + raise ValueError(f"mode must be 'large' or 'small', currently {mode}") + self.cbn_layer = LocalModule(in_channels // 4, mid_ch, act) + + def forward(self, x: paddle.Tensor) -> paddle.Tensor: + base_maps, f = self.binarize(x, return_f=True) + + cbn_maps = self.cbn_layer(self.up_conv(f), base_maps) + cbn_maps = F.sigmoid(cbn_maps) + + return 0.5 * (base_maps + cbn_maps) + + +class PPOCRV5ServerDet(BatchNormHFStateDictMixin, PretrainedModel): + """ + PPOCRV5ServerDet: Server-side OCR detection model with PPHGNetV2, LKPAN and PFHeadLocal + + Args: + config (PPOCRV5ServerDetConfig): Configuration object containing model hyperparameters + + Returns: + List: List containing the detection output tensor (converted to 
numpy array on CPU) + """ + + config_class = PPOCRV5ServerDetConfig + + def __init__(self, config: PPOCRV5ServerDetConfig) -> None: + super().__init__(config) + + self.upsample_mode = config.upsample_mode + self.upsample_align_mode = config.upsample_align_mode + self.backbone_stem_channels = config.backbone_stem_channels + self.backbone_stage_config = config.backbone_stage_config + self.backbone_use_lab = config.backbone_use_lab + self.backbone_use_last_conv = config.backbone_use_last_conv + self.backbone_class_expand = config.backbone_class_expand + self.backbone_class_num = config.backbone_class_num + self.backbone_lr_mult_list = config.backbone_lr_mult_list + self.backbone_det = config.backbone_det + self.backbone_out_indices = config.backbone_out_indices + + self.neck_out_channels = config.neck_out_channels + self.neck_mode = config.neck_mode + self.neck_reduce_factor = config.neck_reduce_factor + self.neck_intraclblock_config = config.neck_intraclblock_config + + self.head_in_channels = config.head_in_channels + self.head_k = config.head_k + self.head_mode = config.head_mode + self.head_scale_factor = config.head_scale_factor + self.head_act = config.head_act + self.head_kernel_list = config.head_kernel_list + self.head_fix_nan = config.head_fix_nan + + self.backbone = PPHGNetV2( + stage_config=self.backbone_stage_config, + stem_channels=self.backbone_stem_channels, + use_lab=self.backbone_use_lab, + use_last_conv=self.backbone_use_last_conv, + class_expand=self.backbone_class_expand, + class_num=self.backbone_class_num, + lr_mult_list=self.backbone_lr_mult_list, + det=self.backbone_det, + out_indices=self.backbone_out_indices, + ) + + neck_in_channels = self.backbone.out_channels + self.neck = LKPAN( + in_channels=neck_in_channels, + out_channels=self.neck_out_channels, + mode=self.neck_mode, + reduce_factor=self.neck_reduce_factor, + intraclblock_config=self.neck_intraclblock_config, + upsample_mode=self.upsample_mode, + 
upsample_align_mode=self.upsample_align_mode, + ) + + head_in_channels = self.neck.out_channels + self.head = PFHeadLocal( + in_channels=head_in_channels, + k=self.head_k, + mode=self.head_mode, + scale_factor=self.head_scale_factor, + act=self.head_act, + upsample_mode=self.upsample_mode, + upsample_align_mode=self.upsample_align_mode, + kernel_list=self.head_kernel_list, + fix_nan=self.head_fix_nan, + ) + + add_inference_operations("pp_ocrv5_server_det_forward") + + @benchmark.timeit_with_options(name="pp_ocrv5_server_det_forward") + def forward(self, x: List) -> List: + + x = paddle.to_tensor(x[0]) + + x = self.backbone(x) + x = self.neck(x) + x = self.head(x) + + return [x.cpu().numpy()] diff --git a/paddlex/inference/models/text_detection/predictor.py b/paddlex/inference/models/text_detection/predictor.py index 51333075ef..9168353d5e 100644 --- a/paddlex/inference/models/text_detection/predictor.py +++ b/paddlex/inference/models/text_detection/predictor.py @@ -17,6 +17,7 @@ import numpy as np from ....modules.text_detection.model_list import MODELS +from ....utils.device import TemporaryDeviceChanger from ....utils.func_register import FuncRegister from ...common.batch_sampler import ImageBatchSampler from ...common.reader import ReadImage @@ -43,7 +44,7 @@ def __init__( input_shape=None, max_side_limit: int = 4000, *args, - **kwargs + **kwargs, ): super().__init__(*args, **kwargs) @@ -54,6 +55,9 @@ def __init__( self.unclip_ratio = unclip_ratio self.input_shape = input_shape self.max_side_limit = max_side_limit + + self.device = kwargs.get("device", None) + self.pre_tfs, self.infer, self.post_op = self._build() def _build_batch_sampler(self): @@ -74,7 +78,29 @@ def _build(self): pre_tfs[name] = op pre_tfs["ToBatch"] = ToBatch() - infer = self.create_static_infer() + if self._use_static_model: + infer = self.create_static_infer() + else: + if self.model_name == "PP-OCRv5_mobile_det": + from .modeling import PPOCRV5MobileDet + + with 
TemporaryDeviceChanger(self.device): + infer = PPOCRV5MobileDet.from_pretrained( + self.model_dir, use_safetensors=True, convert_from_hf=True + ) + infer.eval() + elif self.model_name == "PP-OCRv5_server_det": + from .modeling import PPOCRV5ServerDet + + with TemporaryDeviceChanger(self.device): + infer = PPOCRV5ServerDet.from_pretrained( + self.model_dir, use_safetensors=True, convert_from_hf=True + ) + infer.eval() + else: + raise RuntimeError( + f"There is no dynamic graph implementation for model {repr(self.model_name)}." + ) post_op = self.build_postprocess(**self.config["PostProcess"]) return pre_tfs, infer, post_op @@ -102,7 +128,12 @@ def process( batch_imgs = self.pre_tfs["Normalize"](imgs=batch_imgs) batch_imgs = self.pre_tfs["ToCHW"](imgs=batch_imgs) x = self.pre_tfs["ToBatch"](imgs=batch_imgs) - batch_preds = self.infer(x=x) + + if self._use_static_model: + batch_preds = self.infer(x=x) + else: + with TemporaryDeviceChanger(self.device): + batch_preds = self.infer(x=x) polys, scores = self.post_op( batch_preds, batch_shapes, @@ -128,7 +159,7 @@ def build_resize( self, limit_side_len: Union[int, None] = None, limit_type: Union[str, None] = None, - **kwargs + **kwargs, ): # TODO: align to PaddleOCR @@ -150,7 +181,7 @@ def build_resize( limit_side_len=limit_side_len, limit_type=limit_type, input_shape=self.input_shape, - **kwargs + **kwargs, ) @register("NormalizeImage") diff --git a/paddlex/inference/models/text_detection/processors.py b/paddlex/inference/models/text_detection/processors.py index d26ee4b839..c43e7df6b4 100644 --- a/paddlex/inference/models/text_detection/processors.py +++ b/paddlex/inference/models/text_detection/processors.py @@ -167,7 +167,7 @@ def resize_image_type0( if max(resize_h, resize_w) > max_side_limit: logging.warning( - f"Resized image size ({resize_h}x{resize_w}) exceeds max_side_limit of {max_side_limit}. " + f"Resized image size ({resize_w}x{resize_h}) exceeds max_side_limit of {max_side_limit}. 
" f"Resizing to fit within limit." ) ratio = float(max_side_limit) / max(resize_h, resize_w) diff --git a/paddlex/inference/models/text_recognition/modeling/__init__.py b/paddlex/inference/models/text_recognition/modeling/__init__.py new file mode 100644 index 0000000000..c9d8cb297a --- /dev/null +++ b/paddlex/inference/models/text_recognition/modeling/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .pp_ocrv5_rec import PPOCRV5Rec diff --git a/paddlex/inference/models/text_recognition/modeling/pp_ocrv5_rec.py b/paddlex/inference/models/text_recognition/modeling/pp_ocrv5_rec.py new file mode 100644 index 0000000000..0c90123e6b --- /dev/null +++ b/paddlex/inference/models/text_recognition/modeling/pp_ocrv5_rec.py @@ -0,0 +1,148 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import paddle + +from ....utils.benchmark import add_inference_operations, benchmark +from ...common.transformers.transformers import PretrainedConfig, PretrainedModel +from .pp_ocrv5_rec_modules.rec_lcnetv3 import PPLCNetV3 +from .pp_ocrv5_rec_modules.rec_multi_head import MultiHead +from .pp_ocrv5_rec_modules.rec_pphgnetv2 import PPHGNetV2 + +__all__ = ["PPOCRV5Rec"] + + +class PPOCRV5RecConfig(PretrainedConfig): + def __init__( + self, + backbone, + MultiHead, + ): + self.backbone_name = backbone["name"] + if self.backbone_name == "PPLCNetV3": + self.net_config = backbone["net_config"] + self.scale = backbone["scale"] + self.conv_kxk_num = backbone["conv_kxk_num"] + self.lr_mult_list = backbone["lr_mult_list"] + self.lab_lr = backbone["lab_lr"] + elif self.backbone_name == "PPHGNetV2": + self.text_rec = backbone["text_rec"] + self.stem_channels = backbone["stem_channels"] + self.stage_config = backbone["stage_config"] + self.det = backbone["det"] + self.use_lab = backbone["use_lab"] + self.use_last_conv = backbone["use_last_conv"] + self.class_expand = backbone["class_expand"] + self.dropout_prob = backbone["dropout_prob"] + self.class_num = backbone["class_num"] + self.lr_mult_list = backbone["lr_mult_list"] + self.out_indices = backbone["out_indices"] + else: + raise RuntimeError( + f"There is no dynamic graph implementation for backbone {backbone['name']}." 
+ ) + self.head_list = MultiHead["head_list"] + self.decode_list = MultiHead["decode_list"] + self.tensor_parallel_degree = 1 + + +class PPOCRV5Rec(PretrainedModel): + + config_class = PPOCRV5RecConfig + + def __init__(self, config: PPOCRV5RecConfig): + super().__init__(config) + if self.config.backbone_name == "PPLCNetV3": + self.backbone = PPLCNetV3( + scale=self.config.scale, + net_config=self.config.net_config, + conv_kxk_num=self.config.conv_kxk_num, + lr_mult_list=self.config.lr_mult_list, + lab_lr=self.config.lab_lr, + ) + elif self.config.backbone_name == "PPHGNetV2": + self.backbone = PPHGNetV2( + stage_config=self.config.stage_config, + stem_channels=self.config.stem_channels, + text_rec=self.config.text_rec, + det=self.config.det, + use_lab=self.config.use_lab, + use_last_conv=self.config.use_last_conv, + class_expand=self.config.class_expand, + dropout_prob=self.config.dropout_prob, + class_num=self.config.class_num, + lr_mult_list=self.config.lr_mult_list, + out_indices=self.config.out_indices, + ) + self.head = MultiHead( + in_channels=self.backbone.out_channels, + out_channels_list=self.config.decode_list, + head_list=self.config.head_list, + ) + + add_inference_operations("pp_ocrv5_rec_forward") + + @benchmark.timeit_with_options(name="pp_ocrv5_rec_forward") + def forward(self, x): + x = paddle.to_tensor(x[0]) + x = self.backbone(x) + x = self.head(x) + return [x.cpu().numpy()] + + def get_transpose_weight_keys(self): + transpose_keys = ["fc", "out_proj", "attn.qkv"] + need_to_transpose = [] + all_weight_keys = [] + for name, param in self.head.named_parameters(): + all_weight_keys.append("head." 
+ name) + for i in range(len(all_weight_keys)): + for j in range(len(transpose_keys)): + if (transpose_keys[j] in all_weight_keys[i]) and ( + "bias" not in all_weight_keys[i] + ): + need_to_transpose.append(all_weight_keys[i]) + if self.config.backbone_name == "PPHGNetV2": + need_to_transpose.append("backbone.fc.weight") + return need_to_transpose + + def get_hf_state_dict(self, *args, **kwargs): + + model_state_dict = self.state_dict(*args, **kwargs) + + hf_state_dict = {} + for old_key, value in model_state_dict.items(): + if "_mean" in old_key: + new_key = old_key.replace("_mean", "running_mean") + elif "_variance" in old_key: + new_key = old_key.replace("_variance", "running_var") + else: + new_key = old_key + hf_state_dict[new_key] = value + + return hf_state_dict + + def set_hf_state_dict(self, state_dict, *args, **kwargs): + + key_mapping = {} + for old_key in list(state_dict.keys()): + if "running_mean" in old_key: + key_mapping[old_key] = old_key.replace("running_mean", "_mean") + elif "running_var" in old_key: + key_mapping[old_key] = old_key.replace("running_var", "_variance") + + for old_key, new_key in key_mapping.items(): + state_dict[new_key] = state_dict.pop(old_key) + + return self.set_state_dict(state_dict, *args, **kwargs) diff --git a/paddlex/inference/models/text_recognition/modeling/pp_ocrv5_rec_modules/rec_ctc_head.py b/paddlex/inference/models/text_recognition/modeling/pp_ocrv5_rec_modules/rec_ctc_head.py new file mode 100755 index 0000000000..7f0f1c00f5 --- /dev/null +++ b/paddlex/inference/models/text_recognition/modeling/pp_ocrv5_rec_modules/rec_ctc_head.py @@ -0,0 +1,90 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import, division, print_function + +import math + +import paddle +from paddle import ParamAttr, nn +from paddle.nn import functional as F + + +def get_para_bias_attr(l2_decay, k): + regularizer = paddle.regularizer.L2Decay(l2_decay) + stdv = 1.0 / math.sqrt(k * 1.0) + initializer = nn.initializer.Uniform(-stdv, stdv) + weight_attr = ParamAttr(regularizer=regularizer, initializer=initializer) + bias_attr = ParamAttr(regularizer=regularizer, initializer=initializer) + return [weight_attr, bias_attr] + + +class CTCHead(nn.Layer): + def __init__( + self, + in_channels, + out_channels, + fc_decay=0.0004, + mid_channels=None, + return_feats=False, + **kwargs, + ): + super(CTCHead, self).__init__() + if mid_channels is None: + weight_attr, bias_attr = get_para_bias_attr( + l2_decay=fc_decay, k=in_channels + ) + self.fc = nn.Linear( + in_channels, out_channels, weight_attr=weight_attr, bias_attr=bias_attr + ) + else: + weight_attr1, bias_attr1 = get_para_bias_attr( + l2_decay=fc_decay, k=in_channels + ) + self.fc1 = nn.Linear( + in_channels, + mid_channels, + weight_attr=weight_attr1, + bias_attr=bias_attr1, + ) + + weight_attr2, bias_attr2 = get_para_bias_attr( + l2_decay=fc_decay, k=mid_channels + ) + self.fc2 = nn.Linear( + mid_channels, + out_channels, + weight_attr=weight_attr2, + bias_attr=bias_attr2, + ) + self.out_channels = out_channels + self.mid_channels = mid_channels + self.return_feats = return_feats + + def forward(self, x, targets=None): + if self.mid_channels is None: + predicts = self.fc(x) + else: + x = 
self.fc1(x) + predicts = self.fc2(x) + + if self.return_feats: + result = (x, predicts) + else: + result = predicts + if not self.training: + predicts = F.softmax(predicts, axis=2) + result = predicts + + return result diff --git a/paddlex/inference/models/text_recognition/modeling/pp_ocrv5_rec_modules/rec_lcnetv3.py b/paddlex/inference/models/text_recognition/modeling/pp_ocrv5_rec_modules/rec_lcnetv3.py new file mode 100644 index 0000000000..73a4eb2916 --- /dev/null +++ b/paddlex/inference/models/text_recognition/modeling/pp_ocrv5_rec_modules/rec_lcnetv3.py @@ -0,0 +1,491 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import absolute_import, division, print_function + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import ParamAttr +from paddle.nn import ( + AdaptiveAvgPool2D, + BatchNorm2D, + Conv2D, + Hardsigmoid, + Hardswish, + ReLU, +) +from paddle.nn.initializer import Constant, KaimingNormal +from paddle.regularizer import L2Decay + + +def make_divisible(v, divisor=16, min_value=None): + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + if new_v < 0.9 * v: + new_v += divisor + return new_v + + +class MeanPool2D(nn.Layer): + def __init__(self, w, h): + super().__init__() + self.w = w + self.h = h + + def forward(self, feat): + batch_size, channels, _, _ = feat.shape + feat_flat = paddle.reshape(feat, [batch_size, channels, -1]) + feat_mean = paddle.mean(feat_flat, axis=2) + feat_mean = paddle.reshape(feat_mean, [batch_size, channels, self.w, self.h]) + return feat_mean + + +class LearnableAffineBlock(nn.Layer): + def __init__(self, scale_value=1.0, bias_value=0.0, lr_mult=1.0, lab_lr=0.1): + super().__init__() + self.scale = self.create_parameter( + shape=[ + 1, + ], + default_initializer=Constant(value=scale_value), + attr=ParamAttr(learning_rate=lr_mult * lab_lr), + ) + self.add_parameter("scale", self.scale) + self.bias = self.create_parameter( + shape=[ + 1, + ], + default_initializer=Constant(value=bias_value), + attr=ParamAttr(learning_rate=lr_mult * lab_lr), + ) + self.add_parameter("bias", self.bias) + + def forward(self, x): + return self.scale * x + self.bias + + +class ConvBNLayer(nn.Layer): + def __init__( + self, in_channels, out_channels, kernel_size, stride, groups=1, lr_mult=1.0 + ): + super().__init__() + self.conv = Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(initializer=KaimingNormal(), 
learning_rate=lr_mult), + bias_attr=False, + ) + + self.bn = BatchNorm2D( + out_channels, + weight_attr=ParamAttr(regularizer=L2Decay(0.0), learning_rate=lr_mult), + bias_attr=ParamAttr(regularizer=L2Decay(0.0), learning_rate=lr_mult), + ) + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + return x + + +class Act(nn.Layer): + def __init__(self, act="hswish", lr_mult=1.0, lab_lr=0.1): + super().__init__() + if act == "hswish": + self.act = Hardswish() + else: + assert act == "relu" + self.act = ReLU() + self.lab = LearnableAffineBlock(lr_mult=lr_mult, lab_lr=lab_lr) + + def forward(self, x): + return self.lab(self.act(x)) + + +class LearnableRepLayer(nn.Layer): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + groups=1, + num_conv_branches=1, + lr_mult=1.0, + lab_lr=0.1, + ): + super().__init__() + self.is_repped = False + self.groups = groups + self.stride = stride + self.kernel_size = kernel_size + self.in_channels = in_channels + self.out_channels = out_channels + self.num_conv_branches = num_conv_branches + self.padding = (kernel_size - 1) // 2 + + self.identity = ( + BatchNorm2D( + num_features=in_channels, + weight_attr=ParamAttr(learning_rate=lr_mult), + bias_attr=ParamAttr(learning_rate=lr_mult), + ) + if out_channels == in_channels and stride == 1 + else None + ) + + self.conv_kxk = nn.LayerList( + [ + ConvBNLayer( + in_channels, + out_channels, + kernel_size, + stride, + groups=groups, + lr_mult=lr_mult, + ) + for _ in range(self.num_conv_branches) + ] + ) + + self.conv_1x1 = ( + ConvBNLayer( + in_channels, out_channels, 1, stride, groups=groups, lr_mult=lr_mult + ) + if kernel_size > 1 + else None + ) + + self.lab = LearnableAffineBlock(lr_mult=lr_mult, lab_lr=lab_lr) + self.act = Act(lr_mult=lr_mult, lab_lr=lab_lr) + + def forward(self, x): + # for export + if self.is_repped: + out = self.lab(self.reparam_conv(x)) + if self.stride != 2: + out = self.act(out) + return out + + out = 0 + if self.identity is not 
None: + out += self.identity(x) + + if self.conv_1x1 is not None: + out += self.conv_1x1(x) + + for conv in self.conv_kxk: + out += conv(x) + + out = self.lab(out) + if self.stride != 2: + out = self.act(out) + return out + + def rep(self): + if self.is_repped: + return + kernel, bias = self._get_kernel_bias() + self.reparam_conv = Conv2D( + in_channels=self.in_channels, + out_channels=self.out_channels, + kernel_size=self.kernel_size, + stride=self.stride, + padding=self.padding, + groups=self.groups, + ) + self.reparam_conv.weight.set_value(kernel) + self.reparam_conv.bias.set_value(bias) + self.is_repped = True + + def _pad_kernel_1x1_to_kxk(self, kernel1x1, pad): + if not isinstance(kernel1x1, paddle.Tensor): + return 0 + else: + return nn.functional.pad(kernel1x1, [pad, pad, pad, pad]) + + def _get_kernel_bias(self): + kernel_conv_1x1, bias_conv_1x1 = self._fuse_bn_tensor(self.conv_1x1) + kernel_conv_1x1 = self._pad_kernel_1x1_to_kxk( + kernel_conv_1x1, self.kernel_size // 2 + ) + + kernel_identity, bias_identity = self._fuse_bn_tensor(self.identity) + + kernel_conv_kxk = 0 + bias_conv_kxk = 0 + for conv in self.conv_kxk: + kernel, bias = self._fuse_bn_tensor(conv) + kernel_conv_kxk += kernel + bias_conv_kxk += bias + + kernel_reparam = kernel_conv_kxk + kernel_conv_1x1 + kernel_identity + bias_reparam = bias_conv_kxk + bias_conv_1x1 + bias_identity + return kernel_reparam, bias_reparam + + def _fuse_bn_tensor(self, branch): + if not branch: + return 0, 0 + elif isinstance(branch, ConvBNLayer): + kernel = branch.conv.weight + running_mean = branch.bn._mean + running_var = branch.bn._variance + gamma = branch.bn.weight + beta = branch.bn.bias + eps = branch.bn._epsilon + else: + assert isinstance(branch, BatchNorm2D) + if not hasattr(self, "id_tensor"): + input_dim = self.in_channels // self.groups + kernel_value = paddle.zeros( + (self.in_channels, input_dim, self.kernel_size, self.kernel_size), + dtype=branch.weight.dtype, + ) + for i in 
range(self.in_channels): + kernel_value[ + i, i % input_dim, self.kernel_size // 2, self.kernel_size // 2 + ] = 1 + self.id_tensor = kernel_value + kernel = self.id_tensor + running_mean = branch._mean + running_var = branch._variance + gamma = branch.weight + beta = branch.bias + eps = branch._epsilon + std = (running_var + eps).sqrt() + t = (gamma / std).reshape((-1, 1, 1, 1)) + return kernel * t, beta - running_mean * gamma / std + + +class SELayer(nn.Layer): + def __init__(self, channel, reduction=4, lr_mult=1.0): + super().__init__() + if "npu" in paddle.device.get_device(): + self.avg_pool = MeanPool2D(1, 1) + else: + self.avg_pool = AdaptiveAvgPool2D(1) + self.conv1 = Conv2D( + in_channels=channel, + out_channels=channel // reduction, + kernel_size=1, + stride=1, + padding=0, + weight_attr=ParamAttr(learning_rate=lr_mult), + bias_attr=ParamAttr(learning_rate=lr_mult), + ) + self.relu = ReLU() + self.conv2 = Conv2D( + in_channels=channel // reduction, + out_channels=channel, + kernel_size=1, + stride=1, + padding=0, + weight_attr=ParamAttr(learning_rate=lr_mult), + bias_attr=ParamAttr(learning_rate=lr_mult), + ) + self.hardsigmoid = Hardsigmoid() + + def forward(self, x): + identity = x + x = self.avg_pool(x) + x = self.conv1(x) + x = self.relu(x) + x = self.conv2(x) + x = self.hardsigmoid(x) + x = paddle.multiply(x=identity, y=x) + return x + + +class LCNetV3Block(nn.Layer): + def __init__( + self, + in_channels, + out_channels, + stride, + dw_size, + use_se=False, + conv_kxk_num=4, + lr_mult=1.0, + lab_lr=0.1, + ): + super().__init__() + self.use_se = use_se + self.dw_conv = LearnableRepLayer( + in_channels=in_channels, + out_channels=in_channels, + kernel_size=dw_size, + stride=stride, + groups=in_channels, + num_conv_branches=conv_kxk_num, + lr_mult=lr_mult, + lab_lr=lab_lr, + ) + if use_se: + self.se = SELayer(in_channels, lr_mult=lr_mult) + self.pw_conv = LearnableRepLayer( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + 
stride=1, + num_conv_branches=conv_kxk_num, + lr_mult=lr_mult, + lab_lr=lab_lr, + ) + + def forward(self, x): + x = self.dw_conv(x) + if self.use_se: + x = self.se(x) + x = self.pw_conv(x) + return x + + +class PPLCNetV3(nn.Layer): + def __init__( + self, + net_config, + scale=1.0, + conv_kxk_num=4, + lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0], + lab_lr=0.1, + **kwargs, + ): + super().__init__() + self.scale = scale + self.lr_mult_list = lr_mult_list + self.net_config = net_config + + assert isinstance( + self.lr_mult_list, (list, tuple) + ), "lr_mult_list should be in (list, tuple) but got {}".format( + type(self.lr_mult_list) + ) + assert ( + len(self.lr_mult_list) == 6 + ), "lr_mult_list length should be 6 but got {}".format(len(self.lr_mult_list)) + + self.conv1 = ConvBNLayer( + in_channels=3, + out_channels=make_divisible(16 * scale), + kernel_size=3, + stride=2, + lr_mult=self.lr_mult_list[0], + ) + + self.blocks2 = nn.Sequential( + *[ + LCNetV3Block( + in_channels=make_divisible(in_c * scale), + out_channels=make_divisible(out_c * scale), + dw_size=k, + stride=s, + use_se=se, + conv_kxk_num=conv_kxk_num, + lr_mult=self.lr_mult_list[1], + lab_lr=lab_lr, + ) + for i, (k, in_c, out_c, s, se) in enumerate(self.net_config["blocks2"]) + ] + ) + + self.blocks3 = nn.Sequential( + *[ + LCNetV3Block( + in_channels=make_divisible(in_c * scale), + out_channels=make_divisible(out_c * scale), + dw_size=k, + stride=s, + use_se=se, + conv_kxk_num=conv_kxk_num, + lr_mult=self.lr_mult_list[2], + lab_lr=lab_lr, + ) + for i, (k, in_c, out_c, s, se) in enumerate(self.net_config["blocks3"]) + ] + ) + + self.blocks4 = nn.Sequential( + *[ + LCNetV3Block( + in_channels=make_divisible(in_c * scale), + out_channels=make_divisible(out_c * scale), + dw_size=k, + stride=s, + use_se=se, + conv_kxk_num=conv_kxk_num, + lr_mult=self.lr_mult_list[3], + lab_lr=lab_lr, + ) + for i, (k, in_c, out_c, s, se) in enumerate(self.net_config["blocks4"]) + ] + ) + + self.blocks5 = nn.Sequential( + *[ + 
LCNetV3Block( + in_channels=make_divisible(in_c * scale), + out_channels=make_divisible(out_c * scale), + dw_size=k, + stride=s, + use_se=se, + conv_kxk_num=conv_kxk_num, + lr_mult=self.lr_mult_list[4], + lab_lr=lab_lr, + ) + for i, (k, in_c, out_c, s, se) in enumerate(self.net_config["blocks5"]) + ] + ) + + self.blocks6 = nn.Sequential( + *[ + LCNetV3Block( + in_channels=make_divisible(in_c * scale), + out_channels=make_divisible(out_c * scale), + dw_size=k, + stride=s, + use_se=se, + conv_kxk_num=conv_kxk_num, + lr_mult=self.lr_mult_list[5], + lab_lr=lab_lr, + ) + for i, (k, in_c, out_c, s, se) in enumerate(self.net_config["blocks6"]) + ] + ) + self.out_channels = make_divisible(512 * scale) + + def forward(self, x): + out_list = [] + x = self.conv1(x) + + x = self.blocks2(x) + x = self.blocks3(x) + out_list.append(x) + x = self.blocks4(x) + out_list.append(x) + x = self.blocks5(x) + out_list.append(x) + x = self.blocks6(x) + out_list.append(x) + + if self.training: + x = F.adaptive_avg_pool2d(x, [1, 40]) + else: + x = F.avg_pool2d(x, [3, 2]) + return x diff --git a/paddlex/inference/models/text_recognition/modeling/pp_ocrv5_rec_modules/rec_multi_head.py b/paddlex/inference/models/text_recognition/modeling/pp_ocrv5_rec_modules/rec_multi_head.py new file mode 100644 index 0000000000..4cf2a22414 --- /dev/null +++ b/paddlex/inference/models/text_recognition/modeling/pp_ocrv5_rec_modules/rec_multi_head.py @@ -0,0 +1,135 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import, division, print_function + +import paddle.nn as nn +from .rec_ctc_head import CTCHead + +from .rec_nrtr_head import Transformer +from .rnn import ( + Im2Seq, + SequenceEncoder, + trunc_normal_, + zeros_, +) + + +class FCTranspose(nn.Layer): + def __init__(self, in_channels, out_channels, only_transpose=False): + super().__init__() + self.only_transpose = only_transpose + if not self.only_transpose: + self.fc = nn.Linear(in_channels, out_channels, bias_attr=False) + + def forward(self, x): + if self.only_transpose: + return x.transpose([0, 2, 1]) + else: + return self.fc(x.transpose([0, 2, 1])) + + +class AddPos(nn.Layer): + def __init__(self, dim, w): + super().__init__() + self.dec_pos_embed = self.create_parameter( + shape=[1, w, dim], default_initializer=zeros_ + ) + self.add_parameter("dec_pos_embed", self.dec_pos_embed) + trunc_normal_(self.dec_pos_embed) + + def forward(self, x): + x = x + self.dec_pos_embed[:, : x.shape[1], :] + return x + + +class MultiHead(nn.Layer): + def __init__(self, in_channels, out_channels_list, **kwargs): + super().__init__() + self.head_list = kwargs.pop("head_list") + self.use_pool = kwargs.get("use_pool", False) + self.use_pos = kwargs.get("use_pos", False) + self.in_channels = in_channels + if self.use_pool: + self.pool = nn.AvgPool2D(kernel_size=[3, 2], stride=[3, 2], padding=0) + self.gtc_head = "sar" + assert len(self.head_list) >= 2 + for idx, head_name in enumerate(self.head_list): + name = list(head_name)[0] + if name == "NRTRHead": + gtc_args = self.head_list[idx][name] + max_text_length = gtc_args.get("max_text_length", 25) + nrtr_dim = gtc_args.get("nrtr_dim", 256) + num_decoder_layers = gtc_args.get("num_decoder_layers", 4) + if self.use_pos: + self.before_gtc = nn.Sequential( + nn.Flatten(2), + FCTranspose(in_channels, nrtr_dim), + AddPos(nrtr_dim, 80), + ) + else: + 
self.before_gtc = nn.Sequential( + nn.Flatten(2), FCTranspose(in_channels, nrtr_dim) + ) + + self.gtc_head = Transformer( + d_model=nrtr_dim, + nhead=nrtr_dim // 32, + num_encoder_layers=-1, + beam_size=-1, + num_decoder_layers=num_decoder_layers, + max_len=max_text_length, + dim_feedforward=nrtr_dim * 4, + out_channels=out_channels_list["NRTRLabelDecode"], + ) + elif name == "CTCHead": + # ctc neck + self.encoder_reshape = Im2Seq(in_channels) + neck_args = self.head_list[idx][name]["Neck"] + encoder_type = neck_args.pop("name") + self.ctc_encoder = SequenceEncoder( + in_channels=in_channels, encoder_type=encoder_type, **neck_args + ) + # ctc head + head_args = self.head_list[idx][name]["Head"] + self.ctc_head = eval(name)( + in_channels=self.ctc_encoder.out_channels, + out_channels=out_channels_list["CTCLabelDecode"], + **head_args, + ) + else: + raise NotImplementedError( + "{} is not supported in MultiHead yet".format(name) + ) + + def forward(self, x, targets=None): + if self.use_pool: + x = self.pool( + x.reshape([0, 3, -1, self.in_channels]).transpose([0, 3, 1, 2]) + ) + ctc_encoder = self.ctc_encoder(x) + ctc_out = self.ctc_head(ctc_encoder, targets) + head_out = dict() + head_out["ctc"] = ctc_out + head_out["ctc_neck"] = ctc_encoder + # eval mode + if not self.training: + return ctc_out + if self.gtc_head == "sar": + sar_out = self.sar_head(x, targets[1:]) + head_out["sar"] = sar_out + else: + gtc_out = self.gtc_head(self.before_gtc(x), targets[1:]) + head_out["gtc"] = gtc_out + return head_out diff --git a/paddlex/inference/models/text_recognition/modeling/pp_ocrv5_rec_modules/rec_nrtr_head.py b/paddlex/inference/models/text_recognition/modeling/pp_ocrv5_rec_modules/rec_nrtr_head.py new file mode 100644 index 0000000000..dc4670beb2 --- /dev/null +++ b/paddlex/inference/models/text_recognition/modeling/pp_ocrv5_rec_modules/rec_nrtr_head.py @@ -0,0 +1,735 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math + +import numpy as np +import paddle +import paddle.nn.functional as F +from paddle import nn +from paddle.nn import Dropout, LayerNorm +from paddle.nn.initializer import Constant +from paddle.nn.initializer import XavierNormal as xavier_normal_ + +zeros_ = Constant(value=0.0) + + +class Mlp(nn.Layer): + def __init__( + self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.0, + ): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class Transformer(nn.Layer): + """A transformer model. User is able to modify the attributes as needed. The architecture + is based on the paper "Attention Is All You Need". Ashish Vaswani, Noam Shazeer, + Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, Lukasz Kaiser, and + Illia Polosukhin. 2017. Attention is all you need. In Advances in Neural Information + Processing Systems, pages 6000-6010. + + Args: + d_model: the number of expected features in the encoder/decoder inputs (default=512). + nhead: the number of heads in the multiheadattention models (default=8). 
+ num_encoder_layers: the number of sub-encoder-layers in the encoder (default=6). + num_decoder_layers: the number of sub-decoder-layers in the decoder (default=6). + dim_feedforward: the dimension of the feedforward network model (default=2048). + dropout: the dropout value (default=0.1). + custom_encoder: custom encoder (default=None). + custom_decoder: custom decoder (default=None). + """ + + def __init__( + self, + d_model=512, + nhead=8, + num_encoder_layers=6, + beam_size=0, + num_decoder_layers=6, + max_len=25, + dim_feedforward=1024, + attention_dropout_rate=0.0, + residual_dropout_rate=0.1, + in_channels=0, + out_channels=0, + scale_embedding=True, + ): + super(Transformer, self).__init__() + self.out_channels = out_channels + 1 + self.max_len = max_len + self.embedding = Embeddings( + d_model=d_model, + vocab=self.out_channels, + padding_idx=0, + scale_embedding=scale_embedding, + ) + self.positional_encoding = PositionalEncoding( + dropout=residual_dropout_rate, dim=d_model + ) + + if num_encoder_layers > 0: + self.encoder = nn.LayerList( + [ + TransformerBlock( + d_model, + nhead, + dim_feedforward, + attention_dropout_rate, + residual_dropout_rate, + with_self_attn=True, + with_cross_attn=False, + ) + for i in range(num_encoder_layers) + ] + ) + else: + self.encoder = None + + self.decoder = nn.LayerList( + [ + TransformerBlock( + d_model, + nhead, + dim_feedforward, + attention_dropout_rate, + residual_dropout_rate, + with_self_attn=True, + with_cross_attn=True, + ) + for i in range(num_decoder_layers) + ] + ) + + self.beam_size = beam_size + self.d_model = d_model + self.nhead = nhead + self.tgt_word_prj = nn.Linear(d_model, self.out_channels, bias_attr=False) + w0 = np.random.normal( + 0.0, d_model**-0.5, (d_model, self.out_channels) + ).astype(np.float32) + self.tgt_word_prj.weight.set_value(w0) + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + xavier_normal_(m.weight) + if m.bias is not None: + 
zeros_(m.bias) + + def forward_train(self, src, tgt): + tgt = tgt[:, :-1] + + tgt = self.embedding(tgt) + tgt = self.positional_encoding(tgt) + tgt_mask = self.generate_square_subsequent_mask(tgt.shape[1]) + + if self.encoder is not None: + src = self.positional_encoding(src) + for encoder_layer in self.encoder: + src = encoder_layer(src) + memory = src # B N C + else: + memory = src # B N C + for decoder_layer in self.decoder: + tgt = decoder_layer(tgt, memory, self_mask=tgt_mask) + output = tgt + logit = self.tgt_word_prj(output) + return logit + + def forward(self, src, targets=None): + """Take in and process masked source/target sequences. + Args: + src: the sequence to the encoder (required). + tgt: the sequence to the decoder (required). + Shape: + - src: :math:`(B, sN, C)`. + - tgt: :math:`(B, tN, C)`. + Examples: + >>> output = transformer_model(src, tgt) + """ + + if self.training: + max_len = targets[1].max() + tgt = targets[0][:, : 2 + max_len] + return self.forward_train(src, tgt) + else: + if self.beam_size > 0: + return self.forward_beam(src) + else: + return self.forward_test(src) + + def forward_test(self, src): + bs = src.shape[0] + if self.encoder is not None: + src = self.positional_encoding(src) + for encoder_layer in self.encoder: + src = encoder_layer(src) + memory = src # B N C + else: + memory = src + dec_seq = paddle.full((bs, 1), 2, dtype=paddle.int64) + dec_prob = paddle.full((bs, 1), 1.0, dtype=paddle.float32) + for len_dec_seq in range(1, paddle.to_tensor(self.max_len)): + dec_seq_embed = self.embedding(dec_seq) + dec_seq_embed = self.positional_encoding(dec_seq_embed) + tgt_mask = self.generate_square_subsequent_mask(dec_seq_embed.shape[1]) + tgt = dec_seq_embed + for decoder_layer in self.decoder: + tgt = decoder_layer(tgt, memory, self_mask=tgt_mask) + dec_output = tgt + dec_output = dec_output[:, -1, :] + word_prob = F.softmax(self.tgt_word_prj(dec_output), axis=-1) + preds_idx = paddle.argmax(word_prob, axis=-1) + if 
paddle.equal_all( + preds_idx, paddle.full(preds_idx.shape, 3, dtype="int64") + ): + break + preds_prob = paddle.max(word_prob, axis=-1) + dec_seq = paddle.concat( + [dec_seq, paddle.reshape(preds_idx, [-1, 1])], axis=1 + ) + dec_prob = paddle.concat( + [dec_prob, paddle.reshape(preds_prob, [-1, 1])], axis=1 + ) + return [dec_seq, dec_prob] + + def forward_beam(self, images): + """Translation work in one batch""" + + def get_inst_idx_to_tensor_position_map(inst_idx_list): + """Indicate the position of an instance in a tensor.""" + return { + inst_idx: tensor_position + for tensor_position, inst_idx in enumerate(inst_idx_list) + } + + def collect_active_part( + beamed_tensor, curr_active_inst_idx, n_prev_active_inst, n_bm + ): + """Collect tensor parts associated to active instances.""" + + beamed_tensor_shape = beamed_tensor.shape + n_curr_active_inst = len(curr_active_inst_idx) + new_shape = ( + n_curr_active_inst * n_bm, + beamed_tensor_shape[1], + beamed_tensor_shape[2], + ) + + beamed_tensor = beamed_tensor.reshape([n_prev_active_inst, -1]) + beamed_tensor = beamed_tensor.index_select(curr_active_inst_idx, axis=0) + beamed_tensor = beamed_tensor.reshape(new_shape) + + return beamed_tensor + + def collate_active_info( + src_enc, inst_idx_to_position_map, active_inst_idx_list + ): + # Sentences which are still active are collected, + # so the decoder will not run on completed sentences. 
+ + n_prev_active_inst = len(inst_idx_to_position_map) + active_inst_idx = [ + inst_idx_to_position_map[k] for k in active_inst_idx_list + ] + active_inst_idx = paddle.to_tensor(active_inst_idx, dtype="int64") + active_src_enc = collect_active_part( + src_enc.transpose([1, 0, 2]), active_inst_idx, n_prev_active_inst, n_bm + ).transpose([1, 0, 2]) + active_inst_idx_to_position_map = get_inst_idx_to_tensor_position_map( + active_inst_idx_list + ) + return active_src_enc, active_inst_idx_to_position_map + + def beam_decode_step( + inst_dec_beams, len_dec_seq, enc_output, inst_idx_to_position_map, n_bm + ): + """Decode and update beam status, and then return active beam idx""" + + def prepare_beam_dec_seq(inst_dec_beams, len_dec_seq): + dec_partial_seq = [ + b.get_current_state() for b in inst_dec_beams if not b.done + ] + dec_partial_seq = paddle.stack(dec_partial_seq) + dec_partial_seq = dec_partial_seq.reshape([-1, len_dec_seq]) + return dec_partial_seq + + def predict_word(dec_seq, enc_output, n_active_inst, n_bm): + dec_seq = self.embedding(dec_seq) + dec_seq = self.positional_encoding(dec_seq) + tgt_mask = self.generate_square_subsequent_mask(dec_seq.shape[1]) + tgt = dec_seq + for decoder_layer in self.decoder: + tgt = decoder_layer(tgt, enc_output, self_mask=tgt_mask) + dec_output = tgt + dec_output = dec_output[:, -1, :] # Pick the last step: (bh * bm) * d_h + word_prob = F.softmax(self.tgt_word_prj(dec_output), axis=1) + word_prob = paddle.reshape(word_prob, [n_active_inst, n_bm, -1]) + return word_prob + + def collect_active_inst_idx_list( + inst_beams, word_prob, inst_idx_to_position_map + ): + active_inst_idx_list = [] + for inst_idx, inst_position in inst_idx_to_position_map.items(): + is_inst_complete = inst_beams[inst_idx].advance( + word_prob[inst_position] + ) + if not is_inst_complete: + active_inst_idx_list += [inst_idx] + + return active_inst_idx_list + + n_active_inst = len(inst_idx_to_position_map) + dec_seq = prepare_beam_dec_seq(inst_dec_beams, 
len_dec_seq) + word_prob = predict_word(dec_seq, enc_output, n_active_inst, n_bm) + # Update the beam with predicted word prob information and collect incomplete instances + active_inst_idx_list = collect_active_inst_idx_list( + inst_dec_beams, word_prob, inst_idx_to_position_map + ) + return active_inst_idx_list + + def collect_hypothesis_and_scores(inst_dec_beams, n_best): + all_hyp, all_scores = [], [] + for inst_idx in range(len(inst_dec_beams)): + scores, tail_idxs = inst_dec_beams[inst_idx].sort_scores() + all_scores += [scores[:n_best]] + hyps = [ + inst_dec_beams[inst_idx].get_hypothesis(i) + for i in tail_idxs[:n_best] + ] + all_hyp += [hyps] + return all_hyp, all_scores + + with paddle.no_grad(): + # -- Encode + if self.encoder is not None: + src = self.positional_encoding(images) + src_enc = self.encoder(src) + else: + src_enc = images + + n_bm = self.beam_size + src_shape = src_enc.shape + inst_dec_beams = [Beam(n_bm) for _ in range(1)] + active_inst_idx_list = list(range(1)) + # Repeat data for beam search + src_enc = paddle.tile(src_enc, [1, n_bm, 1]) + inst_idx_to_position_map = get_inst_idx_to_tensor_position_map( + active_inst_idx_list + ) + # Decode + for len_dec_seq in range(1, paddle.to_tensor(self.max_len)): + src_enc_copy = src_enc.clone() + active_inst_idx_list = beam_decode_step( + inst_dec_beams, + len_dec_seq, + src_enc_copy, + inst_idx_to_position_map, + n_bm, + ) + if not active_inst_idx_list: + break # all instances have finished their path to + src_enc, inst_idx_to_position_map = collate_active_info( + src_enc_copy, inst_idx_to_position_map, active_inst_idx_list + ) + batch_hyp, batch_scores = collect_hypothesis_and_scores(inst_dec_beams, 1) + result_hyp = [] + hyp_scores = [] + for bs_hyp, score in zip(batch_hyp, batch_scores): + l = len(bs_hyp[0]) + bs_hyp_pad = bs_hyp[0] + [3] * (25 - l) + result_hyp.append(bs_hyp_pad) + score = float(score) / l + hyp_score = [score for _ in range(25)] + hyp_scores.append(hyp_score) + return [ + 
paddle.to_tensor(np.array(result_hyp), dtype=paddle.int64), + paddle.to_tensor(hyp_scores), + ] + + def generate_square_subsequent_mask(self, sz): + """Generate a square mask for the sequence. The masked positions are filled with float('-inf'). + Unmasked positions are filled with float(0.0). + """ + mask = paddle.zeros([sz, sz], dtype="float32") + mask_inf = paddle.triu( + paddle.full(shape=[sz, sz], dtype="float32", fill_value=float("-inf")), + diagonal=1, + ) + mask = mask + mask_inf + return mask.unsqueeze([0, 1]) + + +class MultiheadAttention(nn.Layer): + """Allows the model to jointly attend to information + from different representation subspaces. + See reference: Attention Is All You Need + + .. math:: + \text{MultiHead}(Q, K, V) = \text{Concat}(head_1,\dots,head_h)W^O + \text{where} head_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V) + + Args: + embed_dim: total dimension of the model + num_heads: parallel attention layers, or heads + + """ + + def __init__(self, embed_dim, num_heads, dropout=0.0, self_attn=False): + super(MultiheadAttention, self).__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + # self.dropout = dropout + self.head_dim = embed_dim // num_heads + assert ( + self.head_dim * num_heads == self.embed_dim + ), "embed_dim must be divisible by num_heads" + self.scale = self.head_dim**-0.5 + self.self_attn = self_attn + if self_attn: + self.qkv = nn.Linear(embed_dim, embed_dim * 3) + else: + self.q = nn.Linear(embed_dim, embed_dim) + self.kv = nn.Linear(embed_dim, embed_dim * 2) + self.attn_drop = nn.Dropout(dropout) + self.out_proj = nn.Linear(embed_dim, embed_dim) + + def forward(self, query, key=None, attn_mask=None): + qN = query.shape[1] + + if self.self_attn: + qkv = ( + self.qkv(query) + .reshape((0, qN, 3, self.num_heads, self.head_dim)) + .transpose((2, 0, 3, 1, 4)) + ) + q, k, v = qkv[0], qkv[1], qkv[2] + else: + kN = key.shape[1] + q = ( + self.q(query) + .reshape([0, qN, self.num_heads, self.head_dim]) + 
.transpose([0, 2, 1, 3]) + ) + kv = ( + self.kv(key) + .reshape((0, kN, 2, self.num_heads, self.head_dim)) + .transpose((2, 0, 3, 1, 4)) + ) + k, v = kv[0], kv[1] + + attn = (q.matmul(k.transpose((0, 1, 3, 2)))) * self.scale + + if attn_mask is not None: + attn += attn_mask + + attn = F.softmax(attn, axis=-1) + attn = self.attn_drop(attn) + + x = (attn.matmul(v)).transpose((0, 2, 1, 3)).reshape((0, qN, self.embed_dim)) + x = self.out_proj(x) + + return x + + +class TransformerBlock(nn.Layer): + def __init__( + self, + d_model, + nhead, + dim_feedforward=2048, + attention_dropout_rate=0.0, + residual_dropout_rate=0.1, + with_self_attn=True, + with_cross_attn=False, + epsilon=1e-5, + ): + super(TransformerBlock, self).__init__() + self.with_self_attn = with_self_attn + if with_self_attn: + self.self_attn = MultiheadAttention( + d_model, nhead, dropout=attention_dropout_rate, self_attn=with_self_attn + ) + self.norm1 = LayerNorm(d_model, epsilon=epsilon) + self.dropout1 = Dropout(residual_dropout_rate) + self.with_cross_attn = with_cross_attn + if with_cross_attn: + self.cross_attn = ( + MultiheadAttention( # for self_attn of encoder or cross_attn of decoder + d_model, nhead, dropout=attention_dropout_rate + ) + ) + self.norm2 = LayerNorm(d_model, epsilon=epsilon) + self.dropout2 = Dropout(residual_dropout_rate) + + self.mlp = Mlp( + in_features=d_model, + hidden_features=dim_feedforward, + act_layer=nn.ReLU, + drop=residual_dropout_rate, + ) + + self.norm3 = LayerNorm(d_model, epsilon=epsilon) + + self.dropout3 = Dropout(residual_dropout_rate) + + def forward(self, tgt, memory=None, self_mask=None, cross_mask=None): + if self.with_self_attn: + tgt1 = self.self_attn(tgt, attn_mask=self_mask) + tgt = self.norm1(tgt + self.dropout1(tgt1)) + + if self.with_cross_attn: + tgt2 = self.cross_attn(tgt, key=memory, attn_mask=cross_mask) + tgt = self.norm2(tgt + self.dropout2(tgt2)) + tgt = self.norm3(tgt + self.dropout3(self.mlp(tgt))) + return tgt + + +class 
PositionalEncoding(nn.Layer): + """Inject some information about the relative or absolute position of the tokens + in the sequence. The positional encodings have the same dimension as + the embeddings, so that the two can be summed. Here, we use sine and cosine + functions of different frequencies. + .. math:: + \text{PosEncoder}(pos, 2i) = sin(pos/10000^(2i/d_model)) + \text{PosEncoder}(pos, 2i+1) = cos(pos/10000^(2i/d_model)) + \text{where pos is the word position and i is the embed idx) + Args: + d_model: the embed dim (required). + dropout: the dropout value (default=0.1). + max_len: the max. length of the incoming sequence (default=5000). + Examples: + >>> pos_encoder = PositionalEncoding(d_model) + """ + + def __init__(self, dropout, dim, max_len=5000): + super(PositionalEncoding, self).__init__() + self.dropout = nn.Dropout(p=dropout) + + pe = paddle.zeros([max_len, dim]) + position = paddle.arange(0, max_len, dtype=paddle.float32).unsqueeze(1) + div_term = paddle.exp( + paddle.arange(0, dim, 2).astype("float32") * (-math.log(10000.0) / dim) + ) + pe[:, 0::2] = paddle.sin(position * div_term) + pe[:, 1::2] = paddle.cos(position * div_term) + pe = paddle.unsqueeze(pe, 0) + pe = paddle.transpose(pe, [1, 0, 2]) + pe = pe.contiguous() + self.register_buffer("pe", pe) + + def forward(self, x): + """Inputs of forward function + Args: + x: the sequence fed to the positional encoder model (required). + Shape: + x: [sequence length, batch size, embed dim] + output: [sequence length, batch size, embed dim] + Examples: + >>> output = pos_encoder(x) + """ + x = x.transpose([1, 0, 2]) + x = x + self.pe[: x.shape[0], :] + return self.dropout(x).transpose([1, 0, 2]) + + +class PositionalEncoding_2d(nn.Layer): + """Inject some information about the relative or absolute position of the tokens + in the sequence. The positional encodings have the same dimension as + the embeddings, so that the two can be summed. 
Here, we use sine and cosine + functions of different frequencies. + .. math:: + \text{PosEncoder}(pos, 2i) = sin(pos/10000^(2i/d_model)) + \text{PosEncoder}(pos, 2i+1) = cos(pos/10000^(2i/d_model)) + \text{where pos is the word position and i is the embed idx) + Args: + d_model: the embed dim (required). + dropout: the dropout value (default=0.1). + max_len: the max. length of the incoming sequence (default=5000). + Examples: + >>> pos_encoder = PositionalEncoding(d_model) + """ + + def __init__(self, dropout, dim, max_len=5000): + super(PositionalEncoding_2d, self).__init__() + self.dropout = nn.Dropout(p=dropout) + + pe = paddle.zeros([max_len, dim]) + position = paddle.arange(0, max_len, dtype=paddle.float32).unsqueeze(1) + div_term = paddle.exp( + paddle.arange(0, dim, 2).astype("float32") * (-math.log(10000.0) / dim) + ) + pe[:, 0::2] = paddle.sin(position * div_term) + pe[:, 1::2] = paddle.cos(position * div_term) + pe = paddle.transpose(paddle.unsqueeze(pe, 0), [1, 0, 2]) + self.register_buffer("pe", pe) + + self.avg_pool_1 = nn.AdaptiveAvgPool2D((1, 1)) + self.linear1 = nn.Linear(dim, dim) + self.linear1.weight.data.fill_(1.0) + self.avg_pool_2 = nn.AdaptiveAvgPool2D((1, 1)) + self.linear2 = nn.Linear(dim, dim) + self.linear2.weight.data.fill_(1.0) + + def forward(self, x): + """Inputs of forward function + Args: + x: the sequence fed to the positional encoder model (required). 
+        Shape:
+            x: [sequence length, batch size, embed dim]
+            output: [sequence length, batch size, embed dim]
+        Examples:
+            >>> output = pos_encoder(x)
+        """
+        w_pe = self.pe[: x.shape[-1], :]
+        w1 = self.linear1(self.avg_pool_1(x).squeeze()).unsqueeze(0)
+        w_pe = w_pe * w1
+        w_pe = paddle.transpose(w_pe, [1, 2, 0])
+        w_pe = paddle.unsqueeze(w_pe, 2)
+
+        h_pe = self.pe[: x.shape[-2], :]
+        w2 = self.linear2(self.avg_pool_2(x).squeeze()).unsqueeze(0)
+        h_pe = h_pe * w2
+        h_pe = paddle.transpose(h_pe, [1, 2, 0])
+        h_pe = paddle.unsqueeze(h_pe, 3)
+
+        x = x + w_pe + h_pe
+        x = paddle.transpose(
+            paddle.reshape(x, [x.shape[0], x.shape[1], x.shape[2] * x.shape[3]]),
+            [2, 0, 1],
+        )
+
+        return self.dropout(x)
+
+
+class Embeddings(nn.Layer):
+    def __init__(self, d_model, vocab, padding_idx=None, scale_embedding=True):
+        super(Embeddings, self).__init__()
+        self.embedding = nn.Embedding(vocab, d_model, padding_idx=padding_idx)
+        w0 = np.random.normal(0.0, d_model**-0.5, (vocab, d_model)).astype(np.float32)
+        self.embedding.weight.set_value(w0)
+        self.d_model = d_model
+        self.scale_embedding = scale_embedding
+
+    def forward(self, x):
+        if self.scale_embedding:
+            x = self.embedding(x)
+            return x * math.sqrt(self.d_model)
+        return self.embedding(x)
+
+
+class Beam:
+    """Beam search"""
+
+    def __init__(self, size, device=False):
+        self.size = size
+        self._done = False
+        # The score for each translation on the beam.
+        self.scores = paddle.zeros((size,), dtype=paddle.float32)
+        self.all_scores = []
+        # The backpointers at each time-step.
+        self.prev_ks = []
+        # The outputs at each time-step.
+        self.next_ys = [paddle.full((size,), 0, dtype=paddle.int64)]
+        self.next_ys[0][0] = 2
+
+    def get_current_state(self):
+        "Get the outputs for the current timestep."
+        return self.get_tentative_hypothesis()
+
+    def get_current_origin(self):
+        "Get the backpointers for the current timestep."
+ return self.prev_ks[-1] + + @property + def done(self): + return self._done + + def advance(self, word_prob): + "Update beam status and check if finished or not." + num_words = word_prob.shape[1] + + # Sum the previous scores. + if len(self.prev_ks) > 0: + beam_lk = word_prob + self.scores.unsqueeze(1).expand_as(word_prob) + else: + beam_lk = word_prob[0] + + flat_beam_lk = beam_lk.reshape([-1]) + best_scores, best_scores_id = flat_beam_lk.topk( + self.size, 0, True, True + ) # 1st sort + self.all_scores.append(self.scores) + self.scores = best_scores + # bestScoresId is flattened as a (beam x word) array, + # so we need to calculate which word and beam each score came from + prev_k = best_scores_id // num_words + self.prev_ks.append(prev_k) + self.next_ys.append(best_scores_id - prev_k * num_words) + # End condition is when top-of-beam is EOS. + if self.next_ys[-1][0] == 3: + self._done = True + self.all_scores.append(self.scores) + + return self._done + + def sort_scores(self): + "Sort the scores." + return self.scores, paddle.to_tensor( + [i for i in range(int(self.scores.shape[0]))], dtype="int32" + ) + + def get_the_best_score_and_idx(self): + "Get the score of the best in the beam." + scores, ids = self.sort_scores() + return scores[1], ids[1] + + def get_tentative_hypothesis(self): + "Get the decoded sequence for the current timestep." 
+ if len(self.next_ys) == 1: + dec_seq = self.next_ys[0].unsqueeze(1) + else: + _, keys = self.sort_scores() + hyps = [self.get_hypothesis(k) for k in keys] + hyps = [[2] + h for h in hyps] + dec_seq = paddle.to_tensor(hyps, dtype="int64") + return dec_seq + + def get_hypothesis(self, k): + """Walk back to construct the full hypothesis.""" + hyp = [] + for j in range(len(self.prev_ks) - 1, -1, -1): + hyp.append(self.next_ys[j + 1][k]) + k = self.prev_ks[j][k] + return list(map(lambda x: x.item(), hyp[::-1])) diff --git a/paddlex/inference/models/text_recognition/modeling/pp_ocrv5_rec_modules/rec_pphgnetv2.py b/paddlex/inference/models/text_recognition/modeling/pp_ocrv5_rec_modules/rec_pphgnetv2.py new file mode 100644 index 0000000000..e3b615f5b5 --- /dev/null +++ b/paddlex/inference/models/text_recognition/modeling/pp_ocrv5_rec_modules/rec_pphgnetv2.py @@ -0,0 +1,1414 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" +This code is refer from: +https://github.com/PaddlePaddle/PaddleClas/blob/2f36cab604e439b59d1a854df34ece3b10d888e3/ppcls/arch/backbone/legendary_models/pp_hgnet_v2.py +""" + +from __future__ import absolute_import, division, print_function + +from collections import OrderedDict +from typing import Callable, Dict, List, Union + +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import ParamAttr +from paddle.nn import AdaptiveAvgPool2D, BatchNorm2D, Conv2D, ReLU +from paddle.nn.initializer import Constant, KaimingNormal +from paddle.regularizer import L2Decay + +kaiming_normal_ = KaimingNormal() +zeros_ = Constant(value=0.0) +ones_ = Constant(value=1.0) + + +class DonutSwinModelOutput(OrderedDict): + last_hidden_state = None + pooler_output = None + hidden_states = None + attentions = None + reshaped_hidden_states = None + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def __getitem__(self, k): + if isinstance(k, str): + inner_dict = dict(self.items()) + return inner_dict[k] + else: + return self.to_tuple()[k] + + def __setattr__(self, name, value): + if name in self.keys() and value is not None: + super().__setitem__(name, value) + super().__setattr__(name, value) + + def __setitem__(self, key, value): + super().__setitem__(key, value) + super().__setattr__(key, value) + + def to_tuple(self): + """ + Convert self to a tuple containing all the attributes/keys that are not `None`. 
+ """ + return tuple(self[k] for k in self.keys()) + + +class IdentityBasedConv1x1(nn.Conv2D): + def __init__(self, channels, groups=1): + super(IdentityBasedConv1x1, self).__init__( + in_channels=channels, + out_channels=channels, + kernel_size=1, + stride=1, + padding=0, + groups=groups, + bias_attr=False, + ) + + assert channels % groups == 0 + input_dim = channels // groups + id_value = np.zeros((channels, input_dim, 1, 1)) + for i in range(channels): + id_value[i, i % input_dim, 0, 0] = 1 + self.id_tensor = paddle.to_tensor(id_value) + self.weight.set_value(paddle.zeros_like(self.weight)) + + def forward(self, input): + kernel = self.weight + self.id_tensor + result = F.conv2d( + input, + kernel, + None, + stride=1, + padding=0, + dilation=self._dilation, + groups=self._groups, + ) + return result + + def get_actual_kernel(self): + return self.weight + self.id_tensor + + +class BNAndPad(nn.Layer): + def __init__( + self, + pad_pixels, + num_features, + epsilon=1e-5, + momentum=0.1, + last_conv_bias=None, + bn=nn.BatchNorm2D, + ): + super().__init__() + self.bn = bn(num_features, momentum=momentum, epsilon=epsilon) + self.pad_pixels = pad_pixels + self.last_conv_bias = last_conv_bias + + def forward(self, input): + output = self.bn(input) + if self.pad_pixels > 0: + bias = -self.bn._mean + if self.last_conv_bias is not None: + bias += self.last_conv_bias + pad_values = self.bn.bias + self.bn.weight * ( + bias / paddle.sqrt(self.bn._variance + self.bn._epsilon) + ) + """ pad """ + # TODO: n,h,w,c format is not supported yet + n, c, h, w = output.shape + values = pad_values.reshape([1, -1, 1, 1]) + w_values = values.expand([n, -1, self.pad_pixels, w]) + x = paddle.concat([w_values, output, w_values], axis=2) + h = h + self.pad_pixels * 2 + h_values = values.expand([n, -1, h, self.pad_pixels]) + x = paddle.concat([h_values, x, h_values], axis=3) + output = x + return output + + @property + def weight(self): + return self.bn.weight + + @property + def bias(self): + 
return self.bn.bias
+
+    @property
+    def _mean(self):
+        return self.bn._mean
+
+    @property
+    def _variance(self):
+        return self.bn._variance
+
+    @property
+    def _epsilon(self):
+        return self.bn._epsilon
+
+
+def conv_bn(
+    in_channels,
+    out_channels,
+    kernel_size,
+    stride=1,
+    padding=0,
+    dilation=1,
+    groups=1,
+    padding_mode="zeros",
+):
+    conv_layer = nn.Conv2D(
+        in_channels=in_channels,
+        out_channels=out_channels,
+        kernel_size=kernel_size,
+        stride=stride,
+        padding=padding,
+        dilation=dilation,
+        groups=groups,
+        bias_attr=False,
+        padding_mode=padding_mode,
+    )
+    bn_layer = nn.BatchNorm2D(num_features=out_channels)
+    se = nn.Sequential()
+    se.add_sublayer("conv", conv_layer)
+    se.add_sublayer("bn", bn_layer)
+    return se
+
+
+def transI_fusebn(kernel, bn):
+    gamma = bn.weight
+    std = (bn._variance + bn._epsilon).sqrt()
+    return (
+        kernel * ((gamma / std).reshape([-1, 1, 1, 1])),
+        bn.bias - bn._mean * gamma / std,
+    )
+
+
+def transII_addbranch(kernels, biases):
+    return sum(kernels), sum(biases)
+
+
+def transIII_1x1_kxk(k1, b1, k2, b2, groups):
+    if groups == 1:
+        k = F.conv2d(k2, k1.transpose([1, 0, 2, 3]))
+        b_hat = (k2 * b1.reshape([1, -1, 1, 1])).sum((1, 2, 3))
+    else:
+        k_slices = []
+        b_slices = []
+        k1_T = k1.transpose([1, 0, 2, 3])
+        k1_group_width = k1.shape[0] // groups
+        k2_group_width = k2.shape[0] // groups
+        for g in range(groups):
+            k1_T_slice = k1_T[:, g * k1_group_width : (g + 1) * k1_group_width, :, :]
+            k2_slice = k2[g * k2_group_width : (g + 1) * k2_group_width, :, :, :]
+            k_slices.append(F.conv2d(k2_slice, k1_T_slice))
+            b_slices.append(
+                (
+                    k2_slice
+                    * b1[g * k1_group_width : (g + 1) * k1_group_width].reshape(
+                        [1, -1, 1, 1]
+                    )
+                ).sum((1, 2, 3))
+            )
+        k, b_hat = transIV_depthconcat(k_slices, b_slices)
+    return k, b_hat + b2
+
+
+def transIV_depthconcat(kernels, biases):
+    return paddle.concat(kernels, axis=0), paddle.concat(biases)
+
+
+def transV_avg(channels, kernel_size, groups):
+    input_dim = channels // groups
+    k = 
paddle.zeros((channels, input_dim, kernel_size, kernel_size)) + k[np.arange(channels), np.tile(np.arange(input_dim), groups), :, :] = ( + 1.0 / kernel_size**2 + ) + return k + + +def transVI_multiscale(kernel, target_kernel_size): + H_pixels_to_pad = (target_kernel_size - kernel.shape[2]) // 2 + W_pixels_to_pad = (target_kernel_size - kernel.shape[3]) // 2 + return F.pad( + kernel, [H_pixels_to_pad, H_pixels_to_pad, W_pixels_to_pad, W_pixels_to_pad] + ) + + +class DiverseBranchBlock(nn.Layer): + def __init__( + self, + num_channels, + num_filters, + filter_size, + stride=1, + groups=1, + act=None, + is_repped=False, + single_init=False, + **kwargs, + ): + super().__init__() + + padding = (filter_size - 1) // 2 + dilation = 1 + + in_channels = num_channels + out_channels = num_filters + kernel_size = filter_size + internal_channels_1x1_3x3 = None + nonlinear = act + + self.is_repped = is_repped + + if nonlinear is None: + self.nonlinear = nn.Identity() + else: + self.nonlinear = nn.ReLU() + + self.kernel_size = kernel_size + self.out_channels = out_channels + self.groups = groups + assert padding == kernel_size // 2 + + if is_repped: + self.dbb_reparam = nn.Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + bias_attr=True, + ) + else: + self.dbb_origin = conv_bn( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + ) + + self.dbb_avg = nn.Sequential() + if groups < out_channels: + self.dbb_avg.add_sublayer( + "conv", + nn.Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + stride=1, + padding=0, + groups=groups, + bias_attr=False, + ), + ) + self.dbb_avg.add_sublayer( + "bn", BNAndPad(pad_pixels=padding, num_features=out_channels) + ) + self.dbb_avg.add_sublayer( + "avg", + nn.AvgPool2D(kernel_size=kernel_size, 
stride=stride, padding=0), + ) + self.dbb_1x1 = conv_bn( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + stride=stride, + padding=0, + groups=groups, + ) + else: + self.dbb_avg.add_sublayer( + "avg", + nn.AvgPool2D( + kernel_size=kernel_size, stride=stride, padding=padding + ), + ) + + self.dbb_avg.add_sublayer("avgbn", nn.BatchNorm2D(out_channels)) + + if internal_channels_1x1_3x3 is None: + internal_channels_1x1_3x3 = ( + in_channels if groups < out_channels else 2 * in_channels + ) # For mobilenet, it is better to have 2X internal channels + + self.dbb_1x1_kxk = nn.Sequential() + if internal_channels_1x1_3x3 == in_channels: + self.dbb_1x1_kxk.add_sublayer( + "idconv1", IdentityBasedConv1x1(channels=in_channels, groups=groups) + ) + else: + self.dbb_1x1_kxk.add_sublayer( + "conv1", + nn.Conv2D( + in_channels=in_channels, + out_channels=internal_channels_1x1_3x3, + kernel_size=1, + stride=1, + padding=0, + groups=groups, + bias_attr=False, + ), + ) + self.dbb_1x1_kxk.add_sublayer( + "bn1", + BNAndPad(pad_pixels=padding, num_features=internal_channels_1x1_3x3), + ) + self.dbb_1x1_kxk.add_sublayer( + "conv2", + nn.Conv2D( + in_channels=internal_channels_1x1_3x3, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=0, + groups=groups, + bias_attr=False, + ), + ) + self.dbb_1x1_kxk.add_sublayer("bn2", nn.BatchNorm2D(out_channels)) + + # The experiments reported in the paper used the default initialization of bn.weight (all as 1). But changing the initialization may be useful in some cases. + if single_init: + # Initialize the bn.weight of dbb_origin as 1 and others as 0. This is not the default setting. 
+ self.single_init() + + def forward(self, inputs): + if self.is_repped: + return self.nonlinear(self.dbb_reparam(inputs)) + + out = self.dbb_origin(inputs) + if hasattr(self, "dbb_1x1"): + out += self.dbb_1x1(inputs) + out += self.dbb_avg(inputs) + out += self.dbb_1x1_kxk(inputs) + return self.nonlinear(out) + + def init_gamma(self, gamma_value): + if hasattr(self, "dbb_origin"): + paddle.nn.init.constant_(self.dbb_origin.bn.weight, gamma_value) + if hasattr(self, "dbb_1x1"): + paddle.nn.init.constant_(self.dbb_1x1.bn.weight, gamma_value) + if hasattr(self, "dbb_avg"): + paddle.nn.init.constant_(self.dbb_avg.avgbn.weight, gamma_value) + if hasattr(self, "dbb_1x1_kxk"): + paddle.nn.init.constant_(self.dbb_1x1_kxk.bn2.weight, gamma_value) + + def single_init(self): + self.init_gamma(0.0) + if hasattr(self, "dbb_origin"): + paddle.nn.init.constant_(self.dbb_origin.bn.weight, 1.0) + + def get_equivalent_kernel_bias(self): + k_origin, b_origin = transI_fusebn( + self.dbb_origin.conv.weight, self.dbb_origin.bn + ) + + if hasattr(self, "dbb_1x1"): + k_1x1, b_1x1 = transI_fusebn(self.dbb_1x1.conv.weight, self.dbb_1x1.bn) + k_1x1 = transVI_multiscale(k_1x1, self.kernel_size) + else: + k_1x1, b_1x1 = 0, 0 + + if hasattr(self.dbb_1x1_kxk, "idconv1"): + k_1x1_kxk_first = self.dbb_1x1_kxk.idconv1.get_actual_kernel() + else: + k_1x1_kxk_first = self.dbb_1x1_kxk.conv1.weight + k_1x1_kxk_first, b_1x1_kxk_first = transI_fusebn( + k_1x1_kxk_first, self.dbb_1x1_kxk.bn1 + ) + k_1x1_kxk_second, b_1x1_kxk_second = transI_fusebn( + self.dbb_1x1_kxk.conv2.weight, self.dbb_1x1_kxk.bn2 + ) + k_1x1_kxk_merged, b_1x1_kxk_merged = transIII_1x1_kxk( + k_1x1_kxk_first, + b_1x1_kxk_first, + k_1x1_kxk_second, + b_1x1_kxk_second, + groups=self.groups, + ) + + k_avg = transV_avg(self.out_channels, self.kernel_size, self.groups) + k_1x1_avg_second, b_1x1_avg_second = transI_fusebn(k_avg, self.dbb_avg.avgbn) + if hasattr(self.dbb_avg, "conv"): + k_1x1_avg_first, b_1x1_avg_first = transI_fusebn( + 
self.dbb_avg.conv.weight, self.dbb_avg.bn + ) + k_1x1_avg_merged, b_1x1_avg_merged = transIII_1x1_kxk( + k_1x1_avg_first, + b_1x1_avg_first, + k_1x1_avg_second, + b_1x1_avg_second, + groups=self.groups, + ) + else: + k_1x1_avg_merged, b_1x1_avg_merged = k_1x1_avg_second, b_1x1_avg_second + + return transII_addbranch( + (k_origin, k_1x1, k_1x1_kxk_merged, k_1x1_avg_merged), + (b_origin, b_1x1, b_1x1_kxk_merged, b_1x1_avg_merged), + ) + + def re_parameterize(self): + if self.is_repped: + return + + kernel, bias = self.get_equivalent_kernel_bias() + self.dbb_reparam = nn.Conv2D( + in_channels=self.dbb_origin.conv._in_channels, + out_channels=self.dbb_origin.conv._out_channels, + kernel_size=self.dbb_origin.conv._kernel_size, + stride=self.dbb_origin.conv._stride, + padding=self.dbb_origin.conv._padding, + dilation=self.dbb_origin.conv._dilation, + groups=self.dbb_origin.conv._groups, + bias_attr=True, + ) + + self.dbb_reparam.weight.set_value(kernel) + self.dbb_reparam.bias.set_value(bias) + + self.__delattr__("dbb_origin") + self.__delattr__("dbb_avg") + if hasattr(self, "dbb_1x1"): + self.__delattr__("dbb_1x1") + self.__delattr__("dbb_1x1_kxk") + self.is_repped = True + + +class Identity(nn.Layer): + def __init__(self): + super(Identity, self).__init__() + + def forward(self, inputs): + return inputs + + +class TheseusLayer(nn.Layer): + def __init__(self, *args, **kwargs): + super().__init__() + self.res_dict = {} + self.res_name = self.full_name() + self.pruner = None + self.quanter = None + + self.init_net(*args, **kwargs) + + def _return_dict_hook(self, layer, input, output): + res_dict = {"logits": output} + # 'list' is needed to avoid error raised by popping self.res_dict + for res_key in list(self.res_dict): + # clear the res_dict because the forward process may change according to input + res_dict[res_key] = self.res_dict.pop(res_key) + return res_dict + + def init_net( + self, + stages_pattern=None, + return_patterns=None, + return_stages=None, + 
freeze_befor=None, + stop_after=None, + *args, + **kwargs, + ): + # init the output of net + if return_patterns or return_stages: + if return_patterns and return_stages: + msg = f"The 'return_patterns' would be ignored when 'return_stages' is set." + + return_stages = None + + if return_stages is True: + return_patterns = stages_pattern + + # return_stages is int or bool + if type(return_stages) is int: + return_stages = [return_stages] + if isinstance(return_stages, list): + if max(return_stages) > len(stages_pattern) or min(return_stages) < 0: + msg = f"The 'return_stages' set error. Illegal value(s) have been ignored. The stages' pattern list is {stages_pattern}." + + return_stages = [ + val + for val in return_stages + if val >= 0 and val < len(stages_pattern) + ] + return_patterns = [stages_pattern[i] for i in return_stages] + + if return_patterns: + # call update_res function after the __init__ of the object has completed execution, that is, the constructing of layer or model has been completed. 
+ def update_res_hook(layer, input): + self.update_res(return_patterns) + + self.register_forward_pre_hook(update_res_hook) + + # freeze subnet + if freeze_befor is not None: + self.freeze_befor(freeze_befor) + + # set subnet to Identity + if stop_after is not None: + self.stop_after(stop_after) + + def init_res(self, stages_pattern, return_patterns=None, return_stages=None): + + if return_patterns and return_stages: + return_stages = None + + if return_stages is True: + return_patterns = stages_pattern + # return_stages is int or bool + if type(return_stages) is int: + return_stages = [return_stages] + if isinstance(return_stages, list): + if max(return_stages) > len(stages_pattern) or min(return_stages) < 0: + return_stages = [ + val + for val in return_stages + if val >= 0 and val < len(stages_pattern) + ] + return_patterns = [stages_pattern[i] for i in return_stages] + + if return_patterns: + self.update_res(return_patterns) + + def replace_sub(self, *args, **kwargs) -> None: + msg = "The function 'replace_sub()' is deprecated, please use 'upgrade_sublayer()' instead." + raise DeprecationWarning(msg) + + def upgrade_sublayer( + self, + layer_name_pattern: Union[str, List[str]], + handle_func: Callable[[nn.Layer, str], nn.Layer], + ) -> Dict[str, nn.Layer]: + """use 'handle_func' to modify the sub-layer(s) specified by 'layer_name_pattern'. + + Args: + layer_name_pattern (Union[str, List[str]]): The name of layer to be modified by 'handle_func'. + handle_func (Callable[[nn.Layer, str], nn.Layer]): The function to modify target layer specified by 'layer_name_pattern'. The formal params are the layer(nn.Layer) and pattern(str) that is (a member of) layer_name_pattern (when layer_name_pattern is List type). And the return is the layer processed. + + Returns: + Dict[str, nn.Layer]: The key is the pattern and corresponding value is the result returned by 'handle_func()'. 
+ + Examples: + + from paddle import nn + import paddleclas + + def rep_func(layer: nn.Layer, pattern: str): + new_layer = nn.Conv2D( + in_channels=layer._in_channels, + out_channels=layer._out_channels, + kernel_size=5, + padding=2 + ) + return new_layer + + net = paddleclas.MobileNetV1() + res = net.upgrade_sublayer(layer_name_pattern=["blocks[11].depthwise_conv.conv", "blocks[12].depthwise_conv.conv"], handle_func=rep_func) + print(res) + # {'blocks[11].depthwise_conv.conv': the corresponding new_layer, 'blocks[12].depthwise_conv.conv': the corresponding new_layer} + """ + + if not isinstance(layer_name_pattern, list): + layer_name_pattern = [layer_name_pattern] + + hit_layer_pattern_list = [] + for pattern in layer_name_pattern: + # parse pattern to find target layer and its parent + layer_list = parse_pattern_str(pattern=pattern, parent_layer=self) + if not layer_list: + continue + + sub_layer_parent = layer_list[-2]["layer"] if len(layer_list) > 1 else self + sub_layer = layer_list[-1]["layer"] + sub_layer_name = layer_list[-1]["name"] + sub_layer_index_list = layer_list[-1]["index_list"] + + new_sub_layer = handle_func(sub_layer, pattern) + + if sub_layer_index_list: + if len(sub_layer_index_list) > 1: + sub_layer_parent = getattr(sub_layer_parent, sub_layer_name)[ + sub_layer_index_list[0] + ] + for sub_layer_index in sub_layer_index_list[1:-1]: + sub_layer_parent = sub_layer_parent[sub_layer_index] + sub_layer_parent[sub_layer_index_list[-1]] = new_sub_layer + else: + getattr(sub_layer_parent, sub_layer_name)[ + sub_layer_index_list[0] + ] = new_sub_layer + else: + setattr(sub_layer_parent, sub_layer_name, new_sub_layer) + + hit_layer_pattern_list.append(pattern) + return hit_layer_pattern_list + + def stop_after(self, stop_layer_name: str) -> bool: + """stop forward and backward after 'stop_layer_name'. + + Args: + stop_layer_name (str): The name of layer that stop forward and backward after this layer. 
+ + Returns: + bool: 'True' if successful, 'False' otherwise. + """ + + layer_list = parse_pattern_str(stop_layer_name, self) + if not layer_list: + return False + + parent_layer = self + for layer_dict in layer_list: + name, index_list = layer_dict["name"], layer_dict["index_list"] + if not set_identity(parent_layer, name, index_list): + msg = f"Failed to set the layers that after stop_layer_name('{stop_layer_name}') to IdentityLayer. The error layer's name is '{name}'." + return False + parent_layer = layer_dict["layer"] + + return True + + def freeze_befor(self, layer_name: str) -> bool: + """freeze the layer named layer_name and its previous layer. + + Args: + layer_name (str): The name of layer that would be freezed. + + Returns: + bool: 'True' if successful, 'False' otherwise. + """ + + def stop_grad(layer, pattern): + class StopGradLayer(nn.Layer): + def __init__(self): + super().__init__() + self.layer = layer + + def forward(self, x): + x = self.layer(x) + x.stop_gradient = True + return x + + new_layer = StopGradLayer() + return new_layer + + res = self.upgrade_sublayer(layer_name, stop_grad) + if len(res) == 0: + msg = "Failed to stop the gradient before the layer named '{layer_name}'" + return False + return True + + def update_res(self, return_patterns: Union[str, List[str]]) -> Dict[str, nn.Layer]: + """update the result(s) to be returned. + + Args: + return_patterns (Union[str, List[str]]): The name of layer to return output. + + Returns: + Dict[str, nn.Layer]: The pattern(str) and corresponding layer(nn.Layer) that have been set successfully. 
+ """ + + # clear res_dict that could have been set + self.res_dict = {} + + class Handler(object): + def __init__(self, res_dict): + # res_dict is a reference + self.res_dict = res_dict + + def __call__(self, layer, pattern): + layer.res_dict = self.res_dict + layer.res_name = pattern + if hasattr(layer, "hook_remove_helper"): + layer.hook_remove_helper.remove() + layer.hook_remove_helper = layer.register_forward_post_hook( + save_sub_res_hook + ) + return layer + + handle_func = Handler(self.res_dict) + + hit_layer_pattern_list = self.upgrade_sublayer( + return_patterns, handle_func=handle_func + ) + + if hasattr(self, "hook_remove_helper"): + self.hook_remove_helper.remove() + self.hook_remove_helper = self.register_forward_post_hook( + self._return_dict_hook + ) + + return hit_layer_pattern_list + + +def save_sub_res_hook(layer, input, output): + layer.res_dict[layer.res_name] = output + + +def set_identity( + parent_layer: nn.Layer, layer_name: str, layer_index_list: str = None +) -> bool: + """set the layer specified by layer_name and layer_index_list to Identity. + + Args: + parent_layer (nn.Layer): The parent layer of target layer specified by layer_name and layer_index_list. + layer_name (str): The name of target layer to be set to Identity. + layer_index_list (str, optional): The index of target layer to be set to Identity in parent_layer. Defaults to None. + + Returns: + bool: True if successfully, False otherwise. 
+ """ + + stop_after = False + for sub_layer_name in parent_layer._sub_layers: + if stop_after: + parent_layer._sub_layers[sub_layer_name] = Identity() + continue + if sub_layer_name == layer_name: + stop_after = True + + if layer_index_list and stop_after: + layer_container = parent_layer._sub_layers[layer_name] + for num, layer_index in enumerate(layer_index_list): + stop_after = False + for i in range(num): + layer_container = layer_container[layer_index_list[i]] + for sub_layer_index in layer_container._sub_layers: + if stop_after: + parent_layer._sub_layers[layer_name][sub_layer_index] = Identity() + continue + if layer_index == sub_layer_index: + stop_after = True + + return stop_after + + +def parse_pattern_str( + pattern: str, parent_layer: nn.Layer +) -> Union[None, List[Dict[str, Union[nn.Layer, str, None]]]]: + """parse the string type pattern. + + Args: + pattern (str): The pattern to describe layer. + parent_layer (nn.Layer): The root layer relative to the pattern. + + Returns: + Union[None, List[Dict[str, Union[nn.Layer, str, None]]]]: None if failed. If successfully, the members are layers parsed in order: + [ + {"layer": first layer, "name": first layer's name parsed, "index": first layer's index parsed if exist}, + {"layer": second layer, "name": second layer's name parsed, "index": second layer's index parsed if exist}, + ... + ] + """ + + pattern_list = pattern.split(".") + if not pattern_list: + msg = f"The pattern('{pattern}') is illegal. Please check and retry." 
+ return None + + layer_list = [] + while len(pattern_list) > 0: + if "[" in pattern_list[0]: + target_layer_name = pattern_list[0].split("[")[0] + target_layer_index_list = list( + index.split("]")[0] for index in pattern_list[0].split("[")[1:] + ) + else: + target_layer_name = pattern_list[0] + target_layer_index_list = None + + target_layer = getattr(parent_layer, target_layer_name, None) + + if target_layer is None: + msg = f"Not found layer named('{target_layer_name}') specified in pattern('{pattern}')." + return None + + if target_layer_index_list: + for target_layer_index in target_layer_index_list: + if int(target_layer_index) < 0 or int(target_layer_index) >= len( + target_layer + ): + msg = f"Not found layer by index('{target_layer_index}') specified in pattern('{pattern}'). The index should < {len(target_layer)} and > 0." + return None + target_layer = target_layer[target_layer_index] + + layer_list.append( + { + "layer": target_layer, + "name": target_layer_name, + "index_list": target_layer_index_list, + } + ) + + pattern_list = pattern_list[1:] + parent_layer = target_layer + + return layer_list + + +class AdaptiveAvgPool2D(nn.AdaptiveAvgPool2D): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + if paddle.device.get_device().startswith("npu"): + self.device = "npu" + else: + self.device = None + + if isinstance(self._output_size, int) and self._output_size == 1: + self._gap = True + elif ( + isinstance(self._output_size, tuple) + and self._output_size[0] == 1 + and self._output_size[1] == 1 + ): + self._gap = True + else: + self._gap = False + + def forward(self, x): + if self.device == "npu" and self._gap: + # Global Average Pooling + N, C, _, _ = x.shape + x_mean = paddle.mean(x, axis=[2, 3]) + x_mean = paddle.reshape(x_mean, [N, C, 1, 1]) + return x_mean + else: + return F.adaptive_avg_pool2d( + x, + output_size=self._output_size, + data_format=self._data_format, + name=self._name, + ) + + +class 
LearnableAffineBlock(TheseusLayer): + """ + Create a learnable affine block module. This module can significantly improve accuracy on smaller models. + + Args: + scale_value (float): The initial value of the scale parameter, default is 1.0. + bias_value (float): The initial value of the bias parameter, default is 0.0. + lr_mult (float): The learning rate multiplier, default is 1.0. + lab_lr (float): The learning rate, default is 0.01. + """ + + def __init__(self, scale_value=1.0, bias_value=0.0, lr_mult=1.0, lab_lr=0.01): + super().__init__() + self.scale = self.create_parameter( + shape=[ + 1, + ], + default_initializer=Constant(value=scale_value), + attr=ParamAttr(learning_rate=lr_mult * lab_lr), + ) + self.add_parameter("scale", self.scale) + self.bias = self.create_parameter( + shape=[ + 1, + ], + default_initializer=Constant(value=bias_value), + attr=ParamAttr(learning_rate=lr_mult * lab_lr), + ) + self.add_parameter("bias", self.bias) + + def forward(self, x): + return self.scale * x + self.bias + + +class ConvBNAct(TheseusLayer): + """ + ConvBNAct is a combination of convolution and batchnorm layers. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + kernel_size (int): Size of the convolution kernel. Defaults to 3. + stride (int): Stride of the convolution. Defaults to 1. + padding (int/str): Padding or padding type for the convolution. Defaults to 1. + groups (int): Number of groups for the convolution. Defaults to 1. + use_act: (bool): Whether to use activation function. Defaults to True. + use_lab (bool): Whether to use the LAB operation. Defaults to False. + lr_mult (float): Learning rate multiplier for the layer. Defaults to 1.0. 
+ """ + + def __init__( + self, + in_channels, + out_channels, + kernel_size=3, + stride=1, + padding=1, + groups=1, + use_act=True, + use_lab=False, + lr_mult=1.0, + ): + super().__init__() + self.use_act = use_act + self.use_lab = use_lab + self.conv = Conv2D( + in_channels, + out_channels, + kernel_size, + stride, + padding=padding if isinstance(padding, str) else (kernel_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(learning_rate=lr_mult), + bias_attr=False, + ) + self.bn = BatchNorm2D( + out_channels, + weight_attr=ParamAttr(regularizer=L2Decay(0.0), learning_rate=lr_mult), + bias_attr=ParamAttr(regularizer=L2Decay(0.0), learning_rate=lr_mult), + ) + if self.use_act: + self.act = ReLU() + if self.use_lab: + self.lab = LearnableAffineBlock(lr_mult=lr_mult) + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + if self.use_act: + x = self.act(x) + if self.use_lab: + x = self.lab(x) + return x + + +class LightConvBNAct(TheseusLayer): + """ + LightConvBNAct is a combination of pw and dw layers. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + kernel_size (int): Size of the depth-wise convolution kernel. + use_lab (bool): Whether to use the LAB operation. Defaults to False. + lr_mult (float): Learning rate multiplier for the layer. Defaults to 1.0. + """ + + def __init__( + self, + in_channels, + out_channels, + kernel_size, + use_lab=False, + lr_mult=1.0, + **kwargs, + ): + super().__init__() + self.conv1 = ConvBNAct( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + use_act=False, + use_lab=use_lab, + lr_mult=lr_mult, + ) + self.conv2 = ConvBNAct( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=kernel_size, + groups=out_channels, + use_act=True, + use_lab=use_lab, + lr_mult=lr_mult, + ) + + def forward(self, x): + x = self.conv1(x) + x = self.conv2(x) + return x + + +class StemBlock(TheseusLayer): + """ + StemBlock for PP-HGNetV2. 
+ + Args: + in_channels (int): Number of input channels. + mid_channels (int): Number of middle channels. + out_channels (int): Number of output channels. + use_lab (bool): Whether to use the LAB operation. Defaults to False. + lr_mult (float): Learning rate multiplier for the layer. Defaults to 1.0. + """ + + def __init__( + self, + in_channels, + mid_channels, + out_channels, + use_lab=False, + lr_mult=1.0, + text_rec=False, + ): + super().__init__() + self.stem1 = ConvBNAct( + in_channels=in_channels, + out_channels=mid_channels, + kernel_size=3, + stride=2, + use_lab=use_lab, + lr_mult=lr_mult, + ) + self.stem2a = ConvBNAct( + in_channels=mid_channels, + out_channels=mid_channels // 2, + kernel_size=2, + stride=1, + padding="SAME", + use_lab=use_lab, + lr_mult=lr_mult, + ) + self.stem2b = ConvBNAct( + in_channels=mid_channels // 2, + out_channels=mid_channels, + kernel_size=2, + stride=1, + padding="SAME", + use_lab=use_lab, + lr_mult=lr_mult, + ) + self.stem3 = ConvBNAct( + in_channels=mid_channels * 2, + out_channels=mid_channels, + kernel_size=3, + stride=1 if text_rec else 2, + use_lab=use_lab, + lr_mult=lr_mult, + ) + self.stem4 = ConvBNAct( + in_channels=mid_channels, + out_channels=out_channels, + kernel_size=1, + stride=1, + use_lab=use_lab, + lr_mult=lr_mult, + ) + self.pool = nn.MaxPool2D( + kernel_size=2, stride=1, ceil_mode=True, padding="SAME" + ) + + def forward(self, x): + x = self.stem1(x) + x2 = self.stem2a(x) + x2 = self.stem2b(x2) + x1 = self.pool(x) + x = paddle.concat([x1, x2], 1) + x = self.stem3(x) + x = self.stem4(x) + + return x + + +class HGV2_Block(TheseusLayer): + """ + HGV2_Block, the basic unit that constitutes the HGV2_Stage. + + Args: + in_channels (int): Number of input channels. + mid_channels (int): Number of middle channels. + out_channels (int): Number of output channels. + kernel_size (int): Size of the convolution kernel. Defaults to 3. + layer_num (int): Number of layers in the HGV2 block. Defaults to 6. 
+ stride (int): Stride of the convolution. Defaults to 1. + padding (int/str): Padding or padding type for the convolution. Defaults to 1. + groups (int): Number of groups for the convolution. Defaults to 1. + use_act (bool): Whether to use activation function. Defaults to True. + use_lab (bool): Whether to use the LAB operation. Defaults to False. + lr_mult (float): Learning rate multiplier for the layer. Defaults to 1.0. + """ + + def __init__( + self, + in_channels, + mid_channels, + out_channels, + kernel_size=3, + layer_num=6, + identity=False, + light_block=True, + use_lab=False, + lr_mult=1.0, + ): + super().__init__() + self.identity = identity + + self.layers = nn.LayerList() + block_type = "LightConvBNAct" if light_block else "ConvBNAct" + for i in range(layer_num): + self.layers.append( + eval(block_type)( + in_channels=in_channels if i == 0 else mid_channels, + out_channels=mid_channels, + stride=1, + kernel_size=kernel_size, + use_lab=use_lab, + lr_mult=lr_mult, + ) + ) + # feature aggregation + total_channels = in_channels + layer_num * mid_channels + self.aggregation_squeeze_conv = ConvBNAct( + in_channels=total_channels, + out_channels=out_channels // 2, + kernel_size=1, + stride=1, + use_lab=use_lab, + lr_mult=lr_mult, + ) + self.aggregation_excitation_conv = ConvBNAct( + in_channels=out_channels // 2, + out_channels=out_channels, + kernel_size=1, + stride=1, + use_lab=use_lab, + lr_mult=lr_mult, + ) + + def forward(self, x): + identity = x + output = [] + output.append(x) + for layer in self.layers: + x = layer(x) + output.append(x) + x = paddle.concat(output, axis=1) + x = self.aggregation_squeeze_conv(x) + x = self.aggregation_excitation_conv(x) + if self.identity: + x += identity + return x + + +class HGV2_Stage(TheseusLayer): + """ + HGV2_Stage, the basic unit that constitutes the PPHGNetV2. + + Args: + in_channels (int): Number of input channels. + mid_channels (int): Number of middle channels. + out_channels (int): Number of output channels. 
+ block_num (int): Number of blocks in the HGV2 stage. + layer_num (int): Number of layers in the HGV2 block. Defaults to 6. + is_downsample (bool): Whether to use downsampling operation. Defaults to False. + light_block (bool): Whether to use light block. Defaults to True. + kernel_size (int): Size of the convolution kernel. Defaults to 3. + use_lab (bool, optional): Whether to use the LAB operation. Defaults to False. + lr_mult (float, optional): Learning rate multiplier for the layer. Defaults to 1.0. + """ + + def __init__( + self, + in_channels, + mid_channels, + out_channels, + block_num, + layer_num=6, + is_downsample=True, + light_block=True, + kernel_size=3, + use_lab=False, + stride=2, + lr_mult=1.0, + ): + + super().__init__() + self.is_downsample = is_downsample + if self.is_downsample: + self.downsample = ConvBNAct( + in_channels=in_channels, + out_channels=in_channels, + kernel_size=3, + stride=stride, + groups=in_channels, + use_act=False, + use_lab=use_lab, + lr_mult=lr_mult, + ) + + blocks_list = [] + for i in range(block_num): + blocks_list.append( + HGV2_Block( + in_channels=in_channels if i == 0 else out_channels, + mid_channels=mid_channels, + out_channels=out_channels, + kernel_size=kernel_size, + layer_num=layer_num, + identity=False if i == 0 else True, + light_block=light_block, + use_lab=use_lab, + lr_mult=lr_mult, + ) + ) + self.blocks = nn.Sequential(*blocks_list) + + def forward(self, x): + if self.is_downsample: + x = self.downsample(x) + x = self.blocks(x) + return x + + +class PPHGNetV2(TheseusLayer): + """ + PPHGNetV2 + + Args: + stage_config (dict): Config for PPHGNetV2 stages. such as the number of channels, stride, etc. + stem_channels: (list): Number of channels of the stem of the PPHGNetV2. + use_lab (bool): Whether to use the LAB operation. Defaults to False. + use_last_conv (bool): Whether to use the last conv layer as the output channel. Defaults to True. 
+ class_expand (int): Number of channels for the last 1x1 convolutional layer. + drop_prob (float): Dropout probability for the last 1x1 convolutional layer. Defaults to 0.0. + class_num (int): The number of classes for the classification layer. Defaults to 1000. + lr_mult_list (list): Learning rate multiplier for the stages. Defaults to [1.0, 1.0, 1.0, 1.0, 1.0]. + Returns: + model: nn.Layer. Specific PPHGNetV2 model depends on args. + """ + + def __init__( + self, + stage_config, + stem_channels=[3, 32, 64], + use_lab=False, + use_last_conv=True, + class_expand=2048, + dropout_prob=0.0, + class_num=1000, + lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0], + det=False, + text_rec=False, + out_indices=None, + **kwargs, + ): + super().__init__() + self.det = det + self.text_rec = text_rec + self.use_lab = use_lab + self.use_last_conv = use_last_conv + self.class_expand = class_expand + self.class_num = class_num + self.out_indices = out_indices if out_indices is not None else [0, 1, 2, 3] + self.out_channels = [] + + # stem + self.stem = StemBlock( + in_channels=stem_channels[0], + mid_channels=stem_channels[1], + out_channels=stem_channels[2], + use_lab=use_lab, + lr_mult=lr_mult_list[0], + text_rec=text_rec, + ) + + # stages + self.stages = nn.LayerList() + for i, k in enumerate(stage_config): + ( + in_channels, + mid_channels, + out_channels, + block_num, + is_downsample, + light_block, + kernel_size, + layer_num, + stride, + ) = stage_config[k] + self.stages.append( + HGV2_Stage( + in_channels, + mid_channels, + out_channels, + block_num, + layer_num, + is_downsample, + light_block, + kernel_size, + use_lab, + stride, + lr_mult=lr_mult_list[i + 1], + ) + ) + if i in self.out_indices: + self.out_channels.append(out_channels) + if not self.det: + self.out_channels = stage_config["stage4"][2] + + self.avg_pool = AdaptiveAvgPool2D(1) + + if self.use_last_conv: + self.last_conv = Conv2D( + in_channels=out_channels, + out_channels=self.class_expand, + kernel_size=1, + stride=1, 
+ padding=0, + bias_attr=False, + ) + self.act = ReLU() + if self.use_lab: + self.lab = LearnableAffineBlock() + self.dropout = nn.Dropout(p=dropout_prob, mode="downscale_in_infer") + + self.flatten = nn.Flatten(start_axis=1, stop_axis=-1) + if not self.det: + self.fc = nn.Linear( + self.class_expand if self.use_last_conv else out_channels, + self.class_num, + ) + + self._init_weights() + + def _init_weights(self): + for m in self.sublayers(): + if isinstance(m, nn.Conv2D): + kaiming_normal_(m.weight) + elif isinstance(m, (nn.BatchNorm2D)): + ones_(m.weight) + zeros_(m.bias) + elif isinstance(m, nn.Linear): + zeros_(m.bias) + + def forward(self, x): + x = self.stem(x) + out = [] + for i, stage in enumerate(self.stages): + x = stage(x) + if self.det and i in self.out_indices: + out.append(x) + if self.det: + return out + + if self.text_rec: + if self.training: + x = F.adaptive_avg_pool2d(x, [1, 40]) + else: + x = F.avg_pool2d(x, [3, 2]) + return x diff --git a/paddlex/inference/models/text_recognition/modeling/pp_ocrv5_rec_modules/rnn.py b/paddlex/inference/models/text_recognition/modeling/pp_ocrv5_rec_modules/rnn.py new file mode 100644 index 0000000000..af097b764c --- /dev/null +++ b/paddlex/inference/models/text_recognition/modeling/pp_ocrv5_rec_modules/rnn.py @@ -0,0 +1,477 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import absolute_import, division, print_function + +import paddle +from paddle import nn +from paddle.nn.initializer import Constant, TruncatedNormal + +from .rec_ctc_head import get_para_bias_attr + +trunc_normal_ = TruncatedNormal(std=0.02) +zeros_ = Constant(value=0.0) +ones_ = Constant(value=1.0) + + +class Mlp(nn.Layer): + def __init__( + self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.0, + ): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class Identity(nn.Layer): + def __init__(self): + super(Identity, self).__init__() + + def forward(self, input): + return input + + +class Attention(nn.Layer): + def __init__( + self, + dim, + num_heads=8, + mixer="Global", + HW=None, + local_k=[7, 11], + qkv_bias=False, + qk_scale=None, + attn_drop=0.0, + proj_drop=0.0, + ): + super().__init__() + self.num_heads = num_heads + self.dim = dim + self.head_dim = dim // num_heads + self.scale = qk_scale or self.head_dim**-0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + self.HW = HW + if HW is not None: + H = HW[0] + W = HW[1] + self.N = H * W + self.C = dim + if mixer == "Local" and HW is not None: + hk = local_k[0] + wk = local_k[1] + mask = paddle.ones([H * W, H + hk - 1, W + wk - 1], dtype="float32") + for h in range(0, H): + for w in range(0, W): + mask[h * W + w, h : h + hk, w : w + wk] = 0.0 + mask_paddle = mask[:, hk // 2 : H + hk // 2, wk // 2 : W + wk // 2].flatten( + 1 + ) + mask_inf = paddle.full([H * W, H * W], 
"-inf", dtype="float32") + mask = paddle.where(mask_paddle < 1, mask_paddle, mask_inf) + self.mask = mask.unsqueeze([0, 1]) + self.mixer = mixer + + def forward(self, x): + qkv = ( + self.qkv(x) + .reshape((0, -1, 3, self.num_heads, self.head_dim)) + .transpose((2, 0, 3, 1, 4)) + ) + q, k, v = qkv[0] * self.scale, qkv[1], qkv[2] + + attn = q.matmul(k.transpose((0, 1, 3, 2))) + if self.mixer == "Local": + attn += self.mask + attn = nn.functional.softmax(attn, axis=-1) + attn = self.attn_drop(attn) + + x = (attn.matmul(v)).transpose((0, 2, 1, 3)).reshape((0, -1, self.dim)) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class Block(nn.Layer): + def __init__( + self, + dim, + num_heads, + mixer="Global", + local_mixer=[7, 11], + HW=None, + mlp_ratio=4.0, + qkv_bias=False, + qk_scale=None, + drop=0.0, + attn_drop=0.0, + drop_path=0.0, + act_layer=nn.GELU, + norm_layer="nn.LayerNorm", + epsilon=1e-6, + prenorm=True, + ): + super().__init__() + if isinstance(norm_layer, str): + self.norm1 = eval(norm_layer)(dim, epsilon=epsilon) + else: + self.norm1 = norm_layer(dim) + if mixer == "Global" or mixer == "Local": + self.mixer = Attention( + dim, + num_heads=num_heads, + mixer=mixer, + HW=HW, + local_k=local_mixer, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop, + ) + elif mixer == "Conv": + self.mixer = ConvMixer(dim, num_heads=num_heads, HW=HW, local_k=local_mixer) + else: + raise TypeError("The mixer must be one of [Global, Local, Conv]") + + self.drop_path = DropPath(drop_path) if drop_path > 0.0 else Identity() + if isinstance(norm_layer, str): + self.norm2 = eval(norm_layer)(dim, epsilon=epsilon) + else: + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp_ratio = mlp_ratio + self.mlp = Mlp( + in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop, + ) + self.prenorm = prenorm + + def forward(self, x): + if self.prenorm: + x = self.norm1(x + 
self.drop_path(self.mixer(x))) + x = self.norm2(x + self.drop_path(self.mlp(x))) + else: + x = x + self.drop_path(self.mixer(self.norm1(x))) + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + + +class ConvBNLayer(nn.Layer): + def __init__( + self, + in_channels, + out_channels, + kernel_size=3, + stride=1, + padding=0, + bias_attr=False, + groups=1, + act=nn.GELU, + ): + super().__init__() + self.conv = nn.Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=groups, + weight_attr=paddle.ParamAttr(initializer=nn.initializer.KaimingUniform()), + bias_attr=bias_attr, + ) + self.norm = nn.BatchNorm2D(out_channels) + self.act = act() + + def forward(self, inputs): + out = self.conv(inputs) + out = self.norm(out) + out = self.act(out) + return out + + +class Im2Seq(nn.Layer): + def __init__(self, in_channels, **kwargs): + super().__init__() + self.out_channels = in_channels + + def forward(self, x): + B, C, H, W = x.shape + assert H == 1 + x = x.squeeze(axis=2) + x = x.transpose([0, 2, 1]) # (NTC)(batch, width, channels) + return x + + +class EncoderWithRNN(nn.Layer): + def __init__(self, in_channels, hidden_size): + super(EncoderWithRNN, self).__init__() + self.out_channels = hidden_size * 2 + self.lstm = nn.LSTM( + in_channels, hidden_size, direction="bidirectional", num_layers=2 + ) + + def forward(self, x): + x, _ = self.lstm(x) + return x + + +class BidirectionalLSTM(nn.Layer): + def __init__( + self, + input_size, + hidden_size, + output_size=None, + num_layers=1, + dropout=0, + direction=False, + time_major=False, + with_linear=False, + ): + super(BidirectionalLSTM, self).__init__() + self.with_linear = with_linear + self.rnn = nn.LSTM( + input_size, + hidden_size, + num_layers=num_layers, + dropout=dropout, + direction=direction, + time_major=time_major, + ) + + # text recognition the specified structure LSTM with linear + if self.with_linear: + self.linear = 
nn.Linear(hidden_size * 2, output_size) + + def forward(self, input_feature): + recurrent, _ = self.rnn( + input_feature + ) # batch_size x T x input_size -> batch_size x T x (2*hidden_size) + if self.with_linear: + output = self.linear(recurrent) # batch_size x T x output_size + return output + return recurrent + + +class EncoderWithCascadeRNN(nn.Layer): + def __init__( + self, in_channels, hidden_size, out_channels, num_layers=2, with_linear=False + ): + super(EncoderWithCascadeRNN, self).__init__() + self.out_channels = out_channels[-1] + self.encoder = nn.LayerList( + [ + BidirectionalLSTM( + in_channels if i == 0 else out_channels[i - 1], + hidden_size, + output_size=out_channels[i], + num_layers=1, + direction="bidirectional", + with_linear=with_linear, + ) + for i in range(num_layers) + ] + ) + + def forward(self, x): + for i, l in enumerate(self.encoder): + x = l(x) + return x + + +class EncoderWithFC(nn.Layer): + def __init__(self, in_channels, hidden_size): + super(EncoderWithFC, self).__init__() + self.out_channels = hidden_size + weight_attr, bias_attr = get_para_bias_attr(l2_decay=0.00001, k=in_channels) + self.fc = nn.Linear( + in_channels, + hidden_size, + weight_attr=weight_attr, + bias_attr=bias_attr, + name="reduce_encoder_fea", + ) + + def forward(self, x): + x = self.fc(x) + return x + + +class EncoderWithSVTR(nn.Layer): + def __init__( + self, + in_channels, + dims=64, # XS + depth=2, + hidden_dims=120, + use_guide=False, + num_heads=8, + qkv_bias=True, + mlp_ratio=2.0, + drop_rate=0.1, + attn_drop_rate=0.1, + drop_path=0.0, + kernel_size=[3, 3], + qk_scale=None, + ): + super(EncoderWithSVTR, self).__init__() + self.depth = depth + self.use_guide = use_guide + self.conv1 = ConvBNLayer( + in_channels, + in_channels // 8, + kernel_size=kernel_size, + padding=[kernel_size[0] // 2, kernel_size[1] // 2], + act=nn.Swish, + ) + self.conv2 = ConvBNLayer( + in_channels // 8, hidden_dims, kernel_size=1, act=nn.Swish + ) + + self.svtr_block = 
nn.LayerList( + [ + Block( + dim=hidden_dims, + num_heads=num_heads, + mixer="Global", + HW=None, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + act_layer=nn.Swish, + attn_drop=attn_drop_rate, + drop_path=drop_path, + norm_layer="nn.LayerNorm", + epsilon=1e-05, + prenorm=False, + ) + for i in range(depth) + ] + ) + self.norm = nn.LayerNorm(hidden_dims, epsilon=1e-6) + self.conv3 = ConvBNLayer(hidden_dims, in_channels, kernel_size=1, act=nn.Swish) + # last conv-nxn, the input is concat of input tensor and conv3 output tensor + self.conv4 = ConvBNLayer( + 2 * in_channels, + in_channels // 8, + kernel_size=kernel_size, + padding=[kernel_size[0] // 2, kernel_size[1] // 2], + act=nn.Swish, + ) + + self.conv1x1 = ConvBNLayer(in_channels // 8, dims, kernel_size=1, act=nn.Swish) + self.out_channels = dims + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight) + if isinstance(m, nn.Linear) and m.bias is not None: + zeros_(m.bias) + elif isinstance(m, nn.LayerNorm): + zeros_(m.bias) + ones_(m.weight) + + def forward(self, x): + # for use guide + if self.use_guide: + z = x.clone() + z.stop_gradient = True + else: + z = x + # for short cut + h = z + # reduce dim + z = self.conv1(z) + z = self.conv2(z) + # SVTR global block + B, C, H, W = z.shape + z = z.flatten(2).transpose([0, 2, 1]) + for blk in self.svtr_block: + z = blk(z) + z = self.norm(z) + # last stage + z = z.reshape([0, H, W, C]).transpose([0, 3, 1, 2]) + z = self.conv3(z) + z = paddle.concat((h, z), axis=1) + z = self.conv1x1(self.conv4(z)) + return z + + +class SequenceEncoder(nn.Layer): + def __init__(self, in_channels, encoder_type, hidden_size=48, **kwargs): + super(SequenceEncoder, self).__init__() + self.encoder_reshape = Im2Seq(in_channels) + self.out_channels = self.encoder_reshape.out_channels + self.encoder_type = encoder_type + if encoder_type == "reshape": + self.only_reshape = True + else: + 
support_encoder_dict = { + "reshape": Im2Seq, + "fc": EncoderWithFC, + "rnn": EncoderWithRNN, + "svtr": EncoderWithSVTR, + "cascadernn": EncoderWithCascadeRNN, + } + assert encoder_type in support_encoder_dict, "{} must in {}".format( + encoder_type, support_encoder_dict.keys() + ) + if encoder_type == "svtr": + self.encoder = support_encoder_dict[encoder_type]( + self.encoder_reshape.out_channels, **kwargs + ) + elif encoder_type == "cascadernn": + self.encoder = support_encoder_dict[encoder_type]( + self.encoder_reshape.out_channels, hidden_size, **kwargs + ) + else: + self.encoder = support_encoder_dict[encoder_type]( + self.encoder_reshape.out_channels, hidden_size + ) + self.out_channels = self.encoder.out_channels + self.only_reshape = False + + def forward(self, x): + if self.encoder_type != "svtr": + x = self.encoder_reshape(x) + if not self.only_reshape: + x = self.encoder(x) + return x + else: + x = self.encoder(x) + x = self.encoder_reshape(x) + return x diff --git a/paddlex/inference/models/text_recognition/predictor.py b/paddlex/inference/models/text_recognition/predictor.py index 1dcc2f5b87..01141e86be 100644 --- a/paddlex/inference/models/text_recognition/predictor.py +++ b/paddlex/inference/models/text_recognition/predictor.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -16,6 +16,7 @@ from ....modules.text_recognition.model_list import MODELS from ....utils.deps import class_requires_deps, is_dep_available +from ....utils.device import TemporaryDeviceChanger from ....utils.fonts import ( ARABIC_FONT, CYRILLIC_FONT, @@ -52,6 +53,7 @@ def __init__(self, *args, input_shape=None, return_word_box=False, **kwargs): super().__init__(*args, **kwargs) self.input_shape = input_shape self.return_word_box = return_word_box + self.device = kwargs.get("device", None) self.vis_font = self.get_vis_font() self.pre_tfs, self.infer, self.post_op = self._build() @@ -73,7 +75,24 @@ def _build(self): pre_tfs[name] = op pre_tfs["ToBatch"] = ToBatch() - infer = self.create_static_infer() + if self._use_static_model: + infer = self.create_static_infer() + else: + if self.model_name in ["PP-OCRv5_mobile_rec", "PP-OCRv5_server_rec"]: + from .modeling import PPOCRV5Rec + + with TemporaryDeviceChanger(self.device): + infer = PPOCRV5Rec.from_pretrained( + self.model_dir, + use_safetensors=True, + convert_from_hf=True, + dtype="float32", + ) + infer.eval() + else: + raise RuntimeError( + f"There is no dynamic graph implementation for model {repr(self.model_name)}." 
+ ) post_op = self.build_postprocess(**self.config["PostProcess"]) return pre_tfs, infer, post_op @@ -86,7 +105,11 @@ def process(self, batch_data, return_word_box=False): indices = np.argsort(np.array(width_list)) batch_imgs = self.pre_tfs["ReisizeNorm"](imgs=batch_raw_imgs) x = self.pre_tfs["ToBatch"](imgs=batch_imgs) - batch_preds = self.infer(x=x) + if self._use_static_model: + batch_preds = self.infer(x=x) + else: + with TemporaryDeviceChanger(self.device): + batch_preds = self.infer(x=x) batch_num = self.batch_sampler.batch_size img_num = len(batch_raw_imgs) rec_image_shape = next( diff --git a/paddlex/inference/models/text_to_pinyin/__init__.py b/paddlex/inference/models/text_to_pinyin/__init__.py new file mode 100644 index 0000000000..9c05228964 --- /dev/null +++ b/paddlex/inference/models/text_to_pinyin/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .predictor import TextToPinyinPredictor diff --git a/paddlex/inference/models/text_to_pinyin/predictor.py b/paddlex/inference/models/text_to_pinyin/predictor.py new file mode 100644 index 0000000000..73cbd34447 --- /dev/null +++ b/paddlex/inference/models/text_to_pinyin/predictor.py @@ -0,0 +1,76 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ....modules.text_to_pinyin.model_list import MODELS +from ...common.batch_sampler import TextBatchSampler +from ..base import BasePredictor +from .result import TextToPinyinResult + + +class TextToPinyinPredictor(BasePredictor): + + entities = MODELS + + def __init__(self, *args, **kwargs): + """Initializes TextSegmentPredictor. + + Args: + *args: Arbitrary positional arguments passed to the superclass. + **kwargs: Arbitrary keyword arguments passed to the superclass. + """ + super().__init__(*args, **kwargs) + self.model = self._build() + + def _build_batch_sampler(self): + """Builds and returns an TextBatchSampler instance. + + Returns: + TextBatchSampler: An instance of TextBatchSampler. + """ + return TextBatchSampler() + + def _get_result_class(self): + """Returns the result class, TextToPinyinResult. + + Returns: + type: The TextToPinyinResult class. + """ + return TextToPinyinResult + + def _build(self): + """Build the model. + + Returns: + G2PWOnnxConverter: An instance of G2PWOnnxConverter. + """ + from .processors import G2PWOnnxConverter + + # build model + model = G2PWOnnxConverter( + model_dir=self.model_dir, style="pinyin", enable_non_tradional_chinese=True + ) + return model + + def process(self, batch_data): + """ + Process a batch of data through the preprocessing, inference, and postprocessing. + + Args: + batch_data (List[Union[str], ...]): A batch of input text data. + + Returns: + dict: A dictionary containing the input path and result. The result include the output pinyin dict. 
+ """ + result = self.model(batch_data[0]) + return {"result": [result]} diff --git a/paddlex/inference/models/text_to_pinyin/processors.py b/paddlex/inference/models/text_to_pinyin/processors.py new file mode 100644 index 0000000000..a9b3f7dbfe --- /dev/null +++ b/paddlex/inference/models/text_to_pinyin/processors.py @@ -0,0 +1,846 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Credits + This code is modified from https://github.com/GitYCC/g2pW +""" +import json +import os +import re +from collections import OrderedDict +from typing import Any, Dict, List, Tuple + +import numpy as np + +from ....utils.deps import is_dep_available +from ..common.tokenizer import BertTokenizer + +if is_dep_available("pypinyin"): + from pypinyin import Style, pinyin + +INITIALS = [ + "b", + "p", + "m", + "f", + "d", + "t", + "n", + "l", + "g", + "k", + "h", + "zh", + "ch", + "sh", + "r", + "z", + "c", + "s", + "j", + "q", + "x", +] + +FINALS = [ + "a", + "ai", + "ao", + "an", + "ang", + "e", + "er", + "ei", + "en", + "eng", + "o", + "ou", + "ong", + "ii", + "iii", + "i", + "ia", + "iao", + "ian", + "iang", + "ie", + "io", + "iou", + "iong", + "in", + "ing", + "u", + "ua", + "uai", + "uan", + "uang", + "uei", + "uo", + "uen", + "ueng", + "v", + "ve", + "van", + "vn", +] + +SPECIALS = ["sil", "sp"] +simplified_charcters = 
"制咖片型超声盘鉴定仔点他命书歌粉巾字帐恤手指记忆棒形转弯沟光○〇㐄㐅㐆㐌㐖毒㐜㐡㐤㐰㐺㑇㑳㒳㒸㔾㗂㗎㝵㞎㞙㞞以㢲㢴㤅㥁㥯㨗㫺㬎㮎㮚㮸㲋㲱㲾㳮涧㵪㶸㷖㷭㹢㹴犬㺢狓㺵碗㽮㿝䍃䔢䖟䖸䗈䗥䗪䝓射䥯䦉䯝鲃鱼䲔䳗鹅䵹鼄䶑一对应映射丁不识下儿子做二休世丘之貉并中台原则串为甚谓干净了百事无成八变五十些人得道鸡升天代如并来去个国政策劲幽灵在欧洲游荡接样萝卜坑侧化传价元论醇共再准刀两断切分耕耘收获钱货物向看旧就绪险刻千金动劳永逸匙零夜半卡通回复返影踪反常态口咬气句话同吐快吹周味呼诺呜品红锅哄而散起唱和问三知生熟团漆黑火糟堆场空块面塌糊涂尘染壁厢夔已足多情露水大早到晚夫妻当关万莫开失古恨套所料既往孔见提师要家主审寸阴难买斗牛小撮部阵局展身层巴掌帆风顺席地带过年计于春头载四季期被蛇怕井绳度愿式份弹顷深前律径心意念差愁孤行俱全房厅交遮打技长把抓死拿眼泪鼻涕钥锁折段抿拍即合扫排掬挥拨拥上入击洞掷揽改故辙败文值名斑方面旁族日秋餐隔雅里终父旦时晌会霎间晃暴寒曝更月望垠际朝夕本正经利杯羹东西板枝独秀根筋杆进条龙服务概模次函数又性程总付步脚印趋登毛拔呵氧氮碳决雌雄波未平派谎言流清楚白准溜烟潭有获闻是处降琴鹤甲病发可拾沙目然了直以相眨穿睹瞥瞬矢的解石鸟神教秉虔诚秘种窝蜂穷窍笑置笔苟勾销抹杀煞等奖箍节吃箭仇双雕诗筹箩筐系列纸级士官统丝毫挂维网尽线微吭响股脑胎脉承腔臂力致效资源址器举功投般说讲规贸易叶障着慎满皆输号木电池衣倾钟高低视仁觉醒览遗角银币触溃九鼎蔽抄出驷马追重语破贫洗贯走路安蹴至几蹶振跃役胆汗较辈轮辞赞退六连遍递边针血锤音错门思闪真倒项栽雾类保护川先惊乍体哄鳞爪鸣滴泡邻域党专鼓作齐炒丑烯亥克内酯冬加奴卯肝炎基尺梁街裤镐客宠庭巳汝昌烷玲磊糖肇酉醛啷青县韪良香骨鲷丂七集河市弦喜嘴张舌堵区工业姊妹星架构巧彩扭歪拼凑余热曜武州爷浮屠美乡老阶树荤素碎落能魄鳃鳗珠丄丅丆万俟丈尚摸母娘量管群亚虎必我堂令申件装伏位博侠义界表女墟台戏臭皮匠胜诸葛亮赛顶倍催请运算包立叉戟离疫苗土史志演围揭瓦晒夷姑婆帝村宝烂尖杉碱屉桌山岔岛由纪峡坝库镇废从德后拗汤治旬食明昧曹朋友框栏极权幂曲归依猫民氟硼氯磷铁江侗自旅法司洋浦梅园温暖湾焦班幸用田略番叠皇炮捶硝苯酸腺苷棱草镜穗跳远索锦纲聚氰胺联店胚膲爱色堇紫罗兰芝茶饭菱云虫藏藩乱叛苏亲债凳学座恐恋柱测肌腹衩锥系貂企乌跪叩军车农题迭都甘油屯奏键短阿姨陪姐只顾茅庐槽驾魂鲜鹿页其菜单乘任供势午齿汉组织吊调泻唇坡城报坟外夸将尉建筑岸岗公床扬新剑升杭林栗校楼标款汽社浣海商馆剧院钢华港机械广媒环球融第医科证券综财乐育游涨犹岭疏瘾睑确兵领导缴肢膛船艾瑟尔苍蔡虞效衫覆访诉课谕议轨述野钩限敌鞋颌颔颚饶首龈站例修凡划垂届属崽颏厨拜挫摆放旋削棋榻槛礼沉注滑营狱画确仪聘花葬诏员跌辖周达酒锚闸陷陆雨雪飞威丌于丹久乏予理评产亢卑亦乎舞己悲矩圆词害志但住佞佳便俗信票案幅翁倦伦假偏倚斜亏鬼敲停备伤脾胃仅此像俭匮免宜穴焉戴兼容许冻伯仲负彼昼皂轩轾实刊划颠卫战哥比省非好黄饰别拘束掩奶睬选择摇扰烦苦枚写协厌及格受欢迎约只估侵犯割状告或缺抗拒挽撤救药喻磨灭端倪少逆逾越避靠适吉誉吝玉含延咎歹听啻渊善谋均匀堪忍够太惹妙妥妨孕症孝术室完纳推冠积宣疑辩栗碴称屈挠屑干涉衡待很忙恶忿怎么怠急耻恭息悦惑惜惟想愉愧怍慌愤启懂懈怀材才紧招认扣抵拉舍也罢插揣冒搭撞南墙扩核支攻敢雷攀敬里吗需景智暇曾罪遇朽枉止况竞争辱求愈渝溶济左右袒困补爽特寂寞示弱找谢畏强疾徐痛痒冤符眠睦瞅董何厚云措活疲羞者轻玻璃祥兆禁移稂莠稳佛换答简结果盟绝缕途给谈否羁翼耐肖胫毋宁兴舒若菲莱痕迹窠臼虚衰脸兔撒鹰棺范该详讳抬泰让须眉象众赀账费灰赖奇虑训辍辨菽麦辛近送透逞徒速续逮捕遂遑违逊斧钺艰醉锈随观弃显饱脂肪使丏丐帮丒且慢末丕替桃宗王尊凉爵各图屋脊粮署录坛吾禄职胄袭君厦丗北壑桐疹损逢陵鹬丙寅戌氨腈唑纶辰酮脱氢酶醚丞丢现掉纱帽弄扯炮碗丠両丣坐存激肩臻蒂莲悖序驱丨丩丫挺杈髻鬟细介俄伊犁京尼布订普渡央委监察检查剂圈设警队斯督剩震境航舶革防托播促质版蝾螈锋研艺历残消频谱精密制造陲邮候埔坚压坜凹汇执府究邦俘摄寮彬狼岳肺肿庸英讯诊埋粒胞括控码韩暑枪枢砥澳哇牟寿甸钻探篇签缀缝继耳肯照妇埃悬璧轴柜台辣搁浅邪跑纤阮阳私囊魔丮丰姿采丱烧丳丵丶丷丸参寨朗桂瑞砂衷霞貌凤仆舰因嫌宰峰干络牌持旨祭祷簿编罚宾办丼丿乀乂乃乄仰慕盛旷留考验阔乆乇么丑麽乊湖燃乑乒乓乕乖僻忤戾离谬迕乗危肥劫除隙浪婿乙炔肠酰吡咯盐乚乛乜嘢卿玄宫尾狐龟塔嶷兄弟泉章霄钉耙乞扎哀怜恕讨乢乣乤乥乧乨乩童乪乫乭乳晕汁液瑶浆牙癌突窦罩腐胶猪酪蛋糕菌瘤乴乵乶乷乸乹乺乼乾俸冰嘉哕嚎坤妈尸垒旱枯涸俐渴潮涩煸豆燥爹瘦瘪癣瞪袋脆姜贝隆馏乿亀亁叫咕攘扔搞男砸窜蓬麻亃亄亅却亇迟典今临繁累卵奉婚聪躬巨与迁添裂副宿岁怪恶尕仑愣杆硅硫钛铀锰芑杂异钠砷胂磺琥珀舱棍簧胡茬盗浩盆贩郎腿亍洪亐互欠助勉惠操斥诿系户译亓墓碑刑铃卅渠缤纷
斗米旗宪钒灯徽瘟祖拳福谷丰脏腑绑肉腌苓蕴桥铺霸颜闹判喷冈底蛙陉矿亖亘亜罕们娜桑那努哈喀弗烈曼松森杜氏杯奥琛敦戊穆圣裔汇薛孙亟亡佚虏羊牢奋释卷卸契媾感额睫缠谊趾塞挤纽阻还配驰庄亨洛祚亪享津沪畿郊慈菴枇杷膏亭阁锃丽亳亶亹诛初责翻疯偶杰丛稠妖拖寰居吸授慧蜗吞壮魅狗矛盾益渣患忧稀描猿梦暂涯畜祸缘沸搜引擎臣横纭谁混援蒸兽狮税剖亻亼亽亡什献刹邡么仂仃仄仆富怨仈仉毕昔晨壳绍仍仏仒仕宦仗欺恃腰叹叹炬梓讫施仙后琼逝仚仝仞仟悔仡佬偿填泊拓扑簇羔购顿钦佩发棻阃驭养亿儆尤借帧赈凌叙帖李柔刚沃眦睚戒讹取飨读仨仫仮著泳卧躺韶夏裁仳仵唯贤凭钓诞仿似宋佛讽伀硕盼鹅伄儅伈伉俪柯始娃迈戈坦堡帕茨萨庙玛莉莎藤霍姆伋伍奢胥廷芳豪伎俩侍汛勒希羲雏伐憩整谟闲闲伕伙伴颐伜伝伢叔恒兹恩翰伱伲侣伶俜悧鼬伸懒缩喇叭伹伺伻伽倻辐伾似佃伫布乔妮墨佉卢佌贷劣廉昂档浓矮伞洼缓耗胸谷迷挡率龋宅沫舍疗佐贰佑占优据铧尝呢须鲁晓佗佘余坪寺瓜铳僧蒙芒陀龛哼呕坊奸孽弊揖祟茧缚誓贼佝偻瞀佟你夺赶佡佢佣佤佧贾佪佫佯佰佱洁绩酿肴佴卷佶佷佸佹佺佻佼佽佾具唤窘坏娱怒慨硬习惯聋膨胀蔓骇贵痹侀侁侂侃侄侅鸿燕侇侈糜靡侉侌妾侏儒仓鼠侐侑侔仑侘侚链侜偎傍钴循柳葫芦附価侮骂蔑侯岩截蚀局贴壶嬛宴捷携桶笺酌俣狭膝狄俅俉俊俏俎俑俓俔谚俚俛黎健呈固墒增守康箱湿祐镖镳杠盒靖膜龄俞豹猎噪孚封札筒托衍鸽剪撰稿炼厂禊练缮葺俯瞰撑冲效俳俴俵俶俷俺备俾伥倂倅储卒惶敷猝逃颉蓄崇隐倌倏忽刺蜡烛噍嚼坍扁抽毙葱楣灌灶粪背薮卖赔闭霉腾倓倔幸倘倜傥倝借箸挹浇阅倡狂倢倣値倥偬倨傲倩匡嗣冲柝珍倬倭寇猩倮倶倷倹勤赞偁偃充伪吏嗓寐惺扮拱芫茜藉虢钞偈伟晶偌宕距析滤殿疼瘫注颇偓偕鸭歇滞偝偟偢忘怡旺偨偩逼偫偭偯偰偱偲侦缉蹄偷减惰漏窥窃偸偺迹傀儡傅傈僳骂篱傎奎琳迪叟芭傒傔傕伧悉荒傜傞傢傣芽逼佣婢傮睨寄檄诵谣颂伛担辜弓惨蒿悼疤傺傻屄臆巢泄箧羡盖轧颓傿㑩僄僇佥僊働僎侨僔僖僚僝伪僣僤侥僦猴偾僩僬僭僮僯僰雇僵殖签静僾僿征陇儁侬儃儇侩朴薄儊儋儌儍傧儓俦侪拟尽儜儞儤儦儩汰哉寡渥裕酷儭儱罐儳儵儹傩俨儽兀臬臲鹫允勋勋宙宵帅憝彝谐嫂阋畅沛溢盈饥赫凶悍狠猛顽愚妣斩秦遣鞭耀敏荣槃泽爆碟磁秃缆辉霁卤朵娄孜烽酱勃汀箕裘钳耶蒙蕾彻兑软遭黜兎児韵媳爸兕觥兖兙兛兜售鍪肚兝兞兟兡兢兣樽殓涅睡禀籍赘泌啡肽奸幕涵涝熵疚眷稃衬讧赴焕椒歼植跏没试误猜栖窗肋袖颊兪卦撇胡岐廓轿疸枫茴珑厕秩募勺吨寓斤历亩迫筷厘最淫螺韬兮宽匪筛襄赢轭复兲诈刃堰戎痞蚁饷它冀铸冂冃円冇冉册嫁厉砺竭醮冏牧冑冓冔冕冖冗冘冞冢窄抑诬冥冫烘菇蛰冷凝坨橇淇淋炭饼砖碛窖醋雕雹霜冱冶炉艳嘲峻滩淡漠煖飕饮冼冽凃凄怆梗凅凇净凊凋敝蒙凔凛遵汞脢凞几凢処凰凯凵凶焰凸折刷纹预丧喽奔巡榜殡芙蓉租笼辑鞘萃凼锯镬刁蛮刂娩崩批拆摊掰蘖骤歧颗秒袂赃勿嘱忌磋琢肤刈羽刎讼戮舂桨艇刓刖霹雳刜创犊刡恙墅帜筵致劫劫刨昏默攸尿欲熏润薰圭删刮痧铲刱刲刳刴刵踏磅戳柏槐绣芹苋猬舟铭鹄鹜劫剁剃辫刭锉履铅克剌姻咽哨廊掠桅沿召瞻翅赵卜渺茫郭剒剔剕沥剚愎毅讷才剜剥啄采剞剟剡剣剤䌽剐肾驶黏剰袍剀紊铲剸剺剽剿劁劂札劈啪柴扳啦刘奭姥夼昫涓熙禅禹锡翔雁鹗刽刿弩柄蜻蛉劒劓劖劘劙澜篑赏矶釜晋甜薪逐劦熔纣虐赤囚劬劭労劵效劻劼劾峭艮勅勇励勍勐腊脖庞漫饲荡粥辄勖勗勘骄馁碌泮雇捐竹骑殊阱绩朴恳谨剿勧勩勯勰劢勋勷劝惩慰诫谏勹芡践阑匁庇拯粟扎袱裹饺匆遽匈匉匊匋匍匐茎匏匕妆痰脓蛹斋苑烤蹈塘羌熊阀螳螂疆碚竿纬荷茵邙魏匚匜匝匟扶稷匣匦拢匸匹耦匽匾匿卂叮疮禧轸堤棚迢钧炼卄卆遐卉瓷盲瓶当胱腱裸卋卌卍卐怯污贱鄙龌龊陋卓溪唐梯渔陈枣泥漳浔涧梨芬谯赡辕迦郑単驴弈洽鳌卛占筮卝卞卟吩啉屎翠厄卣卨卪卬卮榫袄玺绶钮蚤惧殆笃耸卲帘帙绕恤卼卽厂厎厓厔厖厗奚厘厍厜厝谅厕厤厥厪腻孢厮厰厳厣厹厺粕垢芜菁厼厾叁悟茸薯叄吵笄悌哺讥坫垄弧芯杠潜婴刍袁诘贪谍煽馈驳収岳缔灾贿骗叚叡吻拦蘑蜜诀燧玩砚筝椎蔺铜逗骊另觅叨唠谒杵姓喊嚷嚣咚咛塑寻恼憎擦只泣渗蝠叱吒咄咤喝籀黛舵舷叵叶铎懿昭穰苴辽叻叼吁堑嫖赌瞧爬众抒吅吆夥卺橡涤抱纵摩郡唁坠扇篮膀袜颈吋忾谘酬哭妓媛暗表缰迩妃羿絮蕃浑拐葵暮隅吔吖啶嗪戚吜啬噬咽吟哦咏吠吧唧嗒咐吪隽咀征燐苞茹钙哧吮吰吱嘎吲哚吴栋娇窟孟箫忠晗淞阖闾趼宇呐睛嘘拂捧疵熄竽笛糠吼吽呀吕韦蒙呃呆笨呇贡呉罄呋喃呎呏呔呠呡痴呣呤呦呧瑛眩扒晬淑姬瑜璇鹃呪呫哔嚅嗫呬呯呰呱呲咧噌钝呴呶呷呸呺呻哱咻啸噜吁坎坷逻呿咁咂咆哮咇咈咋蟹煦珅蔼咍咑咒诅咔哒嚓咾哝哩喱咗咠咡咢咣咥咦咨嗟询咩咪咫啮啮咭咮咱咲咳呛嗽咴啕咸咹咺呙喉咿婉恸悯赋矜绿茗蓝哂抢瞒哆嗦啰噻啾滨彗哋哌哎唷哟哏哐哞哢哤哪里哫啼喘哰哲萎蚌哳咩哽哿呗唅唆唈唉唎唏哗尧棣殇璜睿肃唔睇唕吣唞唣喳唪唬唰喏唲唳唵嘛唶唸唹唻唼唾唿啁啃鹦鹉啅埠栈榷祺铺鞅飙啊啍啎啐啓啕啖啗啜哑祈啢衔啤啥啫啱啲啵啺饥啽噶昆沁喁喂喆裙喈咙喋喌喎喑喒喓喔粗喙幛庆滋鹊喟喣喤喥喦喧骚喨喩梆吃葡萄喭驼挑吓碰枞瓣纯疱藻趟铬喵営喹喺喼喿嗀嗃嗄嗅嗈嗉嗊嗍嗐嗑嗔诟嗕嗖嗙嗛
嗜痂癖嗝嗡嗤嗥嗨唢嗬嗯嗰嗲嗵叽嗷嗹嗾嗿嘀嘁嘂嘅惋嘈峪禾荫啀嘌嘏嘐嘒啯啧嘚唛嘞嘟囔嘣嘥嘦嘧嘬嘭这谑严敞馋松哓嘶嗥呒虾嘹嘻啴嘿噀噂噅噇噉噎噏噔噗噘噙噚咝噞噢噤蝉皿噩噫噭嗳噱哙噳嚏涌洒欲巫霏噷噼嚃嚄嚆抖哜尝嚔苏嚚嚜嚞嚟呖嚬嚭嚮嚯亸喾饬按竣苛嚵嘤啭冁呓膪谦囍囒囓囗囘萧酚飘溅谛囝溯眸纥銮鹘囟殉囡団囤囥囧囨囱囫囵囬囮囯囲図囶囷囸囹圄圉拟囻囿圀圂圃圊粹蠹赦圌垦圏滚鲱凿枘圕圛圜圞坯埂壤骸炕祠窑豚绅魠鲮鳖圧握圩圪垯圬圮圯炸岬幔毯祇窨菩溉圳圴圻圾坂坆沾坋坌舛壈昆垫墩椅坒坓坩埚坭坰坱坳坴坵坻坼杨挣涎帘垃垈垌垍垓垔垕垗垚垛垝垣垞垟垤垧垮垵垺垾垿埀畔埄埆埇埈埌殃隍埏埒埕埗埜垭埤埦埧埭埯埰埲埳埴埵埶绋埸培怖桩础辅埼埽堀诃侄庑堃堄摧磐贞韧砌堈堉垩堋堌堍堎垴堙堞堠礁堧堨舆堭堮蜓摘堲堳堽堿塁塄塈煤茔棵塍垲埘塓绸塕鸦沽虱塙冢塝缪塡坞埙塥塩塬塱场螨塼塽塾塿墀墁墈墉墐夯増毁墝墠墦渍钵墫墬堕墰墺墙橱壅壆壊壌壎壒榨蒜壔壕壖圹垆壜壝垅壡壬壭壱売壴壹壻壸寝壿夂夅夆変夊夌漱邑夓腕泄甥御骼夗夘夙衮瑙妊娠醣枭珊莺鹭戗幻魇夤蹀秘擂鸫姚宛闺屿庾挞拇賛蛤裨菠氅漓捞湄蚊霆鲨箐篆篷荆肆舅荔鲆巷惭骰辟邱镕镰阪漂烩鲵鲽鳄鸨胪鹏妒峨谭枰晏玑癸祝秤竺牡籁恢罡蝼蝎赐绒御梭夬夭砣榆怙枕夶夹馅奄崛葩谲奈贺祀赠奌奂奓奕䜣詝奘奜奠奡奣陶奨奁魁奫奬奰娲孩贬隶酥宄狡猾她姹嫣妁毡荼皋膻蝇嫔妄妍嫉媚娆妗趣妚妞妤碍妬娅妯娌妲妳妵妺姁姅姉姗姒姘姙姜姝姞姣姤姧姫姮娥姱姸姺姽婀娀诱慑胁娉婷娑娓娟娣娭娯娵娶娸娼婊婐婕婞婤婥溪孺婧婪婬婹婺婼婽媁媄媊媕媞媟媠媢媬媮妫媲媵媸媺媻媪眯媿嫄嫈袅嫏嫕妪嫘嫚嫜嫠嫡嫦嫩嫪毐嫫嫬嫰妩嫺娴嫽嫿妫嬃嬅嬉耍婵痴艳嬔嬖嬗嫱袅嫒嬢嬷嬦嬬嬭幼嬲嬴婶嬹嬾嬿孀娘孅娈孏曰癫屏孑孓雀孖斟篓谜摺孛矻鸠崮轲祜鸾孥邈毓棠膑孬孭孰孱孳孵泛罔衔孻孪宀宁冗拙株薇掣抚琪瓿榴谧弥宊濂祁瑕宍宏碁宓邸谳実潢町宥宧宨宬徵崎骏掖阙臊煮禽蚕宸豫寀寁寥寃檐庶寎暄碜寔寖寘寙寛寠苫寤肘洱滥蒗陕核寪弘绰螽宝擅疙瘩晷対檐専尃尅赎绌缭畴衅尌峙醌襟痲碧屁昊槌淘恵瀑牝畑莓缸羚觑蔻脏躁尔尓锐尗尙尜尟尢尥尨尪尬尭尰擒尲尶尴尸尹潽蠖蛾尻扣梢蚴鳍脬蹲屇屌蚵屐屃挪屖屘屙屛屝屡屣峦嶂岩舄屧屦屩屪屃屮戍驻钾崖嵛巅旮旯楂榄榉芋茱萸靛麓屴屹屺屼岀岊岌岍阜岑彭巩岒岝岢岚岣岧岨岫岱岵岷峁峇峋峒峓峞峠嵋峨峰峱岘峹峿崀崁崆祯崋崌崃岖昆崒崔嵬巍萤颢崚崞崟崠峥巆崤崦崧殂岽崱崳崴崶崿嵂嵇嵊泗嵌嵎嵒嵓岁嵙嵞嵡嵩嵫嵯嵴嵼嵾嵝崭崭晴嶋嶌嶒嶓嵚崂嶙嶝嶞峤嶡嶢峄嶨嶭嶮嶰嶲岙嵘巂巃巇巉岿巌巓巘巛滇芎巟巠弋回巣巤炊擘蜥蟒蛊觋巰蜀彦淖杏茂甫楞巻巽帼巿帛斐鲫蕊帑帔帗帚琉汶帟帡帣帨裙帯帰帷帹暆帏幄帮幋幌幏帻幙帮幞幠幡幢幦幨幩幪帱幭幯幰遥蹉跎馀庚鉴幵幷稚邃庀庁広庄庈庉笠庋跋庖牺庠庤庥鲸庬庱庳庴庵馨衢庹庿廃厩廆廋廌廎廏廐廑廒荫廖廛厮搏锣廞弛袤廥廧廨廪廱绵踵髓廸迫瓯邺廻廼廾廿躔弁皱弇弌弍弎弐弑吊诡憾荐弝弢弣弤弨弭弮弰弪霖繇焘斌旭溥骞弶弸弼弾彀彄别累纠强彔彖彘彟彟陌彤贻彧绘虹彪炳雕蔚鸥彰瘅彲彳彴仿彷徉徨彸彽踩敛旆徂徇徊渭畲铉裼従筌徘徙徜徕膳苏萌渐徬徭醺徯徳徴潘徻徼忀瘁胖燎怦悸颤扉犀澎湃砰恍惚绞隘忉惮挨饿忐忑忒忖応忝忞耿忡忪忭忮忱忸怩忻悠懑怏遏怔怗怚怛怞怼黍讶怫怭懦怱怲恍怵惕怸怹恁恂恇恉恌恏恒恓恔恘恚恛恝恞恟恠恣恧眄恪恫恬澹恰恿悀悁悃悄悆悊悐悒晦悚悛悜悝悤您悩悪悮悰悱凄恻德悴怅惘闷悻悾惄愫钟蒐惆惇惌惎惏惓惔惙惛耄惝疟浊恿惦德恽惴蠢惸拈愀愃愆愈愊愍愐愑愒愓愔愕恪氓蠢騃昵惬赧悫愬愮愯恺愼慁恿慅慆慇霭慉慊愠慝慥怄怂慬慱悭慴慵慷戚焚憀灼郁憃惫憋憍眺捏轼愦憔憖憙憧憬憨憪憭怃憯憷憸憹憺懃懅懆邀懊懋怿懔懐懞懠懤懥恹懫懮懰懱毖懵遁梁雍忏懽戁戄戆戉戋戕戛戝戛戠戡戢戣戤戥戦戬戭戯轰戱披菊牖戸戹戺戻卯戽锹扂楔扃扆扈扊杖牵绢铐镯赉扐搂搅烊盹瞌跟趸镲靶鼾払扗玫腮扛扞扠扡扢盔押扤扦扱罾揄绥鞍郤窾扻扼扽抃抆抈抉抌抏瞎抔缳缢擞抜拗択抨摔歉蹿牾抶抻搐泵菸拃拄拊髀抛拌脯拎拏拑擢秧沓曳挛迂拚拝拠拡拫拭拮踢拴拶拷攒拽掇芥橐簪摹疔挈瓢骥捺蹻挌挍挎挐拣挓挖掘浚挙揍聩挲挶挟挿捂捃捄捅捆捉捋胳膊揎捌捍捎躯蛛捗捘捙捜捥捩扪捭据捱捻捼捽掀掂抡臀膘掊掎掏掐笙掔掗掞棉芍掤搪阐掫掮掯揉掱掲掽掾揃揅揆搓揌诨揕揗揘揜揝揞揠揥揩揪揫橥遒麈揰揲揵揶揸背揺搆搉搊搋搌搎搔搕撼橹捣搘搠搡搢搣搤搥搦搧搨搬楦裢讪赸掏搰搲搳搴揾搷搽搾搿摀摁摂摃摎掴摒摓跤摙摛掼摞摠摦喉羯摭摮挚摰摲抠摴抟摷掺摽撂撃撅稻撊撋挦锏泼撕撙撚㧑挢撢掸撦撅撩撬撱朔揿蚍蜉挝捡擀掳闯擉缶觚擐擕擖擗擡擣擤澡腚擧擨擩擫擭摈拧撷擸撸擽擿攃摅撵攉攥攐攓撄搀撺每攩攫辔澄攮攰攲攴轶攷砭讦攽碘敁敃敇敉叙敎筏敔敕敖闰诲敜煌敧敪敳敹敺敻敿斁衽斄牒绉诌斉斎斓鹑谰驳鳢斒筲斛斝斞斠斡斢斨斫斮晾沂潟颖绛邵斲斸釳於琅斾斿旀旗旃旄涡旌旎旐旒旓旖旛旝旟旡旣浴旰獭魃旴时旻旼旽昀昃昄昇昉晰躲澈熹皎皓矾昑昕昜昝昞昡昤晖笋昦昨是昱昳昴昶昺昻晁蹇隧蔬髦晄晅晒晛晜晞晟
晡晢晤晥曦晩萘莹顗晿暁暋暌暍暐暔暕煅旸暝暠暡曚暦暨暪朦胧昵暲殄冯暵暸暹暻暾曀晔昙曈曌曏曐暧曘曙曛叠昽曩骆曱甴肱曷牍禺锟曽沧耽朁朅朆杪栓夸竟粘绦朊膺朏朐朓朕朘朙瞄觐溘饔飧朠朢朣栅椆淀虱朩朮朰朱炆璋钰炽鹮朳槿朵朾朿杅杇杌陧欣钊湛漼楷瀍煜玟缨翱肇舜贽适逵杓杕杗杙荀蘅杝杞脩珓筊杰榔狍閦颦缅莞杲杳眇杴杶杸杻杼枋枌枒枓衾葄翘纾逋枙狸桠枟槁枲枳枴枵枷枸橼枹枻柁柂柃柅柈柊柎某柑橘柒柘柙柚柜柞栎柟柢柣柤柩柬柮柰柲橙柶柷柸柺査柿栃栄栒栔栘栝栟柏栩栫栭栱栲栳栴檀栵栻桀骜桁镁桄桉桋桎梏椹葚桓桔桕桜桟桫椤桭杯桯桲桴桷桹湘溟梃梊梍梐潼栀枧梜梠梡梣梧梩梱梲梳梴梵梹棁棃樱棐棑棕榈簑绷蓑枨棘棜棨棩棪棫棬棯棰棱棳棸棹椁棼碗椄苕椈椊椋椌椐椑椓椗検椤椪椰椳椴椵椷椸椽椿楀匾楅篪楋楍楎楗楘楙楛楝楟楠楢楥桢楩楪楫楬楮楯楰梅楸楹楻楽榀榃榊榎槺榕榖榘榛狉莽搒笞榠榡榤榥榦榧杩榭榰榱梿霰榼榾桤槊闩槎槑槔槖様槜槢槥椠槪槭椮槱槲槻槼槾樆樊樏樑樕樗樘樛樟樠樧樨権樲樴樵猢狲桦樻罍樾樿橁橄橆桡笥龠橕橚橛辆椭橤橧竖膈跨橾橿檩檃檇柽檍檎檑檖檗桧槚檠樯檨檫檬梼槟檴檵柠棹櫆櫌栉櫜椟櫡槠栌枥榇栊櫹棂茄櫽欀欂欃欐欑栾欙棂溴欨欬欱欵欶欷歔欸欹欻欼欿歁歃歆艎歈歊莳蝶歓歕歘歙歛歜欤歠蹦诠镶蹒跚升陟歩歮歯歰歳歴璞歺瞑歾殁夭殈殍殑殗殜殙殛殒殢殣殥殪殚僵殰殳荃殷殸殹蛟殻肴谤殴毈毉喂毎毑蕈毗毘毚茛邓毧毬毳毷毹毽毾毵牦氄氆靴氉氊氇氍氐聊氕氖気氘氙氚氛氜氝氡汹焊痉氤氲氥氦铝锌氪烃氩铵痤汪浒漉痘盂碾菖蒲蕹蛭螅氵冰氹氺氽烫氾氿渚汆汊汋汍汎汏汐汔汕褟汙汚汜蓠沼秽蔑汧汨汩汭汲汳汴堤汾沄沅沆瀣沇沈葆浸沦湎溺痼疴沌沍沏沐沔沕沘浜畹砾沚沢沬沭沮沰沱灢沴沷籽沺烹濡洄泂肛泅泆涌肓泐泑泒泓泔泖泙泚泜泝泠漩馍涛粼泞藓鳅泩泫泭泯铢泱泲洇洊泾琵琶荽蓟箔洌洎洏洑潄濯洙洚洟洢洣洧洨洩痢滔洫洮洳洴洵洸洹洺洼洿淌蜚浄浉浙赣渫浠浡浤浥淼瀚浬浭翩萍浯浰蜃淀苔蛞蝓蜇螵蛸煲鲤浃浼浽溦涂涊涐涑涒涔滂莅涘涙涪涫涬涮涴涶涷涿淄淅淆淊凄黯淓淙涟淜淝淟淠淢淤渌淦淩猥藿亵淬淮淯淰淳诣涞纺淸淹炖癯绮渇済渉渋渓渕涣渟渢滓渤澥渧渨渮渰渲渶渼湅湉湋湍湑湓湔黔湜湝浈湟湢湣湩湫湮麟湱湲湴涅満沩溍溎溏溛舐漭溠溤溧驯溮溱溲溳溵溷溻溼溽溾滁滃滉滊荥滏稽滕滘汇滝滫滮羼耷卤滹浐煎漈漊漎绎漕漖漘漙沤漜漪漾漥漦漯漰溆漶漷濞潀颍潎潏潕潗潚潝潞潠潦祉疡潲潵滗潸潺潾涠澁澂澃澉澌澍澐澒澔澙渑澣澦澧澨澫澬浍澰澴澶澼熏郁濆濇濈濉濊貊濔疣濜濠濩觞浚濮盥潍濲泺瀁滢渎渖瀌浏瀒瀔濒泸瀛潇潆瀡潴泷濑瀬弥潋瀳瀵瀹瀺瀼沣滠灉灋灒漓灖灏灞灠滦灥灨滟灪蜴灮烬獴灴灸灺炁炅鱿炗炘炙炤炫疽烙钎炯炰炱炲炴炷毁炻烀烋瘴鲳烓烔焙烜烝烳饪烺焃焄耆焌焐焓焗焜焞焠焢焮焯焱焼煁煃煆煇煊熠煍熬煐炜煕暖熏硷霾煚煝煟煠茕矸煨琐炀萁煳煺煻熀熅熇熉罴荧穹炝熘熛熜稔谙烁熤熨熯熰眶蚂颎熳熸熿燀烨燂燄盏燊燋燏燔隼燖焖燠燡灿燨燮燹燻燽燿爇爊爓爚爝爟爨蟾爯爰为爻丬爿牀牁牂牄牋窗牏牓窗釉牚腩蒡虻牠虽蛎牣牤牮牯牲牳牴牷牸牼绊牿靬犂犄犆犇犉犍犎犒荦犗犛犟犠犨犩犪犮犰狳犴犵犺狁甩狃狆狎狒獾狘狙黠狨狩狫狴狷狺狻豕狈蜘猁猇猈猊猋猓猖獗猗猘狰狞犸猞猟獕猭猱猲猳猷猸猹猺玃獀獃獉獍獏獐獒毙獙獚獜獝獞獠獢獣獧鼇蹊狯猃獬豸狝獯鬻獳犷猕猡玁菟玅玆玈珉糁禛郅玍玎玓瓅玔玕玖玗玘玞玠玡玢玤玥玦珏瑰玭玳瑁玶玷玹玼珂珇珈瑚珌馐馔珔珖珙珛珞珡珣珥珧珩珪佩珶珷珺珽琀琁陨玡琇琖琚琠琤琦琨琫琬琭琮琯琰琱琲琅琴珐珲瑀瑂瑄瑉玮瑑瑔瑗瑢瑭瑱瑲瑳瑽瑾瑿璀璨璁璅璆璈琏璊璐璘璚璝璟璠璡璥瑷璩璪璫璯璲玙璸璺璿瓀璎瓖瓘瓒瓛脐瓞瓠瓤瓧瓩瓮瓰瓱瓴瓸瓻瓼甀甁甃甄甇甋甍甎甏甑甒甓甔瓮甖甗饴蔗甙诧钜粱盎锈团甡褥産甪甬甭甮宁铠甹甽甾甿畀畁畇畈畊畋畎畓畚畛畟鄂畤畦畧荻畯畳畵畷畸畽畾疃叠疋疍疎箪疐疒疕疘疝疢疥疧疳疶疿痁痄痊痌痍痏痐痒痔痗瘢痚痠痡痣痦痩痭痯痱痳痵痻痿瘀痖瘃瘈瘉瘊瘌瘏瘐痪瘕瘖瘙瘚瘛疭瘜瘝瘗瘠瘥瘨瘭瘆瘯瘰疬瘳疠瘵瘸瘺瘘瘼癃痨痫癈癎癐癔癙癜癠疖症癞蟆癪瘿痈発踔绀蔫酵皙砬砒翎翳蔹钨镴皑鹎驹暨粤褶皀皁荚皃镈皈皌皋皒朱皕皖皘皜皝皞皤皦皨皪皫皭糙绽皴皲皻皽盅盋碗盍盚盝踞盦盩秋千盬盭眦睁瞤盯盱眙裰盵盻睐眂眅眈眊県眑眕眚眛眞眢眣眭眳眴眵眹瞓眽郛睃睅睆睊睍睎困睒睖睙睟睠睢睥睪睾睯睽睾眯瞈瞋瞍逛瞏瞕瞖眍䁖瞟瞠瞢瞫瞭瞳瞵瞷瞹瞽阇瞿眬矉矍铄矔矗矙瞩矞矟矠矣矧矬矫矰矱硪碇磙罅舫阡、矼矽礓砃砅砆砉砍砑砕砝砟砠砢砦砧砩砫砮砳艏砵砹砼硇硌硍硎硏硐硒硜硖砗磲茚钡硭硻硾碃碉碏碣碓碔碞碡碪碫碬砀碯碲砜碻礴磈磉磎硙磔磕磖磛磟磠磡磤磥蹭磪磬磴磵磹磻硗礀硚礅礌礐礚礜礞礤礧礮砻礲礵礽礿祂祄祅祆禳祊祍祏祓祔祕祗祘祛祧祫祲祻祼饵脔锢禂禇禋祦禔祎隋禖禘禚禜禝禠祃禢禤禥禨禫祢禴禸秆秈秊闱飒秋秏秕笈蘵赁秠秣秪秫秬秭秷秸稊稌稍稑稗稙稛稞稬秸稲稹稼颡稿穂穄穇穈穉穋稣贮穏穜穟秾穑穣穤穧穨穭穮穵穸窿阒窀窂窅窆窈窕窊窋窌窒窗窔窞窣窬黩蹙窑窳窴窵窭窸窗竁竃竈竑竜并竦竖篦篾笆鲛竾笉笊笎笏笐靥笓笤
箓笪笫笭笮笰笱笲笳笵笸笻筀筅筇筈筎筑筘筠筤筥筦笕筒筭箸筰筱筳筴宴筸箂个箊箎箑箒箘箙箛箜篌箝箠箬镞箯箴箾篁筼筜篘篙篚篛篜篝篟篠篡篢篥篧篨篭篰篲筚篴篶篹篼箦簁簃簆簉簋簌簏簜簟簠簥簦簨簬簰簸簻籊藤籒籓籔签籚篯箨籣籥籧笾簖籫籯芾麴籵籸籹籼粁秕粋粑粔粝粛粞粢粧粨粲粳稗粻粽辟粿糅糆糈糌糍糒糔萼糗蛆蹋糢糨糬粽糯糱籴粜糸糺紃蹼鲣霉纡纨绔纫闽襻紑纰纮锭鸢鹞纴紞紟扎紩紬绂绁纻紽紾绐絁絃絅経絍绗絏缡褵絓絖絘絜绚絣螯絪絫聒絰絵绝絺絻絿綀绡綅绠绨绣綌綍綎捆綖綘継続缎绻綦綪线綮綯绾罟蝽綷縩绺绫緁绲緅緆缁绯緌緎総緑绱緖缃缄缂绵缗緤褓缌纂緪緰缑缈缏缇縁縃縄萦缙缒縏缣縕缞縚缜缟缛縠縡縢縦绦縯縰骋缧縳纤缦絷缥縻衙縿繄缫繈繊繋繐缯繖繘繙繠缋繣繨缰缲繸繻缱纁纆纇缬缵纩纑纕缵纙纚纛缾罃罆坛罋罂罎罏罖罘罛罝罠罣罥罦罨罫罭锾罳罶罹罻罽罿羂羃羇芈蕉51鸵羑羖羌羜羝羢羣羟羧羭羮羰羱羵羶羸藜鲐翀翃翅翊翌翏翕翛翟翡翣翥翦跹翪翫翚翮翯翱翽翾翿板饕鸹锨耋耇耎耏专耒耜耔耞耡耤耨耩耪耧耰鬓耵聍聃聆聎聝聡聦聱聴聂聼阈聿肄肏肐肕腋肙肜肟肧胛肫肬肭肰肴肵肸肼胊胍胏胑胔胗胙胝胠铨胤胦胩胬胭胯胰胲胴胹胻胼胾脇脘脝脞脡脣脤脥脧脰脲脳腆腊腌臜腍腒腓胨腜腠脶腥腧腬腯踝蹬镣腴腶蠕诽膂腽嗉膇膋膔腘膗膙膟黐膣膦膫膰膴膵膷脍臃臄臇臈臌臐臑臓膘臖臙臛臝臞臧蓐诩臽臾臿舀舁鳑鲏舋舎舔舗馆舝舠舡舢舨舭舲舳舴舸舺艁艄艅艉艋艑艕艖艗艘艚艜艟艣舣艨艩舻艬艭荏艴艳艸艹艻艿芃芄芊萰陂藭芏芔芘芚蕙芟芣芤茉芧芨芩芪芮芰鲢芴芷芸荛豢芼芿苄苒苘苙苜蓿苠苡苣荬苤苎苪镑苶苹苺苻苾茀茁范蠡萣茆茇茈茌茍茖茞茠茢茥茦菰茭茯茳藨茷藘茼荁荄荅荇荈菅蜢鸮荍荑荘豆荵荸荠莆莒莔莕莘莙莚莛莜莝莦莨菪莩莪莭莰莿菀菆菉菎菏菐菑菓菔芲菘菝菡菢菣菥蓂菧菫毂蓥菶菷菹醢菺菻菼菾萅萆苌萋萏萐萑萜萩萱萴莴扁萻葇葍葎葑荭葖葙葠葥苇葧葭药葳葴葶葸葹葽蒄蒎莼茏薹莅蒟蒻蒢蒦蒨蒭藁蒯蒱鉾蒴蒹蒺蒽荪蓁蓆蓇蓊蓌蓍蓏蓓蓖蓧蓪蓫荜跣藕苁蓰蓱莼蓷蓺蓼蔀蔂蔃蔆蔇蔉蔊蔋蔌蔎蔕蔘蔙蒌蔟锷蒋雯茑蔯蔳麻蔵蔸蔾荨蒇蕋蕍荞蕐蕑芸莸蕖蕗蕝蕞蕠蕡蒉蕣蕤蕨蕳蓣蕸蕺蕻薀薁薃薅薆荟薉芗薏薐蔷薖薘剃谔钗薜薠薢薤薧薨薫薬薳薶薷薸薽薾薿藄藇藋荩藐藙藚藟藦藳藴苈藷藾蘀蘁蕲苹蘗蘘蘝蘤蘧蘩蘸蘼虀虆虍蟠虒虓虖虡虣虥虩虬虰蛵蛇虷鳟虺虼蚆蚈蚋蚓蚔蚖蚘蚜蚡蚣蚧蚨蚩蚪蚯蚰蜒蚱蚳蚶蚹蚺蚻蚿蛀蛁蛄蛅蝮蛌蛍蛐蟮蛑蛓蛔蛘蛚蛜蛡蛣蜊蛩蛱蜕螫蜅蚬蜈蝣蜋蜍蜎蜑蠊蜛饯蜞蜣蜨蜩蜮蜱蜷蜺蜾蜿蝀蝃蝋蝌蝍蝎蝏蝗蝘蝙蝝鲼蝡蝤蝥猿蝰虻蝲蝴蝻螃蠏蛳螉螋螒螓螗螘螙螚蟥螟螣螥螬螭䗖螾螀蟀蟅蝈蟊蟋蟑蟓蟛蟜蟟蟢虮蟨蟪蟭蛲蟳蛏蟷蟺蟿蠁蠂蠃虿蠋蛴蠓蚝蠗蠙蠚蠛蠜蠧蟏蠩蜂蠮蠰蠲蠵蠸蠼蠽衁衄衄衇衈衉衋衎衒同衖胡衞裳钩衭衲衵衹衺衿袈裟袗袚袟袢袪袮袲袴袷袺袼褙袽裀裉袅裋夹裍裎裒裛裯裱裲裴裾褀褂褉褊裈褎褐褒褓褔褕袆褚褡褢褦褧褪褫袅褯褰褱裆褛褽褾襁褒襆裥襉襋襌襏襚襛襜裣襞襡襢褴襦襫襬襭襮襕襶襼襽襾覂覃覅霸覉覊覌覗觇覚覜觍觎覧覩觊觏覰観觌觔觕觖觜觽觝觡酲觩觫觭觱觳觯觷觼觾觿言赅讣訇訏訑訒诂讬訧訬訳訹证訾詀詅诋毁詈詊讵詑诒诐詗诎察詨诜詶詸詹詻诙诖誂誃诔锄诓誋诳诶悖誙诮诰誧説読誯谇訚谄谆諆諌诤诹诼諕谂谀諝谝諟喧谥諴諵谌谖誊謆謇歌謍謏謑谡谥謡謦謪谪讴謷謼谩哗譅譆譈譊讹譒撰谮鑫譞噪譩谵譬譱譲谴譸譹谫讅讆詟䜩雠讐谗谶讙谠讟谽豁豉豇岂豊豋豌豏豔豞豖豗豜豝豣豦豨豭豱豳豵豶豷豺豻貅貆狸猊貔貘䝙貜貤餍贳餸贶贲赂賏赊赇赒賝赓赕賨赍斗賮賵賸赚赙赜赟贉赆赑贕赝赬赭赱赳迄趁趂趄趐趑趒趔趡趦趫趮趯趱趴趵趷趹趺趿跁跂跅跆踬跄跐跕跖跗跙跛跦跧跩跫跬跮跱跲跴跺跼跽踅踆踈踉踊踒踖踘踜踟躇蹰踠踡踣踤踥踦踧跷踫踮逾踱踊踶踹踺踼踽躞蹁蹂躏蹎蹐蹓蹔跸蹚蹜蹝迹蹠蹡蹢跶蹧蹩蹪蹯鞠蹽躃躄躅踌跻躐踯跞躘躙躗躝躠蹑躜躧躩躭躰躬躶軃軆辊軏轫軘軜軝腭転軥軨軭軱轱辘軷轵轺軽軿輀輂辇辂辁輈挽輗辄辎辋輠輤輬輭輮辏輴輵輶輹輼辗辒轇轏轑轒辚轕轖轗轘轙轝轞轹轳罪辣辞辵辶辺込辿迅迋迍麿迓迣迤逦迥迨迮迸迺迻迿逄逅逌逍逑逓迳逖逡逭逯逴逶逹遄遅侦遘遛遝遢遨遫遯遰遴绕遹遻邂邅邉邋邎邕邗邘邛邠邢邧邨邯郸邰邲邳邴邶邷邽邾邿郃郄郇郈郔郕郗郙郚郜郝郞郏郠郢郪郫郯郰郲郳郴郷郹郾郿鄀鄄郓鄇鄈鄋鄍鄎鄏鄐鄑邹邬鄕郧鄗鄘鄚鄜鄞鄠鄢鄣鄤鄦鄩鄫鄬鄮鄯鄱郐鄷鄹邝鄻鄾鄿酃酅酆酇郦酊酋酎酏酐酣酔酕醄酖酗酞酡酢酤酩酴酹酺醁醅醆醊醍醐醑醓醖醝酝醡醤醨醪醭醯醰酦醲醴醵醸醹醼醽醾釂酾酽釆釈鲈镏阊钆钇钌钯钋鼢鼹钐钏釪釬釭釱钍釸钕钫鈃钭鈆鈇钚鈊鈌钤钣鈒鈤钬钪鈬铌铈钶铛钹铍钸钿鉄鉆铊铇鉌铋鉏铂钷铆钵鉥钲鉨钼钽鉱鉲鉶铰铒鉼铪銍銎铣銕镂铫铦铑铷銤铱铟銧铥铕铯銭銰焊銶锑锉汞鋂锒鋆鋈鋊铤鋍铗鋐鋑鋕鋘鋙锊锓锔锇铓鋭铖锆锂铽鋳鋹鋺鉴镚钎錀锞锖锫锩錍铔锕錔锱铮锛錞锬锜錤錩錬録铼錼锝钔锴鍉镀鍏鍐铡鍚锻锽锸锲锘鍫鍭鍱鍴锶鍹锗针锺锿镅鎉鎋鎌鎍鎏鎒鎓鎗镉鎚鎞镃鎤铩锼鎭鎯镒镍鎴镓鎸鎹镎镟鏊镆镠镝鏖铿锵鏚镗镘镛鏠鏦錾镤鏸镪鏻鏽鏾铙鐄鐇鐏铹镦
镡鐗馗镫镢镨鐡锎镄鐩镌鐬鐱镭鐶鐻鐽镱鑀鑅镔鑐鑕鑚鑛鑢鑤镥鑪镧鑯鑱鑴鑵镊镢钃镻闫闬闶闳閒闵閗閟阂関合閤哄阆閲阉閺阎阏阍阌暗闉阕阗闑闒闿闘闚阚闟闠闤闼阞阢阤阨阬阯阹阼阽陁陑陔陛陜陡陥陬骘陴険陼陾阴隃隈隒隗隞隠隣隤隩隮隰颧隳隷隹雂雈雉雊雎雑雒雗雘雚雝雟雩雰雱驿霂霅霈霊沾霒霓霙霝霢霣霤霨霩霪霫霮靁叇叆靑靓靣腼靪靮靰靳靷靸靺靼靿鞀鞃鞄鞍鞗鞙鞚鞝鞞鞡鞣鞨鞫鞬鞮鞶鞹鞾鞑韅鞯驮韍韎韔韖韘韝韫韡韣韭韭韱韹韺頀刮頄顸顼頍颀颃颁頖頞頠頫頬颅頯頲颕頼悴顋顑颙颛颜顕顚顜颟顣颥颞飐飑台飓颸飏飖颽颾颿飀飂飚飌翻飡飣饲飥饨饫飮飧飶餀餂饸饹餇餈饽哺馂餖餗餚馄馃餟餠餤餧餩餪餫糊餮糇餲饧馎糕饩馈馊馌馒饇馑馓膳饎饐饘饟馕馘馥馝馡馣骝骡馵馹駃駄駅駆駉駋驽駓驵駗骀驸駜骂骈駪駬骃駴骎駹駽駾騂騄骓騆騉騋骒骐麟騑騒験騕骛騠騢騣騤騧骧騵驺骟騺蓦骖骠骢驆驈骅驌骁驎骣驒驔驖驙驦驩驫骺鲠骫骭肮骱骴骶骷髅骾髁髂髄髆膀髇髑髌髋髙髝髞髟髡髣髧髪髫髭髯髲髳髹髺髽髾鬁鬃鬅鬈鬋鬎鬏鬐鬑鬒鬖鬗鬘鬙鬠鬣斗鬫鬬阄鬯鬰鬲鬵鬷魆魈魊魋魍魉魑魖鳔魛魟魣魦魨魬鲂魵魸鮀鲅鮆鲧鲇鲍鲋鮓鲒鲕鮟鱇鮠鮦鮨鲔鲑鮶鮸鮿鲧鯄鯆鲩鯈鲻鯕鲭鲞鯙鯠鲲鯥鲰鲶鳀鯸鳊鲗䲠鹣鳇鰋鳄鳆鰕鰛鰜鲥鰤鳏鰦鳎鳐鳁鳓鰶鲦鲡鰼鰽鱀鱄鳙鱆鳕鱎鱐鳝鳝鳜鲟鲎鱠鳣鱨鲚鱮鱲鱵鱻鲅鳦凫鳯鳲鳷鳻鴂鴃鴄鸩鴈鴎鸰鴔鴗鸳鸯鸲鹆鸱鴠鴢鸪鴥鸸鹋鴳鸻鴷鴽鵀鵁鸺鹁鵖鵙鹈鹕鹅鵟鵩鹌鵫鵵鵷鵻鹍鶂鶊鶏鶒鹙鶗鶡鶤鶦鶬鶱鹟鶵鶸鶹鹡鶿鹚鷁鷃鷄鷇䴘䴘鷊鷏鹧鷕鹥鸷鷞鷟鸶鹪鹩鷩鷫鷭鹇鹇鸴鷾䴙鸂鸇䴙鸏鸑鸒鸓鸬鹳鸜鹂鹸咸鹾麀麂麃麄麇麋麌麐麑麒麚麛麝麤麸面麫麮麯麰麺麾黁黈黉黢黒黓黕黙黝黟黥黦黧黮黰黱黪黶黹黻黼黾鼋鼂鼃鼅鼈鼍鼏鼐鼒冬鼖鼙鼚鼛鼡鼩鼱鼪鼫鼯鼷鼽齁齆齇齈齉齌赍齑龀齕齗龅齚龇齞龃龉龆齢出齧齩齮齯齰齱齵齾厐龑龒龚龖龘龝龡龢龤" + +traditional_characters = "制咖片型超聲盤鑒定仔點他命書歌粉巾字帳恤手指記憶棒形轉彎溝光○〇㐄㐅㐆㐌㐖毒㐜㐡㐤㐰㐺㑇㑳㒳㒸㔾㗂㗎㝵㞎㞙㞞㠯㢲㢴㤅㥁㥯㨗㫺㬎㮎㮚㮸㲋㲱㲾㳮㵎㵪㶸㷖㷭㹢㹴犬㺢狓㺵㼝㽮㿝䍃䔢䖟䖸䗈䗥䗪䝓䠶䥯䦉䯝䰾魚䲔䳗䳘䵹鼄䶑一對應映射丁不識下兒子做二休世丘之貉並中台原則串為甚謂乾淨了百事無成八變五十些人得道雞升天代如併來去個國政策勁幽靈在歐洲遊蕩接樣蘿蔔坑側化傳價元論醇共再准刀兩斷切分耕耘收穫錢貨物向看舊就緒險刻千金動勞永逸匙零夜半卡通回復返影蹤反常態口咬氣句話同吐快吹周味呼諾嗚品紅鍋哄而散起唱和問三知生熟團漆黑火糟堆場空塊麵塌糊塗塵染壁廂夔已足多情露水大早到晚夫妻當關萬莫開失古恨套所料既往孔見提師要家主審寸陰難買鬥牛小撮部陣局展身層巴掌帆風順席地帶過年計於春頭載四季期被蛇怕井繩度願式份彈頃深前律徑心意念差愁孤行俱全房廳交遮打技長把抓死拿眼淚鼻涕鑰鎖折段抿拍即合掃排掬揮撥擁上入擊洞擲攬改故轍敗文值名斑方面旁族日秋餐隔雅里終父旦時晌會霎間晃暴寒曝更月望垠際朝夕本正經利杯羹東西板枝獨秀根筋桿進條龍服務概模次函數又性程總付步腳印趨登毛拔呵氧氮碳決雌雄波未平派謊言流清楚白準溜煙潭有獲聞是處降琴鶴甲病發可拾沙目然瞭直以相眨穿睹瞥瞬矢的解石鳥神教秉虔誠秘種窩蜂窮竅笑置筆苟勾銷抹殺煞等獎箍節吃箭仇雙鵰詩籌籮筐系列紙級士官統絲毫掛維網盡線微吭響股腦胎脈承腔臂力致效資源址器舉功投般說講規貿易葉障著慎滿皆輸號木電池衣傾鐘高低視仁覺醒覽遺角銀幣觸潰九鼎蔽抄出駟馬追重語破貧洗貫走路安蹴至幾蹶振躍役膽汗較輩輪辭贊退六連遍遞邊針血錘音錯門思閃真倒項栽霧類保護川先驚乍體鬨鱗爪鳴滴泡鄰域黨專鼓作齊炒丑烯亥克內酯冬加奴卯肝炎基尺梁街褲鎬客寵庭巳汝昌烷玲磊糖肇酉醛啷青縣韙良香骨鯛丂七集河市弦喜嘴張舌堵區工業姊妹星架構巧彩扭歪拼湊餘熱曜武州爺浮屠美鄉老階樹葷素碎落能魄鰓鰻珠丄丅丆万俟丈尚摸母娘量管群亞虎必我堂令申件裝伏位博俠義界表女墟臺戲臭皮匠勝諸葛亮賽頂倍催請運算包立叉戟離疫苗土史志演圍揭瓦曬夷姑婆帝村寶爛尖杉鹼屜桌山岔島由紀峽壩庫鎮廢從德後拗湯治旬食明昧曹朋友框欄極權冪曲歸依貓民氟硼氯磷鐵江侗自旅法司洋浦梅園溫暖灣焦班幸用田略番疊皇炮捶硝苯酸腺苷稜草鏡穗跳遠索錦綱聚氰胺聯店胚膲愛色堇紫羅蘭芝茶飯菱雲蟲藏藩亂叛蘇親債凳學座恐戀柱測肌腹衩錐係貂企烏跪叩軍車農題迭都甘油屯奏鍵短阿姨陪姐隻顧茅廬槽駕魂鮮鹿頁其菜單乘任供勢午齒漢組織吊調瀉唇坡城報墳外夸將尉建築岸崗公床揚新劍昇杭林栗校樓標款汽社浣海商館劇院鋼華港機械廣媒環球融第醫科證券綜財樂育游漲猶嶺疏癮瞼確兵領導繳肢膛船艾瑟爾蒼蔡虞傚衫覆訪訴課諭議軌述野鉤限敵鞋頜頷顎饒首齦站例修凡劃垂屆屬崽頦廚拜挫擺放旋削棋榻檻禮沉注滑營獄畫确儀聘花葬詔員跌轄週達酒錨閘陷陸雨雪飛威丌于丹久乏予理評產亢卑亦乎舞己悲矩圓詞害誌但住佞佳便俗信票案幅翁倦倫假偏倚斜虧鬼敲停備傷脾胃僅此像儉匱免宜穴焉戴兼容許凍伯仲負彼晝皂軒輊實刊划顛衛戰哥比省非好黃飾別
拘束掩奶睬選擇搖擾煩苦枚寫協厭及格受歡迎約只估侵犯割狀告或缺抗拒挽撤救藥喻磨滅端倪少逆逾越避靠適吉譽吝玉含延咎歹聽啻淵善謀均勻堪忍夠太惹妙妥妨孕症孝術室完納推冠積宣疑辯慄碴稱屈撓屑干涉衡待很忙惡忿怎麼怠急恥恭息悅惑惜惟想愉愧怍慌憤啟懂懈懷材才緊招認扣抵拉捨也罷插揣冒搭撞南牆擴核支攻敢雷攀敬裡嗎需景智暇曾罪遇朽枉止況競爭辱求癒渝溶濟左右袒困補爽特寂寞示弱找謝畏強疾徐痛癢冤符眠睦瞅董何厚云措活疲羞者輕玻璃祥兆禁移稂莠穩佛換答簡結果盟絕縷途給談否羈翼耐肖脛毋寧興舒若菲萊痕跡窠臼虛衰臉兔撒鷹棺範該詳諱抬泰讓鬚眉象眾貲賬費灰賴奇慮訓輟辨菽麥辛近送透逞徒速續逮捕遂遑違遜斧鉞艱醉鏽隨觀棄顯飽脂肪使丏丐幫丒且慢末丕替桃宗王尊涼爵各圖屋脊糧署錄壇吾祿職胄襲君廈丗北壑桐疹損逢陵鷸丙寅戌氨腈唑綸辰酮脫氫酶醚丞丟現掉紗帽弄扯砲碗丠両丣坐存激肩臻蒂蓮悖序驅丨丩丫挺杈髻鬟細介俄伊犁京尼布訂普渡央委監察檢查劑圈設警隊斯督剩震境航舶革防托播促質版蠑螈鋒研藝歷殘消頻譜精密製造陲郵候埔堅壓壢凹匯執府究邦俘攝寮彬狼嶽肺腫庸英訊診埋粒胞括控碼韓暑槍樞砥澳哇牟壽甸鑽探篇簽綴縫繼耳肯照婦埃懸璧軸櫃檯辣擱淺邪跑纖阮陽私囊魔丮丰姿采丱燒丳丵丶丷丸參寨朗桂瑞砂衷霞貌鳳僕艦因嫌宰峰幹絡牌持旨祭禱簿編罰賓辦丼丿乀乂乃乄仰慕盛曠留考驗闊乆乇么醜麼乊湖燃乑乒乓乕乖僻忤戾离謬迕乗危肥劫除隙浪婿乙炔腸酰吡咯鹽乚乛乜嘢卿玄宮尾狐龜塔嶷兄弟泉章霄釘耙乞扎哀憐恕討乢乣乤乥乧乨乩童乪乫乭乳暈汁液瑤漿牙癌突竇罩腐膠豬酪蛋糕菌瘤乴乵乶乷乸乹乺乼乾俸冰嘉噦嚎坤媽屍壘旱枯涸俐渴潮澀煸豆燥爹瘦癟癬瞪袋脆薑貝隆餾乿亀亁叫咕攘扔搞男砸竄蓬麻亃亄亅卻亇遲典今臨繁累卵奉婚聰躬巨與遷添裂副宿歲怪噁尕崙愣杆硅硫鈦鈾錳芑雜異鈉砷胂磺琥珀艙棍簧胡茬盜浩盆販郎腿亍洪亐互欠助勉惠操斥諉繫戶譯亓墓碑刑鈴卅渠繽紛斗米旗憲釩燈徽瘟祖拳福穀豐臟腑綁肉醃苓蘊橋鋪霸顏鬧判噴岡底蛙陘礦亖亙亜罕們娜桑那努哈喀弗烈曼松森杜氏盃奧琛敦戊穆聖裔彙薛孫亟亡佚虜羊牢奮釋卷卸契媾感額睫纏誼趾塞擠紐阻還配馳莊亨洛祚亪享津滬畿郊慈菴枇杷膏亭閣鋥麗亳亶亹誅初責翻瘋偶傑叢稠妖拖寰居吸授慧蝸吞壯魅狗矛盾益渣患憂稀描猿夢暫涯畜禍緣沸搜引擎臣橫紜誰混援蒸獸獅稅剖亻亼亽亾什獻剎邡麽仂仃仄仆富怨仈仉畢昔晨殼紹仍仏仒仕宦仗欺恃腰嘆歎炬梓訖施仙后瓊逝仚仝仞仟悔仡佬償填泊拓撲簇羔購頓欽佩髮棻閫馭養億儆尤藉幀賑凌敘帖李柔剛沃眥睚戒訛取饗讀仨仫仮著泳臥躺韶夏裁仳仵唯賢憑釣誕仿似宋彿諷伀碩盼鵝伄儅伈伉儷柯始娃邁戈坦堡帕茨薩廟瑪莉莎藤霍姆伋伍奢胥廷芳豪伎倆侍汛勒希羲雛伐憩整謨閑閒伕伙伴頤伜伝伢叔恆茲恩翰伱伲侶伶俜悧鼬伸懶縮喇叭伹伺伻伽倻輻伾佀佃佇佈喬妮墨佉盧佌貸劣廉昂檔濃矮傘窪緩耗胸谷迷擋率齲宅沫舍療佐貳佑佔優據鏵嘗呢須魯曉佗佘余坪寺瓜銃僧蒙芒陀龕哼嘔坊姦孽弊揖祟繭縛誓賊佝僂瞀佟你奪趕佡佢佣佤佧賈佪佫佯佰佱潔績釀餚佴捲佶佷佸佹佺佻佼佽佾具喚窘壞娛怒慨硬習慣聾膨脹蔓駭貴痺侀侁侂侃侄侅鴻燕侇侈糜靡侉侌妾侏儒倉鼠侐侑侔侖侘侚鏈侜偎傍鈷循柳葫蘆附価侮罵蔑侯岩截蝕侷貼壺嬛宴捷攜桶箋酌俁狹膝狄俅俉俊俏俎俑俓俔諺俚俛黎健呈固墒增守康箱濕祐鏢鑣槓盒靖膜齡俞豹獵噪孚封札筒託衍鴿剪撰稿煉廠禊練繕葺俯瞰撐衝俲俳俴俵俶俷俺俻俾倀倂倅儲卒惶敷猝逃頡蓄崇隱倌倏忽刺蠟燭噍嚼坍扁抽斃蔥楣灌灶糞背藪賣賠閉霉騰倓倔倖倘倜儻倝借箸挹澆閱倡狂倢倣値倥傯倨傲倩匡嗣沖柝珍倬倭寇猩倮倶倷倹勤讚偁偃充偽吏嗓寐惺扮拱芫茜藉虢鈔偈偉晶偌宕距析濾殿疼癱註頗偓偕鴨歇滯偝偟偢忘怡旺偨偩偪偫偭偯偰偱偲偵緝蹄偷減惰漏窺竊偸偺迹傀儡傅傈僳傌籬傎奎琳迪叟芭傒傔傕傖悉荒傜傞傢傣芽逼傭婢傮睨寄檄誦謠頌傴擔辜弓慘蒿悼疤傺傻屄臆巢洩篋羨蓋軋頹傿儸僄僇僉僊働僎僑僔僖僚僝僞僣僤僥僦猴僨僩僬僭僮僯僰僱僵殖籤靜僾僿征隴儁儂儃儇儈朴薄儊儋儌儍儐儓儔儕儗儘儜儞儤儦儩汰哉寡渥裕酷儭儱罐儳儵儹儺儼儽兀臬臲鷲允勛勳宙宵帥憝彞諧嫂鬩暢沛溢盈飢赫兇悍狠猛頑愚妣斬秦遣鞭耀敏榮槃澤爆碟磁禿纜輝霽鹵朵婁孜烽醬勃汀箕裘鉗耶懞蕾徹兌軟遭黜兎児韻媳爸兕觥兗兙兛兜售鍪肚兝兞兟兡兢兣樽殮涅睡稟籍贅泌啡肽奸幕涵澇熵疚眷稃襯訌赴煥椒殲植跏沒試誤猜棲窗肋袖頰兪卦撇鬍岐廓轎疸楓茴瓏廁秩募勺噸寓斤曆畝迫筷釐最淫螺韜兮寬匪篩襄贏軛複兲詐刃堰戎痞蟻餉它冀鑄冂冃円冇冉冊嫁厲礪竭醮冏牧冑冓冔冕冖冗冘冞冢窄抑誣冥冫烘菇蟄冷凝坨橇淇淋炭餅磚磧窖醋雕雹霜冱冶爐艷嘲峻灘淡漠煖颼飲冼冽凃凄愴梗凅凇凈凊凋敝濛凔凜遵汞脢凞几凢処凰凱凵凶焰凸摺刷紋預喪嘍奔巡榜殯芙蓉租籠輯鞘萃凼鋸鑊刁蠻刂娩崩批拆攤掰櫱驟歧顆秒袂贓勿囑忌磋琢膚刈羽刎訟戮舂槳艇刓刖霹靂刜創犢刡恙墅幟筵緻刦刧刨昏默攸尿慾薰潤薰圭刪刮痧鏟刱刲刳刴刵踏磅戳柏槐繡芹莧蝟舟銘鵠鶩刼剁剃辮剄剉履鉛剋剌姻咽哨廊掠桅沿召瞻翅趙卜渺茫郭剒剔剕瀝剚愎毅訥纔剜剝啄採剞剟剡剣剤綵剮腎駛黏
剰袍剴紊剷剸剺剽剿劁劂劄劈啪柴扳啦劉奭姥夼昫涓熙禪禹錫翔雁鶚劊劌弩柄蜻蛉劒劓劖劘劙瀾簣賞磯釜晉甜薪逐劦熔紂虐赤囚劬劭労劵効劻劼劾峭艮勅勇勵勍勐臘脖龐漫飼盪粥輒勖勗勘驕餒碌泮雇捐竹騎殊阱勣樸懇謹勦勧勩勯勰勱勲勷勸懲慰誡諫勹芡踐闌匁庇拯粟紮袱裹餃匆遽匈匉匊匋匍匐莖匏匕妝痰膿蛹齋苑烤蹈塘羌熊閥螳螂疆碚竿緯荷茵邙魏匚匜匝匟扶稷匣匭攏匸匹耦匽匾匿卂叮瘡禧軫堤棚迢鈞鍊卄卆遐卉瓷盲瓶噹胱腱裸卋卌卍卐怯污賤鄙齷齪陋卓溪唐梯漁陳棗泥漳潯澗梨芬譙贍轅迦鄭単驢弈洽鰲卛占筮卝卞卟吩啉屎翠厄卣卨卪卬卮榫襖璽綬鈕蚤懼殆篤聳卲帘帙繞卹卼卽厂厎厓厔厖厗奚厘厙厜厝諒厠厤厥厪膩孢厮厰厳厴厹厺粕垢蕪菁厼厾叁悟茸薯叄吵笄悌哺譏坫壟弧芯杠潛嬰芻袁詰貪諜煽饋駁収岳締災賄騙叚叡吻攔蘑蜜訣燧玩硯箏椎藺銅逗驪另覓叨嘮謁杵姓喊嚷囂咚嚀塑尋惱憎擦祇泣滲蝠叱吒咄咤喝籀黛舵舷叵叶鐸懿昭穰苴遼叻叼吁塹嫖賭瞧爬衆抒吅吆夥巹橡滌抱縱摩郡唁墜扇籃膀襪頸吋愾諮酬哭妓媛暗錶韁邇妃羿絮蕃渾拐葵暮隅吔吖啶嗪戚吜嗇噬嚥吟哦詠吠吧唧嗒咐吪雋咀徵燐苞茹鈣哧吮吰吱嘎吲哚吳棟嬌窟孟簫忠晗淞闔閭趼宇吶睛噓拂捧疵熄竽笛糠吼吽呀呂韋矇呃呆笨呇貢呉罄呋喃呎呏呔呠呡癡呣呤呦呧瑛眩扒晬淑姬瑜璇鵑呪呫嗶嚅囁呬呯呰呱呲咧噌鈍呴呶呷呸呺呻哱咻嘯嚕籲坎坷邏呿咁咂咆哮咇咈咋蟹煦珅藹咍咑咒詛咔噠嚓咾噥哩喱咗咠咡咢咣咥咦咨嗟詢咩咪咫嚙齧咭咮咱咲咳嗆嗽咴咷咸咹咺咼喉咿婉慟憫賦矜綠茗藍哂搶瞞哆嗦囉噻啾濱彗哋哌哎唷喲哏哐哞哢哤哪裏哫啼喘哰哲萎蚌哳哶哽哿唄唅唆唈唉唎唏嘩堯棣殤璜睿肅唔睇唕唚唞唣喳唪唬唰喏唲唳唵嘛唶唸唹唻唼唾唿啁啃鸚鵡啅埠棧榷祺舖鞅飆啊啍啎啐啓啕啖啗啜啞祈啢啣啤啥啫啱啲啵啺饑啽噶崑沁喁喂喆裙喈嚨喋喌喎喑喒喓喔粗喙幛慶滋鵲喟喣喤喥喦喧騷喨喩梆喫葡萄喭駝挑嚇碰樅瓣純皰藻趟鉻喵営喹喺喼喿嗀嗃嗄嗅嗈嗉嗊嗍嗐嗑嗔詬嗕嗖嗙嗛嗜痂癖嗝嗡嗤嗥嗨嗩嗬嗯嗰嗲嗵嘰嗷嗹嗾嗿嘀嘁嘂嘅惋嘈峪禾蔭嘊嘌嘏嘐嘒嘓嘖嘚嘜嘞嘟囔嘣嘥嘦嘧嘬嘭這謔嚴敞饞鬆嘵嘶嘷嘸蝦嘹嘻嘽嘿噀噂噅噇噉噎噏噔噗噘噙噚噝噞噢噤蟬皿噩噫噭噯噱噲噳嚏涌灑欲巫霏噷噼嚃嚄嚆抖嚌嚐嚔囌嚚嚜嚞嚟嚦嚬嚭嚮嚯嚲嚳飭按竣苛嚵嚶囀囅囈膪謙囍囒囓囗囘蕭酚飄濺諦囝溯眸紇鑾鶻囟殉囡団囤囥囧囨囪囫圇囬囮囯囲図囶囷囸囹圄圉擬囻囿圀圂圃圊粹蠹赦圌墾圏滾鯡鑿枘圕圛圜圞坯埂壤骸炕祠窯豚紳魠鯪鱉圧握圩圪垯圬圮圯炸岬幔毯祇窨菩溉圳圴圻圾坂坆沾坋坌舛壈昆墊墩椅坒坓坩堝坭坰坱坳坴坵坻坼楊掙涎簾垃垈垌垍垓垔垕垗垚垛垝垣垞垟垤垧垮垵垺垾垿埀畔埄埆埇埈埌殃隍埏埒埕埗埜埡埤埦埧埭埯埰埲埳埴埵埶紼埸培怖樁礎輔埼埽堀訶姪廡堃堄摧磐貞韌砌堈堉堊堋堌堍堎堖堙堞堠礁堧堨輿堭堮蜓摘堲堳堽堿塁塄塈煤塋棵塍塏塒塓綢塕鴉沽虱塙塚塝繆塡塢塤塥塩塬塱塲蟎塼塽塾塿墀墁墈墉墐夯増毀墝墠墦漬缽墫墬墮墰墺墻櫥壅壆壊壌壎壒榨蒜壔壕壖壙壚壜壝壠壡壬壭壱売壴壹壻壼寢壿夂夅夆変夊夌漱邑夓腕泄甥禦骼夗夘夙袞瑙妊娠醣梟珊鶯鷺戧幻魘夤蹀祕擂鶇姚宛閨嶼庾撻拇賛蛤裨菠氅漓撈湄蚊霆鯊箐篆篷荊肆舅荔鮃巷慚骰辟邱鎔鐮阪漂燴鯢鰈鱷鴇臚鵬妒峨譚枰晏璣癸祝秤竺牡籟恢罡螻蠍賜絨御梭夬夭砣榆怙枕夶夾餡奄崛葩譎奈賀祀贈奌奐奓奕訢詝奘奜奠奡奣陶奨奩魁奫奬奰媧孩貶隸酥宄狡猾她奼嫣妁氈荼皋膻蠅嬪妄妍嫉媚嬈妗趣妚妞妤礙妬婭妯娌妲妳妵妺姁姅姉姍姒姘姙姜姝姞姣姤姧姫姮娥姱姸姺姽婀娀誘懾脅娉婷娑娓娟娣娭娯娵娶娸娼婊婐婕婞婤婥谿孺婧婪婬婹婺婼婽媁媄媊媕媞媟媠媢媬媮媯媲媵媸媺媻媼眯媿嫄嫈嫋嫏嫕嫗嫘嫚嫜嫠嫡嫦嫩嫪毐嫫嫬嫰嫵嫺嫻嫽嫿嬀嬃嬅嬉耍嬋痴豔嬔嬖嬗嬙嬝嬡嬢嬤嬦嬬嬭幼嬲嬴嬸嬹嬾嬿孀孃孅孌孏曰癲屏孑孓雀孖斟簍謎摺孛矻鳩崮軻祜鸞孥邈毓棠臏孬孭孰孱孳孵泛罔銜孻孿宀宁宂拙株薇掣撫琪瓿榴謐彌宊濂祁瑕宍宏碁宓邸讞実潢町宥宧宨宬徵崎駿掖闕臊煮禽蠶宸豫寀寁寥寃簷庶寎暄磣寔寖寘寙寛寠苫寤肘洱濫蒗陝覈寪弘綽螽寳擅疙瘩晷対檐専尃尅贖絀繚疇釁尌峙醌襟痲碧屁昊槌淘恵瀑牝畑莓缸羚覷蔻髒躁尒尓銳尗尙尜尟尢尥尨尪尬尭尰擒尲尶尷尸尹潽蠖蛾尻釦梢蚴鰭脬蹲屇屌蚵屐屓挪屖屘屙屛屝屢屣巒嶂巖舄屧屨屩屪屭屮戍駐鉀崖嵛巔旮旯楂欖櫸芋茱萸靛麓屴屹屺屼岀岊岌岍阜岑彭鞏岒岝岢嵐岣岧岨岫岱岵岷峁峇峋峒峓峞峠嵋峩峯峱峴峹峿崀崁崆禎崋崌崍嶇崐崒崔嵬巍螢顥崚崞崟崠崢巆崤崦崧殂崬崱崳崴崶崿嵂嵇嵊泗嵌嵎嵒嵓嵗嵙嵞嵡嵩嵫嵯嵴嵼嵾嶁嶃嶄晴嶋嶌嶒嶓嶔嶗嶙嶝嶞嶠嶡嶢嶧嶨嶭嶮嶰嶲嶴嶸巂巃巇巉巋巌巓巘巛滇芎巟巠弋迴巣巤炊擘蜥蟒蠱覡巰蜀彥淖杏茂甫楞巻巽幗巿帛斐鯽蕊帑帔帗帚琉汶帟帡帣帨帬帯帰帷帹暆幃幄幇幋幌幏幘幙幚幞幠幡幢幦幨幩幪幬幭幯幰遙蹉跎餘庚鑑幵幷稚邃庀庁広庄庈庉笠庋跋庖犧庠庤庥鯨庬庱庳庴庵馨衢庹庿廃廄廆廋廌廎廏廐廑廒廕廖廛廝搏鑼廞弛袤廥廧廨廩廱綿踵髓廸廹甌鄴廻廼廾廿躔弁皺弇弌弍弎弐弒弔詭憾薦弝弢弣弤弨弭弮弰弳霖繇燾斌旭溥騫弶弸弼弾彀彄彆纍糾彊彔彖彘彟彠陌彤貽彧繪虹彪炳彫蔚鷗彰癉彲彳彴彷彷徉徨彸彽
踩斂旆徂徇徊渭畬鉉裼従筌徘徙徜徠膳甦萌漸徬徭醺徯徳徴潘徻徼忀瘁胖燎怦悸顫扉犀澎湃砰恍惚絞隘忉憚挨餓忐忑忒忖応忝忞耿忡忪忭忮忱忸怩忻悠懣怏遏怔怗怚怛怞懟黍訝怫怭懦怱怲怳怵惕怸怹恁恂恇恉恌恏恒恓恔恘恚恛恝恞恟恠恣恧眄恪恫恬澹恰恿悀悁悃悄悆悊悐悒晦悚悛悜悝悤您悩悪悮悰悱悽惻悳悴悵惘悶悻悾惄愫鍾蒐惆惇惌惎惏惓惔惙惛耄惝瘧濁惥惦惪惲惴惷惸拈愀愃愆愈愊愍愐愑愒愓愔愕愙氓蠢騃昵愜赧愨愬愮愯愷愼慁慂慅慆慇靄慉慊慍慝慥慪慫慬慱慳慴慵慷慼焚憀灼鬱憃憊憋憍眺捏軾憒憔憖憙憧憬憨憪憭憮憯憷憸憹憺懃懅懆邀懊懋懌懍懐懞懠懤懥懨懫懮懰懱毖懵遁樑雍懺懽戁戄戇戉戔戕戛戝戞戠戡戢戣戤戥戦戩戭戯轟戱披菊牖戸戹戺戻戼戽鍬扂楔扃扆扈扊杖牽絹銬鐲賚扐摟攪烊盹瞌跟躉鑔靶鼾払扗玫腮扛扞扠扡扢盔押扤扦扱罾揄綏鞍郤窾扻扼扽抃抆抈抉抌抏瞎抔繯縊擻抜抝択抨摔歉躥牾抶抻搐泵菸拃拄拊髀拋拌脯拎拏拑擢秧沓曳攣迂拚拝拠拡拫拭拮踢拴拶拷攢拽掇芥橐簪摹疔挈瓢驥捺蹻挌挍挎挐揀挓挖掘浚挙揍聵挲挶挾挿捂捃捄捅捆捉捋胳膊揎捌捍捎軀蛛捗捘捙捜捥捩捫捭据捱捻捼捽掀掂掄臀膘掊掎掏掐笙掔掗掞棉芍掤搪闡掫掮掯揉掱掲掽掾揃揅揆搓揌諢揕揗揘揜揝揞揠揥揩揪揫櫫遒麈揰揲揵揶揸揹揺搆搉搊搋搌搎搔搕撼櫓搗搘搠搡搢搣搤搥搦搧搨搬楦褳訕赸搯搰搲搳搴搵搷搽搾搿摀摁摂摃摎摑摒摓跤摙摛摜摞摠摦睺羯摭摮摯摰摲摳摴摶摷摻摽撂撃撅稻撊撋撏鐧潑撕撙撚撝撟撢撣撦撧撩撬撱朔撳蚍蜉撾撿擀擄闖擉缶觚擐擕擖擗擡擣擤澡腚擧擨擩擫擭擯擰擷擸擼擽擿攃攄攆攉攥攐攓攖攙攛每攩攫轡澄攮攰攲攴軼攷砭訐攽碘敁敃敇敉敍敎筏敔敕敖閏誨敜煌敧敪敱敹敺敻敿斁衽斄牒縐謅斉斎斕鶉讕駮鱧斒筲斛斝斞斠斡斢斨斫斮晾沂潟穎絳邵斲斸釳於琅斾斿旀旂旃旄渦旌旎旐旒旓旖旛旝旟旡旣浴旰獺魃旴旹旻旼旽昀昃昄昇昉晰躲澈熹皎皓礬昑昕昜昝昞昡昤暉筍昦昨昰昱昳昴昶昺昻晁蹇隧蔬髦晄晅晒晛晜晞晟晡晢晤晥曦晩萘瑩顗晿暁暋暌暍暐暔暕煅暘暝暠暡曚暦暨暪朦朧暱暲殄馮暵暸暹暻暾曀曄曇曈曌曏曐曖曘曙曛曡曨曩駱曱甴肱曷牘禺錕曽滄耽朁朅朆杪栓誇竟粘絛朊膺朏朐朓朕朘朙瞄覲溘饔飧朠朢朣柵椆澱蝨朩朮朰朱炆璋鈺熾鹮朳槿朶朾朿杅杇杌隉欣釗湛漼楷瀍煜玟纓翱肈舜贄适逵杓杕杗杙荀蘅杝杞脩珓筊杰榔狍閦顰緬莞杲杳眇杴杶杸杻杼枋枌枒枓衾葄翹紓逋枙狸椏枟槁枲枳枴枵枷枸櫞枹枻柁柂柃柅柈柊柎某柑橘柒柘柙柚柜柞櫟柟柢柣柤柩柬柮柰柲橙柶柷柸柺査柿栃栄栒栔栘栝栟栢栩栫栭栱栲栳栴檀栵栻桀驁桁鎂桄桉桋桎梏椹葚桓桔桕桜桟桫欏桭桮桯桲桴桷桹湘溟梃梊梍梐潼梔梘梜梠梡梣梧梩梱梲梳梴梵梹棁棃櫻棐棑棕櫚簑繃蓑棖棘棜棨棩棪棫棬棯棰棱棳棸棹槨棼椀椄苕椈椊椋椌椐椑椓椗検椤椪椰椳椴椵椷椸椽椿楀楄楅篪楋楍楎楗楘楙楛楝楟楠楢楥楨楩楪楫楬楮楯楰楳楸楹楻楽榀榃榊榎槺榕榖榘榛狉莽榜笞榠榡榤榥榦榧榪榭榰榱槤霰榼榾榿槊閂槎槑槔槖様槜槢槥槧槪槭槮槱槲槻槼槾樆樊樏樑樕樗樘樛樟樠樧樨権樲樴樵猢猻樺樻罍樾樿橁橄橆橈笥龠橕橚橛輛橢橤橧豎膈跨橾橿檁檃檇檉檍檎檑檖檗檜檟檠檣檨檫檬檮檳檴檵檸櫂櫆櫌櫛櫜櫝櫡櫧櫨櫪櫬櫳櫹櫺茄櫽欀欂欃欐欑欒欙欞溴欨欬欱欵欶欷歔欸欹欻欼欿歁歃歆艎歈歊蒔蝶歓歕歘歙歛歜歟歠蹦詮鑲蹣跚陞陟歩歮歯歰歳歴璞歺瞑歾歿殀殈殍殑殗殜殙殛殞殢殣殥殪殫殭殰殳荃殷殸殹蛟殻殽謗毆毈毉餵毎毑蕈毗毘毚茛鄧毧毬毳毷毹毽毾毿氂氄氆靴氉氊氌氍氐聊氕氖気氘氙氚氛氜氝氡洶焊痙氤氳氥氦鋁鋅氪烴氬銨痤汪滸漉痘盂碾菖蒲蕹蛭螅氵氷氹氺氽燙氾氿渚汆汊汋汍汎汏汐汔汕褟汙汚汜蘺沼穢衊汧汨汩汭汲汳汴隄汾沄沅沆瀣沇沈葆浸淪湎溺痼痾沌沍沏沐沔沕沘浜畹礫沚沢沬沭沮沰沱灢沴沷籽沺烹濡洄泂肛泅泆湧肓泐泑泒泓泔泖泙泚泜泝泠漩饃濤粼濘蘚鰍泩泫泭泯銖泱泲洇洊涇琵琶荽薊箔洌洎洏洑潄濯洙洚洟洢洣洧洨洩痢滔洫洮洳洴洵洸洹洺洼洿淌蜚浄浉浙贛渫浠浡浤浥淼瀚浬浭翩萍浯浰蜃淀苔蛞蝓蜇螵蛸煲鯉浹浼浽溦涂涊涐涑涒涔滂涖涘涙涪涫涬涮涴涶涷涿淄淅淆淊淒黯淓淙漣淜淝淟淠淢淤淥淦淩猥藿褻淬淮淯淰淳詣淶紡淸淹燉癯綺渇済渉渋渓渕渙渟渢滓渤澥渧渨渮渰渲渶渼湅湉湋湍湑湓湔黔湜湝湞湟湢湣湩湫湮麟湱湲湴湼満溈溍溎溏溛舐漭溠溤溧馴溮溱溲溳溵溷溻溼溽溾滁滃滉滊滎滏稽滕滘滙滝滫滮羼耷滷滹滻煎漈漊漎繹漕漖漘漙漚漜漪漾漥漦漯漰漵漶漷濞潀潁潎潏潕潗潚潝潞潠潦祉瘍潲潵潷潸潺潾潿澁澂澃澉澌澍澐澒澔澙澠澣澦澧澨澫澬澮澰澴澶澼熏郁濆濇濈濉濊貊濔疣濜濠濩觴濬濮盥濰濲濼瀁瀅瀆瀋瀌瀏瀒瀔瀕瀘瀛瀟瀠瀡瀦瀧瀨瀬瀰瀲瀳瀵瀹瀺瀼灃灄灉灋灒灕灖灝灞灠灤灥灨灩灪蜴灮燼獴灴灸灺炁炅魷炗炘炙炤炫疽烙釺炯炰炱炲炴炷燬炻烀烋瘴鯧烓烔焙烜烝烳飪烺焃焄耆焌焐焓焗焜焞焠焢焮焯焱焼煁煃煆煇煊熠煍熬煐煒煕煗燻礆霾煚煝煟煠煢矸煨瑣煬萁煳煺煻熀熅熇熉羆熒穹熗熘熛熜稔諳爍熤熨熯熰眶螞熲熳熸熿燀燁燂燄盞燊燋燏燔隼燖燜燠燡燦燨燮燹燻燽燿爇爊爓爚爝爟爨蟾爯爰爲爻爿爿牀牁牂牄牋牎牏牓牕釉牚腩蒡虻牠雖蠣牣牤牮牯牲牳牴牷牸牼絆牿靬犂犄犆犇犉犍犎犒犖犗犛犟犠犨犩犪犮犰
狳犴犵犺狁甩狃狆狎狒獾狘狙黠狨狩狫狴狷狺狻豕狽蜘猁猇猈猊猋猓猖獗猗猘猙獰獁猞猟獕猭猱猲猳猷猸猹猺玃獀獃獉獍獏獐獒獘獙獚獜獝獞獠獢獣獧鼇蹊獪獫獬豸獮獯鬻獳獷獼玀玁菟玅玆玈珉糝禛郅玍玎玓瓅玔玕玖玗玘玞玠玡玢玤玥玦玨瑰玭玳瑁玶玷玹玼珂珇珈瑚珌饈饌珔珖珙珛珞珡珣珥珧珩珪珮珶珷珺珽琀琁隕琊琇琖琚琠琤琦琨琫琬琭琮琯琰琱琲瑯琹琺琿瑀瑂瑄瑉瑋瑑瑔瑗瑢瑭瑱瑲瑳瑽瑾瑿璀璨璁璅璆璈璉璊璐璘璚璝璟璠璡璥璦璩璪璫璯璲璵璸璺璿瓀瓔瓖瓘瓚瓛臍瓞瓠瓤瓧瓩瓮瓰瓱瓴瓸瓻瓼甀甁甃甄甇甋甍甎甏甑甒甓甔甕甖甗飴蔗甙詫鉅粱盎銹糰甡褥産甪甬甭甮甯鎧甹甽甾甿畀畁畇畈畊畋畎畓畚畛畟鄂畤畦畧荻畯畳畵畷畸畽畾疃疉疋疍疎簞疐疒疕疘疝疢疥疧疳疶疿痁痄痊痌痍痏痐痒痔痗瘢痚痠痡痣痦痩痭痯痱痳痵痻痿瘀瘂瘃瘈瘉瘊瘌瘏瘐瘓瘕瘖瘙瘚瘛瘲瘜瘝瘞瘠瘥瘨瘭瘮瘯瘰癧瘳癘瘵瘸瘺瘻瘼癃癆癇癈癎癐癔癙癜癠癤癥癩蟆癪癭癰発踔紺蔫酵皙砬砒翎翳蘞鎢鑞皚鵯駒鱀粵褶皀皁莢皃鎛皈皌皐皒硃皕皖皘皜皝皞皤皦皨皪皫皭糙綻皴皸皻皽盅盋盌盍盚盝踞盦盩鞦韆盬盭眦睜瞤盯盱眙裰盵盻睞眂眅眈眊県眑眕眚眛眞眢眣眭眳眴眵眹瞓眽郛睃睅睆睊睍睎睏睒睖睙睟睠睢睥睪睪睯睽睾瞇瞈瞋瞍逛瞏瞕瞖瞘瞜瞟瞠瞢瞫瞭瞳瞵瞷瞹瞽闍瞿矓矉矍鑠矔矗矙矚矞矟矠矣矧矬矯矰矱硪碇磙罅舫阡、矼矽礓砃砅砆砉砍砑砕砝砟砠砢砦砧砩砫砮砳艏砵砹砼硇硌硍硎硏硐硒硜硤硨磲茚鋇硭硻硾碃碉碏碣碓碔碞碡碪碫碬碭碯碲碸碻礡磈磉磎磑磔磕磖磛磟磠磡磤磥蹭磪磬磴磵磹磻磽礀礄礅礌礐礚礜礞礤礧礮礱礲礵礽礿祂祄祅祆禳祊祍祏祓祔祕祗祘祛祧祫祲祻祼餌臠錮禂禇禋禑禔禕隋禖禘禚禜禝禠禡禢禤禥禨禫禰禴禸稈秈秊闈颯秌秏秕笈蘵賃秠秣秪秫秬秭秷秸稊稌稍稑稗稙稛稞稬稭稲稹稼顙稾穂穄穇穈穉穋穌貯穏穜穟穠穡穣穤穧穨穭穮穵穸窿闃窀窂窅窆窈窕窊窋窌窒窓窔窞窣窬黷蹙窰窳窴窵窶窸窻竁竃竈竑竜竝竦竪篦篾笆鮫竾笉笊笎笏笐靨笓笤籙笪笫笭笮笰笱笲笳笵笸笻筀筅筇筈筎筑筘筠筤筥筦筧筩筭筯筰筱筳筴讌筸箂箇箊箎箑箒箘箙箛箜篌箝箠箬鏃箯箴箾篁篔簹篘篙篚篛篜篝篟篠篡篢篥篧篨篭篰篲篳篴篶篹篼簀簁簃簆簉簋簌簏簜簟簠簥簦簨簬簰簸簻籊籐籒籓籔籖籚籛籜籣籥籧籩籪籫籯芾麴籵籸籹籼粁粃粋粑粔糲粛粞粢粧粨粲粳粺粻粽闢粿糅糆糈糌糍糒糔萼糗蛆蹋糢糨糬糭糯糱糴糶糸糺紃蹼鰹黴紆紈絝紉閩襻紑紕紘錠鳶鷂紝紞紟紥紩紬紱紲紵紽紾紿絁絃絅経絍絎絏縭褵絓絖絘絜絢絣螯絪絫聒絰絵絶絺絻絿綀綃綅綆綈綉綌綍綎綑綖綘継続緞綣綦綪綫綮綯綰罟蝽綷縩綹綾緁緄緅緆緇緋緌緎総緑緔緖緗緘緙緜緡緤緥緦纂緪緰緱緲緶緹縁縃縄縈縉縋縏縑縕縗縚縝縞縟縠縡縢縦縧縯縰騁縲縳縴縵縶縹縻衙縿繄繅繈繊繋繐繒繖繘繙繠繢繣繨繮繰繸繻繾纁纆纇纈纉纊纑纕纘纙纚纛缾罃罆罈罋罌罎罏罖罘罛罝罠罣罥罦罨罫罭鍰罳罶罹罻罽罿羂羃羇羋蕉51鴕羑羖羗羜羝羢羣羥羧羭羮羰羱羵羶羸藜鮐翀翃翄翊翌翏翕翛翟翡翣翥翦躚翪翫翬翮翯翺翽翾翿闆饕鴰鍁耋耇耎耏耑耒耜耔耞耡耤耨耩耪耬耰鬢耵聹聃聆聎聝聡聦聱聴聶聼閾聿肄肏肐肕腋肙肜肟肧胛肫肬肭肰肴肵肸肼胊胍胏胑胔胗胙胝胠銓胤胦胩胬胭胯胰胲胴胹胻胼胾脇脘脝脞脡脣脤脥脧脰脲脳腆腊腌臢腍腒腓腖腜腠腡腥腧腬腯踝蹬鐐腴腶蠕誹膂膃膆膇膋膔膕膗膙膟黐膣膦膫膰膴膵膷膾臃臄臇臈臌臐臑臓臕臖臙臛臝臞臧蓐詡臽臾臿舀舁鰟鮍舋舎舔舗舘舝舠舡舢舨舭舲舳舴舸舺艁艄艅艉艋艑艕艖艗艘艚艜艟艣艤艨艩艫艬艭荏艴艶艸艹艻艿芃芄芊萰陂藭芏芔芘芚蕙芟芣芤茉芧芨芩芪芮芰鰱芴芷芸蕘豢芼芿苄苒苘苙苜蓿苠苡苣蕒苤苧苪鎊苶苹苺苻苾茀茁范蠡萣茆茇茈茌茍茖茞茠茢茥茦菰茭茯茳藨茷藘茼荁荄荅荇荈菅蜢鴞荍荑荘荳荵荸薺莆莒莔莕莘莙莚莛莜莝莦莨菪莩莪莭莰莿菀菆菉菎菏菐菑菓菔菕菘菝菡菢菣菥蓂菧菫轂鎣菶菷菹醢菺菻菼菾萅萆萇萋萏萐萑萜萩萱萴萵萹萻葇葍葎葑葒葖葙葠葥葦葧葭葯葳葴葶葸葹葽蒄蒎蒓蘢薹蒞蒟蒻蒢蒦蒨蒭藁蒯蒱鉾蒴蒹蒺蒽蓀蓁蓆蓇蓊蓌蓍蓏蓓蓖蓧蓪蓫蓽跣藕蓯蓰蓱蓴蓷蓺蓼蔀蔂蔃蔆蔇蔉蔊蔋蔌蔎蔕蔘蔙蔞蔟鍔蔣雯蔦蔯蔳蔴蔵蔸蔾蕁蕆蕋蕍蕎蕐蕑蕓蕕蕖蕗蕝蕞蕠蕡蕢蕣蕤蕨蕳蕷蕸蕺蕻薀薁薃薅薆薈薉薌薏薐薔薖薘薙諤釵薜薠薢薤薧薨薫薬薳薶薷薸薽薾薿藄藇藋藎藐藙藚藟藦藳藴藶藷藾蘀蘁蘄蘋蘗蘘蘝蘤蘧蘩蘸蘼虀虆虍蟠虒虓虖虡虣虥虩虯虰蛵虵虷鱒虺虼蚆蚈蚋蚓蚔蚖蚘蚜蚡蚣蚧蚨蚩蚪蚯蚰蜒蚱蚳蚶蚹蚺蚻蚿蛀蛁蛄蛅蝮蛌蛍蛐蟮蛑蛓蛔蛘蛚蛜蛡蛣蜊蛩蛺蛻螫蜅蜆蜈蝣蜋蜍蜎蜑蠊蜛餞蜞蜣蜨蜩蜮蜱蜷蜺蜾蜿蝀蝃蝋蝌蝍蝎蝏蝗蝘蝙蝝鱝蝡蝤蝥蝯蝰蝱蝲蝴蝻螃蠏螄螉螋螒螓螗螘螙螚蟥螟螣螥螬螭螮螾螿蟀蟅蟈蟊蟋蟑蟓蟛蟜蟟蟢蟣蟨蟪蟭蟯蟳蟶蟷蟺蟿蠁蠂蠃蠆蠋蠐蠓蠔蠗蠙蠚蠛蠜蠧蠨蠩蠭蠮蠰蠲蠵蠸蠼蠽衁衂衄衇衈衉衋衎衒衕衖衚衞裳鈎衭衲衵衹衺衿袈裟袗袚袟袢袪袮袲袴袷袺袼褙袽裀裉裊裋裌裍裎裒裛裯裱裲裴裾褀褂褉褊褌褎褐褒褓褔褕褘褚褡褢褦褧褪褫褭褯褰褱襠褸褽褾襁襃襆襇襉襋襌襏襚襛襜襝襞襡襢襤襦襫襬襭襮襴襶襼襽襾覂覃覅覇覉覊覌覗覘覚覜覥覦覧覩覬覯覰観覿觔觕觖觜觽觝觡酲觩觫觭觱觳觶觷觼觾觿言賅訃訇訏訑訒詁
託訧訬訳訹証訾詀詅詆譭詈詊詎詑詒詖詗詘詧詨詵詶詸詹詻詼詿誂誃誄鋤誆誋誑誒誖誙誚誥誧説読誯誶誾諂諄諆諌諍諏諑諕諗諛諝諞諟諠諡諴諵諶諼謄謆謇謌謍謏謑謖謚謡謦謪謫謳謷謼謾譁譅譆譈譊譌譒譔譖鑫譞譟譩譫譬譱譲譴譸譹譾讅讆讋讌讎讐讒讖讙讜讟谽豁豉豇豈豊豋豌豏豔豞豖豗豜豝豣豦豨豭豱豳豵豶豷豺豻貅貆貍貎貔貘貙貜貤饜貰餸貺賁賂賏賒賕賙賝賡賧賨賫鬭賮賵賸賺賻賾贇贉贐贔贕贗赬赭赱赳迄趁趂趄趐趑趒趔趡趦趫趮趯趲趴趵趷趹趺趿跁跂跅跆躓蹌跐跕跖跗跙跛跦跧跩跫跬跮跱跲跴跺跼跽踅踆踈踉踊踒踖踘踜踟躇躕踠踡踣踤踥踦踧蹺踫踮踰踱踴踶踹踺踼踽躞蹁蹂躪蹎蹐蹓蹔蹕蹚蹜蹝蹟蹠蹡蹢躂蹧蹩蹪蹯鞠蹽躃躄躅躊躋躐躑躒躘躙躛躝躠躡躦躧躩躭躰躳躶軃軆輥軏軔軘軜軝齶転軥軨軭軱軲轆軷軹軺軽軿輀輂輦輅輇輈輓輗輙輜輞輠輤輬輭輮輳輴輵輶輹輼輾轀轇轏轑轒轔轕轖轗轘轙轝轞轢轤辠辢辤辵辶辺込辿迅迋迍麿迓迣迤邐迥迨迮迸迺迻迿逄逅逌逍逑逓逕逖逡逭逯逴逶逹遄遅遉遘遛遝遢遨遫遯遰遴遶遹遻邂邅邉邋邎邕邗邘邛邠邢邧邨邯鄲邰邲邳邴邶邷邽邾邿郃郄郇郈郔郕郗郙郚郜郝郞郟郠郢郪郫郯郰郲郳郴郷郹郾郿鄀鄄鄆鄇鄈鄋鄍鄎鄏鄐鄑鄒鄔鄕鄖鄗鄘鄚鄜鄞鄠鄢鄣鄤鄦鄩鄫鄬鄮鄯鄱鄶鄷鄹鄺鄻鄾鄿酃酅酆酇酈酊酋酎酏酐酣酔酕醄酖酗酞酡酢酤酩酴酹酺醁醅醆醊醍醐醑醓醖醝醞醡醤醨醪醭醯醰醱醲醴醵醸醹醼醽醾釂釃釅釆釈鱸鎦閶釓釔釕鈀釙鼢鼴釤釧釪釬釭釱釷釸釹鈁鈃鈄鈆鈇鈈鈊鈌鈐鈑鈒鈤鈥鈧鈬鈮鈰鈳鐺鈸鈹鈽鈿鉄鉆鉈鉋鉌鉍鉏鉑鉕鉚鉢鉥鉦鉨鉬鉭鉱鉲鉶鉸鉺鉼鉿銍銎銑銕鏤銚銛銠銣銤銥銦銧銩銪銫銭銰銲銶銻銼銾鋂鋃鋆鋈鋊鋌鋍鋏鋐鋑鋕鋘鋙鋝鋟鋦鋨鋩鋭鋮鋯鋰鋱鋳鋹鋺鋻鏰鐱錀錁錆錇錈錍錏錒錔錙錚錛錞錟錡錤錩錬録錸錼鍀鍆鍇鍉鍍鍏鍐鍘鍚鍛鍠鍤鍥鍩鍫鍭鍱鍴鍶鍹鍺鍼鍾鎄鎇鎉鎋鎌鎍鎏鎒鎓鎗鎘鎚鎞鎡鎤鎩鎪鎭鎯鎰鎳鎴鎵鎸鎹鎿鏇鏊鏌鏐鏑鏖鏗鏘鏚鏜鏝鏞鏠鏦鏨鏷鏸鏹鏻鏽鏾鐃鐄鐇鐏鐒鐓鐔鐗馗鐙鐝鐠鐡鐦鐨鐩鐫鐬鐱鐳鐶鐻鐽鐿鑀鑅鑌鑐鑕鑚鑛鑢鑤鑥鑪鑭鑯鑱鑴鑵鑷钁钃镻閆閈閌閎閒閔閗閟閡関閤閤閧閬閲閹閺閻閼閽閿闇闉闋闐闑闒闓闘闚闞闟闠闤闥阞阢阤阨阬阯阹阼阽陁陑陔陛陜陡陥陬騭陴険陼陾隂隃隈隒隗隞隠隣隤隩隮隰顴隳隷隹雂雈雉雊雎雑雒雗雘雚雝雟雩雰雱驛霂霅霈霊霑霒霓霙霝霢霣霤霨霩霪霫霮靁靆靉靑靚靣靦靪靮靰靳靷靸靺靼靿鞀鞃鞄鞌鞗鞙鞚鞝鞞鞡鞣鞨鞫鞬鞮鞶鞹鞾韃韅韉馱韍韎韔韖韘韝韞韡韣韭韮韱韹韺頀颳頄頇頊頍頎頏頒頖頞頠頫頬顱頯頲頴頼顇顋顑顒顓顔顕顚顜顢顣顬顳颭颮颱颶颸颺颻颽颾颿飀飂飈飌飜飡飣飤飥飩飫飮飱飶餀餂餄餎餇餈餑餔餕餖餗餚餛餜餟餠餤餧餩餪餫餬餮餱餲餳餺餻餼餽餿饁饅饇饉饊饍饎饐饘饟饢馘馥馝馡馣騮騾馵馹駃駄駅駆駉駋駑駓駔駗駘駙駜駡駢駪駬駰駴駸駹駽駾騂騄騅騆騉騋騍騏驎騑騒験騕騖騠騢騣騤騧驤騵騶騸騺驀驂驃驄驆驈驊驌驍驎驏驒驔驖驙驦驩驫骺鯁骫骭骯骱骴骶骷髏骾髁髂髄髆髈髐髑髕髖髙髝髞髟髡髣髧髪髫髭髯髲髳髹髺髽髾鬁鬃鬅鬈鬋鬎鬏鬐鬑鬒鬖鬗鬘鬙鬠鬣鬪鬫鬬鬮鬯鬰鬲鬵鬷魆魈魊魋魍魎魑魖鰾魛魟魣魦魨魬魴魵魸鮀鮁鮆鮌鮎鮑鮒鮓鮚鮞鮟鱇鮠鮦鮨鮪鮭鮶鮸鮿鯀鯄鯆鯇鯈鯔鯕鯖鯗鯙鯠鯤鯥鯫鯰鯷鯸鯿鰂鰆鶼鰉鰋鰐鰒鰕鰛鰜鰣鰤鰥鰦鰨鰩鰮鰳鰶鰷鱺鰼鰽鱀鱄鱅鱆鱈鱎鱐鱓鱔鱖鱘鱟鱠鱣鱨鱭鱮鱲鱵鱻鲅鳦鳧鳯鳲鳷鳻鴂鴃鴄鴆鴈鴎鴒鴔鴗鴛鴦鴝鵒鴟鴠鴢鴣鴥鴯鶓鴳鴴鴷鴽鵀鵁鵂鵓鵖鵙鵜鶘鵞鵟鵩鵪鵫鵵鵷鵻鵾鶂鶊鶏鶒鶖鶗鶡鶤鶦鶬鶱鶲鶵鶸鶹鶺鶿鷀鷁鷃鷄鷇鷈鷉鷊鷏鷓鷕鷖鷙鷞鷟鷥鷦鷯鷩鷫鷭鷳鷴鷽鷾鷿鸂鸇鸊鸏鸑鸒鸓鸕鸛鸜鸝鹸鹹鹺麀麂麃麄麇麋麌麐麑麒麚麛麝麤麩麪麫麮麯麰麺麾黁黈黌黢黒黓黕黙黝黟黥黦黧黮黰黱黲黶黹黻黼黽黿鼂鼃鼅鼈鼉鼏鼐鼒鼕鼖鼙鼚鼛鼡鼩鼱鼪鼫鼯鼷鼽齁齆齇齈齉齌齎齏齔齕齗齙齚齜齞齟齬齠齢齣齧齩齮齯齰齱齵齾龎龑龒龔龖龘龝龡龢龤" + +assert len(simplified_charcters) == len(simplified_charcters) + +s2t_dict = {} +t2s_dict = {} +for i, item in enumerate(simplified_charcters): + s2t_dict[item] = traditional_characters[i] + t2s_dict[traditional_characters[i]] = item + + +def tranditional_to_simplified(text: str) -> str: + return "".join([t2s_dict[item] if item in t2s_dict else item for item in text]) + + +ANCHOR_CHAR = "▁" + + +def prepare_onnx_input( + tokenizer, + labels: List[str], + char2phonemes: Dict[str, 
List[int]], + chars: List[str], + texts: List[str], + query_ids: List[int], + use_mask: bool = False, + window_size: int = None, + max_len: int = 512, +) -> Dict[str, np.array]: + if window_size is not None: + truncated_texts, truncated_query_ids = _truncate_texts( + window_size=window_size, texts=texts, query_ids=query_ids + ) + input_ids = [] + token_type_ids = [] + attention_masks = [] + phoneme_masks = [] + char_ids = [] + position_ids = [] + + for idx in range(len(texts)): + text = (truncated_texts if window_size else texts)[idx].lower() + query_id = (truncated_query_ids if window_size else query_ids)[idx] + + try: + tokens, text2token, token2text = tokenize_and_map( + tokenizer=tokenizer, text=text + ) + except Exception: + print(f'warning: text "{text}" is invalid') + return {} + + text, query_id, tokens, text2token, token2text = _truncate( + max_len=max_len, + text=text, + query_id=query_id, + tokens=tokens, + text2token=text2token, + token2text=token2text, + ) + + processed_tokens = ["[CLS]"] + tokens + ["[SEP]"] + + input_id = list(np.array(tokenizer.convert_tokens_to_ids(processed_tokens))) + token_type_id = list(np.zeros((len(processed_tokens),), dtype=int)) + attention_mask = list(np.ones((len(processed_tokens),), dtype=int)) + + query_char = text[query_id] + phoneme_mask = ( + [1 if i in char2phonemes[query_char] else 0 for i in range(len(labels))] + if use_mask + else [1] * len(labels) + ) + char_id = chars.index(query_char) + position_id = text2token[query_id] + 1 # [CLS] token locate at first place + + input_ids.append(input_id) + token_type_ids.append(token_type_id) + attention_masks.append(attention_mask) + phoneme_masks.append(phoneme_mask) + char_ids.append(char_id) + position_ids.append(position_id) + + outputs = { + "input_ids": np.array(input_ids).astype(np.int64), + "token_type_ids": np.array(token_type_ids).astype(np.int64), + "attention_masks": np.array(attention_masks).astype(np.int64), + "phoneme_masks": 
np.array(phoneme_masks).astype(np.float32), + "char_ids": np.array(char_ids).astype(np.int64), + "position_ids": np.array(position_ids).astype(np.int64), + } + return outputs + + +def _truncate_texts( + window_size: int, texts: List[str], query_ids: List[int] +) -> Tuple[List[str], List[int]]: + truncated_texts = [] + truncated_query_ids = [] + for text, query_id in zip(texts, query_ids): + start = max(0, query_id - window_size // 2) + end = min(len(text), query_id + window_size // 2) + truncated_text = text[start:end] + truncated_texts.append(truncated_text) + + truncated_query_id = query_id - start + truncated_query_ids.append(truncated_query_id) + return truncated_texts, truncated_query_ids + + +def rule(C, V, R, T): + """Generate a syllable given the initial, the final, erhua indicator, and tone. + Orthographical rules for pinyin are applied. (special case for y, w, ui, un, iu) + + Note that in this system, 'ü' is alway written as 'v' when appeared in phoneme, but converted to + 'u' in syllables when certain conditions are satisfied. + + 'i' is distinguished when appeared in phonemes, and separated into 3 categories, 'i', 'ii' and 'iii'. + Erhua is possibly applied to every finals, except for finals that already ends with 'r'. + When a syllable is impossible or does not have any characters with this pronunciation, return None + to filter it out. 
+ """ + + # 不可拼的音节, ii 只能和 z, c, s 拼 + if V in ["ii"] and (C not in ["z", "c", "s"]): + return None + # iii 只能和 zh, ch, sh, r 拼 + if V in ["iii"] and (C not in ["zh", "ch", "sh", "r"]): + return None + + # 齐齿呼或者撮口呼不能和 f, g, k, h, zh, ch, sh, r, z, c, s + if ( + (V not in ["ii", "iii"]) + and V[0] in ["i", "v"] + and (C in ["f", "g", "k", "h", "zh", "ch", "sh", "r", "z", "c", "s"]) + ): + return None + + # 撮口呼只能和 j, q, x l, n 拼 + if V.startswith("v"): + # v, ve 只能和 j ,q , x, n, l 拼 + if V in ["v", "ve"]: + if C not in ["j", "q", "x", "n", "l", ""]: + return None + # 其他只能和 j, q, x 拼 + else: + if C not in ["j", "q", "x", ""]: + return None + + # j, q, x 只能和齐齿呼或者撮口呼拼 + if (C in ["j", "q", "x"]) and not ((V not in ["ii", "iii"]) and V[0] in ["i", "v"]): + return None + + # b, p ,m, f 不能和合口呼拼,除了 u 之外 + # bm p, m, f 不能和撮口呼拼 + if (C in ["b", "p", "m", "f"]) and ( + (V[0] in ["u", "v"] and V != "u") or V == "ong" + ): + return None + + # ua, uai, uang 不能和 d, t, n, l, r, z, c, s 拼 + if V in ["ua", "uai", "uang"] and C in ["d", "t", "n", "l", "r", "z", "c", "s"]: + return None + + # sh 和 ong 不能拼 + if V == "ong" and C in ["sh"]: + return None + + # o 和 gkh, zh ch sh r z c s 不能拼 + if V == "o" and C in [ + "d", + "t", + "n", + "g", + "k", + "h", + "zh", + "ch", + "sh", + "r", + "z", + "c", + "s", + ]: + return None + + # ueng 只是 weng 这个 ad-hoc 其他情况下都是 ong + if V == "ueng" and C != "": + return + + # 非儿化的 er 只能单独存在 + if V == "er" and C != "": + return None + + if C == "": + if V in ["i", "in", "ing"]: + C = "y" + elif V == "u": + C = "w" + elif V.startswith("i") and V not in ["ii", "iii"]: + C = "y" + V = V[1:] + elif V.startswith("u"): + C = "w" + V = V[1:] + elif V.startswith("v"): + C = "yu" + V = V[1:] + else: + if C in ["j", "q", "x"]: + if V.startswith("v"): + V = re.sub("v", "u", V) + if V == "iou": + V = "iu" + elif V == "uei": + V = "ui" + elif V == "uen": + V = "un" + result = C + V + + # Filter er 不能再儿化 + if result.endswith("r") and R == "r": + return None + + # ii and 
iii, change back to i + result = re.sub(r"i+", "i", result) + + result = result + R + T + return result + + +def generate_lexicon(with_tone=False, with_erhua=False): + """Generate lexicon for Mandarin Chinese.""" + syllables = OrderedDict() + + for C in [""] + INITIALS: + for V in FINALS: + for R in [""] if not with_erhua else ["", "r"]: + for T in [""] if not with_tone else ["1", "2", "3", "4", "5"]: + result = rule(C, V, R, T) + if result: + syllables[result] = f"{C} {V}{R}{T}" + return syllables + + +def _truncate( + max_len: int, + text: str, + query_id: int, + tokens: List[str], + text2token: List[int], + token2text: List[Tuple[int]], +): + truncate_len = max_len - 2 + if len(tokens) <= truncate_len: + return (text, query_id, tokens, text2token, token2text) + + token_position = text2token[query_id] + + token_start = token_position - truncate_len // 2 + token_end = token_start + truncate_len + font_exceed_dist = -token_start + back_exceed_dist = token_end - len(tokens) + if font_exceed_dist > 0: + token_start += font_exceed_dist + token_end += font_exceed_dist + elif back_exceed_dist > 0: + token_start -= back_exceed_dist + token_end -= back_exceed_dist + + start = token2text[token_start][0] + end = token2text[token_end - 1][1] + + return ( + text[start:end], + query_id - start, + tokens[token_start:token_end], + [i - token_start if i is not None else None for i in text2token[start:end]], + [(s - start, e - start) for s, e in token2text[token_start:token_end]], + ) + + +def get_phoneme_labels( + polyphonic_chars: List[List[str]], +) -> Tuple[List[str], Dict[str, List[int]]]: + labels = sorted(list(set([phoneme for char, phoneme in polyphonic_chars]))) + char2phonemes = {} + for char, phoneme in polyphonic_chars: + if char not in char2phonemes: + char2phonemes[char] = [] + char2phonemes[char].append(labels.index(phoneme)) + return labels, char2phonemes + + +def get_char_phoneme_labels( + polyphonic_chars: List[List[str]], +) -> Tuple[List[str], Dict[str, 
List[int]]]: + labels = sorted( + list(set([f"{char} {phoneme}" for char, phoneme in polyphonic_chars])) + ) + char2phonemes = {} + for char, phoneme in polyphonic_chars: + if char not in char2phonemes: + char2phonemes[char] = [] + char2phonemes[char].append(labels.index(f"{char} {phoneme}")) + return labels, char2phonemes + + +def wordize_and_map(text: str): + words = [] + index_map_from_text_to_word = [] + index_map_from_word_to_text = [] + while len(text) > 0: + match_space = re.match(r"^ +", text) + if match_space: + space_str = match_space.group(0) + index_map_from_text_to_word += [None] * len(space_str) + text = text[len(space_str) :] + continue + + match_en = re.match(r"^[a-zA-Z0-9]+", text) + if match_en: + en_word = match_en.group(0) + + word_start_pos = len(index_map_from_text_to_word) + word_end_pos = word_start_pos + len(en_word) + index_map_from_word_to_text.append((word_start_pos, word_end_pos)) + + index_map_from_text_to_word += [len(words)] * len(en_word) + + words.append(en_word) + text = text[len(en_word) :] + else: + word_start_pos = len(index_map_from_text_to_word) + word_end_pos = word_start_pos + 1 + index_map_from_word_to_text.append((word_start_pos, word_end_pos)) + + index_map_from_text_to_word += [len(words)] + + words.append(text[0]) + text = text[1:] + return words, index_map_from_text_to_word, index_map_from_word_to_text + + +def tokenize_and_map(tokenizer, text: str): + words, text2word, word2text = wordize_and_map(text=text) + + tokens = [] + index_map_from_token_to_text = [] + for word, (word_start, word_end) in zip(words, word2text): + word_tokens = tokenizer.tokenize(word) + + if len(word_tokens) == 0 or word_tokens == ["[UNK]"]: + index_map_from_token_to_text.append((word_start, word_end)) + tokens.append("[UNK]") + else: + current_word_start = word_start + for word_token in word_tokens: + word_token_len = len(re.sub(r"^##", "", word_token)) + index_map_from_token_to_text.append( + (current_word_start, current_word_start + 
word_token_len) + ) + current_word_start = current_word_start + word_token_len + tokens.append(word_token) + + index_map_from_text_to_token = text2word + for i, (token_start, token_end) in enumerate(index_map_from_token_to_text): + for token_pos in range(token_start, token_end): + index_map_from_text_to_token[token_pos] = i + + return tokens, index_map_from_text_to_token, index_map_from_token_to_text + + +def _load_config(config_path: os.PathLike): + import importlib.util + + spec = importlib.util.spec_from_file_location("__init__", config_path) + config = importlib.util.module_from_spec(spec) + spec.loader.exec_module(config) + return config + + +default_config_dict = { + "manual_seed": 1313, + "model_source": "bert-base-chinese", + "window_size": 32, + "num_workers": 2, + "use_mask": True, + "use_char_phoneme": False, + "use_conditional": True, + "param_conditional": { + "affect_location": "softmax", + "bias": True, + "char-linear": True, + "pos-linear": False, + "char+pos-second": True, + "char+pos-second_lowrank": False, + "lowrank_size": 0, + "char+pos-second_fm": False, + "fm_size": 0, + "fix_mode": None, + "count_json": "train.count.json", + }, + "lr": 5e-5, + "val_interval": 200, + "num_iter": 10000, + "use_focal": False, + "param_focal": {"alpha": 0.0, "gamma": 0.7}, + "use_pos": True, + "param_pos ": { + "weight": 0.1, + "pos_joint_training": True, + "train_pos_path": "train.pos", + "valid_pos_path": "dev.pos", + "test_pos_path": "test.pos", + }, +} + + +def load_config(config_path: os.PathLike, use_default: bool = False): + config = _load_config(config_path) + if use_default: + for attr, val in default_config_dict.items(): + if not hasattr(config, attr): + setattr(config, attr, val) + elif isinstance(val, dict): + d = getattr(config, attr) + for dict_k, dict_v in val.items(): + if dict_k not in d: + d[dict_k] = dict_v + return config + + +def predict( + session, onnx_input: Dict[str, Any], labels: List[str] +) -> Tuple[List[str], List[float]]: + 
all_preds = [] + all_confidences = [] + probs = session.run( + [], + { + "input_ids": onnx_input["input_ids"], + "token_type_ids": onnx_input["token_type_ids"], + "attention_mask": onnx_input["attention_masks"], + "phoneme_mask": onnx_input["phoneme_masks"], + "char_ids": onnx_input["char_ids"], + "position_ids": onnx_input["position_ids"], + }, + )[0] + + preds = np.argmax(probs, axis=1).tolist() + max_probs = [] + for index, arr in zip(preds, probs.tolist()): + max_probs.append(arr[index]) + all_preds += [labels[pred] for pred in preds] + all_confidences += max_probs + + return all_preds, all_confidences + + +class G2PWOnnxConverter: + def __init__( + self, + model_dir: str, + style: str = "bopomofo", + model_source: str = None, + enable_non_tradional_chinese: bool = False, + ): + if not is_dep_available("onnxruntime"): + raise RuntimeError("Please install ONNX Runtime first.") + + import onnxruntime + + sess_options = onnxruntime.SessionOptions() + sess_options.graph_optimization_level = ( + onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL + ) + sess_options.execution_mode = onnxruntime.ExecutionMode.ORT_SEQUENTIAL + sess_options.intra_op_num_threads = 2 + self.punc = "、:,;。?!“”‘’':,;.?!" 
+ self.rhy_phns = ["sp1", "sp2", "sp3", "sp4"] + self.session_g2pW = onnxruntime.InferenceSession( + os.path.join(model_dir, "g2pW.onnx"), sess_options=sess_options + ) + self.config = load_config( + config_path=os.path.join(model_dir, "config.py"), use_default=True + ) + + self.model_source = model_source if model_source else self.config.model_source + self.enable_opencc = enable_non_tradional_chinese + self.tokenizer = BertTokenizer.from_pretrained(self.config.model_source) + self.vocab_phones = {} + self.vocab_tones = {} + with open( + os.path.join(model_dir, "phone_id_map.txt"), "rt", encoding="utf-8" + ) as f: + phn_id = [line.strip().split() for line in f.readlines()] + for phn, id in phn_id: + self.vocab_phones[phn] = int(id) + self.pinyin2phone = generate_lexicon(with_tone=True, with_erhua=False) + polyphonic_chars_path = os.path.join(model_dir, "POLYPHONIC_CHARS.txt") + monophonic_chars_path = os.path.join(model_dir, "MONOPHONIC_CHARS.txt") + self.polyphonic_chars = [ + line.split("\t") + for line in open(polyphonic_chars_path, encoding="utf-8") + .read() + .strip() + .split("\n") + ] + self.non_polyphonic = { + "一", + "不", + "和", + "咋", + "嗲", + "剖", + "差", + "攢", + "倒", + "難", + "奔", + "勁", + "拗", + "肖", + "瘙", + "誒", + "泊", + "听", + "噢", + } + self.non_monophonic = {"似", "攢"} + self.monophonic_chars = [ + line.split("\t") + for line in open(monophonic_chars_path, encoding="utf-8") + .read() + .strip() + .split("\n") + ] + self.labels, self.char2phonemes = ( + get_char_phoneme_labels(polyphonic_chars=self.polyphonic_chars) + if self.config.use_char_phoneme + else get_phoneme_labels(polyphonic_chars=self.polyphonic_chars) + ) + + self.chars = sorted(list(self.char2phonemes.keys())) + + self.with_erhua = False + self.polyphonic_chars_new = set(self.chars) + for char in self.non_polyphonic: + if char in self.polyphonic_chars_new: + self.polyphonic_chars_new.remove(char) + + self.monophonic_chars_dict = { + char: phoneme for char, phoneme in 
self.monophonic_chars + } + for char in self.non_monophonic: + if char in self.monophonic_chars_dict: + self.monophonic_chars_dict.pop(char) + + self.pos_tags = ["UNK", "A", "C", "D", "I", "N", "P", "T", "V", "DE", "SHI"] + + with open( + os.path.join(model_dir, "bopomofo_to_pinyin_wo_tune_dict.json"), + "r", + encoding="utf-8", + ) as fr: + self.bopomofo_convert_dict = json.load(fr) + self.style_convert_func = { + "bopomofo": lambda x: x, + "pinyin": self._convert_bopomofo_to_pinyin, + }[style] + + with open( + os.path.join(model_dir, "char_bopomofo_dict.json"), "r", encoding="utf-8" + ) as fr: + self.char_bopomofo_dict = json.load(fr) + + if self.enable_opencc: + import opencc as OpenCC + + self.cc = OpenCC("s2tw") + + def _pinyin2p(self, pinyins: List[str], words: List[str]) -> List[str]: + import jieba.posseg as psg + + sub_initials = [] + sub_finals = [] + phones_list = [] + for seg in words: + seg = re.sub("[a-zA-Z]+", "", seg) + # [(word, pos), ...] + seg_cut = psg.lcut(seg) + # 为了多音词获得更好的效果,这里采用整句预测 + phones = [] + initials = [] + finals = [] + pre_word_length = 0 + for word, pos in seg_cut: + sub_initials = [] + sub_finals = [] + now_word_length = pre_word_length + len(word) + + # skip english word + if pos == "eng": + pre_word_length = now_word_length + continue + + word_pinyins = pinyins[pre_word_length:now_word_length] + + for word_pinyin, char in zip(word_pinyins, word): + if word_pinyin is None: + word_pinyin = char + + word_pinyin = word_pinyin.replace("u:", "v") + + if word_pinyin in self.pinyin2phone: + initial_final_list = self.pinyin2phone[word_pinyin].split(" ") + if len(initial_final_list) == 2: + sub_initials.append(initial_final_list[0]) + sub_finals.append(initial_final_list[1]) + elif len(initial_final_list) == 1: + sub_initials.append("") + sub_finals.append(initial_final_list[1]) + else: + # If it's not pinyin (possibly punctuation) or no conversion is required + sub_initials.append(word_pinyin) + sub_finals.append(word_pinyin) + + 
pre_word_length = now_word_length + initials.append(sub_initials) + finals.append(sub_finals) + initials = sum(initials, []) + finals = sum(finals, []) + for c, v in zip(initials, finals): + # NOTE: post process for pypinyin outputs + # we discriminate i, ii and iii + if c and c not in self.punc: + phones.append(c) + # replace punctuation by `sp` + if c and c in self.punc: + phones.append("sp") + + if v and v not in self.punc and v not in self.rhy_phns: + phones.append(v) + phones_list.append(phones) + return phones_list + + def _p2id(self, phonemes: List[str]) -> np.ndarray: + """ + Phoneme to Index + """ + # replace unk phone with sp + phonemes = [phn if phn in self.vocab_phones else "sp" for phn in phonemes] + phone_ids = [self.vocab_phones[item] for item in phonemes] + return np.array(phone_ids, np.int64) + + def _convert_bopomofo_to_pinyin(self, bopomofo: str) -> str: + tone = bopomofo[-1] + assert tone in "12345" + component = self.bopomofo_convert_dict.get(bopomofo[:-1]) + if component: + return component + tone + else: + print(f'Warning: "{bopomofo}" cannot convert to pinyin') + return None + + def __call__(self, sentences: List[str]) -> List[List[str]]: + if isinstance(sentences, str): + sentences = [sentences] + + if self.enable_opencc: + translated_sentences = [] + for sent in sentences: + translated_sent = self.cc.convert(sent) + assert len(translated_sent) == len(sent) + translated_sentences.append(translated_sent) + sentences = translated_sentences + + texts, query_ids, sent_ids, partial_results = self._prepare_data( + sentences=sentences + ) + if len(texts) == 0: + # sentences no polyphonic words + phones = self._pinyin2p(partial_results[0], sentences) + phone_ids = self._p2id(phones[0]) + return { + "phones": phones[0], + "phone_ids": phone_ids, + "pinyins": partial_results[0], + } + + onnx_input = prepare_onnx_input( + tokenizer=self.tokenizer, + labels=self.labels, + char2phonemes=self.char2phonemes, + chars=self.chars, + texts=texts, + 
query_ids=query_ids, + use_mask=self.config.use_mask, + window_size=None, + ) + + preds, confidences = predict( + session=self.session_g2pW, onnx_input=onnx_input, labels=self.labels + ) + if self.config.use_char_phoneme: + preds = [pred.split(" ")[1] for pred in preds] + + results = partial_results + for sent_id, query_id, pred in zip(sent_ids, query_ids, preds): + results[sent_id][query_id] = self.style_convert_func(pred) + phones = self._pinyin2p(results[0], texts) + phone_ids = self._p2id(phones[0]) + return {"phones": phones[0], "phone_ids": phone_ids, "pinyins": results[0]} + + def _prepare_data( + self, sentences: List[str] + ) -> Tuple[List[str], List[int], List[int], List[List[str]]]: + texts, query_ids, sent_ids, partial_results = [], [], [], [] + for sent_id, sent in enumerate(sentences): + # pypinyin works well for Simplified Chinese than Traditional Chinese + sent_s = tranditional_to_simplified(sent) + pypinyin_result = pinyin( + sent_s, neutral_tone_with_five=True, style=Style.TONE3 + ) + partial_result = [None] * len(sent) + for i, char in enumerate(sent): + if char in self.polyphonic_chars_new: + texts.append(sent) + query_ids.append(i) + sent_ids.append(sent_id) + elif char in self.monophonic_chars_dict: + partial_result[i] = self.style_convert_func( + self.monophonic_chars_dict[char] + ) + elif char in self.char_bopomofo_dict: + partial_result[i] = pypinyin_result[i][0] + partial_result[i] = self.style_convert_func( + self.char_bopomofo_dict[char][0] + ) + else: + partial_result[i] = pypinyin_result[i][0] + + partial_results.append(partial_result) + return texts, query_ids, sent_ids, partial_results diff --git a/paddlex/inference/models/text_to_pinyin/result.py b/paddlex/inference/models/text_to_pinyin/result.py new file mode 100644 index 0000000000..c5132a36aa --- /dev/null +++ b/paddlex/inference/models/text_to_pinyin/result.py @@ -0,0 +1,21 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ...common.result import BaseResult + + +class TextToPinyinResult(BaseResult): + + def __init__(self, data: dict) -> None: + super().__init__(data) diff --git a/paddlex/inference/models/text_to_speech_acoustic/__init__.py b/paddlex/inference/models/text_to_speech_acoustic/__init__.py new file mode 100644 index 0000000000..2cd9f28023 --- /dev/null +++ b/paddlex/inference/models/text_to_speech_acoustic/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .predictor import Fastspeech2Predictor diff --git a/paddlex/inference/models/text_to_speech_acoustic/predictor.py b/paddlex/inference/models/text_to_speech_acoustic/predictor.py new file mode 100644 index 0000000000..394ea0ef94 --- /dev/null +++ b/paddlex/inference/models/text_to_speech_acoustic/predictor.py @@ -0,0 +1,65 @@ +# Copyright (c) 2025 PaddlePaddle Authors. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ....modules.text_to_speech_acoustic.model_list import MODELS +from ...common.batch_sampler import AudioBatchSampler +from ..base import BasePredictor +from .result import Fastspeech2Result + + +class Fastspeech2Predictor(BasePredictor): + + entities = MODELS + + def __init__(self, *args, **kwargs): + """Initializes FastspeechPredictor. + + Args: + *args: Arbitrary positional arguments passed to the superclass. + **kwargs: Arbitrary keyword arguments passed to the superclass. + """ + super().__init__(*args, **kwargs) + self.infer = self.create_static_infer() + + def _build_batch_sampler(self): + """Builds and returns an AudioBatchSampler instance. + + Returns: + AudioBatchSampler: An instance of AudioBatchSampler. + """ + return AudioBatchSampler() + + def _get_result_class(self): + """Returns the result class, Fastspeech2Result. + + Returns: + type: The Fastspeech2Result class. + """ + return Fastspeech2Result + + def process(self, batch_data): + """ + Process a batch of data through the preprocessing, inference, and postprocessing. + + Args: + batch_data (List[Union[str], ...]): A batch of input phone data. + + Returns: + dict: A dictionary containing the input path and result. The result include the output pinyin dict. 
+ """ + phone = batch_data + mel = self.infer(phone) + return { + "result": mel, + } diff --git a/paddlex/inference/models/text_to_speech_acoustic/result.py b/paddlex/inference/models/text_to_speech_acoustic/result.py new file mode 100644 index 0000000000..e24c59f9c5 --- /dev/null +++ b/paddlex/inference/models/text_to_speech_acoustic/result.py @@ -0,0 +1,22 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from ...common.result import BaseResult + + +class Fastspeech2Result(BaseResult): + + def __init__(self, data: dict) -> None: + super().__init__(data) diff --git a/paddlex/inference/models/text_to_speech_vocoder/__init__.py b/paddlex/inference/models/text_to_speech_vocoder/__init__.py new file mode 100644 index 0000000000..cb66bfc0e1 --- /dev/null +++ b/paddlex/inference/models/text_to_speech_vocoder/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from .predictor import PwganPredictor diff --git a/paddlex/inference/models/text_to_speech_vocoder/predictor.py b/paddlex/inference/models/text_to_speech_vocoder/predictor.py new file mode 100644 index 0000000000..a42753b329 --- /dev/null +++ b/paddlex/inference/models/text_to_speech_vocoder/predictor.py @@ -0,0 +1,72 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ....modules.text_to_speech_vocoder.model_list import MODELS +from ...common.batch_sampler import AudioBatchSampler +from ..base import BasePredictor +from .result import PwganResult + + +class PwganPredictor(BasePredictor): + + entities = MODELS + + def __init__(self, *args, **kwargs): + """Initializes FastspeechPredictor. + + Args: + *args: Arbitrary positional arguments passed to the superclass. + **kwargs: Arbitrary keyword arguments passed to the superclass. + """ + super().__init__(*args, **kwargs) + self.infer = self.create_static_infer() + + def _build_batch_sampler(self): + """Builds and returns an AudioBatchSampler instance. + + Returns: + AudioBatchSampler: An instance of AudioBatchSampler. + """ + return AudioBatchSampler() + + def _get_result_class(self): + """Returns the result class, PwganResult. + + Returns: + type: The PwganResult class. 
+ """ + return PwganResult + + def process(self, batch_data): + """ + Process a batch of data through the preprocessing, inference, and postprocessing. + + Args: + batch_data (List[Union[str], ...]): A batch of input phone data. + + Returns: + dict: A dictionary containing the input path and result. The result include the output pinyin dict. + """ + input_data = batch_data[0] + if type(input_data) is str: + mel = np.load(input_data) + else: + mel = input_data + wav = self.infer([mel]) + result = np.array(wav).reshape(1, -1) + return { + "result": result, + } diff --git a/paddlex/inference/models/text_to_speech_vocoder/result.py b/paddlex/inference/models/text_to_speech_vocoder/result.py new file mode 100644 index 0000000000..eabd326e85 --- /dev/null +++ b/paddlex/inference/models/text_to_speech_vocoder/result.py @@ -0,0 +1,24 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from ...common.result import BaseAudioResult + + +class PwganResult(BaseAudioResult): + + def __init__(self, data: dict) -> None: + super().__init__(data) + + def _to_audio(self): + return {"res": self} diff --git a/paddlex/inference/pipelines/__init__.py b/paddlex/inference/pipelines/__init__.py index 1ae8758244..63cda5d4b2 100644 --- a/paddlex/inference/pipelines/__init__.py +++ b/paddlex/inference/pipelines/__init__.py @@ -50,6 +50,7 @@ from .semantic_segmentation import SemanticSegmentationPipeline from .small_object_detection import SmallObjectDetectionPipeline from .table_recognition import TableRecognitionPipeline, TableRecognitionPipelineV2 +from .text_to_speech import TextToSpeechPipeline from .ts_anomaly_detection import TSAnomalyDetPipeline from .ts_classification import TSClsPipeline from .ts_forecasting import TSFcPipeline diff --git a/paddlex/inference/pipelines/_parallel.py b/paddlex/inference/pipelines/_parallel.py index 75e9a2d9b7..5d6b6b102f 100644 --- a/paddlex/inference/pipelines/_parallel.py +++ b/paddlex/inference/pipelines/_parallel.py @@ -76,10 +76,19 @@ class AutoParallelSimpleInferencePipeline(BasePipeline): def __init__( self, config, + device=None, + pp_option=None, + use_hpip=False, + hpi_config=None, *args, **kwargs, ): - super().__init__(*args, **kwargs) + super().__init__( + device=device, pp_option=pp_option, use_hpip=use_hpip, hpi_config=hpi_config + ) + + self._init_args = args + self._init_kwargs = kwargs self._multi_device_inference = False if self.device is not None: @@ -89,7 +98,8 @@ def __init__( self._pipelines = [] for device_id in device_ids: pipeline = self._create_internal_pipeline( - config, device_utils.constr_device(device_type, [device_id]) + config, + device_utils.constr_device(device_type, [device_id]), ) self._pipelines.append(pipeline) batch_size = self._get_batch_size(config) @@ -161,6 +171,8 @@ def _create_internal_pipeline(self, config, device): pp_option=self.pp_option, use_hpip=self.use_hpip, 
hpi_config=self.hpi_config, + *self._init_args, + **self._init_kwargs, ) def _create_batch_sampler(self, batch_size): diff --git a/paddlex/inference/pipelines/components/common/cal_ocr_word_box.py b/paddlex/inference/pipelines/components/common/cal_ocr_word_box.py index f2f142bad5..4d4d254546 100644 --- a/paddlex/inference/pipelines/components/common/cal_ocr_word_box.py +++ b/paddlex/inference/pipelines/components/common/cal_ocr_word_box.py @@ -18,10 +18,19 @@ # from .convert_points_and_boxes import convert_points_to_boxes +def is_vertical_text(box: np.ndarray): + """Determine if the text box is vertical based on its aspect ratio.""" + + width = box[:,0].max() - box[:,0].min() + height = box[:,1].max() - box[:,1].min() + if (height / width) > 1.5: + return True + else: + return False def cal_ocr_word_box(rec_str, box, rec_word_info): """Calculate the detection frame for each word based on the results of recognition and detection of ocr""" - + is_vertical = is_vertical_text(box) col_num, word_list, word_col_list, state_list = rec_word_info box = box.tolist() bbox_x_start = box[0][0] @@ -29,50 +38,53 @@ def cal_ocr_word_box(rec_str, box, rec_word_info): bbox_y_start = box[0][1] bbox_y_end = box[2][1] - cell_width = (bbox_x_end - bbox_x_start) / col_num + if is_vertical: + cell_size = (bbox_y_end - bbox_y_start) / col_num + bbox_size = bbox_y_end - bbox_y_start + bbox_start = bbox_y_start + def create_box(start, end): + return ( + (bbox_x_start, start), (bbox_x_start, end), + (bbox_x_end, end), (bbox_x_end, start) + ) + else: + cell_size = (bbox_x_end - bbox_x_start) / col_num + bbox_size = bbox_x_end - bbox_x_start + bbox_start = bbox_x_start + def create_box(start, end): + return ( + (start, bbox_y_start), (end, bbox_y_start), + (end, bbox_y_end), (start, bbox_y_end) + ) word_box_list = [] word_box_content_list = [] - cn_width_list = [] + cn_size_list = [] cn_col_list = [] + + # Process words for word, word_col, state in zip(word_list, word_col_list, state_list): if 
state == "cn": if len(word_col) != 1: - char_seq_length = (word_col[-1] - word_col[0] + 1) * cell_width - char_width = char_seq_length / (len(word_col) - 1) - cn_width_list.append(char_width) + char_seq_length = (word_col[-1] - word_col[0] + 1) * cell_size + char_size = char_seq_length / (len(word_col) - 1) + cn_size_list.append(char_size) cn_col_list += word_col word_box_content_list += word else: - cell_x_start = bbox_x_start + int(word_col[0] * cell_width) - cell_x_end = bbox_x_start + int((word_col[-1] + 1) * cell_width) - cell = ( - (cell_x_start, bbox_y_start), - (cell_x_end, bbox_y_start), - (cell_x_end, bbox_y_end), - (cell_x_start, bbox_y_end), - ) - word_box_list.append(cell) + cell_start = bbox_start + int(word_col[0] * cell_size) + cell_end = bbox_start + int((word_col[-1] + 1) * cell_size) + word_box_list.append(create_box(cell_start, cell_end)) word_box_content_list.append("".join(word)) + if len(cn_col_list) != 0: - if len(cn_width_list) != 0: - avg_char_width = np.mean(cn_width_list) - else: - avg_char_width = (bbox_x_end - bbox_x_start) / len(rec_str) + avg_char_size = np.mean(cn_size_list) if cn_size_list else bbox_size / len(rec_str) for center_idx in cn_col_list: - center_x = (center_idx + 0.5) * cell_width - cell_x_start = max(int(center_x - avg_char_width / 2), 0) + bbox_x_start - cell_x_end = ( - min(int(center_x + avg_char_width / 2), bbox_x_end - bbox_x_start) - + bbox_x_start - ) - cell = ( - (cell_x_start, bbox_y_start), - (cell_x_end, bbox_y_start), - (cell_x_end, bbox_y_end), - (cell_x_start, bbox_y_end), - ) - word_box_list.append(cell) + center = (center_idx + 0.5) * cell_size + cell_start = max(int(center - avg_char_size / 2), 0) + bbox_start + cell_end = min(int(center + avg_char_size / 2), bbox_size) + bbox_start + word_box_list.append(create_box(cell_start, cell_end)) + word_box_list = sort_boxes(word_box_list, y_thresh=12) return word_box_content_list, word_box_list diff --git 
a/paddlex/inference/pipelines/components/common/crop_image_regions.py b/paddlex/inference/pipelines/components/common/crop_image_regions.py index 561494ec79..f8dcef23d8 100644 --- a/paddlex/inference/pipelines/components/common/crop_image_regions.py +++ b/paddlex/inference/pipelines/components/common/crop_image_regions.py @@ -37,7 +37,9 @@ def __init__(self) -> None: """Initializes the class.""" super().__init__() - def __call__(self, img: np.ndarray, boxes: List[dict]) -> List[dict]: + def __call__( + self, img: np.ndarray, boxes: List[dict], layout_shape_mode="auto" + ) -> List[dict]: """ Process the input image and bounding boxes to produce a list of cropped images with their corresponding bounding box coordinates and labels. @@ -48,19 +50,33 @@ def __call__(self, img: np.ndarray, boxes: List[dict]) -> List[dict]: information including 'cls_id' (class ID), 'coordinate' (bounding box coordinates as a list or tuple, left, top, right, bottom), and optionally 'label' (label text). + use_layout_mask (bool, optional): Whether to use layout mask. Defaults to False. Returns: list[dict]: A list of dictionaries, each containing a cropped image ('img'), the original bounding box coordinates ('box'), and the label ('label'). 
""" output_list = [] - for bbox_info in boxes: - label_id = bbox_info["cls_id"] - box = bbox_info["coordinate"] - label = bbox_info.get("label", label_id) + for box_info in boxes: + label_id = box_info["cls_id"] + box = box_info["coordinate"] + label = box_info.get("label", label_id) xmin, ymin, xmax, ymax = [int(i) for i in box] img_crop = img[ymin:ymax, xmin:xmax].copy() - output_list.append({"img": img_crop, "box": box, "label": label}) + out_info = {"img": img_crop, "box": box, "label": label} + if layout_shape_mode != "rect" and "polygon_points" in box_info: + mask = np.zeros(img_crop.shape[:2], dtype=np.int32) + polygon = np.array(box_info["polygon_points"], dtype=np.int32) + polygon = polygon.reshape((-1, 1, 2)) + if polygon is not None and len(polygon) > 0: + polygon = polygon - np.array([xmin, ymin]) + cv2.fillPoly(mask, [polygon], 1) + mask = mask.astype(bool) + img_crop[~mask] = 255 + out_info["img"] = img_crop + out_info["polygon_points"] = box_info["polygon_points"] + + output_list.append(out_info) return output_list diff --git a/paddlex/inference/pipelines/components/prompt_engineering/generate_translate_prompt.py b/paddlex/inference/pipelines/components/prompt_engineering/generate_translate_prompt.py index 4788663b23..f6bc00f96e 100644 --- a/paddlex/inference/pipelines/components/prompt_engineering/generate_translate_prompt.py +++ b/paddlex/inference/pipelines/components/prompt_engineering/generate_translate_prompt.py @@ -167,7 +167,7 @@ def generate_prompt( after_rule = "9. 
请在翻译完成后添加特殊标记 <>,确保翻译完整。" prompt = f"""{task_description}{rules_str}{after_rule}{output_format}{few_shot_demo_text_content}{few_shot_demo_key_value_list}""" - + language_name = language_map.get(language, language) task_type = self.task_type if task_type == "translate_prompt": diff --git a/paddlex/inference/pipelines/layout_parsing/layout_objects.py b/paddlex/inference/pipelines/layout_parsing/layout_objects.py index ca6f8fe22b..6f3e79f784 100644 --- a/paddlex/inference/pipelines/layout_parsing/layout_objects.py +++ b/paddlex/inference/pipelines/layout_parsing/layout_objects.py @@ -379,7 +379,7 @@ def format_line( class LayoutBlock(object): """Layout Block Class""" - def __init__(self, label, bbox, content="") -> None: + def __init__(self, label, bbox, content="", group_id=None) -> None: """ Initialize a LayoutBlock object. @@ -392,6 +392,7 @@ def __init__(self, label, bbox, content="") -> None: self.order_label = None self.bbox = list(map(int, bbox)) self.content = content + self.group_id = group_id self.seg_start_coordinate = float("inf") self.seg_end_coordinate = float("-inf") self.width = bbox[2] - bbox[0] diff --git a/paddlex/inference/pipelines/layout_parsing/merge_table.py b/paddlex/inference/pipelines/layout_parsing/merge_table.py new file mode 100644 index 0000000000..5384b61ef6 --- /dev/null +++ b/paddlex/inference/pipelines/layout_parsing/merge_table.py @@ -0,0 +1,240 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +# Acknowledgement: The logic for table merging in this function is adapted from MinerU. + + +def full_to_half(text: str) -> str: + result = [] + for char in text: + code = ord(char) + if 0xFF01 <= code <= 0xFF5E: + result.append(chr(code - 0xFEE0)) + else: + result.append(char) + return "".join(result) + + +def calculate_table_total_columns(soup): + """ + calculate total columns including colspan and rowspan, accounting for merged cells + """ + rows = soup.find_all("tr") + if not rows: + return 0 + max_cols = 0 + occupied = {} + for row_idx, row in enumerate(rows): + col_idx = 0 + cells = row.find_all(["td", "th"]) + if row_idx not in occupied: + occupied[row_idx] = {} + for cell in cells: + while col_idx in occupied[row_idx]: + col_idx += 1 + colspan = int(cell.get("colspan", 1)) + rowspan = int(cell.get("rowspan", 1)) + for r in range(row_idx, row_idx + rowspan): + if r not in occupied: + occupied[r] = {} + for c in range(col_idx, col_idx + colspan): + occupied[r][c] = True + col_idx += colspan + max_cols = max(max_cols, col_idx) + return max_cols + + +def calculate_row_columns(row): + """ + Calculate the actual number of columns in a single row + """ + return sum(int(cell.get("colspan", 1)) for cell in row.find_all(["td", "th"])) + + +def calculate_visual_columns(row): + """ + Calculate the visual number of columns in a single row, excluding colspan (merged cells count as one) + """ + return len(row.find_all(["td", "th"])) + + +def detect_table_headers(soup1, soup2, max_header_rows=5): + """ + Determine how many identical rows exist at the beginning of two tables + """ + rows1 = soup1.find_all("tr") + rows2 = soup2.find_all("tr") + # Check only the minimum number of rows + min_rows = min(len(rows1), len(rows2), max_header_rows) + header_rows = 0 + headers_match = True + for i in range(min_rows): + cells1 = rows1[i].find_all(["td", "th"]) + cells2 = 
rows2[i].find_all(["td", "th"]) + if len(cells1) != len(cells2): + headers_match = header_rows > 0 + break + # If column counts match, check if content is identical + match = True + for c1, c2 in zip(cells1, cells2): + text1 = "".join(full_to_half(c1.get_text()).split()) + text2 = "".join(full_to_half(c2.get_text()).split()) + if text1 != text2 or int(c1.get("colspan", 1)) != int(c2.get("colspan", 1)): + match = False + break + # Complete match, increment matched row count. Otherwise, stop matching. + if match: + header_rows += 1 + else: + headers_match = header_rows > 0 + break + if header_rows == 0: + headers_match = False + return header_rows, headers_match + + +def check_rows_match(soup1, soup2): + rows1 = soup1.find_all("tr") + rows2 = soup2.find_all("tr") + if not rows1 or not rows2: + return False + last_row = rows1[-1] + header_count, _ = detect_table_headers(soup1, soup2) + first_data_row = rows2[header_count] if len(rows2) > header_count else None + if not first_data_row: + return False + last_cols = calculate_row_columns(last_row) + first_cols = calculate_row_columns(first_data_row) + last_visual = calculate_visual_columns(last_row) + first_visual = calculate_visual_columns(first_data_row) + return last_cols == first_cols or last_visual == first_visual + + +def is_skippable(block, allowed_labels): + + continue_keywords = ["continue", "continued", "cont'd", "续", "cont‘d", "續"] + + if block.label in allowed_labels: + return True + + b_text = str(getattr(block, "text", "") or "").lower() + b_fig_title = str(getattr(block, "figure_title", "") or "").lower() + b_doc_title = str(getattr(block, "doc_title", "") or "").lower() + b_para_title = str(getattr(block, "paragraph_title", "") or "").lower() + + full_content = f"{b_text} {b_fig_title} {b_doc_title} {b_para_title}" + + if any(kw in full_content for kw in continue_keywords): + return True + + return False + + +def can_merge_tables(prev_page, prev_block, curr_page, curr_block): + + from bs4 import 
BeautifulSoup + + x0, y0, x1, y1 = prev_block.bbox + prev_width = x1 - x0 + x2, y2, x3, y4 = curr_block.bbox + curr_width = x3 - x2 + if curr_width == 0 or prev_width == 0: + return False, None, None + if abs(curr_width - prev_width) / min(curr_width, prev_width) >= 0.1: + return False, None, None + + prev_index = prev_page.index(prev_block) + allowed_follow = all( + b.label + in ["footer", "vision_footnote", "number", "footnote", "footer_image", "seal"] + for b in prev_page[prev_index + 1 :] + ) + if not allowed_follow: + return False, None, None + + curr_index = curr_page.index(curr_block) + curr_allowed_labels = ["header", "header_image", "number", "seal"] + + allowed_before = all( + is_skippable(b, curr_allowed_labels) for b in curr_page[:curr_index] + ) + if not allowed_before: + return False, None, None + + html_prev = prev_block.content + html_curr = curr_block.content + if not html_prev or not html_curr: + return False, None, None + soup_prev = BeautifulSoup(html_prev, "html.parser") + soup_curr = BeautifulSoup(html_curr, "html.parser") + + total_cols_prev = calculate_table_total_columns(soup_prev) + total_cols_curr = calculate_table_total_columns(soup_curr) + tables_match = total_cols_prev == total_cols_curr + rows_match = check_rows_match(soup_prev, soup_curr) + + return (tables_match or rows_match), soup_prev, soup_curr + + +def perform_table_merge(soup_prev, soup_curr): + header_count, _ = detect_table_headers(soup_prev, soup_curr) + rows_prev = soup_prev.find_all("tr") + rows_curr = soup_curr.find_all("tr") + for row in rows_curr[header_count:]: + row.extract() + rows_prev[-1].parent.append(row) + return str(soup_prev) + + +def merge_tables_across_pages(pages): + for i in range(len(pages) - 1, 0, -1): + page_curr = pages[i] + page_prev = pages[i - 1] + + for block in page_curr: + if block.label == "table": + curr_block = block + break + else: + curr_block = None + + for block in reversed(page_prev): + if block.label == "table": + prev_block = block + 
break + else: + prev_block = None + + # both curr_block and prev_block should not be None + if curr_block and prev_block: + can_merge, soup_prev, soup_curr = can_merge_tables( + page_prev, prev_block, page_curr, curr_block + ) + else: + can_merge = False + + if can_merge: + merged_html = perform_table_merge(soup_prev, soup_curr) + prev_block.content = merged_html + prev_block_global_id = prev_block.global_block_id + curr_block.content = "" + curr_block.global_group_id = prev_block_global_id + all_blocks = [block for page in pages for block in page] + for page in pages: + for block in page: + if block.global_block_id != block.global_group_id: + block.global_group_id = all_blocks[ + block.global_group_id + ].global_group_id + return pages diff --git a/paddlex/inference/pipelines/layout_parsing/pipeline_v2.py b/paddlex/inference/pipelines/layout_parsing/pipeline_v2.py index 5efa719107..198013bfdf 100644 --- a/paddlex/inference/pipelines/layout_parsing/pipeline_v2.py +++ b/paddlex/inference/pipelines/layout_parsing/pipeline_v2.py @@ -31,6 +31,7 @@ from .._parallel import AutoParallelImageSimpleInferencePipeline from ..base import BasePipeline from ..ocr.result import OCRResult +from ..pp_doctranslation.result import MarkdownResult from .layout_objects import LayoutBlock, LayoutRegion from .result_v2 import LayoutParsingResultV2 from .setting import BLOCK_LABEL_MAP, BLOCK_SETTINGS, REGION_SETTINGS @@ -41,6 +42,7 @@ convert_formula_res_to_ocr_format, gather_imgs, get_bbox_intersection, + get_seg_flag, get_sub_regions_ocr_res, remove_overlap_blocks, shrink_supplement_region_bbox, @@ -60,6 +62,7 @@ def __init__( pp_option: PaddlePredictorOption = None, use_hpip: bool = False, hpi_config: Optional[Union[Dict[str, Any], HPIConfig]] = None, + initial_predictor: bool = True, ) -> None: """Initializes the layout parsing pipeline. 
@@ -72,6 +75,7 @@ def __init__( hpi_config (Optional[Union[Dict[str, Any], HPIConfig]], optional): The default high-performance inference configuration dictionary. Defaults to None. + initial_predictor (bool, optional): Whether to initialize predictors. """ super().__init__( @@ -81,7 +85,8 @@ def __init__( hpi_config=hpi_config, ) - self.inintial_predictor(config) + if initial_predictor: + self.inintial_predictor(config) self.batch_sampler = ImageBatchSampler(batch_size=config.get("batch_size", 1)) self.img_reader = ReadImage(format="BGR") @@ -215,6 +220,18 @@ def inintial_predictor(self, config: dict) -> None: self.chart_recognition_model = self.create_model( chart_recognition_config, ) + self.markdown_ignore_labels = config.get( + "markdown_ignore_labels", + [ + "number", + "footnote", + "header", + "header_image", + "footer", + "footer_image", + "aside_text", + ], + ) return @@ -791,6 +808,7 @@ def get_layout_parsing_res( chart_res_list: list, formula_res_list: list, text_rec_score_thresh: Union[float, None] = None, + markdown_ignore_labels: List[str] = [], ) -> list: """ Retrieves the layout parsing result based on the layout detection result, OCR result, and other recognition results. 
@@ -836,9 +854,14 @@ def get_layout_parsing_res( parsing_res_list = self.sort_layout_parsing_blocks(layout_parsing_page) order_index = 1 + visualize_order_labels = [ + label + for label in BLOCK_LABEL_MAP["visualize_index_labels"] + if label not in markdown_ignore_labels + ] for index, block in enumerate(parsing_res_list): block.index = index - if block.label in BLOCK_LABEL_MAP["visualize_index_labels"]: + if block.label in visualize_order_labels: block.order_index = order_index order_index += 1 @@ -854,6 +877,7 @@ def get_model_settings( use_chart_recognition: Union[bool, None], use_region_detection: Union[bool, None], format_block_content: Union[bool, None], + markdown_ignore_labels: Optional[list[str]] = None, ) -> dict: """ Get the model settings based on the provided parameters or default values. @@ -896,6 +920,9 @@ def get_model_settings( if format_block_content is None: format_block_content = self.format_block_content + if markdown_ignore_labels is None: + markdown_ignore_labels = self.markdown_ignore_labels + return dict( use_doc_preprocessor=use_doc_preprocessor, use_seal_recognition=use_seal_recognition, @@ -904,6 +931,7 @@ def get_model_settings( use_chart_recognition=use_chart_recognition, use_region_detection=use_region_detection, format_block_content=format_block_content, + markdown_ignore_labels=markdown_ignore_labels, ) def predict( @@ -940,6 +968,7 @@ def predict( use_ocr_results_with_table_cells: bool = True, use_e2e_wired_table_rec_model: bool = False, use_e2e_wireless_table_rec_model: bool = True, + markdown_ignore_labels: Optional[list[str]] = None, **kwargs, ) -> LayoutParsingResultV2: """ @@ -982,6 +1011,7 @@ def predict( use_ocr_results_with_table_cells (bool): Whether to use OCR results processed by table cells. use_e2e_wired_table_rec_model (bool): Whether to use end-to-end wired table recognition model. use_e2e_wireless_table_rec_model (bool): Whether to use end-to-end wireless table recognition model. 
+ markdown_ignore_labels (Optional[list[str]]): The list of ignored markdown labels. Default is None. **kwargs (Any): Additional settings to extend functionality. Returns: @@ -996,6 +1026,7 @@ def predict( use_chart_recognition, use_region_detection, format_block_content, + markdown_ignore_labels, ) if not self.check_model_settings_valid(model_settings): @@ -1204,6 +1235,7 @@ def predict( for ( input_path, page_index, + page_count, doc_preprocessor_image, doc_preprocessor_res, layout_det_res, @@ -1216,6 +1248,7 @@ def predict( ) in zip( batch_data.input_paths, batch_data.page_indexes, + batch_data.page_counts, doc_preprocessor_images, doc_preprocessor_results, layout_det_results, @@ -1252,6 +1285,7 @@ def predict( chart_res_list=chart_res_list, formula_res_list=formula_res_list, text_rec_score_thresh=text_rec_score_thresh, + markdown_ignore_labels=model_settings["markdown_ignore_labels"], ) for formula_res in formula_res_list: @@ -1263,6 +1297,9 @@ def predict( single_img_res = { "input_path": input_path, "page_index": page_index, + "page_count": page_count, + "width": doc_preprocessor_image.shape[1], + "height": doc_preprocessor_image.shape[0], "doc_preprocessor_res": doc_preprocessor_res, "layout_det_res": layout_det_res, "region_det_res": region_det_res, @@ -1330,7 +1367,70 @@ def concatenate_markdown_pages(self, markdown_list: list) -> tuple: page_last_element_paragraph_end_flag ) - return markdown_texts + markdown_result = {"markdown_texts": markdown_texts} + + return MarkdownResult(markdown_result) + + def merge_text_across_page(self, blocks_by_page): + + merged_blocks_by_page = [] + + global_prev_block = None + + global_block_id = 0 + + for page_index, one_page_blocks in enumerate(blocks_by_page): + current_page_new_blocks = [] + + prev_block = None + + for block in one_page_blocks: + + setattr(block, "group_id", global_block_id) + + seg_start_flag, seg_end_flag = get_seg_flag(block, prev_block) + + prev_block = block + + is_text = block.label == "text" + 
prev_is_text = ( + global_prev_block is not None and global_prev_block.label == "text" + ) + + if is_text and prev_is_text and not seg_start_flag: + + prev_text = global_prev_block.content + curr_text = block.content + + last_char = prev_text[-1] if prev_text else "" + first_char = curr_text[0] if curr_text else "" + + is_last_chinese = re.match(r"[\u4e00-\u9fff]", last_char) + is_first_chinese = re.match(r"[\u4e00-\u9fff]", first_char) + + separator = "" + if ( + not (is_last_chinese or is_first_chinese) + and last_char + and first_char + ): + separator = " " + + global_prev_block.content += separator + curr_text + + setattr(block, "group_id", global_prev_block.group_id) + + else: + # after merge, block don't add to current page + current_page_new_blocks.append(block) + + global_prev_block = block + + global_block_id += 1 + + merged_blocks_by_page.append(current_page_new_blocks) + + return merged_blocks_by_page @pipeline_requires_extra("ocr") diff --git a/paddlex/inference/pipelines/layout_parsing/result_v2.py b/paddlex/inference/pipelines/layout_parsing/result_v2.py index 493035a4bf..02e3621373 100644 --- a/paddlex/inference/pipelines/layout_parsing/result_v2.py +++ b/paddlex/inference/pipelines/layout_parsing/result_v2.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + from __future__ import annotations import copy @@ -25,6 +26,7 @@ from ....utils.fonts import PINGFANG_FONT from ...common.result import ( BaseCVResult, + BaseResult, HtmlMixin, JsonMixin, LatexMixin, @@ -42,7 +44,7 @@ def compile_title_pattern(): r"(?:" + r"[1-9][0-9]*(?:\.[1-9][0-9]*)*[\.、]?|" + r"[\(\(](?:[1-9][0-9]*|[" r"一二三四五六七八九十百千万亿零壹贰叁肆伍陆柒捌玖拾]+)[\)\)]|" + r"[" r"一二三四五六七八九十百千万亿零壹贰叁肆伍陆柒捌玖拾]+" - r"[、\.]?|" + r"(?:I|II|III|IV|V|VI|VII|VIII|IX|X)\.?" 
+ r")" + r"[、\.]?|" + r"(?:I|II|III|IV|V|VI|VII|VIII|IX|X)(?:\.|\s)" + r")" ) return re.compile(r"^\s*(" + numbering_pattern + r")(\s*)(.*)$") @@ -83,41 +85,69 @@ def format_title_func(block): ) -def format_centered_by_html(string): - return ( - f'
{string}
'.replace( - "-\n", - "", - ).replace("\n", " ") - + "\n" +def format_para_title_func(block): + """ + Normalize chapter title. + Add the '#' to indicate the level of the title. + If numbering exists, ensure there's exactly one space between it and the title content. + If numbering does not exist, return the original title unchanged. + + :param title: Original chapter title string. + :return: Normalized chapter title string. + """ + if not hasattr(block, "title_level"): + return format_title_func(block) + level = block.title_level + title = block.content + return f"#{'#' * level} {title}".replace("-\n", "").replace( + "\n", + " ", ) +def format_centered_by_html(string, remove_symbol=True): + if remove_symbol: + string = string.replace("-\n", "").replace("\n", " ") + return f'
{string}
' + "\n" + + def format_text_plain_func(block): return block.content -def format_image_scaled_by_html_func(block, original_image_width): +def format_image_scaled_by_html_func( + block, original_image_width, show_ocr_content=False +): img_tags = [] + if block.image is None: + return "" image_path = block.image["path"] - image_width = block.image["img"].width + image_width = block.bbox[2] - block.bbox[0] scale = int(image_width / original_image_width * 100) img_tags.append( 'Image'.format( image_path.replace("-\n", "").replace("\n", " "), scale ), ) - return "\n".join(img_tags) + image_info = "\n".join(img_tags) + if show_ocr_content: + ocr_content = block.content + image_info += "\n\n" + ocr_content + "\n\n" + return image_info -def format_image_plain_func(block): +def format_image_plain_func(block, show_ocr_content=False): img_tags = [] if block.image: image_path = block.image["path"] img_tags.append( "![]({})".format(image_path.replace("-\n", "").replace("\n", " ")) ) - return "\n".join(img_tags) + image_info = "\n".join(img_tags) + if show_ocr_content: + ocr_content = block.content + image_info += "\n\n" + ocr_content + "\n\n" + return image_info return "" @@ -239,6 +269,9 @@ def _to_str(self, *args, **kwargs) -> dict[str, str]: data = {} data["input_path"] = self["input_path"] data["page_index"] = self["page_index"] + data["page_count"] = self["page_count"] + data["width"] = self["width"] + data["height"] = self["height"] model_settings = self["model_settings"] data["model_settings"] = model_settings parsing_res_list: List[LayoutBlock] = self["parsing_res_list"] @@ -326,7 +359,7 @@ def _to_json(self, *args, **kwargs) -> dict[str, str]: format_formula_func = format_image_func handle_funcs_dict = { - "paragraph_title": format_title_func, + "paragraph_title": format_para_title_func, "abstract_title": format_title_func, "reference_title": format_title_func, "content_title": format_title_func, @@ -364,11 +397,23 @@ def _to_json(self, *args, **kwargs) -> dict[str, 
str]: ), "algorithm": lambda block: block.content.strip("\n"), "seal": format_seal_func, + "number": format_text_plain_func, + "footnote": format_text_plain_func, + "header": format_text_plain_func, + "header_image": format_image_plain_func, + "footer": format_text_plain_func, + "footer_image": format_image_plain_func, + "aside_text": format_text_plain_func, } + for label in self["model_settings"].get("markdown_ignore_labels", []): + handle_funcs_dict.pop(label, None) data = {} data["input_path"] = self["input_path"] data["page_index"] = self["page_index"] + data["page_count"] = self["page_count"] + data["width"] = self["width"] + data["height"] = self["height"] model_settings = self["model_settings"] data["model_settings"] = model_settings parsing_res_list: List[LayoutBlock] = self["parsing_res_list"] @@ -541,7 +586,16 @@ def _to_markdown(self, pretty=True, show_formula_number=False) -> dict: ), "algorithm": lambda block: block.content.strip("\n"), "seal": format_seal_func, + "number": format_text_plain_func, + "footnote": format_text_plain_func, + "header": format_text_plain_func, + "header_image": format_image_plain_func, + "footer": format_text_plain_func, + "footer_image": format_image_plain_func, + "aside_text": format_text_plain_func, } + for label in self["model_settings"].get("markdown_ignore_labels", []): + handle_funcs_dict.pop(label, None) markdown_content = "" last_label = None @@ -731,3 +785,159 @@ def _to_latex(self) -> dict: "images": image, "input_path": self["input_path"], } + + +class ProcessedLayoutParsingResult(BaseResult, MarkdownMixin): + + def __init__(self, data) -> None: + """Initializes a new instance of the class with the specified data.""" + super().__init__(data) + MarkdownMixin.__init__(self) + + def _to_markdown(self, pretty=True, show_formula_number=False) -> dict: + """ + Save the parsing result to a Markdown file. + + Args: + pretty (Optional[bool]): whether to pretty markdown by HTML, default by True. 
+ + Returns: + Dict + """ + + self["model_settings"] = self["model_settings"][0] + self["input_path"] = self["input_path"][0] + self["doc_preprocessor_res"] = self["doc_preprocessor_res"][0] + self["page_index"] = None + self["width"] = None + self["height"] = None + self["page_count"] = None + + original_image_width = self["doc_preprocessor_res"]["output_img"].shape[1] + + if pretty: + format_text_func = lambda block: format_centered_by_html( + format_text_plain_func(block) + ) + format_image_func = lambda block: format_centered_by_html( + format_image_scaled_by_html_func( + block, + original_image_width=original_image_width, + ) + ) + else: + format_text_func = lambda block: block.content + format_image_func = format_image_plain_func + + if self["model_settings"].get("use_chart_recognition", False): + format_chart_func = format_chart2table_func + else: + format_chart_func = format_image_func + + if self["model_settings"].get("use_seal_recognition", False): + format_seal_func = lambda block: "\n".join( + [format_image_func(block), format_text_func(block)] + ) + else: + format_seal_func = format_image_func + + if self["model_settings"].get("use_table_recognition", False): + if pretty: + format_table_func = lambda block: "\n" + format_text_func( + block + ).replace("", '
') + else: + format_table_func = lambda block: simplify_table_func( + "\n" + block.content + ) + else: + format_table_func = format_image_func + + if self["model_settings"].get("use_formula_recognition", False): + format_formula_func = lambda block: f"$${block.content}$$" + else: + format_formula_func = format_image_func + + handle_funcs_dict = { + "paragraph_title": format_para_title_func, + "abstract_title": format_title_func, + "reference_title": format_title_func, + "content_title": format_title_func, + "doc_title": lambda block: f"# {block.content}".replace( + "-\n", + "", + ).replace("\n", " "), + "table_title": format_text_func, + "figure_title": format_text_func, + "chart_title": format_text_func, + "vision_footnote": lambda block: block.content.replace( + "\n\n", "\n" + ).replace("\n", "\n\n"), + "text": lambda block: block.content.replace("\n\n", "\n").replace( + "\n", "\n\n" + ), + "abstract": partial( + format_first_line_func, + templates=["摘要", "abstract"], + format_func=lambda l: f"## {l}\n", + spliter=" ", + ), + "content": lambda block: block.content.replace("-\n", " \n").replace( + "\n", " \n" + ), + "image": format_image_func, + "chart": format_chart_func, + "formula": format_formula_func, + "table": format_table_func, + "reference": partial( + format_first_line_func, + templates=["参考文献", "references"], + format_func=lambda l: f"## {l}", + spliter="\n", + ), + "algorithm": lambda block: block.content.strip("\n"), + "seal": format_seal_func, + } + for label in self["model_settings"].get("markdown_ignore_labels", []): + handle_funcs_dict.pop(label, None) + + markdown_content = "" + markdown_info = {} + markdown_info["markdown_images"] = {} + pages_list = self["parsing_res_list"] + global_block_id = 0 + + for page_blocks in pages_list: + + if not page_blocks: + continue + for idx, block in enumerate(page_blocks): + + label = block.label + + if block.image is not None: + markdown_info["markdown_images"][block.image["path"]] = block.image[ + "img" + ] 
+ + handle_func = handle_funcs_dict.get(label, None) + + if handle_func: + current_text = handle_func(block) + + if markdown_content: + markdown_content += "\n\n" + current_text + else: + markdown_content += current_text + + if block.group_id is None: + block.group_id = global_block_id + global_block_id += 1 + + markdown_info["page_index"] = self["page_index"] + markdown_info["input_path"] = self["input_path"] + markdown_info["markdown_texts"] = markdown_content + for img in self["imgs_in_doc"]: + markdown_info["markdown_images"][img["path"]] = img["img"] + + return markdown_info diff --git a/paddlex/inference/pipelines/layout_parsing/setting.py b/paddlex/inference/pipelines/layout_parsing/setting.py index 3e42459d22..c85d39c3be 100644 --- a/paddlex/inference/pipelines/layout_parsing/setting.py +++ b/paddlex/inference/pipelines/layout_parsing/setting.py @@ -83,6 +83,13 @@ "abstract_title", "refer_title", "content_title", + "number", + "footnote", + "header", + "header_image", + "footer", + "footer_image", + "aside_text", ], - "image_labels": ["image", "figure"], + "image_labels": ["image", "figure", "seal"], } diff --git a/paddlex/inference/pipelines/layout_parsing/title_level.py b/paddlex/inference/pipelines/layout_parsing/title_level.py new file mode 100644 index 0000000000..138e8245cc --- /dev/null +++ b/paddlex/inference/pipelines/layout_parsing/title_level.py @@ -0,0 +1,319 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import re +from collections import Counter + +import numpy as np + +# Regular expressions for detecting heading numbering styles +SYMBOL_PATTERNS = { + # Matches Roman numerals: I, II, V, X, i., iv), V. + "ROMAN": re.compile(r"^\s*([IVX]+)(?:[\..\)\s]|$)", flags=re.I), + # Matches a single letter: A., B), c., D + "LETTER": re.compile(r"^\s*([A-Z])(?:[\..\)\s])", flags=re.I), + # Matches multi-level numeric numbering: 1, 1.1, 1.2.3, 2. + "NUM_LIST": re.compile(r"^\s*(\d+(?:\.\d+)*)(?![))])(?:[\.]?\s*|(?=[A-Z]))"), + # Matches numeric numbering enclosed in parentheses: (1), (1.1), (2), (2.3), 1) + "NUM_LIST_WITH_BRACKET": re.compile(r"^\s*(?:[\((])?(\d+(?:\.\d+)*)[\))]"), + # Matches Chinese numerals: 一 , 二 , 第一 , 十三 + "CHINESE_NUM": re.compile( + r"^\s*" + r"(?:第|[(\(])?" + r"([一二三四五六七八九十]{1,2})" + r"(?:" + r"[章节篇卷部条题讲课回)\)]" + r"|" + r"(?![a-zA-Z\u4e00-\u9fa5])" + r")", + flags=re.I, + ), +} + + +def get_symbol_and_level(content: str): + """ + Extract numbering type and its semantic level + """ + txt = str(content).strip() + + if SYMBOL_PATTERNS["NUM_LIST_WITH_BRACKET"].match(txt): + return "NUM_LIST_BRACKET", 4 + + if SYMBOL_PATTERNS["ROMAN"].match(txt): + return "ROMAN", 1 + + if SYMBOL_PATTERNS["CHINESE_NUM"].match(txt): + return "CHINESE_NUM", 1 + + if SYMBOL_PATTERNS["LETTER"].match(txt): + return "LETTER", 2 + + if SYMBOL_PATTERNS["NUM_LIST"].match(txt): + content = SYMBOL_PATTERNS["NUM_LIST"].match(txt).group(1) + level = content.count(".") + 1 + return "NUM_LIST", level + + return None, -1 + + +# Special keywords that should be treated as level-1 headings +SPECIAL_KEYWORDS = { + "ABSTRACT": 1, + "SUMMARY": 1, + "RESUME": 1, + "绪论": 1, + "引言": 1, + "CONTENTS": 1, + "REFERENCES": 1, + "REFERENCE": 1, + "参考文献": 1, + "APPENDIX": 1, + "APPENDICES": 1, + "附录": 1, + "ACKNOWLEDGMENTS": 1, + "INTRODUCTION": 1, + "BACKGROUNDANDRELATEDWORK": 1, + "BACKGROUND": 
1,
+    "RELATEDWORK": 1,
+    "THEORETICALMODELS": 1,
+    "DATA": 1,
+    "METHOD": 1,
+    "METHODS": 1,
+    "METHODOLOGY": 1,
+    "TOPICANALYSIS": 1,
+    "RESULT": 1,
+    "RESULTS": 1,
+    "DISCUSSION": 1,
+    "CONCLUSIONS": 1,
+    "CONCLUSION": 1,
+    "LIMITATIONS": 1,
+    "研究背景": 1,
+    "相关工作": 1,
+    "研究方法": 1,
+    "实验结果": 1,
+    "讨论": 1,
+    "结论": 1,
+    "致谢": 1,
+    "目录": 1,
+}
+
+
+def get_title_height(block):
+    """
+    Calculate the average height of the dominant text lines within a layout block.
+    """
+
+    import math
+
+    if block.label == "doc_title":
+        return 0
+
+    # Round down for top-left
+    x1 = int(block.bbox[0])
+    y1 = int(block.bbox[1])
+    # Round up for bottom-right to ensure full coverage
+    x2 = int(math.ceil(block.bbox[2]))
+    y2 = int(math.ceil(block.bbox[3]))
+
+    h, w = y2 - y1, x2 - x1
+    aspect_ratio = w / h
+
+    lines_num = block.content.strip().count("\n") + 1
+
+    if aspect_ratio >= 1.0:
+        # Horizontal text: Project to Y-axis
+        return int(h / lines_num)
+    else:
+        # Vertical text: Project to X-axis
+        return int(w / lines_num)
+
+
+def cluster_global_heights(entries, k_clusters=4):
+    """
+    Cluster heading heights to infer level based on font size
+    """
+
+    from sklearn.cluster import KMeans
+
+    heights = [e["height"] for e in entries]
+    uniq = sorted(set(heights))
+
+    if len(uniq) == 0:
+        return {}
+
+    k = min(k_clusters, len(uniq))
+
+    X = np.array(heights).reshape(-1, 1)
+    km = KMeans(n_clusters=k, random_state=42, n_init="auto")
+    km.fit(X)
+
+    centers = km.cluster_centers_.reshape(-1)
+
+    # Sort centers descending: larger font → higher level
+    order = np.argsort(-centers)
+    old2new = {int(old): new_idx + 1 for new_idx, old in enumerate(order)}
+
+    mapping = {}
+    for h in uniq:
+        dists = [abs(h - c) for c in centers]
+        old = int(np.argmin(dists))
+        mapping[h] = old2new[old]
+
+    return mapping
+
+
+def compute_global_symbol_seq(entries, title_symbol_level):
+    """
+    Assign a global ordering to different numbering styles
+    """
+
+    seq = {}
+    counter = 1
+
+    for idx, e in enumerate(entries):
+        
symbol, level = title_symbol_level[idx]
+
+        if level > 0 and symbol not in seq:
+            seq[symbol] = counter
+            counter += 1
+
+    return seq
+
+
+def compute_levels_for_entries(entries):
+    """
+    Compute final level for each heading
+    """
+
+    # get title's symbol and level
+    title_symbol_level = {}
+    for idx, e in enumerate(entries):
+        symbol, level = get_symbol_and_level(e["content"])
+        e["symbol"], e["level"] = symbol, level
+        title_symbol_level[idx] = (symbol, level)
+
+    cluster_map = cluster_global_heights(entries)
+    global_seq = compute_global_symbol_seq(entries, title_symbol_level)
+
+    # Used to align multi-level numeric lists (e.g., "1", "1.1", "1.2")
+    first_num_level = 0
+
+    contents = []
+    levels = []
+
+    for idx, e in enumerate(entries):
+
+        if e.get("level") == 0:
+            continue
+
+        symbol, level = title_symbol_level[idx]
+
+        # If it matches the semantics in SYMBOL_PATTERNS, bucket by the semantic level
+        if level > 0:
+            bucket = "semantic"
+        # Check special keywords (ABSTRACT, REFERENCES, etc.)
+ elif ( + str(e["content"]).upper().strip().rstrip(":: ").replace(" ", "") + in SPECIAL_KEYWORDS + ): + bucket = "special_word" + else: + bucket = "cluster" + + cluster_level = cluster_map[e["height"]] + + if bucket == "semantic": + semantic_level = level + + if symbol == "NUM_LIST": + if first_num_level != 0: + relative_order_level = global_seq.get(symbol) + ( + level - first_num_level + ) + else: + first_num_level = level + relative_order_level = global_seq.get(symbol) + else: + relative_order_level = global_seq.get(symbol) + + # Voting among three signals + votes = [semantic_level, relative_order_level, cluster_level] + most_common = Counter(votes).most_common(1) + + if most_common[0][1] > 1: + final_level = most_common[0][0] + else: + final_level = relative_order_level + + elif bucket == "special_word": + final_level = SPECIAL_KEYWORDS[ + str(e["content"]).upper().strip().rstrip(":: ").replace(" ", "") + ] + + else: + final_level = cluster_level + + e["level"] = int(final_level) + + contents.append(e["content"]) + levels.append(e["level"]) + + return entries + + +def assign_levels_to_parsing_res(blocks_by_page): + """ + Write computed levels back to the parsing results + """ + + parsing_res_list = [] + + for page_index, one_page_blocks in enumerate(blocks_by_page): + for block in one_page_blocks: + setattr(block, "page_index", page_index) + parsing_res_list.append(block) + + entries = [] + + for block in parsing_res_list: + + if block.label == "paragraph_title": + content = block.content + height = get_title_height(block) + + if height is None: + continue + + # Document title has fixed level 0 + init_level = 0 if block.label == "doc_title" else None + + entries.append( + { + "origin_block": block, + "content": content, + "height": height, + "level": init_level, + } + ) + + entries = compute_levels_for_entries(entries) + + for e in entries: + if e["origin_block"].label == "doc_title": + setattr(e["origin_block"], "title_level", 0) + block = e["origin_block"] + 
block.title_level = e["level"] + + return blocks_by_page diff --git a/paddlex/inference/pipelines/layout_parsing/utils.py b/paddlex/inference/pipelines/layout_parsing/utils.py index d7db3d958b..7c83d00a19 100644 --- a/paddlex/inference/pipelines/layout_parsing/utils.py +++ b/paddlex/inference/pipelines/layout_parsing/utils.py @@ -330,18 +330,24 @@ def is_non_breaking_punctuation(char): return char in non_breaking_punctuations +def construct_img_path(label, box): + x_min, y_min, x_max, y_max = list(map(int, box)) + return f"imgs/img_in_{label}_box_{x_min}_{y_min}_{x_max}_{y_max}.jpg" + + def gather_imgs(original_img, layout_det_objs): imgs_in_doc = [] for det_obj in layout_det_objs: if det_obj["label"] in BLOCK_LABEL_MAP["image_labels"]: label = det_obj["label"] x_min, y_min, x_max, y_max = list(map(int, det_obj["coordinate"])) - img_path = f"imgs/img_in_{label}_box_{x_min}_{y_min}_{x_max}_{y_max}.jpg" + img_path = construct_img_path(label, det_obj["coordinate"]) img = Image.fromarray(original_img[y_min:y_max, x_min:x_max, ::-1]) imgs_in_doc.append( { "path": img_path, "img": img, + "label": label, "coordinate": (x_min, y_min, x_max, y_max), "score": det_obj["score"], } @@ -609,7 +615,7 @@ def convert_formula_res_to_ocr_format(formula_res_list: List, ocr_res: dict): formula_res_text: str = formula_res["rec_formula"] ocr_res["rec_texts"].append(formula_res_text) if ocr_res["rec_boxes"].size == 0: - ocr_res["rec_boxes"] = np.array(formula_res["dt_polys"]) + ocr_res["rec_boxes"] = np.array([formula_res["dt_polys"]]) else: ocr_res["rec_boxes"] = np.vstack( (ocr_res["rec_boxes"], [formula_res["dt_polys"]]) diff --git a/paddlex/inference/pipelines/ocr/pipeline.py b/paddlex/inference/pipelines/ocr/pipeline.py index 33b54514c5..eb78ee5097 100644 --- a/paddlex/inference/pipelines/ocr/pipeline.py +++ b/paddlex/inference/pipelines/ocr/pipeline.py @@ -386,6 +386,11 @@ def predict( ) ] + if return_word_box: + for res in results: + res["text_word"] = [] + res["text_word_region"] 
= [] + indices = list(range(len(doc_preprocessor_images))) indices = [idx for idx in indices if len(dt_polys_list[idx]) > 0] @@ -445,9 +450,6 @@ def predict( ): sub_img_id = sorted_subs_info[i]["sub_img_id"] sub_img_info_list[sub_img_id]["rec_res"] = rec_res - if return_word_box: - res["text_word"] = [] - res["text_word_region"] = [] for sno in range(len(sub_img_info_list)): rec_res = sub_img_info_list[sno]["rec_res"] if rec_res["rec_score"] >= text_rec_score_thresh: diff --git a/paddlex/inference/pipelines/ocr/result.py b/paddlex/inference/pipelines/ocr/result.py index 1d9613a9fe..dc16f2c823 100644 --- a/paddlex/inference/pipelines/ocr/result.py +++ b/paddlex/inference/pipelines/ocr/result.py @@ -31,40 +31,6 @@ class OCRResult(BaseCVResult): """OCR result""" - def get_minarea_rect(self, points: np.ndarray) -> np.ndarray: - """ - Get the minimum area rectangle for the given points using OpenCV. - - Args: - points (np.ndarray): An array of 2D points. - - Returns: - np.ndarray: An array of 2D points representing the corners of the minimum area rectangle - in a specific order (clockwise or counterclockwise starting from the top-left corner). - """ - bounding_box = cv2.minAreaRect(points) - points = sorted(list(cv2.boxPoints(bounding_box)), key=lambda x: x[0]) - - index_a, index_b, index_c, index_d = 0, 1, 2, 3 - if points[1][1] > points[0][1]: - index_a = 0 - index_d = 1 - else: - index_a = 1 - index_d = 0 - if points[3][1] > points[2][1]: - index_b = 2 - index_c = 3 - else: - index_b = 3 - index_c = 2 - - box = np.array( - [points[index_a], points[index_b], points[index_c], points[index_d]] - ).astype(np.int32) - - return box - def _to_img(self) -> Dict[str, Image.Image]: """ Converts the internal data to a PIL Image with detection and recognition results. 
@@ -122,7 +88,7 @@ def _to_img(self) -> Dict[str, Image.Image]: if len(box) > 4: pts = [(x, y) for x, y in box.tolist()] draw_left.polygon(pts, outline=color, width=8, fill=color) - box = self.get_minarea_rect(box) + box = get_minarea_rect(box) height = int(0.5 * (max(box[:, 1]) - min(box[:, 1]))) box[:2, 1] = np.mean(box[:, 1]) box[2:, 1] = np.mean(box[:, 1]) + min(20, height) @@ -291,3 +257,38 @@ def draw_vertical_text(draw, position, text, font, fill=(0, 0, 0), line_spacing= bbox = font.getbbox(char) char_height = bbox[3] - bbox[1] y += char_height + line_spacing + + +def get_minarea_rect(points: np.ndarray) -> np.ndarray: + """ + Get the minimum area rectangle for the given points using OpenCV. + + Args: + points (np.ndarray): An array of 2D points. + + Returns: + np.ndarray: An array of 2D points representing the corners of the minimum area rectangle + in a specific order (clockwise or counterclockwise starting from the top-left corner). + """ + bounding_box = cv2.minAreaRect(points) + points = sorted(list(cv2.boxPoints(bounding_box)), key=lambda x: x[0]) + + index_a, index_b, index_c, index_d = 0, 1, 2, 3 + if points[1][1] > points[0][1]: + index_a = 0 + index_d = 1 + else: + index_a = 1 + index_d = 0 + if points[3][1] > points[2][1]: + index_b = 2 + index_c = 3 + else: + index_b = 3 + index_c = 2 + + box = np.array( + [points[index_a], points[index_b], points[index_c], points[index_d]] + ).astype(np.int32) + + return box diff --git a/paddlex/inference/pipelines/paddleocr_vl/pipeline.py b/paddlex/inference/pipelines/paddleocr_vl/pipeline.py index 10d6e2b296..571393f211 100644 --- a/paddlex/inference/pipelines/paddleocr_vl/pipeline.py +++ b/paddlex/inference/pipelines/paddleocr_vl/pipeline.py @@ -13,6 +13,7 @@ # limitations under the License. 
import queue +import re import threading import time from itertools import chain @@ -31,19 +32,22 @@ from .._parallel import AutoParallelImageSimpleInferencePipeline from ..base import BasePipeline from ..components import CropByBoxes -from ..layout_parsing.utils import gather_imgs -from .result import PaddleOCRVLBlock, PaddleOCRVLResult +from ..layout_parsing.merge_table import merge_tables_across_pages +from ..layout_parsing.title_level import assign_levels_to_parsing_res +from ..layout_parsing.utils import construct_img_path, gather_imgs +from .result import BaseResult, PaddleOCRVLBlock, PaddleOCRVLResult from .uilts import ( convert_otsl_to_html, crop_margin, filter_overlap_boxes, merge_blocks, + post_process_for_spotting, tokenize_figure_of_table, truncate_repetitive_content, untokenize_figure_of_table, ) -IMAGE_LABELS = ["image", "header_image", "footer_image", "seal"] +IMAGE_LABELS = ["image", "header_image", "footer_image"] @benchmark.time_methods @@ -57,6 +61,7 @@ def __init__( pp_option: Optional[PaddlePredictorOption] = None, use_hpip: bool = False, hpi_config: Optional[Union[Dict[str, Any], HPIConfig]] = None, + initial_predictor: bool = True, ) -> None: """ Initializes the class with given configurations and options. @@ -70,72 +75,95 @@ def __init__( hpi_config (Optional[Union[Dict[str, Any], HPIConfig]], optional): The default high-performance inference configuration dictionary. Defaults to None. + initial_predictor (bool, optional): Whether to initialize predictors. """ super().__init__( device=device, pp_option=pp_option, use_hpip=use_hpip, hpi_config=hpi_config ) - self.use_doc_preprocessor = config.get("use_doc_preprocessor", True) - if self.use_doc_preprocessor: - doc_preprocessor_config = config.get("SubPipelines", {}).get( - "DocPreprocessor", - { - "pipeline_config_error": "config error for doc_preprocessor_pipeline!" 
- }, - ) - self.doc_preprocessor_pipeline = self.create_pipeline( - doc_preprocessor_config - ) + if initial_predictor: + self.use_doc_preprocessor = config.get("use_doc_preprocessor", True) + if self.use_doc_preprocessor: + doc_preprocessor_config = config.get("SubPipelines", {}).get( + "DocPreprocessor", + { + "pipeline_config_error": "config error for doc_preprocessor_pipeline!" + }, + ) + self.doc_preprocessor_pipeline = self.create_pipeline( + doc_preprocessor_config + ) - self.use_layout_detection = config.get("use_layout_detection", True) - if self.use_layout_detection: - layout_det_config = config.get("SubModules", {}).get( - "LayoutDetection", - {"model_config_error": "config error for layout_det_model!"}, - ) - model_name = layout_det_config.get("model_name", None) - assert ( - model_name is not None and model_name == "PP-DocLayoutV2" - ), "model_name must be PP-DocLayoutV2" - layout_kwargs = {} - if (threshold := layout_det_config.get("threshold", None)) is not None: - layout_kwargs["threshold"] = threshold - if (layout_nms := layout_det_config.get("layout_nms", None)) is not None: - layout_kwargs["layout_nms"] = layout_nms - if ( - layout_unclip_ratio := layout_det_config.get( - "layout_unclip_ratio", None + self.use_layout_detection = config.get("use_layout_detection", True) + if self.use_layout_detection: + layout_det_config = config.get("SubModules", {}).get( + "LayoutDetection", + {"model_config_error": "config error for layout_det_model!"}, ) - ) is not None: - layout_kwargs["layout_unclip_ratio"] = layout_unclip_ratio - if ( - layout_merge_bboxes_mode := layout_det_config.get( - "layout_merge_bboxes_mode", None + model_name = layout_det_config.get("model_name", None) + assert model_name is not None and model_name in [ + "PP-DocLayoutV2", + "PP-DocLayoutV3", + ], "model_name must be PP-DocLayoutV2 or PP-DocLayoutV3" + layout_kwargs = {} + if (threshold := layout_det_config.get("threshold", None)) is not None: + layout_kwargs["threshold"] = 
threshold + if ( + layout_nms := layout_det_config.get("layout_nms", None) + ) is not None: + layout_kwargs["layout_nms"] = layout_nms + if ( + layout_unclip_ratio := layout_det_config.get( + "layout_unclip_ratio", None + ) + ) is not None: + layout_kwargs["layout_unclip_ratio"] = layout_unclip_ratio + if ( + layout_merge_bboxes_mode := layout_det_config.get( + "layout_merge_bboxes_mode", None + ) + ) is not None: + layout_kwargs["layout_merge_bboxes_mode"] = layout_merge_bboxes_mode + self.layout_det_model = self.create_model( + layout_det_config, **layout_kwargs ) - ) is not None: - layout_kwargs["layout_merge_bboxes_mode"] = layout_merge_bboxes_mode - self.layout_det_model = self.create_model( - layout_det_config, **layout_kwargs - ) - self.use_chart_recognition = config.get("use_chart_recognition", True) + self.use_chart_recognition = config.get("use_chart_recognition", False) + self.use_seal_recognition = config.get("use_seal_recognition", False) - vl_rec_config = config.get("SubModules", {}).get( - "VLRecognition", - {"model_config_error": "config error for vl_rec_model!"}, - ) + vl_rec_config = config.get("SubModules", {}).get( + "VLRecognition", + {"model_config_error": "config error for vl_rec_model!"}, + ) - self.vl_rec_model = self.create_model(vl_rec_config) - self.format_block_content = config.get("format_block_content", False) + self.vl_rec_model = self.create_model(vl_rec_config) + self.format_block_content = config.get("format_block_content", False) + self.use_ocr_for_image_block = config.get("use_ocr_for_image_block", False) - self.batch_sampler = ImageBatchSampler(batch_size=config.get("batch_size", 1)) - self.img_reader = ReadImage(format="BGR") - self.crop_by_boxes = CropByBoxes() + self.batch_sampler = ImageBatchSampler( + batch_size=config.get("batch_size", 1) + ) + self.img_reader = ReadImage(format="BGR") + self.crop_by_boxes = CropByBoxes() - self.use_queues = config.get("use_queues", False) + self.use_queues = config.get("use_queues", 
False) + self.merge_layout_blocks = config.get("merge_layout_blocks", True) + self.markdown_ignore_labels = config.get( + "markdown_ignore_labels", + [ + "number", + "footnote", + "header", + "header_image", + "footer", + "footer_image", + "aside_text", + ], + ) def close(self): - self.vl_rec_model.close() + if hasattr(self, "vl_rec_model"): + self.vl_rec_model.close() def get_model_settings( self, @@ -143,7 +171,11 @@ def get_model_settings( use_doc_unwarping: Union[bool, None], use_layout_detection: Union[bool, None], use_chart_recognition: Union[bool, None], + use_seal_recognition: Union[bool, None], + use_ocr_for_image_block: Union[bool, None], format_block_content: Union[bool, None], + merge_layout_blocks: Union[bool, None], + markdown_ignore_labels: Optional[list[str]] = None, ) -> dict: """ Get the model settings based on the provided parameters or default values. @@ -170,14 +202,30 @@ def get_model_settings( if use_chart_recognition is None: use_chart_recognition = self.use_chart_recognition + if use_seal_recognition is None: + use_seal_recognition = self.use_seal_recognition + + if use_ocr_for_image_block is None: + use_ocr_for_image_block = self.use_ocr_for_image_block + if format_block_content is None: format_block_content = self.format_block_content + if merge_layout_blocks is None: + merge_layout_blocks = self.merge_layout_blocks + + if markdown_ignore_labels is None: + markdown_ignore_labels = self.markdown_ignore_labels + return dict( use_doc_preprocessor=use_doc_preprocessor, use_layout_detection=use_layout_detection, use_chart_recognition=use_chart_recognition, + use_seal_recognition=use_seal_recognition, + use_ocr_for_image_block=use_ocr_for_image_block, format_block_content=format_block_content, + merge_layout_blocks=merge_layout_blocks, + markdown_ignore_labels=markdown_ignore_labels, ) def check_model_settings_valid(self, input_params: dict) -> bool: @@ -205,26 +253,41 @@ def get_layout_parsing_results( layout_det_results, imgs_in_doc, 
use_chart_recognition=False, + use_seal_recognition=False, + use_ocr_for_image_block=False, vlm_kwargs=None, + merge_layout_blocks=True, + layout_shape_mode="auto", ): blocks = [] - block_imgs = [] - text_prompts = [] - vlm_block_ids = [] - figure_token_maps = [] + has_spotting = False drop_figures_set = set() - image_labels = ( - IMAGE_LABELS if use_chart_recognition else IMAGE_LABELS + ["chart"] - ) + min_pixels = vlm_kwargs.pop("min_pixels", None) + default_min_pixels = min_pixels if min_pixels is not None else 112896 + max_pixels = vlm_kwargs.pop("max_pixels", None) + default_max_pixels = max_pixels if max_pixels is not None else 1003520 + + batch_dict_by_pixel = {} + id2pixel_key_map = {} + image_path_to_obj_map = {} + vis_image_labels = IMAGE_LABELS + ["seal"] + image_labels = [] if use_ocr_for_image_block else IMAGE_LABELS + if not use_chart_recognition: + image_labels += ["chart"] + vis_image_labels += ["chart"] + if not use_seal_recognition: + image_labels += ["seal"] for i, (image, layout_det_res, imgs_in_doc_for_img) in enumerate( zip(images, layout_det_results, imgs_in_doc) ): - layout_det_res = filter_overlap_boxes(layout_det_res) + layout_det_res = filter_overlap_boxes(layout_det_res, layout_shape_mode) boxes = layout_det_res["boxes"] - blocks_for_img = self.crop_by_boxes(image, boxes) - blocks_for_img = merge_blocks( - blocks_for_img, non_merge_labels=image_labels + ["table"] - ) + blocks_for_img = self.crop_by_boxes(image, boxes, layout_shape_mode) + del layout_det_res, boxes + if merge_layout_blocks: + blocks_for_img = merge_blocks( + blocks_for_img, non_merge_labels=image_labels + ["table"] + ) blocks.append(blocks_for_img) for j, block in enumerate(blocks_for_img): block_img = block["img"] @@ -232,6 +295,8 @@ def get_layout_parsing_results( if block_label not in image_labels and block_img is not None: figure_token_map = {} text_prompt = "OCR:" + min_pixels = vlm_kwargs.pop("ocr_min_pixels", default_min_pixels) + max_pixels = 
vlm_kwargs.pop("ocr_max_pixels", default_max_pixels) drop_figures = [] if block_label == "table": text_prompt = "Table Recognition:" @@ -240,71 +305,149 @@ def get_layout_parsing_results( block_img, block["box"], imgs_in_doc_for_img ) ) + min_pixels = vlm_kwargs.pop( + "table_min_pixels", default_min_pixels + ) + max_pixels = vlm_kwargs.pop( + "table_max_pixels", default_max_pixels + ) elif block_label == "chart" and use_chart_recognition: text_prompt = "Chart Recognition:" + min_pixels = vlm_kwargs.pop( + "chart_min_pixels", default_min_pixels + ) + max_pixels = vlm_kwargs.pop( + "chart_max_pixels", default_max_pixels + ) elif "formula" in block_label and block_label != "formula_number": text_prompt = "Formula Recognition:" - block_img = crop_margin(block_img) - block_imgs.append(block_img) - text_prompts.append(text_prompt) - figure_token_maps.append(figure_token_map) - vlm_block_ids.append((i, j)) + crop_img = crop_margin(block_img) + w, h, _ = crop_img.shape + if w > 2 and h > 2: + block_img = crop_img + min_pixels = vlm_kwargs.pop( + "formula_min_pixels", default_min_pixels + ) + max_pixels = vlm_kwargs.pop( + "formula_max_pixels", default_max_pixels + ) + elif block_label == "spotting": + text_prompt = "Spotting:" + has_spotting = True + min_pixels = vlm_kwargs.pop( + "spotting_min_pixels", default_min_pixels + ) + max_pixels = vlm_kwargs.pop( + "spotting_max_pixels", default_max_pixels + ) + elif block_label == "seal" and use_seal_recognition: + text_prompt = "Seal Recognition:" + min_pixels = vlm_kwargs.pop( + "seal_min_pixels", default_min_pixels + ) + max_pixels = vlm_kwargs.pop( + "seal_max_pixels", default_max_pixels + ) + pixel_key = (min_pixels, max_pixels) + if pixel_key not in batch_dict_by_pixel: + batch_dict_by_pixel[pixel_key] = { + "images": [], + "queries": [], + "figure_token_maps": [], + "vlm_block_ids": [], + "curr_vlm_block_idx": 0, + } + batch_dict_by_pixel[pixel_key]["images"].append(block_img) + 
batch_dict_by_pixel[pixel_key]["queries"].append(text_prompt) + batch_dict_by_pixel[pixel_key]["figure_token_maps"].append( + figure_token_map + ) + batch_dict_by_pixel[pixel_key]["vlm_block_ids"].append((i, j)) + id2pixel_key_map[(i, j)] = pixel_key drop_figures_set.update(drop_figures) + del blocks_for_img + del images, layout_det_results if vlm_kwargs is None: vlm_kwargs = {} elif vlm_kwargs.get("max_new_tokens", None) is None: vlm_kwargs["max_new_tokens"] = 4096 - kwargs = { - "use_cache": True, - **vlm_kwargs, - } - vl_rec_results = list( - self.vl_rec_model.predict( - [ - { - "image": block_img, - "query": text_prompt, - } - for block_img, text_prompt in zip(block_imgs, text_prompts) - ], - skip_special_tokens=True, - **kwargs, + for pixel_key in batch_dict_by_pixel: + min_pixels, max_pixels = pixel_key + kwargs = { + "use_cache": True, + "min_pixels": min_pixels, + "max_pixels": max_pixels, + **vlm_kwargs, + } + if has_spotting: + kwargs.pop("min_pixels", None) + kwargs.pop("max_pixels", None) + images = batch_dict_by_pixel[pixel_key]["images"] + queries = batch_dict_by_pixel[pixel_key]["queries"] + batch_results = list( + self.vl_rec_model.predict( + [ + { + "image": image, + "query": query, + } + for image, query in zip(images, queries) + ], + skip_special_tokens=False if has_spotting else True, + **kwargs, + ) ) - ) + del images, queries + batch_dict_by_pixel[pixel_key]["vlm_results"] = batch_results parsing_res_lists = [] table_res_lists = [] - curr_vlm_block_idx = 0 + spotting_res_list = [] + table_blocks = [] for i, blocks_for_img in enumerate(blocks): parsing_res_list = [] table_res_list = [] + spotting_res = {} for j, block in enumerate(blocks_for_img): block_img = block["img"] block_bbox = block["box"] block_label = block["label"] block_content = "" - if curr_vlm_block_idx < len(vlm_block_ids) and vlm_block_ids[ - curr_vlm_block_idx - ] == (i, j): - vl_rec_result = vl_rec_results[curr_vlm_block_idx] - figure_token_map = 
figure_token_maps[curr_vlm_block_idx] - block_img4vl = block_imgs[curr_vlm_block_idx] + figure_token_map = {} + if (i, j) in id2pixel_key_map: + pixel_key = id2pixel_key_map[(i, j)] + pixel_info = batch_dict_by_pixel[pixel_key] + curr_vlm_block_idx = pixel_info["curr_vlm_block_idx"] + assert curr_vlm_block_idx < len( + pixel_info["vlm_block_ids"] + ) and pixel_info["vlm_block_ids"][curr_vlm_block_idx] == (i, j) + vl_rec_result = pixel_info["vlm_results"][curr_vlm_block_idx] + block_img4vl = pixel_info["images"][curr_vlm_block_idx] + figure_token_map = pixel_info["figure_token_maps"][ + curr_vlm_block_idx + ] curr_vlm_block_idx += 1 + pixel_info["curr_vlm_block_idx"] = curr_vlm_block_idx vl_rec_result["image"] = block_img4vl result_str = vl_rec_result.get("result", "") if result_str is None: result_str = "" - result_str = truncate_repetitive_content(result_str) + min_count = 5000 if block_label == "table" else 50 + result_str = truncate_repetitive_content( + result_str, min_count=min_count + ) if ("\\(" in result_str and "\\)" in result_str) or ( "\\[" in result_str and "\\]" in result_str ): result_str = result_str.replace("$", "") result_str = ( - result_str.replace("\(", " $ ") - .replace("\\)", " $ ") + result_str.replace("\\(", " $ ") + .replace("\\)", " $") + .replace("\\[\\[", "\\[") + .replace("\\]\\]", "\\]") .replace("\\[", " $$ ") .replace("\\]", " $$ ") ) @@ -314,20 +457,30 @@ def get_layout_parsing_results( html_str = convert_otsl_to_html(result_str) if html_str != "": result_str = html_str - result_str = untokenize_figure_of_table( - result_str, figure_token_map + if block_label == "spotting": + h, w = block_img.shape[:2] + result_str, spotting_res = post_process_for_spotting( + result_str, w, h ) block_content = result_str - block_info = PaddleOCRVLBlock( label=block_label, bbox=block_bbox, content=block_content, + group_id=block.get("group_id", None), + polygon_points=block.get("polygon_points", None), ) - if block_label in image_labels and block_img 
is not None: - x_min, y_min, x_max, y_max = list(map(int, block_bbox)) - img_path = f"imgs/img_in_{block_label}_box_{x_min}_{y_min}_{x_max}_{y_max}.jpg" + if block_label == "table": + table_blocks.append( + { + "figure_token_map": figure_token_map, + "block": block_info, + } + ) + if block_label in vis_image_labels and block_img is not None: + img_path = construct_img_path(block["label"], block["box"]) + image_path_to_obj_map[img_path] = block_info if img_path not in drop_figures_set: import cv2 @@ -340,10 +493,25 @@ def get_layout_parsing_results( continue parsing_res_list.append(block_info) + del block_info, block_img + # TODO(changdazhou): append table res to table_res_list + for blk_info in table_blocks: + block = blk_info["block"] + figure_token_map = blk_info["figure_token_map"] + block.content = untokenize_figure_of_table( + block.content, figure_token_map, image_path_to_obj_map + ) parsing_res_lists.append(parsing_res_list) table_res_lists.append(table_res_list) - - return parsing_res_lists, table_res_lists, imgs_in_doc + spotting_res_list.append(spotting_res) + del parsing_res_list, table_res_list, spotting_res + + return ( + parsing_res_lists, + table_res_lists, + spotting_res_list, + imgs_in_doc, + ) def predict( self, @@ -352,10 +520,13 @@ def predict( use_doc_unwarping: Union[bool, None] = False, use_layout_detection: Union[bool, None] = None, use_chart_recognition: Union[bool, None] = None, + use_seal_recognition: Union[bool, None] = None, + use_ocr_for_image_block: Union[bool, None] = None, layout_threshold: Optional[Union[float, dict]] = None, layout_nms: Optional[bool] = None, layout_unclip_ratio: Optional[Union[float, Tuple[float, float], dict]] = None, layout_merge_bboxes_mode: Optional[str] = None, + layout_shape_mode: Optional[str] = "auto", use_queues: Optional[bool] = None, prompt_label: Optional[Union[str, None]] = None, format_block_content: Union[bool, None] = None, @@ -365,6 +536,9 @@ def predict( min_pixels: Optional[int] = None, 
max_pixels: Optional[int] = None, max_new_tokens: Optional[int] = None, + merge_layout_blocks: Optional[bool] = None, + markdown_ignore_labels: Optional[list[str]] = None, + vlm_extra_args: Optional[dict] = None, **kwargs, ) -> PaddleOCRVLResult: """ @@ -375,6 +549,9 @@ def predict( numpy array of an image, or list of numpy arrays. use_doc_orientation_classify (Optional[bool]): Whether to use document orientation classification. use_doc_unwarping (Optional[bool]): Whether to use document unwarping. + use_layout_detection (Optional[bool]): Whether to use layout detection. Default is None. + use_chart_recognition (Optional[bool]): Whether to use chart recognition. Default is None. + use_seal_recognition (Optional[bool]): Whether to use seal recognition. Default is None. layout_threshold (Optional[float]): The threshold value to filter out low-confidence predictions. Default is None. layout_nms (bool, optional): Whether to use layout-aware NMS. Defaults to False. layout_unclip_ratio (Optional[Union[float, Tuple[float, float]]], optional): The ratio of unclipping the bounding box. @@ -383,6 +560,7 @@ def predict( If it's a tuple of two numbers, then they are used separately for width and height respectively. If it's None, then no unclipping will be performed. layout_merge_bboxes_mode (Optional[str], optional): The mode for merging bounding boxes. Defaults to None. + layout_shape_mode (Optional[str], optional): The mode for layout shape. Defaults to "auto", [ "rect", "quad","poly", "auto"] are supported. use_queues (Optional[bool], optional): Whether to use queues. Defaults to None. prompt_label (Optional[Union[str, None]], optional): The label of the prompt in ['ocr', 'formula', 'table', 'chart']. Defaults to None. format_block_content (Optional[bool]): Whether to format the block content. Default is None. @@ -392,6 +570,8 @@ def predict( min_pixels (Optional[int]): The minimum number of pixels allowed when the VL model preprocesses images. Default is None. 
max_pixels (Optional[int]): The maximum number of pixels allowed when the VL model preprocesses images. Default is None. max_new_tokens (Optional[int]): The maximum number of new tokens. Default is None. + merge_layout_blocks (Optional[bool]): Whether to merge layout blocks. Default is None. + markdown_ignore_labels (Optional[list[str]]): The list of ignored markdown labels. Default is None. **kwargs (Any): Additional settings to extend functionality. Returns: @@ -402,7 +582,15 @@ def predict( use_doc_unwarping, use_layout_detection, use_chart_recognition, + use_seal_recognition, + use_ocr_for_image_block, format_block_content, + merge_layout_blocks, + markdown_ignore_labels, + ) + + model_settings["return_layout_polygon_points"] = ( + False if layout_shape_mode == "rect" else True ) if not self.check_model_settings_valid(model_settings): @@ -411,15 +599,22 @@ def predict( if use_queues is None: use_queues = self.use_queues + if vlm_extra_args is None: + vlm_extra_args = {} + if not model_settings["use_layout_detection"]: prompt_label = prompt_label if prompt_label else "ocr" if prompt_label.lower() == "chart": model_settings["use_chart_recognition"] = True + elif prompt_label.lower() == "seal": + model_settings["use_seal_recognition"] = True assert prompt_label.lower() in [ "ocr", "formula", "table", "chart", + "spotting", + "seal", ], f"Layout detection is disabled (use_layout_detection=False). 'prompt_label' must be one of ['ocr', 'formula', 'table', 'chart'], but got '{prompt_label}'." 
def _process_cv(batch_data, new_batch_size=None): @@ -430,6 +625,7 @@ def _process_cv(batch_data, new_batch_size=None): instances = batch_data.instances[idx : idx + new_batch_size] input_paths = batch_data.input_paths[idx : idx + new_batch_size] page_indexes = batch_data.page_indexes[idx : idx + new_batch_size] + page_counts = batch_data.page_counts[idx : idx + new_batch_size] image_arrays = self.img_reader(instances) @@ -449,7 +645,6 @@ def _process_cv(batch_data, new_batch_size=None): doc_preprocessor_images = [ item["output_img"] for item in doc_preprocessor_results ] - if model_settings["use_layout_detection"]: layout_det_results = list( self.layout_det_model( @@ -458,6 +653,8 @@ def _process_cv(batch_data, new_batch_size=None): layout_nms=layout_nms, layout_unclip_ratio=layout_unclip_ratio, layout_merge_bboxes_mode=layout_merge_bboxes_mode, + layout_shape_mode=layout_shape_mode, + filter_overlap_boxes=False, ) ) @@ -491,61 +688,78 @@ def _process_cv(batch_data, new_batch_size=None): ) imgs_in_doc = [[] for _ in layout_det_results] - yield input_paths, page_indexes, doc_preprocessor_images, doc_preprocessor_results, layout_det_results, imgs_in_doc + yield input_paths, page_indexes, page_counts, doc_preprocessor_images, doc_preprocessor_results, layout_det_results, imgs_in_doc def _process_vlm(results_cv): ( input_paths, page_indexes, + page_counts, doc_preprocessor_images, doc_preprocessor_results, layout_det_results, imgs_in_doc, ) = results_cv - parsing_res_lists, table_res_lists, imgs_in_doc = ( - self.get_layout_parsing_results( - doc_preprocessor_images, - layout_det_results, - imgs_in_doc, - model_settings["use_chart_recognition"], - { - "repetition_penalty": repetition_penalty, - "temperature": temperature, - "top_p": top_p, - "min_pixels": min_pixels, - "max_pixels": max_pixels, - "max_new_tokens": max_new_tokens, - }, - ) + ( + parsing_res_lists, + table_res_lists, + spotting_res_list, + imgs_in_doc, + ) = self.get_layout_parsing_results( + 
images=doc_preprocessor_images, + layout_det_results=layout_det_results, + imgs_in_doc=imgs_in_doc, + use_chart_recognition=model_settings["use_chart_recognition"], + use_seal_recognition=model_settings["use_seal_recognition"], + use_ocr_for_image_block=model_settings["use_ocr_for_image_block"], + vlm_kwargs={ + "repetition_penalty": repetition_penalty, + "temperature": temperature, + "top_p": top_p, + "min_pixels": min_pixels, + "max_pixels": max_pixels, + "max_new_tokens": max_new_tokens, + **vlm_extra_args, + }, + merge_layout_blocks=model_settings["merge_layout_blocks"], + layout_shape_mode=layout_shape_mode, ) for ( input_path, page_index, + page_count, doc_preprocessor_image, doc_preprocessor_res, layout_det_res, table_res_list, parsing_res_list, + spotting_res, imgs_in_doc_for_img, ) in zip( input_paths, page_indexes, + page_counts, doc_preprocessor_images, doc_preprocessor_results, layout_det_results, table_res_lists, parsing_res_lists, + spotting_res_list, imgs_in_doc, ): single_img_res = { "input_path": input_path, "page_index": page_index, + "page_count": page_count, + "width": doc_preprocessor_image.shape[1], + "height": doc_preprocessor_image.shape[0], "doc_preprocessor_res": doc_preprocessor_res, "layout_det_res": layout_det_res, "table_res_list": table_res_list, "parsing_res_list": parsing_res_list, + "spotting_res": spotting_res, "imgs_in_doc": imgs_in_doc_for_img, "model_settings": model_settings, } @@ -575,6 +789,7 @@ def _worker_input(input_): break else: queue_input.put((True, batch_data)) + del batch_data event_data_loading_done.set() def _worker_cv(): @@ -599,6 +814,8 @@ def _worker_cv(): ), ): queue_cv.put((True, results_cv)) + del results_cv + del item except Exception as e: queue_cv.put((False, "cv", e)) break @@ -627,7 +844,8 @@ def _worker_vlm(): should_break = True break results_cv_list.append(item[1]) - for res in results_cv_list[-1][4]: + del item + for res in results_cv_list[-1][5]: num_boxes += len(res["boxes"]) if num_boxes >= 
MAX_NUM_BOXES: break @@ -643,10 +861,13 @@ def _worker_vlm(): list(chain.from_iterable(lists)) for lists in zip(*results_cv_list) ] + del results_cv_list try: for result_vlm in _process_vlm(merged_results_cv): queue_vlm.put((True, result_vlm)) + del result_vlm + del merged_results_cv except Exception as e: queue_vlm.put((False, "vlm", e)) break @@ -682,9 +903,13 @@ def _worker_vlm(): results_cv = results_cv_list[0] for res in _process_vlm(results_cv): yield res + del res, results_cv, results_cv_list, batch_data finally: if use_queues: event_shutdown.set() + thread_input.join(timeout=5) + if thread_input.is_alive(): + logging.warning("Input worker did not terminate in time") thread_cv.join(timeout=5) if thread_cv.is_alive(): logging.warning("CV worker did not terminate in time") @@ -709,14 +934,144 @@ def concatenate_markdown_pages(self, markdown_list: list) -> tuple: return markdown_texts + def concatenate_pages( + self, + res_list: list, + merge_table: bool = True, + title_level: bool = True, + merge_pages: bool = False, + ): + """Concatenate layout parsing results from multiple pages. -@pipeline_requires_extra("ocr") -class PaddleOCRVLPipeline(AutoParallelImageSimpleInferencePipeline): - entities = "PaddleOCR-VL" + Args: + res_list: List of page parsing results + merge_talble: Whether to merge tables across pages + title_level: Whether to assign title levels + merge_pages: Whether to concatenate pages using the new consolidate_pages() logic + + Returns: + PaddleOCRVLResult: Combined OCR-VL result after merge_table or title_level policy + """ + logging.warning( + f"DeprecationWarning: `concatenate_pages()` is deprecated as of v3.3.14 and will be removed in v3.4.0. Please use `restructure_pages()` instead. It provides better support for table merging and title restructuring." 
+ ) + return self.restructure_pages(res_list, merge_table, title_level, merge_pages) + + def restructure_pages( + self, + res_list: list, + merge_tables: bool = True, + relevel_titles: bool = True, + concatenate_pages: bool = False, + ): + """Restructure layout parsing results from multiple pages. + Args: + res_list: List of page parsing results + merge_tables: Whether to merge tables across pages + relevel_titles: Whether to relevel titles + concatenate_pages: Whether to concatenate pages to a single document + + Returns: + PaddleOCRVLResult: Combined OCR-VL result after merge_tables or relevel_titles policy + """ + + if len(res_list) == 0: + return [] + + def _get_img_obj(block, model_settings): + if block.get("image", None): + return block["image"] + if block["block_label"] in ("image", "seal") or ( + block["block_label"] == "chart" + and not model_settings.get("use_chart_recognition", False) + ): + path = construct_img_path(block["block_label"], block["block_bbox"]) + return {"path": path, "img": None} + return None + + def _conver_blocks_to_obj(blocks, model_settings): + res = [] + for block in blocks: + obj = PaddleOCRVLBlock( + label=block["block_label"], + bbox=block["block_bbox"], + polygon_points=block.get("block_polygon_points", None), + content=re.sub(r"^#+\s", "", block["block_content"]), + group_id=block.get("group_id", None), + ) + if img := _get_img_obj(block, model_settings): + obj.image = img + res.append(obj) + return res + + global_block_id = 0 + obj_res_list = [] + for one_page_res in res_list: + if not isinstance(one_page_res, BaseResult): + one_page_res = one_page_res["res"] + one_page_res["imgs_in_doc"] = [] + blocks = one_page_res.get("parsing_res_list", []) + model_settings = one_page_res.get("model_settings", {}) + blocks = _conver_blocks_to_obj(blocks, model_settings) + else: + blocks = one_page_res["parsing_res_list"] + parsing_res_list = [] + for block in blocks: + block.global_block_id = global_block_id + block.global_group_id = 
global_block_id + global_block_id += 1 + parsing_res_list.append(block) + + one_page_res["parsing_res_list"] = parsing_res_list + obj_res_list.append(one_page_res) + res_list = obj_res_list + + blocks_by_page = [res["parsing_res_list"] for res in res_list] + + if merge_tables: + blocks_by_page = merge_tables_across_pages(blocks_by_page) + if relevel_titles: + blocks_by_page = assign_levels_to_parsing_res(blocks_by_page) + + concatenate_res = [] + if concatenate_pages: + all_page_res = res_list[0] + all_page_res["parsing_res_list"] = [ + blk for blks in blocks_by_page for blk in blks + ] + all_page_res["page_index"] = None + all_page_res["page_count"] = len(res_list) + if model_settings["use_layout_detection"]: + all_page_res["layout_det_res"] = [ + res["layout_det_res"] for res in res_list + ] + if model_settings["use_doc_preprocessor"]: + all_page_res["doc_preprocessor_res"] = [ + res["doc_preprocessor_res"] for res in res_list + ] + concatenate_res.append(PaddleOCRVLResult(all_page_res)) + else: + for page_idx, one_page_res in enumerate(res_list): + one_page_res["parsing_res_list"] = blocks_by_page[page_idx] + concatenate_res.append(PaddleOCRVLResult(one_page_res)) + yield from concatenate_res + + +class _BasePaddleOCRVLPipeline(AutoParallelImageSimpleInferencePipeline): @property def _pipeline_cls(self): return _PaddleOCRVLPipeline def _get_batch_size(self, config): return config.get("batch_size", 1) + + +@pipeline_requires_extra("ocr") +class PaddleOCRVLPipeline(_BasePaddleOCRVLPipeline): + entities = "PaddleOCR-VL" + + +@pipeline_requires_extra("ocr") +class PaddleOCRVL15Pipeline(_BasePaddleOCRVLPipeline): + entities = "PaddleOCR-VL-1.5" diff --git a/paddlex/inference/pipelines/paddleocr_vl/result.py b/paddlex/inference/pipelines/paddleocr_vl/result.py index 69f1762559..b07264ec13 100644 --- a/paddlex/inference/pipelines/paddleocr_vl/result.py +++ b/paddlex/inference/pipelines/paddleocr_vl/result.py @@ -14,14 +14,18 @@ from __future__ import annotations 
+import random from functools import partial import numpy as np -from PIL import Image, ImageDraw, ImageFont +from PIL import Image, ImageDraw -from ....utils.fonts import PINGFANG_FONT +from ....utils import logging +from ....utils.deps import class_requires_deps, is_dep_available +from ....utils.fonts import SIMFANG_FONT from ...common.result import ( BaseCVResult, + BaseResult, HtmlMixin, JsonMixin, MarkdownMixin, @@ -32,32 +36,44 @@ format_first_line_func, format_image_plain_func, format_image_scaled_by_html_func, + format_para_title_func, format_text_plain_func, format_title_func, simplify_table_func, ) - -VISUALIZE_INDEX_LABELS = [ - "text", - "formula", - "inline_formula", - "display_formula", - "algorithm", - "reference", - "reference_content", - "content", - "abstract", - "paragraph_title", - "doc_title", - "vertical_text", - "ocr", +from ..ocr.result import draw_box_txt_fine, get_minarea_rect + +SKIP_ORDER_LABELS = [ + "figure_title", + "vision_footnote", + "image", + "chart", + "table", + "header", + "header_image", + "footer", + "footer_image", + "footnote", + "aside_text", ] +if is_dep_available("opencv-contrib-python"): + import cv2 + class PaddleOCRVLBlock(object): """PaddleOCRVL Block Class""" - def __init__(self, label, bbox, content="") -> None: + def __init__( + self, + label, + bbox, + content="", + group_id=None, + polygon_points=None, + global_block_id=None, + global_group_id=None, + ) -> None: """ Initialize a PaddleOCRVLBlock object. @@ -70,6 +86,10 @@ def __init__(self, label, bbox, content="") -> None: self.bbox = list(map(int, bbox)) self.content = content self.image = None + self.polygon_points = polygon_points + self.group_id = group_id + self.global_block_id = global_block_id + self.global_group_id = global_group_id def __str__(self) -> str: """ @@ -86,6 +106,23 @@ def __repr__(self) -> str: return _str +def clean_latex_delimiters(formula): + """Clean LaTeX delimiters from formula string. 
+ + Handles common LaTeX math delimiters: + - $$ ... $$ (display math) + - \\[ ... \\] (display math) + - $ ... $ (inline math) + """ + # Remove $$ delimiters + formula = formula.replace("$$", "") + # Remove \[ \] delimiters (display math) + formula = formula.replace(r"\[", "").replace(r"\]", "") + # Remove leading/trailing $ (inline math) + formula = formula.strip("$") + return formula.strip() + + def merge_formula_and_number(formula, formula_number): """ Merge a formula and its formula number for display. @@ -97,17 +134,17 @@ def merge_formula_and_number(formula, formula_number): Returns: str: The merged formula with tag. """ - formula = formula.replace("$$", "") + formula = clean_latex_delimiters(formula) merge_formula = r"{} \tag*{{{}}}".format(formula, formula_number) return f"$${merge_formula}$$" def format_chart2table_func(block): lines_list = block.content.split("\n") - # 提取表头和内容 + # get header and rows header = lines_list[0].split("|") rows = [line.split("|") for line in lines_list[1:]] - # 构造HTML表格 + # construct html table html = "
\n" html += ( " " @@ -132,11 +169,18 @@ def format_chart2table_func(block): def format_table_center_func(block): tabel_content = block.content + + tabel_content = tabel_content.replace( + "
", "
" + ) + tabel_content = tabel_content.replace( - "
", "
" + "
", "" ) - tabel_content = tabel_content.replace("", "") - tabel_content = tabel_content.replace("", "") + tabel_content = tabel_content.replace( + "", "" + ) + return tabel_content @@ -164,7 +208,7 @@ def build_handle_funcs_dict( dict: A mapping from block label to handler function. """ return { - "paragraph_title": format_title_func, + "paragraph_title": format_para_title_func, "abstract_title": format_title_func, "reference_title": format_title_func, "content_title": format_title_func, @@ -208,9 +252,18 @@ def build_handle_funcs_dict( ), "algorithm": lambda block: block.content.strip("\n"), "seal": seal_func, + "spotting": lambda block: block.content, + "number": format_text_plain_func, + "footnote": format_text_plain_func, + "header": format_text_plain_func, + "header_image": image_func, + "footer": format_text_plain_func, + "footer_image": image_func, + "aside_text": format_text_plain_func, } +@class_requires_deps("opencv-contrib-python") class PaddleOCRVLResult(BaseCVResult, HtmlMixin, XlsxMixin, MarkdownMixin): """ PaddleOCRVLResult class for holding and formatting OCR/VL parsing results. @@ -228,6 +281,12 @@ def __init__(self, data) -> None: XlsxMixin.__init__(self) MarkdownMixin.__init__(self) JsonMixin.__init__(self) + markdown_ignore_labels = self["model_settings"].get( + "markdown_ignore_labels", [] + ) + self.skip_order_labels = [ + label for label in SKIP_ORDER_LABELS + markdown_ignore_labels + ] def _to_img(self) -> dict[str, np.ndarray]: """ @@ -236,39 +295,71 @@ def _to_img(self) -> dict[str, np.ndarray]: Returns: dict: Keys are names, values are numpy arrays (images). 
""" - from ..layout_parsing.utils import get_show_color res_img_dict = {} model_settings = self["model_settings"] if model_settings["use_doc_preprocessor"]: - for key, value in self["doc_preprocessor_res"].img.items(): - res_img_dict[key] = value + if isinstance(self["doc_preprocessor_res"], BaseResult): + for key, value in self["doc_preprocessor_res"].img.items(): + res_img_dict[key] = value + if isinstance(self["doc_preprocessor_res"], list): + for idx, doc_preprocessor_res in enumerate( + self["doc_preprocessor_res"] + ): + if isinstance(doc_preprocessor_res, BaseResult): + for key, value in doc_preprocessor_res.img.items(): + res_img_dict[f"{key}_{idx}"] = value if self["model_settings"]["use_layout_detection"]: - res_img_dict["layout_det_res"] = self["layout_det_res"].img["res"] - - # for layout ordering image - image = Image.fromarray(self["doc_preprocessor_res"]["output_img"][:, :, ::-1]) - draw = ImageDraw.Draw(image, "RGBA") - font_size = int(0.018 * int(image.width)) + 2 - font = ImageFont.truetype(PINGFANG_FONT.path, font_size, encoding="utf-8") - parsing_result = self["parsing_res_list"] - order_index = 0 - for block in parsing_result: - bbox = block.bbox - label = block.label - fill_color = get_show_color(label, False) - draw.rectangle(bbox, fill=fill_color) - if label in VISUALIZE_INDEX_LABELS: - text_position = (bbox[2] + 2, bbox[1] - font_size // 2) - if int(image.width) - bbox[2] < font_size: - text_position = ( - int(bbox[2] - font_size * 1.1), - bbox[1] - font_size // 2, + if isinstance(self["layout_det_res"], BaseResult): + res_img_dict["layout_det_res"] = self["layout_det_res"].img["res"] + if isinstance(self["layout_det_res"], list): + for idx, layout_res in enumerate(self["layout_det_res"]): + if isinstance(layout_res, BaseResult): + res_img_dict[f"layout_det_res_{idx}"] = layout_res.img["res"] + + if self.get("spotting_res") and not isinstance(self["spotting_res"], list): + boxes = self["spotting_res"]["rec_polys"] + txts = 
self["spotting_res"]["rec_texts"] + image = self["doc_preprocessor_res"]["output_img"][:, :, ::-1] + h, w = image.shape[0:2] + img_left = Image.fromarray(image) + img_right = np.ones((h, w, 3), dtype=np.uint8) * 255 + random.seed(0) + draw_left = ImageDraw.Draw(img_left) + vis_font = SIMFANG_FONT + for idx, (box, txt) in enumerate(zip(boxes, txts)): + try: + color = ( + random.randint(0, 255), + random.randint(0, 255), + random.randint(0, 255), ) - draw.text(text_position, str(order_index + 1), font=font, fill="red") - order_index += 1 - - res_img_dict["layout_order_res"] = image + box = np.array(box) + if len(box) > 4: + pts = [(x, y) for x, y in box.tolist()] + draw_left.polygon(pts, outline=color, width=8, fill=color) + box = get_minarea_rect(box) + height = int(0.5 * (max(box[:, 1]) - min(box[:, 1]))) + box[:2, 1] = np.mean(box[:, 1]) + box[2:, 1] = np.mean(box[:, 1]) + min(20, height) + else: + box_pts = [(int(x), int(y)) for x, y in box.tolist()] + draw_left.polygon(box_pts, fill=color) + if isinstance(txt, tuple): + txt = txt[0] + img_right_text = draw_box_txt_fine((w, h), box, txt, vis_font.path) + pts = np.array(box, np.int32).reshape((-1, 1, 2)) + cv2.polylines(img_right_text, [pts], True, color, 1) + img_right = cv2.bitwise_and(img_right, img_right_text) + except: + continue + + img_left = Image.blend(Image.fromarray(image), img_left, 0.5) + img_show = Image.new("RGB", (w * 2, h), (255, 255, 255)) + img_show.paste(img_left, (0, 0, w, h)) + img_show.paste(Image.fromarray(img_right), (w, 0, w * 2, h)) + + res_img_dict["spotting_res_img"] = img_show return res_img_dict @@ -280,7 +371,7 @@ def _to_html(self) -> dict[str, str]: dict: The str type HTML representation result. 
""" res_html_dict = {} - if len(self["table_res_list"]) > 0: + if self.get("table_res_list") and len(self["table_res_list"]) > 0: for sno in range(len(self["table_res_list"])): table_res = self["table_res_list"][sno] table_region_id = table_res["table_region_id"] @@ -296,7 +387,7 @@ def _to_xlsx(self) -> dict[str, str]: dict: The str type XLSX representation result. """ res_xlsx_dict = {} - if len(self["table_res_list"]) > 0: + if self.get("table_res_list") and len(self["table_res_list"]) > 0: for sno in range(len(self["table_res_list"])): table_res = self["table_res_list"][sno] table_region_id = table_res["table_region_id"] @@ -318,6 +409,9 @@ def _to_str(self, *args, **kwargs) -> dict[str, str]: data = {} data["input_path"] = self["input_path"] data["page_index"] = self["page_index"] + data["page_count"] = self["page_count"] + data["width"] = self["width"] + data["height"] = self["height"] model_settings = self["model_settings"] data["model_settings"] = model_settings if self["model_settings"]["use_doc_preprocessor"]: @@ -347,13 +441,22 @@ def _to_json(self, *args, **kwargs) -> dict[str, str]: Returns: dict: A dictionary containing the object's data in JSON format. 
""" + _keep_img = kwargs.pop("keep_img", False) + data = {} data["input_path"] = self["input_path"] data["page_index"] = self["page_index"] + data["page_count"] = self["page_count"] + data["width"] = self["width"] + data["height"] = self["height"] model_settings = self["model_settings"] data["model_settings"] = model_settings + use_seal_recognition = self["model_settings"].get("use_seal_recognition", False) if self["model_settings"].get("format_block_content", False): - original_image_width = self["doc_preprocessor_res"]["output_img"].shape[1] + original_image_width = self["width"] + use_ocr_for_image_block = self["model_settings"].get( + "use_ocr_for_image_block", False + ) format_text_func = lambda block: format_centered_by_html( format_text_plain_func(block) ) @@ -361,7 +464,18 @@ def _to_json(self, *args, **kwargs) -> dict[str, str]: format_image_scaled_by_html_func( block, original_image_width=original_image_width, - ) + show_ocr_content=use_ocr_for_image_block, + ), + remove_symbol=not use_ocr_for_image_block, + ) + + format_seal_func = lambda block: format_centered_by_html( + format_image_scaled_by_html_func( + block, + original_image_width=original_image_width, + show_ocr_content=True, + ), + remove_symbol=use_seal_recognition, ) if self["model_settings"].get("use_chart_recognition", False): @@ -369,10 +483,11 @@ def _to_json(self, *args, **kwargs) -> dict[str, str]: else: format_chart_func = format_image_func - format_seal_func = format_image_func + if not self["model_settings"].get("use_layout_detection", False): + format_seal_func = format_text_func format_table_func = lambda block: "\n" + format_table_center_func(block) - format_formula_func = lambda block: block.content + format_formula_func = lambda block: clean_latex_delimiters(block.content) handle_funcs_dict = build_handle_funcs_dict( text_func=format_text_func, @@ -388,7 +503,7 @@ def _to_json(self, *args, **kwargs) -> dict[str, str]: order_index = 1 for idx, parsing_res in 
enumerate(parsing_res_list): label = parsing_res.label - if label in VISUALIZE_INDEX_LABELS: + if label not in self.skip_order_labels: order = order_index order_index += 1 else: @@ -399,7 +514,26 @@ def _to_json(self, *args, **kwargs) -> dict[str, str]: "block_bbox": parsing_res.bbox, "block_id": idx, "block_order": order, + "group_id": ( + parsing_res.group_id if parsing_res.group_id is not None else idx + ), } + if ( + hasattr(parsing_res, "global_block_id") + and parsing_res.global_block_id is not None + ): + res_dict["global_block_id"] = parsing_res.global_block_id + if ( + hasattr(parsing_res, "global_group_id") + and parsing_res.global_group_id is not None + ): + res_dict["global_group_id"] = parsing_res.global_group_id + if parsing_res.polygon_points is not None: + res_dict["block_polygon_points"] = parsing_res.polygon_points + + if _keep_img and parsing_res.image is not None: + res_dict["image"] = parsing_res.image + if self["model_settings"].get("format_block_content", False): if handle_funcs_dict.get(parsing_res.label): res_dict["block_content"] = handle_funcs_dict[parsing_res.label]( @@ -410,10 +544,21 @@ def _to_json(self, *args, **kwargs) -> dict[str, str]: parsing_res_list_json.append(res_dict) data["parsing_res_list"] = parsing_res_list_json + if self.get("spotting_res"): + if isinstance(self["spotting_res"], list): + data["spotting_res"] = [res for res in self["spotting_res"]] + else: + data["spotting_res"] = self["spotting_res"] if self["model_settings"]["use_doc_preprocessor"]: - data["doc_preprocessor_res"] = self["doc_preprocessor_res"].json["res"] + if isinstance(self["doc_preprocessor_res"], BaseResult): + data["doc_preprocessor_res"] = self["doc_preprocessor_res"].json["res"] + else: + data["doc_preprocessor_res"] = self["doc_preprocessor_res"] if self["model_settings"]["use_layout_detection"]: - data["layout_det_res"] = self["layout_det_res"].json["res"] + if isinstance(self["layout_det_res"], BaseResult): + data["layout_det_res"] = 
self["layout_det_res"].json["res"] + else: + data["layout_det_res"] = self["layout_det_res"] return JsonMixin._to_json(data, *args, **kwargs) def _to_markdown(self, pretty=True, show_formula_number=False) -> dict: @@ -427,7 +572,15 @@ def _to_markdown(self, pretty=True, show_formula_number=False) -> dict: Returns: dict: Markdown information with text and images. """ - original_image_width = self["doc_preprocessor_res"]["output_img"].shape[1] + + use_ocr_for_image_block = self["model_settings"].get( + "use_ocr_for_image_block", False + ) + use_seal_recognition = self["model_settings"].get("use_seal_recognition", False) + if isinstance(self["width"], list): + original_image_width = self["width"][0] + else: + original_image_width = self["width"] if pretty: format_text_func = lambda block: format_centered_by_html( @@ -437,11 +590,159 @@ def _to_markdown(self, pretty=True, show_formula_number=False) -> dict: format_image_scaled_by_html_func( block, original_image_width=original_image_width, + show_ocr_content=use_ocr_for_image_block, + ), + remove_symbol=not use_ocr_for_image_block, + ) + format_seal_func = lambda block: format_centered_by_html( + format_image_scaled_by_html_func( + block, + original_image_width=original_image_width, + show_ocr_content=use_seal_recognition, + ), + remove_symbol=False, + ) + else: + format_text_func = lambda block: block.content + format_image_func = lambda block: format_image_plain_func( + block, use_ocr_for_image_block + ) + format_seal_func = lambda block: format_image_plain_func( + block, use_seal_recognition + ) + + format_chart_func = ( + format_chart2table_func + if self["model_settings"]["use_chart_recognition"] + else format_image_func + ) + + if not self["model_settings"].get("use_layout_detection", False): + format_seal_func = format_text_func + + if pretty: + format_table_func = lambda block: "\n" + format_table_center_func(block) + else: + format_table_func = lambda block: simplify_table_func("\n" + block.content) + + 
format_formula_func = lambda block: clean_latex_delimiters(block.content) + + handle_funcs_dict = build_handle_funcs_dict( + text_func=format_text_func, + image_func=format_image_func, + chart_func=format_chart_func, + table_func=format_table_func, + formula_func=format_formula_func, + seal_func=format_seal_func, + ) + for label in self["model_settings"].get("markdown_ignore_labels", []): + handle_funcs_dict.pop(label, None) + + markdown_content = "" + markdown_info = {} + markdown_info["markdown_images"] = {} + for idx, block in enumerate(self["parsing_res_list"]): + label = block.label + if block.image is not None: + markdown_info["markdown_images"][block.image["path"]] = block.image[ + "img" + ] + handle_func = handle_funcs_dict.get(label, None) + if ( + show_formula_number + and (label == "display_formula" or label == "formula") + and idx != len(self["parsing_res_list"]) - 1 + ): + next_block = self["parsing_res_list"][idx + 1] + next_block_label = next_block.label + if next_block_label == "formula_number": + block.content = merge_formula_and_number( + block.content, next_block.content + ) + if handle_func: + markdown_content += ( + "\n\n" + handle_func(block) + if markdown_content + else handle_func(block) ) + + markdown_info["page_index"] = self["page_index"] + markdown_info["input_path"] = self["input_path"] + markdown_info["markdown_texts"] = markdown_content + for img in self["imgs_in_doc"]: + markdown_info["markdown_images"][img["path"]] = img["img"] + + return markdown_info + + +class PaddleOCRVLPagesResult(PaddleOCRVLResult): + def save_to_img(self, *args, **kwargs): + logging.warning( + f"The result of multi-pages don't support to save as image format!" + ) + return None + + def save_to_html(self, *args, **kwargs): + logging.warning( + f"The result of multi-pages don't support to save as html format!" + ) + return None + + def save_to_xlsx(self, *args, **kwargs): + logging.warning( + f"The result of multi-pages don't support to save as xlsx format!" 
+ ) + return None + + def _to_markdown(self, pretty=True, show_formula_number=False) -> dict: + """ + Save the parsing result to a Markdown file. + + Args: + pretty (Optional[bool]): whether to pretty markdown by HTML, default by True. + show_formula_number (bool): whether to show formula numbers. + + Returns: + dict: Markdown information with text and images. + """ + + use_ocr_for_image_block = self["model_settings"].get( + "use_ocr_for_image_block", False + ) + use_seal_recognition = self["model_settings"].get("use_seal_recognition", False) + if isinstance(self["width"], list): + original_image_width = self["width"][0] + else: + original_image_width = self["width"] + + if pretty: + format_text_func = lambda block: format_centered_by_html( + format_text_plain_func(block) + ) + format_image_func = lambda block: format_centered_by_html( + format_image_scaled_by_html_func( + block, + original_image_width=original_image_width, + show_ocr_content=use_ocr_for_image_block, + ), + remove_symbol=not use_ocr_for_image_block, + ) + format_seal_func = lambda block: format_centered_by_html( + format_image_scaled_by_html_func( + block, + original_image_width=original_image_width, + show_ocr_content=use_seal_recognition, + ), + remove_symbol=False, ) else: format_text_func = lambda block: block.content - format_image_func = format_image_plain_func + format_image_func = lambda block: format_image_plain_func( + block, use_ocr_for_image_block + ) + format_seal_func = lambda block: format_image_plain_func( + block, use_seal_recognition + ) format_chart_func = ( format_chart2table_func @@ -454,8 +755,7 @@ def _to_markdown(self, pretty=True, show_formula_number=False) -> dict: else: format_table_func = lambda block: simplify_table_func("\n" + block.content) - format_formula_func = lambda block: block.content - format_seal_func = format_image_func + format_formula_func = lambda block: clean_latex_delimiters(block.content) handle_funcs_dict = build_handle_funcs_dict( 
text_func=format_text_func, @@ -465,6 +765,8 @@ def _to_markdown(self, pretty=True, show_formula_number=False) -> dict: formula_func=format_formula_func, seal_func=format_seal_func, ) + for label in self["model_settings"].get("markdown_ignore_labels", []): + handle_funcs_dict.pop(label, None) markdown_content = "" markdown_info = {} diff --git a/paddlex/inference/pipelines/paddleocr_vl/uilts.py b/paddlex/inference/pipelines/paddleocr_vl/uilts.py index 8cd650fd89..573dfaa152 100644 --- a/paddlex/inference/pipelines/paddleocr_vl/uilts.py +++ b/paddlex/inference/pipelines/paddleocr_vl/uilts.py @@ -31,8 +31,52 @@ ) +def make_valid(poly): + if not poly.is_valid: + poly = poly.buffer(0) + return poly + + +def calculate_polygon_overlap_ratio( + polygon1: List[Tuple[int, int]], + polygon2: List[Tuple[int, int]], + mode: str = "union", +) -> float: + """ + Calculate the overlap ratio between two polygons. + + Args: + polygon1 (List[Tuple[int, int]]): First polygon represented as a list of points. + polygon2 (List[Tuple[int, int]]): Second polygon represented as a list of points. + mode (str, optional): Overlap calculation mode. Defaults to "union". + + Returns: + float: Overlap ratio value between 0 and 1. 
+ """ + try: + from shapely.geometry import Polygon + except ImportError: + raise ImportError("Please install Shapely library.") + poly1 = Polygon(polygon1) + poly2 = Polygon(polygon2) + poly1 = make_valid(poly1) + poly2 = make_valid(poly2) + intersection = poly1.intersection(poly2).area + union = poly1.union(poly2).area + if mode == "union": + return intersection / union + elif mode == "small": + small_area = min(poly1.area, poly2.area) + return intersection / small_area + elif mode == "large": + large_area = max(poly1.area, poly2.area) + return intersection / large_area + else: + raise ValueError(f"Unknown mode: {mode}") + + def filter_overlap_boxes( - layout_det_res: Dict[str, List[Dict]] + layout_det_res: Dict[str, List[Dict]], layout_shape_mode: str ) -> Dict[str, List[Dict]]: """ Remove overlapping boxes from layout detection results based on a given overlap ratio. @@ -50,18 +94,41 @@ def filter_overlap_boxes( dropped_indexes = set() for i in range(len(boxes)): + x1, y1, x2, y2 = boxes[i]["coordinate"] + w, h = x2 - x1, y2 - y1 + if w < 6 or h < 6: + dropped_indexes.add(i) for j in range(i + 1, len(boxes)): if i in dropped_indexes or j in dropped_indexes: continue overlap_ratio = calculate_overlap_ratio( boxes[i]["coordinate"], boxes[j]["coordinate"], "small" ) + if ( + boxes[i]["label"] == "inline_formula" + or boxes[j]["label"] == "inline_formula" + ): + if overlap_ratio > 0.5: + if boxes[i]["label"] == "inline_formula": + dropped_indexes.add(i) + if boxes[j]["label"] == "inline_formula": + dropped_indexes.add(j) + continue if overlap_ratio > 0.7: + if layout_shape_mode != "rect" and "polygon_points" in boxes[i]: + poly_overlap_ratio = calculate_polygon_overlap_ratio( + boxes[i]["polygon_points"], boxes[j]["polygon_points"], "small" + ) + if poly_overlap_ratio < 0.7: + continue box_area_i = calculate_bbox_area(boxes[i]["coordinate"]) box_area_j = calculate_bbox_area(boxes[j]["coordinate"]) - if ( - boxes[i]["label"] == "image" or boxes[j]["label"] == 
"image" - ) and boxes[i]["label"] != boxes[j]["label"]: + if {boxes[i]["label"], boxes[j]["label"]} & { + "image", + "table", + "seal", + "chart", + } and boxes[i]["label"] != boxes[j]["label"]: continue if box_area_i >= box_area_j: dropped_indexes.add(j) @@ -120,7 +187,7 @@ def calc_merged_wh(images): return w, h -def merge_images(images, aligns="center"): +def merge_images(images, aligns="center", layout_shape_mode="auto"): """ Merge images vertically with given alignment. @@ -139,6 +206,7 @@ def merge_images(images, aligns="center"): aligns = [aligns] * (len(images) - 1) if len(aligns) != len(images) - 1: raise ValueError("The length of aligns must be len(images) - 1") + # TODO(changdazhou): need to support merge by polygon merged = to_pil_image(images[0]) for i in range(1, len(images)): img2 = to_pil_image(images[i]) @@ -160,7 +228,7 @@ def merge_images(images, aligns="center"): return to_np_array(merged) -def merge_blocks(blocks, non_merge_labels): +def merge_blocks(blocks, non_merge_labels, layout_shape_mode="auto"): """ Merge blocks based on alignment and overlap logic, except for those with labels in non_merge_labels. 
@@ -204,7 +272,10 @@ def overlapwith_other_box(block_idx, prev_idx, blocks): y2 = max(prev_bbox[3], block_bbox[3]) min_box = [x1, y1, x2, y2] for idx, other_block in enumerate(blocks): - if idx in [block_idx, prev_idx]: + if ( + idx in [block_idx, prev_idx] + or other_block["label"] not in non_merge_labels + ): continue other_bbox = other_block["box"] if calculate_overlap_ratio(min_box, other_bbox) > 0: @@ -259,15 +330,15 @@ def overlapwith_other_box(block_idx, prev_idx, blocks): current_indices.append(idx) current_aligns.append(align_mode) else: - merged_groups.append((current_indices, current_group, current_aligns)) + merged_groups.append((current_indices, current_aligns)) current_group = [block] current_indices = [idx] current_aligns = [] if current_group: - merged_groups.append((current_indices, current_group, current_aligns)) + merged_groups.append((current_indices, current_aligns)) group_ranges = [] - for group_indices, group, aligns in merged_groups: + for group_indices, aligns in merged_groups: start, end = min(group_indices), max(group_indices) group_ranges.append((start, end, group_indices, aligns)) @@ -276,9 +347,7 @@ def overlapwith_other_box(block_idx, prev_idx, blocks): idx = 0 while idx < len(blocks): group_found = False - for (start, end, group_indices, aligns), (g_indices, g_blocks, g_aligns) in zip( - group_ranges, merged_groups - ): + for start, end, group_indices, aligns in group_ranges: if idx == start and all(i not in used_indices for i in group_indices): group_found = True imgs = [blocks[i]["img"] for i in group_indices] @@ -293,11 +362,12 @@ def overlapwith_other_box(block_idx, prev_idx, blocks): result_blocks.append(block) used_indices.add(block_idx) else: - merged_img = merge_images(imgs, merge_aligns) + merged_img = merge_images(imgs, merge_aligns, layout_shape_mode) for j, block_idx in enumerate(group_indices): block = blocks[block_idx].copy() block["img"] = merged_img if j == 0 else None block["merge_aligns"] = merge_aligns if j == 0 
else None + block["group_id"] = group_indices[0] result_blocks.append(block) used_indices.add(block_idx) insert_list = [] @@ -434,12 +504,13 @@ def gen_random_map(num): ] token_str = "[F" + str(random_map[figure_id]) + "]" table_block_img = paint_token(table_block_img, draw_box, token_str) - token_map[token_str] = f'' + # token_map[token_str] = f'' + token_map[token_str] = figure["path"] drop_figures = [f["path"] for i, f in enumerate(figures) if i in drop_idxes] return table_block_img, token_map, drop_figures -def untokenize_figure_of_table(table_res_str, figure_token_map): +def untokenize_figure_of_table(table_res_str, figure_token_map, image_path_to_obj_map): """ Replace tokens in a string with their HTML image equivalents. @@ -454,7 +525,22 @@ def untokenize_figure_of_table(table_res_str, figure_token_map): def repl(match): token_id = match.group(1) token = f"[F{token_id}]" - return figure_token_map.get(token, match.group(0)) + img_path = figure_token_map.get(token, match.group(0)) + img_block = image_path_to_obj_map.get(img_path, None) + if img_block is None: + return match.group(0) + else: + img_tags = [] + img_tags.append( + 'Image'.format( + img_path.replace("-\n", "").replace("\n", " ") + ), + ) + image_info = "\n".join(img_tags) + if img_block.content != "": + ocr_content = img_block.content + image_info += "\n\n" + ocr_content + "\n\n" + return image_info pattern = r"\[F(\d+)\]" return re.sub(pattern, repl, table_res_str) @@ -876,7 +962,11 @@ def find_repeating_suffix( def truncate_repetitive_content( - content: str, line_threshold: int = 10, char_threshold: int = 10, min_len: int = 10 + content: str, + line_threshold: int = 10, + char_threshold: int = 10, + min_len: int = 10, + min_count: int = 3000, ) -> str: """ Detect and truncate character-level, phrase-level, or line-level repetition in content. 
@@ -890,6 +980,9 @@ def truncate_repetitive_content( Returns: Union[str, str]: (truncated_content, info_string) """ + if len(content) < min_count: + return content + stripped_content = content.strip() if not stripped_content: return content @@ -955,3 +1048,62 @@ def crop_margin(img): cropped = img[y : y + h, x : x + w] return cropped + + +ANNOT_TEXT_RE = re.compile(r"<\|TEXT_START\|>(.*?)<\|TEXT_END\|>", re.S) +LOC_BLOCK_RE = re.compile(r"<\|LOC_BEGIN\|>(.*?)<\|LOC_END\|>", re.S) +LOC_ITEM_RE = re.compile(r"<\|LOC_(\d+)\|>") +LOC_TOKEN_RE = re.compile(r"<\|LOC_(\d+)\|>") + + +def post_process_for_spotting( + input_str: str, w: int, h: int +) -> Tuple[str, Dict[str, List]]: + """ + Post-process the input string to extract text and location blocks. + """ + assert isinstance(input_str, str) + + # Extract text and location blocks + texts = ANNOT_TEXT_RE.findall(input_str) + loc_blocks = LOC_BLOCK_RE.findall(input_str) + + rec_polys = [] + rec_texts = [] + + # Process the extracted text and location blocks + n = min(len(texts), len(loc_blocks)) + for i in range(n): + txt = texts[i].strip() + loc_items = LOC_ITEM_RE.findall(loc_blocks[i]) + if len(loc_items) < 8: + continue + # Take the first 8 items (4 points) + vals = list(map(int, loc_items[:8])) + pts = [(vals[j], vals[j + 1]) for j in range(0, 8, 2)] + pts = [(p[0] / 1000.0 * w, p[1] / 1000.0 * h) for p in pts] + rec_polys.append(pts) + rec_texts.append(txt) + + # If no polys or texts are extracted, try an alternative parsing method + if not rec_polys or not rec_texts: + matches = list(LOC_TOKEN_RE.finditer(input_str)) + last_end = 0 + i = 0 + while i + 7 < len(matches): + group = matches[i : i + 8] + vals = [int(m.group(1)) for m in group] + pts = [(vals[j], vals[j + 1]) for j in range(0, 8, 2)] + pts = [(p[0] / 1000.0 * w, p[1] / 1000.0 * h) for p in pts] + text_span = input_str[last_end : group[0].start()] + txt = text_span.strip() + rec_texts.append(txt) + rec_polys.append(pts) + last_end = group[-1].end() + i 
+= 8 + + # Join the extracted texts into a single string separated by newlines + result_str = "\n\n".join(rec_texts) + spotting_res = {"rec_polys": rec_polys, "rec_texts": rec_texts} + + return result_str, spotting_res diff --git a/paddlex/inference/pipelines/pp_doctranslation/pipeline.py b/paddlex/inference/pipelines/pp_doctranslation/pipeline.py index 91774244e3..8b81b52095 100644 --- a/paddlex/inference/pipelines/pp_doctranslation/pipeline.py +++ b/paddlex/inference/pipelines/pp_doctranslation/pipeline.py @@ -488,7 +488,6 @@ def translate_func(text): "markdown_texts": target_language_texts, } ) - def concatenate_markdown_pages(self, markdown_list: list) -> tuple: """ Concatenate Markdown content from multiple pages into a single document. @@ -617,4 +616,4 @@ def concatenate_latex_pages(self, latex_info_list: list) -> tuple: "images": merged_images, "input_path": latex_info_list[0]["input_path"], } - ) + ) \ No newline at end of file diff --git a/paddlex/inference/pipelines/pp_doctranslation/utils.py b/paddlex/inference/pipelines/pp_doctranslation/utils.py index a66522f4f1..e280be49ca 100644 --- a/paddlex/inference/pipelines/pp_doctranslation/utils.py +++ b/paddlex/inference/pipelines/pp_doctranslation/utils.py @@ -173,9 +173,10 @@ def translate_html_block(html_block, chunk_size, translate_func, results): Returns: None """ - from bs4 import BeautifulSoup import copy + from bs4 import BeautifulSoup + # If the HTML is short and simple, translate directly if ( html_block.count("<") < 5 @@ -203,7 +204,7 @@ def translate_html_block(html_block, chunk_size, translate_func, results): td_batch_nodes.append(parent_td) td_batch_texts.append(td_text) td_seen.add(id(parent_td)) - + # Process / nodes in batches batch_size = chunk_size i = 0 @@ -212,12 +213,15 @@ def translate_html_block(html_block, chunk_size, translate_func, results): batch_nodes = [] batch_texts = [] current_length = 0 - while i < len(td_batch_nodes) and current_length + len(td_batch_texts[i]) <= batch_size: 
+ while ( + i < len(td_batch_nodes) + and current_length + len(td_batch_texts[i]) <= batch_size + ): batch_nodes.append(td_batch_nodes[i]) batch_texts.append(td_batch_texts[i]) current_length += len(td_batch_texts[i]) i += 1 - + # Translate the batch and reinsert translated content placeholder = "__TD__" batch_text = placeholder.join(batch_texts) @@ -230,7 +234,6 @@ def translate_html_block(html_block, chunk_size, translate_func, results): for child in frag.contents: td_node.append(copy.deepcopy(child)) - text_nodes = [] for node in soup.find_all(string=True, recursive=True): if not node.find_parent(["td", "th"]) and node.strip(): @@ -245,7 +248,9 @@ def translate_html_block(html_block, chunk_size, translate_func, results): while idx < total: node_text = text_nodes[idx].strip() if len(node_text) > chunk_size: - translated_text = split_text_recursive(node_text, chunk_size, translate_func) + translated_text = split_text_recursive( + node_text, chunk_size, translate_func + ) text_nodes[idx].replace_with(translated_text) idx += 1 continue diff --git a/paddlex/inference/pipelines/table_recognition/pipeline_v2.py b/paddlex/inference/pipelines/table_recognition/pipeline_v2.py index c7615d9a34..7da771a8ce 100644 --- a/paddlex/inference/pipelines/table_recognition/pipeline_v2.py +++ b/paddlex/inference/pipelines/table_recognition/pipeline_v2.py @@ -685,7 +685,12 @@ def split_box_by_cells(ocr_box, cell_indices, cells): split_texts = [] for box in split_boxes: x1, y1, x2, y2 = int(box[0]), int(box[1]), int(box[2]), int(box[3]) - if y2 - y1 > 1 and x2 - x1 > 1: + if ( + y2 - y1 > 1 + and x2 - x1 > 1 + and y1 < ori_img.shape[0] + and x1 < ori_img.shape[1] + ): ocr_result = list( self.general_ocr_pipeline.text_rec_model( ori_img[y1:y2, x1:x2, :] diff --git a/paddlex/inference/pipelines/text_to_speech/__init__.py b/paddlex/inference/pipelines/text_to_speech/__init__.py new file mode 100644 index 0000000000..5ea98f433c --- /dev/null +++ 
b/paddlex/inference/pipelines/text_to_speech/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .pipeline import TextToSpeechPipeline diff --git a/paddlex/inference/pipelines/text_to_speech/pipeline.py b/paddlex/inference/pipelines/text_to_speech/pipeline.py new file mode 100644 index 0000000000..7af9d83991 --- /dev/null +++ b/paddlex/inference/pipelines/text_to_speech/pipeline.py @@ -0,0 +1,164 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +from typing import Any, Dict, List, Optional, Union + +import numpy as np + +from ...models.text_to_pinyin.result import TextToPinyinResult +from ...models.text_to_speech_acoustic.result import Fastspeech2Result +from ...models.text_to_speech_vocoder.result import PwganResult +from ...utils.benchmark import benchmark +from ...utils.hpi import HPIConfig +from ...utils.pp_option import PaddlePredictorOption +from ..base import BasePipeline + + +@benchmark.time_methods +class TextToSpeechPipeline(BasePipeline): + """Text to Speech Pipeline Pipeline""" + + entities = "text_to_speech" + + def __init__( + self, + config: Dict, + device: str = None, + pp_option: PaddlePredictorOption = None, + use_hpip: bool = False, + hpi_config: Optional[Union[Dict[str, Any], HPIConfig]] = None, + ) -> None: + """ + Initializes the class with given configurations and options. + + Args: + config (Dict): Configuration dictionary containing model and other parameters. + device (str): The device to run the prediction on. Default is None. + pp_option (PaddlePredictorOption): Options for PaddlePaddle predictor. Default is None. + use_hpip (bool, optional): Whether to use the high-performance + inference plugin (HPIP) by default. Defaults to False. + hpi_config (Optional[Union[Dict[str, Any], HPIConfig]], optional): + The default high-performance inference configuration dictionary. + Defaults to None. 
+ """ + super().__init__( + device=device, pp_option=pp_option, use_hpip=use_hpip, hpi_config=hpi_config + ) + + text_to_pinyin_model_config = config["SubModules"]["TextToPinyin"] + self.text_to_pinyin_model = self.create_model(text_to_pinyin_model_config) + text_to_speech_acoustic_model_config = config["SubModules"][ + "TextToSpeechAcoustic" + ] + self.text_to_speech_acoustic_model = self.create_model( + text_to_speech_acoustic_model_config + ) + text_to_speech_vocoder_model_config = config["SubModules"][ + "TextToSpeechVocoder" + ] + self.text_to_speech_vocoder_model = self.create_model( + text_to_speech_vocoder_model_config + ) + + def predict( + self, input: Union[str, List[str], np.ndarray, List[np.ndarray]], **kwargs + ) -> PwganResult: + """Predicts speech recognition results for the given input. + + Args: + input (Union[str, list[str], np.ndarray, list[np.ndarray]]): The input audio or path. + **kwargs: Additional keyword arguments that can be passed to the function. + + Returns: + PwganResult: The predicted pwgan results, support str and json output. 
+ """ + sentences = [] + if isinstance(input, str): + if input.endswith(".txt"): + if not os.path.exists(input): + raise FileNotFoundError( + f"The specified text file does not exist: {input}" + ) + try: + with open(input, "r", encoding="utf-8") as f: + sentences = [line.strip() for line in f.readlines()] + except IOError as e: + raise IOError( + f"An error occurred while reading the file {input}: {e}" + ) + else: + sentences = [input] + elif isinstance(input, list): + for item in input: + if isinstance(item, str): + if item.endswith(".txt"): + if not os.path.exists(item): + raise FileNotFoundError( + f"The specified text file in the list does not exist: {item}" + ) + try: + with open(item, "r", encoding="utf-8") as f: + sentences.extend( + [line.strip() for line in f.readlines()] + ) + except IOError as e: + raise IOError( + f"An error occurred while reading the file {item}: {e}" + ) + else: + sentences.append(item) + else: + raise TypeError( + f"Unsupported input type: {type(input)}. Expected str, list, or np.ndarray." + ) + if not sentences: + raise ValueError( + "The input resulted in an empty list of sentences to process." + ) + + for sentence in sentences: + text_to_pinyin_res = [ + self.get_text_to_pinyin_result(sentence)["result"]["phone_ids"] + ] + text_to_speech_acoustic_res = [ + self.get_text_to_speech_acoustic_result(text_to_pinyin_res)["result"] + ] + yield from self.text_to_speech_vocoder_model(text_to_speech_acoustic_res) + + def get_text_to_pinyin_result( + self, input: Union[str, List[str]] + ) -> TextToPinyinResult: + """Get the result of text to pinyin conversion. + + Args: + input (Union[str, list[str]]): The input text or list of texts. + + Returns: + TextToPinyinResult: The result of text to pinyin conversion. + """ + return next(self.text_to_pinyin_model(input)) + + def get_text_to_speech_acoustic_result( + self, input: Union[str, List[str]] + ) -> Fastspeech2Result: + """Get the result of text to speech acoustic conversion. 
+ + Args: + input (Union[str, list[str]]): The input text or list of texts. + + Returns: + Fastspeech2Result: The result of text to speech acoustic conversion. + """ + return next(self.text_to_speech_acoustic_model(input)) diff --git a/paddlex/inference/serving/basic_serving/_app.py b/paddlex/inference/serving/basic_serving/_app.py index 0a28d029b1..6dcdfbc99e 100644 --- a/paddlex/inference/serving/basic_serving/_app.py +++ b/paddlex/inference/serving/basic_serving/_app.py @@ -30,6 +30,7 @@ TypeVar, ) +import pydantic from typing_extensions import ParamSpec, TypeGuard from ....utils import logging @@ -195,15 +196,55 @@ async def _check_health() -> AIStudioNoResultResponse: logId=generate_log_id(), errorCode=0, errorMsg="Healthy" ) + async def _try_get_log_id(request: fastapi.Request) -> Optional[str]: + try: + body = await request.json() + except Exception: + return None + if isinstance(body, dict) and "logId" in body: + return body["logId"] + return None + + # Circumvent FastAPI bug: https://github.com/fastapi/fastapi/discussions/11923 + # adapted from the Pydantic docs: + # https://docs.pydantic.dev/latest/errors/errors/#custom-errors + def _loc_to_dot_sep(loc: Tuple[str | int, ...]) -> str: + path = "" + for i, x in enumerate(loc): + if isinstance(x, str): + if i > 0: + path += "." 
+ path += x + elif isinstance(x, int): + path += f"[{x}]" + else: + raise TypeError("Unexpected type") + return path + + def _convert_validation_errors( + validation_error: pydantic.ValidationError | RequestValidationError, + ) -> List[Dict[str, Any]]: + converted_errors = [] + for error in validation_error.errors(): + converted_error = { + "type": error["type"], + "loc": _loc_to_dot_sep(error["loc"]), + "msg": error["msg"], + } + converted_errors.append(converted_error) + return converted_errors + @app.exception_handler(RequestValidationError) async def _validation_exception_handler( request: fastapi.Request, exc: RequestValidationError ) -> JSONResponse: + log_id = await _try_get_log_id(request) or generate_log_id() + errors = _convert_validation_errors(exc) json_compatible_data = jsonable_encoder( AIStudioNoResultResponse( - logId=generate_log_id(), + logId=log_id, errorCode=422, - errorMsg=json.dumps(exc.errors()), + errorMsg=json.dumps(errors), ) ) return JSONResponse(content=json_compatible_data, status_code=422) @@ -212,9 +253,10 @@ async def _validation_exception_handler( async def _http_exception_handler( request: fastapi.Request, exc: HTTPException ) -> JSONResponse: + log_id = await _try_get_log_id(request) or generate_log_id() json_compatible_data = jsonable_encoder( AIStudioNoResultResponse( - logId=generate_log_id(), errorCode=exc.status_code, errorMsg=exc.detail + logId=log_id, errorCode=exc.status_code, errorMsg=exc.detail ) ) return JSONResponse(content=json_compatible_data, status_code=exc.status_code) @@ -223,12 +265,14 @@ async def _http_exception_handler( async def _unexpected_exception_handler( request: fastapi.Request, exc: Exception ) -> JSONResponse: + # FIXME: Request body is not available here, and the log ID cannot be retrieved. + log_id = await _try_get_log_id(request) or generate_log_id() # XXX: The default server will duplicate the error message. Is it # necessary to log the exception info here? 
logging.exception("Unhandled exception") json_compatible_data = jsonable_encoder( AIStudioNoResultResponse( - logId=generate_log_id(), + logId=log_id, errorCode=500, errorMsg="Internal server error", ) diff --git a/paddlex/inference/serving/basic_serving/_pipeline_apps/__init__.py b/paddlex/inference/serving/basic_serving/_pipeline_apps/__init__.py index f940933524..dea25cbe38 100644 --- a/paddlex/inference/serving/basic_serving/_pipeline_apps/__init__.py +++ b/paddlex/inference/serving/basic_serving/_pipeline_apps/__init__.py @@ -17,26 +17,18 @@ from .....utils.deps import function_requires_deps, is_dep_available from ...infra.config import create_app_config +from ...infra.name_mappings import pipeline_name_to_mod_name if is_dep_available("fastapi"): from fastapi import FastAPI -def _pipeline_name_to_mod_name(pipeline_name: str) -> str: - if not pipeline_name: - raise ValueError("Empty pipeline name") - mod_name = pipeline_name.lower().replace("-", "_") - if mod_name[0].isdigit(): - return "m_" + mod_name - return mod_name - - # XXX: A dynamic approach is used here for writing fewer lines of code, at the # cost of sacrificing some benefits of type hints. 
@function_requires_deps("fastapi") def create_pipeline_app(pipeline: Any, pipeline_config: Dict[str, Any]) -> "FastAPI": pipeline_name = pipeline_config["pipeline_name"] - mod_name = _pipeline_name_to_mod_name(pipeline_name) + mod_name = pipeline_name_to_mod_name(pipeline_name) mod = importlib.import_module(f".{mod_name}", package=__package__) app_config = create_app_config(pipeline_config) app_creator = getattr(mod, "create_pipeline_app") diff --git a/paddlex/inference/serving/basic_serving/_pipeline_apps/_common/ocr.py b/paddlex/inference/serving/basic_serving/_pipeline_apps/_common/ocr.py index d176631f50..1d46eadb20 100644 --- a/paddlex/inference/serving/basic_serving/_pipeline_apps/_common/ocr.py +++ b/paddlex/inference/serving/basic_serving/_pipeline_apps/_common/ocr.py @@ -17,6 +17,7 @@ import numpy as np from typing_extensions import Literal +from ......utils import logging from ......utils.deps import function_requires_deps, is_dep_available from ....infra import utils as serving_utils from ....infra.models import ImageInfo, PDFInfo @@ -79,17 +80,19 @@ async def get_images( request: BaseInferRequest, app_context: AppContext ) -> Tuple[List[np.ndarray], Union[ImageInfo, PDFInfo]]: file_type = get_file_type(request) - # XXX: Should we return 422? 
- - file_bytes = await serving_utils.get_raw_bytes_async( - request.file, - app_context.aiohttp_session, - ) - images, data_info = await serving_utils.call_async( - serving_utils.file_to_images, - file_bytes, - file_type, - max_num_imgs=app_context.extra["max_num_input_imgs"], - ) + try: + file_bytes = await serving_utils.get_raw_bytes_async( + request.file, + app_context.aiohttp_session, + ) + images, data_info = await serving_utils.call_async( + serving_utils.file_to_images, + file_bytes, + file_type, + max_num_imgs=app_context.extra["max_num_input_imgs"], + ) + except Exception as e: + logging.error("Failed to read input file: %s", e) + raise HTTPException(status_code=422, detail="Invalid input file") from e return images, data_info diff --git a/paddlex/inference/serving/basic_serving/_pipeline_apps/object_detection.py b/paddlex/inference/serving/basic_serving/_pipeline_apps/object_detection.py index 73c1ccffba..8aa40d4ae4 100644 --- a/paddlex/inference/serving/basic_serving/_pipeline_apps/object_detection.py +++ b/paddlex/inference/serving/basic_serving/_pipeline_apps/object_detection.py @@ -72,7 +72,7 @@ async def _infer(request: InferRequest) -> AIStudioResultResponse[InferResult]: output_image_base64 = None return AIStudioResultResponse[InferResult]( - logId=serving_utils.generate_log_id(), + logId=request.logId if request.logId else serving_utils.generate_log_id(), result=InferResult(detectedObjects=objects, image=output_image_base64), ) diff --git a/paddlex/inference/serving/basic_serving/_pipeline_apps/ocr.py b/paddlex/inference/serving/basic_serving/_pipeline_apps/ocr.py index ef18f5b012..4020a4f48b 100644 --- a/paddlex/inference/serving/basic_serving/_pipeline_apps/ocr.py +++ b/paddlex/inference/serving/basic_serving/_pipeline_apps/ocr.py @@ -43,7 +43,7 @@ def create_pipeline_app(pipeline: Any, app_config: AppConfig) -> "FastAPI": async def _infer(request: InferRequest) -> AIStudioResultResponse[InferResult]: pipeline = ctx.pipeline - log_id = 
serving_utils.generate_log_id() + log_id = request.logId if request.logId else serving_utils.generate_log_id() visualize_enabled = ( request.visualize if request.visualize is not None else ctx.config.visualize ) diff --git a/paddlex/inference/serving/basic_serving/_pipeline_apps/paddleocr_vl.py b/paddlex/inference/serving/basic_serving/_pipeline_apps/paddleocr_vl.py index 05b4f0f5ee..1eaa219d86 100644 --- a/paddlex/inference/serving/basic_serving/_pipeline_apps/paddleocr_vl.py +++ b/paddlex/inference/serving/basic_serving/_pipeline_apps/paddleocr_vl.py @@ -18,7 +18,14 @@ from ...infra import utils as serving_utils from ...infra.config import AppConfig from ...infra.models import AIStudioResultResponse -from ...schemas.paddleocr_vl import INFER_ENDPOINT, InferRequest, InferResult +from ...schemas.paddleocr_vl import ( + INFER_ENDPOINT, + RESTRUCTURE_PAGES_ENDPOINT, + InferRequest, + InferResult, + RestructurePagesRequest, + RestructurePagesResult, +) from .._app import create_app, primary_operation from ._common import common from ._common import ocr as ocr_common @@ -45,7 +52,7 @@ async def _infer( ) -> AIStudioResultResponse[InferResult]: pipeline = ctx.pipeline - log_id = serving_utils.generate_log_id() + log_id = request.logId if request.logId else serving_utils.generate_log_id() visualize_enabled = ( request.visualize if request.visualize is not None else ctx.config.visualize ) @@ -57,10 +64,13 @@ async def _infer( use_doc_unwarping=request.useDocUnwarping, use_layout_detection=request.useLayoutDetection, use_chart_recognition=request.useChartRecognition, + use_seal_recognition=request.useSealRecognition, + use_ocr_for_image_block=request.useOcrForImageBlock, layout_threshold=request.layoutThreshold, layout_nms=request.layoutNms, layout_unclip_ratio=request.layoutUnclipRatio, layout_merge_bboxes_mode=request.layoutMergeBboxesMode, + layout_shape_mode=request.layoutShapeMode, prompt_label=request.promptLabel, format_block_content=request.formatBlockContent, 
repetition_penalty=request.repetitionPenalty, @@ -68,8 +78,23 @@ async def _infer( top_p=request.topP, min_pixels=request.minPixels, max_pixels=request.maxPixels, + max_new_tokens=request.maxNewTokens, + merge_layout_blocks=request.mergeLayoutBlocks, + markdown_ignore_labels=request.markdownIgnoreLabels, + vlm_extra_args=request.vlmExtraArgs, ) + orig_result = result + if request.restructurePages: + result = await serving_utils.call_async( + pipeline.pipeline.restructure_pages, + result, + merge_tables=request.mergeTables, + relevel_titles=request.relevelTitles, + concatenate_pages=False, + ) + result = list(result) + layout_parsing_results: List[Dict[str, Any]] = [] for i, (img, item) in enumerate(zip(images, result)): pruned_res = common.prune_result(item.json["res"]) @@ -128,4 +153,78 @@ async def _infer( ), ) + @primary_operation( + app, + RESTRUCTURE_PAGES_ENDPOINT, + "restructurePages", + ) + async def _restructure_pages( + request: RestructurePagesRequest, + ) -> AIStudioResultResponse[RestructurePagesResult]: + def _to_original_result(pruned_res, page_index): + res = {**pruned_res, "input_path": "", "page_index": page_index} + orig_res = {"res": res} + return orig_res + + pipeline = ctx.pipeline + + log_id = request.logId if request.logId else serving_utils.generate_log_id() + + original_results = [] + markdown_images = {} + for i, page in enumerate(request.pages): + orig_res = _to_original_result(page.prunedResult, i) + original_results.append(orig_res) + if request.concatenatePages: + markdown_images.update(page.markdownImages) + + restructured_results = await serving_utils.call_async( + pipeline.pipeline.restructure_pages, + original_results, + merge_tables=request.mergeTables, + relevel_titles=request.relevelTitles, + concatenate_pages=request.concatenatePages, + ) + restructured_results = list(restructured_results) + + layout_parsing_results = [] + if request.concatenatePages: + layout_parsing_result = {} + layout_parsing_result["prunedResult"] = 
common.prune_result( + restructured_results[0].json["res"] + ) + # XXX + md_data = restructured_results[0]._to_markdown( + pretty=request.prettifyMarkdown, + show_formula_number=request.showFormulaNumber, + ) + layout_parsing_result["markdown"] = dict( + text=md_data["markdown_texts"], + images=markdown_images, + ) + layout_parsing_results.append(layout_parsing_result) + else: + for new_res, old_page in zip(restructured_results, request.pages): + layout_parsing_result = {} + layout_parsing_result["prunedResult"] = common.prune_result( + new_res.json["res"] + ) + # XXX + md_data = new_res._to_markdown( + pretty=request.prettifyMarkdown, + show_formula_number=request.showFormulaNumber, + ) + layout_parsing_result["markdown"] = dict( + text=md_data["markdown_texts"], + images=old_page.markdownImages, + ) + layout_parsing_results.append(layout_parsing_result) + + return AIStudioResultResponse[RestructurePagesResult]( + logId=log_id, + result=RestructurePagesResult( + layoutParsingResults=layout_parsing_results, + ), + ) + return app diff --git a/paddlex/inference/serving/basic_serving/_pipeline_apps/pp_structurev3.py b/paddlex/inference/serving/basic_serving/_pipeline_apps/pp_structurev3.py index 3b3a9a660f..79e1e17ee6 100644 --- a/paddlex/inference/serving/basic_serving/_pipeline_apps/pp_structurev3.py +++ b/paddlex/inference/serving/basic_serving/_pipeline_apps/pp_structurev3.py @@ -45,7 +45,7 @@ async def _infer( ) -> AIStudioResultResponse[InferResult]: pipeline = ctx.pipeline - log_id = serving_utils.generate_log_id() + log_id = request.logId if request.logId else serving_utils.generate_log_id() visualize_enabled = ( request.visualize if request.visualize is not None else ctx.config.visualize ) @@ -84,12 +84,17 @@ async def _infer( use_ocr_results_with_table_cells=request.useOcrResultsWithTableCells, use_e2e_wired_table_rec_model=request.useE2eWiredTableRecModel, use_e2e_wireless_table_rec_model=request.useE2eWirelessTableRecModel, + 
markdown_ignore_labels=request.markdownIgnoreLabels, ) layout_parsing_results: List[Dict[str, Any]] = [] for i, (img, item) in enumerate(zip(images, result)): pruned_res = common.prune_result(item.json["res"]) - md_data = item.markdown + # XXX + md_data = item._to_markdown( + pretty=request.prettifyMarkdown, + show_formula_number=request.showFormulaNumber, + ) md_text = md_data["markdown_texts"] md_imgs = await serving_utils.call_async( common.postprocess_images, diff --git a/paddlex/inference/serving/infra/name_mappings.py b/paddlex/inference/serving/infra/name_mappings.py new file mode 100644 index 0000000000..b8a6210968 --- /dev/null +++ b/paddlex/inference/serving/infra/name_mappings.py @@ -0,0 +1,28 @@ +# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +PIPELINE_APP_ROUTER = { + "PaddleOCR-VL-1.5": "PaddleOCR-VL", +} + + +def pipeline_name_to_mod_name(pipeline_name: str) -> str: + if not pipeline_name: + raise ValueError("Empty pipeline name") + if pipeline_name in PIPELINE_APP_ROUTER: + pipeline_name = PIPELINE_APP_ROUTER[pipeline_name] + mod_name = pipeline_name.lower().replace("-", "_").replace(".", "") + if mod_name[0].isdigit(): + return "m_" + mod_name + return mod_name diff --git a/paddlex/inference/serving/infra/utils.py b/paddlex/inference/serving/infra/utils.py index ab1d0def71..b6b0211f98 100644 --- a/paddlex/inference/serving/infra/utils.py +++ b/paddlex/inference/serving/infra/utils.py @@ -18,7 +18,6 @@ import mimetypes import re import tempfile -import threading import uuid from functools import partial from typing import Awaitable, Callable, List, Optional, Tuple, TypeVar, Union, overload @@ -31,6 +30,7 @@ from typing_extensions import Literal, ParamSpec, TypeAlias, assert_never from ....utils.deps import function_requires_deps, is_dep_available +from ...utils.pdfium_lock import pdfium_lock from .models import ImageInfo, PDFInfo, PDFPageInfo if is_dep_available("aiohttp"): @@ -177,20 +177,19 @@ def base64_encode(data: bytes) -> str: return base64.b64encode(data).decode("ascii") -_lock = threading.Lock() - - @function_requires_deps("pypdfium2", "opencv-contrib-python") def read_pdf( bytes_: bytes, max_num_imgs: Optional[int] = None ) -> Tuple[List[np.ndarray], PDFInfo]: images: List[np.ndarray] = [] page_info_list: List[PDFPageInfo] = [] - with _lock: + with pdfium_lock: doc = pdfium.PdfDocument(bytes_) + doc.init_forms() try: for page in doc: if max_num_imgs is not None and len(images) >= max_num_imgs: + page.close() break # TODO: Do not always use zoom=2.0 zoom = 2.0 @@ -202,6 +201,7 @@ def read_pdf( height=image.shape[0], ) page_info_list.append(page_info) + page.close() finally: doc.close() pdf_info = PDFInfo( diff --git a/paddlex/inference/serving/schemas/__init__.py 
b/paddlex/inference/serving/schemas/__init__.py index 27de2e2d19..6537be3995 100644 --- a/paddlex/inference/serving/schemas/__init__.py +++ b/paddlex/inference/serving/schemas/__init__.py @@ -11,3 +11,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + +import importlib +from types import ModuleType + +from ..infra.name_mappings import pipeline_name_to_mod_name + + +def get_pipeline_schema_mod(pipeline_name: str) -> ModuleType: + mod_name = pipeline_name_to_mod_name(pipeline_name) + mod = importlib.import_module(f".{mod_name}", package=__package__) + return mod diff --git a/paddlex/inference/serving/schemas/object_detection.py b/paddlex/inference/serving/schemas/object_detection.py index a5a93ef4bb..3d431cad36 100644 --- a/paddlex/inference/serving/schemas/object_detection.py +++ b/paddlex/inference/serving/schemas/object_detection.py @@ -34,6 +34,7 @@ class InferRequest(BaseModel): image: str threshold: Optional[Union[float, Dict[int, float]]] = None visualize: Optional[bool] = None + logId: Optional[str] = None class DetectedObject(BaseModel): diff --git a/paddlex/inference/serving/schemas/ocr.py b/paddlex/inference/serving/schemas/ocr.py index 3d0e34c108..143ee49b67 100644 --- a/paddlex/inference/serving/schemas/ocr.py +++ b/paddlex/inference/serving/schemas/ocr.py @@ -43,6 +43,7 @@ class InferRequest(ocr.BaseInferRequest): textRecScoreThresh: Optional[float] = None returnWordBox: Optional[bool] = None visualize: Optional[bool] = None + logId: Optional[str] = None class OCRResult(BaseModel): diff --git a/paddlex/inference/serving/schemas/paddleocr_vl.py b/paddlex/inference/serving/schemas/paddleocr_vl.py index 0839fd3094..9f439c2148 100644 --- a/paddlex/inference/serving/schemas/paddleocr_vl.py +++ b/paddlex/inference/serving/schemas/paddleocr_vl.py @@ -14,7 +14,8 @@ from typing import Dict, Final, List, Optional, Tuple, Union 
-from pydantic import BaseModel +from pydantic import BaseModel, Field, field_validator, model_validator +from typing_extensions import Annotated, Literal from ..infra.models import DataInfo, PrimaryOperations from .shared import ocr @@ -24,10 +25,16 @@ "InferRequest", "LayoutParsingResult", "InferResult", + "RESTRUCTURE_PAGES_ENDPOINT", + "RestructurePagesRequest", + "RestructurePagesResult", "PRIMARY_OPERATIONS", + "MarkdownData", + "Page", ] INFER_ENDPOINT: Final[str] = "/layout-parsing" +RESTRUCTURE_PAGES_ENDPOINT: Final[str] = "/restructure-pages" class InferRequest(ocr.BaseInferRequest): @@ -35,20 +42,168 @@ class InferRequest(ocr.BaseInferRequest): useDocUnwarping: Optional[bool] = None useLayoutDetection: Optional[bool] = None useChartRecognition: Optional[bool] = None + useSealRecognition: Optional[bool] = None + useOcrForImageBlock: Optional[bool] = None layoutThreshold: Optional[Union[float, dict]] = None layoutNms: Optional[bool] = None layoutUnclipRatio: Optional[Union[float, Tuple[float, float], dict]] = None layoutMergeBboxesMode: Optional[Union[str, dict]] = None + layoutShapeMode: Literal["rect", "quad", "poly", "auto"] = "auto" promptLabel: Optional[str] = None formatBlockContent: Optional[bool] = None repetitionPenalty: Optional[float] = None temperature: Optional[float] = None topP: Optional[float] = None - minPixels: Optional[int] = None - maxPixels: Optional[int] = None + minPixels: Optional[Annotated[int, Field(gt=0)]] = None + maxPixels: Optional[Annotated[int, Field(gt=0)]] = None + maxNewTokens: Optional[Annotated[int, Field(gt=0)]] = None + mergeLayoutBlocks: Optional[bool] = None + markdownIgnoreLabels: Optional[List[str]] = None + vlmExtraArgs: Optional[dict] = None prettifyMarkdown: bool = True showFormulaNumber: bool = False + restructurePages: bool = False + mergeTables: bool = True + relevelTitles: bool = True visualize: Optional[bool] = None + logId: Optional[str] = None + + @field_validator("topP") + @classmethod + def 
validate_top_p(cls, v): + if v is not None and not (0 < v <= 1): + raise ValueError(f"`topP` must be > 0 and ≤ 1; got: {v}") + return v + + @field_validator("temperature") + @classmethod + def validate_temperature(cls, v): + if v is not None and v < 0: + raise ValueError(f"`temperature` must be ≥ 0; got: {v}") + return v + + @field_validator("repetitionPenalty") + @classmethod + def validate_repetition_penalty(cls, v): + if v is not None and v <= 0: + raise ValueError(f"`repetitionPenalty` must be > 0; got: {v}") + return v + + @field_validator("promptLabel") + @classmethod + def validate_prompt_label(cls, v): + _ALLOWED_PROMPT_LABELS = ( + "ocr", + "formula", + "table", + "chart", + "seal", + "spotting", + ) + if v is not None and v not in _ALLOWED_PROMPT_LABELS: + valid_values = ", ".join(_ALLOWED_PROMPT_LABELS) + raise ValueError(f"`promptLabel` must be one of: {valid_values}; got: {v}") + return v + + @field_validator("layoutMergeBboxesMode") + @classmethod + def validate_merge_bboxes_mode(cls, v): + _ALLOWED_MERGE_BBOXES_MODES = ("large", "small", "union") + + if v is None: + return v + + if isinstance(v, str): + if v not in _ALLOWED_MERGE_BBOXES_MODES: + raise ValueError( + f"`layoutMergeBboxesMode` must be one of: {', '.join(_ALLOWED_MERGE_BBOXES_MODES)}; got: {v}" + ) + elif isinstance(v, dict): + for key, value in v.items(): + if not isinstance(value, str): + raise ValueError( + f"`layoutMergeBboxesMode` dictionary values must be strings; got: {type(value).__name__}" + ) + if value not in _ALLOWED_MERGE_BBOXES_MODES: + raise ValueError( + f"`layoutMergeBboxesMode` dictionary value must be one of: {', '.join(_ALLOWED_MERGE_BBOXES_MODES)}; got: {value}" + ) + else: + raise ValueError("`layoutMergeBboxesMode` must be a string or dictionary") + + return v + + @field_validator("layoutUnclipRatio") + @classmethod + def validate_unclip_ratio(cls, v): + if v is None: + return v + + def _validate_ratio_value(value, context=""): + if isinstance(value, (int, float)): 
+ if value <= 0: + raise ValueError( + f"`layoutUnclipRatio`{context} must be > 0; got: {value}" + ) + elif isinstance(value, list): + if len(value) != 2: + raise ValueError( + f"`layoutUnclipRatio`{context} must be two numbers; got: {len(value)} values" + ) + for i, item in enumerate(value): + if not isinstance(item, (int, float)): + raise ValueError( + f"`layoutUnclipRatio`{context} values must be numbers; got: {type(item).__name__} at position {i}" + ) + if item <= 0: + raise ValueError( + f"`layoutUnclipRatio`{context} values must be > 0; got: {item} at position {i}" + ) + else: + raise ValueError( + f"`layoutUnclipRatio`{context} must be a number or two numbers; got: {type(value).__name__}" + ) + + if isinstance(v, dict): + for key, value in v.items(): + _validate_ratio_value(value, f" value for key '{key}'") + else: + _validate_ratio_value(v) + + return v + + @field_validator("layoutThreshold") + @classmethod + def validate_threshold(cls, v): + if v is None: + return v + + def _validate_threshold_value(value, context=""): + if not isinstance(value, (int, float)): + raise ValueError( + f"`layoutThreshold`{context} must be a number; got: {type(value).__name__}" + ) + if value < 0 or value > 1: + raise ValueError( + f"`layoutThreshold`{context} must be between 0 and 1 inclusive; got: {value}" + ) + + if isinstance(v, dict): + for key, value in v.items(): + _validate_threshold_value(value, f" value for key '{key}'") + else: + _validate_threshold_value(v) + + return v + + @model_validator(mode="after") + def validate_pixel_range(self): + if self.minPixels is not None and self.maxPixels is not None: + if self.minPixels > self.maxPixels: + raise ValueError( + f"`minPixels` ({self.minPixels}) cannot be greater than `maxPixels` ({self.maxPixels})" + ) + return self class MarkdownData(BaseModel): @@ -68,6 +223,30 @@ class InferResult(BaseModel): dataInfo: DataInfo +class Page(BaseModel): + prunedResult: dict + markdownImages: Optional[Dict[str, str]] = None + + +class 
RestructurePagesRequest(BaseModel): + pages: List[Page] + mergeTables: bool = True + relevelTitles: bool = True + concatenatePages: bool = False + prettifyMarkdown: bool = True + showFormulaNumber: bool = False + logId: Optional[str] = None + + +class RestructurePagesResult(BaseModel): + layoutParsingResults: List[LayoutParsingResult] + + PRIMARY_OPERATIONS: Final[PrimaryOperations] = { "infer": (INFER_ENDPOINT, InferRequest, InferResult), + "restructure-pages": ( + RESTRUCTURE_PAGES_ENDPOINT, + RestructurePagesRequest, + RestructurePagesResult, + ), } diff --git a/paddlex/inference/serving/schemas/pp_structurev3.py b/paddlex/inference/serving/schemas/pp_structurev3.py index 9bf1134b49..b1dd9b08bb 100644 --- a/paddlex/inference/serving/schemas/pp_structurev3.py +++ b/paddlex/inference/serving/schemas/pp_structurev3.py @@ -62,7 +62,11 @@ class InferRequest(ocr.BaseInferRequest): useOcrResultsWithTableCells: bool = True useE2eWiredTableRecModel: bool = False useE2eWirelessTableRecModel: bool = True + markdownIgnoreLabels: Optional[List[str]] = None + prettifyMarkdown: bool = True + showFormulaNumber: bool = False visualize: Optional[bool] = None + logId: Optional[str] = None class LayoutParsingResult(BaseModel): diff --git a/paddlex/inference/utils/benchmark.py b/paddlex/inference/utils/benchmark.py index 55f1eefc37..c1afa89b41 100644 --- a/paddlex/inference/utils/benchmark.py +++ b/paddlex/inference/utils/benchmark.py @@ -627,6 +627,10 @@ def set_inference_operations(val): _inference_operations = val +def add_inference_operations(*ops): + _inference_operations.extend(ops) + + if INFER_BENCHMARK or PIPELINE_BENCHMARK: benchmark = Benchmark(enabled=True) else: diff --git a/paddlex/inference/utils/hpi.py b/paddlex/inference/utils/hpi.py index 613fe4a293..ec8e958654 100644 --- a/paddlex/inference/utils/hpi.py +++ b/paddlex/inference/utils/hpi.py @@ -24,6 +24,7 @@ from pydantic import BaseModel, Field from typing_extensions import Annotated, TypeAlias +from ...utils 
import logging from ...utils.deps import function_requires_deps, is_paddle2onnx_plugin_available from ...utils.env import get_paddle_cuda_version, get_paddle_version from ...utils.flags import USE_PIR_TRT @@ -156,6 +157,14 @@ def suggest_inference_backend_and_config( return None, f"Inference backend {repr(hpi_config.backend)} is unavailable." paddle_version = get_paddle_version() + + if paddle_version[:3] >= (3, 1, 0): + logging.debug( + "Paddle version %s is not supported yet. The prior knowledge of Paddle 3.1.1 will be used.", + paddle_version, + ) + paddle_version = (3, 1, 1, None) + if (3, 0) <= paddle_version[:2] <= (3, 1) and paddle_version[3] is None: if paddle_version[2] == 0: paddle_version = f"paddle{paddle_version[0]}{paddle_version[1]}" diff --git a/paddlex/inference/utils/hpi_model_info_collection.json b/paddlex/inference/utils/hpi_model_info_collection.json index 78835aa809..a9241ea6bf 100644 --- a/paddlex/inference/utils/hpi_model_info_collection.json +++ b/paddlex/inference/utils/hpi_model_info_collection.json @@ -5135,7 +5135,6 @@ "onnxruntime" ], "PP-OCRv4_server_det": [ - "tensorrt", "onnxruntime", "paddle" ], @@ -5822,7 +5821,6 @@ "paddle" ], "PP-OCRv5_server_det": [ - "tensorrt", "paddle" ], "PP-OCRv5_mobile_det": [ @@ -6563,7 +6561,6 @@ "onnxruntime" ], "PP-OCRv4_server_det": [ - "tensorrt", "onnxruntime", "paddle" ], @@ -7258,7 +7255,6 @@ "paddle" ], "PP-OCRv5_server_det": [ - "tensorrt", "onnxruntime", "paddle" ], @@ -8005,7 +8001,6 @@ "onnxruntime" ], "PP-OCRv4_server_det": [ - "tensorrt", "onnxruntime", "paddle" ], @@ -8699,7 +8694,6 @@ "paddle" ], "PP-OCRv5_server_det": [ - "tensorrt", "onnxruntime", "paddle" ], @@ -11850,6 +11844,10 @@ "onnxruntime", "paddle" ], + "eslav_PP-OCRv5_mobile_rec": [ + "onnxruntime", + "paddle" + ], "arabic_PP-OCRv5_mobile_rec": [ "onnxruntime", "paddle" diff --git a/paddlex/inference/utils/io/__init__.py b/paddlex/inference/utils/io/__init__.py index 8b66633b00..2fa44ae29f 100644 --- 
a/paddlex/inference/utils/io/__init__.py +++ b/paddlex/inference/utils/io/__init__.py @@ -24,6 +24,7 @@ YAMLReader, ) from .writers import ( + AudioWriter, CSVWriter, HtmlWriter, ImageWriter, diff --git a/paddlex/inference/utils/io/readers.py b/paddlex/inference/utils/io/readers.py index ec22d11d22..322fbfc233 100644 --- a/paddlex/inference/utils/io/readers.py +++ b/paddlex/inference/utils/io/readers.py @@ -28,6 +28,7 @@ import cv2 if is_dep_available("pypdfium2"): import pypdfium2 as pdfium + from ..pdfium_lock import pdfium_lock if is_dep_available("soundfile"): import soundfile @@ -102,7 +103,10 @@ def __init__(self, backend="pypdfium2", **bk_args): super().__init__(backend, **bk_args) def read(self, in_path): - yield from self._backend.read_file(str(in_path)) + yield from self._backend.read_file(in_path) + + def load(self, in_path): + return self._backend.load_file(str(in_path)) def _init_backend(self, bk_type, bk_args): return PDFReaderBackend(**bk_args) @@ -291,13 +295,26 @@ def __init__(self, rotate=0, zoom=2.0): self._rotation = rotate self._scale = zoom + def load_file(self, in_path): + """load pdf file""" + with pdfium_lock: + doc = pdfium.PdfDocument(in_path) + doc.init_forms() + return doc + def read_file(self, in_path): - doc = pdfium.PdfDocument(in_path) - try: - for page in doc: - yield page.render(scale=self._scale, rotation=self._rotation).to_numpy() - finally: - doc.close() + with pdfium_lock: + if isinstance(in_path, pdfium.PdfDocument): + doc = in_path + else: + doc = self.load_file(str(in_path)) + try: + for page in doc: + image = page.render(scale=self._scale, rotation=self._rotation).to_numpy() + page.close() + yield image + finally: + doc.close() class TXTReaderBackend(_BaseReaderBackend): diff --git a/paddlex/inference/utils/io/writers.py b/paddlex/inference/utils/io/writers.py index 865e87ad56..824aea344c 100644 --- a/paddlex/inference/utils/io/writers.py +++ b/paddlex/inference/utils/io/writers.py @@ -28,6 +28,9 @@ if 
is_dep_available("opencv-contrib-python"): import cv2 +if is_dep_available("soundfile"): + import soundfile as sf + __all__ = [ "WriterType", @@ -56,6 +59,7 @@ class WriterType(enum.Enum): YAML = 8 MARKDOWN = 9 TXT = 10 + AUDIO = 11 class _BaseWriter(object): @@ -261,6 +265,29 @@ def get_type(self): return WriterType.MARKDOWN +class AudioWriter(_BaseWriter): + """AudioWriter""" + + def __init__(self, sample_rate=24000, backend="wav", **bk_args): + super().__init__(sample_rate=sample_rate, backend=backend, **bk_args) + self.sample_rate = sample_rate + + def write(self, out_path, obj): + """write""" + return self._backend.write_obj(str(out_path), obj) + + def _init_backend(self, bk_type, bk_args): + """init backend""" + if bk_type == "wav": + return AudioWriterBackend(**bk_args) + else: + raise ValueError("Unsupported backend type") + + def get_type(self): + """get type""" + return WriterType.AUDIO + + class _BaseWriterBackend(object): """_BaseWriterBackend""" @@ -458,3 +485,16 @@ def _write_obj(self, out_path, obj): """write markdown obj""" with open(out_path, mode="w", encoding="utf-8", errors="replace") as f: f.write(obj) + + +class AudioWriterBackend(_BaseWriterBackend): + """AudioWriterBackend""" + + def __init__(self, sample_rate=24000): + super().__init__() + self.sample_rate = sample_rate + + def _write_obj(self, out_path, obj): + """write audio obj""" + audio = obj["result"] + sf.write(out_path, audio, self.sample_rate) diff --git a/paddlex/inference/utils/misc.py b/paddlex/inference/utils/misc.py index bb6fe29ad2..4451098392 100644 --- a/paddlex/inference/utils/misc.py +++ b/paddlex/inference/utils/misc.py @@ -31,4 +31,23 @@ def is_bfloat16_available(device): device_type, _ = parse_device(device) return ( "npu" in get_device_type() or paddle.amp.is_bfloat16_supported() - ) and device_type in ("gpu", "npu", "xpu", "mlu") + ) and device_type in ("gpu", "npu", "xpu", "mlu", "metax_gpu", "iluvatar_gpu") + + +def is_float16_available(device): + import paddle.amp 
+ + if device is None: + device = get_default_device() + device_type, _ = parse_device(device) + return ( + "npu" in get_device_type() or paddle.amp.is_float16_supported() + ) and device_type in ( + "gpu", + "npu", + "xpu", + "mlu", + "dcu", + "metax_gpu", + "iluvatar_gpu", + ) diff --git a/paddlex/inference/utils/mkldnn_blocklist.py b/paddlex/inference/utils/mkldnn_blocklist.py index 972a98f961..dfe4c8d72d 100644 --- a/paddlex/inference/utils/mkldnn_blocklist.py +++ b/paddlex/inference/utils/mkldnn_blocklist.py @@ -56,4 +56,6 @@ "PP-FormulaNet_plus-L", "PP-FormulaNet_plus-M", "PP-FormulaNet_plus-S", + "PP-DocLayoutV2", + "PP-DocLayoutV3", ] diff --git a/paddlex/inference/utils/model_paths.py b/paddlex/inference/utils/model_paths.py index aad436e0eb..891c563024 100644 --- a/paddlex/inference/utils/model_paths.py +++ b/paddlex/inference/utils/model_paths.py @@ -23,6 +23,8 @@ class ModelPaths(TypedDict, total=False): paddle: Tuple[Path, Path] onnx: Path om: Path + paddle_dyn: Path + safetensors: Path def get_model_paths( @@ -45,4 +47,8 @@ def get_model_paths( model_paths["onnx"] = model_dir / f"{model_file_prefix}.onnx" if (model_dir / f"{model_file_prefix}.om").exists(): model_paths["om"] = model_dir / f"{model_file_prefix}.om" + if (model_dir / "model_state.pdparams").exists(): + model_paths["paddle_dyn"] = model_dir / "model_state.pdparams" + if (model_dir / "model.safetensors").exists(): + model_paths["safetensors"] = model_dir / "model.safetensors" return model_paths diff --git a/paddlex/inference/utils/official_models.py b/paddlex/inference/utils/official_models.py index 98447af01d..efb0d3c29d 100644 --- a/paddlex/inference/utils/official_models.py +++ b/paddlex/inference/utils/official_models.py @@ -31,7 +31,7 @@ from ...utils import logging from ...utils.cache import CACHE_DIR from ...utils.download import download_and_extract -from ...utils.flags import MODEL_SOURCE +from ...utils.flags import DISABLE_MODEL_SOURCE_CHECK, MODEL_SOURCE, HUGGING_FACE_ENDPOINT 
ALL_MODELS = [ "ResNet18", @@ -322,6 +322,9 @@ "ta_PP-OCRv5_mobile_rec", "devanagari_PP-OCRv5_mobile_rec", "cyrillic_PP-OCRv5_mobile_rec", + "G2PWModel", + "fastspeech2_csmsc", + "pwgan_csmsc", ] @@ -478,12 +481,12 @@ def _download(self, model_name, save_dir): class _HuggingFaceModelHoster(_BaseModelHoster): model_list = OCR_MODELS alias = "huggingface" - healthcheck_url = "https://huggingface.co" + healthcheck_url = HUGGING_FACE_ENDPOINT def _download(self, model_name, save_dir): def _clone(local_dir): hf_hub.snapshot_download( - repo_id=f"PaddlePaddle/{model_name}", local_dir=local_dir + repo_id=f"PaddlePaddle/{model_name}", local_dir=local_dir, endpoint=HUGGING_FACE_ENDPOINT ) if os.path.exists(save_dir): @@ -541,18 +544,35 @@ def _clone(local_dir): class _ModelManager: model_list = ALL_MODELS _save_dir = Path(CACHE_DIR) / "official_models" + hoster_candidates = [ + _HuggingFaceModelHoster, + _AIStudioModelHoster, + _ModelScopeModelHoster, + _BosModelHoster, + ] def __init__(self) -> None: self._hosters = self._build_hosters() def _build_hosters(self): + + if DISABLE_MODEL_SOURCE_CHECK: + logging.warning( + f"Connectivity check to the model hoster has been skipped because `PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK` is enabled." + ) + hosters = [] + for hoster_cls in self.hoster_candidates: + if hoster_cls.alias == MODEL_SOURCE: + hosters.insert(0, hoster_cls(self._save_dir)) + else: + hosters.append(hoster_cls(self._save_dir)) + return hosters + + logging.warning( + f"Checking connectivity to the model hosters, this may take a while. To bypass this check, set `PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK` to `True`." 
+ ) hosters = [] - for hoster_cls in [ - _HuggingFaceModelHoster, - _AIStudioModelHoster, - _ModelScopeModelHoster, - _BosModelHoster, - ]: + for hoster_cls in self.hoster_candidates: if hoster_cls.alias == MODEL_SOURCE: if hoster_cls.is_available(): hosters.insert(0, hoster_cls(self._save_dir)) @@ -561,7 +581,7 @@ def _build_hosters(self): hosters.append(hoster_cls(self._save_dir)) if len(hosters) == 0: logging.warning( - f"No model hoster is available! Please check your network connection to one of the following model hosts: HuggingFace ({_HuggingFaceModelHoster.healthcheck_url}), ModelScope ({_ModelScopeModelHoster.healthcheck_url}), AIStudio ({_AIStudioModelHoster.healthcheck_url}), or BOS ({_BosModelHoster.healthcheck_url}). Otherwise, only local models can be used." + f"No model hoster is available! Please check your network connection to one of the following model hoster: HuggingFace ({_HuggingFaceModelHoster.healthcheck_url}), ModelScope ({_ModelScopeModelHoster.healthcheck_url}), AIStudio ({_AIStudioModelHoster.healthcheck_url}), or BOS ({_BosModelHoster.healthcheck_url}). Otherwise, only local models can be used." ) return hosters diff --git a/paddlex/inference/utils/pdfium_lock.py b/paddlex/inference/utils/pdfium_lock.py new file mode 100644 index 0000000000..7e76d74a2a --- /dev/null +++ b/paddlex/inference/utils/pdfium_lock.py @@ -0,0 +1,31 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +""" +Global lock for pypdfium2 operations. + +PDFium is inherently not thread-safe. It is not allowed to call pdfium +functions simultaneously across different threads, not even with different +documents. Simultaneous pdfium calls across threads will crash or corrupt +the process. + +See: https://pypdfium2.readthedocs.io/en/stable/python_api.html + +This module provides a global lock that must be used to serialize all +pypdfium2 operations across the application. +""" + +import threading + +pdfium_lock = threading.Lock() diff --git a/paddlex/inference/utils/pp_option.py b/paddlex/inference/utils/pp_option.py index 07c7a0b91e..366174c7c9 100644 --- a/paddlex/inference/utils/pp_option.py +++ b/paddlex/inference/utils/pp_option.py @@ -18,7 +18,7 @@ from ...utils import logging from ...utils.device import get_default_device, parse_device, set_env_for_device_type -from ...utils.flags import ENABLE_MKLDNN_BYDEFAULT, USE_PIR_TRT, DISABLE_DEVICE_FALLBACK +from ...utils.flags import DISABLE_DEVICE_FALLBACK, ENABLE_MKLDNN_BYDEFAULT, USE_PIR_TRT from .misc import is_mkldnn_available from .mkldnn_blocklist import MKLDNN_BLOCKLIST from .new_ir_blocklist import NEWIR_BLOCKLIST @@ -54,7 +54,17 @@ class PaddlePredictorOption(object): "mkldnn", "mkldnn_bf16", ) - SUPPORT_DEVICE = ("gpu", "cpu", "npu", "xpu", "mlu", "dcu", "gcu", "iluvatar_gpu") + SUPPORT_DEVICE = ( + "gpu", + "cpu", + "npu", + "xpu", + "mlu", + "dcu", + "gcu", + "iluvatar_gpu", + "metax_gpu", + ) def __init__(self, **kwargs): super().__init__() @@ -84,7 +94,10 @@ def setdefault_by_model_name(self, model_name): if self.device_type == "gpu": import paddle - if not (paddle.device.is_compiled_with_cuda() and paddle.device.cuda.device_count() > 0): + if not ( + paddle.device.is_compiled_with_cuda() + and paddle.device.cuda.device_count() > 0 + ): if DISABLE_DEVICE_FALLBACK: raise RuntimeError( "Device fallback is disabled and the specified device (GPU) is not available. 
" diff --git a/paddlex/inference/utils/trt_config.py b/paddlex/inference/utils/trt_config.py index 2625c57f8c..4b04a95bfe 100644 --- a/paddlex/inference/utils/trt_config.py +++ b/paddlex/inference/utils/trt_config.py @@ -406,6 +406,9 @@ def _load(self): "PP-YOLOE_plus-M": { "ops_run_float": {"pd_op.conv2d", "pd_op.fused_conv2d_add_act"} }, + "RT-DETR-X": { + "disable_ops": ["pd_op.flip"], + }, } diff --git a/paddlex/modules/__init__.py b/paddlex/modules/__init__.py index ca3370d812..b7078283e9 100644 --- a/paddlex/modules/__init__.py +++ b/paddlex/modules/__init__.py @@ -88,6 +88,24 @@ TextRecExportor, TextRecTrainer, ) +from .text_to_pinyin import ( + TextToPinyinDatasetChecker, + TextToPinyinEvaluator, + TextToPinyinExportor, + TextToPinyinTrainer, +) +from .text_to_speech_acoustic import ( + TextToSpeechAcousticDatasetChecker, + TextToSpeechAcousticEvaluator, + TextToSpeechAcousticExportor, + TextToSpeechAcousticTrainer, +) +from .text_to_speech_vocoder import ( + TextToSpeechVocoderDatasetChecker, + TextToSpeechVocoderEvaluator, + TextToSpeechVocoderExportor, + TextToSpeechVocoderTrainer, +) from .ts_anomaly_detection import ( TSADDatasetChecker, TSADEvaluator, diff --git a/paddlex/modules/base/build_model.py b/paddlex/modules/base/build_model.py index 32b1b96e72..1e828a9815 100644 --- a/paddlex/modules/base/build_model.py +++ b/paddlex/modules/base/build_model.py @@ -21,7 +21,7 @@ def build_model(model_name: str, config_path: str = None) -> tuple: Args: model_name (str): model name - device (str): device, such as gpu, cpu, npu, xpu, mlu, gcu + device (str): device, such as gpu, cpu, npu, xpu, mlu, gcu, metax_gpu config_path (str, optional): path to the PaddleX config yaml file. Defaults to None, i.e. using the default config file. 
diff --git a/paddlex/modules/doc_vlm/model_list.py b/paddlex/modules/doc_vlm/model_list.py index c09af0b36f..9475bec31b 100644 --- a/paddlex/modules/doc_vlm/model_list.py +++ b/paddlex/modules/doc_vlm/model_list.py @@ -19,4 +19,5 @@ "PP-Chart2Table", "PP-DocBee2-3B", "PaddleOCR-VL-0.9B", + "PaddleOCR-VL-1.5-0.9B", ] diff --git a/paddlex/modules/object_detection/model_list.py b/paddlex/modules/object_detection/model_list.py index 0df27337fb..08569877d0 100644 --- a/paddlex/modules/object_detection/model_list.py +++ b/paddlex/modules/object_detection/model_list.py @@ -83,5 +83,6 @@ "PP-DocLayout-S", "PP-DocLayout_plus-L", "PP-DocBlockLayout", - "PP-DocLayoutV2", ] + +LAYOUTANALYSIS_MODELS = ["PP-DocLayoutV2", "PP-DocLayoutV3"] diff --git a/paddlex/modules/text_recognition/model_list.py b/paddlex/modules/text_recognition/model_list.py index 5976e4b5dd..3a6236871b 100644 --- a/paddlex/modules/text_recognition/model_list.py +++ b/paddlex/modules/text_recognition/model_list.py @@ -43,5 +43,5 @@ "cyrillic_PP-OCRv5_mobile_rec", "devanagari_PP-OCRv5_mobile_rec", "ta_PP-OCRv5_mobile_rec", - "te_PP-OCRv5_mobile_rec" + "te_PP-OCRv5_mobile_rec", ] diff --git a/paddlex/modules/text_to_pinyin/__init__.py b/paddlex/modules/text_to_pinyin/__init__.py new file mode 100644 index 0000000000..18848cb9ed --- /dev/null +++ b/paddlex/modules/text_to_pinyin/__init__.py @@ -0,0 +1,18 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from .dataset_checker import TextToPinyinDatasetChecker +from .evaluator import TextToPinyinEvaluator +from .exportor import TextToPinyinExportor +from .trainer import TextToPinyinTrainer diff --git a/paddlex/modules/text_to_pinyin/dataset_checker.py b/paddlex/modules/text_to_pinyin/dataset_checker.py new file mode 100644 index 0000000000..d5f2b78d24 --- /dev/null +++ b/paddlex/modules/text_to_pinyin/dataset_checker.py @@ -0,0 +1,27 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ...utils.errors import UnsupportedAPIError +from ..base import BaseDatasetChecker +from .model_list import MODELS + + +class TextToPinyinDatasetChecker(BaseDatasetChecker): + """Dataset Checker for G2PWModel Model""" + + entities = MODELS + + def __init__(self, config): + # not support for now + raise UnsupportedAPIError("G2PWModel model not support data check for now.") diff --git a/paddlex/modules/text_to_pinyin/evaluator.py b/paddlex/modules/text_to_pinyin/evaluator.py new file mode 100644 index 0000000000..719bf3483e --- /dev/null +++ b/paddlex/modules/text_to_pinyin/evaluator.py @@ -0,0 +1,27 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ...utils.errors import UnsupportedAPIError +from ..base import BaseEvaluator +from .model_list import MODELS + + +class TextToPinyinEvaluator(BaseEvaluator): + """Instance G2PWModel Model Evaluator""" + + entities = MODELS + + def __init__(self, config): + # not support for now + raise UnsupportedAPIError("G2PWModel model not support evaluate for now.") diff --git a/paddlex/modules/text_to_pinyin/exportor.py b/paddlex/modules/text_to_pinyin/exportor.py new file mode 100644 index 0000000000..cac6a15ec0 --- /dev/null +++ b/paddlex/modules/text_to_pinyin/exportor.py @@ -0,0 +1,27 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from ...utils.errors import UnsupportedAPIError +from ..base import BaseExportor +from .model_list import MODELS + + +class TextToPinyinExportor(BaseExportor): + """Instance G2PWModel Model Exportor""" + + entities = MODELS + + def __init__(self, config): + # not support for now + raise UnsupportedAPIError("G2PWModel model not support export for now.") diff --git a/paddlex/modules/text_to_pinyin/model_list.py b/paddlex/modules/text_to_pinyin/model_list.py new file mode 100644 index 0000000000..0c4129180f --- /dev/null +++ b/paddlex/modules/text_to_pinyin/model_list.py @@ -0,0 +1,18 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +MODELS = [ + "G2PWModel", +] diff --git a/paddlex/modules/text_to_pinyin/trainer.py b/paddlex/modules/text_to_pinyin/trainer.py new file mode 100644 index 0000000000..4102cb1f4a --- /dev/null +++ b/paddlex/modules/text_to_pinyin/trainer.py @@ -0,0 +1,40 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from ...utils.errors import UnsupportedAPIError +from ..base import BaseTrainer +from .model_list import MODELS + + +class TextToPinyinTrainer(BaseTrainer): + """Instance G2PWModel Model Trainer""" + + entities = MODELS + + def __init__(self, config): + # not support for now + raise UnsupportedAPIError("G2PWModel model not support train for now.") + + def update_config(self): + """update training config""" + pass + + def get_train_kwargs(self) -> dict: + """get key-value arguments of model training function + + Returns: + dict: the arguments of training function. + """ + train_args = {"device": self.get_device()} + return train_args diff --git a/paddlex/modules/text_to_speech_acoustic/__init__.py b/paddlex/modules/text_to_speech_acoustic/__init__.py new file mode 100644 index 0000000000..da3a8e49ac --- /dev/null +++ b/paddlex/modules/text_to_speech_acoustic/__init__.py @@ -0,0 +1,18 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from .dataset_checker import TextToSpeechAcousticDatasetChecker +from .evaluator import TextToSpeechAcousticEvaluator +from .exportor import TextToSpeechAcousticExportor +from .trainer import TextToSpeechAcousticTrainer diff --git a/paddlex/modules/text_to_speech_acoustic/dataset_checker.py b/paddlex/modules/text_to_speech_acoustic/dataset_checker.py new file mode 100644 index 0000000000..fc44b619ca --- /dev/null +++ b/paddlex/modules/text_to_speech_acoustic/dataset_checker.py @@ -0,0 +1,29 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ...utils.errors import UnsupportedAPIError +from ..base import BaseDatasetChecker +from .model_list import MODELS + + +class TextToSpeechAcousticDatasetChecker(BaseDatasetChecker): + """Dataset Checker for Fastspeech2 Model""" + + entities = MODELS + + def __init__(self, config): + # not support for now + raise UnsupportedAPIError( + "Fastspeech2Model model not support data check for now." + ) diff --git a/paddlex/modules/text_to_speech_acoustic/evaluator.py b/paddlex/modules/text_to_speech_acoustic/evaluator.py new file mode 100644 index 0000000000..de8956ca17 --- /dev/null +++ b/paddlex/modules/text_to_speech_acoustic/evaluator.py @@ -0,0 +1,27 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ...utils.errors import UnsupportedAPIError +from ..base import BaseEvaluator +from .model_list import MODELS + + +class TextToSpeechAcousticEvaluator(BaseEvaluator): + """Instance Fastspeech2 Model Evaluator""" + + entities = MODELS + + def __init__(self, config): + # not support for now + raise UnsupportedAPIError("Fastspeech2 model not support evaluate for now.") diff --git a/paddlex/modules/text_to_speech_acoustic/exportor.py b/paddlex/modules/text_to_speech_acoustic/exportor.py new file mode 100644 index 0000000000..c475f903dc --- /dev/null +++ b/paddlex/modules/text_to_speech_acoustic/exportor.py @@ -0,0 +1,27 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from ...utils.errors import UnsupportedAPIError +from ..base import BaseExportor +from .model_list import MODELS + + +class TextToSpeechAcousticExportor(BaseExportor): + """Instance Fastspeech2Model Model Exportor""" + + entities = MODELS + + def __init__(self, config): + # not support for now + raise UnsupportedAPIError("Fastspeech2Model model not support export for now.") diff --git a/paddlex/modules/text_to_speech_acoustic/model_list.py b/paddlex/modules/text_to_speech_acoustic/model_list.py new file mode 100644 index 0000000000..e3527df335 --- /dev/null +++ b/paddlex/modules/text_to_speech_acoustic/model_list.py @@ -0,0 +1,17 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +MODELS = [ + "fastspeech2_csmsc", +] diff --git a/paddlex/modules/text_to_speech_acoustic/trainer.py b/paddlex/modules/text_to_speech_acoustic/trainer.py new file mode 100644 index 0000000000..b47a2b8951 --- /dev/null +++ b/paddlex/modules/text_to_speech_acoustic/trainer.py @@ -0,0 +1,40 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ...utils.errors import UnsupportedAPIError +from ..base import BaseTrainer +from .model_list import MODELS + + +class TextToSpeechAcousticTrainer(BaseTrainer): + """Instance Fastspeech2Model Model Trainer""" + + entities = MODELS + + def __init__(self, config): + # not support for now + raise UnsupportedAPIError("Fastspeech2Model model not support train for now.") + + def update_config(self): + """update training config""" + pass + + def get_train_kwargs(self) -> dict: + """get key-value arguments of model training function + + Returns: + dict: the arguments of training function. + """ + train_args = {"device": self.get_device()} + return train_args diff --git a/paddlex/modules/text_to_speech_vocoder/__init__.py b/paddlex/modules/text_to_speech_vocoder/__init__.py new file mode 100644 index 0000000000..1c8d4fd93f --- /dev/null +++ b/paddlex/modules/text_to_speech_vocoder/__init__.py @@ -0,0 +1,18 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from .dataset_checker import TextToSpeechVocoderDatasetChecker +from .evaluator import TextToSpeechVocoderEvaluator +from .exportor import TextToSpeechVocoderExportor +from .trainer import TextToSpeechVocoderTrainer diff --git a/paddlex/modules/text_to_speech_vocoder/dataset_checker.py b/paddlex/modules/text_to_speech_vocoder/dataset_checker.py new file mode 100644 index 0000000000..30e4d1528d --- /dev/null +++ b/paddlex/modules/text_to_speech_vocoder/dataset_checker.py @@ -0,0 +1,29 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ...utils.errors import UnsupportedAPIError +from ..base import BaseDatasetChecker +from .model_list import MODELS + + +class TextToSpeechVocoderDatasetChecker(BaseDatasetChecker): + """Dataset Checker for Fastspeech2 Model""" + + entities = MODELS + + def __init__(self, config): + # not support for now + raise UnsupportedAPIError( + "Fastspeech2Model model not support data check for now." + ) diff --git a/paddlex/modules/text_to_speech_vocoder/evaluator.py b/paddlex/modules/text_to_speech_vocoder/evaluator.py new file mode 100644 index 0000000000..de073b815b --- /dev/null +++ b/paddlex/modules/text_to_speech_vocoder/evaluator.py @@ -0,0 +1,27 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ...utils.errors import UnsupportedAPIError +from ..base import BaseEvaluator +from .model_list import MODELS + + +class TextToSpeechVocoderEvaluator(BaseEvaluator): + """Instance PWGan Model Model Evaluator""" + + entities = MODELS + + def __init__(self, config): + # not support for now + raise UnsupportedAPIError("PWGan model not support evaluate for now.") diff --git a/paddlex/modules/text_to_speech_vocoder/exportor.py b/paddlex/modules/text_to_speech_vocoder/exportor.py new file mode 100644 index 0000000000..8339fb5659 --- /dev/null +++ b/paddlex/modules/text_to_speech_vocoder/exportor.py @@ -0,0 +1,27 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from ...utils.errors import UnsupportedAPIError +from ..base import BaseExportor +from .model_list import MODELS + + +class TextToSpeechVocoderExportor(BaseExportor): + """Instance PWGanModel Model Exportor""" + + entities = MODELS + + def __init__(self, config): + # not support for now + raise UnsupportedAPIError("PWGanModel model not support export for now.") diff --git a/paddlex/modules/text_to_speech_vocoder/model_list.py b/paddlex/modules/text_to_speech_vocoder/model_list.py new file mode 100644 index 0000000000..0f3eccef75 --- /dev/null +++ b/paddlex/modules/text_to_speech_vocoder/model_list.py @@ -0,0 +1,17 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +MODELS = [ + "pwgan_csmsc", +] diff --git a/paddlex/modules/text_to_speech_vocoder/trainer.py b/paddlex/modules/text_to_speech_vocoder/trainer.py new file mode 100644 index 0000000000..2833b61b00 --- /dev/null +++ b/paddlex/modules/text_to_speech_vocoder/trainer.py @@ -0,0 +1,40 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ...utils.errors import UnsupportedAPIError +from ..base import BaseTrainer +from .model_list import MODELS + + +class TextToSpeechVocoderTrainer(BaseTrainer): + """Instance PWGanModel Model Trainer""" + + entities = MODELS + + def __init__(self, config): + # not support for now + raise UnsupportedAPIError("PWGanModel model not support train for now.") + + def update_config(self): + """update training config""" + pass + + def get_train_kwargs(self) -> dict: + """get key-value arguments of model training function + + Returns: + dict: the arguments of training function. + """ + train_args = {"device": self.get_device()} + return train_args diff --git a/paddlex/paddlex_cli.py b/paddlex/paddlex_cli.py index e80c5e4e22..78222f32d3 100644 --- a/paddlex/paddlex_cli.py +++ b/paddlex/paddlex_cli.py @@ -324,9 +324,6 @@ def _install_hpi_deps(device_type): ) def _install_genai_deps(plugin_types): - if not is_cuda_available(): - sys.exit("Currently, only GPU devices are supported.") - fd_plugin_types = [] not_fd_plugin_types = [] for plugin_type in plugin_types: @@ -372,21 +369,24 @@ def _install_genai_deps(plugin_types): for plugin_type in plugin_types: if "vllm" in plugin_type or "sglang" in plugin_type: - try: - install_packages(["wheel"], constraints="required") - cap = get_gpu_compute_capability() - if cap >= (12, 0): - install_packages( - ["xformers", "flash-attn == 2.8.3"], constraints="required" - ) - else: - install_packages( - ["xformers", "flash-attn == 2.8.2"], constraints="required" - ) - except Exception: - logging.error("Installation failed", 
exc_info=True) - sys.exit(1) - break + install_packages(["xformers"], constraints="required") + if is_cuda_available(): + try: + install_packages(["wheel"], constraints="required") + cap = get_gpu_compute_capability() + assert cap is not None + if cap >= (12, 0): + install_packages( + ["flash-attn == 2.8.3"], constraints="required" + ) + else: + install_packages( + ["flash-attn == 2.8.2"], constraints="required" + ) + except Exception: + logging.error("Installation failed", exc_info=True) + sys.exit(1) + break logging.info( "Successfully installed the generative AI plugin" diff --git a/paddlex/repo_apis/PaddleDetection_api/instance_seg/config.py b/paddlex/repo_apis/PaddleDetection_api/instance_seg/config.py index bd916fc1f6..700512f70b 100644 --- a/paddlex/repo_apis/PaddleDetection_api/instance_seg/config.py +++ b/paddlex/repo_apis/PaddleDetection_api/instance_seg/config.py @@ -273,6 +273,9 @@ def update_device(self, device_type: str): elif device_type.lower() == "gcu": self["use_gcu"] = True self["use_gpu"] = False + elif device_type.lower() == "metax_gpu": + self["use_metax_gpu"] = True + self["use_gpu"] = False else: assert device_type.lower() == "cpu" self["use_gpu"] = False diff --git a/paddlex/repo_apis/PaddleDetection_api/object_det/config.py b/paddlex/repo_apis/PaddleDetection_api/object_det/config.py index 026be3aa22..8285bda439 100644 --- a/paddlex/repo_apis/PaddleDetection_api/object_det/config.py +++ b/paddlex/repo_apis/PaddleDetection_api/object_det/config.py @@ -294,6 +294,9 @@ def update_device(self, device_type: str): elif device_type.lower() == "gcu": self["use_gcu"] = True self["use_gpu"] = False + elif device_type.lower() == "metax_gpu": + self["use_metax_gpu"] = True + self["use_gpu"] = False else: assert device_type.lower() == "cpu" self["use_gpu"] = False diff --git a/paddlex/repo_apis/PaddleOCR_api/formula_rec/config.py b/paddlex/repo_apis/PaddleOCR_api/formula_rec/config.py index 57e8e4f7b8..11d67d2393 100644 --- 
a/paddlex/repo_apis/PaddleOCR_api/formula_rec/config.py +++ b/paddlex/repo_apis/PaddleOCR_api/formula_rec/config.py @@ -238,6 +238,7 @@ def update_device(self, device: str): "Global.use_npu": False, "Global.use_mlu": False, "Global.use_gcu": False, + "Global.use_metax_gpu": False, } device_cfg = { @@ -247,6 +248,7 @@ def update_device(self, device: str): "mlu": {"Global.use_mlu": True}, "npu": {"Global.use_npu": True}, "gcu": {"Global.use_gcu": True}, + "metax_gpu": {"Global.use_metax_gpu": True}, } default_cfg.update(device_cfg[device]) self.update(default_cfg) diff --git a/paddlex/repo_apis/PaddleOCR_api/text_rec/config.py b/paddlex/repo_apis/PaddleOCR_api/text_rec/config.py index 92dcc172d3..3c3fdb8c1a 100644 --- a/paddlex/repo_apis/PaddleOCR_api/text_rec/config.py +++ b/paddlex/repo_apis/PaddleOCR_api/text_rec/config.py @@ -250,6 +250,7 @@ def update_device(self, device: str): "Global.use_mlu": False, "Global.use_gcu": False, "Global.use_iluvatar_gpu": False, + "Global.use_metax_gpu": False, } device_cfg = { @@ -260,6 +261,7 @@ def update_device(self, device: str): "npu": {"Global.use_npu": True}, "gcu": {"Global.use_gcu": True}, "iluvatar_gpu": {"Global.use_iluvatar_gpu": True}, + "metax_gpu": {"Global.use_metax_gpu": True}, } default_cfg.update(device_cfg[device]) self.update(default_cfg) diff --git a/paddlex/repo_apis/base/runner.py b/paddlex/repo_apis/base/runner.py index 11722c254b..f9e11079ad 100644 --- a/paddlex/repo_apis/base/runner.py +++ b/paddlex/repo_apis/base/runner.py @@ -205,6 +205,8 @@ def distributed(self, device, ips=None, log_dir=None): new_env["MLU_VISIBLE_DEVICES"] = dev_ids elif device == "gcu": new_env["TOPS_VISIBLE_DEVICES"] = dev_ids + elif device == "metax_gpu": + new_env["MACA_VISIBLE_DEVICES"] = dev_ids else: new_env["CUDA_VISIBLE_DEVICES"] = dev_ids return args, new_env diff --git a/paddlex/utils/custom_device_list.py b/paddlex/utils/custom_device_list.py old mode 100644 new mode 100755 index 88abe61db9..de06b0110e --- 
a/paddlex/utils/custom_device_list.py +++ b/paddlex/utils/custom_device_list.py @@ -311,3 +311,131 @@ "PP-OCRv4_mobile_rec", "PP-OCRv4_server_rec", ] + +METAX_GPU_WHITELIST = [ + "ConvNeXt_base_224", + "ConvNeXt_base_384", + "ConvNeXt_large_224", + "ConvNeXt_large_384", + "ConvNeXt_small", + "ConvNeXt_tiny", + "FasterNet-L", + "FasterNet-M", + "FasterNet-S", + "FasterNet-T0", + "FasterNet-T1", + "FasterNet-T2", + "PP-LCNet_x1_0_doc_ori", + "UVDoc", + "PP-DocBlockLayout", + "PP-DocLayout_plus-L", + "PP-LCNet_x1_0_textline_ori", + "PP-LCNet_x1_0_table_cls", + "PP-OCRv5_server_rec", + "PP-OCRv5_server_det", + "SLANeXt_wired", + "SLANet_plus", + "RT-DETR-L_wired_table_cell_det", + "RT-DETR-L_wireless_table_cell_det", + "PP-FormulaNet_plus-L", + "MobileNetV1_x0_25", + "MobileNetV1_x0_5", + "MobileNetV1_x0_75", + "MobileNetV1_x1_0", + "MobileNetV2_x0_25", + "MobileNetV2_x0_5", + "MobileNetV2_x1_0", + "MobileNetV2_x1_5", + "MobileNetV2_x2_0", + "MobileNetV3_large_x0_35", + "MobileNetV3_large_x0_5", + "MobileNetV3_large_x0_75", + "MobileNetV3_large_x1_0", + "MobileNetV3_large_x1_25", + "MobileNetV3_small_x0_35", + "MobileNetV3_small_x0_5", + "MobileNetV3_small_x0_75", + "MobileNetV3_small_x1_0", + "MobileNetV3_small_x1_25", + "MobileNetV4_conv_large", + "MobileNetV4_conv_medium", + "MobileNetV4_conv_small", + "PP-HGNet_base", + "PP-HGNet_small", + "PP-HGNet_tiny", + "PP-HGNetV2-B0", + "PP-HGNetV2-B1", + "PP-HGNetV2-B2", + "PP-HGNetV2-B3", + "PP-HGNetV2-B4", + "PP-HGNetV2-B5", + "PP-HGNetV2-B6", + "PP-LCNet_x0_25", + "PP-LCNet_x0_35", + "PP-LCNet_x0_5", + "PP-LCNet_x0_75", + "PP-LCNet_x1_0", + "PP-LCNet_x1_5", + "PP-LCNet_x2_0", + "PP-LCNet_x2_5", + "PP-LCNetV2_base", + "PP-LCNetV2_large", + "PP-LCNetV2_small", + "ResNet18_vd", + "ResNet18", + "ResNet34_vd", + "ResNet34", + "ResNet50_vd", + "ResNet50", + "ResNet101_vd", + "ResNet101", + "ResNet152_vd", + "ResNet152", + "ResNet200_vd", + "StarNet-S1", + "StarNet-S2", + "StarNet-S3", + "StarNet-S4", + "FCOS-ResNet50", + 
"PicoDet-L", + "PicoDet-M", + "PicoDet-S", + "PicoDet-XS", + "PP-YOLOE_plus-L", + "PP-YOLOE_plus-M", + "PP-YOLOE_plus-S", + "PP-YOLOE_plus-X", + "RT-DETR-H", + "RT-DETR-L", + "RT-DETR-R18", + "RT-DETR-R50", + "RT-DETR-X", + "PP-YOLOE-L_human", + "PP-YOLOE-S_human", + "PP-OCRv4_mobile_det", + "PP-OCRv4_server_det", + "PP-OCRv4_mobile_rec", + "PP-OCRv4_server_rec", + "PP-DocLayoutV2", + "PP-ShiTuV2_rec", + "PP-ShiTuV2_det", + "PP-OCRv5_mobile_det", + "PP-OCRv4_server_seal_det", + "SLANet", + "MobileFaceNet", + "PP-LCNet_x0_25_textline_ori", + "PP-LCNet_x1_0_ML", + "PP-LCNet_x1_0_pedestrian_attribute", + "PP-LCNet_x1_0_vehicle_attribute", + "PP-FormulaNet_plus-M", + "PicoDet_LCNet_x2_5_face", + "PP-YOLOE_plus_SOD-S", + "PP-TinyPose_128x96", + "PP-LiteSeg-T", + "GroundingDINO-T", + "STFPM", + "DLinear", + "AutoEncoder_ad", + "SAM-H_box", + "PP-YOLOE-S_vehicle", +] diff --git a/paddlex/utils/deps.py b/paddlex/utils/deps.py index c1781308b8..cef9ff80ea 100644 --- a/paddlex/utils/deps.py +++ b/paddlex/utils/deps.py @@ -107,6 +107,8 @@ def is_dep_available(dep, /, check_version=False): return importlib.util.find_spec("ultra_infer") is not None elif dep == "fastdeploy": return importlib.util.find_spec("fastdeploy") is not None + elif dep == "onnxruntime": + return importlib.util.find_spec("onnxruntime") is not None version = get_dep_version(dep) if version is None: return False @@ -308,7 +310,7 @@ def get_genai_fastdeploy_spec(device_type): if device_type not in SUPPORTED_DEVICE_TYPES: raise ValueError(f"Unsupported device type: {device_type}") if device_type == "gpu": - return "fastdeploy-gpu == 2.3.0rc0" + return "fastdeploy-gpu == 2.3.0" else: raise AssertionError diff --git a/paddlex/utils/device.py b/paddlex/utils/device.py index 04fa0537b8..e065febed0 100644 --- a/paddlex/utils/device.py +++ b/paddlex/utils/device.py @@ -19,13 +19,24 @@ from .custom_device_list import ( DCU_WHITELIST, GCU_WHITELIST, + METAX_GPU_WHITELIST, MLU_WHITELIST, NPU_BLACKLIST, XPU_WHITELIST, ) 
from .flags import DISABLE_DEV_MODEL_WL -SUPPORTED_DEVICE_TYPE = ["cpu", "gpu", "xpu", "npu", "mlu", "gcu", "dcu", "iluvatar_gpu"] +SUPPORTED_DEVICE_TYPE = [ + "cpu", + "gpu", + "xpu", + "npu", + "mlu", + "gcu", + "dcu", + "iluvatar_gpu", + "metax_gpu", +] def constr_device(device_type, device_ids): @@ -116,6 +127,9 @@ def _set(envs): "XPU_BLACK_LIST": "pad3d", } _set(envs) + if device_type.lower() == "metax_gpu": + envs = {"FLAGS_use_stride_kernel": "0"} + _set(envs) if device_type.lower() == "mlu": envs = { "FLAGS_use_stride_kernel": "0", @@ -142,6 +156,11 @@ def check_supported_device_type(device_type, model_name): assert model_name in MLU_WHITELIST, ( f"The MLU device does not yet support `{model_name}` model!" + tips ) + elif device_type == "metax_gpu": + assert model_name in METAX_GPU_WHITELIST, ( + f"The METAX_GPU device does not yet support `{model_name}` model!" + tips + ) + elif device_type == "npu": assert model_name not in NPU_BLACKLIST, ( f"The NPU device does not yet support `{model_name}` model!" + tips diff --git a/paddlex/utils/env.py b/paddlex/utils/env.py index 1991d0ec51..f42b0130c5 100644 --- a/paddlex/utils/env.py +++ b/paddlex/utils/env.py @@ -65,13 +65,18 @@ def is_cuda_available(): import paddle.device # TODO: Check runtime availability - return paddle.device.is_compiled_with_cuda() + return ( + paddle.device.is_compiled_with_cuda() and not paddle.is_compiled_with_rocm() + ) else: # If Paddle is unavailable, check GPU availability using PyTorch API. require_deps("torch") + import torch.cuda + import torch.version - return torch.cuda.is_available() + # Distinguish GPUs and DCUs by checking `torch.version.cuda` + return torch.cuda.is_available() and torch.version.cuda def get_gpu_compute_capability(): @@ -85,6 +90,7 @@ def get_gpu_compute_capability(): else: # If Paddle is unavailable, retrieve GPU compute capability from PyTorch instead. 
require_deps("torch") + import torch.cuda cap = torch.cuda.get_device_capability() diff --git a/paddlex/utils/flags.py b/paddlex/utils/flags.py index b49b2160e2..1fcf547335 100644 --- a/paddlex/utils/flags.py +++ b/paddlex/utils/flags.py @@ -66,7 +66,11 @@ def get_flag_from_env_var(name, default, format_func=str): ) MODEL_SOURCE = os.environ.get("PADDLE_PDX_MODEL_SOURCE", "huggingface").lower() +DISABLE_MODEL_SOURCE_CHECK = os.environ.get( + "PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK", False +) +HUGGING_FACE_ENDPOINT = os.environ.get("PADDLE_PDX_HUGGING_FACE_ENDPOINT", "https://huggingface.co") # Inference Benchmark INFER_BENCHMARK = get_flag_from_env_var("PADDLE_PDX_INFER_BENCHMARK", False) diff --git a/paddlex/utils/pipeline_arguments.py b/paddlex/utils/pipeline_arguments.py index ff31dc1eec..386cb02e08 100644 --- a/paddlex/utils/pipeline_arguments.py +++ b/paddlex/utils/pipeline_arguments.py @@ -805,4 +805,5 @@ def validator(cli_input: str) -> cli_expected_type: ], "3d_bev_detection": None, "multilingual_speech_recognition": None, + "text_to_speech": None, } diff --git a/paddlex/version.py b/paddlex/version.py index 28e2296990..63e81e1e4e 100644 --- a/paddlex/version.py +++ b/paddlex/version.py @@ -14,16 +14,30 @@ import os +import sys __all__ = ["get_pdx_version", "get_version_dict", "show_versions"] +def _get_package_dir(): + """Get the paddlex package directory, compatible with PyInstaller.""" + # When running in a PyInstaller bundle, sys._MEIPASS points to the + # temporary folder where the bundled files are extracted + if getattr(sys, "frozen", False) and hasattr(sys, "_MEIPASS"): + return os.path.join(sys._MEIPASS, "paddlex") + return os.path.dirname(__file__) + + def get_pdx_version(): """get_pdx_version""" - with open( - os.path.join(os.path.dirname(__file__), ".version"), "r", encoding="ascii" - ) as fv: - ver = fv.read().rstrip() + version_file = os.path.join(_get_package_dir(), ".version") + try: + with open(version_file, "r", encoding="ascii") as fv: + 
ver = fv.read().rstrip() + except FileNotFoundError: + # Fallback version if .version file is not found (e.g., in some + # PyInstaller configurations where the file wasn't included) + ver = "unknown" return ver diff --git a/setup.py b/setup.py index 4c1a69e7a0..c2401ff4c1 100644 --- a/setup.py +++ b/setup.py @@ -37,6 +37,7 @@ "GPUtil": ">= 1.4", "huggingface-hub": "", "imagesize": "", + "jieba": "", "Jinja2": "", "joblib": "", "langchain": ">= 0.2, < 1.0", @@ -48,6 +49,7 @@ "modelscope": ">=1.28.0", "numpy": ">= 1.24", "openai": ">= 1.63", + "OpenCC": "", "opencv-contrib-python": "== 4.10.0.84", "openpyxl": "", "packaging": "", @@ -60,14 +62,16 @@ "pycocotools": "<= 2.0.8", # pycocotools upgrade incompatible since 2.0.9 "pydantic": ">= 2", "pypdfium2": ">= 4", + "pypinyin": "", "python-bidi": "", "PyYAML": "== 6.0.2", "regex": "", "requests": "", "ruamel.yaml": "", - "safetensors": "", + "safetensors": ">= 0.7.0", "scikit-image": "", "scikit-learn": "", + "scipy": "", "sentencepiece": "", "shapely": "", "soundfile": "", @@ -110,6 +114,7 @@ # Currently `pypdfium2` is required by the image batch sampler "pypdfium2", "scikit-image", + "scipy", ], "multimodal": [ "einops", @@ -139,6 +144,7 @@ "pyclipper", "pypdfium2", "scikit-learn", + "scipy", "shapely", "tokenizers", ], @@ -155,6 +161,7 @@ "pyclipper", "pypdfium2", "scikit-learn", + "scipy", "shapely", "tokenizers", ], @@ -181,14 +188,19 @@ "regex", "safetensors", "scikit-learn", + "scipy", "sentencepiece", "shapely", "tiktoken", "tokenizers", + "beautifulsoup4", ], "speech": [ "ftfy", + "jieba", "Jinja2", + "OpenCC", + "pypinyin", "regex", "soundfile", "tqdm", diff --git a/tests/run_xpu_ci.sh b/tests/run_xpu_ci.sh new file mode 100644 index 0000000000..0e71b47c9c --- /dev/null +++ b/tests/run_xpu_ci.sh @@ -0,0 +1,16 @@ +wget https://paddle-model-ecology.bj.bcebos.com/paddlex/tmp/PaddleTest.tar +tar -xf PaddleTest.tar && rm -rf PaddleTest.tar +xpu-smi +python --version +python -m pip config set global.index-url 
https://pypi.tuna.tsinghua.edu.cn/simple +python -m pip install https://paddle-qa.bj.bcebos.com/paddle-pipeline/Develop-TagBuild-Training-Linux-Xpu-P800-SelfBuiltPypiUse/latest/paddlepaddle_xpu-0.0.0-cp310-cp310-linux_x86_64.whl +python -c "import paddle; paddle.version.show()" +cp -r PaddleTest/models/PaddleX/ci ./ +export DEVICE_ID=6,7 +export PADDLE_PDX_DISABLE_DEV_MODEL_WL=true +export MEM_SIZE=32 +export DEVICE_TYPE=xpu +rm -rf ci/pr_list.txt +mv ci/pr_list_xpu.txt ci/pr_list.txt +export PIP_DEFAULT_RETRIES=1 +bash ci/ci_run.sh