Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .github/workflows/test_python.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ jobs:
export PYTORCH_ROOT=$(python -c 'import torch;print(torch.__path__[0])')
source/install/uv_with_retry.sh pip install --system -e .[test,jax] mpi4py --group pin_jax
source/install/uv_with_retry.sh pip install --system --find-links "https://www.paddlepaddle.org.cn/packages/nightly/cpu/paddlepaddle/" --index-url https://pypi.org/simple paddlepaddle==3.3.0.dev20251204
source/install/uv_with_retry.sh pip install --system --no-build-isolation source/3rdparty/torch_openreg
env:
# Please note that uv has some issues with finding
# existing TensorFlow package. Currently, it uses
Expand Down Expand Up @@ -62,6 +63,7 @@ jobs:
NUM_WORKERS: 0
DP_CI_IMPORT_PADDLE_BEFORE_TF: 1
FLAGS_use_stride_compute_kernel: 0
DEVICE: openreg
- name: Test TF2 eager mode
run: pytest --cov=deepmd --cov-append source/tests/consistent/io/test_io.py source/jax2tf_tests
env:
Expand Down
1 change: 1 addition & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# See https://pre-commit.com for more information
# See https://pre-commit.com/hooks.html for more hooks
exclude: ^source/3rdparty/.+/
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v6.0.0
Expand Down
6 changes: 5 additions & 1 deletion deepmd/pt/utils/env.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,11 @@
# Rank of this process on the local node (0 when not launched via torchrun,
# which is what sets LOCAL_RANK in the environment).
LOCAL_RANK = os.environ.get("LOCAL_RANK")
LOCAL_RANK = int(0 if LOCAL_RANK is None else LOCAL_RANK)

# Resolve the compute device. The DEVICE environment variable can force a
# backend: "openreg" selects PyTorch's PrivateUse1 test backend (used in CI),
# "cpu" forces CPU; otherwise CUDA is used when available, else CPU.
_requested_device = os.environ.get("DEVICE")
if _requested_device == "openreg":
    # Fail fast with a clear message instead of a cryptic attribute/runtime
    # error later when tensors are first placed on the device.
    if not torch.openreg.is_available():
        raise RuntimeError("OpenReg backend is not available in this build.")
    DEVICE = torch.device("openreg")
elif _requested_device == "cpu" or torch.cuda.is_available() is False:
    DEVICE = torch.device("cpu")
else:
    DEVICE = torch.device(f"cuda:{LOCAL_RANK}")
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ pin_tensorflow_gpu = [
pin_pytorch_cpu = [
# https://github.com/pytorch/pytorch/issues/114602
# macos x86 has been deprecated
"torch>=2.8,<2.10; platform_machine!='x86_64' or platform_system != 'Darwin'",
"torch==2.10.0; platform_machine!='x86_64' or platform_system != 'Darwin'",
"torch; platform_machine=='x86_64' and platform_system == 'Darwin'",
]
pin_pytorch_gpu = [
Expand Down
1 change: 1 addition & 0 deletions source/3rdparty/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@
| json | https://github.com/nlohmann/json | 3.9.1 | MIT |
| Implib.so | https://github.com/yugr/Implib.so | 0ddaa71 | MIT |
| coverage_plugins | https://github.com/pytorch/pytorch | 2.2.0 | BSD-3 |
| torch_openreg | https://github.com/pytorch/pytorch | 2.10.0 | BSD-3 |
2 changes: 2 additions & 0 deletions source/3rdparty/torch_openreg/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
*.egg-info/
build/
44 changes: 44 additions & 0 deletions source/3rdparty/torch_openreg/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
cmake_minimum_required(VERSION 3.18 FATAL_ERROR)

project(TORCH_OPENREG CXX C)

include(GNUInstallDirs)
include(CheckCXXCompilerFlag)

# C++17 / C11 without compiler extensions, matching PyTorch's own build.
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_C_STANDARD 11)
set(CMAKE_CXX_EXTENSIONS OFF)

set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
set(CMAKE_SKIP_BUILD_RPATH FALSE)
set(CMAKE_BUILD_WITH_INSTALL_RPATH TRUE)
set(CMAKE_INSTALL_RPATH_USE_LINK_PATH FALSE)
set(CMAKE_CXX_VISIBILITY_PRESET hidden)

# Default to Release, but only when the user has not already chosen a build
# type, and only for single-config generators (multi-config generators such
# as Visual Studio and Xcode ignore CMAKE_BUILD_TYPE entirely). An
# unconditional FORCE here would stomp any user-provided value.
get_property(_torch_openreg_is_multi_config GLOBAL
             PROPERTY GENERATOR_IS_MULTI_CONFIG)
if(NOT _torch_openreg_is_multi_config AND NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
endif()

# Make the installed extension modules find their sibling DSOs next to
# themselves (lib/ subdirectory first, then the module's own directory).
if(APPLE)
  set(CMAKE_INSTALL_RPATH "@loader_path/lib;@loader_path")
elseif(UNIX)
  set(CMAKE_INSTALL_RPATH "$ORIGIN/lib:$ORIGIN")
elseif(WIN32)
  set(CMAKE_INSTALL_RPATH "")
endif()
set(CMAKE_INSTALL_LIBDIR lib)
set(CMAKE_INSTALL_MESSAGE NEVER)

# PYTORCH_INSTALL_DIR is expected to be passed in by the Python build driver
# (setup.py); locate the Torch CMake package relative to it.
set(Torch_DIR ${PYTORCH_INSTALL_DIR}/share/cmake/Torch)
find_package(Torch REQUIRED)

if(DEFINED PYTHON_INCLUDE_DIR)
  include_directories(${PYTHON_INCLUDE_DIR})
else()
  message(FATAL_ERROR "Cannot find Python directory")
endif()

include_directories(${CMAKE_CURRENT_SOURCE_DIR})
include(${PROJECT_SOURCE_DIR}/cmake/TorchPythonTargets.cmake)

add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/openreg)
add_subdirectory(${PROJECT_SOURCE_DIR}/csrc)
add_subdirectory(${PROJECT_SOURCE_DIR}/torch_openreg/csrc)
194 changes: 194 additions & 0 deletions source/3rdparty/torch_openreg/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,194 @@
# PyTorch OpenReg

## Background

The third-party device integration mechanism based on PrivateUse1 has become the official mainstream method for new backends to integrate with PyTorch. Ensuring the availability of this mechanism is crucial for enriching PyTorch's hardware ecosystem.

**Note:**

The goal of `torch_openreg` is **not to implement a fully functional, high-performance PyTorch backend**, but to serve as a **minimalist reference implementation for mechanism verification**.

### Purpose

- **Test Backend**: To serve as an in-tree test backend for PrivateUse1, ensuring quality stability through CI/CD.
- **Integration Example**: To serve as a reference example for new backend integration.
- **Integration Documentation**: To provide module-level integration documentation that corresponds with the code.

### Design Principles

- **Minimality Principle**: The fundamental goal is to enable/verify all integration paths/mechanisms for a new backend to integrate to PyTorch. All functions follow a "just right" strategy to ensure the correctness of relevant integration capabilities.
- **Authenticity Principle**: To complete the OpenReg integration in the same way a real accelerator backend would integrate with PyTorch.

## Directory Structure

```shell
torch_openreg/
├── CMakeLists.txt
├── csrc
│ ├── amp
│ │ └── autocast_mode.cpp
│ ├── aten
│ │ ├── native
│ │ │ ├── Extra.cpp
│ │ │ ├── Minimal.cpp
│ │ │ └── ...
│ │ ├── OpenRegExtra.cpp
│ │ └── OpenRegMinimal.cpp
│ ├── CMakeLists.txt
│ └── runtime
│ ├── OpenRegDeviceAllocator.cpp
│ ├── OpenRegDeviceAllocator.h
│ ├── OpenRegFunctions.cpp
│ ├── OpenRegFunctions.h
│ ├── OpenRegGenerator.cpp
│ ├── OpenRegGenerator.h
│ ├── OpenRegGuard.cpp
│ ├── OpenRegGuard.h
│ ├── OpenRegHooks.cpp
│ ├── OpenRegHooks.h
│ ├── OpenRegHostAllocator.cpp
│ ├── OpenRegHostAllocator.h
│ └── ...
├── pyproject.toml
├── README.md
├── setup.py
├── third_party
│ └── openreg
└── torch_openreg
├── csrc
│ ├── CMakeLists.txt
│ ├── Module.cpp
│ └── stub.c
├── __init__.py
└── openreg
├── amp
│ └── __init__.py
├── __init__.py
├── meta.py
└── random.py
```

**Dependencies**:

```mermaid
graph LR
A[Python]
B[_C.so]
C[libtorch_bindings.so]
D[libtorch_openreg.so]
E[libopenreg.so]

A --> B --> C --> D --> E
```

There are 4 DSOs in torch_openreg, and the dependencies between them are as follows:

- `_C.so`:
- **sources**: torch_openreg/csrc/stub.c
- **description**: Python C module entry point.
- `libtorch_bindings.so`: The bridging code between Python and C++ should go here.
- **sources**: torch_openreg/csrc
- **description**: A thin glue layer between Python and C++.
- `libtorch_openreg.so`: All core implementations should go here.
- **sources**: csrc
- **description**: All core functionality, such as device runtime, operators, etc.
- `libopenreg.so`: A DSO that uses the CPU to emulate a CUDA-like device, you can ignore it.
- **sources**: third_party/openreg
- **description**: Provides low-level device functionality similar to libcudart.so.

**Key Directories**:

- `csrc/`: Core device implementation, including operator registration, runtime, etc.
- `csrc/amp/`: AMP(Automatic Mixed Precision)
- `csrc/aten/`: Operator registration
- `csrc/aten/native/`: Specific operator implementations for the OpenReg device.
- `csrc/aten/native/OpenRegMinimal.cpp`: The most minimal set of operator implementations (allowing for the creation of Tensors and related operations upon completion).
- `csrc/aten/native/OpenRegExtra.cpp`: Implementations for other types of operators.
- `csrc/runtime/`: Implementations for Host memory, device memory, Guard, Hooks, etc.
- `third_party/`: A C++ library that simulates a CUDA-like device using the CPU.
- `torch_openreg/`: Python interface implementation (Python code and C++ Bindings).
- `torch_openreg/csrc/`: Python C++ binding code.
- `torch_openreg/openreg/`: Python API.

## Currently Implemented Features

### Operator Registration

- Operator Implementation

- Register for builtin PyTorch Operators
- `TORCH_LIBRARY_IMPL` form: See `empty.memory_format`
- `STUB` form: See `abs_stub`
- Register for custom operators
- Schema Registration: See `custom_abs`
- Kernel Registration: See `custom_abs`
- Fallback Registration for `AutogradPrivateUse1`: See `custom_abs`
- Meta Registration: See `custom_abs`
- `torch.autograd.Function`: See `custom_autograd_fn_aliasing`
- Register for fallback
- Per-operator Fallback: See `sub.Tensor`
- Global Fallback: See `wrapper_cpu_fallback`

### Autoload

When `import torch`, installed accelerators (such as `torch_openreg`) will be automatically loaded, achieving the same experience as the built-in backends.

- Register the backend with Python `entry points`: See `setup` in `setup.py`
- Add a callable function for backend initialization: See `_autoload` in `torch_openreg/__init__.py`
- Dynamically loading the backend without explicit imports: See [Usage Example](#usage-example)

### AMP(Automatic Mixed Precision)

`AMP` provides convenience methods for mixed precision, where some operations use the `torch.float32` datatype and other operations use `lower precision` floating point datatype: `torch.float16` or `torch.bfloat16`.

- Register specific operator conversion rules: See `autocast_mode.cpp` in `csrc/amp`.
- Add support for new data types for different accelerators: See `get_amp_supported_dtype` in `torch_openreg/openreg/amp/__init__.py`

## Installation and Usage

### Installation

```shell
python -m pip install --no-build-isolation -e . # for develop
python -m pip install --no-build-isolation . # for install
```

### Usage Example

After installation, you can use the `openreg` device in Python just like any other regular device.

```python
import torch

if not torch.openreg.is_available():
    print("OpenReg backend is not available in this build.")
    exit()

print("OpenReg backend is available!")

device = torch.device("openreg")

x = torch.tensor([[1., 2.], [3., 4.]], device=device)
y = x + 2
print("Result y:\n", y)
print(f"Device of y: {y.device}")

z = y.cpu()
print("Result z:\n", z)
print(f"Device of z: {z.device}")
```

## Documentation

Please refer to [this](https://docs.pytorch.org/docs/main/accelerator/index.html) for a series of documents on integrating new accelerators into PyTorch, which will be kept in sync with the `OpenReg` codebase as well.

## Future Plans

- **Enhance Features**:
- Device-agnostic APIs
- Memory Management
- Generator
- Distributed
- Custom Tensor&Storage
- ...
- **Improve Tests**: Add more test cases related to the integration mechanism.
20 changes: 20 additions & 0 deletions source/3rdparty/torch_openreg/cmake/TorchPythonTargets.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
if(WIN32)
set(TORCH_PYTHON_IMPORTED_LOCATION "${PYTORCH_INSTALL_DIR}/lib/${CMAKE_IMPORT_LIBRARY_PREFIX}torch_python${CMAKE_IMPORT_LIBRARY_SUFFIX}")
else()
set(TORCH_PYTHON_IMPORTED_LOCATION "${PYTORCH_INSTALL_DIR}/lib/${CMAKE_SHARED_LIBRARY_PREFIX}torch_python${CMAKE_SHARED_LIBRARY_SUFFIX}")
endif()

add_library(torch_python SHARED IMPORTED)

set_target_properties(torch_python PROPERTIES
INTERFACE_INCLUDE_DIRECTORIES "${PYTORCH_INSTALL_DIR}/include"
INTERFACE_LINK_LIBRARIES "c10;torch_cpu"
IMPORTED_LOCATION "${TORCH_PYTHON_IMPORTED_LOCATION}"
)

add_library(torch_python_library INTERFACE IMPORTED)

set_target_properties(torch_python_library PROPERTIES
INTERFACE_INCLUDE_DIRECTORIES "\$<TARGET_PROPERTY:torch_python,INTERFACE_INCLUDE_DIRECTORIES>"
INTERFACE_LINK_LIBRARIES "\$<TARGET_FILE:torch_python>;\$<TARGET_PROPERTY:torch_python,INTERFACE_LINK_LIBRARIES>"
)
16 changes: 16 additions & 0 deletions source/3rdparty/torch_openreg/csrc/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
set(LIBRARY_NAME torch_openreg)

file(GLOB_RECURSE SOURCE_FILES
"${CMAKE_CURRENT_SOURCE_DIR}/*.cpp"
)

add_library(${LIBRARY_NAME} SHARED ${SOURCE_FILES})

target_link_libraries(${LIBRARY_NAME} PRIVATE torch_cpu_library openreg)
target_include_directories(${LIBRARY_NAME} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})

install(TARGETS ${LIBRARY_NAME}
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
RUNTIME DESTINATION ${CMAKE_INSTALL_LIBDIR}
)
37 changes: 37 additions & 0 deletions source/3rdparty/torch_openreg/csrc/amp/autocast_mode.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#include <ATen/autocast_mode.h>

using at::Tensor;

Tensor binary_cross_entropy_banned(
const Tensor&,
const Tensor&,
const std::optional<Tensor>&,
int64_t) {
TORCH_CHECK(
false,
"torch.nn.functional.binary_cross_entropy and torch.nn.BCELoss are unsafe to autocast.\n"
"Many models use a sigmoid layer right before the binary cross entropy layer.\n"
"In this case, combine the two layers using torch.nn.functional.binary_cross_entropy_with_logits\n"
"or torch.nn.BCEWithLogitsLoss. binary_cross_entropy_with_logits and BCEWithLogits are\n"
"safe to autocast.");
}

// LITERALINCLUDE START: AMP FALLTHROUTH
TORCH_LIBRARY_IMPL(_, AutocastPrivateUse1, m) {
m.fallback(torch::CppFunction::makeFallthrough());
}
// LITERALINCLUDE END: AMP FALLTHROUTH

// LITERALINCLUDE START: AMP IMPL
TORCH_LIBRARY_IMPL(aten, AutocastPrivateUse1, m) {
// lower_precision_fp
KERNEL_PRIVATEUSEONE(mm, lower_precision_fp)

// fp32
KERNEL_PRIVATEUSEONE(asin, fp32)

m.impl(
TORCH_SELECTIVE_NAME("aten::binary_cross_entropy"),
TORCH_FN((&binary_cross_entropy_banned)));
}
// LITERALINCLUDE END: AMP IMPL
Loading
Loading