Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .github/workflows/test_python.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ jobs:
export PYTORCH_ROOT=$(python -c 'import torch;print(torch.__path__[0])')
source/install/uv_with_retry.sh pip install --system -e .[test,jax] mpi4py --group pin_jax
source/install/uv_with_retry.sh pip install --system --find-links "https://www.paddlepaddle.org.cn/packages/nightly/cpu/paddlepaddle/" --index-url https://pypi.org/simple paddlepaddle==3.3.0.dev20251204
source/install/uv_with_retry.sh pip install --system --no-build-isolation source/3rdparty/torch_openreg
env:
# Please note that uv has some issues with finding
# existing TensorFlow package. Currently, it uses
Expand Down Expand Up @@ -62,6 +63,7 @@ jobs:
NUM_WORKERS: 0
DP_CI_IMPORT_PADDLE_BEFORE_TF: 1
FLAGS_use_stride_compute_kernel: 0
DEVICE: openreg
- name: Test TF2 eager mode
run: pytest --cov=deepmd --cov-append source/tests/consistent/io/test_io.py source/jax2tf_tests
env:
Expand Down
1 change: 1 addition & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# See https://pre-commit.com for more information
# See https://pre-commit.com/hooks.html for more hooks
exclude: ^source/3rdparty/.+/
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v6.0.0
Expand Down
6 changes: 5 additions & 1 deletion deepmd/pt/utils/env.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,11 @@
# Rank of this process on the local node (0 when not launched via torchrun,
# which is what sets LOCAL_RANK in the environment).
LOCAL_RANK = os.environ.get("LOCAL_RANK")
LOCAL_RANK = int(0 if LOCAL_RANK is None else LOCAL_RANK)

# Resolve the compute device. The DEVICE environment variable can force a
# backend: "openreg" selects PyTorch's PrivateUse1 test backend (used in CI),
# "cpu" forces CPU; otherwise CUDA is used when available, else CPU.
_requested_device = os.environ.get("DEVICE")
if _requested_device == "openreg":
    # Fail fast with a clear message instead of a cryptic attribute/runtime
    # error later when tensors are first placed on the device.
    if not torch.openreg.is_available():
        raise RuntimeError("OpenReg backend is not available in this build.")
    DEVICE = torch.device("openreg")
elif _requested_device == "cpu" or torch.cuda.is_available() is False:
    DEVICE = torch.device("cpu")
else:
    DEVICE = torch.device(f"cuda:{LOCAL_RANK}")
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ pin_tensorflow_gpu = [
pin_pytorch_cpu = [
# https://github.com/pytorch/pytorch/issues/114602
# macos x86 has been deprecated
"torch>=2.8,<2.10; platform_machine!='x86_64' or platform_system != 'Darwin'",
"torch==2.10.0; platform_machine!='x86_64' or platform_system != 'Darwin'",
"torch; platform_machine=='x86_64' and platform_system == 'Darwin'",
]
pin_pytorch_gpu = [
Expand Down
1 change: 1 addition & 0 deletions source/3rdparty/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@
| json | https://github.com/nlohmann/json | 3.9.1 | MIT |
| Implib.so | https://github.com/yugr/Implib.so | 0ddaa71 | MIT |
| coverage_plugins | https://github.com/pytorch/pytorch | 2.2.0 | BSD-3 |
| torch_openreg | https://github.com/pytorch/pytorch | 2.10.0 | BSD-3 |
2 changes: 2 additions & 0 deletions source/3rdparty/torch_openreg/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
*.egg-info/
build/
44 changes: 44 additions & 0 deletions source/3rdparty/torch_openreg/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
cmake_minimum_required(VERSION 3.18 FATAL_ERROR)

project(TORCH_OPENREG CXX C)

include(GNUInstallDirs)
include(CheckCXXCompilerFlag)

# C++17 / C11 without compiler extensions, matching PyTorch's own build.
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_C_STANDARD 11)
set(CMAKE_CXX_EXTENSIONS OFF)

set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
set(CMAKE_SKIP_BUILD_RPATH FALSE)
set(CMAKE_BUILD_WITH_INSTALL_RPATH TRUE)
set(CMAKE_INSTALL_RPATH_USE_LINK_PATH FALSE)
set(CMAKE_CXX_VISIBILITY_PRESET hidden)

# Default to Release, but only when the user has not already chosen a build
# type, and only for single-config generators (multi-config generators such
# as Visual Studio and Xcode ignore CMAKE_BUILD_TYPE entirely). An
# unconditional FORCE here would stomp any user-provided value.
get_property(_torch_openreg_is_multi_config GLOBAL
             PROPERTY GENERATOR_IS_MULTI_CONFIG)
if(NOT _torch_openreg_is_multi_config AND NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
endif()

# Make the installed extension modules find their sibling DSOs next to
# themselves (lib/ subdirectory first, then the module's own directory).
if(APPLE)
  set(CMAKE_INSTALL_RPATH "@loader_path/lib;@loader_path")
elseif(UNIX)
  set(CMAKE_INSTALL_RPATH "$ORIGIN/lib:$ORIGIN")
elseif(WIN32)
  set(CMAKE_INSTALL_RPATH "")
endif()
set(CMAKE_INSTALL_LIBDIR lib)
set(CMAKE_INSTALL_MESSAGE NEVER)

# PYTORCH_INSTALL_DIR is expected to be passed in by the Python build driver
# (setup.py); locate the Torch CMake package relative to it.
set(Torch_DIR ${PYTORCH_INSTALL_DIR}/share/cmake/Torch)
find_package(Torch REQUIRED)

if(DEFINED PYTHON_INCLUDE_DIR)
  include_directories(${PYTHON_INCLUDE_DIR})
else()
  message(FATAL_ERROR "Cannot find Python directory")
endif()

include_directories(${CMAKE_CURRENT_SOURCE_DIR})
include(${PROJECT_SOURCE_DIR}/cmake/TorchPythonTargets.cmake)

add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/openreg)
add_subdirectory(${PROJECT_SOURCE_DIR}/csrc)
add_subdirectory(${PROJECT_SOURCE_DIR}/torch_openreg/csrc)
194 changes: 194 additions & 0 deletions source/3rdparty/torch_openreg/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,194 @@
# PyTorch OpenReg

## Background

The third-party device integration mechanism based on PrivateUse1 has become the official mainstream method for new backends to integrate with PyTorch. Ensuring the availability of this mechanism is crucial for enriching PyTorch's hardware ecosystem.

**Note:**

The goal of `torch_openreg` is **not to implement a fully functional, high-performance PyTorch backend**, but to serve as a **minimalist reference implementation for mechanism verification**.

### Purpose

- **Test Backend**: To serve as an in-tree test backend for PrivateUse1, ensuring quality stability through CI/CD.
- **Integration Example**: To serve as a reference example for new backend integration.
- **Integration Documentation**: To provide module-level integration documentation that corresponds with the code.

### Design Principles

- **Minimality Principle**: The fundamental goal is to enable/verify all integration paths/mechanisms for a new backend to integrate to PyTorch. All functions follow a "just right" strategy to ensure the correctness of relevant integration capabilities.
- **Authenticity Principle**: To complete the OpenReg integration in the same way a real accelerator backend would integrate with PyTorch.

## Directory Structure

```shell
torch_openreg/
├── CMakeLists.txt
├── csrc
│ ├── amp
│ │ └── autocast_mode.cpp
│ ├── aten
│ │ ├── native
│ │ │ ├── Extra.cpp
│ │ │ ├── Minimal.cpp
│ │ │ └── ...
│ │ ├── OpenRegExtra.cpp
│ │ └── OpenRegMinimal.cpp
│ ├── CMakeLists.txt
│ └── runtime
│ ├── OpenRegDeviceAllocator.cpp
│ ├── OpenRegDeviceAllocator.h
│ ├── OpenRegFunctions.cpp
│ ├── OpenRegFunctions.h
│ ├── OpenRegGenerator.cpp
│ ├── OpenRegGenerator.h
│ ├── OpenRegGuard.cpp
│ ├── OpenRegGuard.h
│ ├── OpenRegHooks.cpp
│ ├── OpenRegHooks.h
│ ├── OpenRegHostAllocator.cpp
│ ├── OpenRegHostAllocator.h
│ └── ...
├── pyproject.toml
├── README.md
├── setup.py
├── third_party
│ └── openreg
└── torch_openreg
├── csrc
│ ├── CMakeLists.txt
│ ├── Module.cpp
│ └── stub.c
├── __init__.py
└── openreg
├── amp
│ └── __init__.py
├── __init__.py
├── meta.py
└── random.py
```

**Dependencies**:

```mermaid
graph LR
A[Python]
B[_C.so]
C[libtorch_bindings.so]
D[libtorch_openreg.so]
E[libopenreg.so]

A --> B --> C --> D --> E
```

There are 4 DSOs in torch_openreg, and the dependencies between them are as follows:

- `_C.so`:
- **sources**: torch_openreg/csrc/stub.c
- **description**: Python C module entry point.
- `libtorch_bindings.so`: The bridging code between Python and C++ should go here.
- **sources**: torch_openreg/csrc
- **description**: A thin glue layer between Python and C++.
- `libtorch_openreg.so`: All core implementations should go here.
- **sources**: csrc
- **description**: All core functionality, such as device runtime, operators, etc.
- `libopenreg.so`: A DSO that uses the CPU to emulate a CUDA-like device, you can ignore it.
- **sources**: third_party/openreg
- **description**: Provides low-level device functionality similar to libcudart.so.

**Key Directories**:

- `csrc/`: Core device implementation, including operator registration, runtime, etc.
- `csrc/amp/`: AMP(Automatic Mixed Precision)
- `csrc/aten/`: Operator registration
- `csrc/aten/native/`: Specific operator implementations for the OpenReg device.
- `csrc/aten/native/OpenRegMinimal.cpp`: The most minimal set of operator implementations (allowing for the creation of Tensors and related operations upon completion).
- `csrc/aten/native/OpenRegExtra.cpp`: Implementations for other types of operators.
- `csrc/runtime/`: Implementations for Host memory, device memory, Guard, Hooks, etc.
- `third_party/`: A C++ library that simulates a CUDA-like device using the CPU.
- `torch_openreg/`: Python interface implementation (Python code and C++ Bindings).
- `torch_openreg/csrc/`: Python C++ binding code.
- `torch_openreg/openreg/`: Python API.

## Currently Implemented Features

### Operator Registration

- Operator Implementation

- Register for builtin PyTorch Operators
- `TORCH_LIBRARY_IMPL` form: See `empty.memory_format`
- `STUB` form: See `abs_stub`
- Register for custom operators
- Schema Registration: See `custom_abs`
- Kernel Registration: See `custom_abs`
- Fallback Registration for `AutogradPrivateUse1`: See `custom_abs`
- Meta Registration: See `custom_abs`
- `torch.autograd.Function`: See `custom_autograd_fn_aliasing`
- Register for fallback
- Per-operator Fallback: See `sub.Tensor`
- Global Fallback: See `wrapper_cpu_fallback`

### Autoload

When `import torch`, installed accelerators (such as `torch_openreg`) will be automatically loaded, achieving the same experience as the built-in backends.

- Register the backend with Python `entry points`: See `setup` in `setup.py`
- Add a callable function for backend initialization: See `_autoload` in `torch_openreg/__init__.py`
- Dynamically loading the backend without explicit imports: See [Usage Example](#usage-example)

### AMP(Automatic Mixed Precision)

`AMP` provides convenience methods for mixed precision, where some operations use the `torch.float32` datatype and other operations use `lower precision` floating point datatype: `torch.float16` or `torch.bfloat16`.

- Register specific operator conversion rules: See `autocast_mode.cpp` in `csrc/amp`.
- Add support for new data types for different accelerators: See `get_amp_supported_dtype` in `torch_openreg/openreg/amp/__init__.py`

## Installation and Usage

### Installation

```shell
python -m pip install --no-build-isolation -e . # for develop
python -m pip install --no-build-isolation . # for install
```

### Usage Example

After installation, you can use the `openreg` device in Python just like any other regular device.

```python
import torch

if not torch.openreg.is_available():
    print("OpenReg backend is not available in this build.")
    exit()

print("OpenReg backend is available!")

device = torch.device("openreg")

x = torch.tensor([[1., 2.], [3., 4.]], device=device)
y = x + 2
print("Result y:\n", y)
print(f"Device of y: {y.device}")

z = y.cpu()
print("Result z:\n", z)
print(f"Device of z: {z.device}")
```

## Documentation

Please refer to [this](https://docs.pytorch.org/docs/main/accelerator/index.html) for a series of documents on integrating new accelerators into PyTorch, which will be kept in sync with the `OpenReg` codebase as well.

## Future Plans

- **Enhance Features**:
- Device-agnostic APIs
- Memory Management
- Generator
- Distributed
- Custom Tensor&Storage
- ...
- **Improve Tests**: Add more test cases related to the integration mechanism.
20 changes: 20 additions & 0 deletions source/3rdparty/torch_openreg/cmake/TorchPythonTargets.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
if(WIN32)
set(TORCH_PYTHON_IMPORTED_LOCATION "${PYTORCH_INSTALL_DIR}/lib/${CMAKE_IMPORT_LIBRARY_PREFIX}torch_python${CMAKE_IMPORT_LIBRARY_SUFFIX}")
else()
set(TORCH_PYTHON_IMPORTED_LOCATION "${PYTORCH_INSTALL_DIR}/lib/${CMAKE_SHARED_LIBRARY_PREFIX}torch_python${CMAKE_SHARED_LIBRARY_SUFFIX}")
endif()

add_library(torch_python SHARED IMPORTED)

set_target_properties(torch_python PROPERTIES
INTERFACE_INCLUDE_DIRECTORIES "${PYTORCH_INSTALL_DIR}/include"
INTERFACE_LINK_LIBRARIES "c10;torch_cpu"
IMPORTED_LOCATION "${TORCH_PYTHON_IMPORTED_LOCATION}"
)

add_library(torch_python_library INTERFACE IMPORTED)

set_target_properties(torch_python_library PROPERTIES
INTERFACE_INCLUDE_DIRECTORIES "\$<TARGET_PROPERTY:torch_python,INTERFACE_INCLUDE_DIRECTORIES>"
INTERFACE_LINK_LIBRARIES "\$<TARGET_FILE:torch_python>;\$<TARGET_PROPERTY:torch_python,INTERFACE_LINK_LIBRARIES>"
)
16 changes: 16 additions & 0 deletions source/3rdparty/torch_openreg/csrc/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
set(LIBRARY_NAME torch_openreg)

file(GLOB_RECURSE SOURCE_FILES
"${CMAKE_CURRENT_SOURCE_DIR}/*.cpp"
)

add_library(${LIBRARY_NAME} SHARED ${SOURCE_FILES})

target_link_libraries(${LIBRARY_NAME} PRIVATE torch_cpu_library openreg)
target_include_directories(${LIBRARY_NAME} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})

install(TARGETS ${LIBRARY_NAME}
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
RUNTIME DESTINATION ${CMAKE_INSTALL_LIBDIR}
)
37 changes: 37 additions & 0 deletions source/3rdparty/torch_openreg/csrc/amp/autocast_mode.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#include <ATen/autocast_mode.h>

using at::Tensor;

Tensor binary_cross_entropy_banned(
const Tensor&,
const Tensor&,
const std::optional<Tensor>&,
int64_t) {
TORCH_CHECK(
false,
"torch.nn.functional.binary_cross_entropy and torch.nn.BCELoss are unsafe to autocast.\n"
"Many models use a sigmoid layer right before the binary cross entropy layer.\n"
"In this case, combine the two layers using torch.nn.functional.binary_cross_entropy_with_logits\n"
"or torch.nn.BCEWithLogitsLoss. binary_cross_entropy_with_logits and BCEWithLogits are\n"
"safe to autocast.");
}

// LITERALINCLUDE START: AMP FALLTHROUTH
TORCH_LIBRARY_IMPL(_, AutocastPrivateUse1, m) {
m.fallback(torch::CppFunction::makeFallthrough());
}
// LITERALINCLUDE END: AMP FALLTHROUTH

// LITERALINCLUDE START: AMP IMPL
TORCH_LIBRARY_IMPL(aten, AutocastPrivateUse1, m) {
// lower_precision_fp
KERNEL_PRIVATEUSEONE(mm, lower_precision_fp)

// fp32
KERNEL_PRIVATEUSEONE(asin, fp32)

m.impl(
TORCH_SELECTIVE_NAME("aten::binary_cross_entropy"),
TORCH_FN((&binary_cross_entropy_banned)));
}
// LITERALINCLUDE END: AMP IMPL
Loading
Loading