Skip to content

Commit a47bbf1

Browse files
authored
Develop (#74)
Add blis support for AMD cpus.
1 parent 4750f34 commit a47bbf1

File tree

11 files changed

+108
-172
lines changed

11 files changed

+108
-172
lines changed

3rd/CMakeLists.txt

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,8 +56,28 @@ SET_PROPERTY(TARGET cnpy PROPERTY IMPORTED_LOCATION ${CMAKE_CURRENT_BINARY_DIR}/
5656
target_include_directories(cnpy INTERFACE ${CMAKE_CURRENT_BINARY_DIR}/cnpy/include/)
5757
add_dependencies(cnpy extern_cnpy)
5858

59-
6059
# OpenBLAS backend: include the openblas.cmake and eigen.cmake build scripts.
# The variable expansion is quoted so that an empty BLAS_PROVIDER yields a
# plain "no match" instead of a malformed if() expression.
if ("${BLAS_PROVIDER}" STREQUAL "openblas")
  include(openblas.cmake)
  include(eigen.cmake)
endif ()
63+
64+
65+
# BLIS backend: fetch BLIS 0.7.0, build it in-tree with OpenMP threading and
# the CBLAS interface, and expose the result as the imported static library
# target `blis`. eigen.cmake is included as well, as for the OpenBLAS backend.
if ("${BLAS_PROVIDER}" STREQUAL "blis")
  message(STATUS "CMAKE_CURRENT_BINARY_DIR " ${CMAKE_CURRENT_BINARY_DIR})

  # Pick a bounded parallel job count. `nproc` is honoured when some other
  # part of the build defines it; otherwise the count is detected, because an
  # empty value would turn `make -j ${nproc}` into an unbounded `make -j`.
  if (DEFINED nproc AND NOT "${nproc}" STREQUAL "")
    set(BLIS_BUILD_JOBS "${nproc}")
  else ()
    include(ProcessorCount)
    ProcessorCount(BLIS_BUILD_JOBS)
    if (BLIS_BUILD_JOBS EQUAL 0)
      # ProcessorCount reports 0 when it cannot determine the core count.
      set(BLIS_BUILD_JOBS 1)
    endif ()
  endif ()

  # Source, build and install trees intentionally share one directory:
  # BLIS is configured and built in-tree and installed under the same prefix.
  ExternalProject_Add(extern_blis
      GIT_REPOSITORY https://github.com/flame/blis.git
      GIT_TAG 0.7.0
      BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/blis
      SOURCE_DIR ${CMAKE_CURRENT_BINARY_DIR}/blis
      INSTALL_DIR ${CMAKE_CURRENT_BINARY_DIR}/blis
      # Each step is a separate command instead of a shell `&&` chain, so the
      # sequence does not depend on how the generator invokes the shell.
      CONFIGURE_COMMAND ./configure --enable-threading=openmp --enable-cblas --prefix=${CMAKE_CURRENT_BINARY_DIR}/blis auto
      BUILD_COMMAND make -j ${BLIS_BUILD_JOBS}
      COMMAND make check -j ${BLIS_BUILD_JOBS}
      INSTALL_COMMAND make install
      BUILD_BYPRODUCTS ${CMAKE_CURRENT_BINARY_DIR}/blis/lib/libblis.a)

  # Create the include directory up front so the path referenced by the
  # imported target already exists before the external build has run.
  file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/blis/include/blis/)

  add_library(blis STATIC IMPORTED GLOBAL)
  set_property(TARGET blis PROPERTY IMPORTED_LOCATION ${CMAKE_CURRENT_BINARY_DIR}/blis/lib/libblis.a)
  target_include_directories(blis INTERFACE ${CMAKE_CURRENT_BINARY_DIR}/blis/include/blis/)
  # Make sure the external build has produced libblis.a before consumers link.
  add_dependencies(blis extern_blis)

  include(eigen.cmake)
endif ()

CMakeLists.txt

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -21,11 +21,11 @@ set(CMAKE_CXX_STANDARD 14)
2121
set(CMAKE_CXX_FLAGS "-Wall")
2222
set(CMAKE_C_FLAGS "-Wall")
2323

24-
set(TURBO_TRANSFORMERS_VERSION 0.2.0)
24+
set(TURBO_TRANSFORMERS_VERSION 0.2.1)
2525

2626
option(WITH_PROFILER "Compile with gperftools" OFF)
2727
option(WITH_GPU "Build with GPU" OFF)
28-
option(WITH_MODULE_BENCHMAKR "Build with GPU" ON)
28+
option(WITH_MODULE_BENCHMAKR "Catch2 unitest with benchmarking" ON)
2929

3030

3131
if (WITH_GPU)
@@ -42,14 +42,12 @@ if(WITH_GPU)
4242
endif()
4343

4444
set(MKLROOT "/opt/intel/mkl" CACHE PATH "The mkl library root")
45-
set(BLAS_PROVIDER "mkl" CACHE STRING "Set the blas provider library, in [openblas, mkl]")
45+
set(BLAS_PROVIDER "mkl" CACHE STRING "Set the blas provider library, in [openblas, mkl, blis]")
4646
if (${BLAS_PROVIDER} STREQUAL "mkl")
4747
find_package(MKL REQUIRED)
4848
endif()
49-
if (${CMAKE_CXX_COMPILER_ID} STREQUAL Intel)
50-
message(STATUS "Fast Transformer is built with a intel compiler!")
51-
add_definitions(-D__USE_INTEL_COMPILER__)
52-
endif ()
49+
50+
message(STATUS "Blas provider is ${BLAS_PROVIDER}")
5351

5452
add_subdirectory(3rd)
5553
include_directories(3rd/FP16/include)
@@ -65,7 +63,6 @@ else ()
6563
message(WARNING "OpenMP is not supported")
6664
endif ()
6765

68-
message(STATUS "Blas provider is ${BLAS_PROVIDER}")
6966

7067
if (WITH_PROFILER)
7168
find_package(Gperftools REQUIRED)

README.md

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
## turbo_transformers: a fast and user-friendly tool for transformer inference on CPU and GPU
2-
[Chinese Version](./README_cn.md)
32
![logo](./images/logo.jpeg)
43

54
### **make transformers serving fast by adding a turbo to your inference engine!**
@@ -43,14 +42,15 @@ Method 1:I want to unitest
4342
cd /workspace
4443
sh tools/build_and_run_unittests.sh $PWD -DWITH_GPU=OFF
4544
# you can switch between OpenBLAS, MKL and BLIS by modifying this line in CMakeLists.txt
46-
# set(BLAS_PROVIDER "mkl" CACHE STRING "Set the blas provider library, in [openblas, mkl]")
45+
# set(BLAS_PROVIDER "mkl" CACHE STRING "Set the blas provider library, in [openblas, mkl, blis]")
46+
4747
```
4848
Method 2: I do not want to run the unit tests
4949
```
5050
cd /workspace
5151
mkdir -p build && cd build
5252
cmake .. -DWITH_GPU=OFF
53-
make -j 4
53+
make -j 4
5454
pip install `find . -name *whl`
5555
```
5656
3. Run benchmark (optional) in docker, compare with pytorch, torch-JIT, onnxruntime
@@ -67,7 +67,7 @@ sh tool/build_conda_package.sh
6767

6868
*We also prepared a docker image containing CPU version of TurboTransformers, as well as other related works, i.e. onnxrt v1.2.0 and pytorch-jit on dockerhub*
6969
```
70-
*docker pull thufeifeibear/turbo_transformers:0.2.0-release-cpu-dev*
70+
docker pull thufeifeibear/turbo_transformers:0.2.0-release-cpu-dev
7171
```
7272
### Installation on GPU
7373
```
@@ -94,7 +94,7 @@ bash gpu_run_benchmark.sh
9494
```
9595
*We also prepared a docker image containing the GPU version of TurboTransformers.*
9696
```
97-
*docker pull thufeifeibear/turbo_transformers:0.2.0-cuda10.0-cudnn7-devel-ubuntu18.04-gpu-release*
97+
docker pull thufeifeibear/turbo_transformers:0.2.0-cuda10.0-cudnn7-devel-ubuntu18.04-gpu-release
9898
```
9999

100100
### Usage
@@ -170,4 +170,4 @@ weight = torch.clone(torch.t(pooler_params['dense.weight']))
170170

171171
## Contact us
172172
Although we recommend that you report problems through GitHub issues, you can also join our Turbo user group.
173-
Scan this [QR code](./images/namecode.pdf "qrcode") and our contactor as your WeChat friend.
173+
Scan this [QR code](./images/namecode.pdf "qrcode") and add our contact person as your WeChat friend.

README_cn.md

Lines changed: 0 additions & 146 deletions
This file was deleted.

turbo_transformers/core/CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,9 @@ if (${BLAS_PROVIDER} STREQUAL "mkl")
4141
elseif (${BLAS_PROVIDER} STREQUAL "openblas")
4242
target_link_libraries(tt_core PUBLIC OpenBlas::OpenBlas PUBLIC Eigen3::Eigen)
4343
target_compile_definitions(tt_core PUBLIC -DTT_BLAS_USE_OPENBLAS)
44+
elseif (${BLAS_PROVIDER} STREQUAL "blis")
45+
target_link_libraries(tt_core PUBLIC blis Eigen3::Eigen)
46+
target_compile_definitions(tt_core PUBLIC -DTT_BLAS_USE_BLIS)
4447
endif ()
4548

4649

turbo_transformers/core/blas.h

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,11 +20,22 @@ namespace turbo_transformers {
2020
using BlasInt = MKL_INT;
2121
}
2222

23-
#else
23+
#elif defined(TT_BLAS_USE_OPENBLAS) || defined(TT_BLAS_USE_BLIS)
2424
#include "cblas.h"
25+
#if defined(TT_BLAS_USE_OPENBLAS)
26+
2527
namespace turbo_transformers {
2628
using BlasInt = blasint;
2729
} // namespace turbo_transformers
30+
#elif defined(TT_BLAS_USE_BLIS)
31+
#include <unistd.h>
32+
33+
namespace turbo_transformers {
34+
using BlasInt = f77_int;
35+
} // namespace turbo_transformers
36+
37+
using blasint = turbo_transformers::BlasInt;
38+
#endif
2839

2940
extern "C" {
3041
void cblas_sgemm_batch(const CBLAS_ORDER Layout,
@@ -39,5 +50,5 @@ void cblas_sgemm_batch(const CBLAS_ORDER Layout,
3950
const blasint* group_size);
4051
void vsTanh(blasint N, const float* in, float* out);
4152
}
42-
53+
#else
4354
#endif
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
// Copyright (C) 2020 THL A29 Limited, a Tencent company.
2+
// All rights reserved.
3+
// Licensed under the BSD 3-Clause License (the "License"); you may
4+
// not use this file except in compliance with the License. You may
5+
// obtain a copy of the License at
6+
// https://opensource.org/licenses/BSD-3-Clause
7+
// Unless required by applicable law or agreed to in writing, software
8+
// distributed under the License is distributed on an "AS IS" basis,
9+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
10+
// implied. See the License for the specific language governing
11+
// permissions and limitations under the License.
12+
// See the AUTHORS file for names of contributors.
13+
#include "blas.h"
14+
#define EIGEN_DONT_PARALLELIZE
15+
#include "unsupported/Eigen/CXX11/Tensor"
16+
extern "C" {
17+
void cblas_sgemm_batch(const CBLAS_ORDER Layout,
18+
const CBLAS_TRANSPOSE* transa_array,
19+
const CBLAS_TRANSPOSE* transb_array,
20+
const blasint* m_array, const blasint* n_array,
21+
const blasint* k_array, const float* alpha_array,
22+
const float** a_array, const blasint* lda_array,
23+
const float** b_array, const blasint* ldb_array,
24+
const float* beta_array, float** c_array,
25+
const blasint* ldc_array, const blasint group_count,
26+
const blasint* group_size) {
27+
int idx = 0;
28+
for (int i = 0; i < group_count; ++i) {
29+
auto alpha = alpha_array[i];
30+
auto beta = beta_array[i];
31+
for (int j = 0; j < group_size[i]; ++j) {
32+
cblas_sgemm(Layout, transa_array[i], transb_array[i], m_array[i],
33+
n_array[i], k_array[i], alpha, a_array[idx], lda_array[i],
34+
b_array[idx], ldb_array[i], beta, c_array[idx], ldc_array[i]);
35+
++idx;
36+
}
37+
}
38+
}
39+
40+
using Vec = Eigen::TensorMap<Eigen::Tensor<float, 1>>;
41+
42+
// Element-wise tanh over N floats: out[i] = tanh(in[i]).
// Both buffers are wrapped as rank-1 Eigen tensor maps and the computation is
// delegated to Eigen's tanh expression.
void vsTanh(blasint N, const float* in, float* out) {
  Vec result(out, N);
  // The map is built from a non-const pointer, hence the const_cast; the
  // source buffer is only read by the expression below.
  Vec source(const_cast<float*>(in), N);

  // Evaluate tanh through Eigen (the original author notes that Eigen can
  // use `FAST_MATH` here).
  result = source.tanh();
}
50+
}

turbo_transformers/core/config.cpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,9 @@ void SetNumThreads(int n_th) {
2424
// The order seems important. Set MKL NUM_THREADS before OMP.
2525
#ifdef TT_BLAS_USE_MKL
2626
mkl_set_num_threads(n_th);
27-
#else
27+
#elif TT_BLAS_USE_OPENBLAS
2828
openblas_set_num_threads(n_th);
29+
#elif TT_BLAS_USE_BLIS
2930
#endif
3031
#ifdef _OPENMP
3132
omp_set_num_threads(n_th);
@@ -37,6 +38,8 @@ BlasProvider GetBlasProvider() {
3738
return BlasProvider::MKL;
3839
#elif defined(TT_BLAS_USE_OPENBLAS)
3940
return BlasProvider::OpenBlas;
41+
#elif defined(TT_BLAS_USE_BLIS)
42+
return BlasProvider::BLIS;
4043
#else
4144
#error "unexpected code";
4245
#endif

turbo_transformers/core/config.h

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,7 @@
1414
#pragma once
1515
namespace turbo_transformers {
1616
namespace core {
17-
enum class BlasProvider {
18-
MKL,
19-
OpenBlas,
20-
};
17+
// BLAS backends the library can be built against.
enum class BlasProvider {
  MKL,
  OpenBlas,
  BLIS,
};
2118

2219
BlasProvider GetBlasProvider();
2320

turbo_transformers/loaders/CMakeLists.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,6 @@
1212
# See the AUTHORS file for names of contributors.
1313
add_library(tt_npz_loader npz_load.cpp)
1414
target_link_libraries(tt_npz_loader
15-
PUBLIC dlpack cnpy
16-
PRIVATE tt_core zlib
15+
PUBLIC dlpack cnpy zlib
16+
PRIVATE tt_core
1717
)

0 commit comments

Comments
 (0)