Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 0 additions & 4 deletions projects/rocprim/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,6 @@ Full documentation for rocPRIM is available at [https://rocm.docs.amd.com/projec
* Removed unused `equality`, `inequality`, `sum`, `max`, `min` from thread_operator.hpp.
* Removed duplicate `inequality_operator` from binary_op_wrappers.hpp.

### Known issues

* benchmark_warp_sort may hang on Navi GPUs on Windows when running logical warp sizes > hardware warp size.

## rocPRIM 4.2.0 for ROCm 7.2

### Added
Expand Down
41 changes: 14 additions & 27 deletions projects/rocprim/benchmark/benchmark_warp_sort.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,18 @@

#include "primbench.hpp"

#define CREATE_SORT_BENCHMARK(K, BS, WS, IPT) executor.queue<warp_sort_benchmark<K, BS, WS, IPT>>();
// Queues a keys-only warp_sort_benchmark<K, BS, WS, IPT> on `executor`, but only
// when is_warp_size_supported(WS, device_id) reports true at run time.
// Both `executor` and `device_id` must be in scope at the expansion site.
// The expansion is a complete braced `if` statement, so invocations can be
// chained back to back without separating semicolons.
#define CREATE_SORT_BENCHMARK(K, BS, WS, IPT) \
if(is_warp_size_supported(WS, device_id)) \
{ \
executor.queue<warp_sort_benchmark<K, BS, WS, IPT>>(); \
}

#define CREATE_SORTBYKEY_BENCHMARK(K, V, BS, WS, IPT) \
executor.queue<warp_sort_benchmark<K, BS, WS, IPT, V>>();
// Queues a warp_sort_benchmark<K, BS, WS, IPT, V> on `executor` (V is passed as
// the extra trailing template argument), but only when
// is_warp_size_supported(WS, device_id) reports true at run time.
// Both `executor` and `device_id` must be in scope at the expansion site.
// The expansion is a complete braced `if` statement, so invocations can be
// chained back to back without separating semicolons.
#define CREATE_SORTBYKEY_BENCHMARK(K, V, BS, WS, IPT) \
if(is_warp_size_supported(WS, device_id)) \
{ \
executor.queue<warp_sort_benchmark<K, BS, WS, IPT, V>>(); \
}

// clang-format off
#ifndef ROCPRIM_NAVI
#define BENCHMARK_TYPE(type) \
CREATE_SORT_BENCHMARK(type, 64, 64, 1) \
CREATE_SORT_BENCHMARK(type, 64, 64, 2) \
Expand All @@ -46,35 +51,14 @@
CREATE_SORT_BENCHMARK(type, 64, 16, 1) \
CREATE_SORT_BENCHMARK(type, 64, 16, 2) \
CREATE_SORT_BENCHMARK(type, 64, 16, 4)
#elif defined(_WIN32) // Currently, running a logical warpSize > hardware warpSize on Windows on Navi cards leads to a hang.
#define BENCHMARK_TYPE(type) \
CREATE_SORT_BENCHMARK(type, 64, 32, 1) \
CREATE_SORT_BENCHMARK(type, 64, 32, 1) \
CREATE_SORT_BENCHMARK(type, 64, 16, 2) \
CREATE_SORT_BENCHMARK(type, 64, 16, 2) \
CREATE_SORT_BENCHMARK(type, 64, 16, 4)
#endif
// clang-format on

// clang-format off
#ifndef ROCPRIM_NAVI
#define BENCHMARK_KEY_TYPE(type, value) \
CREATE_SORTBYKEY_BENCHMARK(type, value, 64, 64, 1) \
CREATE_SORTBYKEY_BENCHMARK(type, value, 64, 64, 2) \
CREATE_SORTBYKEY_BENCHMARK(type, value, 64, 64, 4) \
CREATE_SORTBYKEY_BENCHMARK(type, value, 256, 64, 1) \
CREATE_SORTBYKEY_BENCHMARK(type, value, 256, 64, 2) \
CREATE_SORTBYKEY_BENCHMARK(type, value, 256, 64, 4)
#elif defined(_WIN32) // Currently, running a logical warpSize > hardware warpSize on Windows on Navi cards leads to a hang.
#define BENCHMARK_KEY_TYPE(type, value) \
CREATE_SORTBYKEY_BENCHMARK(type, value, 64, 32, 1) \
CREATE_SORTBYKEY_BENCHMARK(type, value, 64, 32, 2) \
CREATE_SORTBYKEY_BENCHMARK(type, value, 64, 32, 4) \
CREATE_SORTBYKEY_BENCHMARK(type, value, 256, 32, 1) \
CREATE_SORTBYKEY_BENCHMARK(type, value, 256, 32, 2) \
CREATE_SORTBYKEY_BENCHMARK(type, value, 256, 32, 4)
#endif
// clang-format on
CREATE_SORTBYKEY_BENCHMARK(type, value, 256, 64, 4)

int main(int argc, char* argv[])
{
Expand All @@ -83,6 +67,9 @@ int main(int argc, char* argv[])
settings.noise_tolerance_percent = 2;
primbench::executor executor(argc, argv, settings);

int device_id;
HIP_CHECK(hipGetDevice(&device_id));

BENCHMARK_TYPE(int32_t)
BENCHMARK_TYPE(float)
BENCHMARK_TYPE(double)
Expand Down
Loading