diff --git a/projects/miopen/src/CMakeLists.txt b/projects/miopen/src/CMakeLists.txt index b34e5ddf2325..fa8383e82c05 100644 --- a/projects/miopen/src/CMakeLists.txt +++ b/projects/miopen/src/CMakeLists.txt @@ -574,8 +574,8 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN kernels/MIOpenGetitem.cpp kernels/MIOpenKthvalue.cpp kernels/MIOpenLayerNorm.cpp - kernels/MIOpenLRNBwd.cl - kernels/MIOpenLRNFwd.cl + kernels/MIOpenLRNBwd.cpp + kernels/MIOpenLRNFwd.cpp kernels/MIOpenMultiMarginLoss.cpp kernels/MIOpenNeuron.cpp kernels/MIOpenPReLU.cpp diff --git a/projects/miopen/src/include/miopen/mlo_internal.hpp b/projects/miopen/src/include/miopen/mlo_internal.hpp index 89988f80a877..62ccab567922 100644 --- a/projects/miopen/src/include/miopen/mlo_internal.hpp +++ b/projects/miopen/src/include/miopen/mlo_internal.hpp @@ -557,8 +557,8 @@ struct mlo_construct_norm : mlo_construct_activ_lrn_pooling_common void mloConstruct(); protected: - int mloConstructFwd(); - int mloConstructBwd(); + void mloConstructFwd(); + void mloConstructBwd(); int _norm_region = 0; int _norm_area = 0; double _normAlpha = 0.0; diff --git a/projects/miopen/src/kernels/MIOpenLRNBwd.cl b/projects/miopen/src/kernels/MIOpenLRNBwd.cl deleted file mode 100644 index 4e60bf133a66..000000000000 --- a/projects/miopen/src/kernels/MIOpenLRNBwd.cl +++ /dev/null @@ -1,472 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2017 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -#define PPCAT_NX(A, B) A##B -#define PPCAT(A, B) PPCAT_NX(A, B) -#define TWO 2 -#define THREE 3 -#define FOUR 4 -#define EIGHT 8 - -#define DBG_RANGE 0 -#if MIOPEN_USE_FP16 == 1 -#pragma OPENCL EXTENSION cl_khr_fp16 : enable -#define _FLOAT half -#endif -#if MIOPEN_USE_FP32 == 1 -#define _FLOAT float -#endif - -#define _FLOAT2 PPCAT(_FLOAT, TWO) -#define _FLOAT3 PPCAT(_FLOAT, THREE) -#define _FLOAT4 PPCAT(_FLOAT, FOUR) -#define _FLOAT8 PPCAT(_FLOAT, EIGHT) - -#define UNUSED __attribute__((__unused__)) - -#define MLO_LRN_GROUP_SZ2 1 -#define MLO_LRN_STRIDE 1 - -#define MLO_LRN_LCL_DATA_WIDTH (MLO_LRN_GROUP_SZ0 * MLO_LRN_N_HORIZ_OUT_PIX + MLO_LRN_KERNEL_SZ - 1) -#define MLO_LRN_LCL_DATA_HEIGHT (MLO_LRN_GROUP_SZ1 * MLO_LRN_N_VERT_OUT_PIX + MLO_LRN_KERNEL_SZ - 1) -#define MLO_LRN_GROUP_SZ (MLO_LRN_GROUP_SZ2 * MLO_LRN_GROUP_SZ1 * MLO_LRN_GROUP_SZ0) -// #define MLO_LRN_PREPAD_SZ (MLO_LRN_KERNEL_SZ - 1)/2 - -struct LRNForwardParam -{ - _FLOAT alphaoverarea; - _FLOAT alpha; - _FLOAT beta; - _FLOAT K; -}; - -struct LRNBackwardParam -{ - _FLOAT ratio; - _FLOAT alpha; - _FLOAT beta; -}; - -/* - -This is a naive implementation. -The "sliding window" -based implementation is in MIOpenLRNFwd.cl file - -*/ - -__attribute__((reqd_work_group_size(MLO_LRN_GROUP_SZ0, MLO_LRN_GROUP_SZ1, MLO_LRN_GROUP_SZ2))) -__kernel void -MIOpenLRNWithinChannelBwd(const __global _FLOAT* top, - const __global _FLOAT* bot, - const __global _FLOAT* top_df, - const __global _FLOAT* scale, - __global _FLOAT* bot_df, - UNUSED _FLOAT ratio, - _FLOAT alpha, - _FLOAT beta) -{ - __local _FLOAT top_df_data[MLO_LRN_LCL_DATA_WIDTH * MLO_LRN_LCL_DATA_HEIGHT]; - __local _FLOAT ratio_data[MLO_LRN_LCL_DATA_WIDTH * MLO_LRN_LCL_DATA_HEIGHT]; - int x = get_group_id(0) * MLO_LRN_GROUP_SZ0 * MLO_LRN_N_HORIZ_OUT_PIX; - int y = get_group_id(1) * MLO_LRN_GROUP_SZ1 * MLO_LRN_N_VERT_OUT_PIX; - int lcl_id0 = get_local_id(0); - int lcl_id1 = get_local_id(1); - int ob = get_global_id(2); // output * batch_sz - int o = ob / MLO_LRN_BATCH_SZ; - int b = ob - o * MLO_LRN_BATCH_SZ; - int top_x = x; - int top_y = y; - int top_df_off = b * MLO_LRN_TOPDF_BATCH_STRIDE + o * MLO_LRN_TOPDF_CHANNEL_STRIDE; - int scale_off = b * MLO_LRN_SCALE_BATCH_STRIDE + o * MLO_LRN_SCALE_CHANNEL_STRIDE; - int bot_x = x + lcl_id0 * MLO_LRN_N_HORIZ_OUT_PIX; - int bot_y = y + lcl_id1 * MLO_LRN_N_VERT_OUT_PIX; - - _FLOAT prv_exp_scale[MLO_LRN_N_VERT_OUT_PIX][MLO_LRN_N_HORIZ_OUT_PIX]; - // _FLOAT prv_top_df[MLO_LRN_N_VERT_OUT_PIX][MLO_LRN_N_HORIZ_OUT_PIX]; - - // load top_diff and scale tiles - for(int b_j = lcl_id1; b_j < MLO_LRN_LCL_DATA_HEIGHT; b_j += MLO_LRN_GROUP_SZ1) - { - int top_y_act = top_y + b_j - MLO_LRN_PAD; - - bool invisibleY = (top_y_act < 0) || (top_y_act >= MLO_LRN_TOP_HEIGHT); - - top_y_act = (invisibleY) ? 0 : top_y_act; - - int top_df_y_off = top_y_act * MLO_LRN_TOPDF_STRIDE; - int scale_y_off = top_y_act * MLO_LRN_SCALE_STRIDE; - - int lcl_off_v = b_j * MLO_LRN_LCL_DATA_WIDTH; - - for(int b_i = lcl_id0; b_i < MLO_LRN_LCL_DATA_WIDTH; b_i += MLO_LRN_GROUP_SZ0) - { - - int top_x_act = top_x + b_i - MLO_LRN_PAD; - - bool invisibleX = (top_x_act < 0) || (top_x_act >= MLO_LRN_TOP_WIDTH); - - top_x_act = (invisibleX) ? 0 : top_x_act; -#if DBG_RANGE - if(top_df_off + top_df_y_off + top_x_act >= - MLO_LRN_BATCH_SZ * MLO_LRN_TOPDF_BATCH_STRIDE) - { - printf("K:err:topdf-off_range\n"); - } -#endif - _FLOAT top_df_val = top_df[top_df_off + top_df_y_off + top_x_act]; - _FLOAT scale_val = scale[scale_off + scale_y_off + top_x_act]; - - top_df_val = (invisibleX || invisibleY) ? 0 : top_df_val; - scale_val = (invisibleX || invisibleY) ? (_FLOAT)1.f : scale_val; - - top_df_data[lcl_off_v + b_i] = top_df_val; - ratio_data[lcl_off_v + b_i] = scale_val; - } - } - - barrier(CLK_LOCAL_MEM_FENCE); - - // actual top_diffs and scales - for(int j = 0; j < MLO_LRN_N_VERT_OUT_PIX; ++j) - { - int lcl_off_v = - (lcl_id1 * MLO_LRN_N_VERT_OUT_PIX + MLO_LRN_PAD + j) * MLO_LRN_LCL_DATA_WIDTH; - for(int i = 0; i < MLO_LRN_N_HORIZ_OUT_PIX; i++) - { - _FLOAT scale_ratio = - ratio_data[lcl_off_v + lcl_id0 * MLO_LRN_N_HORIZ_OUT_PIX + MLO_LRN_PAD + i]; - prv_exp_scale[j][i] = exp(-beta * log(scale_ratio)); - } - } - - barrier(CLK_LOCAL_MEM_FENCE); - // read top and load ratio tile - int top_off = b * MLO_LRN_TOP_BATCH_STRIDE + o * MLO_LRN_TOP_CHANNEL_STRIDE; - for(int b_j = lcl_id1; b_j < MLO_LRN_LCL_DATA_HEIGHT; b_j += MLO_LRN_GROUP_SZ1) - { - int top_y_act = top_y + b_j - MLO_LRN_PAD; - - bool invisibleY = (top_y_act < 0) || (top_y_act >= MLO_LRN_TOP_HEIGHT); - - top_y_act = (invisibleY) ? 0 : top_y_act; - - int top_y_off = top_y_act * MLO_LRN_TOP_STRIDE; - - int lcl_off_v = b_j * MLO_LRN_LCL_DATA_WIDTH; - - for(int b_i = lcl_id0; b_i < MLO_LRN_LCL_DATA_WIDTH; b_i += MLO_LRN_GROUP_SZ0) - { - - int top_x_act = top_x + b_i - MLO_LRN_PAD; - - bool invisibleX = (top_x_act < 0) || (top_x_act >= MLO_LRN_TOP_WIDTH); - - top_x_act = (invisibleX) ? 0 : top_x_act; -#if DBG_RANGE - - if(top_off + top_y_off + top_x_act >= MLO_LRN_BATCH_SZ * MLO_LRN_TOP_BATCH_STRIDE) - { - printf("K:err:top-off_range\n"); - } -#endif - - _FLOAT top_val = top[top_off + top_y_off + top_x_act]; - - top_val = (invisibleX || invisibleY) ? 0 : top_val; - - _FLOAT top_df_val = top_df_data[lcl_off_v + b_i]; - - _FLOAT scale_val = ratio_data[lcl_off_v + b_i]; - - // scale val is not 0 - _FLOAT ratio_dta = (top_df_val * top_val) / scale_val; - // replacing scale with ratio - ratio_data[lcl_off_v + b_i] = ratio_dta; - } - } - - barrier(CLK_LOCAL_MEM_FENCE); - - // caculate bot diff - _FLOAT prv_bot_diff[MLO_LRN_N_VERT_OUT_PIX][MLO_LRN_N_HORIZ_OUT_PIX]; - - for(int j = 0; j < MLO_LRN_N_VERT_OUT_PIX; ++j) - { - int v_off_v = (lcl_id1 * MLO_LRN_N_VERT_OUT_PIX + j); - int hstart = y + v_off_v - MLO_LRN_PAD; - int hend = min(hstart + MLO_LRN_KERNEL_SZ, MLO_LRN_TOP_HEIGHT + MLO_LRN_PRE_PAD); - - // accum offset, vertical - // int lcl_a_off_v = v_off_v * MLO_LRN_LCL_DATA_WIDTH; - // value offset, vertical - int lcl_v_off_v = (v_off_v + MLO_LRN_PAD) * MLO_LRN_LCL_DATA_WIDTH; - for(int i = 0; i < MLO_LRN_N_HORIZ_OUT_PIX; i++) - { - _FLOAT prv_ratio_accum = (_FLOAT)0; - int v_off_h = lcl_id0 * MLO_LRN_N_HORIZ_OUT_PIX + i; - - int wstart = x + v_off_h - MLO_LRN_PAD; - int wend = min(wstart + MLO_LRN_KERNEL_SZ, MLO_LRN_TOP_WIDTH + MLO_LRN_PRE_PAD); - - int adj_area_size = (hend - hstart) * (wend - wstart); - - // accum offset, horiz - int lcl_a_off_h = v_off_h; - // value offset, horiz - int lcl_v_off_h = lcl_a_off_h + MLO_LRN_PAD; - - for(int k = 0; k < MLO_LRN_KERNEL_SZ; k++) - { - for(int l = 0; l < MLO_LRN_KERNEL_SZ; l++) - { - prv_ratio_accum += - ratio_data[(v_off_v + k) * MLO_LRN_LCL_DATA_WIDTH + lcl_a_off_h + l]; - } - } - - _FLOAT top_df_val = top_df_data[lcl_v_off_v + lcl_v_off_h]; - - uint bot_off0 = MLO_LRN_BOT_BATCH_STRIDE * b + MLO_LRN_BOT_CHANNEL_STRIDE * o + - MLO_LRN_BOT_STRIDE * (y + v_off_v) + x + v_off_h; - - uint bot_off = (y + v_off_v < MLO_LRN_BOT_HEIGHT && x + v_off_h < MLO_LRN_BOT_WIDTH && - b < MLO_LRN_BATCH_SZ && o < MLO_LRN_N_OUTPUTS) - ? bot_off0 - : MLO_LRN_BATCH_SZ * MLO_LRN_BOT_BATCH_STRIDE - 1; -#if DBG_RANGE - - if(bot_off >= MLO_LRN_BATCH_SZ * MLO_LRN_BOT_BATCH_STRIDE) - { - printf("K:err:bot-off_range\n"); - } -#endif - _FLOAT bot_dta = bot[bot_off]; - - bot_dta = (y + v_off_v < MLO_LRN_BOT_HEIGHT && x + v_off_h < MLO_LRN_BOT_WIDTH && - b < MLO_LRN_BATCH_SZ && o < MLO_LRN_N_OUTPUTS) - ? bot_dta - : 0; - - _FLOAT adj_ratio = (_FLOAT)2.f * alpha * beta / adj_area_size; - _FLOAT prv_accum_ratio = adj_ratio * bot_dta * prv_ratio_accum; - prv_bot_diff[j][i] = prv_exp_scale[j][i] * top_df_val - prv_accum_ratio; - } - } - - for(int j = 0; j < MLO_LRN_N_VERT_OUT_PIX; j++) - { - for(int i = 0; i < MLO_LRN_N_HORIZ_OUT_PIX; i++) - { - if(bot_y + j < MLO_LRN_BOT_HEIGHT && bot_x + i < MLO_LRN_BOT_WIDTH && - b < MLO_LRN_BATCH_SZ && o < MLO_LRN_N_OUTPUTS) - { -#if DBG_RANGE - - if(MLO_LRN_BOTDF_BATCH_STRIDE * b + MLO_LRN_BOTDF_CHANNEL_STRIDE * o + - MLO_LRN_BOTDF_STRIDE * (bot_y + j) + bot_x + i >= - MLO_LRN_BATCH_SZ * MLO_LRN_BOTDF_BATCH_STRIDE) - { - printf("K:err:botdf-off_range\n"); - } -#endif - bot_df[MLO_LRN_BOTDF_BATCH_STRIDE * b + MLO_LRN_BOTDF_CHANNEL_STRIDE * o + - MLO_LRN_BOTDF_STRIDE * (bot_y + j) + bot_x + i] = prv_bot_diff[j][i]; - } - } - } -} - -#if(MLO_LRN_N_INPUTS < MLO_LRN_KERNEL_SZ) -#define MLO_LOW_CHNL_COUNT 1 -#else -#define MLO_LOW_CHNL_COUNT 0 -#endif - -__attribute__((reqd_work_group_size(MLO_LRN_GROUP_SZ0, MLO_LRN_GROUP_SZ1, MLO_LRN_GROUP_SZ2))) -__kernel void -MIOpenLRNAcrossChannelsBwd1(const __global _FLOAT* top, - const __global _FLOAT* bot, - const __global _FLOAT* top_df, - const __global _FLOAT* scale, - __global _FLOAT* bot_df, - _FLOAT ratio, - UNUSED _FLOAT alpha, - _FLOAT beta) -{ - int x = get_global_id(0); // channel x - int y = get_global_id(1); // channel y - int b = get_global_id(2); // batch - _FLOAT accum_ratio = 0; - _FLOAT top_df_in[MLO_LRN_KERNEL_SZ]; - _FLOAT scale_in[MLO_LRN_KERNEL_SZ]; - _FLOAT ratio_dta[MLO_LRN_KERNEL_SZ]; - int c_i = 0, c_o = 0; - int bot_df_off = 0; - - for(c_i = 0; c_i < MLO_LRN_PRE_PAD; c_i++) - { - - top_df_in[c_i] = top_df[MLO_LRN_TOPDF_BATCH_STRIDE * b + - MLO_LRN_TOPDF_CHANNEL_STRIDE * c_i + MLO_LRN_TOPDF_STRIDE * y + x]; - scale_in[c_i] = scale[MLO_LRN_SCALE_BATCH_STRIDE * b + MLO_LRN_SCALE_CHANNEL_STRIDE * c_i + - MLO_LRN_SCALE_STRIDE * y + x]; - _FLOAT top_dta = top[MLO_LRN_TOP_BATCH_STRIDE * b + MLO_LRN_TOP_CHANNEL_STRIDE * c_i + - MLO_LRN_TOP_STRIDE * y + x]; - - ratio_dta[c_i] = (top_df_in[c_i] * top_dta) / scale_in[c_i]; - -#if MLO_LOW_CHNL_COUNT == 1 - ratio_dta[c_i] = (c_i < MLO_LRN_N_OUTPUTS) ? ratio_dta[c_i] : 0; -#endif - - accum_ratio = accum_ratio + ratio_dta[c_i]; - } - - for(; c_i < MLO_LRN_KERNEL_SZ; c_i++, c_o++) - { - top_df_in[c_i] = top_df[MLO_LRN_TOPDF_BATCH_STRIDE * b + - MLO_LRN_TOPDF_CHANNEL_STRIDE * c_i + MLO_LRN_TOPDF_STRIDE * y + x]; - scale_in[c_i] = scale[MLO_LRN_SCALE_BATCH_STRIDE * b + MLO_LRN_SCALE_CHANNEL_STRIDE * c_i + - MLO_LRN_SCALE_STRIDE * y + x]; - _FLOAT top_dta = top[MLO_LRN_TOP_BATCH_STRIDE * b + MLO_LRN_TOP_CHANNEL_STRIDE * c_i + - MLO_LRN_TOP_STRIDE * y + x]; - ratio_dta[c_i] = (top_df_in[c_i] * top_dta) / scale_in[c_i]; -#if MLO_LOW_CHNL_COUNT == 1 - ratio_dta[c_i] = (c_i < MLO_LRN_N_OUTPUTS) ? ratio_dta[c_i] : 0; -#endif - - accum_ratio = accum_ratio + ratio_dta[c_i]; -#if MLO_LOW_CHNL_COUNT == 1 - if(c_o < MLO_LRN_N_INPUTS) -#endif - { - _FLOAT bot_dta = bot[MLO_LRN_BOT_BATCH_STRIDE * b + MLO_LRN_BOT_CHANNEL_STRIDE * c_o + - MLO_LRN_BOT_STRIDE * y + x]; - - _FLOAT prv_scale = scale_in[c_o]; - - _FLOAT exp_scale = exp(-beta * log(prv_scale)); - // pow(prv_scale, -beta); - - _FLOAT prv_accum_ratio = ratio * bot_dta * accum_ratio; - - _FLOAT out_val = top_df_in[c_o] * exp_scale - prv_accum_ratio; - - bot_df_off = MLO_LRN_BOTDF_BATCH_STRIDE * b + MLO_LRN_BOTDF_CHANNEL_STRIDE * c_o + - MLO_LRN_BOTDF_STRIDE * y + x; - - bot_df[bot_df_off] = out_val; - } - } - - for(; c_i < MLO_LRN_N_INPUTS; c_i++, c_o++) - { - - _FLOAT prv_top_df_in = - top_df[MLO_LRN_TOPDF_BATCH_STRIDE * b + MLO_LRN_TOPDF_CHANNEL_STRIDE * c_i + - MLO_LRN_TOPDF_STRIDE * y + x]; - _FLOAT prv_scale_in = - scale[MLO_LRN_SCALE_BATCH_STRIDE * b + MLO_LRN_SCALE_CHANNEL_STRIDE * c_i + - MLO_LRN_SCALE_STRIDE * y + x]; - _FLOAT top_dta = top[MLO_LRN_TOP_BATCH_STRIDE * b + MLO_LRN_TOP_CHANNEL_STRIDE * c_i + - MLO_LRN_TOP_STRIDE * y + x]; - _FLOAT prv_ratio_dta = prv_top_df_in * top_dta / prv_scale_in; -#if MLO_LOW_CHNL_COUNT == 1 - prv_ratio_dta = (c_i < MLO_LRN_N_OUTPUTS) ? prv_ratio_dta : 0; -#endif - - accum_ratio = accum_ratio + prv_ratio_dta; - - accum_ratio = accum_ratio - ratio_dta[0]; - - for(int i = 0; i < MLO_LRN_KERNEL_SZ - 1; i++) - { - top_df_in[i] = top_df_in[i + 1]; - scale_in[i] = scale_in[i + 1]; - ratio_dta[i] = ratio_dta[i + 1]; - } - - top_df_in[MLO_LRN_KERNEL_SZ - 1] = prv_top_df_in; - scale_in[MLO_LRN_KERNEL_SZ - 1] = prv_scale_in; - ratio_dta[MLO_LRN_KERNEL_SZ - 1] = prv_ratio_dta; - -#if MLO_LOW_CHNL_COUNT == 1 - if(c_o < MLO_LRN_N_INPUTS) -#endif - { - _FLOAT bot_dta = bot[MLO_LRN_BOT_BATCH_STRIDE * b + MLO_LRN_BOT_CHANNEL_STRIDE * c_o + - MLO_LRN_BOT_STRIDE * y + x]; - - _FLOAT prv_scale = scale_in[MLO_LRN_PAD]; - - _FLOAT exp_scale = exp(-beta * log(prv_scale)); - // pow(prv_scale,-beta); - - _FLOAT prv_accum_ratio = ratio * bot_dta * accum_ratio; - - _FLOAT out_val = top_df_in[MLO_LRN_PAD] * exp_scale - prv_accum_ratio; - - bot_df_off = MLO_LRN_BOTDF_BATCH_STRIDE * b + MLO_LRN_BOTDF_CHANNEL_STRIDE * c_o + - MLO_LRN_BOTDF_STRIDE * y + x; - - bot_df[bot_df_off] = out_val; - } - } - - for(; c_i < MLO_LRN_N_INPUTS + MLO_LRN_PRE_PAD; c_i++, c_o++) - { - - accum_ratio = accum_ratio - ratio_dta[0]; - - for(int i = 0; i < MLO_LRN_KERNEL_SZ - 1; i++) - { - top_df_in[i] = top_df_in[i + 1]; - scale_in[i] = scale_in[i + 1]; - ratio_dta[i] = ratio_dta[i + 1]; - } - -#if MLO_LOW_CHNL_COUNT == 1 - if(c_o < MLO_LRN_N_INPUTS) -#endif - { - _FLOAT bot_dta = bot[MLO_LRN_BOT_BATCH_STRIDE * b + MLO_LRN_BOT_CHANNEL_STRIDE * c_o + - MLO_LRN_BOT_STRIDE * y + x]; - - _FLOAT prv_scale = scale_in[MLO_LRN_PAD]; - - _FLOAT exp_scale = exp(-beta * log(prv_scale)); - // pow(prv_scale,-beta); - - _FLOAT prv_accum_ratio = ratio * bot_dta * accum_ratio; - - _FLOAT out_val = top_df_in[MLO_LRN_PAD] * exp_scale - prv_accum_ratio; - - bot_df_off = MLO_LRN_BOTDF_BATCH_STRIDE * b + MLO_LRN_BOTDF_CHANNEL_STRIDE * c_o + - MLO_LRN_BOTDF_STRIDE * y + x; - - bot_df[bot_df_off] = out_val; - } - } -} diff --git a/projects/miopen/src/kernels/MIOpenLRNBwd.cpp b/projects/miopen/src/kernels/MIOpenLRNBwd.cpp new file mode 100644 index 000000000000..bb23a3f7b324 --- /dev/null +++ b/projects/miopen/src/kernels/MIOpenLRNBwd.cpp @@ -0,0 +1,399 @@ +// Copyright © Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#ifndef MIOPEN_HIP_RUNTIME_COMPILE +#include +#include +#endif + +#include "miopen_math.hpp" + +constexpr unsigned group_size_z = 1u; +constexpr unsigned local_data_width = GROUP_SIZE_X * HORIZ_OUT_PIX + KERNEL_SIZE - 1; +constexpr unsigned local_data_height = GROUP_SIZE_Y * VERT_OUT_PIX + KERNEL_SIZE - 1; +constexpr bool low_channel_count = N_INPUTS < KERNEL_SIZE; + +/* +This is a naive implementation. +The "sliding window" -based implementation is in MIOpenLRNFwd.cpp file +*/ +__launch_bounds__(GROUP_SIZE_X* GROUP_SIZE_Y* group_size_z) extern "C" __global__ + void MIOpenLRNWithinChannelBwd(const FLOAT* top, + const FLOAT* bot, + const FLOAT* top_df, + const FLOAT* scale, + FLOAT* bot_df, + FLOAT /*ratio*/, + FLOAT alpha, + FLOAT beta) +{ + __shared__ FLOAT top_df_data[local_data_width * local_data_height]; + __shared__ FLOAT ratio_data[local_data_width * local_data_height]; + const int x = blockIdx.x * GROUP_SIZE_X * HORIZ_OUT_PIX; + const int y = blockIdx.y * GROUP_SIZE_Y * VERT_OUT_PIX; + const int lcl_id0 = threadIdx.x; + const int lcl_id1 = threadIdx.y; + const int ob = blockIdx.z * group_size_z + threadIdx.z; // output * batch_sz + const int o = ob / BATCH_SIZE; + const int b = ob - o * BATCH_SIZE; + const int top_x = x; + const int top_y = y; + const int top_df_off = b * TOPDF_BATCH_STRIDE + o * TOPDF_CHANNEL_STRIDE; + const int scale_off = b * SCALE_BATCH_STRIDE + o * SCALE_CHANNEL_STRIDE; + const int bot_x = x + lcl_id0 * HORIZ_OUT_PIX; + const int bot_y = y + lcl_id1 * VERT_OUT_PIX; + +// load top_diff and scale tiles +#pragma unroll VERT_OUT_PIX + for(int b_j = lcl_id1; b_j < local_data_height; b_j += GROUP_SIZE_Y) + { + int top_y_act = top_y + b_j - PAD; + const bool invisibleY = (top_y_act < 0) || (top_y_act >= TOP_HEIGHT); + top_y_act = (invisibleY) ? 0 : top_y_act; + + const int top_df_y_off = top_y_act * TOPDF_STRIDE; + const int scale_y_off = top_y_act * SCALE_STRIDE; + const int lcl_off_v = b_j * local_data_width; + +/* + Note: Code duplication from manual loop peeling + + The HIP compiler does not unroll this loop if the loop statement is written as + `for(int b_i = lcl_id0; b_i < local_data_width; b_i += GROUP_SIZE_X)`, leading + to a loss in performance. + + To enable unrolling the last loop iteration is manually peeled. As this is the + iteration that may not be uniformly executed by all work-items in a work-group. + + Unfortunately there is code duplication in the body is the loop/if statements, as + refactoring it into a __alwaysinline__ free function seems inhibits compiler + optimizations. +*/ +#pragma unroll + for(int i = 0; i < HORIZ_OUT_PIX; i++) + { + const int b_i = i * GROUP_SIZE_X + lcl_id0; + int top_x_act = top_x + b_i - PAD; + const bool invisibleX = (top_x_act < 0) || (top_x_act >= TOP_WIDTH); + top_x_act = (invisibleX) ? 0 : top_x_act; + + FLOAT top_df_val = top_df[top_df_off + top_df_y_off + top_x_act]; + FLOAT scale_val = scale[scale_off + scale_y_off + top_x_act]; + + top_df_val = (invisibleX || invisibleY) ? 0 : top_df_val; + scale_val = (invisibleX || invisibleY) ? FLOAT(1.f) : scale_val; + + top_df_data[lcl_off_v + b_i] = top_df_val; + ratio_data[lcl_off_v + b_i] = scale_val; + } + if(lcl_id0 < (KERNEL_SIZE - 1)) + { + const int b_i = GROUP_SIZE_X * HORIZ_OUT_PIX + lcl_id0; + int top_x_act = top_x + b_i - PAD; + const bool invisibleX = (top_x_act < 0) || (top_x_act >= TOP_WIDTH); + top_x_act = (invisibleX) ? 0 : top_x_act; + + FLOAT top_df_val = top_df[top_df_off + top_df_y_off + top_x_act]; + FLOAT scale_val = scale[scale_off + scale_y_off + top_x_act]; + + top_df_val = (invisibleX || invisibleY) ? 0 : top_df_val; + scale_val = (invisibleX || invisibleY) ? FLOAT(1.f) : scale_val; + + top_df_data[lcl_off_v + b_i] = top_df_val; + ratio_data[lcl_off_v + b_i] = scale_val; + } + } + + __syncthreads(); + + // actual top_diffs and scales + FLOAT prv_exp_scale[VERT_OUT_PIX][HORIZ_OUT_PIX]; + for(int j = 0; j < VERT_OUT_PIX; ++j) + { + const int lcl_off_v = (lcl_id1 * VERT_OUT_PIX + PAD + j) * local_data_width; + for(int i = 0; i < HORIZ_OUT_PIX; i++) + { + const FLOAT scale_ratio = ratio_data[lcl_off_v + lcl_id0 * HORIZ_OUT_PIX + PAD + i]; + const FLOAT scale_ratio_log = miopen::detail::log(scale_ratio); + prv_exp_scale[j][i] = miopen::detail::exp(-beta * scale_ratio_log); + } + } + + __syncthreads(); + + // read top and load ratio tile + const int top_off = b * TOP_BATCH_STRIDE + o * TOP_CHANNEL_STRIDE; +#pragma unroll VERT_OUT_PIX + for(int b_j = lcl_id1; b_j < local_data_height; b_j += GROUP_SIZE_Y) + { + int top_y_act = top_y + b_j - PAD; + const bool invisibleY = (top_y_act < 0) || (top_y_act >= TOP_HEIGHT); + top_y_act = (invisibleY) ? 0 : top_y_act; + + const int top_y_off = top_y_act * TOP_STRIDE; + const int lcl_off_v = b_j * local_data_width; +/* + Note: Code duplication from manual loop peeling + + The HIP compiler does not unroll this loop if the loop statement is written as + `for(int b_i = lcl_id0; b_i < local_data_width; b_i += GROUP_SIZE_X)`, leading + to a loss in performance. + + To enable unrolling the last loop iteration is manually peeled. As this is the + iteration that may not be uniformly executed by all work-items in a work-group. + + Unfortunately there is code duplication in the body is the loop/if statements, as + refactoring it into a __alwaysinline__ free function seems inhibits compiler + optimizations. +*/ +#pragma unroll + for(int i = 0; i < HORIZ_OUT_PIX; i++) + { + int b_i = i * GROUP_SIZE_X + lcl_id0; + int top_x_act = top_x + b_i - PAD; + bool invisibleX = (top_x_act < 0) || (top_x_act >= TOP_WIDTH); + top_x_act = (invisibleX) ? 0 : top_x_act; + + FLOAT top_val = top[top_off + top_y_off + top_x_act]; + top_val = (invisibleX || invisibleY) ? 0 : top_val; + + const FLOAT top_df_val = top_df_data[lcl_off_v + b_i]; + const FLOAT scale_val = ratio_data[lcl_off_v + b_i]; + + // scale val is not 0 + FLOAT ratio_dta = (top_df_val * top_val) / scale_val; + // replacing scale with ratio + ratio_data[lcl_off_v + b_i] = ratio_dta; + } + if(lcl_id0 < (KERNEL_SIZE - 1)) + { + int b_i = GROUP_SIZE_X * HORIZ_OUT_PIX + lcl_id0; + int top_x_act = top_x + b_i - PAD; + bool invisibleX = (top_x_act < 0) || (top_x_act >= TOP_WIDTH); + top_x_act = (invisibleX) ? 0 : top_x_act; + + FLOAT top_val = top[top_off + top_y_off + top_x_act]; + top_val = (invisibleX || invisibleY) ? 0 : top_val; + + const FLOAT top_df_val = top_df_data[lcl_off_v + b_i]; + const FLOAT scale_val = ratio_data[lcl_off_v + b_i]; + + // scale val is not 0 + FLOAT ratio_dta = (top_df_val * top_val) / scale_val; + // replacing scale with ratio + ratio_data[lcl_off_v + b_i] = ratio_dta; + } + } + + __syncthreads(); + + // caculate bot diff + FLOAT prv_bot_diff[VERT_OUT_PIX][HORIZ_OUT_PIX]; + for(int j = 0; j < VERT_OUT_PIX; ++j) + { + const int v_off_v = (lcl_id1 * VERT_OUT_PIX + j); + const int hstart = y + v_off_v - PAD; + const int hend = min(hstart + KERNEL_SIZE, TOP_HEIGHT + PRE_PAD); + + // value offset, vertical + int lcl_v_off_v = (v_off_v + PAD) * local_data_width; + for(int i = 0; i < HORIZ_OUT_PIX; i++) + { + FLOAT prv_ratio_accum(0); + const int v_off_h = lcl_id0 * HORIZ_OUT_PIX + i; + + const int wstart = x + v_off_h - PAD; + const int wend = min(wstart + KERNEL_SIZE, TOP_WIDTH + PRE_PAD); + const int adj_area_size = (hend - hstart) * (wend - wstart); + + // accum offset, horiz + const int lcl_a_off_h = v_off_h; + // value offset, horiz + const int lcl_v_off_h = lcl_a_off_h + PAD; + for(int k = 0; k < KERNEL_SIZE; k++) + { + for(int l = 0; l < KERNEL_SIZE; l++) + { + prv_ratio_accum += + ratio_data[(v_off_v + k) * local_data_width + lcl_a_off_h + l]; + } + } + + const FLOAT top_df_val = top_df_data[lcl_v_off_v + lcl_v_off_h]; + const unsigned bot_off0 = BOT_BATCH_STRIDE * b + BOT_CHANNEL_STRIDE * o + + BOT_STRIDE * (y + v_off_v) + x + v_off_h; + + const unsigned bot_off = (y + v_off_v < BOT_HEIGHT && x + v_off_h < BOT_WIDTH && + b < BATCH_SIZE && o < OUT_CHANNELS) + ? bot_off0 + : BATCH_SIZE * BOT_BATCH_STRIDE - 1; + + const FLOAT bot_dta = (y + v_off_v < BOT_HEIGHT && x + v_off_h < BOT_WIDTH && + b < BATCH_SIZE && o < OUT_CHANNELS) + ? bot[bot_off] + : 0; + const FLOAT adj_ratio = FLOAT(2.f) * alpha * beta / adj_area_size; + const FLOAT prv_accum_ratio = adj_ratio * bot_dta * prv_ratio_accum; + prv_bot_diff[j][i] = prv_exp_scale[j][i] * top_df_val - prv_accum_ratio; + } + } + + for(int j = 0; j < VERT_OUT_PIX; j++) + { + for(int i = 0; i < HORIZ_OUT_PIX; i++) + { + if(bot_y + j < BOT_HEIGHT && bot_x + i < BOT_WIDTH && b < BATCH_SIZE && + o < OUT_CHANNELS) + { + bot_df[BOTDF_BATCH_STRIDE * b + BOTDF_CHANNEL_STRIDE * o + + BOTDF_STRIDE * (bot_y + j) + bot_x + i] = prv_bot_diff[j][i]; + } + } + } +} + +__launch_bounds__(GROUP_SIZE_X* GROUP_SIZE_Y* group_size_z) extern "C" __global__ + void MIOpenLRNAcrossChannelsBwd1(const FLOAT* top, + const FLOAT* bot, + const FLOAT* top_df, + const FLOAT* scale, + FLOAT* bot_df, + FLOAT ratio, + FLOAT /* alpha */, + FLOAT beta) +{ + const int x = blockIdx.x * GROUP_SIZE_X + threadIdx.x; // channel x + const int y = blockIdx.y * GROUP_SIZE_Y + threadIdx.y; // channel y + const int b = blockIdx.z * group_size_z + threadIdx.z; // batch + FLOAT accum_ratio = 0; + FLOAT top_df_in[KERNEL_SIZE]; + FLOAT scale_in[KERNEL_SIZE]; + FLOAT ratio_dta[KERNEL_SIZE]; + int c_i = 0, c_o = 0; + + for(; c_i < PRE_PAD; c_i++) + { + top_df_in[c_i] = + top_df[TOPDF_BATCH_STRIDE * b + TOPDF_CHANNEL_STRIDE * c_i + TOPDF_STRIDE * y + x]; + scale_in[c_i] = + scale[SCALE_BATCH_STRIDE * b + SCALE_CHANNEL_STRIDE * c_i + SCALE_STRIDE * y + x]; + const FLOAT top_dta = + top[TOP_BATCH_STRIDE * b + TOP_CHANNEL_STRIDE * c_i + TOP_STRIDE * y + x]; + + ratio_dta[c_i] = (top_df_in[c_i] * top_dta) / scale_in[c_i]; + + if constexpr(low_channel_count) + { + ratio_dta[c_i] = (c_i < OUT_CHANNELS) ? ratio_dta[c_i] : 0; + } + + accum_ratio = accum_ratio + ratio_dta[c_i]; + } + + for(; c_i < KERNEL_SIZE; c_i++, c_o++) + { + top_df_in[c_i] = + top_df[TOPDF_BATCH_STRIDE * b + TOPDF_CHANNEL_STRIDE * c_i + TOPDF_STRIDE * y + x]; + scale_in[c_i] = + scale[SCALE_BATCH_STRIDE * b + SCALE_CHANNEL_STRIDE * c_i + SCALE_STRIDE * y + x]; + const FLOAT top_dta = + top[TOP_BATCH_STRIDE * b + TOP_CHANNEL_STRIDE * c_i + TOP_STRIDE * y + x]; + ratio_dta[c_i] = (top_df_in[c_i] * top_dta) / scale_in[c_i]; + if constexpr(low_channel_count) + { + ratio_dta[c_i] = (c_i < OUT_CHANNELS) ? ratio_dta[c_i] : 0; + } + + accum_ratio = accum_ratio + ratio_dta[c_i]; + if(!low_channel_count || c_o < N_INPUTS) + { + const FLOAT bot_dta = + bot[BOT_BATCH_STRIDE * b + BOT_CHANNEL_STRIDE * c_o + BOT_STRIDE * y + x]; + + const FLOAT prv_scale = scale_in[c_o]; + const FLOAT prv_scale_log = miopen::detail::log(prv_scale); + const FLOAT exp_scale = miopen::detail::exp(-beta * prv_scale_log); + const FLOAT prv_accum_ratio = ratio * bot_dta * accum_ratio; + + const FLOAT out_val = top_df_in[c_o] * exp_scale - prv_accum_ratio; + + const int bot_df_off = + BOTDF_BATCH_STRIDE * b + BOTDF_CHANNEL_STRIDE * c_o + BOTDF_STRIDE * y + x; + + bot_df[bot_df_off] = out_val; + } + } + + for(; c_i < N_INPUTS; c_i++, c_o++) + { + const FLOAT prv_top_df_in = + top_df[TOPDF_BATCH_STRIDE * b + TOPDF_CHANNEL_STRIDE * c_i + TOPDF_STRIDE * y + x]; + const FLOAT prv_scale_in = + scale[SCALE_BATCH_STRIDE * b + SCALE_CHANNEL_STRIDE * c_i + SCALE_STRIDE * y + x]; + const FLOAT top_dta = + top[TOP_BATCH_STRIDE * b + TOP_CHANNEL_STRIDE * c_i + TOP_STRIDE * y + x]; + FLOAT prv_ratio_dta = prv_top_df_in * top_dta / prv_scale_in; + if constexpr(low_channel_count) + { + prv_ratio_dta = (c_i < OUT_CHANNELS) ? prv_ratio_dta : 0; + } + + accum_ratio = accum_ratio + prv_ratio_dta; + accum_ratio = accum_ratio - ratio_dta[0]; + + for(int i = 0; i < KERNEL_SIZE - 1; i++) + { + top_df_in[i] = top_df_in[i + 1]; + scale_in[i] = scale_in[i + 1]; + ratio_dta[i] = ratio_dta[i + 1]; + } + + top_df_in[KERNEL_SIZE - 1] = prv_top_df_in; + scale_in[KERNEL_SIZE - 1] = prv_scale_in; + ratio_dta[KERNEL_SIZE - 1] = prv_ratio_dta; + + if(!low_channel_count || c_o < N_INPUTS) + { + const FLOAT bot_dta = + bot[BOT_BATCH_STRIDE * b + BOT_CHANNEL_STRIDE * c_o + BOT_STRIDE * y + x]; + + const FLOAT prv_scale = scale_in[PAD]; + const FLOAT exp_scale = miopen::detail::exp(-beta * miopen::detail::log(prv_scale)); + const FLOAT prv_accum_ratio = ratio * bot_dta * accum_ratio; + const FLOAT out_val = top_df_in[PAD] * exp_scale - prv_accum_ratio; + + const int bot_df_off = + BOTDF_BATCH_STRIDE * b + BOTDF_CHANNEL_STRIDE * c_o + BOTDF_STRIDE * y + x; + + bot_df[bot_df_off] = out_val; + } + } + + for(; c_i < N_INPUTS + PRE_PAD; c_i++, c_o++) + { + accum_ratio = accum_ratio - ratio_dta[0]; + + for(int i = 0; i < KERNEL_SIZE - 1; i++) + { + top_df_in[i] = top_df_in[i + 1]; + scale_in[i] = scale_in[i + 1]; + ratio_dta[i] = ratio_dta[i + 1]; + } + + if(!low_channel_count || c_o < N_INPUTS) + { + const FLOAT bot_dta = + bot[BOT_BATCH_STRIDE * b + BOT_CHANNEL_STRIDE * c_o + BOT_STRIDE * y + x]; + + const FLOAT prv_scale = scale_in[PAD]; + const FLOAT exp_scale = miopen::detail::exp(-beta * miopen::detail::log(prv_scale)); + const FLOAT prv_accum_ratio = ratio * bot_dta * accum_ratio; + const FLOAT out_val = top_df_in[PAD] * exp_scale - prv_accum_ratio; + + const int bot_df_off = + BOTDF_BATCH_STRIDE * b + BOTDF_CHANNEL_STRIDE * c_o + BOTDF_STRIDE * y + x; + + bot_df[bot_df_off] = out_val; + } + } +} diff --git a/projects/miopen/src/kernels/MIOpenLRNFwd.cl b/projects/miopen/src/kernels/MIOpenLRNFwd.cl deleted file mode 100644 index 093b86f6dd08..000000000000 --- a/projects/miopen/src/kernels/MIOpenLRNFwd.cl +++ /dev/null @@ -1,713 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2017 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -#define PPCAT_NX(A, B) A##B -#define PPCAT(A, B) PPCAT_NX(A, B) -#define TWO 2 -#define THREE 3 -#define FOUR 4 -#define EIGHT 8 - -#if MIOPEN_USE_FP16 == 1 -#pragma OPENCL EXTENSION cl_khr_fp16 : enable -#define _FLOAT half -#endif -#if MIOPEN_USE_FP32 == 1 -#define _FLOAT float -#endif - -#define _FLOAT2 PPCAT(_FLOAT, TWO) -#define _FLOAT3 PPCAT(_FLOAT, THREE) -#define _FLOAT4 PPCAT(_FLOAT, FOUR) -#define _FLOAT8 PPCAT(_FLOAT, EIGHT) - -#define DBG_OUT 0 - -#define UNUSED __attribute__((__unused__)) - -#define MLO_LRN_GROUP_SZ2 1 -#define MLO_LRN_STRIDE 1 - -#define MLO_LRN_LEFT_PAD0 (((MLO_LRN_PRE_PAD0 + MLO_READ_UNIT - 1) / MLO_READ_UNIT) * MLO_READ_UNIT) -#define MLO_LRN_RIGHT_SIDE \ - (((MLO_LRN_GROUP_SZ0 * MLO_LRN_N_HORIZ_OUT_PIX + MLO_LRN_PAD0 + MLO_READ_UNIT - 1) / \ - MLO_READ_UNIT) * \ - MLO_READ_UNIT) -#define MLO_LRN_LCL_DATA_WIDTH (MLO_LRN_LEFT_PAD0 + MLO_LRN_RIGHT_SIDE) -#define MLO_LCL_READ4 (MLO_LRN_LCL_DATA_WIDTH / MLO_READ_UNIT) -#define MLO_LRN_LCL_DATA_HEIGHT (MLO_LRN_GROUP_SZ1 * MLO_LRN_N_VERT_OUT_PIX + MLO_LRN_KERNEL_SZ - 1) -#define MLO_LRN_GROUP_SZ (MLO_LRN_GROUP_SZ2 * MLO_LRN_GROUP_SZ1 * MLO_LRN_GROUP_SZ0) -// #define MLO_LRN_PREPAD_SZ (MLO_LRN_KERNEL_SZ - 1)/2 - -struct LRNForwardParam -{ - _FLOAT alphaoverarea; - _FLOAT alpha; - _FLOAT beta; - _FLOAT K; -}; - -#include "math_ops.h" -__attribute__((reqd_work_group_size(MLO_LRN_GROUP_SZ0, MLO_LRN_GROUP_SZ1, MLO_LRN_GROUP_SZ2))) -__kernel void -MIOpenLRNWithinChannel_PS(const __global _FLOAT* bot, - __global _FLOAT* top, -#if MLO_LRN_DO_SCALE - __global _FLOAT* scale, -#endif - _FLOAT alphaoverarea, - UNUSED _FLOAT alpha, - _FLOAT beta, - _FLOAT K) -{ - // IT's taken from POOLING AVE with stride = 1' - __local _FLOAT bot_data[MLO_LRN_LCL_DATA_WIDTH * MLO_LRN_LCL_DATA_HEIGHT]; - int x = get_group_id(0) * MLO_LRN_GROUP_SZ0 * MLO_LRN_N_HORIZ_OUT_PIX; - int y = get_group_id(1) * MLO_LRN_GROUP_SZ1 * MLO_LRN_N_VERT_OUT_PIX; - int lcl_id0 = get_local_id(0); - int lcl_id1 = get_local_id(1); - int ob = get_global_id(2); // output * batch_sz - int o = iDiv_legacy(ob, MLO_LRN_BATCH_SZ); - int b = iMod(ob, o, MLO_LRN_BATCH_SZ); - int bot_x = x; - int bot_y = y; - int bot_off = b * MLO_LRN_BOT_BATCH_STRIDE + o * MLO_LRN_BOT_CHANNEL_STRIDE; - - // load tile - for(int b_j = lcl_id1; b_j < MLO_LRN_LCL_DATA_HEIGHT; b_j += MLO_LRN_GROUP_SZ1) - { - int bot_y_act = bot_y + b_j - MLO_LRN_PRE_PAD1; - - bool invisibleY = (bot_y_act < 0) || (bot_y_act >= MLO_LRN_BOT_HEIGHT); - - int bot_y_off = bot_y_act * MLO_LRN_BOT_STRIDE; - - int lcl_off_v = mul24(b_j, (int)MLO_LRN_LCL_DATA_WIDTH); - - for(int b_i = lcl_id0; b_i < MLO_LCL_READ4; b_i += MLO_LRN_GROUP_SZ0) - { - - int bot_x_act = bot_x + (b_i * MLO_READ_UNIT) - MLO_LRN_LEFT_PAD0; - - bool invisibleX; - for(int i = 0; i < MLO_READ_UNIT; ++i) - { - - int bot_off_x = bot_off + bot_y_off + bot_x_act + i; - - invisibleX = (bot_x_act + i < 0) || (bot_x_act + i >= MLO_LRN_BOT_WIDTH); - - bot_off_x = (invisibleX || invisibleY) ? 0 : bot_off_x; - - _FLOAT bot_val = bot[bot_off_x]; - - bot_val = (invisibleX || invisibleY) ? 0 : bot_val; - - bot_data[lcl_off_v + (b_i * MLO_READ_UNIT) + i] = bot_val; - } - } - } - - barrier(CLK_LOCAL_MEM_FENCE); -#if MLO_LRN_N_HORIZ_OUT_PIX > 1 - _FLOAT partial_sum_x[MLO_LRN_N_HORIZ_OUT_PIX - 1]; // horizontal partial sum -#endif -#if MLO_LRN_N_VERT_OUT_PIX > 1 - _FLOAT partial_sum_xy[MLO_LRN_N_VERT_OUT_PIX - 1] - [MLO_LRN_N_HORIZ_OUT_PIX]; // horizontal-vertical partial sums. -#endif - _FLOAT accum[MLO_LRN_N_VERT_OUT_PIX][MLO_LRN_N_HORIZ_OUT_PIX]; // accumulator - - int top_y = mad24(lcl_id1, (int)MLO_LRN_N_VERT_OUT_PIX, y); - int top_x = mad24(lcl_id0, (int)MLO_LRN_N_HORIZ_OUT_PIX, x); - - int lcl_y = mul24(lcl_id1, (int)MLO_LRN_N_VERT_OUT_PIX); - int lcl_x = - mad24(lcl_id0, (int)(MLO_LRN_N_HORIZ_OUT_PIX), (int)(MLO_LRN_LEFT_PAD0 - MLO_LRN_PRE_PAD0)); - int lcl_off = mad24(lcl_y, MLO_LRN_LCL_DATA_WIDTH, lcl_x); - - for(int j = 0; j < MLO_LRN_N_VERT_OUT_PIX; ++j) - { - for(int i = 0; i < MLO_LRN_N_HORIZ_OUT_PIX; ++i) - { - accum[j][i] = 0; - } - } -#if MLO_LRN_N_VERT_OUT_PIX > 1 - for(int j = 0; j < MLO_LRN_N_VERT_OUT_PIX - 1; ++j) - { - for(int i = 0; i < MLO_LRN_N_HORIZ_OUT_PIX; ++i) - { - partial_sum_xy[j][i] = 0; - } - } -#endif - - // running window summation - _FLOAT mov_accum; - int jj = 0; - int ii = 0; - - // first to get vertica partial sums - -#if MLO_LRN_N_VERT_OUT_PIX > 1 - for(; jj < (int)(MLO_LRN_N_VERT_OUT_PIX - 1); ++jj) - { - for(ii = 0; ii < (int)(MLO_LRN_N_HORIZ_OUT_PIX - 1); ++ii) - { - - _FLOAT bot_val = bot_data[lcl_off + jj * MLO_LRN_LCL_DATA_WIDTH + ii]; - - _FLOAT accum_tmp = bot_val * bot_val; - -#if MLO_LRN_N_HORIZ_OUT_PIX > 1 - // save horizontal partial sums - partial_sum_x[ii] = accum_tmp; -#endif - // accumulate in vert-horizontal(0) - partial_sum_xy[jj][0] += accum_tmp; - } - - for(; ii < (int)MLO_LRN_KERNEL_SZ0; ++ii) - { - - _FLOAT bot_val = bot_data[lcl_off + jj * MLO_LRN_LCL_DATA_WIDTH + ii]; - _FLOAT accum_tmp = bot_val * bot_val; - // accumulate in vert horizontal(0) - partial_sum_xy[jj][0] += accum_tmp; - } - - // running horizontal window - - for(; ii < (int)(MLO_LRN_KERNEL_SZ0 + MLO_LRN_N_HORIZ_OUT_PIX - 1); ++ii) - { - - _FLOAT bot_val = bot_data[lcl_off + jj * MLO_LRN_LCL_DATA_WIDTH + ii]; - _FLOAT accum_tmp = bot_val * bot_val; - // calculate all vertical-horizontal partial sums - partial_sum_xy[jj][ii - MLO_LRN_KERNEL_SZ0 + 1] = - partial_sum_xy[jj][ii - MLO_LRN_KERNEL_SZ0] + - (accum_tmp -#if MLO_LRN_N_HORIZ_OUT_PIX > 1 - - partial_sum_x[ii - MLO_LRN_KERNEL_SZ0] -#endif - ); - } - - // put into accumulator[0][i] - // whatever has been accumulated so far - for(int i = 0; i < MLO_LRN_N_HORIZ_OUT_PIX; ++i) - { - accum[0][i] += partial_sum_xy[jj][i]; - } - } -#endif - - // calculate row 0 accumulators - for(; jj < (int)MLO_LRN_KERNEL_SZ1; ++jj) - { - mov_accum = 0; - - for(ii = 0; ii < (int)(MLO_LRN_N_HORIZ_OUT_PIX - 1); ++ii) - { - - _FLOAT bot_val = bot_data[lcl_off + jj * MLO_LRN_LCL_DATA_WIDTH + ii]; - _FLOAT accum_tmp = bot_val * bot_val; -#if MLO_LRN_N_HORIZ_OUT_PIX > 1 - partial_sum_x[ii] = accum_tmp; -#endif - mov_accum += accum_tmp; - } - - for(; ii < (int)MLO_LRN_KERNEL_SZ0; ++ii) - { - - _FLOAT bot_val = bot_data[lcl_off + jj * MLO_LRN_LCL_DATA_WIDTH + ii]; - _FLOAT accum_tmp = bot_val * bot_val; - mov_accum += accum_tmp; - } - - accum[0][0] += mov_accum; - // running horizontal window - - for(; ii < (int)(MLO_LRN_KERNEL_SZ0 + MLO_LRN_N_HORIZ_OUT_PIX - 1); ++ii) - { - - _FLOAT bot_val = bot_data[lcl_off + jj * MLO_LRN_LCL_DATA_WIDTH + ii]; - _FLOAT accum_tmp = bot_val * bot_val; - // running horizontal window - mov_accum += (accum_tmp -#if MLO_LRN_N_HORIZ_OUT_PIX > 1 - - partial_sum_x[ii - MLO_LRN_KERNEL_SZ0] -#endif - ); - accum[0][ii - MLO_LRN_KERNEL_SZ0 + 1] += mov_accum; - } - } - - // accumulate all other rows besides 0 - for(; jj < (int)(MLO_LRN_KERNEL_SZ1 + MLO_LRN_N_VERT_OUT_PIX - 1); ++jj) - { - // first running horizontal winodw as before - mov_accum = 0; - for(ii = 0; ii < (int)(MLO_LRN_N_HORIZ_OUT_PIX - 1); ++ii) - { - - _FLOAT bot_val = bot_data[lcl_off + jj * MLO_LRN_LCL_DATA_WIDTH + ii]; - _FLOAT accum_tmp = bot_val * bot_val; -#if MLO_LRN_N_HORIZ_OUT_PIX > 1 - partial_sum_x[ii] = accum_tmp; -#endif - accum[jj - MLO_LRN_KERNEL_SZ1 + 1][0] += accum_tmp; - } - for(; ii < (int)MLO_LRN_KERNEL_SZ0; ++ii) - { - - _FLOAT bot_val = bot_data[lcl_off + jj * MLO_LRN_LCL_DATA_WIDTH + ii]; - _FLOAT accum_tmp = bot_val * bot_val; - accum[jj - MLO_LRN_KERNEL_SZ1 + 1][0] += accum_tmp; - } - // running horizontal window - - int ii1 = ii; - for(; ii < (int)(MLO_LRN_KERNEL_SZ0 + MLO_LRN_N_HORIZ_OUT_PIX - 1); ++ii) - { - - _FLOAT bot_val = bot_data[lcl_off + jj * MLO_LRN_LCL_DATA_WIDTH + ii]; - _FLOAT accum_tmp = bot_val * bot_val; - // - accum[jj - MLO_LRN_KERNEL_SZ1 + 1][ii - MLO_LRN_KERNEL_SZ0 + 1] = - accum[jj - MLO_LRN_KERNEL_SZ1 + 1][ii - MLO_LRN_KERNEL_SZ0] + accum_tmp; -#if MLO_LRN_N_HORIZ_OUT_PIX > 1 - accum[jj - MLO_LRN_KERNEL_SZ1 + 1][ii - MLO_LRN_KERNEL_SZ0 + 1] -= - partial_sum_x[ii - MLO_LRN_KERNEL_SZ0]; -#endif - } - - // finally running vertical window - - for(ii = ii1; ii < (int)(MLO_LRN_KERNEL_SZ0 + MLO_LRN_N_HORIZ_OUT_PIX - 1); ++ii) - { - - // finish horizontal summation - // add/substarct vertical patial sum - accum[jj - MLO_LRN_KERNEL_SZ1 + 1][ii - MLO_LRN_KERNEL_SZ0 + 1] += - accum[jj - MLO_LRN_KERNEL_SZ1][ii - MLO_LRN_KERNEL_SZ0 + 1]; -#if MLO_LRN_N_VERT_OUT_PIX > 1 - accum[jj - MLO_LRN_KERNEL_SZ1 + 1][ii - MLO_LRN_KERNEL_SZ0 + 1] -= - partial_sum_xy[jj - MLO_LRN_KERNEL_SZ1][ii - MLO_LRN_KERNEL_SZ0 + 1]; -#endif - } -#if MLO_LRN_N_VERT_OUT_PIX > 1 - accum[jj - MLO_LRN_KERNEL_SZ1 + 1][0] -= partial_sum_xy[jj - MLO_LRN_KERNEL_SZ1][0]; -#endif - accum[jj - MLO_LRN_KERNEL_SZ1 + 1][0] += accum[jj - MLO_LRN_KERNEL_SZ1][0]; - } - - // normalization - _FLOAT prv_scale[MLO_LRN_N_VERT_OUT_PIX][MLO_LRN_N_HORIZ_OUT_PIX]; - _FLOAT adj_alphaoverarea = alphaoverarea; - for(int k = 0; k < MLO_LRN_N_VERT_OUT_PIX; k++) - { - - // int hstart = y + lcl_id1 * MLO_LRN_N_VERT_OUT_PIX + k - - // MLO_LRN_PAD1; - // int hend = min(hstart + MLO_LRN_KERNEL_SZ, MLO_LRN_BOT_HEIGHT + - // MLO_LRN_PAD1); - - for(int l = 0; l < MLO_LRN_N_HORIZ_OUT_PIX; l++) - { - - // int wstart = x + lcl_id0 * MLO_LRN_N_HORIZ_OUT_PIX + l - - // MLO_LRN_PAD0; - // int wend = min(wstart + MLO_LRN_KERNEL_SZ, MLO_LRN_BOT_WIDTH - //+ - // MLO_LRN_PAD0); - // int adj_area_size = (hend - hstart) * (wend - wstart); - // adj_alphaoverarea = alpha / adj_area_size; - - prv_scale[k][l] = K + accum[k][l] * adj_alphaoverarea; - } - } - - int top_off = b * MLO_LRN_TOP_BATCH_STRIDE + o * MLO_LRN_TOP_CHANNEL_STRIDE + - top_y * MLO_LRN_TOP_STRIDE + top_x; -#if MLO_LRN_DO_SCALE - int scale_off = b * MLO_LRN_SCALE_BATCH_STRIDE + o * MLO_LRN_SCALE_CHANNEL_STRIDE + - top_y * MLO_LRN_SCALE_STRIDE + top_x; -#endif - - // final output - - for(int k = 0; k < MLO_LRN_N_VERT_OUT_PIX -#if MLO_OUT_VERT_ALIGNED == 0 - && (top_y + k < MLO_LRN_TOP_HEIGHT) -#endif - ; - k++) - { - for(int l = 0; l < MLO_LRN_N_HORIZ_OUT_PIX -#if MLO_OUT_HORIZ_ALIGNED == 0 - && (top_x + l < MLO_LRN_TOP_WIDTH) -#endif - ; - l++) - { - _FLOAT s; - s = exp((_FLOAT)-beta * log(prv_scale[k][l])); - // s = pow(prv_scale[k][l], -beta); - _FLOAT bot_val = bot_data[lcl_off + mad24((k + MLO_LRN_PRE_PAD1), - (int)MLO_LRN_LCL_DATA_WIDTH, - (l + MLO_LRN_PRE_PAD0))]; -#if MLO_LRN_DO_SCALE - scale[scale_off + k * MLO_LRN_SCALE_STRIDE + l] = prv_scale[k][l]; -#endif - top[top_off + k * MLO_LRN_TOP_STRIDE + l] = bot_val * s; - } - } -} - -#if(MLO_LRN_N_INPUTS < MLO_LRN_KERNEL_SZ) -#define MLO_LOW_CHNL_COUNT 1 -#else -#define MLO_LOW_CHNL_COUNT 0 -#endif -__attribute__((reqd_work_group_size(MLO_LRN_GROUP_SZ0, MLO_LRN_GROUP_SZ1, MLO_LRN_GROUP_SZ2))) -__kernel void -MIOpenLRNAcrossChannels4(const __global _FLOAT* bottom, - __global _FLOAT* top, -#if MLO_LRN_DO_SCALE - __global _FLOAT* scale, -#endif - _FLOAT alphaoverarea, - UNUSED _FLOAT alpha, - _FLOAT beta, - _FLOAT K) -{ - int pix_id = get_global_id(0); // - int b = get_global_id(2); // batch - MLO_READ_TYPE accum = 0; - MLO_READ_TYPE bot_in2[MLO_LRN_KERNEL_SZ]; - MLO_READ_TYPE bot_in[MLO_LRN_KERNEL_SZ]; - int c_i = 0, c_o = 0; - for(int i = 0; i < MLO_LRN_KERNEL_SZ; ++i) - { - bot_in2[i] = 0; - bot_in[i] = 0; - } - - int top_off = 0; -#if MLO_LRN_DO_SCALE - int scale_off; -#endif - - for(c_i = 0; c_i < MLO_LRN_PAD; c_i++) - { - MLO_READ_TYPE prv_in; - prv_in = 0; - -#if MLO_LOW_CHNL_COUNT == 1 - if(c_i < MLO_LRN_N_INPUTS) -#endif - { -#if MLO_C1x1_PIXLEFT > 0 - // if the last one - if(pix_id == MLO_MAP_SZ4 - 1) - { - - for(int j = 0; j < MLO_C1x1_PIXLEFT; ++j) - { - ((_FLOAT*)&prv_in)[j] = - bottom[MLO_LRN_BOT_BATCH_STRIDE * b + MLO_LRN_BOT_CHANNEL_STRIDE * c_i + - (pix_id * MLO_READ_UNIT) + j]; - } - } - else -#endif - { - prv_in = *(__global MLO_READ_TYPE*)&bottom[MLO_LRN_BOT_BATCH_STRIDE * b + - MLO_LRN_BOT_CHANNEL_STRIDE * c_i + - (pix_id * MLO_READ_UNIT)]; - } - } - - bot_in2[c_i] = prv_in * prv_in; - bot_in[c_i] = prv_in; - accum = accum + bot_in2[c_i]; - // fma(bot_in2[c_i + MLO_LRN_PAD], bot_in2[c_i + MLO_LRN_PAD], - // accum); - } - - for(; c_i < MLO_LRN_KERNEL_SZ; c_i++, c_o++) - { - MLO_READ_TYPE prv_in; - prv_in = 0; - -#if MLO_LOW_CHNL_COUNT == 1 - if(c_i < MLO_LRN_N_INPUTS) -#endif - { - -#if MLO_C1x1_PIXLEFT > 0 - // if the last one - if(pix_id == MLO_MAP_SZ4 - 1) - { - - for(int j = 0; j < MLO_C1x1_PIXLEFT; ++j) - { - ((_FLOAT*)&prv_in)[j] = - bottom[MLO_LRN_BOT_BATCH_STRIDE * b + MLO_LRN_BOT_CHANNEL_STRIDE * c_i + - (pix_id * MLO_READ_UNIT) + j]; - } - } - else -#endif - { - prv_in = *(__global MLO_READ_TYPE*)&bottom[MLO_LRN_BOT_BATCH_STRIDE * b + - MLO_LRN_BOT_CHANNEL_STRIDE * c_i + - (pix_id * MLO_READ_UNIT)]; - } - } - - bot_in2[c_i] = prv_in * prv_in; - bot_in[c_i] = prv_in; - accum = accum + bot_in2[c_i]; - - top_off = b * MLO_LRN_TOP_BATCH_STRIDE + c_o * MLO_LRN_TOP_CHANNEL_STRIDE + - (pix_id * MLO_READ_UNIT); -#if MLO_LRN_DO_SCALE - scale_off = b * MLO_LRN_SCALE_BATCH_STRIDE + c_o * MLO_LRN_SCALE_CHANNEL_STRIDE + - (pix_id * MLO_READ_UNIT); -#endif - MLO_READ_TYPE prv_scale = ((MLO_READ_TYPE)K + accum * (MLO_READ_TYPE)alphaoverarea); - // fma(accum,alphaoverarea, (_FLOAT)1.f); - - MLO_READ_TYPE exp_scale = exp((MLO_READ_TYPE)-beta * log(prv_scale)); - // pow(prv_scale,-beta); - // bug - // MLO_READ_TYPE prv_out = sqrt(bot_in2[c_o]); - MLO_READ_TYPE prv_out = bot_in[c_o]; - MLO_READ_TYPE out_val = prv_out * exp_scale; -#if MLO_LOW_CHNL_COUNT == 1 - if(c_o < MLO_LRN_N_OUTPUTS) -#endif - { - -#if MLO_C1x1_PIXLEFT > 0 - - // if the last one - if(pix_id == MLO_MAP_SZ4 - 1) - { - for(int j = 0; j < MLO_C1x1_PIXLEFT; ++j) - { - top[top_off + j] = ((_FLOAT*)&out_val)[j]; -#if DBG_OUT - printf("K:o0: %d %f %f %f %f %f\n", - top_off + j, - top[top_off + j], - ((_FLOAT*)&prv_out)[j], - ((_FLOAT*)&exp_scale)[j], - ((_FLOAT*)&prv_scale)[j], - ((_FLOAT*)&accum)[j]); -#endif - -#if MLO_LRN_DO_SCALE - scale[scale_off + j] = ((_FLOAT*)&prv_scale)[j]; -#endif - } - } - else -#endif - { - - *((__global MLO_READ_TYPE*)&top[top_off]) = out_val; -#if MLO_LRN_DO_SCALE - *((__global MLO_READ_TYPE*)&scale[scale_off]) = prv_scale; -#endif - } - } - } - - for(; c_i < MLO_LRN_N_INPUTS; c_i++, c_o++) - { - - MLO_READ_TYPE prv_in; - prv_in = 0; - -#if MLO_C1x1_PIXLEFT > 0 - // if the last one - if(pix_id == MLO_MAP_SZ4 - 1) - { - - for(int j = 0; j < MLO_C1x1_PIXLEFT; ++j) - { - ((_FLOAT*)&prv_in)[j] = - bottom[MLO_LRN_BOT_BATCH_STRIDE * b + MLO_LRN_BOT_CHANNEL_STRIDE * c_i + - (pix_id * MLO_READ_UNIT) + j]; - } - } - else -#endif - { - prv_in = *(__global MLO_READ_TYPE*)&bottom[MLO_LRN_BOT_BATCH_STRIDE * b + - MLO_LRN_BOT_CHANNEL_STRIDE * c_i + - (pix_id * MLO_READ_UNIT)]; - } - - MLO_READ_TYPE prv_bot_in2 = prv_in * prv_in; - accum = accum + prv_bot_in2; - - accum = accum - bot_in2[0]; - // fma(-bot_in2[0], bot_in2[0], accum); - - for(int i = 0; i < MLO_LRN_KERNEL_SZ - 1; i++) - { - bot_in2[i] = bot_in2[i + 1]; - bot_in[i] = bot_in[i + 1]; - } - - bot_in2[MLO_LRN_KERNEL_SZ - 1] = prv_bot_in2; - bot_in[MLO_LRN_KERNEL_SZ - 1] = prv_in; - - top_off = b * MLO_LRN_TOP_BATCH_STRIDE + c_o * MLO_LRN_TOP_CHANNEL_STRIDE + - (pix_id * MLO_READ_UNIT); -#if MLO_LRN_DO_SCALE - scale_off = b * MLO_LRN_SCALE_BATCH_STRIDE + c_o * MLO_LRN_SCALE_CHANNEL_STRIDE + - (pix_id * MLO_READ_UNIT); -#endif - MLO_READ_TYPE prv_scale = ((MLO_READ_TYPE)K + accum * (MLO_READ_TYPE)alphaoverarea); - // fma(accum,alphaoverarea, (_FLOAT)1.f); - - MLO_READ_TYPE exp_scale = exp((MLO_READ_TYPE)-beta * log(prv_scale)); - // pow(prv_scale,-beta); - // bug - // MLO_READ_TYPE prv_out = sqrt(bot_in2[MLO_LRN_PRE_PAD]); - MLO_READ_TYPE prv_out = bot_in[MLO_LRN_PRE_PAD]; - MLO_READ_TYPE out_val = prv_out * exp_scale; - -#if MLO_LOW_CHNL_COUNT == 1 - if(c_o < MLO_LRN_N_OUTPUTS) -#endif - { - -#if MLO_C1x1_PIXLEFT > 0 - - // if the last one - if(pix_id == MLO_MAP_SZ4 - 1) - { - for(int j = 0; j < MLO_C1x1_PIXLEFT; ++j) - { - top[top_off + j] = ((_FLOAT*)&out_val)[j]; -#if DBG_OUT - printf("K:o1: %d %f %f %f\n", - top_off + j, - top[top_off + j], - ((_FLOAT*)&prv_out)[j], - ((_FLOAT*)&exp_scale)[j]); -#endif - -#if MLO_LRN_DO_SCALE - scale[scale_off + j] = ((_FLOAT*)&prv_scale)[j]; -#endif - } - } - else -#endif - { - - *((__global MLO_READ_TYPE*)&top[top_off]) = out_val; -#if MLO_LRN_DO_SCALE - *((__global MLO_READ_TYPE*)&scale[scale_off]) = prv_scale; -#endif - } - } - } - - for(; c_i < MLO_LRN_N_INPUTS + MLO_LRN_PAD; c_i++, c_o++) - { - - accum = accum - bot_in2[0]; - // fma(-bot_in2[0], bot_in2[0], accum); - - for(int i = 0; i < MLO_LRN_KERNEL_SZ - 1; i++) - { - bot_in2[i] = bot_in2[i + 1]; - bot_in[i] = bot_in[i + 1]; - } - - top_off = b * MLO_LRN_TOP_BATCH_STRIDE + c_o * MLO_LRN_TOP_CHANNEL_STRIDE + - (pix_id * MLO_READ_UNIT); -#if MLO_LRN_DO_SCALE - scale_off = b * MLO_LRN_SCALE_BATCH_STRIDE + c_o * MLO_LRN_SCALE_CHANNEL_STRIDE + - (pix_id * MLO_READ_UNIT); -#endif - MLO_READ_TYPE prv_scale = ((MLO_READ_TYPE)K + accum * (MLO_READ_TYPE)alphaoverarea); - // fma(accum,alphaoverarea, (_FLOAT)1.f); - - MLO_READ_TYPE exp_scale = exp((MLO_READ_TYPE)-beta * log(prv_scale)); - // pow(prv_scale,-beta); - // bug - // MLO_READ_TYPE prv_out = sqrt(bot_in2[MLO_LRN_PRE_PAD]); - MLO_READ_TYPE prv_out = bot_in[MLO_LRN_PRE_PAD]; - - MLO_READ_TYPE out_val = prv_out * exp_scale; -#if MLO_LOW_CHNL_COUNT == 1 - if(c_o < MLO_LRN_N_OUTPUTS) -#endif - { - -#if MLO_C1x1_PIXLEFT > 0 - - // if the last one - if(pix_id == MLO_MAP_SZ4 - 1) - { - for(int j = 0; j < MLO_C1x1_PIXLEFT; ++j) - { - top[top_off + j] = ((_FLOAT*)&out_val)[j]; -#if DBG_OUT - printf("K:o2: %d %f %f %f\n", - top_off + j, - top[top_off + j], - ((_FLOAT*)&prv_out)[j], - ((_FLOAT*)&exp_scale)[j]); -#endif - -#if MLO_LRN_DO_SCALE - scale[scale_off + j] = ((_FLOAT*)&prv_scale)[j]; -#endif - } - } - else -#endif - { - - *((__global MLO_READ_TYPE*)&top[top_off]) = out_val; -#if MLO_LRN_DO_SCALE - *((__global MLO_READ_TYPE*)&scale[scale_off]) = prv_scale; -#endif - } - } - } -} diff --git a/projects/miopen/src/kernels/MIOpenLRNFwd.cpp b/projects/miopen/src/kernels/MIOpenLRNFwd.cpp new file mode 100644 index 000000000000..084745d7a76d --- /dev/null +++ b/projects/miopen/src/kernels/MIOpenLRNFwd.cpp @@ -0,0 +1,545 @@ +// Copyright © Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#ifndef MIOPEN_HIP_RUNTIME_COMPILE +#include +#include +#endif + +#include "miopen_math.hpp" +#include "hip_math_ops.hpp" + +using ReadType = typename miopen::mapped_vector_type::type; + +constexpr unsigned group_size_z = 1u; +constexpr unsigned left_pad0 = ((PRE_PAD0 + READ_UNIT - 1) / READ_UNIT) * READ_UNIT; +constexpr unsigned right_side = + ((GROUP_SIZE_X * HORIZ_OUT_PIX + PAD0 + READ_UNIT - 1) / READ_UNIT) * READ_UNIT; +constexpr unsigned data_width = left_pad0 + right_side; +constexpr unsigned local_read4 = data_width / READ_UNIT; +constexpr unsigned local_data_height = GROUP_SIZE_Y * VERT_OUT_PIX + KERNEL_SIZE - 1; +constexpr bool low_channel_count = N_INPUTS < KERNEL_SIZE; + +__launch_bounds__(GROUP_SIZE_X* GROUP_SIZE_Y* group_size_z) extern "C" __global__ + void MIOpenLRNWithinChannel_PS(const FLOAT* bot, + FLOAT* top, +#if DO_SCALE + FLOAT* scale, +#endif + FLOAT alphaoverarea, + FLOAT beta, + FLOAT K) +{ + // Taken from POOLING AVE with stride = 1 + __shared__ FLOAT bot_data[data_width * local_data_height]; + const int x = blockIdx.x * GROUP_SIZE_X * HORIZ_OUT_PIX; + const int y = blockIdx.y * GROUP_SIZE_Y * VERT_OUT_PIX; + const int lcl_id0 = threadIdx.x; + const int lcl_id1 = threadIdx.y; + const int ob = blockIdx.z * group_size_z + threadIdx.z; // output * batch_sz + const int o = iDiv(ob, BATCH_SIZE); + const int b = iMod(ob, o, BATCH_SIZE); + const int bot_x = x; + const int bot_y = y; + const int bot_off = b * BOT_BATCH_STRIDE + o * BOT_CHANNEL_STRIDE; + + // load tile + for(int b_j = lcl_id1; b_j < local_data_height; b_j += GROUP_SIZE_Y) + { + const int bot_y_act = bot_y + b_j - PRE_PAD1; + const bool invisibleY = (bot_y_act < 0) || (bot_y_act >= BOT_HEIGHT); + const int bot_y_off = bot_y_act * BOT_STRIDE; + const int lcl_off_v = __mul24(b_j, (int)data_width); + + for(int b_i = lcl_id0; b_i < local_read4; b_i += GROUP_SIZE_X) + { + const int bot_x_act = bot_x + (b_i * READ_UNIT) - left_pad0; + for(int i = 0; i < READ_UNIT; ++i) + { + int bot_off_x = bot_off + bot_y_off + bot_x_act + i; + const bool invisibleX = (bot_x_act + i < 0) || (bot_x_act + i >= BOT_WIDTH); + bot_off_x = (invisibleX || invisibleY) ? 0 : bot_off_x; + + FLOAT bot_val = bot[bot_off_x]; + bot_val = (invisibleX || invisibleY) ? 0 : bot_val; + + bot_data[lcl_off_v + (b_i * READ_UNIT) + i] = bot_val; + } + } + } + + __syncthreads(); +#if HORIZ_OUT_PIX > 1 + FLOAT partial_sum_x[HORIZ_OUT_PIX - 1]; // horizontal partial sum +#endif +#if VERT_OUT_PIX > 1 + FLOAT partial_sum_xy[VERT_OUT_PIX - 1][HORIZ_OUT_PIX]; // horizontal-vertical partial sums. +#endif + FLOAT accum[VERT_OUT_PIX][HORIZ_OUT_PIX]; // accumulator + + const int top_y = __mul24(lcl_id1, VERT_OUT_PIX) + y; + const int top_x = __mul24(lcl_id0, HORIZ_OUT_PIX) + x; + + const int lcl_y = __mul24(lcl_id1, VERT_OUT_PIX); + const int lcl_x = __mul24(lcl_id0, HORIZ_OUT_PIX) + (left_pad0 - PRE_PAD0); + const int lcl_off = __mul24(lcl_y, data_width) + lcl_x; + + for(int j = 0; j < VERT_OUT_PIX; ++j) + { + for(int i = 0; i < HORIZ_OUT_PIX; ++i) + { + accum[j][i] = 0; + } + } + +#if VERT_OUT_PIX > 1 + for(int j = 0; j < VERT_OUT_PIX - 1; ++j) + { + for(int i = 0; i < HORIZ_OUT_PIX; ++i) + { + partial_sum_xy[j][i] = 0; + } + } +#endif + + // running window summation + int jj = 0; + int ii = 0; + + // first to get vertical partial sums +#if VERT_OUT_PIX > 1 + for(; jj < (VERT_OUT_PIX - 1); ++jj) + { + for(ii = 0; ii < (HORIZ_OUT_PIX - 1); ++ii) + { + const FLOAT bot_val = bot_data[lcl_off + jj * data_width + ii]; + const FLOAT accum_tmp = bot_val * bot_val; + +#if HORIZ_OUT_PIX > 1 + // save horizontal partial sums + partial_sum_x[ii] = accum_tmp; +#endif + // accumulate in vert-horizontal(0) + partial_sum_xy[jj][0] += accum_tmp; + } + + for(; ii < KERNEL_SIZE0; ++ii) + { + const FLOAT bot_val = bot_data[lcl_off + jj * data_width + ii]; + const FLOAT accum_tmp = bot_val * bot_val; + // accumulate in vert horizontal(0) + partial_sum_xy[jj][0] += accum_tmp; + } + + // running horizontal window + for(; ii < (KERNEL_SIZE0 + HORIZ_OUT_PIX - 1); ++ii) + { + const FLOAT bot_val = bot_data[lcl_off + jj * data_width + ii]; + const FLOAT accum_tmp = bot_val * bot_val; + // calculate all vertical-horizontal partial sums + partial_sum_xy[jj][ii - KERNEL_SIZE0 + 1] = + partial_sum_xy[jj][ii - KERNEL_SIZE0] + (accum_tmp +#if HORIZ_OUT_PIX > 1 + - partial_sum_x[ii - KERNEL_SIZE0] +#endif + ); + } + + // put into accumulator[0][i] + // whatever has been accumulated so far + for(int i = 0; i < HORIZ_OUT_PIX; ++i) + { + accum[0][i] += partial_sum_xy[jj][i]; + } + } +#endif + + FLOAT mov_accum; + // calculate row 0 accumulators + for(; jj < KERNEL_SIZE1; ++jj) + { + mov_accum = 0; + + for(ii = 0; ii < (HORIZ_OUT_PIX - 1); ++ii) + { + const FLOAT bot_val = bot_data[lcl_off + jj * data_width + ii]; + const FLOAT accum_tmp = bot_val * bot_val; +#if HORIZ_OUT_PIX > 1 + partial_sum_x[ii] = accum_tmp; +#endif + mov_accum += accum_tmp; + } + + for(; ii < KERNEL_SIZE0; ++ii) + { + const FLOAT bot_val = bot_data[lcl_off + jj * data_width + ii]; + const FLOAT accum_tmp = bot_val * bot_val; + mov_accum += accum_tmp; + } + + accum[0][0] += mov_accum; + // running horizontal window + for(; ii < (KERNEL_SIZE0 + HORIZ_OUT_PIX - 1); ++ii) + { + const FLOAT bot_val = bot_data[lcl_off + jj * data_width + ii]; + const FLOAT accum_tmp = bot_val * bot_val; + // running horizontal window + mov_accum += (accum_tmp +#if HORIZ_OUT_PIX > 1 + - partial_sum_x[ii - KERNEL_SIZE0] +#endif + ); + accum[0][ii - KERNEL_SIZE0 + 1] += mov_accum; + } + } + + // accumulate all other rows besides 0 + for(; jj < (KERNEL_SIZE1 + VERT_OUT_PIX - 1); ++jj) + { + // first running horizontal winodw as before + mov_accum = 0; + for(ii = 0; ii < (HORIZ_OUT_PIX - 1); ++ii) + { + const FLOAT bot_val = bot_data[lcl_off + jj * data_width + ii]; + const FLOAT accum_tmp = bot_val * bot_val; +#if HORIZ_OUT_PIX > 1 + partial_sum_x[ii] = accum_tmp; +#endif + accum[jj - KERNEL_SIZE1 + 1][0] += accum_tmp; + } + for(; ii < KERNEL_SIZE0; ++ii) + { + const FLOAT bot_val = bot_data[lcl_off + jj * data_width + ii]; + const FLOAT accum_tmp = bot_val * bot_val; + accum[jj - KERNEL_SIZE1 + 1][0] += accum_tmp; + } + // running horizontal window + int ii1 = ii; + for(; ii < (KERNEL_SIZE0 + HORIZ_OUT_PIX - 1); ++ii) + { + const FLOAT bot_val = bot_data[lcl_off + jj * data_width + ii]; + const FLOAT accum_tmp = bot_val * bot_val; + + accum[jj - KERNEL_SIZE1 + 1][ii - KERNEL_SIZE0 + 1] = + accum[jj - KERNEL_SIZE1 + 1][ii - KERNEL_SIZE0] + accum_tmp; +#if HORIZ_OUT_PIX > 1 + accum[jj - KERNEL_SIZE1 + 1][ii - KERNEL_SIZE0 + 1] -= partial_sum_x[ii - KERNEL_SIZE0]; +#endif + } + + // finally running vertical window + for(ii = ii1; ii < (KERNEL_SIZE0 + HORIZ_OUT_PIX - 1); ++ii) + { + // finish horizontal summation + // add/substarct vertical patial sum + accum[jj - KERNEL_SIZE1 + 1][ii - KERNEL_SIZE0 + 1] += + accum[jj - KERNEL_SIZE1][ii - KERNEL_SIZE0 + 1]; +#if VERT_OUT_PIX > 1 + accum[jj - KERNEL_SIZE1 + 1][ii - KERNEL_SIZE0 + 1] -= + partial_sum_xy[jj - KERNEL_SIZE1][ii - KERNEL_SIZE0 + 1]; +#endif + } +#if VERT_OUT_PIX > 1 + accum[jj - KERNEL_SIZE1 + 1][0] -= partial_sum_xy[jj - KERNEL_SIZE1][0]; +#endif + accum[jj - KERNEL_SIZE1 + 1][0] += accum[jj - KERNEL_SIZE1][0]; + } + + // normalization + FLOAT prv_scale[VERT_OUT_PIX][HORIZ_OUT_PIX]; + const FLOAT adj_alphaoverarea = alphaoverarea; + for(int k = 0; k < VERT_OUT_PIX; k++) + { + for(int l = 0; l < HORIZ_OUT_PIX; l++) + { + prv_scale[k][l] = K + accum[k][l] * adj_alphaoverarea; + } + } + + const int top_off = b * TOP_BATCH_STRIDE + o * TOP_CHANNEL_STRIDE + top_y * TOP_STRIDE + top_x; +#if DO_SCALE + int scale_off = + b * SCALE_BATCH_STRIDE + o * SCALE_CHANNEL_STRIDE + top_y * SCALE_STRIDE + top_x; +#endif + + /* + The HIP compiler doesn't automatically unroll this nested loop so we need to + use pragma unroll to encourage that for better performance. Additionally, when access is not + aligned in the horizontal or vertical access we lift the if condition out of the loop + termination if the height/width is a multiple of the vert/horiz pixels, enabling SIMD + vectorizatiom of the loop body. + These optimizations were being done automatically by the OpenCL compiler prior to porting to + HIP. + */ + constexpr bool is_vert_aligned = (VERT_ALIGNED == 1); + constexpr bool is_horiz_aligned = (HORIZ_ALIGNED == 1); + constexpr bool sink_vert_align_check = !is_vert_aligned && ((TOP_HEIGHT % VERT_OUT_PIX) != 0); + constexpr bool sink_horiz_align_check = !is_horiz_aligned && ((TOP_WIDTH % HORIZ_OUT_PIX) != 0); + constexpr bool lift_vert_align_check = !is_vert_aligned && ((TOP_HEIGHT % VERT_OUT_PIX) == 0); + constexpr bool lift_horiz_align_check = !is_horiz_aligned && ((TOP_WIDTH % HORIZ_OUT_PIX) == 0); + + // final output + if(!lift_vert_align_check || (lift_vert_align_check && (top_y < TOP_HEIGHT))) + { +#pragma unroll VERT_OUT_PIX + for(int k = 0; k < VERT_OUT_PIX; k++) + { + if(sink_vert_align_check && !(top_y + k < TOP_HEIGHT)) + { + break; + } + + if(!lift_horiz_align_check || (lift_horiz_align_check && (top_x < TOP_WIDTH))) + { +#pragma unroll HORIZ_OUT_PIX + for(int l = 0; l < HORIZ_OUT_PIX; l++) + { + if(sink_horiz_align_check && !(top_x + l < TOP_WIDTH)) + { + break; + } + const FLOAT s = + miopen::detail::exp(FLOAT(-beta) * miopen::detail::log(prv_scale[k][l])); + int offset = __mul24((k + PRE_PAD1), data_width) + (l + PRE_PAD0); + FLOAT bot_val = bot_data[lcl_off + offset]; +#if DO_SCALE + scale[scale_off + k * SCALE_STRIDE + l] = prv_scale[k][l]; +#endif + top[top_off + k * TOP_STRIDE + l] = bot_val * s; + } + } + } + } +} + +__launch_bounds__(GROUP_SIZE_X* GROUP_SIZE_Y* group_size_z) extern "C" __global__ + void MIOpenLRNAcrossChannels4(const FLOAT* bottom, + FLOAT* top, +#if DO_SCALE + FLOAT* scale, +#endif + FLOAT alphaoverarea, + FLOAT beta, + FLOAT K) +{ + const int pix_id = blockIdx.x * GROUP_SIZE_X + threadIdx.x; + const int batch = blockIdx.z * group_size_z + threadIdx.z; + ReadType accum(0); + ReadType bot_in2[KERNEL_SIZE]; + ReadType bot_in[KERNEL_SIZE]; + for(int i = 0; i < KERNEL_SIZE; ++i) + { + bot_in2[i] = ReadType(0); + bot_in[i] = ReadType(0); + } + + int top_off = 0; +#if DO_SCALE + int scale_off; +#endif + + int c_i = 0, c_o = 0; // accumulated throughout the kernel + for(; c_i < PAD; c_i++) + { + ReadType prv_in(0); + if(!low_channel_count || c_i < N_INPUTS) + { + const auto offset = + BOT_BATCH_STRIDE * batch + BOT_CHANNEL_STRIDE * c_i + (pix_id * READ_UNIT); + // if the last one + if(C1x1_PIXLEFT > 0 && (pix_id == MAP_SZ_4 - 1)) + { + for(int j = 0; j < C1x1_PIXLEFT; ++j) + { + FLOAT* prv_in_as_scalar = reinterpret_cast(&prv_in); + prv_in_as_scalar[j] = bottom[offset + j]; + } + } + else + { + const FLOAT* bottom_offset = bottom + offset; + prv_in = *reinterpret_cast(bottom_offset); + } + } + + bot_in2[c_i] = prv_in * prv_in; + bot_in[c_i] = prv_in; + accum = accum + bot_in2[c_i]; + } + + for(; c_i < KERNEL_SIZE; c_i++, c_o++) + { + ReadType prv_in(0); + if(!low_channel_count || c_i < N_INPUTS) + { + const auto offset = + BOT_BATCH_STRIDE * batch + BOT_CHANNEL_STRIDE * c_i + (pix_id * READ_UNIT); + if(C1x1_PIXLEFT > 0 && (pix_id == MAP_SZ_4 - 1)) + { + for(int j = 0; j < C1x1_PIXLEFT; ++j) + { + FLOAT* prv_in_as_scalar = reinterpret_cast(&prv_in); + prv_in_as_scalar[j] = bottom[offset + j]; + } + } + else + { + const FLOAT* bottom_offset = bottom + offset; + prv_in = *reinterpret_cast(bottom_offset); + } + } + + bot_in2[c_i] = prv_in * prv_in; + bot_in[c_i] = prv_in; + accum = accum + bot_in2[c_i]; + + top_off = batch * TOP_BATCH_STRIDE + c_o * TOP_CHANNEL_STRIDE + (pix_id * READ_UNIT); +#if DO_SCALE + scale_off = batch * SCALE_BATCH_STRIDE + c_o * SCALE_CHANNEL_STRIDE + (pix_id * READ_UNIT); +#endif + const ReadType prv_scale = (ReadType(K) + accum * ReadType(alphaoverarea)); + const ReadType prv_scale_log = miopen::log(prv_scale); + const ReadType exp_scale = miopen::exp(ReadType(-beta) * prv_scale_log); + const ReadType prv_out = bot_in[c_o]; + const ReadType out_val = prv_out * exp_scale; + if(!low_channel_count || c_o < N_INPUTS) + { + // if the last one + if(C1x1_PIXLEFT > 0 && (pix_id == MAP_SZ_4 - 1)) + { + for(int j = 0; j < C1x1_PIXLEFT; ++j) + { + const FLOAT* out_val_as_scalar = reinterpret_cast(&out_val); + top[top_off + j] = out_val_as_scalar[j]; +#if DO_SCALE + const FLOAT* prv_scale_as_scalar = reinterpret_cast(&prv_scale); + scale[scale_off + j] = prv_scale_as_scalar[j]; +#endif + } + } + else + { + FLOAT* top_offset = top + top_off; + *reinterpret_cast(top_offset) = out_val; + +#if DO_SCALE + FLOAT* scale_offset = scale + top_off; + *reinterpret_cast(scale_offset) = prv_scale; +#endif + } + } + } + + for(; c_i < N_INPUTS; c_i++, c_o++) + { + ReadType prv_in(0); + auto offset = BOT_BATCH_STRIDE * batch + BOT_CHANNEL_STRIDE * c_i + (pix_id * READ_UNIT); + // if the last one + if(C1x1_PIXLEFT > 0 && (pix_id == MAP_SZ_4 - 1)) + { + for(int j = 0; j < C1x1_PIXLEFT; ++j) + { + FLOAT* prv_in_as_scalar = reinterpret_cast(&prv_in); + prv_in_as_scalar[j] = bottom[offset + j]; + } + } + else + { + const FLOAT* bottom_offset = bottom + offset; + prv_in = *reinterpret_cast(bottom_offset); + } + + const ReadType prv_bot_in2 = prv_in * prv_in; + accum = accum + prv_bot_in2; + accum = accum - bot_in2[0]; + for(int i = 0; i < KERNEL_SIZE - 1; i++) + { + bot_in2[i] = bot_in2[i + 1]; + bot_in[i] = bot_in[i + 1]; + } + + bot_in2[KERNEL_SIZE - 1] = prv_bot_in2; + bot_in[KERNEL_SIZE - 1] = prv_in; + + top_off = batch * TOP_BATCH_STRIDE + c_o * TOP_CHANNEL_STRIDE + (pix_id * READ_UNIT); +#if DO_SCALE + scale_off = batch * SCALE_BATCH_STRIDE + c_o * SCALE_CHANNEL_STRIDE + (pix_id * READ_UNIT); +#endif + const ReadType prv_scale = (ReadType(K) + accum * ReadType(alphaoverarea)); + const ReadType prv_scale_log = miopen::log(prv_scale); + const ReadType exp_scale = miopen::exp(ReadType(-beta) * prv_scale_log); + const ReadType prv_out = bot_in[PRE_PAD]; + const ReadType out_val = prv_out * exp_scale; + if(!low_channel_count || c_o < N_INPUTS) + { + // if the last one + if(C1x1_PIXLEFT > 0 && (pix_id == MAP_SZ_4 - 1)) + { + for(int j = 0; j < C1x1_PIXLEFT; ++j) + { + const FLOAT* out_val_as_scalar = reinterpret_cast(&out_val); + top[top_off + j] = out_val_as_scalar[j]; +#if DO_SCALE + const FLOAT* prv_scale_as_scalar = reinterpret_cast(&prv_scale); + scale[scale_off + j] = prv_scale_as_scalar[j]; +#endif + } + } + else + { + FLOAT* top_offset = top + top_off; + *reinterpret_cast(top_offset) = out_val; + +#if DO_SCALE + FLOAT* scale_offset = scale + top_off; + *reinterpret_cast(scale_offset) = prv_scale; +#endif + } + } + } + + for(; c_i < N_INPUTS + PAD; c_i++, c_o++) + { + accum = accum - bot_in2[0]; + for(int i = 0; i < KERNEL_SIZE - 1; i++) + { + bot_in2[i] = bot_in2[i + 1]; + bot_in[i] = bot_in[i + 1]; + } + + top_off = batch * TOP_BATCH_STRIDE + c_o * TOP_CHANNEL_STRIDE + (pix_id * READ_UNIT); +#if DO_SCALE + scale_off = batch * SCALE_BATCH_STRIDE + c_o * SCALE_CHANNEL_STRIDE + (pix_id * READ_UNIT); +#endif + const ReadType prv_scale = (ReadType(K) + accum * ReadType(alphaoverarea)); + const ReadType prv_scale_log = miopen::log(prv_scale); + const ReadType exp_scale = miopen::exp(ReadType(-beta) * prv_scale_log); + const ReadType prv_out = bot_in[PRE_PAD]; + const ReadType out_val = prv_out * exp_scale; + if(!low_channel_count || c_o < N_INPUTS) + { + // if the last one + if(C1x1_PIXLEFT > 0 && (pix_id == MAP_SZ_4 - 1)) + { + for(int j = 0; j < C1x1_PIXLEFT; ++j) + { + const FLOAT* out_val_as_scalar = reinterpret_cast(&out_val); + top[top_off + j] = out_val_as_scalar[j]; +#if DO_SCALE + const FLOAT* prv_scale_as_scalar = reinterpret_cast(&prv_scale); + scale[scale_off + j] = prv_scale_as_scalar[j]; +#endif + } + } + else + { + FLOAT* top_offset = top + top_off; + *reinterpret_cast(top_offset) = out_val; +#if DO_SCALE + FLOAT* scale_offset = scale + top_off; + *reinterpret_cast(scale_offset) = prv_scale; +#endif + } + } + } +} diff --git a/projects/miopen/src/kernels/hip_math_ops.hpp b/projects/miopen/src/kernels/hip_math_ops.hpp index b9864cda2d7d..7e51f7b59221 100644 --- a/projects/miopen/src/kernels/hip_math_ops.hpp +++ b/projects/miopen/src/kernels/hip_math_ops.hpp @@ -12,3 +12,8 @@ inline __device__ unsigned int iRemquo(unsigned int x, unsigned int y, unsigned k = x / y; return x - __mul24(k, y); } + +// Responsibility of caller to ensure that `u`, `d`, and `u*d` don't exceed the 24-bit range. +inline __device__ unsigned iMod(unsigned v, unsigned u, unsigned d) { return v - __mul24(u, d); } + +inline __device__ unsigned iDiv(unsigned v, unsigned d) { return v / d; } \ No newline at end of file diff --git a/projects/miopen/src/ocl/lrn_ocl.cpp b/projects/miopen/src/ocl/lrn_ocl.cpp index 07fad30e3ce5..7fbcc7ea8aa3 100644 --- a/projects/miopen/src/ocl/lrn_ocl.cpp +++ b/projects/miopen/src/ocl/lrn_ocl.cpp @@ -126,7 +126,6 @@ miopenStatus_t LRNDescriptor::Forward(const Handle& handle, y, workSpace, as_float(f_norm_alphaoverarea), - as_float(f_norm_alpha), as_float(f_norm_beta), as_float(f_norm_K)); } @@ -135,7 +134,6 @@ miopenStatus_t LRNDescriptor::Forward(const Handle& handle, kernels.front()(x, y, as_float(f_norm_alphaoverarea), - as_float(f_norm_alpha), as_float(f_norm_beta), as_float(f_norm_K)); } @@ -143,7 +141,7 @@ miopenStatus_t LRNDescriptor::Forward(const Handle& handle, } else { - const std::string program_name = construct_params.getKernelFile(); // CL kernel filename + const std::string program_name = construct_params.getKernelFile(); // kernel filename const std::string kernel_name = construct_params.getKernelName(); // kernel name const std::string& compiler_parms = construct_params.getCompilerOptions(); // kernel parameters @@ -159,7 +157,6 @@ miopenStatus_t LRNDescriptor::Forward(const Handle& handle, y, workSpace, as_float(f_norm_alphaoverarea), - as_float(f_norm_alpha), as_float(f_norm_beta), as_float(f_norm_K)); } @@ -168,7 +165,6 @@ miopenStatus_t LRNDescriptor::Forward(const Handle& handle, obj(x, y, as_float(f_norm_alphaoverarea), - as_float(f_norm_alpha), as_float(f_norm_beta), as_float(f_norm_K)); } diff --git a/projects/miopen/src/ocl/mloNorm.cpp b/projects/miopen/src/ocl/mloNorm.cpp index fc58e1b4ccc7..946a2c04357d 100644 --- a/projects/miopen/src/ocl/mloNorm.cpp +++ b/projects/miopen/src/ocl/mloNorm.cpp @@ -49,10 +49,8 @@ inline bool is_tensor_packed(int c, int h, int w, int b_str, int c_str, int h_st return h_str == w && c_str == h * h_str && b_str == c * c_str; } -int mlo_construct_norm::mloConstructFwd() +void mlo_construct_norm::mloConstructFwd() { - int ret = 0; - size_t maxComputeUnits = _ctx.GetStream().GetMaxComputeUnits(); _hw_wave_sz = 64; @@ -63,14 +61,6 @@ int mlo_construct_norm::mloConstructFwd() if(pre_pad < 0 || pad < 0) MIOPEN_THROW("Wrong LRN kernel size"); - int top_df_stride = 1; - int top_df_channel_stride = 1; - int top_df_batch_stride = 1; - - int bot_df_stride = 1; - int bot_df_channel_stride = 1; - int bot_df_batch_stride = 1; - _grp_tile0 = (_problem.GetOutWidth() <= 16) ? 8 : 16; _grp_tile1 = 8; _out_pix_tile0 = 1; @@ -83,14 +73,14 @@ int mlo_construct_norm::mloConstructFwd() _problem.GetInChannelStride(), _problem.GetInStride()); - int MAP_SZ4 = _problem.GetInWidth() * (is_in_packed ? _problem.GetInHeight() : 1); + int map_size_4 = _problem.GetInWidth() * (is_in_packed ? _problem.GetInHeight() : 1); int read_unit; if(_norm_region == MLO_LRN_ACROSS_CHANNELS) { _grp_tile0 = (_problem.GetOutWidth() <= 8) ? 8 : 16; _grp_tile1 = (_problem.GetOutHeight() <= 8) ? 8 : 16; - read_unit = (MAP_SZ4 % 4 == 0) ? 4 : (MAP_SZ4 % 2 == 0) ? 2 : 1; - MAP_SZ4 /= read_unit; + read_unit = (map_size_4 % 4 == 0) ? 4 : (map_size_4 % 2 == 0) ? 2 : 1; + map_size_4 /= read_unit; } else { @@ -98,31 +88,26 @@ int mlo_construct_norm::mloConstructFwd() _out_pix_tile0 = (_problem.GetOutWidth() <= 8) ? 1 : 2; _out_pix_tile1 = (_problem.GetOutHeight() <= 8) ? 1 : 2; read_unit = 4; - MAP_SZ4 = (MAP_SZ4 + 3) / 4; + map_size_4 = (map_size_4 + 3) / 4; } - MAP_SZ4 *= (is_in_packed ? 1 : _problem.GetInHeight()); + map_size_4 *= (is_in_packed ? 1 : _problem.GetInHeight()); assert(_out_pix_tile0 - 1 <= _norm_area && _out_pix_tile1 - 1 <= _norm_area); - auto ocl_group_lg2sz0 = - static_cast(ceil(log(static_cast(_out_pix_tile0)) / std::numbers::ln2)); - auto ocl_group_lg2sz1 = - static_cast(ceil(log(static_cast(_out_pix_tile1)) / std::numbers::ln2)); - - _kernel_file = "MIOpenLRNFwd.cl"; + _kernel_file = "MIOpenLRNFwd.cpp"; _kernel_name = (_norm_region == MLO_LRN_ACROSS_CHANNELS) ? "MIOpenLRNAcrossChannels4" : "MIOpenLRNWithinChannel_PS"; if(_norm_region == MLO_LRN_ACROSS_CHANNELS) { _grp_tile0 = 8 * 8; _grp_tile1 = 1; - int n_waves = (_problem.GetBatchSize() * MAP_SZ4 + _hw_wave_sz - 1) / _hw_wave_sz; + int n_waves = (_problem.GetBatchSize() * map_size_4 + _hw_wave_sz - 1) / _hw_wave_sz; if(n_waves <= maxComputeUnits * 8) { - MAP_SZ4 = _problem.GetInWidth() * (is_in_packed ? _problem.GetInHeight() : 1); - read_unit = (MAP_SZ4 % 2 == 0) ? 2 : 1; - MAP_SZ4 /= read_unit; - MAP_SZ4 *= (is_in_packed ? 1 : _problem.GetInHeight()); + map_size_4 = _problem.GetInWidth() * (is_in_packed ? _problem.GetInHeight() : 1); + read_unit = (map_size_4 % 2 == 0) ? 2 : 1; + map_size_4 /= read_unit; + map_size_4 *= (is_in_packed ? 1 : _problem.GetInHeight()); } } @@ -135,8 +120,8 @@ int mlo_construct_norm::mloConstructFwd() { MIOPEN_LOG_I("Workaround for #1057: " << name << ',' << miopen::GetDataTypeName(_problem.GetInDataType()) << ',' - << MAP_SZ4 << ',' << read_unit); - MAP_SZ4 *= read_unit; + << map_size_4 << ',' << read_unit); + map_size_4 *= read_unit; read_unit = 1; } } @@ -151,96 +136,65 @@ int mlo_construct_norm::mloConstructFwd() auto g_wk_height = static_cast((_problem.GetOutHeight() + _grp_tile1 * _out_pix_tile1 - 1) / (_grp_tile1 * _out_pix_tile1)); - int OUT_VERT_ALIGNED = + int out_vert_aligned = (g_wk_height * (_grp_tile1 * _out_pix_tile1) == _problem.GetOutHeight()) ? 1 : 0; - int OUT_HORIZ_ALIGNED = + int out_horiz_aligned = (g_wk_width * (_grp_tile0 * _out_pix_tile0) == _problem.GetOutWidth()) ? 1 : 0; // currently always 1 - int DIVBY4 = (MAP_SZ4 * read_unit == _problem.GetInWidth() * _problem.GetInHeight()) ? 1 : 0; - int C1x1_PIXLEFT = - (DIVBY4 == 1) ? 0 - : _problem.GetInWidth() * _problem.GetInHeight() - (MAP_SZ4 - 1) * read_unit; - - std::string READ_TYPE = - (read_unit == 1) ? "_FLOAT" : "_FLOAT" + std::to_string(static_cast(read_unit)); - + bool div_by_4 = (map_size_4 * read_unit == _problem.GetInWidth() * _problem.GetInHeight()); + int c1x1_pixleft = + div_by_4 ? 0 + : _problem.GetInWidth() * _problem.GetInHeight() - (map_size_4 - 1) * read_unit; _comp_options = - std::string(" -DMLO_LRN_KERNEL_SZ=") + std::to_string(static_cast(_norm_area)) + - std::string(" -DMLO_LRN_PAD=") + std::to_string(static_cast(pad)) + - std::string(" -DMLO_LRN_KERNEL_SZ1=") + std::to_string(static_cast(_norm_area)) + - std::string(" -DMLO_LRN_PAD1=") + std::to_string(static_cast(pad)) + - std::string(" -DMLO_LRN_KERNEL_SZ0=") + std::to_string(static_cast(_norm_area)) + - std::string(" -DMLO_LRN_PAD0=") + std::to_string(static_cast(pad)) + - std::string(" -DMLO_LRN_PRE_PAD=") + std::to_string(static_cast(pre_pad)) + - std::string(" -DMLO_LRN_PRE_PAD1=") + std::to_string(static_cast(pre_pad)) + - std::string(" -DMLO_LRN_PRE_PAD0=") + std::to_string(static_cast(pre_pad)) + - std::string(" -DMLO_LRN_N_OUTPUTS=") + + std::string(" -DKERNEL_SIZE=") + std::to_string(static_cast(_norm_area)) + + std::string(" -DPAD=") + std::to_string(static_cast(pad)) + + std::string(" -DKERNEL_SIZE1=") + std::to_string(static_cast(_norm_area)) + + std::string(" -DPAD0=") + std::to_string(static_cast(pad)) + + std::string(" -DPRE_PAD=") + std::to_string(static_cast(pre_pad)) + + std::string(" -DPRE_PAD1=") + std::to_string(static_cast(pre_pad)) + + std::string(" -DKERNEL_SIZE0=") + std::to_string(static_cast(_norm_area)) + + std::string(" -DPRE_PAD0=") + std::to_string(static_cast(pre_pad)) + + std::string(" -DN_OUTPUTS=") + std::to_string(static_cast(_problem.GetOutChannels())) + - std::string(" -DMLO_LRN_N_INPUTS=") + + std::string(" -DN_INPUTS=") + std::to_string(static_cast(_problem.GetInChannels())) + - std::string(" -DMLO_LRN_N_HORIZ_OUT_PIX=") + - std::to_string(static_cast(_out_pix_tile0)) + - std::string(" -DMLO_LRN_N_VERT_OUT_PIX=") + - std::to_string(static_cast(_out_pix_tile1)) + - std::string(" -DMLO_LRN_GROUP_SZ0=") + std::to_string(static_cast(_grp_tile0)) + - std::string(" -DMLO_LRN_GROUP_SZ1=") + std::to_string(static_cast(_grp_tile1)) + - std::string(" -DMLO_LRN_GROUP_LG2SZ0=") + - std::to_string(static_cast(ocl_group_lg2sz0)) + - std::string(" -DMLO_LRN_GROUP_LG2SZ1=") + - std::to_string(static_cast(ocl_group_lg2sz1)) + - std::string(" -DMLO_LRN_BOT_BATCH_STRIDE=") + + std::string(" -DHORIZ_OUT_PIX=") + std::to_string(static_cast(_out_pix_tile0)) + + std::string(" -DVERT_OUT_PIX=") + std::to_string(static_cast(_out_pix_tile1)) + + std::string(" -DGROUP_SIZE_X=") + std::to_string(static_cast(_grp_tile0)) + + std::string(" -DGROUP_SIZE_Y=") + std::to_string(static_cast(_grp_tile1)) + + std::string(" -DBOT_BATCH_STRIDE=") + std::to_string(static_cast(_problem.GetInBatchStride())) + - std::string(" -DMLO_LRN_BOT_CHANNEL_STRIDE=") + + std::string(" -DBOT_CHANNEL_STRIDE=") + std::to_string(static_cast(_problem.GetInChannelStride())) + - std::string(" -DMLO_LRN_BOT_STRIDE=") + + std::string(" -DBOT_STRIDE=") + std::to_string(static_cast(_problem.GetInStride())) + - std::string(" -DMLO_LRN_TOP_BATCH_STRIDE=") + + std::string(" -DTOP_BATCH_STRIDE=") + std::to_string(static_cast(_problem.GetOutBatchStride())) + - std::string(" -DMLO_LRN_TOP_CHANNEL_STRIDE=") + + std::string(" -DTOP_CHANNEL_STRIDE=") + std::to_string(static_cast(_problem.GetOutChannelStride())) + - std::string(" -DMLO_LRN_TOP_STRIDE=") + + std::string(" -DTOP_STRIDE=") + std::to_string(static_cast(_problem.GetOutStride())) + - std::string(" -DMLO_LRN_BOT_WIDTH=") + + std::string(" -DBOT_WIDTH=") + std::to_string(static_cast(_problem.GetOutWidth())) + - std::string(" -DMLO_LRN_BOT_HEIGHT=") + + std::string(" -DBOT_HEIGHT=") + std::to_string(static_cast(_problem.GetOutHeight())) + - std::string(" -DMLO_LRN_TOP_WIDTH=") + + std::string(" -DTOP_WIDTH=") + std::to_string(static_cast(_problem.GetOutWidth())) + - std::string(" -DMLO_LRN_TOP_HEIGHT=") + + std::string(" -DTOP_HEIGHT=") + std::to_string(static_cast(_problem.GetOutHeight())) + - std::string(" -DMLO_LRN_SCALE_BATCH_STRIDE=") + + std::string(" -DSCALE_BATCH_STRIDE=") + std::to_string(static_cast(scale_batch_stride)) + - std::string(" -DMLO_LRN_SCALE_CHANNEL_STRIDE=") + + std::string(" -DSCALE_CHANNEL_STRIDE=") + std::to_string(static_cast(scale_channel_stride)) + - std::string(" -DMLO_LRN_SCALE_STRIDE=") + - std::to_string(static_cast(scale_stride)) + - std::string(" -DMLO_LRN_TOPDF_BATCH_STRIDE=") + - std::to_string(static_cast(top_df_batch_stride)) + - std::string(" -DMLO_LRN_TOPDF_CHANNEL_STRIDE=") + - std::to_string(static_cast(top_df_channel_stride)) + - std::string(" -DMLO_LRN_TOPDF_STRIDE=") + - std::to_string(static_cast(top_df_stride)) + - std::string(" -DMLO_LRN_BOTDF_BATCH_STRIDE=") + - std::to_string(static_cast(bot_df_batch_stride)) + - std::string(" -DMLO_LRN_BOTDF_CHANNEL_STRIDE=") + - std::to_string(static_cast(bot_df_channel_stride)) + - std::string(" -DMLO_LRN_BOTDF_STRIDE=") + - std::to_string(static_cast(bot_df_stride)) + - std::string(" -DMLO_LRN_BATCH_SZ=") + + std::string(" -DSCALE_STRIDE=") + std::to_string(static_cast(scale_stride)) + + std::string(" -DBATCH_SIZE=") + std::to_string(static_cast(_problem.GetBatchSize())) + - std::string(" -DMLO_LRN_N_INPUTS=") + - std::to_string(static_cast(_problem.GetInChannels())) + - std::string(" -DMLO_LRN_N_OUTPUTS=") + - std::to_string(static_cast(_problem.GetOutChannels())) + - std::string(" -DMLO_LRN_DO_SCALE=") + std::to_string(static_cast(scale)) + - std::string(" -DMLO_OUT_VERT_ALIGNED=") + - std::to_string(static_cast(OUT_VERT_ALIGNED)) + - std::string(" -DMLO_OUT_HORIZ_ALIGNED=") + - std::to_string(static_cast(OUT_HORIZ_ALIGNED)) + std::string(" -DMLO_MAP_SZ4=") + - std::to_string(static_cast(MAP_SZ4)) + std::string(" -DMLO_C1x1_PIXLEFT=") + - std::to_string(static_cast(C1x1_PIXLEFT)) + std::string(" -DMLO_DIVBY4=") + - std::to_string(static_cast(DIVBY4)) + std::string(" -DMLO_READ_TYPE=") + - READ_TYPE + std::string(" -DMLO_READ_UNIT=") + + std::string(" -DDO_SCALE=") + std::to_string(static_cast(scale)) + + std::string(" -DVERT_ALIGNED=") + std::to_string(static_cast(out_vert_aligned)) + + std::string(" -DHORIZ_ALIGNED=") + + std::to_string(static_cast(out_horiz_aligned)) + std::string(" -DMAP_SZ_4=") + + std::to_string(static_cast(map_size_4)) + std::string(" -DC1x1_PIXLEFT=") + + std::to_string(static_cast(c1x1_pixleft)) + std::string(" -DREAD_UNIT=") + std::to_string(static_cast(read_unit)) + getGeneralCompOptions(); _l_wk.clear(); @@ -252,7 +206,7 @@ int mlo_construct_norm::mloConstructFwd() if(_norm_region == MLO_LRN_ACROSS_CHANNELS) { - _g_wk.push_back(MAP_SZ4); + _g_wk.push_back(map_size_4); _g_wk.push_back(1); _g_wk.push_back(_problem.GetBatchSize()); } @@ -268,14 +222,10 @@ int mlo_construct_norm::mloConstructFwd() // calculate workspace size_t scale_sz = static_cast(_problem.GetBatchSize()) * scale_batch_stride * data_len; _workspace_sz = (doBackward()) ? scale_sz : 0; - - return (ret); } -int mlo_construct_norm::mloConstructBwd() +void mlo_construct_norm::mloConstructBwd() { - int ret = 0; - _out_pix_tile0 = 1; _out_pix_tile1 = 1; _grp_tile0 = 8; @@ -290,10 +240,6 @@ int mlo_construct_norm::mloConstructBwd() _out_pix_tile0 = (_in_df_width <= 8) ? 1 : (_in_df_width <= 16) ? 2 : 4; _out_pix_tile1 = (_in_df_height <= 8) ? 1 : (_in_df_height <= 16) ? 2 : 4; } - auto ocl_group_lg2sz0 = - static_cast(ceil(log(static_cast(_grp_tile0)) / std::numbers::ln2)); - auto ocl_group_lg2sz1 = - static_cast(ceil(log(static_cast(_grp_tile1)) / std::numbers::ln2)); int pre_pad = (_norm_area - 1) / 2; int pad = _norm_area - pre_pad - 1; @@ -305,69 +251,55 @@ int mlo_construct_norm::mloConstructBwd() MIOPEN_THROW("Wrong LRN kernel size"); _comp_options = - std::string(" -DMLO_LRN_KERNEL_SZ=") + std::to_string(static_cast(_norm_area)) + - std::string(" -DMLO_LRN_N_OUTPUTS=") + - std::to_string(static_cast(_problem.GetOutChannels())) + - std::string(" -DMLO_LRN_N_CHANNELS=") + - std::to_string(static_cast(_problem.GetInChannels())) + - std::string(" -DMLO_LRN_PAD=") + std::to_string(static_cast(pad)) + - std::string(" -DMLO_LRN_PRE_PAD=") + std::to_string(static_cast(pre_pad)) + - std::string(" -DMLO_LRN_N_HORIZ_OUT_PIX=") + - std::to_string(static_cast(_out_pix_tile0)) + - std::string(" -DMLO_LRN_N_VERT_OUT_PIX=") + - std::to_string(static_cast(_out_pix_tile1)) + - std::string(" -DMLO_LRN_GROUP_SZ0=") + std::to_string(static_cast(_grp_tile0)) + - std::string(" -DMLO_LRN_GROUP_SZ1=") + std::to_string(static_cast(_grp_tile1)) + - std::string(" -DMLO_LRN_GROUP_LG2SZ0=") + - std::to_string(static_cast(ocl_group_lg2sz0)) + - std::string(" -DMLO_LRN_GROUP_LG2SZ1=") + - std::to_string(static_cast(ocl_group_lg2sz1)) + - std::string(" -DMLO_LRN_BOT_BATCH_STRIDE=") + + std::string(" -DKERNEL_SIZE=") + std::to_string(static_cast(_norm_area)) + + std::string(" -DOUT_CHANNELS=") + + std::to_string(static_cast(_problem.GetOutChannels())) + std::string(" -DPAD=") + + std::to_string(static_cast(pad)) + std::string(" -DPRE_PAD=") + + std::to_string(static_cast(pre_pad)) + std::string(" -DHORIZ_OUT_PIX=") + + std::to_string(static_cast(_out_pix_tile0)) + std::string(" -DVERT_OUT_PIX=") + + std::to_string(static_cast(_out_pix_tile1)) + std::string(" -DGROUP_SIZE_X=") + + std::to_string(static_cast(_grp_tile0)) + std::string(" -DGROUP_SIZE_Y=") + + std::to_string(static_cast(_grp_tile1)) + std::string(" -DBOT_BATCH_STRIDE=") + std::to_string(static_cast(_problem.GetInBatchStride())) + - std::string(" -DMLO_LRN_BOT_CHANNEL_STRIDE=") + + std::string(" -DBOT_CHANNEL_STRIDE=") + std::to_string(static_cast(_problem.GetInChannelStride())) + - std::string(" -DMLO_LRN_BOT_STRIDE=") + + std::string(" -DBOT_STRIDE=") + std::to_string(static_cast(_problem.GetInStride())) + - std::string(" -DMLO_LRN_TOP_BATCH_STRIDE=") + + std::string(" -DTOP_BATCH_STRIDE=") + std::to_string(static_cast(_problem.GetOutBatchStride())) + - std::string(" -DMLO_LRN_TOP_CHANNEL_STRIDE=") + + std::string(" -DTOP_CHANNEL_STRIDE=") + std::to_string(static_cast(_problem.GetOutChannelStride())) + - std::string(" -DMLO_LRN_TOP_STRIDE=") + + std::string(" -DTOP_STRIDE=") + std::to_string(static_cast(_problem.GetOutStride())) + - std::string(" -DMLO_LRN_BOT_WIDTH=") + + std::string(" -DBOT_WIDTH=") + std::to_string(static_cast(_problem.GetInWidth())) + - std::string(" -DMLO_LRN_BOT_HEIGHT=") + + std::string(" -DBOT_HEIGHT=") + std::to_string(static_cast(_problem.GetInHeight())) + - std::string(" -DMLO_LRN_TOP_WIDTH=") + + std::string(" -DTOP_WIDTH=") + std::to_string(static_cast(_problem.GetOutWidth())) + - std::string(" -DMLO_LRN_TOP_HEIGHT=") + + std::string(" -DTOP_HEIGHT=") + std::to_string(static_cast(_problem.GetOutHeight())) + - std::string(" -DMLO_LRN_SCALE_BATCH_STRIDE=") + + std::string(" -DSCALE_BATCH_STRIDE=") + std::to_string(static_cast(scale_batch_stride)) + - std::string(" -DMLO_LRN_SCALE_CHANNEL_STRIDE=") + + std::string(" -DSCALE_CHANNEL_STRIDE=") + std::to_string(static_cast(scale_channel_stride)) + - std::string(" -DMLO_LRN_SCALE_STRIDE=") + - std::to_string(static_cast(scale_stride)) + - std::string(" -DMLO_LRN_TOPDF_BATCH_STRIDE=") + + std::string(" -DSCALE_STRIDE=") + std::to_string(static_cast(scale_stride)) + + std::string(" -DTOPDF_BATCH_STRIDE=") + std::to_string(static_cast(_out_df_batch_stride)) + - std::string(" -DMLO_LRN_TOPDF_CHANNEL_STRIDE=") + + std::string(" -DTOPDF_CHANNEL_STRIDE=") + std::to_string(static_cast(_out_df_channel_stride)) + - std::string(" -DMLO_LRN_TOPDF_STRIDE=") + - std::to_string(static_cast(_out_df_stride)) + - std::string(" -DMLO_LRN_BOTDF_BATCH_STRIDE=") + + std::string(" -DTOPDF_STRIDE=") + std::to_string(static_cast(_out_df_stride)) + + std::string(" -DBOTDF_BATCH_STRIDE=") + std::to_string(static_cast(_in_df_batch_stride)) + - std::string(" -DMLO_LRN_BOTDF_CHANNEL_STRIDE=") + + std::string(" -DBOTDF_CHANNEL_STRIDE=") + std::to_string(static_cast(_in_df_channel_stride)) + - std::string(" -DMLO_LRN_BOTDF_STRIDE=") + - std::to_string(static_cast(_in_df_stride)) + - std::string(" -DMLO_LRN_BATCH_SZ=") + + std::string(" -DBOTDF_STRIDE=") + std::to_string(static_cast(_in_df_stride)) + + std::string(" -DBATCH_SIZE=") + std::to_string(static_cast(_problem.GetBatchSize())) + - std::string(" -DMLO_LRN_N_INPUTS=") + - std::to_string(static_cast(_problem.GetInChannels())) + - std::string(" -DMLO_LRN_N_OUTPUTS=") + - std::to_string(static_cast(_problem.GetOutChannels())) + getGeneralCompOptions(); + std::string(" -DN_INPUTS=") + + std::to_string(static_cast(_problem.GetInChannels())) + getGeneralCompOptions(); - _kernel_file = "MIOpenLRNBwd.cl"; + _kernel_file = "MIOpenLRNBwd.cpp"; _l_wk.clear(); _g_wk.clear(); @@ -394,6 +326,4 @@ int mlo_construct_norm::mloConstructBwd() _g_wk.push_back(static_cast(_problem.GetInChannels()) * _problem.GetBatchSize()); _kernel_name = "MIOpenLRNWithinChannelBwd"; } - - return (ret); }