diff --git a/projects/miopen/src/CMakeLists.txt b/projects/miopen/src/CMakeLists.txt
index b34e5ddf2325..fa8383e82c05 100644
--- a/projects/miopen/src/CMakeLists.txt
+++ b/projects/miopen/src/CMakeLists.txt
@@ -574,8 +574,8 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN
         kernels/MIOpenGetitem.cpp
         kernels/MIOpenKthvalue.cpp
         kernels/MIOpenLayerNorm.cpp
-        kernels/MIOpenLRNBwd.cl
-        kernels/MIOpenLRNFwd.cl
+        kernels/MIOpenLRNBwd.cpp
+        kernels/MIOpenLRNFwd.cpp
         kernels/MIOpenMultiMarginLoss.cpp
         kernels/MIOpenNeuron.cpp
         kernels/MIOpenPReLU.cpp
diff --git a/projects/miopen/src/include/miopen/mlo_internal.hpp b/projects/miopen/src/include/miopen/mlo_internal.hpp
index 89988f80a877..62ccab567922 100644
--- a/projects/miopen/src/include/miopen/mlo_internal.hpp
+++ b/projects/miopen/src/include/miopen/mlo_internal.hpp
@@ -557,8 +557,8 @@ struct mlo_construct_norm : mlo_construct_activ_lrn_pooling_common
     void mloConstruct();
 
 protected:
-    int mloConstructFwd();
-    int mloConstructBwd();
+    void mloConstructFwd();
+    void mloConstructBwd();
     int _norm_region  = 0;
     int _norm_area    = 0;
     double _normAlpha = 0.0;
diff --git a/projects/miopen/src/kernels/MIOpenLRNBwd.cl b/projects/miopen/src/kernels/MIOpenLRNBwd.cl
deleted file mode 100644
index 4e60bf133a66..000000000000
--- a/projects/miopen/src/kernels/MIOpenLRNBwd.cl
+++ /dev/null
@@ -1,472 +0,0 @@
-/*******************************************************************************
- *
- * MIT License
- *
- * Copyright (c) 2017 Advanced Micro Devices, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- *******************************************************************************/
-#define PPCAT_NX(A, B) A##B
-#define PPCAT(A, B) PPCAT_NX(A, B)
-#define TWO 2
-#define THREE 3
-#define FOUR 4
-#define EIGHT 8
-
-#define DBG_RANGE 0
-#if MIOPEN_USE_FP16 == 1
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-#define _FLOAT half
-#endif
-#if MIOPEN_USE_FP32 == 1
-#define _FLOAT float
-#endif
-
-#define _FLOAT2 PPCAT(_FLOAT, TWO)
-#define _FLOAT3 PPCAT(_FLOAT, THREE)
-#define _FLOAT4 PPCAT(_FLOAT, FOUR)
-#define _FLOAT8 PPCAT(_FLOAT, EIGHT)
-
-#define UNUSED __attribute__((__unused__))
-
-#define MLO_LRN_GROUP_SZ2 1
-#define MLO_LRN_STRIDE 1
-
-#define MLO_LRN_LCL_DATA_WIDTH (MLO_LRN_GROUP_SZ0 * MLO_LRN_N_HORIZ_OUT_PIX + MLO_LRN_KERNEL_SZ - 1)
-#define MLO_LRN_LCL_DATA_HEIGHT (MLO_LRN_GROUP_SZ1 * MLO_LRN_N_VERT_OUT_PIX + MLO_LRN_KERNEL_SZ - 1)
-#define MLO_LRN_GROUP_SZ (MLO_LRN_GROUP_SZ2 * MLO_LRN_GROUP_SZ1 * MLO_LRN_GROUP_SZ0)
-// #define MLO_LRN_PREPAD_SZ (MLO_LRN_KERNEL_SZ - 1)/2
-
-struct LRNForwardParam
-{
-    _FLOAT alphaoverarea;
-    _FLOAT alpha;
-    _FLOAT beta;
-    _FLOAT K;
-};
-
-struct LRNBackwardParam
-{
-    _FLOAT ratio;
-    _FLOAT alpha;
-    _FLOAT beta;
-};
-
-/*
-
-This is a naive implementation.
-The "sliding window" -based implementation is in MIOpenLRNFwd.cl file
-
-*/
-
-__attribute__((reqd_work_group_size(MLO_LRN_GROUP_SZ0, MLO_LRN_GROUP_SZ1, MLO_LRN_GROUP_SZ2)))
-__kernel void
-MIOpenLRNWithinChannelBwd(const __global _FLOAT* top,
-                          const __global _FLOAT* bot,
-                          const __global _FLOAT* top_df,
-                          const __global _FLOAT* scale,
-                          __global _FLOAT* bot_df,
-                          UNUSED _FLOAT ratio,
-                          _FLOAT alpha,
-                          _FLOAT beta)
-{
-    __local _FLOAT top_df_data[MLO_LRN_LCL_DATA_WIDTH * MLO_LRN_LCL_DATA_HEIGHT];
-    __local _FLOAT ratio_data[MLO_LRN_LCL_DATA_WIDTH * MLO_LRN_LCL_DATA_HEIGHT];
-    int x          = get_group_id(0) * MLO_LRN_GROUP_SZ0 * MLO_LRN_N_HORIZ_OUT_PIX;
-    int y          = get_group_id(1) * MLO_LRN_GROUP_SZ1 * MLO_LRN_N_VERT_OUT_PIX;
-    int lcl_id0    = get_local_id(0);
-    int lcl_id1    = get_local_id(1);
-    int ob         = get_global_id(2); // output * batch_sz
-    int o          = ob / MLO_LRN_BATCH_SZ;
-    int b          = ob - o * MLO_LRN_BATCH_SZ;
-    int top_x      = x;
-    int top_y      = y;
-    int top_df_off = b * MLO_LRN_TOPDF_BATCH_STRIDE + o * MLO_LRN_TOPDF_CHANNEL_STRIDE;
-    int scale_off  = b * MLO_LRN_SCALE_BATCH_STRIDE + o * MLO_LRN_SCALE_CHANNEL_STRIDE;
-    int bot_x      = x + lcl_id0 * MLO_LRN_N_HORIZ_OUT_PIX;
-    int bot_y      = y + lcl_id1 * MLO_LRN_N_VERT_OUT_PIX;
-
-    _FLOAT prv_exp_scale[MLO_LRN_N_VERT_OUT_PIX][MLO_LRN_N_HORIZ_OUT_PIX];
-    //		_FLOAT prv_top_df[MLO_LRN_N_VERT_OUT_PIX][MLO_LRN_N_HORIZ_OUT_PIX];
-
-    // load top_diff and scale tiles
-    for(int b_j = lcl_id1; b_j < MLO_LRN_LCL_DATA_HEIGHT; b_j += MLO_LRN_GROUP_SZ1)
-    {
-        int top_y_act = top_y + b_j - MLO_LRN_PAD;
-
-        bool invisibleY = (top_y_act < 0) || (top_y_act >= MLO_LRN_TOP_HEIGHT);
-
-        top_y_act = (invisibleY) ? 0 : top_y_act;
-
-        int top_df_y_off = top_y_act * MLO_LRN_TOPDF_STRIDE;
-        int scale_y_off  = top_y_act * MLO_LRN_SCALE_STRIDE;
-
-        int lcl_off_v = b_j * MLO_LRN_LCL_DATA_WIDTH;
-
-        for(int b_i = lcl_id0; b_i < MLO_LRN_LCL_DATA_WIDTH; b_i += MLO_LRN_GROUP_SZ0)
-        {
-
-            int top_x_act = top_x + b_i - MLO_LRN_PAD;
-
-            bool invisibleX = (top_x_act < 0) || (top_x_act >= MLO_LRN_TOP_WIDTH);
-
-            top_x_act = (invisibleX) ? 0 : top_x_act;
-#if DBG_RANGE
-            if(top_df_off + top_df_y_off + top_x_act >=
-               MLO_LRN_BATCH_SZ * MLO_LRN_TOPDF_BATCH_STRIDE)
-            {
-                printf("K:err:topdf-off_range\n");
-            }
-#endif
-            _FLOAT top_df_val = top_df[top_df_off + top_df_y_off + top_x_act];
-            _FLOAT scale_val  = scale[scale_off + scale_y_off + top_x_act];
-
-            top_df_val = (invisibleX || invisibleY) ? 0 : top_df_val;
-            scale_val  = (invisibleX || invisibleY) ? (_FLOAT)1.f : scale_val;
-
-            top_df_data[lcl_off_v + b_i] = top_df_val;
-            ratio_data[lcl_off_v + b_i]  = scale_val;
-        }
-    }
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    // actual top_diffs and scales
-    for(int j = 0; j < MLO_LRN_N_VERT_OUT_PIX; ++j)
-    {
-        int lcl_off_v =
-            (lcl_id1 * MLO_LRN_N_VERT_OUT_PIX + MLO_LRN_PAD + j) * MLO_LRN_LCL_DATA_WIDTH;
-        for(int i = 0; i < MLO_LRN_N_HORIZ_OUT_PIX; i++)
-        {
-            _FLOAT scale_ratio =
-                ratio_data[lcl_off_v + lcl_id0 * MLO_LRN_N_HORIZ_OUT_PIX + MLO_LRN_PAD + i];
-            prv_exp_scale[j][i] = exp(-beta * log(scale_ratio));
-        }
-    }
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-    // read top and load ratio tile
-    int top_off = b * MLO_LRN_TOP_BATCH_STRIDE + o * MLO_LRN_TOP_CHANNEL_STRIDE;
-    for(int b_j = lcl_id1; b_j < MLO_LRN_LCL_DATA_HEIGHT; b_j += MLO_LRN_GROUP_SZ1)
-    {
-        int top_y_act = top_y + b_j - MLO_LRN_PAD;
-
-        bool invisibleY = (top_y_act < 0) || (top_y_act >= MLO_LRN_TOP_HEIGHT);
-
-        top_y_act = (invisibleY) ? 0 : top_y_act;
-
-        int top_y_off = top_y_act * MLO_LRN_TOP_STRIDE;
-
-        int lcl_off_v = b_j * MLO_LRN_LCL_DATA_WIDTH;
-
-        for(int b_i = lcl_id0; b_i < MLO_LRN_LCL_DATA_WIDTH; b_i += MLO_LRN_GROUP_SZ0)
-        {
-
-            int top_x_act = top_x + b_i - MLO_LRN_PAD;
-
-            bool invisibleX = (top_x_act < 0) || (top_x_act >= MLO_LRN_TOP_WIDTH);
-
-            top_x_act = (invisibleX) ? 0 : top_x_act;
-#if DBG_RANGE
-
-            if(top_off + top_y_off + top_x_act >= MLO_LRN_BATCH_SZ * MLO_LRN_TOP_BATCH_STRIDE)
-            {
-                printf("K:err:top-off_range\n");
-            }
-#endif
-
-            _FLOAT top_val = top[top_off + top_y_off + top_x_act];
-
-            top_val = (invisibleX || invisibleY) ? 0 : top_val;
-
-            _FLOAT top_df_val = top_df_data[lcl_off_v + b_i];
-
-            _FLOAT scale_val = ratio_data[lcl_off_v + b_i];
-
-            // scale val is not 0
-            _FLOAT ratio_dta = (top_df_val * top_val) / scale_val;
-            // replacing scale with ratio
-            ratio_data[lcl_off_v + b_i] = ratio_dta;
-        }
-    }
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    // caculate bot diff
-    _FLOAT prv_bot_diff[MLO_LRN_N_VERT_OUT_PIX][MLO_LRN_N_HORIZ_OUT_PIX];
-
-    for(int j = 0; j < MLO_LRN_N_VERT_OUT_PIX; ++j)
-    {
-        int v_off_v = (lcl_id1 * MLO_LRN_N_VERT_OUT_PIX + j);
-        int hstart  = y + v_off_v - MLO_LRN_PAD;
-        int hend    = min(hstart + MLO_LRN_KERNEL_SZ, MLO_LRN_TOP_HEIGHT + MLO_LRN_PRE_PAD);
-
-        // accum offset, vertical
-        //			int lcl_a_off_v = v_off_v *  MLO_LRN_LCL_DATA_WIDTH;
-        // value offset, vertical
-        int lcl_v_off_v = (v_off_v + MLO_LRN_PAD) * MLO_LRN_LCL_DATA_WIDTH;
-        for(int i = 0; i < MLO_LRN_N_HORIZ_OUT_PIX; i++)
-        {
-            _FLOAT prv_ratio_accum = (_FLOAT)0;
-            int v_off_h            = lcl_id0 * MLO_LRN_N_HORIZ_OUT_PIX + i;
-
-            int wstart = x + v_off_h - MLO_LRN_PAD;
-            int wend   = min(wstart + MLO_LRN_KERNEL_SZ, MLO_LRN_TOP_WIDTH + MLO_LRN_PRE_PAD);
-
-            int adj_area_size = (hend - hstart) * (wend - wstart);
-
-            // accum offset, horiz
-            int lcl_a_off_h = v_off_h;
-            //	value offset, horiz
-            int lcl_v_off_h = lcl_a_off_h + MLO_LRN_PAD;
-
-            for(int k = 0; k < MLO_LRN_KERNEL_SZ; k++)
-            {
-                for(int l = 0; l < MLO_LRN_KERNEL_SZ; l++)
-                {
-                    prv_ratio_accum +=
-                        ratio_data[(v_off_v + k) * MLO_LRN_LCL_DATA_WIDTH + lcl_a_off_h + l];
-                }
-            }
-
-            _FLOAT top_df_val = top_df_data[lcl_v_off_v + lcl_v_off_h];
-
-            uint bot_off0 = MLO_LRN_BOT_BATCH_STRIDE * b + MLO_LRN_BOT_CHANNEL_STRIDE * o +
-                            MLO_LRN_BOT_STRIDE * (y + v_off_v) + x + v_off_h;
-
-            uint bot_off = (y + v_off_v < MLO_LRN_BOT_HEIGHT && x + v_off_h < MLO_LRN_BOT_WIDTH &&
-                            b < MLO_LRN_BATCH_SZ && o < MLO_LRN_N_OUTPUTS)
-                               ? bot_off0
-                               : MLO_LRN_BATCH_SZ * MLO_LRN_BOT_BATCH_STRIDE - 1;
-#if DBG_RANGE
-
-            if(bot_off >= MLO_LRN_BATCH_SZ * MLO_LRN_BOT_BATCH_STRIDE)
-            {
-                printf("K:err:bot-off_range\n");
-            }
-#endif
-            _FLOAT bot_dta = bot[bot_off];
-
-            bot_dta = (y + v_off_v < MLO_LRN_BOT_HEIGHT && x + v_off_h < MLO_LRN_BOT_WIDTH &&
-                       b < MLO_LRN_BATCH_SZ && o < MLO_LRN_N_OUTPUTS)
-                          ? bot_dta
-                          : 0;
-
-            _FLOAT adj_ratio       = (_FLOAT)2.f * alpha * beta / adj_area_size;
-            _FLOAT prv_accum_ratio = adj_ratio * bot_dta * prv_ratio_accum;
-            prv_bot_diff[j][i]     = prv_exp_scale[j][i] * top_df_val - prv_accum_ratio;
-        }
-    }
-
-    for(int j = 0; j < MLO_LRN_N_VERT_OUT_PIX; j++)
-    {
-        for(int i = 0; i < MLO_LRN_N_HORIZ_OUT_PIX; i++)
-        {
-            if(bot_y + j < MLO_LRN_BOT_HEIGHT && bot_x + i < MLO_LRN_BOT_WIDTH &&
-               b < MLO_LRN_BATCH_SZ && o < MLO_LRN_N_OUTPUTS)
-            {
-#if DBG_RANGE
-
-                if(MLO_LRN_BOTDF_BATCH_STRIDE * b + MLO_LRN_BOTDF_CHANNEL_STRIDE * o +
-                       MLO_LRN_BOTDF_STRIDE * (bot_y + j) + bot_x + i >=
-                   MLO_LRN_BATCH_SZ * MLO_LRN_BOTDF_BATCH_STRIDE)
-                {
-                    printf("K:err:botdf-off_range\n");
-                }
-#endif
-                bot_df[MLO_LRN_BOTDF_BATCH_STRIDE * b + MLO_LRN_BOTDF_CHANNEL_STRIDE * o +
-                       MLO_LRN_BOTDF_STRIDE * (bot_y + j) + bot_x + i] = prv_bot_diff[j][i];
-            }
-        }
-    }
-}
-
-#if(MLO_LRN_N_INPUTS < MLO_LRN_KERNEL_SZ)
-#define MLO_LOW_CHNL_COUNT 1
-#else
-#define MLO_LOW_CHNL_COUNT 0
-#endif
-
-__attribute__((reqd_work_group_size(MLO_LRN_GROUP_SZ0, MLO_LRN_GROUP_SZ1, MLO_LRN_GROUP_SZ2)))
-__kernel void
-MIOpenLRNAcrossChannelsBwd1(const __global _FLOAT* top,
-                            const __global _FLOAT* bot,
-                            const __global _FLOAT* top_df,
-                            const __global _FLOAT* scale,
-                            __global _FLOAT* bot_df,
-                            _FLOAT ratio,
-                            UNUSED _FLOAT alpha,
-                            _FLOAT beta)
-{
-    int x              = get_global_id(0); // channel x
-    int y              = get_global_id(1); // channel y
-    int b              = get_global_id(2); // batch
-    _FLOAT accum_ratio = 0;
-    _FLOAT top_df_in[MLO_LRN_KERNEL_SZ];
-    _FLOAT scale_in[MLO_LRN_KERNEL_SZ];
-    _FLOAT ratio_dta[MLO_LRN_KERNEL_SZ];
-    int c_i = 0, c_o = 0;
-    int bot_df_off = 0;
-
-    for(c_i = 0; c_i < MLO_LRN_PRE_PAD; c_i++)
-    {
-
-        top_df_in[c_i] = top_df[MLO_LRN_TOPDF_BATCH_STRIDE * b +
-                                MLO_LRN_TOPDF_CHANNEL_STRIDE * c_i + MLO_LRN_TOPDF_STRIDE * y + x];
-        scale_in[c_i]  = scale[MLO_LRN_SCALE_BATCH_STRIDE * b + MLO_LRN_SCALE_CHANNEL_STRIDE * c_i +
-                              MLO_LRN_SCALE_STRIDE * y + x];
-        _FLOAT top_dta = top[MLO_LRN_TOP_BATCH_STRIDE * b + MLO_LRN_TOP_CHANNEL_STRIDE * c_i +
-                             MLO_LRN_TOP_STRIDE * y + x];
-
-        ratio_dta[c_i] = (top_df_in[c_i] * top_dta) / scale_in[c_i];
-
-#if MLO_LOW_CHNL_COUNT == 1
-        ratio_dta[c_i] = (c_i < MLO_LRN_N_OUTPUTS) ? ratio_dta[c_i] : 0;
-#endif
-
-        accum_ratio = accum_ratio + ratio_dta[c_i];
-    }
-
-    for(; c_i < MLO_LRN_KERNEL_SZ; c_i++, c_o++)
-    {
-        top_df_in[c_i] = top_df[MLO_LRN_TOPDF_BATCH_STRIDE * b +
-                                MLO_LRN_TOPDF_CHANNEL_STRIDE * c_i + MLO_LRN_TOPDF_STRIDE * y + x];
-        scale_in[c_i]  = scale[MLO_LRN_SCALE_BATCH_STRIDE * b + MLO_LRN_SCALE_CHANNEL_STRIDE * c_i +
-                              MLO_LRN_SCALE_STRIDE * y + x];
-        _FLOAT top_dta = top[MLO_LRN_TOP_BATCH_STRIDE * b + MLO_LRN_TOP_CHANNEL_STRIDE * c_i +
-                             MLO_LRN_TOP_STRIDE * y + x];
-        ratio_dta[c_i] = (top_df_in[c_i] * top_dta) / scale_in[c_i];
-#if MLO_LOW_CHNL_COUNT == 1
-        ratio_dta[c_i] = (c_i < MLO_LRN_N_OUTPUTS) ? ratio_dta[c_i] : 0;
-#endif
-
-        accum_ratio = accum_ratio + ratio_dta[c_i];
-#if MLO_LOW_CHNL_COUNT == 1
-        if(c_o < MLO_LRN_N_INPUTS)
-#endif
-        {
-            _FLOAT bot_dta = bot[MLO_LRN_BOT_BATCH_STRIDE * b + MLO_LRN_BOT_CHANNEL_STRIDE * c_o +
-                                 MLO_LRN_BOT_STRIDE * y + x];
-
-            _FLOAT prv_scale = scale_in[c_o];
-
-            _FLOAT exp_scale = exp(-beta * log(prv_scale));
-            //					pow(prv_scale, -beta);
-
-            _FLOAT prv_accum_ratio = ratio * bot_dta * accum_ratio;
-
-            _FLOAT out_val = top_df_in[c_o] * exp_scale - prv_accum_ratio;
-
-            bot_df_off = MLO_LRN_BOTDF_BATCH_STRIDE * b + MLO_LRN_BOTDF_CHANNEL_STRIDE * c_o +
-                         MLO_LRN_BOTDF_STRIDE * y + x;
-
-            bot_df[bot_df_off] = out_val;
-        }
-    }
-
-    for(; c_i < MLO_LRN_N_INPUTS; c_i++, c_o++)
-    {
-
-        _FLOAT prv_top_df_in =
-            top_df[MLO_LRN_TOPDF_BATCH_STRIDE * b + MLO_LRN_TOPDF_CHANNEL_STRIDE * c_i +
-                   MLO_LRN_TOPDF_STRIDE * y + x];
-        _FLOAT prv_scale_in =
-            scale[MLO_LRN_SCALE_BATCH_STRIDE * b + MLO_LRN_SCALE_CHANNEL_STRIDE * c_i +
-                  MLO_LRN_SCALE_STRIDE * y + x];
-        _FLOAT top_dta       = top[MLO_LRN_TOP_BATCH_STRIDE * b + MLO_LRN_TOP_CHANNEL_STRIDE * c_i +
-                             MLO_LRN_TOP_STRIDE * y + x];
-        _FLOAT prv_ratio_dta = prv_top_df_in * top_dta / prv_scale_in;
-#if MLO_LOW_CHNL_COUNT == 1
-        prv_ratio_dta = (c_i < MLO_LRN_N_OUTPUTS) ? prv_ratio_dta : 0;
-#endif
-
-        accum_ratio = accum_ratio + prv_ratio_dta;
-
-        accum_ratio = accum_ratio - ratio_dta[0];
-
-        for(int i = 0; i < MLO_LRN_KERNEL_SZ - 1; i++)
-        {
-            top_df_in[i] = top_df_in[i + 1];
-            scale_in[i]  = scale_in[i + 1];
-            ratio_dta[i] = ratio_dta[i + 1];
-        }
-
-        top_df_in[MLO_LRN_KERNEL_SZ - 1] = prv_top_df_in;
-        scale_in[MLO_LRN_KERNEL_SZ - 1]  = prv_scale_in;
-        ratio_dta[MLO_LRN_KERNEL_SZ - 1] = prv_ratio_dta;
-
-#if MLO_LOW_CHNL_COUNT == 1
-        if(c_o < MLO_LRN_N_INPUTS)
-#endif
-        {
-            _FLOAT bot_dta = bot[MLO_LRN_BOT_BATCH_STRIDE * b + MLO_LRN_BOT_CHANNEL_STRIDE * c_o +
-                                 MLO_LRN_BOT_STRIDE * y + x];
-
-            _FLOAT prv_scale = scale_in[MLO_LRN_PAD];
-
-            _FLOAT exp_scale = exp(-beta * log(prv_scale));
-            //				pow(prv_scale,-beta);
-
-            _FLOAT prv_accum_ratio = ratio * bot_dta * accum_ratio;
-
-            _FLOAT out_val = top_df_in[MLO_LRN_PAD] * exp_scale - prv_accum_ratio;
-
-            bot_df_off = MLO_LRN_BOTDF_BATCH_STRIDE * b + MLO_LRN_BOTDF_CHANNEL_STRIDE * c_o +
-                         MLO_LRN_BOTDF_STRIDE * y + x;
-
-            bot_df[bot_df_off] = out_val;
-        }
-    }
-
-    for(; c_i < MLO_LRN_N_INPUTS + MLO_LRN_PRE_PAD; c_i++, c_o++)
-    {
-
-        accum_ratio = accum_ratio - ratio_dta[0];
-
-        for(int i = 0; i < MLO_LRN_KERNEL_SZ - 1; i++)
-        {
-            top_df_in[i] = top_df_in[i + 1];
-            scale_in[i]  = scale_in[i + 1];
-            ratio_dta[i] = ratio_dta[i + 1];
-        }
-
-#if MLO_LOW_CHNL_COUNT == 1
-        if(c_o < MLO_LRN_N_INPUTS)
-#endif
-        {
-            _FLOAT bot_dta = bot[MLO_LRN_BOT_BATCH_STRIDE * b + MLO_LRN_BOT_CHANNEL_STRIDE * c_o +
-                                 MLO_LRN_BOT_STRIDE * y + x];
-
-            _FLOAT prv_scale = scale_in[MLO_LRN_PAD];
-
-            _FLOAT exp_scale = exp(-beta * log(prv_scale));
-            //				pow(prv_scale,-beta);
-
-            _FLOAT prv_accum_ratio = ratio * bot_dta * accum_ratio;
-
-            _FLOAT out_val = top_df_in[MLO_LRN_PAD] * exp_scale - prv_accum_ratio;
-
-            bot_df_off = MLO_LRN_BOTDF_BATCH_STRIDE * b + MLO_LRN_BOTDF_CHANNEL_STRIDE * c_o +
-                         MLO_LRN_BOTDF_STRIDE * y + x;
-
-            bot_df[bot_df_off] = out_val;
-        }
-    }
-}
diff --git a/projects/miopen/src/kernels/MIOpenLRNBwd.cpp b/projects/miopen/src/kernels/MIOpenLRNBwd.cpp
new file mode 100644
index 000000000000..bb23a3f7b324
--- /dev/null
+++ b/projects/miopen/src/kernels/MIOpenLRNBwd.cpp
@@ -0,0 +1,399 @@
+// Copyright © Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier:  MIT
+
+#ifndef MIOPEN_HIP_RUNTIME_COMPILE
+#include <hip/hip_fp16.h>
+#include <hip/hip_runtime.h>
+#endif
+
+#include "miopen_math.hpp"
+
+constexpr unsigned group_size_z      = 1u;
+constexpr unsigned local_data_width  = GROUP_SIZE_X * HORIZ_OUT_PIX + KERNEL_SIZE - 1;
+constexpr unsigned local_data_height = GROUP_SIZE_Y * VERT_OUT_PIX + KERNEL_SIZE - 1;
+constexpr bool low_channel_count     = N_INPUTS < KERNEL_SIZE;
+
+/*
+This is a naive implementation.
+The "sliding window" -based implementation is in MIOpenLRNFwd.cpp file
+*/
+__launch_bounds__(GROUP_SIZE_X* GROUP_SIZE_Y* group_size_z) extern "C" __global__
+    void MIOpenLRNWithinChannelBwd(const FLOAT* top,
+                                   const FLOAT* bot,
+                                   const FLOAT* top_df,
+                                   const FLOAT* scale,
+                                   FLOAT* bot_df,
+                                   FLOAT /*ratio*/,
+                                   FLOAT alpha,
+                                   FLOAT beta)
+{
+    __shared__ FLOAT top_df_data[local_data_width * local_data_height];
+    __shared__ FLOAT ratio_data[local_data_width * local_data_height];
+    const int x          = blockIdx.x * GROUP_SIZE_X * HORIZ_OUT_PIX;
+    const int y          = blockIdx.y * GROUP_SIZE_Y * VERT_OUT_PIX;
+    const int lcl_id0    = threadIdx.x;
+    const int lcl_id1    = threadIdx.y;
+    const int ob         = blockIdx.z * group_size_z + threadIdx.z; // output * batch_sz
+    const int o          = ob / BATCH_SIZE;
+    const int b          = ob - o * BATCH_SIZE;
+    const int top_x      = x;
+    const int top_y      = y;
+    const int top_df_off = b * TOPDF_BATCH_STRIDE + o * TOPDF_CHANNEL_STRIDE;
+    const int scale_off  = b * SCALE_BATCH_STRIDE + o * SCALE_CHANNEL_STRIDE;
+    const int bot_x      = x + lcl_id0 * HORIZ_OUT_PIX;
+    const int bot_y      = y + lcl_id1 * VERT_OUT_PIX;
+
+// load top_diff and scale tiles
+#pragma unroll VERT_OUT_PIX
+    for(int b_j = lcl_id1; b_j < local_data_height; b_j += GROUP_SIZE_Y)
+    {
+        int top_y_act         = top_y + b_j - PAD;
+        const bool invisibleY = (top_y_act < 0) || (top_y_act >= TOP_HEIGHT);
+        top_y_act             = (invisibleY) ? 0 : top_y_act;
+
+        const int top_df_y_off = top_y_act * TOPDF_STRIDE;
+        const int scale_y_off  = top_y_act * SCALE_STRIDE;
+        const int lcl_off_v    = b_j * local_data_width;
+
+/*
+  Note: Code duplication from manual loop peeling
+
+  The HIP compiler does not unroll this loop if the loop statement is written as
+  `for(int b_i = lcl_id0; b_i < local_data_width; b_i += GROUP_SIZE_X)`, leading
+  to a loss in performance.
+
+  To enable unrolling the last loop iteration is manually peeled. As this is the
+  iteration that may not be uniformly executed by all work-items in a work-group.
+
+  Unfortunately there is code duplication in the body is the loop/if statements, as
+  refactoring it into a __alwaysinline__ free function seems inhibits compiler
+  optimizations.
+*/
+#pragma unroll
+        for(int i = 0; i < HORIZ_OUT_PIX; i++)
+        {
+            const int b_i         = i * GROUP_SIZE_X + lcl_id0;
+            int top_x_act         = top_x + b_i - PAD;
+            const bool invisibleX = (top_x_act < 0) || (top_x_act >= TOP_WIDTH);
+            top_x_act             = (invisibleX) ? 0 : top_x_act;
+
+            FLOAT top_df_val = top_df[top_df_off + top_df_y_off + top_x_act];
+            FLOAT scale_val  = scale[scale_off + scale_y_off + top_x_act];
+
+            top_df_val = (invisibleX || invisibleY) ? 0 : top_df_val;
+            scale_val  = (invisibleX || invisibleY) ? FLOAT(1.f) : scale_val;
+
+            top_df_data[lcl_off_v + b_i] = top_df_val;
+            ratio_data[lcl_off_v + b_i]  = scale_val;
+        }
+        if(lcl_id0 < (KERNEL_SIZE - 1))
+        {
+            const int b_i         = GROUP_SIZE_X * HORIZ_OUT_PIX + lcl_id0;
+            int top_x_act         = top_x + b_i - PAD;
+            const bool invisibleX = (top_x_act < 0) || (top_x_act >= TOP_WIDTH);
+            top_x_act             = (invisibleX) ? 0 : top_x_act;
+
+            FLOAT top_df_val = top_df[top_df_off + top_df_y_off + top_x_act];
+            FLOAT scale_val  = scale[scale_off + scale_y_off + top_x_act];
+
+            top_df_val = (invisibleX || invisibleY) ? 0 : top_df_val;
+            scale_val  = (invisibleX || invisibleY) ? FLOAT(1.f) : scale_val;
+
+            top_df_data[lcl_off_v + b_i] = top_df_val;
+            ratio_data[lcl_off_v + b_i]  = scale_val;
+        }
+    }
+
+    __syncthreads();
+
+    // actual top_diffs and scales
+    FLOAT prv_exp_scale[VERT_OUT_PIX][HORIZ_OUT_PIX];
+    for(int j = 0; j < VERT_OUT_PIX; ++j)
+    {
+        const int lcl_off_v = (lcl_id1 * VERT_OUT_PIX + PAD + j) * local_data_width;
+        for(int i = 0; i < HORIZ_OUT_PIX; i++)
+        {
+            const FLOAT scale_ratio     = ratio_data[lcl_off_v + lcl_id0 * HORIZ_OUT_PIX + PAD + i];
+            const FLOAT scale_ratio_log = miopen::detail::log(scale_ratio);
+            prv_exp_scale[j][i]         = miopen::detail::exp(-beta * scale_ratio_log);
+        }
+    }
+
+    __syncthreads();
+
+    // read top and load ratio tile
+    const int top_off = b * TOP_BATCH_STRIDE + o * TOP_CHANNEL_STRIDE;
+#pragma unroll VERT_OUT_PIX
+    for(int b_j = lcl_id1; b_j < local_data_height; b_j += GROUP_SIZE_Y)
+    {
+        int top_y_act         = top_y + b_j - PAD;
+        const bool invisibleY = (top_y_act < 0) || (top_y_act >= TOP_HEIGHT);
+        top_y_act             = (invisibleY) ? 0 : top_y_act;
+
+        const int top_y_off = top_y_act * TOP_STRIDE;
+        const int lcl_off_v = b_j * local_data_width;
+/*
+  Note: Code duplication from manual loop peeling
+
+  The HIP compiler does not unroll this loop if the loop statement is written as
+  `for(int b_i = lcl_id0; b_i < local_data_width; b_i += GROUP_SIZE_X)`, leading
+  to a loss in performance.
+
+  To enable unrolling the last loop iteration is manually peeled. As this is the
+  iteration that may not be uniformly executed by all work-items in a work-group.
+
+  Unfortunately there is code duplication in the body is the loop/if statements, as
+  refactoring it into a __alwaysinline__ free function seems inhibits compiler
+  optimizations.
+*/
+#pragma unroll
+        for(int i = 0; i < HORIZ_OUT_PIX; i++)
+        {
+            int b_i         = i * GROUP_SIZE_X + lcl_id0;
+            int top_x_act   = top_x + b_i - PAD;
+            bool invisibleX = (top_x_act < 0) || (top_x_act >= TOP_WIDTH);
+            top_x_act       = (invisibleX) ? 0 : top_x_act;
+
+            FLOAT top_val = top[top_off + top_y_off + top_x_act];
+            top_val       = (invisibleX || invisibleY) ? 0 : top_val;
+
+            const FLOAT top_df_val = top_df_data[lcl_off_v + b_i];
+            const FLOAT scale_val  = ratio_data[lcl_off_v + b_i];
+
+            // scale val is not 0
+            FLOAT ratio_dta = (top_df_val * top_val) / scale_val;
+            // replacing scale with ratio
+            ratio_data[lcl_off_v + b_i] = ratio_dta;
+        }
+        if(lcl_id0 < (KERNEL_SIZE - 1))
+        {
+            int b_i         = GROUP_SIZE_X * HORIZ_OUT_PIX + lcl_id0;
+            int top_x_act   = top_x + b_i - PAD;
+            bool invisibleX = (top_x_act < 0) || (top_x_act >= TOP_WIDTH);
+            top_x_act       = (invisibleX) ? 0 : top_x_act;
+
+            FLOAT top_val = top[top_off + top_y_off + top_x_act];
+            top_val       = (invisibleX || invisibleY) ? 0 : top_val;
+
+            const FLOAT top_df_val = top_df_data[lcl_off_v + b_i];
+            const FLOAT scale_val  = ratio_data[lcl_off_v + b_i];
+
+            // scale val is not 0
+            FLOAT ratio_dta = (top_df_val * top_val) / scale_val;
+            // replacing scale with ratio
+            ratio_data[lcl_off_v + b_i] = ratio_dta;
+        }
+    }
+
+    __syncthreads();
+
+    // caculate bot diff
+    FLOAT prv_bot_diff[VERT_OUT_PIX][HORIZ_OUT_PIX];
+    for(int j = 0; j < VERT_OUT_PIX; ++j)
+    {
+        const int v_off_v = (lcl_id1 * VERT_OUT_PIX + j);
+        const int hstart  = y + v_off_v - PAD;
+        const int hend    = min(hstart + KERNEL_SIZE, TOP_HEIGHT + PRE_PAD);
+
+        // value offset, vertical
+        int lcl_v_off_v = (v_off_v + PAD) * local_data_width;
+        for(int i = 0; i < HORIZ_OUT_PIX; i++)
+        {
+            FLOAT prv_ratio_accum(0);
+            const int v_off_h = lcl_id0 * HORIZ_OUT_PIX + i;
+
+            const int wstart        = x + v_off_h - PAD;
+            const int wend          = min(wstart + KERNEL_SIZE, TOP_WIDTH + PRE_PAD);
+            const int adj_area_size = (hend - hstart) * (wend - wstart);
+
+            // accum offset, horiz
+            const int lcl_a_off_h = v_off_h;
+            //	value offset, horiz
+            const int lcl_v_off_h = lcl_a_off_h + PAD;
+            for(int k = 0; k < KERNEL_SIZE; k++)
+            {
+                for(int l = 0; l < KERNEL_SIZE; l++)
+                {
+                    prv_ratio_accum +=
+                        ratio_data[(v_off_v + k) * local_data_width + lcl_a_off_h + l];
+                }
+            }
+
+            const FLOAT top_df_val  = top_df_data[lcl_v_off_v + lcl_v_off_h];
+            const unsigned bot_off0 = BOT_BATCH_STRIDE * b + BOT_CHANNEL_STRIDE * o +
+                                      BOT_STRIDE * (y + v_off_v) + x + v_off_h;
+
+            const unsigned bot_off = (y + v_off_v < BOT_HEIGHT && x + v_off_h < BOT_WIDTH &&
+                                      b < BATCH_SIZE && o < OUT_CHANNELS)
+                                         ? bot_off0
+                                         : BATCH_SIZE * BOT_BATCH_STRIDE - 1;
+
+            const FLOAT bot_dta         = (y + v_off_v < BOT_HEIGHT && x + v_off_h < BOT_WIDTH &&
+                                   b < BATCH_SIZE && o < OUT_CHANNELS)
+                                              ? bot[bot_off]
+                                              : 0;
+            const FLOAT adj_ratio       = FLOAT(2.f) * alpha * beta / adj_area_size;
+            const FLOAT prv_accum_ratio = adj_ratio * bot_dta * prv_ratio_accum;
+            prv_bot_diff[j][i]          = prv_exp_scale[j][i] * top_df_val - prv_accum_ratio;
+        }
+    }
+
+    for(int j = 0; j < VERT_OUT_PIX; j++)
+    {
+        for(int i = 0; i < HORIZ_OUT_PIX; i++)
+        {
+            if(bot_y + j < BOT_HEIGHT && bot_x + i < BOT_WIDTH && b < BATCH_SIZE &&
+               o < OUT_CHANNELS)
+            {
+                bot_df[BOTDF_BATCH_STRIDE * b + BOTDF_CHANNEL_STRIDE * o +
+                       BOTDF_STRIDE * (bot_y + j) + bot_x + i] = prv_bot_diff[j][i];
+            }
+        }
+    }
+}
+
+__launch_bounds__(GROUP_SIZE_X* GROUP_SIZE_Y* group_size_z) extern "C" __global__
+    void MIOpenLRNAcrossChannelsBwd1(const FLOAT* top,
+                                     const FLOAT* bot,
+                                     const FLOAT* top_df,
+                                     const FLOAT* scale,
+                                     FLOAT* bot_df,
+                                     FLOAT ratio,
+                                     FLOAT /* alpha */,
+                                     FLOAT beta)
+{
+    const int x       = blockIdx.x * GROUP_SIZE_X + threadIdx.x; // channel x
+    const int y       = blockIdx.y * GROUP_SIZE_Y + threadIdx.y; // channel y
+    const int b       = blockIdx.z * group_size_z + threadIdx.z; // batch
+    FLOAT accum_ratio = 0;
+    FLOAT top_df_in[KERNEL_SIZE];
+    FLOAT scale_in[KERNEL_SIZE];
+    FLOAT ratio_dta[KERNEL_SIZE];
+    int c_i = 0, c_o = 0;
+
+    for(; c_i < PRE_PAD; c_i++)
+    {
+        top_df_in[c_i] =
+            top_df[TOPDF_BATCH_STRIDE * b + TOPDF_CHANNEL_STRIDE * c_i + TOPDF_STRIDE * y + x];
+        scale_in[c_i] =
+            scale[SCALE_BATCH_STRIDE * b + SCALE_CHANNEL_STRIDE * c_i + SCALE_STRIDE * y + x];
+        const FLOAT top_dta =
+            top[TOP_BATCH_STRIDE * b + TOP_CHANNEL_STRIDE * c_i + TOP_STRIDE * y + x];
+
+        ratio_dta[c_i] = (top_df_in[c_i] * top_dta) / scale_in[c_i];
+
+        if constexpr(low_channel_count)
+        {
+            ratio_dta[c_i] = (c_i < OUT_CHANNELS) ? ratio_dta[c_i] : 0;
+        }
+
+        accum_ratio = accum_ratio + ratio_dta[c_i];
+    }
+
+    for(; c_i < KERNEL_SIZE; c_i++, c_o++)
+    {
+        top_df_in[c_i] =
+            top_df[TOPDF_BATCH_STRIDE * b + TOPDF_CHANNEL_STRIDE * c_i + TOPDF_STRIDE * y + x];
+        scale_in[c_i] =
+            scale[SCALE_BATCH_STRIDE * b + SCALE_CHANNEL_STRIDE * c_i + SCALE_STRIDE * y + x];
+        const FLOAT top_dta =
+            top[TOP_BATCH_STRIDE * b + TOP_CHANNEL_STRIDE * c_i + TOP_STRIDE * y + x];
+        ratio_dta[c_i] = (top_df_in[c_i] * top_dta) / scale_in[c_i];
+        if constexpr(low_channel_count)
+        {
+            ratio_dta[c_i] = (c_i < OUT_CHANNELS) ? ratio_dta[c_i] : 0;
+        }
+
+        accum_ratio = accum_ratio + ratio_dta[c_i];
+        if(!low_channel_count || c_o < N_INPUTS)
+        {
+            const FLOAT bot_dta =
+                bot[BOT_BATCH_STRIDE * b + BOT_CHANNEL_STRIDE * c_o + BOT_STRIDE * y + x];
+
+            const FLOAT prv_scale       = scale_in[c_o];
+            const FLOAT prv_scale_log   = miopen::detail::log(prv_scale);
+            const FLOAT exp_scale       = miopen::detail::exp(-beta * prv_scale_log);
+            const FLOAT prv_accum_ratio = ratio * bot_dta * accum_ratio;
+
+            const FLOAT out_val = top_df_in[c_o] * exp_scale - prv_accum_ratio;
+
+            const int bot_df_off =
+                BOTDF_BATCH_STRIDE * b + BOTDF_CHANNEL_STRIDE * c_o + BOTDF_STRIDE * y + x;
+
+            bot_df[bot_df_off] = out_val;
+        }
+    }
+
+    for(; c_i < N_INPUTS; c_i++, c_o++)
+    {
+        const FLOAT prv_top_df_in =
+            top_df[TOPDF_BATCH_STRIDE * b + TOPDF_CHANNEL_STRIDE * c_i + TOPDF_STRIDE * y + x];
+        const FLOAT prv_scale_in =
+            scale[SCALE_BATCH_STRIDE * b + SCALE_CHANNEL_STRIDE * c_i + SCALE_STRIDE * y + x];
+        const FLOAT top_dta =
+            top[TOP_BATCH_STRIDE * b + TOP_CHANNEL_STRIDE * c_i + TOP_STRIDE * y + x];
+        FLOAT prv_ratio_dta = prv_top_df_in * top_dta / prv_scale_in;
+        if constexpr(low_channel_count)
+        {
+            prv_ratio_dta = (c_i < OUT_CHANNELS) ? prv_ratio_dta : 0;
+        }
+
+        accum_ratio = accum_ratio + prv_ratio_dta;
+        accum_ratio = accum_ratio - ratio_dta[0];
+
+        for(int i = 0; i < KERNEL_SIZE - 1; i++)
+        {
+            top_df_in[i] = top_df_in[i + 1];
+            scale_in[i]  = scale_in[i + 1];
+            ratio_dta[i] = ratio_dta[i + 1];
+        }
+
+        top_df_in[KERNEL_SIZE - 1] = prv_top_df_in;
+        scale_in[KERNEL_SIZE - 1]  = prv_scale_in;
+        ratio_dta[KERNEL_SIZE - 1] = prv_ratio_dta;
+
+        if(!low_channel_count || c_o < N_INPUTS)
+        {
+            const FLOAT bot_dta =
+                bot[BOT_BATCH_STRIDE * b + BOT_CHANNEL_STRIDE * c_o + BOT_STRIDE * y + x];
+
+            const FLOAT prv_scale = scale_in[PAD];
+            const FLOAT exp_scale = miopen::detail::exp(-beta * miopen::detail::log(prv_scale));
+            const FLOAT prv_accum_ratio = ratio * bot_dta * accum_ratio;
+            const FLOAT out_val         = top_df_in[PAD] * exp_scale - prv_accum_ratio;
+
+            const int bot_df_off =
+                BOTDF_BATCH_STRIDE * b + BOTDF_CHANNEL_STRIDE * c_o + BOTDF_STRIDE * y + x;
+
+            bot_df[bot_df_off] = out_val;
+        }
+    }
+
+    for(; c_i < N_INPUTS + PRE_PAD; c_i++, c_o++)
+    {
+        accum_ratio = accum_ratio - ratio_dta[0];
+
+        for(int i = 0; i < KERNEL_SIZE - 1; i++)
+        {
+            top_df_in[i] = top_df_in[i + 1];
+            scale_in[i]  = scale_in[i + 1];
+            ratio_dta[i] = ratio_dta[i + 1];
+        }
+
+        if(!low_channel_count || c_o < N_INPUTS)
+        {
+            const FLOAT bot_dta =
+                bot[BOT_BATCH_STRIDE * b + BOT_CHANNEL_STRIDE * c_o + BOT_STRIDE * y + x];
+
+            const FLOAT prv_scale = scale_in[PAD];
+            const FLOAT exp_scale = miopen::detail::exp(-beta * miopen::detail::log(prv_scale));
+            const FLOAT prv_accum_ratio = ratio * bot_dta * accum_ratio;
+            const FLOAT out_val         = top_df_in[PAD] * exp_scale - prv_accum_ratio;
+
+            const int bot_df_off =
+                BOTDF_BATCH_STRIDE * b + BOTDF_CHANNEL_STRIDE * c_o + BOTDF_STRIDE * y + x;
+
+            bot_df[bot_df_off] = out_val;
+        }
+    }
+}
diff --git a/projects/miopen/src/kernels/MIOpenLRNFwd.cl b/projects/miopen/src/kernels/MIOpenLRNFwd.cl
deleted file mode 100644
index 093b86f6dd08..000000000000
--- a/projects/miopen/src/kernels/MIOpenLRNFwd.cl
+++ /dev/null
@@ -1,713 +0,0 @@
-/*******************************************************************************
- *
- * MIT License
- *
- * Copyright (c) 2017 Advanced Micro Devices, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- *******************************************************************************/
-#define PPCAT_NX(A, B) A##B
-#define PPCAT(A, B) PPCAT_NX(A, B)
-#define TWO 2
-#define THREE 3
-#define FOUR 4
-#define EIGHT 8
-
-#if MIOPEN_USE_FP16 == 1
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-#define _FLOAT half
-#endif
-#if MIOPEN_USE_FP32 == 1
-#define _FLOAT float
-#endif
-
-#define _FLOAT2 PPCAT(_FLOAT, TWO)
-#define _FLOAT3 PPCAT(_FLOAT, THREE)
-#define _FLOAT4 PPCAT(_FLOAT, FOUR)
-#define _FLOAT8 PPCAT(_FLOAT, EIGHT)
-
-#define DBG_OUT 0
-
-#define UNUSED __attribute__((__unused__))
-
-#define MLO_LRN_GROUP_SZ2 1
-#define MLO_LRN_STRIDE 1
-
-#define MLO_LRN_LEFT_PAD0 (((MLO_LRN_PRE_PAD0 + MLO_READ_UNIT - 1) / MLO_READ_UNIT) * MLO_READ_UNIT)
-#define MLO_LRN_RIGHT_SIDE                                                               \
-    (((MLO_LRN_GROUP_SZ0 * MLO_LRN_N_HORIZ_OUT_PIX + MLO_LRN_PAD0 + MLO_READ_UNIT - 1) / \
-      MLO_READ_UNIT) *                                                                   \
-     MLO_READ_UNIT)
-#define MLO_LRN_LCL_DATA_WIDTH (MLO_LRN_LEFT_PAD0 + MLO_LRN_RIGHT_SIDE)
-#define MLO_LCL_READ4 (MLO_LRN_LCL_DATA_WIDTH / MLO_READ_UNIT)
-#define MLO_LRN_LCL_DATA_HEIGHT (MLO_LRN_GROUP_SZ1 * MLO_LRN_N_VERT_OUT_PIX + MLO_LRN_KERNEL_SZ - 1)
-#define MLO_LRN_GROUP_SZ (MLO_LRN_GROUP_SZ2 * MLO_LRN_GROUP_SZ1 * MLO_LRN_GROUP_SZ0)
-// #define MLO_LRN_PREPAD_SZ (MLO_LRN_KERNEL_SZ - 1)/2
-
-struct LRNForwardParam
-{
-    _FLOAT alphaoverarea;
-    _FLOAT alpha;
-    _FLOAT beta;
-    _FLOAT K;
-};
-
-#include "math_ops.h"
-__attribute__((reqd_work_group_size(MLO_LRN_GROUP_SZ0, MLO_LRN_GROUP_SZ1, MLO_LRN_GROUP_SZ2)))
-__kernel void
-MIOpenLRNWithinChannel_PS(const __global _FLOAT* bot,
-                          __global _FLOAT* top,
-#if MLO_LRN_DO_SCALE
-                          __global _FLOAT* scale,
-#endif
-                          _FLOAT alphaoverarea,
-                          UNUSED _FLOAT alpha,
-                          _FLOAT beta,
-                          _FLOAT K)
-{
-    // IT's taken from POOLING AVE with stride = 1'
-    __local _FLOAT bot_data[MLO_LRN_LCL_DATA_WIDTH * MLO_LRN_LCL_DATA_HEIGHT];
-    int x       = get_group_id(0) * MLO_LRN_GROUP_SZ0 * MLO_LRN_N_HORIZ_OUT_PIX;
-    int y       = get_group_id(1) * MLO_LRN_GROUP_SZ1 * MLO_LRN_N_VERT_OUT_PIX;
-    int lcl_id0 = get_local_id(0);
-    int lcl_id1 = get_local_id(1);
-    int ob      = get_global_id(2); // output * batch_sz
-    int o       = iDiv_legacy(ob, MLO_LRN_BATCH_SZ);
-    int b       = iMod(ob, o, MLO_LRN_BATCH_SZ);
-    int bot_x   = x;
-    int bot_y   = y;
-    int bot_off = b * MLO_LRN_BOT_BATCH_STRIDE + o * MLO_LRN_BOT_CHANNEL_STRIDE;
-
-    // load tile
-    for(int b_j = lcl_id1; b_j < MLO_LRN_LCL_DATA_HEIGHT; b_j += MLO_LRN_GROUP_SZ1)
-    {
-        int bot_y_act = bot_y + b_j - MLO_LRN_PRE_PAD1;
-
-        bool invisibleY = (bot_y_act < 0) || (bot_y_act >= MLO_LRN_BOT_HEIGHT);
-
-        int bot_y_off = bot_y_act * MLO_LRN_BOT_STRIDE;
-
-        int lcl_off_v = mul24(b_j, (int)MLO_LRN_LCL_DATA_WIDTH);
-
-        for(int b_i = lcl_id0; b_i < MLO_LCL_READ4; b_i += MLO_LRN_GROUP_SZ0)
-        {
-
-            int bot_x_act = bot_x + (b_i * MLO_READ_UNIT) - MLO_LRN_LEFT_PAD0;
-
-            bool invisibleX;
-            for(int i = 0; i < MLO_READ_UNIT; ++i)
-            {
-
-                int bot_off_x = bot_off + bot_y_off + bot_x_act + i;
-
-                invisibleX = (bot_x_act + i < 0) || (bot_x_act + i >= MLO_LRN_BOT_WIDTH);
-
-                bot_off_x = (invisibleX || invisibleY) ? 0 : bot_off_x;
-
-                _FLOAT bot_val = bot[bot_off_x];
-
-                bot_val = (invisibleX || invisibleY) ? 0 : bot_val;
-
-                bot_data[lcl_off_v + (b_i * MLO_READ_UNIT) + i] = bot_val;
-            }
-        }
-    }
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-#if MLO_LRN_N_HORIZ_OUT_PIX > 1
-    _FLOAT partial_sum_x[MLO_LRN_N_HORIZ_OUT_PIX - 1]; // horizontal partial sum
-#endif
-#if MLO_LRN_N_VERT_OUT_PIX > 1
-    _FLOAT partial_sum_xy[MLO_LRN_N_VERT_OUT_PIX - 1]
-                         [MLO_LRN_N_HORIZ_OUT_PIX]; // horizontal-vertical partial sums.
-#endif
-    _FLOAT accum[MLO_LRN_N_VERT_OUT_PIX][MLO_LRN_N_HORIZ_OUT_PIX]; // accumulator
-
-    int top_y = mad24(lcl_id1, (int)MLO_LRN_N_VERT_OUT_PIX, y);
-    int top_x = mad24(lcl_id0, (int)MLO_LRN_N_HORIZ_OUT_PIX, x);
-
-    int lcl_y = mul24(lcl_id1, (int)MLO_LRN_N_VERT_OUT_PIX);
-    int lcl_x =
-        mad24(lcl_id0, (int)(MLO_LRN_N_HORIZ_OUT_PIX), (int)(MLO_LRN_LEFT_PAD0 - MLO_LRN_PRE_PAD0));
-    int lcl_off = mad24(lcl_y, MLO_LRN_LCL_DATA_WIDTH, lcl_x);
-
-    for(int j = 0; j < MLO_LRN_N_VERT_OUT_PIX; ++j)
-    {
-        for(int i = 0; i < MLO_LRN_N_HORIZ_OUT_PIX; ++i)
-        {
-            accum[j][i] = 0;
-        }
-    }
-#if MLO_LRN_N_VERT_OUT_PIX > 1
-    for(int j = 0; j < MLO_LRN_N_VERT_OUT_PIX - 1; ++j)
-    {
-        for(int i = 0; i < MLO_LRN_N_HORIZ_OUT_PIX; ++i)
-        {
-            partial_sum_xy[j][i] = 0;
-        }
-    }
-#endif
-
-    // running window  summation
-    _FLOAT mov_accum;
-    int jj = 0;
-    int ii = 0;
-
-    // first to get vertica partial sums
-
-#if MLO_LRN_N_VERT_OUT_PIX > 1
-    for(; jj < (int)(MLO_LRN_N_VERT_OUT_PIX - 1); ++jj)
-    {
-        for(ii = 0; ii < (int)(MLO_LRN_N_HORIZ_OUT_PIX - 1); ++ii)
-        {
-
-            _FLOAT bot_val = bot_data[lcl_off + jj * MLO_LRN_LCL_DATA_WIDTH + ii];
-
-            _FLOAT accum_tmp = bot_val * bot_val;
-
-#if MLO_LRN_N_HORIZ_OUT_PIX > 1
-            // save horizontal partial sums
-            partial_sum_x[ii] = accum_tmp;
-#endif
-            // accumulate in vert-horizontal(0)
-            partial_sum_xy[jj][0] += accum_tmp;
-        }
-
-        for(; ii < (int)MLO_LRN_KERNEL_SZ0; ++ii)
-        {
-
-            _FLOAT bot_val   = bot_data[lcl_off + jj * MLO_LRN_LCL_DATA_WIDTH + ii];
-            _FLOAT accum_tmp = bot_val * bot_val;
-            // accumulate in vert horizontal(0)
-            partial_sum_xy[jj][0] += accum_tmp;
-        }
-
-        // running horizontal window
-
-        for(; ii < (int)(MLO_LRN_KERNEL_SZ0 + MLO_LRN_N_HORIZ_OUT_PIX - 1); ++ii)
-        {
-
-            _FLOAT bot_val   = bot_data[lcl_off + jj * MLO_LRN_LCL_DATA_WIDTH + ii];
-            _FLOAT accum_tmp = bot_val * bot_val;
-            // calculate all vertical-horizontal partial sums
-            partial_sum_xy[jj][ii - MLO_LRN_KERNEL_SZ0 + 1] =
-                partial_sum_xy[jj][ii - MLO_LRN_KERNEL_SZ0] +
-                (accum_tmp
-#if MLO_LRN_N_HORIZ_OUT_PIX > 1
-                 - partial_sum_x[ii - MLO_LRN_KERNEL_SZ0]
-#endif
-                );
-        }
-
-        // put into accumulator[0][i]
-        // whatever has been accumulated so far
-        for(int i = 0; i < MLO_LRN_N_HORIZ_OUT_PIX; ++i)
-        {
-            accum[0][i] += partial_sum_xy[jj][i];
-        }
-    }
-#endif
-
-    // calculate row 0 accumulators
-    for(; jj < (int)MLO_LRN_KERNEL_SZ1; ++jj)
-    {
-        mov_accum = 0;
-
-        for(ii = 0; ii < (int)(MLO_LRN_N_HORIZ_OUT_PIX - 1); ++ii)
-        {
-
-            _FLOAT bot_val   = bot_data[lcl_off + jj * MLO_LRN_LCL_DATA_WIDTH + ii];
-            _FLOAT accum_tmp = bot_val * bot_val;
-#if MLO_LRN_N_HORIZ_OUT_PIX > 1
-            partial_sum_x[ii] = accum_tmp;
-#endif
-            mov_accum += accum_tmp;
-        }
-
-        for(; ii < (int)MLO_LRN_KERNEL_SZ0; ++ii)
-        {
-
-            _FLOAT bot_val   = bot_data[lcl_off + jj * MLO_LRN_LCL_DATA_WIDTH + ii];
-            _FLOAT accum_tmp = bot_val * bot_val;
-            mov_accum += accum_tmp;
-        }
-
-        accum[0][0] += mov_accum;
-        // running horizontal window
-
-        for(; ii < (int)(MLO_LRN_KERNEL_SZ0 + MLO_LRN_N_HORIZ_OUT_PIX - 1); ++ii)
-        {
-
-            _FLOAT bot_val   = bot_data[lcl_off + jj * MLO_LRN_LCL_DATA_WIDTH + ii];
-            _FLOAT accum_tmp = bot_val * bot_val;
-            // running horizontal window
-            mov_accum += (accum_tmp
-#if MLO_LRN_N_HORIZ_OUT_PIX > 1
-                          - partial_sum_x[ii - MLO_LRN_KERNEL_SZ0]
-#endif
-            );
-            accum[0][ii - MLO_LRN_KERNEL_SZ0 + 1] += mov_accum;
-        }
-    }
-
-    // accumulate all other rows besides 0
-    for(; jj < (int)(MLO_LRN_KERNEL_SZ1 + MLO_LRN_N_VERT_OUT_PIX - 1); ++jj)
-    {
-        // first running horizontal winodw as before
-        mov_accum = 0;
-        for(ii = 0; ii < (int)(MLO_LRN_N_HORIZ_OUT_PIX - 1); ++ii)
-        {
-
-            _FLOAT bot_val   = bot_data[lcl_off + jj * MLO_LRN_LCL_DATA_WIDTH + ii];
-            _FLOAT accum_tmp = bot_val * bot_val;
-#if MLO_LRN_N_HORIZ_OUT_PIX > 1
-            partial_sum_x[ii] = accum_tmp;
-#endif
-            accum[jj - MLO_LRN_KERNEL_SZ1 + 1][0] += accum_tmp;
-        }
-        for(; ii < (int)MLO_LRN_KERNEL_SZ0; ++ii)
-        {
-
-            _FLOAT bot_val   = bot_data[lcl_off + jj * MLO_LRN_LCL_DATA_WIDTH + ii];
-            _FLOAT accum_tmp = bot_val * bot_val;
-            accum[jj - MLO_LRN_KERNEL_SZ1 + 1][0] += accum_tmp;
-        }
-        // running horizontal window
-
-        int ii1 = ii;
-        for(; ii < (int)(MLO_LRN_KERNEL_SZ0 + MLO_LRN_N_HORIZ_OUT_PIX - 1); ++ii)
-        {
-
-            _FLOAT bot_val   = bot_data[lcl_off + jj * MLO_LRN_LCL_DATA_WIDTH + ii];
-            _FLOAT accum_tmp = bot_val * bot_val;
-            //
-            accum[jj - MLO_LRN_KERNEL_SZ1 + 1][ii - MLO_LRN_KERNEL_SZ0 + 1] =
-                accum[jj - MLO_LRN_KERNEL_SZ1 + 1][ii - MLO_LRN_KERNEL_SZ0] + accum_tmp;
-#if MLO_LRN_N_HORIZ_OUT_PIX > 1
-            accum[jj - MLO_LRN_KERNEL_SZ1 + 1][ii - MLO_LRN_KERNEL_SZ0 + 1] -=
-                partial_sum_x[ii - MLO_LRN_KERNEL_SZ0];
-#endif
-        }
-
-        // finally running vertical window
-
-        for(ii = ii1; ii < (int)(MLO_LRN_KERNEL_SZ0 + MLO_LRN_N_HORIZ_OUT_PIX - 1); ++ii)
-        {
-
-            // finish horizontal summation
-            // add/substarct vertical patial sum
-            accum[jj - MLO_LRN_KERNEL_SZ1 + 1][ii - MLO_LRN_KERNEL_SZ0 + 1] +=
-                accum[jj - MLO_LRN_KERNEL_SZ1][ii - MLO_LRN_KERNEL_SZ0 + 1];
-#if MLO_LRN_N_VERT_OUT_PIX > 1
-            accum[jj - MLO_LRN_KERNEL_SZ1 + 1][ii - MLO_LRN_KERNEL_SZ0 + 1] -=
-                partial_sum_xy[jj - MLO_LRN_KERNEL_SZ1][ii - MLO_LRN_KERNEL_SZ0 + 1];
-#endif
-        }
-#if MLO_LRN_N_VERT_OUT_PIX > 1
-        accum[jj - MLO_LRN_KERNEL_SZ1 + 1][0] -= partial_sum_xy[jj - MLO_LRN_KERNEL_SZ1][0];
-#endif
-        accum[jj - MLO_LRN_KERNEL_SZ1 + 1][0] += accum[jj - MLO_LRN_KERNEL_SZ1][0];
-    }
-
-    // normalization
-    _FLOAT prv_scale[MLO_LRN_N_VERT_OUT_PIX][MLO_LRN_N_HORIZ_OUT_PIX];
-    _FLOAT adj_alphaoverarea = alphaoverarea;
-    for(int k = 0; k < MLO_LRN_N_VERT_OUT_PIX; k++)
-    {
-
-        //			int hstart = y + lcl_id1 * MLO_LRN_N_VERT_OUT_PIX  + k -
-        // MLO_LRN_PAD1;
-        //			int hend = min(hstart + MLO_LRN_KERNEL_SZ, MLO_LRN_BOT_HEIGHT +
-        // MLO_LRN_PAD1);
-
-        for(int l = 0; l < MLO_LRN_N_HORIZ_OUT_PIX; l++)
-        {
-
-            //				int wstart = x + lcl_id0 * MLO_LRN_N_HORIZ_OUT_PIX + l -
-            // MLO_LRN_PAD0;
-            //				int wend = min(wstart + MLO_LRN_KERNEL_SZ, MLO_LRN_BOT_WIDTH
-            //+
-            // MLO_LRN_PAD0);
-            //				int adj_area_size = (hend - hstart) * (wend - wstart);
-            //				adj_alphaoverarea = alpha / adj_area_size;
-
-            prv_scale[k][l] = K + accum[k][l] * adj_alphaoverarea;
-        }
-    }
-
-    int top_off = b * MLO_LRN_TOP_BATCH_STRIDE + o * MLO_LRN_TOP_CHANNEL_STRIDE +
-                  top_y * MLO_LRN_TOP_STRIDE + top_x;
-#if MLO_LRN_DO_SCALE
-    int scale_off = b * MLO_LRN_SCALE_BATCH_STRIDE + o * MLO_LRN_SCALE_CHANNEL_STRIDE +
-                    top_y * MLO_LRN_SCALE_STRIDE + top_x;
-#endif
-
-    // final output
-
-    for(int k = 0; k < MLO_LRN_N_VERT_OUT_PIX
-#if MLO_OUT_VERT_ALIGNED == 0
-                   && (top_y + k < MLO_LRN_TOP_HEIGHT)
-#endif
-            ;
-        k++)
-    {
-        for(int l = 0; l < MLO_LRN_N_HORIZ_OUT_PIX
-#if MLO_OUT_HORIZ_ALIGNED == 0
-                       && (top_x + l < MLO_LRN_TOP_WIDTH)
-#endif
-                ;
-            l++)
-        {
-            _FLOAT s;
-            s = exp((_FLOAT)-beta * log(prv_scale[k][l]));
-            //					s = pow(prv_scale[k][l], -beta);
-            _FLOAT bot_val = bot_data[lcl_off + mad24((k + MLO_LRN_PRE_PAD1),
-                                                      (int)MLO_LRN_LCL_DATA_WIDTH,
-                                                      (l + MLO_LRN_PRE_PAD0))];
-#if MLO_LRN_DO_SCALE
-            scale[scale_off + k * MLO_LRN_SCALE_STRIDE + l] = prv_scale[k][l];
-#endif
-            top[top_off + k * MLO_LRN_TOP_STRIDE + l] = bot_val * s;
-        }
-    }
-}
-
-#if(MLO_LRN_N_INPUTS < MLO_LRN_KERNEL_SZ)
-#define MLO_LOW_CHNL_COUNT 1
-#else
-#define MLO_LOW_CHNL_COUNT 0
-#endif
-__attribute__((reqd_work_group_size(MLO_LRN_GROUP_SZ0, MLO_LRN_GROUP_SZ1, MLO_LRN_GROUP_SZ2)))
-__kernel void
-MIOpenLRNAcrossChannels4(const __global _FLOAT* bottom,
-                         __global _FLOAT* top,
-#if MLO_LRN_DO_SCALE
-                         __global _FLOAT* scale,
-#endif
-                         _FLOAT alphaoverarea,
-                         UNUSED _FLOAT alpha,
-                         _FLOAT beta,
-                         _FLOAT K)
-{
-    int pix_id          = get_global_id(0); //
-    int b               = get_global_id(2); // batch
-    MLO_READ_TYPE accum = 0;
-    MLO_READ_TYPE bot_in2[MLO_LRN_KERNEL_SZ];
-    MLO_READ_TYPE bot_in[MLO_LRN_KERNEL_SZ];
-    int c_i = 0, c_o = 0;
-    for(int i = 0; i < MLO_LRN_KERNEL_SZ; ++i)
-    {
-        bot_in2[i] = 0;
-        bot_in[i]  = 0;
-    }
-
-    int top_off = 0;
-#if MLO_LRN_DO_SCALE
-    int scale_off;
-#endif
-
-    for(c_i = 0; c_i < MLO_LRN_PAD; c_i++)
-    {
-        MLO_READ_TYPE prv_in;
-        prv_in = 0;
-
-#if MLO_LOW_CHNL_COUNT == 1
-        if(c_i < MLO_LRN_N_INPUTS)
-#endif
-        {
-#if MLO_C1x1_PIXLEFT > 0
-            // if the last one
-            if(pix_id == MLO_MAP_SZ4 - 1)
-            {
-
-                for(int j = 0; j < MLO_C1x1_PIXLEFT; ++j)
-                {
-                    ((_FLOAT*)&prv_in)[j] =
-                        bottom[MLO_LRN_BOT_BATCH_STRIDE * b + MLO_LRN_BOT_CHANNEL_STRIDE * c_i +
-                               (pix_id * MLO_READ_UNIT) + j];
-                }
-            }
-            else
-#endif
-            {
-                prv_in = *(__global MLO_READ_TYPE*)&bottom[MLO_LRN_BOT_BATCH_STRIDE * b +
-                                                           MLO_LRN_BOT_CHANNEL_STRIDE * c_i +
-                                                           (pix_id * MLO_READ_UNIT)];
-            }
-        }
-
-        bot_in2[c_i] = prv_in * prv_in;
-        bot_in[c_i]  = prv_in;
-        accum        = accum + bot_in2[c_i];
-        //				fma(bot_in2[c_i + MLO_LRN_PAD], bot_in2[c_i + MLO_LRN_PAD],
-        // accum);
-    }
-
-    for(; c_i < MLO_LRN_KERNEL_SZ; c_i++, c_o++)
-    {
-        MLO_READ_TYPE prv_in;
-        prv_in = 0;
-
-#if MLO_LOW_CHNL_COUNT == 1
-        if(c_i < MLO_LRN_N_INPUTS)
-#endif
-        {
-
-#if MLO_C1x1_PIXLEFT > 0
-            // if the last one
-            if(pix_id == MLO_MAP_SZ4 - 1)
-            {
-
-                for(int j = 0; j < MLO_C1x1_PIXLEFT; ++j)
-                {
-                    ((_FLOAT*)&prv_in)[j] =
-                        bottom[MLO_LRN_BOT_BATCH_STRIDE * b + MLO_LRN_BOT_CHANNEL_STRIDE * c_i +
-                               (pix_id * MLO_READ_UNIT) + j];
-                }
-            }
-            else
-#endif
-            {
-                prv_in = *(__global MLO_READ_TYPE*)&bottom[MLO_LRN_BOT_BATCH_STRIDE * b +
-                                                           MLO_LRN_BOT_CHANNEL_STRIDE * c_i +
-                                                           (pix_id * MLO_READ_UNIT)];
-            }
-        }
-
-        bot_in2[c_i] = prv_in * prv_in;
-        bot_in[c_i]  = prv_in;
-        accum        = accum + bot_in2[c_i];
-
-        top_off = b * MLO_LRN_TOP_BATCH_STRIDE + c_o * MLO_LRN_TOP_CHANNEL_STRIDE +
-                  (pix_id * MLO_READ_UNIT);
-#if MLO_LRN_DO_SCALE
-        scale_off = b * MLO_LRN_SCALE_BATCH_STRIDE + c_o * MLO_LRN_SCALE_CHANNEL_STRIDE +
-                    (pix_id * MLO_READ_UNIT);
-#endif
-        MLO_READ_TYPE prv_scale = ((MLO_READ_TYPE)K + accum * (MLO_READ_TYPE)alphaoverarea);
-        //				fma(accum,alphaoverarea, (_FLOAT)1.f);
-
-        MLO_READ_TYPE exp_scale = exp((MLO_READ_TYPE)-beta * log(prv_scale));
-        //				pow(prv_scale,-beta);
-        // bug
-        //	MLO_READ_TYPE prv_out = sqrt(bot_in2[c_o]);
-        MLO_READ_TYPE prv_out = bot_in[c_o];
-        MLO_READ_TYPE out_val = prv_out * exp_scale;
-#if MLO_LOW_CHNL_COUNT == 1
-        if(c_o < MLO_LRN_N_OUTPUTS)
-#endif
-        {
-
-#if MLO_C1x1_PIXLEFT > 0
-
-            // if the last one
-            if(pix_id == MLO_MAP_SZ4 - 1)
-            {
-                for(int j = 0; j < MLO_C1x1_PIXLEFT; ++j)
-                {
-                    top[top_off + j] = ((_FLOAT*)&out_val)[j];
-#if DBG_OUT
-                    printf("K:o0: %d %f %f %f %f %f\n",
-                           top_off + j,
-                           top[top_off + j],
-                           ((_FLOAT*)&prv_out)[j],
-                           ((_FLOAT*)&exp_scale)[j],
-                           ((_FLOAT*)&prv_scale)[j],
-                           ((_FLOAT*)&accum)[j]);
-#endif
-
-#if MLO_LRN_DO_SCALE
-                    scale[scale_off + j] = ((_FLOAT*)&prv_scale)[j];
-#endif
-                }
-            }
-            else
-#endif
-            {
-
-                *((__global MLO_READ_TYPE*)&top[top_off]) = out_val;
-#if MLO_LRN_DO_SCALE
-                *((__global MLO_READ_TYPE*)&scale[scale_off]) = prv_scale;
-#endif
-            }
-        }
-    }
-
-    for(; c_i < MLO_LRN_N_INPUTS; c_i++, c_o++)
-    {
-
-        MLO_READ_TYPE prv_in;
-        prv_in = 0;
-
-#if MLO_C1x1_PIXLEFT > 0
-        // if the last one
-        if(pix_id == MLO_MAP_SZ4 - 1)
-        {
-
-            for(int j = 0; j < MLO_C1x1_PIXLEFT; ++j)
-            {
-                ((_FLOAT*)&prv_in)[j] =
-                    bottom[MLO_LRN_BOT_BATCH_STRIDE * b + MLO_LRN_BOT_CHANNEL_STRIDE * c_i +
-                           (pix_id * MLO_READ_UNIT) + j];
-            }
-        }
-        else
-#endif
-        {
-            prv_in = *(__global MLO_READ_TYPE*)&bottom[MLO_LRN_BOT_BATCH_STRIDE * b +
-                                                       MLO_LRN_BOT_CHANNEL_STRIDE * c_i +
-                                                       (pix_id * MLO_READ_UNIT)];
-        }
-
-        MLO_READ_TYPE prv_bot_in2 = prv_in * prv_in;
-        accum                     = accum + prv_bot_in2;
-
-        accum = accum - bot_in2[0];
-        //				fma(-bot_in2[0], bot_in2[0], accum);
-
-        for(int i = 0; i < MLO_LRN_KERNEL_SZ - 1; i++)
-        {
-            bot_in2[i] = bot_in2[i + 1];
-            bot_in[i]  = bot_in[i + 1];
-        }
-
-        bot_in2[MLO_LRN_KERNEL_SZ - 1] = prv_bot_in2;
-        bot_in[MLO_LRN_KERNEL_SZ - 1]  = prv_in;
-
-        top_off = b * MLO_LRN_TOP_BATCH_STRIDE + c_o * MLO_LRN_TOP_CHANNEL_STRIDE +
-                  (pix_id * MLO_READ_UNIT);
-#if MLO_LRN_DO_SCALE
-        scale_off = b * MLO_LRN_SCALE_BATCH_STRIDE + c_o * MLO_LRN_SCALE_CHANNEL_STRIDE +
-                    (pix_id * MLO_READ_UNIT);
-#endif
-        MLO_READ_TYPE prv_scale = ((MLO_READ_TYPE)K + accum * (MLO_READ_TYPE)alphaoverarea);
-        //				fma(accum,alphaoverarea, (_FLOAT)1.f);
-
-        MLO_READ_TYPE exp_scale = exp((MLO_READ_TYPE)-beta * log(prv_scale));
-        //				pow(prv_scale,-beta);
-        // bug
-        //			MLO_READ_TYPE prv_out = sqrt(bot_in2[MLO_LRN_PRE_PAD]);
-        MLO_READ_TYPE prv_out = bot_in[MLO_LRN_PRE_PAD];
-        MLO_READ_TYPE out_val = prv_out * exp_scale;
-
-#if MLO_LOW_CHNL_COUNT == 1
-        if(c_o < MLO_LRN_N_OUTPUTS)
-#endif
-        {
-
-#if MLO_C1x1_PIXLEFT > 0
-
-            // if the last one
-            if(pix_id == MLO_MAP_SZ4 - 1)
-            {
-                for(int j = 0; j < MLO_C1x1_PIXLEFT; ++j)
-                {
-                    top[top_off + j] = ((_FLOAT*)&out_val)[j];
-#if DBG_OUT
-                    printf("K:o1: %d %f %f %f\n",
-                           top_off + j,
-                           top[top_off + j],
-                           ((_FLOAT*)&prv_out)[j],
-                           ((_FLOAT*)&exp_scale)[j]);
-#endif
-
-#if MLO_LRN_DO_SCALE
-                    scale[scale_off + j] = ((_FLOAT*)&prv_scale)[j];
-#endif
-                }
-            }
-            else
-#endif
-            {
-
-                *((__global MLO_READ_TYPE*)&top[top_off]) = out_val;
-#if MLO_LRN_DO_SCALE
-                *((__global MLO_READ_TYPE*)&scale[scale_off]) = prv_scale;
-#endif
-            }
-        }
-    }
-
-    for(; c_i < MLO_LRN_N_INPUTS + MLO_LRN_PAD; c_i++, c_o++)
-    {
-
-        accum = accum - bot_in2[0];
-        //				fma(-bot_in2[0], bot_in2[0], accum);
-
-        for(int i = 0; i < MLO_LRN_KERNEL_SZ - 1; i++)
-        {
-            bot_in2[i] = bot_in2[i + 1];
-            bot_in[i]  = bot_in[i + 1];
-        }
-
-        top_off = b * MLO_LRN_TOP_BATCH_STRIDE + c_o * MLO_LRN_TOP_CHANNEL_STRIDE +
-                  (pix_id * MLO_READ_UNIT);
-#if MLO_LRN_DO_SCALE
-        scale_off = b * MLO_LRN_SCALE_BATCH_STRIDE + c_o * MLO_LRN_SCALE_CHANNEL_STRIDE +
-                    (pix_id * MLO_READ_UNIT);
-#endif
-        MLO_READ_TYPE prv_scale = ((MLO_READ_TYPE)K + accum * (MLO_READ_TYPE)alphaoverarea);
-        //				fma(accum,alphaoverarea, (_FLOAT)1.f);
-
-        MLO_READ_TYPE exp_scale = exp((MLO_READ_TYPE)-beta * log(prv_scale));
-        //				pow(prv_scale,-beta);
-        // bug
-        //			MLO_READ_TYPE prv_out = sqrt(bot_in2[MLO_LRN_PRE_PAD]);
-        MLO_READ_TYPE prv_out = bot_in[MLO_LRN_PRE_PAD];
-
-        MLO_READ_TYPE out_val = prv_out * exp_scale;
-#if MLO_LOW_CHNL_COUNT == 1
-        if(c_o < MLO_LRN_N_OUTPUTS)
-#endif
-        {
-
-#if MLO_C1x1_PIXLEFT > 0
-
-            // if the last one
-            if(pix_id == MLO_MAP_SZ4 - 1)
-            {
-                for(int j = 0; j < MLO_C1x1_PIXLEFT; ++j)
-                {
-                    top[top_off + j] = ((_FLOAT*)&out_val)[j];
-#if DBG_OUT
-                    printf("K:o2: %d %f %f %f\n",
-                           top_off + j,
-                           top[top_off + j],
-                           ((_FLOAT*)&prv_out)[j],
-                           ((_FLOAT*)&exp_scale)[j]);
-#endif
-
-#if MLO_LRN_DO_SCALE
-                    scale[scale_off + j] = ((_FLOAT*)&prv_scale)[j];
-#endif
-                }
-            }
-            else
-#endif
-            {
-
-                *((__global MLO_READ_TYPE*)&top[top_off]) = out_val;
-#if MLO_LRN_DO_SCALE
-                *((__global MLO_READ_TYPE*)&scale[scale_off]) = prv_scale;
-#endif
-            }
-        }
-    }
-}
diff --git a/projects/miopen/src/kernels/MIOpenLRNFwd.cpp b/projects/miopen/src/kernels/MIOpenLRNFwd.cpp
new file mode 100644
index 000000000000..084745d7a76d
--- /dev/null
+++ b/projects/miopen/src/kernels/MIOpenLRNFwd.cpp
@@ -0,0 +1,545 @@
+// Copyright © Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier:  MIT
+
+#ifndef MIOPEN_HIP_RUNTIME_COMPILE
+#include <hip/hip_fp16.h>
+#include <hip/hip_runtime.h>
+#endif
+
+#include "miopen_math.hpp"
+#include "hip_math_ops.hpp"
+
+using ReadType = typename miopen::mapped_vector_type<FLOAT, READ_UNIT>::type;
+
+constexpr unsigned group_size_z = 1u;
+constexpr unsigned left_pad0    = ((PRE_PAD0 + READ_UNIT - 1) / READ_UNIT) * READ_UNIT;
+constexpr unsigned right_side =
+    ((GROUP_SIZE_X * HORIZ_OUT_PIX + PAD0 + READ_UNIT - 1) / READ_UNIT) * READ_UNIT;
+constexpr unsigned data_width        = left_pad0 + right_side;
+constexpr unsigned local_read4       = data_width / READ_UNIT;
+constexpr unsigned local_data_height = GROUP_SIZE_Y * VERT_OUT_PIX + KERNEL_SIZE - 1;
+constexpr bool low_channel_count     = N_INPUTS < KERNEL_SIZE;
+
+__launch_bounds__(GROUP_SIZE_X* GROUP_SIZE_Y* group_size_z) extern "C" __global__
+    void MIOpenLRNWithinChannel_PS(const FLOAT* bot,
+                                   FLOAT* top,
+#if DO_SCALE
+                                   FLOAT* scale,
+#endif
+                                   FLOAT alphaoverarea,
+                                   FLOAT beta,
+                                   FLOAT K)
+{
+    // Taken from POOLING AVE with stride = 1
+    __shared__ FLOAT bot_data[data_width * local_data_height];
+    const int x       = blockIdx.x * GROUP_SIZE_X * HORIZ_OUT_PIX;
+    const int y       = blockIdx.y * GROUP_SIZE_Y * VERT_OUT_PIX;
+    const int lcl_id0 = threadIdx.x;
+    const int lcl_id1 = threadIdx.y;
+    const int ob      = blockIdx.z * group_size_z + threadIdx.z; // output * batch_sz
+    const int o       = iDiv(ob, BATCH_SIZE);
+    const int b       = iMod(ob, o, BATCH_SIZE);
+    const int bot_x   = x;
+    const int bot_y   = y;
+    const int bot_off = b * BOT_BATCH_STRIDE + o * BOT_CHANNEL_STRIDE;
+
+    // load tile
+    for(int b_j = lcl_id1; b_j < local_data_height; b_j += GROUP_SIZE_Y)
+    {
+        const int bot_y_act   = bot_y + b_j - PRE_PAD1;
+        const bool invisibleY = (bot_y_act < 0) || (bot_y_act >= BOT_HEIGHT);
+        const int bot_y_off   = bot_y_act * BOT_STRIDE;
+        const int lcl_off_v   = __mul24(b_j, (int)data_width);
+
+        for(int b_i = lcl_id0; b_i < local_read4; b_i += GROUP_SIZE_X)
+        {
+            const int bot_x_act = bot_x + (b_i * READ_UNIT) - left_pad0;
+            for(int i = 0; i < READ_UNIT; ++i)
+            {
+                int bot_off_x         = bot_off + bot_y_off + bot_x_act + i;
+                const bool invisibleX = (bot_x_act + i < 0) || (bot_x_act + i >= BOT_WIDTH);
+                bot_off_x             = (invisibleX || invisibleY) ? 0 : bot_off_x;
+
+                FLOAT bot_val = bot[bot_off_x];
+                bot_val       = (invisibleX || invisibleY) ? 0 : bot_val;
+
+                bot_data[lcl_off_v + (b_i * READ_UNIT) + i] = bot_val;
+            }
+        }
+    }
+
+    __syncthreads();
+#if HORIZ_OUT_PIX > 1
+    FLOAT partial_sum_x[HORIZ_OUT_PIX - 1]; // horizontal partial sum
+#endif
+#if VERT_OUT_PIX > 1
+    FLOAT partial_sum_xy[VERT_OUT_PIX - 1][HORIZ_OUT_PIX]; // horizontal-vertical partial sums.
+#endif
+    FLOAT accum[VERT_OUT_PIX][HORIZ_OUT_PIX]; // accumulator
+
+    const int top_y = __mul24(lcl_id1, VERT_OUT_PIX) + y;
+    const int top_x = __mul24(lcl_id0, HORIZ_OUT_PIX) + x;
+
+    const int lcl_y   = __mul24(lcl_id1, VERT_OUT_PIX);
+    const int lcl_x   = __mul24(lcl_id0, HORIZ_OUT_PIX) + (left_pad0 - PRE_PAD0);
+    const int lcl_off = __mul24(lcl_y, data_width) + lcl_x;
+
+    for(int j = 0; j < VERT_OUT_PIX; ++j)
+    {
+        for(int i = 0; i < HORIZ_OUT_PIX; ++i)
+        {
+            accum[j][i] = 0;
+        }
+    }
+
+#if VERT_OUT_PIX > 1
+    for(int j = 0; j < VERT_OUT_PIX - 1; ++j)
+    {
+        for(int i = 0; i < HORIZ_OUT_PIX; ++i)
+        {
+            partial_sum_xy[j][i] = 0;
+        }
+    }
+#endif
+
+    // running window summation
+    int jj = 0;
+    int ii = 0;
+
+    // first to get vertical partial sums
+#if VERT_OUT_PIX > 1
+    for(; jj < (VERT_OUT_PIX - 1); ++jj)
+    {
+        for(ii = 0; ii < (HORIZ_OUT_PIX - 1); ++ii)
+        {
+            const FLOAT bot_val   = bot_data[lcl_off + jj * data_width + ii];
+            const FLOAT accum_tmp = bot_val * bot_val;
+
+#if HORIZ_OUT_PIX > 1
+            // save horizontal partial sums
+            partial_sum_x[ii] = accum_tmp;
+#endif
+            // accumulate in vert-horizontal(0)
+            partial_sum_xy[jj][0] += accum_tmp;
+        }
+
+        for(; ii < KERNEL_SIZE0; ++ii)
+        {
+            const FLOAT bot_val   = bot_data[lcl_off + jj * data_width + ii];
+            const FLOAT accum_tmp = bot_val * bot_val;
+            // accumulate in vert horizontal(0)
+            partial_sum_xy[jj][0] += accum_tmp;
+        }
+
+        // running horizontal window
+        for(; ii < (KERNEL_SIZE0 + HORIZ_OUT_PIX - 1); ++ii)
+        {
+            const FLOAT bot_val   = bot_data[lcl_off + jj * data_width + ii];
+            const FLOAT accum_tmp = bot_val * bot_val;
+            // calculate all vertical-horizontal partial sums
+            partial_sum_xy[jj][ii - KERNEL_SIZE0 + 1] =
+                partial_sum_xy[jj][ii - KERNEL_SIZE0] + (accum_tmp
+#if HORIZ_OUT_PIX > 1
+                                                         - partial_sum_x[ii - KERNEL_SIZE0]
+#endif
+                                                        );
+        }
+
+        // put into accumulator[0][i]
+        // whatever has been accumulated so far
+        for(int i = 0; i < HORIZ_OUT_PIX; ++i)
+        {
+            accum[0][i] += partial_sum_xy[jj][i];
+        }
+    }
+#endif
+
+    FLOAT mov_accum;
+    // calculate row 0 accumulators
+    for(; jj < KERNEL_SIZE1; ++jj)
+    {
+        mov_accum = 0;
+
+        for(ii = 0; ii < (HORIZ_OUT_PIX - 1); ++ii)
+        {
+            const FLOAT bot_val   = bot_data[lcl_off + jj * data_width + ii];
+            const FLOAT accum_tmp = bot_val * bot_val;
+#if HORIZ_OUT_PIX > 1
+            partial_sum_x[ii] = accum_tmp;
+#endif
+            mov_accum += accum_tmp;
+        }
+
+        for(; ii < KERNEL_SIZE0; ++ii)
+        {
+            const FLOAT bot_val   = bot_data[lcl_off + jj * data_width + ii];
+            const FLOAT accum_tmp = bot_val * bot_val;
+            mov_accum += accum_tmp;
+        }
+
+        accum[0][0] += mov_accum;
+        // running horizontal window
+        for(; ii < (KERNEL_SIZE0 + HORIZ_OUT_PIX - 1); ++ii)
+        {
+            const FLOAT bot_val   = bot_data[lcl_off + jj * data_width + ii];
+            const FLOAT accum_tmp = bot_val * bot_val;
+            // running horizontal window
+            mov_accum += (accum_tmp
+#if HORIZ_OUT_PIX > 1
+                          - partial_sum_x[ii - KERNEL_SIZE0]
+#endif
+            );
+            accum[0][ii - KERNEL_SIZE0 + 1] += mov_accum;
+        }
+    }
+
+    // accumulate all other rows besides 0
+    for(; jj < (KERNEL_SIZE1 + VERT_OUT_PIX - 1); ++jj)
+    {
+        // first running horizontal winodw as before
+        mov_accum = 0;
+        for(ii = 0; ii < (HORIZ_OUT_PIX - 1); ++ii)
+        {
+            const FLOAT bot_val   = bot_data[lcl_off + jj * data_width + ii];
+            const FLOAT accum_tmp = bot_val * bot_val;
+#if HORIZ_OUT_PIX > 1
+            partial_sum_x[ii] = accum_tmp;
+#endif
+            accum[jj - KERNEL_SIZE1 + 1][0] += accum_tmp;
+        }
+        for(; ii < KERNEL_SIZE0; ++ii)
+        {
+            const FLOAT bot_val   = bot_data[lcl_off + jj * data_width + ii];
+            const FLOAT accum_tmp = bot_val * bot_val;
+            accum[jj - KERNEL_SIZE1 + 1][0] += accum_tmp;
+        }
+        // running horizontal window
+        int ii1 = ii;
+        for(; ii < (KERNEL_SIZE0 + HORIZ_OUT_PIX - 1); ++ii)
+        {
+            const FLOAT bot_val   = bot_data[lcl_off + jj * data_width + ii];
+            const FLOAT accum_tmp = bot_val * bot_val;
+
+            accum[jj - KERNEL_SIZE1 + 1][ii - KERNEL_SIZE0 + 1] =
+                accum[jj - KERNEL_SIZE1 + 1][ii - KERNEL_SIZE0] + accum_tmp;
+#if HORIZ_OUT_PIX > 1
+            accum[jj - KERNEL_SIZE1 + 1][ii - KERNEL_SIZE0 + 1] -= partial_sum_x[ii - KERNEL_SIZE0];
+#endif
+        }
+
+        // finally running vertical window
+        for(ii = ii1; ii < (KERNEL_SIZE0 + HORIZ_OUT_PIX - 1); ++ii)
+        {
+            // finish horizontal summation
+            // add/substarct vertical patial sum
+            accum[jj - KERNEL_SIZE1 + 1][ii - KERNEL_SIZE0 + 1] +=
+                accum[jj - KERNEL_SIZE1][ii - KERNEL_SIZE0 + 1];
+#if VERT_OUT_PIX > 1
+            accum[jj - KERNEL_SIZE1 + 1][ii - KERNEL_SIZE0 + 1] -=
+                partial_sum_xy[jj - KERNEL_SIZE1][ii - KERNEL_SIZE0 + 1];
+#endif
+        }
+#if VERT_OUT_PIX > 1
+        accum[jj - KERNEL_SIZE1 + 1][0] -= partial_sum_xy[jj - KERNEL_SIZE1][0];
+#endif
+        accum[jj - KERNEL_SIZE1 + 1][0] += accum[jj - KERNEL_SIZE1][0];
+    }
+
+    // normalization
+    FLOAT prv_scale[VERT_OUT_PIX][HORIZ_OUT_PIX];
+    const FLOAT adj_alphaoverarea = alphaoverarea;
+    for(int k = 0; k < VERT_OUT_PIX; k++)
+    {
+        for(int l = 0; l < HORIZ_OUT_PIX; l++)
+        {
+            prv_scale[k][l] = K + accum[k][l] * adj_alphaoverarea;
+        }
+    }
+
+    const int top_off = b * TOP_BATCH_STRIDE + o * TOP_CHANNEL_STRIDE + top_y * TOP_STRIDE + top_x;
+#if DO_SCALE
+    int scale_off =
+        b * SCALE_BATCH_STRIDE + o * SCALE_CHANNEL_STRIDE + top_y * SCALE_STRIDE + top_x;
+#endif
+
+    /*
+       The HIP compiler doesn't automatically unroll this nested loop so we need to
+       use pragma unroll to encourage that for better performance. Additionally, when access is not
+       aligned in the horizontal or vertical access we lift the if condition out of the loop
+       termination if the height/width is a multiple of the vert/horiz pixels, enabling SIMD
+       vectorizatiom of the loop body.
+       These optimizations were being done automatically by the OpenCL compiler prior to porting to
+       HIP.
+    */
+    constexpr bool is_vert_aligned        = (VERT_ALIGNED == 1);
+    constexpr bool is_horiz_aligned       = (HORIZ_ALIGNED == 1);
+    constexpr bool sink_vert_align_check  = !is_vert_aligned && ((TOP_HEIGHT % VERT_OUT_PIX) != 0);
+    constexpr bool sink_horiz_align_check = !is_horiz_aligned && ((TOP_WIDTH % HORIZ_OUT_PIX) != 0);
+    constexpr bool lift_vert_align_check  = !is_vert_aligned && ((TOP_HEIGHT % VERT_OUT_PIX) == 0);
+    constexpr bool lift_horiz_align_check = !is_horiz_aligned && ((TOP_WIDTH % HORIZ_OUT_PIX) == 0);
+
+    // final output
+    if(!lift_vert_align_check || (lift_vert_align_check && (top_y < TOP_HEIGHT)))
+    {
+#pragma unroll VERT_OUT_PIX
+        for(int k = 0; k < VERT_OUT_PIX; k++)
+        {
+            if(sink_vert_align_check && !(top_y + k < TOP_HEIGHT))
+            {
+                break;
+            }
+
+            if(!lift_horiz_align_check || (lift_horiz_align_check && (top_x < TOP_WIDTH)))
+            {
+#pragma unroll HORIZ_OUT_PIX
+                for(int l = 0; l < HORIZ_OUT_PIX; l++)
+                {
+                    if(sink_horiz_align_check && !(top_x + l < TOP_WIDTH))
+                    {
+                        break;
+                    }
+                    const FLOAT s =
+                        miopen::detail::exp(FLOAT(-beta) * miopen::detail::log(prv_scale[k][l]));
+                    int offset    = __mul24((k + PRE_PAD1), data_width) + (l + PRE_PAD0);
+                    FLOAT bot_val = bot_data[lcl_off + offset];
+#if DO_SCALE
+                    scale[scale_off + k * SCALE_STRIDE + l] = prv_scale[k][l];
+#endif
+                    top[top_off + k * TOP_STRIDE + l] = bot_val * s;
+                }
+            }
+        }
+    }
+}
+
+__launch_bounds__(GROUP_SIZE_X* GROUP_SIZE_Y* group_size_z) extern "C" __global__
+    void MIOpenLRNAcrossChannels4(const FLOAT* bottom,
+                                  FLOAT* top,
+#if DO_SCALE
+                                  FLOAT* scale,
+#endif
+                                  FLOAT alphaoverarea,
+                                  FLOAT beta,
+                                  FLOAT K)
+{
+    const int pix_id = blockIdx.x * GROUP_SIZE_X + threadIdx.x;
+    const int batch  = blockIdx.z * group_size_z + threadIdx.z;
+    ReadType accum(0);
+    ReadType bot_in2[KERNEL_SIZE];
+    ReadType bot_in[KERNEL_SIZE];
+    for(int i = 0; i < KERNEL_SIZE; ++i)
+    {
+        bot_in2[i] = ReadType(0);
+        bot_in[i]  = ReadType(0);
+    }
+
+    int top_off = 0;
+#if DO_SCALE
+    int scale_off;
+#endif
+
+    int c_i = 0, c_o = 0; // accumulated throughout the kernel
+    for(; c_i < PAD; c_i++)
+    {
+        ReadType prv_in(0);
+        if(!low_channel_count || c_i < N_INPUTS)
+        {
+            const auto offset =
+                BOT_BATCH_STRIDE * batch + BOT_CHANNEL_STRIDE * c_i + (pix_id * READ_UNIT);
+            // if the last one
+            if(C1x1_PIXLEFT > 0 && (pix_id == MAP_SZ_4 - 1))
+            {
+                for(int j = 0; j < C1x1_PIXLEFT; ++j)
+                {
+                    FLOAT* prv_in_as_scalar = reinterpret_cast<FLOAT*>(&prv_in);
+                    prv_in_as_scalar[j]     = bottom[offset + j];
+                }
+            }
+            else
+            {
+                const FLOAT* bottom_offset = bottom + offset;
+                prv_in                     = *reinterpret_cast<const ReadType*>(bottom_offset);
+            }
+        }
+
+        bot_in2[c_i] = prv_in * prv_in;
+        bot_in[c_i]  = prv_in;
+        accum        = accum + bot_in2[c_i];
+    }
+
+    for(; c_i < KERNEL_SIZE; c_i++, c_o++)
+    {
+        ReadType prv_in(0);
+        if(!low_channel_count || c_i < N_INPUTS)
+        {
+            const auto offset =
+                BOT_BATCH_STRIDE * batch + BOT_CHANNEL_STRIDE * c_i + (pix_id * READ_UNIT);
+            if(C1x1_PIXLEFT > 0 && (pix_id == MAP_SZ_4 - 1))
+            {
+                for(int j = 0; j < C1x1_PIXLEFT; ++j)
+                {
+                    FLOAT* prv_in_as_scalar = reinterpret_cast<FLOAT*>(&prv_in);
+                    prv_in_as_scalar[j]     = bottom[offset + j];
+                }
+            }
+            else
+            {
+                const FLOAT* bottom_offset = bottom + offset;
+                prv_in                     = *reinterpret_cast<const ReadType*>(bottom_offset);
+            }
+        }
+
+        bot_in2[c_i] = prv_in * prv_in;
+        bot_in[c_i]  = prv_in;
+        accum        = accum + bot_in2[c_i];
+
+        top_off = batch * TOP_BATCH_STRIDE + c_o * TOP_CHANNEL_STRIDE + (pix_id * READ_UNIT);
+#if DO_SCALE
+        scale_off = batch * SCALE_BATCH_STRIDE + c_o * SCALE_CHANNEL_STRIDE + (pix_id * READ_UNIT);
+#endif
+        const ReadType prv_scale     = (ReadType(K) + accum * ReadType(alphaoverarea));
+        const ReadType prv_scale_log = miopen::log<ReadType>(prv_scale);
+        const ReadType exp_scale     = miopen::exp<ReadType>(ReadType(-beta) * prv_scale_log);
+        const ReadType prv_out       = bot_in[c_o];
+        const ReadType out_val       = prv_out * exp_scale;
+        if(!low_channel_count || c_o < N_INPUTS)
+        {
+            // if the last one
+            if(C1x1_PIXLEFT > 0 && (pix_id == MAP_SZ_4 - 1))
+            {
+                for(int j = 0; j < C1x1_PIXLEFT; ++j)
+                {
+                    const FLOAT* out_val_as_scalar = reinterpret_cast<const FLOAT*>(&out_val);
+                    top[top_off + j]               = out_val_as_scalar[j];
+#if DO_SCALE
+                    const FLOAT* prv_scale_as_scalar = reinterpret_cast<const FLOAT*>(&prv_scale);
+                    scale[scale_off + j]             = prv_scale_as_scalar[j];
+#endif
+                }
+            }
+            else
+            {
+                FLOAT* top_offset                        = top + top_off;
+                *reinterpret_cast<ReadType*>(top_offset) = out_val;
+
+#if DO_SCALE
+                FLOAT* scale_offset                        = scale + top_off;
+                *reinterpret_cast<ReadType*>(scale_offset) = prv_scale;
+#endif
+            }
+        }
+    }
+
+    for(; c_i < N_INPUTS; c_i++, c_o++)
+    {
+        ReadType prv_in(0);
+        auto offset = BOT_BATCH_STRIDE * batch + BOT_CHANNEL_STRIDE * c_i + (pix_id * READ_UNIT);
+        // if the last one
+        if(C1x1_PIXLEFT > 0 && (pix_id == MAP_SZ_4 - 1))
+        {
+            for(int j = 0; j < C1x1_PIXLEFT; ++j)
+            {
+                FLOAT* prv_in_as_scalar = reinterpret_cast<FLOAT*>(&prv_in);
+                prv_in_as_scalar[j]     = bottom[offset + j];
+            }
+        }
+        else
+        {
+            const FLOAT* bottom_offset = bottom + offset;
+            prv_in                     = *reinterpret_cast<const ReadType*>(bottom_offset);
+        }
+
+        const ReadType prv_bot_in2 = prv_in * prv_in;
+        accum                      = accum + prv_bot_in2;
+        accum                      = accum - bot_in2[0];
+        for(int i = 0; i < KERNEL_SIZE - 1; i++)
+        {
+            bot_in2[i] = bot_in2[i + 1];
+            bot_in[i]  = bot_in[i + 1];
+        }
+
+        bot_in2[KERNEL_SIZE - 1] = prv_bot_in2;
+        bot_in[KERNEL_SIZE - 1]  = prv_in;
+
+        top_off = batch * TOP_BATCH_STRIDE + c_o * TOP_CHANNEL_STRIDE + (pix_id * READ_UNIT);
+#if DO_SCALE
+        scale_off = batch * SCALE_BATCH_STRIDE + c_o * SCALE_CHANNEL_STRIDE + (pix_id * READ_UNIT);
+#endif
+        const ReadType prv_scale     = (ReadType(K) + accum * ReadType(alphaoverarea));
+        const ReadType prv_scale_log = miopen::log<ReadType>(prv_scale);
+        const ReadType exp_scale     = miopen::exp<ReadType>(ReadType(-beta) * prv_scale_log);
+        const ReadType prv_out       = bot_in[PRE_PAD];
+        const ReadType out_val       = prv_out * exp_scale;
+        if(!low_channel_count || c_o < N_INPUTS)
+        {
+            // if the last one
+            if(C1x1_PIXLEFT > 0 && (pix_id == MAP_SZ_4 - 1))
+            {
+                for(int j = 0; j < C1x1_PIXLEFT; ++j)
+                {
+                    const FLOAT* out_val_as_scalar = reinterpret_cast<const FLOAT*>(&out_val);
+                    top[top_off + j]               = out_val_as_scalar[j];
+#if DO_SCALE
+                    const FLOAT* prv_scale_as_scalar = reinterpret_cast<const FLOAT*>(&prv_scale);
+                    scale[scale_off + j]             = prv_scale_as_scalar[j];
+#endif
+                }
+            }
+            else
+            {
+                FLOAT* top_offset                        = top + top_off;
+                *reinterpret_cast<ReadType*>(top_offset) = out_val;
+
+#if DO_SCALE
+                FLOAT* scale_offset                        = scale + top_off;
+                *reinterpret_cast<ReadType*>(scale_offset) = prv_scale;
+#endif
+            }
+        }
+    }
+
+    for(; c_i < N_INPUTS + PAD; c_i++, c_o++)
+    {
+        accum = accum - bot_in2[0];
+        for(int i = 0; i < KERNEL_SIZE - 1; i++)
+        {
+            bot_in2[i] = bot_in2[i + 1];
+            bot_in[i]  = bot_in[i + 1];
+        }
+
+        top_off = batch * TOP_BATCH_STRIDE + c_o * TOP_CHANNEL_STRIDE + (pix_id * READ_UNIT);
+#if DO_SCALE
+        scale_off = batch * SCALE_BATCH_STRIDE + c_o * SCALE_CHANNEL_STRIDE + (pix_id * READ_UNIT);
+#endif
+        const ReadType prv_scale     = (ReadType(K) + accum * ReadType(alphaoverarea));
+        const ReadType prv_scale_log = miopen::log<ReadType>(prv_scale);
+        const ReadType exp_scale     = miopen::exp<ReadType>(ReadType(-beta) * prv_scale_log);
+        const ReadType prv_out       = bot_in[PRE_PAD];
+        const ReadType out_val       = prv_out * exp_scale;
+        if(!low_channel_count || c_o < N_INPUTS)
+        {
+            // if the last one
+            if(C1x1_PIXLEFT > 0 && (pix_id == MAP_SZ_4 - 1))
+            {
+                for(int j = 0; j < C1x1_PIXLEFT; ++j)
+                {
+                    const FLOAT* out_val_as_scalar = reinterpret_cast<const FLOAT*>(&out_val);
+                    top[top_off + j]               = out_val_as_scalar[j];
+#if DO_SCALE
+                    const FLOAT* prv_scale_as_scalar = reinterpret_cast<const FLOAT*>(&prv_scale);
+                    scale[scale_off + j]             = prv_scale_as_scalar[j];
+#endif
+                }
+            }
+            else
+            {
+                FLOAT* top_offset                        = top + top_off;
+                *reinterpret_cast<ReadType*>(top_offset) = out_val;
+#if DO_SCALE
+                FLOAT* scale_offset                        = scale + top_off;
+                *reinterpret_cast<ReadType*>(scale_offset) = prv_scale;
+#endif
+            }
+        }
+    }
+}
diff --git a/projects/miopen/src/kernels/hip_math_ops.hpp b/projects/miopen/src/kernels/hip_math_ops.hpp
index b9864cda2d7d..7e51f7b59221 100644
--- a/projects/miopen/src/kernels/hip_math_ops.hpp
+++ b/projects/miopen/src/kernels/hip_math_ops.hpp
@@ -12,3 +12,8 @@ inline __device__ unsigned int iRemquo(unsigned int x, unsigned int y, unsigned
     k = x / y;
     return x - __mul24(k, y);
 }
+
+// Responsibility of caller to ensure that `u`, `d`, and `u*d` don't exceed the 24-bit range.
+inline __device__ unsigned iMod(unsigned v, unsigned u, unsigned d) { return v - __mul24(u, d); }
+
+inline __device__ unsigned iDiv(unsigned v, unsigned d) { return v / d; }
\ No newline at end of file
diff --git a/projects/miopen/src/ocl/lrn_ocl.cpp b/projects/miopen/src/ocl/lrn_ocl.cpp
index 07fad30e3ce5..7fbcc7ea8aa3 100644
--- a/projects/miopen/src/ocl/lrn_ocl.cpp
+++ b/projects/miopen/src/ocl/lrn_ocl.cpp
@@ -126,7 +126,6 @@ miopenStatus_t LRNDescriptor::Forward(const Handle& handle,
                                 y,
                                 workSpace,
                                 as_float(f_norm_alphaoverarea),
-                                as_float(f_norm_alpha),
                                 as_float(f_norm_beta),
                                 as_float(f_norm_K));
             }
@@ -135,7 +134,6 @@ miopenStatus_t LRNDescriptor::Forward(const Handle& handle,
                 kernels.front()(x,
                                 y,
                                 as_float(f_norm_alphaoverarea),
-                                as_float(f_norm_alpha),
                                 as_float(f_norm_beta),
                                 as_float(f_norm_K));
             }
@@ -143,7 +141,7 @@ miopenStatus_t LRNDescriptor::Forward(const Handle& handle,
     }
     else
     {
-        const std::string program_name = construct_params.getKernelFile(); // CL kernel filename
+        const std::string program_name = construct_params.getKernelFile(); // kernel filename
         const std::string kernel_name  = construct_params.getKernelName(); // kernel name
         const std::string& compiler_parms =
             construct_params.getCompilerOptions(); // kernel parameters
@@ -159,7 +157,6 @@ miopenStatus_t LRNDescriptor::Forward(const Handle& handle,
                     y,
                     workSpace,
                     as_float(f_norm_alphaoverarea),
-                    as_float(f_norm_alpha),
                     as_float(f_norm_beta),
                     as_float(f_norm_K));
             }
@@ -168,7 +165,6 @@ miopenStatus_t LRNDescriptor::Forward(const Handle& handle,
                 obj(x,
                     y,
                     as_float(f_norm_alphaoverarea),
-                    as_float(f_norm_alpha),
                     as_float(f_norm_beta),
                     as_float(f_norm_K));
             }
diff --git a/projects/miopen/src/ocl/mloNorm.cpp b/projects/miopen/src/ocl/mloNorm.cpp
index fc58e1b4ccc7..946a2c04357d 100644
--- a/projects/miopen/src/ocl/mloNorm.cpp
+++ b/projects/miopen/src/ocl/mloNorm.cpp
@@ -49,10 +49,8 @@ inline bool is_tensor_packed(int c, int h, int w, int b_str, int c_str, int h_st
     return h_str == w && c_str == h * h_str && b_str == c * c_str;
 }
 
-int mlo_construct_norm::mloConstructFwd()
+void mlo_construct_norm::mloConstructFwd()
 {
-    int ret = 0;
-
     size_t maxComputeUnits = _ctx.GetStream().GetMaxComputeUnits();
 
     _hw_wave_sz = 64;
@@ -63,14 +61,6 @@ int mlo_construct_norm::mloConstructFwd()
     if(pre_pad < 0 || pad < 0)
         MIOPEN_THROW("Wrong LRN kernel size");
 
-    int top_df_stride         = 1;
-    int top_df_channel_stride = 1;
-    int top_df_batch_stride   = 1;
-
-    int bot_df_stride         = 1;
-    int bot_df_channel_stride = 1;
-    int bot_df_batch_stride   = 1;
-
     _grp_tile0     = (_problem.GetOutWidth() <= 16) ? 8 : 16;
     _grp_tile1     = 8;
     _out_pix_tile0 = 1;
@@ -83,14 +73,14 @@ int mlo_construct_norm::mloConstructFwd()
                                          _problem.GetInChannelStride(),
                                          _problem.GetInStride());
 
-    int MAP_SZ4 = _problem.GetInWidth() * (is_in_packed ? _problem.GetInHeight() : 1);
+    int map_size_4 = _problem.GetInWidth() * (is_in_packed ? _problem.GetInHeight() : 1);
     int read_unit;
     if(_norm_region == MLO_LRN_ACROSS_CHANNELS)
     {
         _grp_tile0 = (_problem.GetOutWidth() <= 8) ? 8 : 16;
         _grp_tile1 = (_problem.GetOutHeight() <= 8) ? 8 : 16;
-        read_unit  = (MAP_SZ4 % 4 == 0) ? 4 : (MAP_SZ4 % 2 == 0) ? 2 : 1;
-        MAP_SZ4 /= read_unit;
+        read_unit  = (map_size_4 % 4 == 0) ? 4 : (map_size_4 % 2 == 0) ? 2 : 1;
+        map_size_4 /= read_unit;
     }
     else
     {
@@ -98,31 +88,26 @@ int mlo_construct_norm::mloConstructFwd()
         _out_pix_tile0 = (_problem.GetOutWidth() <= 8) ? 1 : 2;
         _out_pix_tile1 = (_problem.GetOutHeight() <= 8) ? 1 : 2;
         read_unit      = 4;
-        MAP_SZ4        = (MAP_SZ4 + 3) / 4;
+        map_size_4     = (map_size_4 + 3) / 4;
     }
-    MAP_SZ4 *= (is_in_packed ? 1 : _problem.GetInHeight());
+    map_size_4 *= (is_in_packed ? 1 : _problem.GetInHeight());
 
     assert(_out_pix_tile0 - 1 <= _norm_area && _out_pix_tile1 - 1 <= _norm_area);
 
-    auto ocl_group_lg2sz0 =
-        static_cast<int>(ceil(log(static_cast<double>(_out_pix_tile0)) / std::numbers::ln2));
-    auto ocl_group_lg2sz1 =
-        static_cast<int>(ceil(log(static_cast<double>(_out_pix_tile1)) / std::numbers::ln2));
-
-    _kernel_file = "MIOpenLRNFwd.cl";
+    _kernel_file = "MIOpenLRNFwd.cpp";
     _kernel_name = (_norm_region == MLO_LRN_ACROSS_CHANNELS) ? "MIOpenLRNAcrossChannels4"
                                                              : "MIOpenLRNWithinChannel_PS";
     if(_norm_region == MLO_LRN_ACROSS_CHANNELS)
     {
         _grp_tile0  = 8 * 8;
         _grp_tile1  = 1;
-        int n_waves = (_problem.GetBatchSize() * MAP_SZ4 + _hw_wave_sz - 1) / _hw_wave_sz;
+        int n_waves = (_problem.GetBatchSize() * map_size_4 + _hw_wave_sz - 1) / _hw_wave_sz;
         if(n_waves <= maxComputeUnits * 8)
         {
-            MAP_SZ4   = _problem.GetInWidth() * (is_in_packed ? _problem.GetInHeight() : 1);
-            read_unit = (MAP_SZ4 % 2 == 0) ? 2 : 1;
-            MAP_SZ4 /= read_unit;
-            MAP_SZ4 *= (is_in_packed ? 1 : _problem.GetInHeight());
+            map_size_4 = _problem.GetInWidth() * (is_in_packed ? _problem.GetInHeight() : 1);
+            read_unit  = (map_size_4 % 2 == 0) ? 2 : 1;
+            map_size_4 /= read_unit;
+            map_size_4 *= (is_in_packed ? 1 : _problem.GetInHeight());
         }
     }
 
@@ -135,8 +120,8 @@ int mlo_construct_norm::mloConstructFwd()
         {
             MIOPEN_LOG_I("Workaround for #1057: "
                          << name << ',' << miopen::GetDataTypeName(_problem.GetInDataType()) << ','
-                         << MAP_SZ4 << ',' << read_unit);
-            MAP_SZ4 *= read_unit;
+                         << map_size_4 << ',' << read_unit);
+            map_size_4 *= read_unit;
             read_unit = 1;
         }
     }
@@ -151,96 +136,65 @@ int mlo_construct_norm::mloConstructFwd()
     auto g_wk_height =
         static_cast<int>((_problem.GetOutHeight() + _grp_tile1 * _out_pix_tile1 - 1) /
                          (_grp_tile1 * _out_pix_tile1));
-    int OUT_VERT_ALIGNED =
+    int out_vert_aligned =
         (g_wk_height * (_grp_tile1 * _out_pix_tile1) == _problem.GetOutHeight()) ? 1 : 0;
-    int OUT_HORIZ_ALIGNED =
+    int out_horiz_aligned =
         (g_wk_width * (_grp_tile0 * _out_pix_tile0) == _problem.GetOutWidth()) ? 1 : 0;
     // currently always 1
-    int DIVBY4 = (MAP_SZ4 * read_unit == _problem.GetInWidth() * _problem.GetInHeight()) ? 1 : 0;
-    int C1x1_PIXLEFT =
-        (DIVBY4 == 1) ? 0
-                      : _problem.GetInWidth() * _problem.GetInHeight() - (MAP_SZ4 - 1) * read_unit;
-
-    std::string READ_TYPE =
-        (read_unit == 1) ? "_FLOAT" : "_FLOAT" + std::to_string(static_cast<long long>(read_unit));
-
+    bool div_by_4 = (map_size_4 * read_unit == _problem.GetInWidth() * _problem.GetInHeight());
+    int c1x1_pixleft =
+        div_by_4 ? 0
+                 : _problem.GetInWidth() * _problem.GetInHeight() - (map_size_4 - 1) * read_unit;
     _comp_options =
-        std::string(" -DMLO_LRN_KERNEL_SZ=") + std::to_string(static_cast<long long>(_norm_area)) +
-        std::string(" -DMLO_LRN_PAD=") + std::to_string(static_cast<long long>(pad)) +
-        std::string(" -DMLO_LRN_KERNEL_SZ1=") + std::to_string(static_cast<long long>(_norm_area)) +
-        std::string(" -DMLO_LRN_PAD1=") + std::to_string(static_cast<long long>(pad)) +
-        std::string(" -DMLO_LRN_KERNEL_SZ0=") + std::to_string(static_cast<long long>(_norm_area)) +
-        std::string(" -DMLO_LRN_PAD0=") + std::to_string(static_cast<long long>(pad)) +
-        std::string(" -DMLO_LRN_PRE_PAD=") + std::to_string(static_cast<long long>(pre_pad)) +
-        std::string(" -DMLO_LRN_PRE_PAD1=") + std::to_string(static_cast<long long>(pre_pad)) +
-        std::string(" -DMLO_LRN_PRE_PAD0=") + std::to_string(static_cast<long long>(pre_pad)) +
-        std::string(" -DMLO_LRN_N_OUTPUTS=") +
+        std::string(" -DKERNEL_SIZE=") + std::to_string(static_cast<long long>(_norm_area)) +
+        std::string(" -DPAD=") + std::to_string(static_cast<long long>(pad)) +
+        std::string(" -DKERNEL_SIZE1=") + std::to_string(static_cast<long long>(_norm_area)) +
+        std::string(" -DPAD0=") + std::to_string(static_cast<long long>(pad)) +
+        std::string(" -DPRE_PAD=") + std::to_string(static_cast<long long>(pre_pad)) +
+        std::string(" -DPRE_PAD1=") + std::to_string(static_cast<long long>(pre_pad)) +
+        std::string(" -DKERNEL_SIZE0=") + std::to_string(static_cast<long long>(_norm_area)) +
+        std::string(" -DPRE_PAD0=") + std::to_string(static_cast<long long>(pre_pad)) +
+        std::string(" -DN_OUTPUTS=") +
         std::to_string(static_cast<long long>(_problem.GetOutChannels())) +
-        std::string(" -DMLO_LRN_N_INPUTS=") +
+        std::string(" -DN_INPUTS=") +
         std::to_string(static_cast<long long>(_problem.GetInChannels())) +
-        std::string(" -DMLO_LRN_N_HORIZ_OUT_PIX=") +
-        std::to_string(static_cast<long long>(_out_pix_tile0)) +
-        std::string(" -DMLO_LRN_N_VERT_OUT_PIX=") +
-        std::to_string(static_cast<long long>(_out_pix_tile1)) +
-        std::string(" -DMLO_LRN_GROUP_SZ0=") + std::to_string(static_cast<long long>(_grp_tile0)) +
-        std::string(" -DMLO_LRN_GROUP_SZ1=") + std::to_string(static_cast<long long>(_grp_tile1)) +
-        std::string(" -DMLO_LRN_GROUP_LG2SZ0=") +
-        std::to_string(static_cast<long long>(ocl_group_lg2sz0)) +
-        std::string(" -DMLO_LRN_GROUP_LG2SZ1=") +
-        std::to_string(static_cast<long long>(ocl_group_lg2sz1)) +
-        std::string(" -DMLO_LRN_BOT_BATCH_STRIDE=") +
+        std::string(" -DHORIZ_OUT_PIX=") + std::to_string(static_cast<long long>(_out_pix_tile0)) +
+        std::string(" -DVERT_OUT_PIX=") + std::to_string(static_cast<long long>(_out_pix_tile1)) +
+        std::string(" -DGROUP_SIZE_X=") + std::to_string(static_cast<long long>(_grp_tile0)) +
+        std::string(" -DGROUP_SIZE_Y=") + std::to_string(static_cast<long long>(_grp_tile1)) +
+        std::string(" -DBOT_BATCH_STRIDE=") +
         std::to_string(static_cast<long long>(_problem.GetInBatchStride())) +
-        std::string(" -DMLO_LRN_BOT_CHANNEL_STRIDE=") +
+        std::string(" -DBOT_CHANNEL_STRIDE=") +
         std::to_string(static_cast<long long>(_problem.GetInChannelStride())) +
-        std::string(" -DMLO_LRN_BOT_STRIDE=") +
+        std::string(" -DBOT_STRIDE=") +
         std::to_string(static_cast<long long>(_problem.GetInStride())) +
-        std::string(" -DMLO_LRN_TOP_BATCH_STRIDE=") +
+        std::string(" -DTOP_BATCH_STRIDE=") +
         std::to_string(static_cast<long long>(_problem.GetOutBatchStride())) +
-        std::string(" -DMLO_LRN_TOP_CHANNEL_STRIDE=") +
+        std::string(" -DTOP_CHANNEL_STRIDE=") +
         std::to_string(static_cast<long long>(_problem.GetOutChannelStride())) +
-        std::string(" -DMLO_LRN_TOP_STRIDE=") +
+        std::string(" -DTOP_STRIDE=") +
         std::to_string(static_cast<long long>(_problem.GetOutStride())) +
-        std::string(" -DMLO_LRN_BOT_WIDTH=") +
+        std::string(" -DBOT_WIDTH=") +
         std::to_string(static_cast<long long>(_problem.GetOutWidth())) +
-        std::string(" -DMLO_LRN_BOT_HEIGHT=") +
+        std::string(" -DBOT_HEIGHT=") +
         std::to_string(static_cast<long long>(_problem.GetOutHeight())) +
-        std::string(" -DMLO_LRN_TOP_WIDTH=") +
+        std::string(" -DTOP_WIDTH=") +
         std::to_string(static_cast<long long>(_problem.GetOutWidth())) +
-        std::string(" -DMLO_LRN_TOP_HEIGHT=") +
+        std::string(" -DTOP_HEIGHT=") +
         std::to_string(static_cast<long long>(_problem.GetOutHeight())) +
-        std::string(" -DMLO_LRN_SCALE_BATCH_STRIDE=") +
+        std::string(" -DSCALE_BATCH_STRIDE=") +
         std::to_string(static_cast<long long>(scale_batch_stride)) +
-        std::string(" -DMLO_LRN_SCALE_CHANNEL_STRIDE=") +
+        std::string(" -DSCALE_CHANNEL_STRIDE=") +
         std::to_string(static_cast<long long>(scale_channel_stride)) +
-        std::string(" -DMLO_LRN_SCALE_STRIDE=") +
-        std::to_string(static_cast<long long>(scale_stride)) +
-        std::string(" -DMLO_LRN_TOPDF_BATCH_STRIDE=") +
-        std::to_string(static_cast<long long>(top_df_batch_stride)) +
-        std::string(" -DMLO_LRN_TOPDF_CHANNEL_STRIDE=") +
-        std::to_string(static_cast<long long>(top_df_channel_stride)) +
-        std::string(" -DMLO_LRN_TOPDF_STRIDE=") +
-        std::to_string(static_cast<long long>(top_df_stride)) +
-        std::string(" -DMLO_LRN_BOTDF_BATCH_STRIDE=") +
-        std::to_string(static_cast<long long>(bot_df_batch_stride)) +
-        std::string(" -DMLO_LRN_BOTDF_CHANNEL_STRIDE=") +
-        std::to_string(static_cast<long long>(bot_df_channel_stride)) +
-        std::string(" -DMLO_LRN_BOTDF_STRIDE=") +
-        std::to_string(static_cast<long long>(bot_df_stride)) +
-        std::string(" -DMLO_LRN_BATCH_SZ=") +
+        std::string(" -DSCALE_STRIDE=") + std::to_string(static_cast<long long>(scale_stride)) +
+        std::string(" -DBATCH_SIZE=") +
         std::to_string(static_cast<long long>(_problem.GetBatchSize())) +
-        std::string(" -DMLO_LRN_N_INPUTS=") +
-        std::to_string(static_cast<long long>(_problem.GetInChannels())) +
-        std::string(" -DMLO_LRN_N_OUTPUTS=") +
-        std::to_string(static_cast<long long>(_problem.GetOutChannels())) +
-        std::string(" -DMLO_LRN_DO_SCALE=") + std::to_string(static_cast<long long>(scale)) +
-        std::string(" -DMLO_OUT_VERT_ALIGNED=") +
-        std::to_string(static_cast<long long>(OUT_VERT_ALIGNED)) +
-        std::string(" -DMLO_OUT_HORIZ_ALIGNED=") +
-        std::to_string(static_cast<long long>(OUT_HORIZ_ALIGNED)) + std::string(" -DMLO_MAP_SZ4=") +
-        std::to_string(static_cast<long long>(MAP_SZ4)) + std::string(" -DMLO_C1x1_PIXLEFT=") +
-        std::to_string(static_cast<long long>(C1x1_PIXLEFT)) + std::string(" -DMLO_DIVBY4=") +
-        std::to_string(static_cast<long long>(DIVBY4)) + std::string(" -DMLO_READ_TYPE=") +
-        READ_TYPE + std::string(" -DMLO_READ_UNIT=") +
+        std::string(" -DDO_SCALE=") + std::to_string(static_cast<long long>(scale)) +
+        std::string(" -DVERT_ALIGNED=") + std::to_string(static_cast<long long>(out_vert_aligned)) +
+        std::string(" -DHORIZ_ALIGNED=") +
+        std::to_string(static_cast<long long>(out_horiz_aligned)) + std::string(" -DMAP_SZ_4=") +
+        std::to_string(static_cast<long long>(map_size_4)) + std::string(" -DC1x1_PIXLEFT=") +
+        std::to_string(static_cast<long long>(c1x1_pixleft)) + std::string(" -DREAD_UNIT=") +
         std::to_string(static_cast<long long>(read_unit)) + getGeneralCompOptions();
 
     _l_wk.clear();
@@ -252,7 +206,7 @@ int mlo_construct_norm::mloConstructFwd()
     if(_norm_region == MLO_LRN_ACROSS_CHANNELS)
     {
 
-        _g_wk.push_back(MAP_SZ4);
+        _g_wk.push_back(map_size_4);
         _g_wk.push_back(1);
         _g_wk.push_back(_problem.GetBatchSize());
     }
@@ -268,14 +222,10 @@ int mlo_construct_norm::mloConstructFwd()
     // calculate workspace
     size_t scale_sz = static_cast<size_t>(_problem.GetBatchSize()) * scale_batch_stride * data_len;
     _workspace_sz   = (doBackward()) ? scale_sz : 0;
-
-    return (ret);
 }
 
-int mlo_construct_norm::mloConstructBwd()
+void mlo_construct_norm::mloConstructBwd()
 {
-    int ret = 0;
-
     _out_pix_tile0 = 1;
     _out_pix_tile1 = 1;
     _grp_tile0     = 8;
@@ -290,10 +240,6 @@ int mlo_construct_norm::mloConstructBwd()
         _out_pix_tile0 = (_in_df_width <= 8) ? 1 : (_in_df_width <= 16) ? 2 : 4;
         _out_pix_tile1 = (_in_df_height <= 8) ? 1 : (_in_df_height <= 16) ? 2 : 4;
     }
-    auto ocl_group_lg2sz0 =
-        static_cast<int>(ceil(log(static_cast<double>(_grp_tile0)) / std::numbers::ln2));
-    auto ocl_group_lg2sz1 =
-        static_cast<int>(ceil(log(static_cast<double>(_grp_tile1)) / std::numbers::ln2));
 
     int pre_pad              = (_norm_area - 1) / 2;
     int pad                  = _norm_area - pre_pad - 1;
@@ -305,69 +251,55 @@ int mlo_construct_norm::mloConstructBwd()
         MIOPEN_THROW("Wrong LRN kernel size");
 
     _comp_options =
-        std::string(" -DMLO_LRN_KERNEL_SZ=") + std::to_string(static_cast<long long>(_norm_area)) +
-        std::string(" -DMLO_LRN_N_OUTPUTS=") +
-        std::to_string(static_cast<long long>(_problem.GetOutChannels())) +
-        std::string(" -DMLO_LRN_N_CHANNELS=") +
-        std::to_string(static_cast<long long>(_problem.GetInChannels())) +
-        std::string(" -DMLO_LRN_PAD=") + std::to_string(static_cast<long long>(pad)) +
-        std::string(" -DMLO_LRN_PRE_PAD=") + std::to_string(static_cast<long long>(pre_pad)) +
-        std::string(" -DMLO_LRN_N_HORIZ_OUT_PIX=") +
-        std::to_string(static_cast<long long>(_out_pix_tile0)) +
-        std::string(" -DMLO_LRN_N_VERT_OUT_PIX=") +
-        std::to_string(static_cast<long long>(_out_pix_tile1)) +
-        std::string(" -DMLO_LRN_GROUP_SZ0=") + std::to_string(static_cast<long long>(_grp_tile0)) +
-        std::string(" -DMLO_LRN_GROUP_SZ1=") + std::to_string(static_cast<long long>(_grp_tile1)) +
-        std::string(" -DMLO_LRN_GROUP_LG2SZ0=") +
-        std::to_string(static_cast<long long>(ocl_group_lg2sz0)) +
-        std::string(" -DMLO_LRN_GROUP_LG2SZ1=") +
-        std::to_string(static_cast<long long>(ocl_group_lg2sz1)) +
-        std::string(" -DMLO_LRN_BOT_BATCH_STRIDE=") +
+        std::string(" -DKERNEL_SIZE=") + std::to_string(static_cast<long long>(_norm_area)) +
+        std::string(" -DOUT_CHANNELS=") +
+        std::to_string(static_cast<long long>(_problem.GetOutChannels())) + std::string(" -DPAD=") +
+        std::to_string(static_cast<long long>(pad)) + std::string(" -DPRE_PAD=") +
+        std::to_string(static_cast<long long>(pre_pad)) + std::string(" -DHORIZ_OUT_PIX=") +
+        std::to_string(static_cast<long long>(_out_pix_tile0)) + std::string(" -DVERT_OUT_PIX=") +
+        std::to_string(static_cast<long long>(_out_pix_tile1)) + std::string(" -DGROUP_SIZE_X=") +
+        std::to_string(static_cast<long long>(_grp_tile0)) + std::string(" -DGROUP_SIZE_Y=") +
+        std::to_string(static_cast<long long>(_grp_tile1)) + std::string(" -DBOT_BATCH_STRIDE=") +
         std::to_string(static_cast<long long>(_problem.GetInBatchStride())) +
-        std::string(" -DMLO_LRN_BOT_CHANNEL_STRIDE=") +
+        std::string(" -DBOT_CHANNEL_STRIDE=") +
         std::to_string(static_cast<long long>(_problem.GetInChannelStride())) +
-        std::string(" -DMLO_LRN_BOT_STRIDE=") +
+        std::string(" -DBOT_STRIDE=") +
         std::to_string(static_cast<long long>(_problem.GetInStride())) +
-        std::string(" -DMLO_LRN_TOP_BATCH_STRIDE=") +
+        std::string(" -DTOP_BATCH_STRIDE=") +
         std::to_string(static_cast<long long>(_problem.GetOutBatchStride())) +
-        std::string(" -DMLO_LRN_TOP_CHANNEL_STRIDE=") +
+        std::string(" -DTOP_CHANNEL_STRIDE=") +
         std::to_string(static_cast<long long>(_problem.GetOutChannelStride())) +
-        std::string(" -DMLO_LRN_TOP_STRIDE=") +
+        std::string(" -DTOP_STRIDE=") +
         std::to_string(static_cast<long long>(_problem.GetOutStride())) +
-        std::string(" -DMLO_LRN_BOT_WIDTH=") +
+        std::string(" -DBOT_WIDTH=") +
         std::to_string(static_cast<long long>(_problem.GetInWidth())) +
-        std::string(" -DMLO_LRN_BOT_HEIGHT=") +
+        std::string(" -DBOT_HEIGHT=") +
         std::to_string(static_cast<long long>(_problem.GetInHeight())) +
-        std::string(" -DMLO_LRN_TOP_WIDTH=") +
+        std::string(" -DTOP_WIDTH=") +
         std::to_string(static_cast<long long>(_problem.GetOutWidth())) +
-        std::string(" -DMLO_LRN_TOP_HEIGHT=") +
+        std::string(" -DTOP_HEIGHT=") +
         std::to_string(static_cast<long long>(_problem.GetOutHeight())) +
-        std::string(" -DMLO_LRN_SCALE_BATCH_STRIDE=") +
+        std::string(" -DSCALE_BATCH_STRIDE=") +
         std::to_string(static_cast<long long>(scale_batch_stride)) +
-        std::string(" -DMLO_LRN_SCALE_CHANNEL_STRIDE=") +
+        std::string(" -DSCALE_CHANNEL_STRIDE=") +
         std::to_string(static_cast<long long>(scale_channel_stride)) +
-        std::string(" -DMLO_LRN_SCALE_STRIDE=") +
-        std::to_string(static_cast<long long>(scale_stride)) +
-        std::string(" -DMLO_LRN_TOPDF_BATCH_STRIDE=") +
+        std::string(" -DSCALE_STRIDE=") + std::to_string(static_cast<long long>(scale_stride)) +
+        std::string(" -DTOPDF_BATCH_STRIDE=") +
         std::to_string(static_cast<long long>(_out_df_batch_stride)) +
-        std::string(" -DMLO_LRN_TOPDF_CHANNEL_STRIDE=") +
+        std::string(" -DTOPDF_CHANNEL_STRIDE=") +
         std::to_string(static_cast<long long>(_out_df_channel_stride)) +
-        std::string(" -DMLO_LRN_TOPDF_STRIDE=") +
-        std::to_string(static_cast<long long>(_out_df_stride)) +
-        std::string(" -DMLO_LRN_BOTDF_BATCH_STRIDE=") +
+        std::string(" -DTOPDF_STRIDE=") + std::to_string(static_cast<long long>(_out_df_stride)) +
+        std::string(" -DBOTDF_BATCH_STRIDE=") +
         std::to_string(static_cast<long long>(_in_df_batch_stride)) +
-        std::string(" -DMLO_LRN_BOTDF_CHANNEL_STRIDE=") +
+        std::string(" -DBOTDF_CHANNEL_STRIDE=") +
         std::to_string(static_cast<long long>(_in_df_channel_stride)) +
-        std::string(" -DMLO_LRN_BOTDF_STRIDE=") +
-        std::to_string(static_cast<long long>(_in_df_stride)) +
-        std::string(" -DMLO_LRN_BATCH_SZ=") +
+        std::string(" -DBOTDF_STRIDE=") + std::to_string(static_cast<long long>(_in_df_stride)) +
+        std::string(" -DBATCH_SIZE=") +
         std::to_string(static_cast<long long>(_problem.GetBatchSize())) +
-        std::string(" -DMLO_LRN_N_INPUTS=") +
-        std::to_string(static_cast<long long>(_problem.GetInChannels())) +
-        std::string(" -DMLO_LRN_N_OUTPUTS=") +
-        std::to_string(static_cast<long long>(_problem.GetOutChannels())) + getGeneralCompOptions();
+        std::string(" -DN_INPUTS=") +
+        std::to_string(static_cast<long long>(_problem.GetInChannels())) + getGeneralCompOptions();
 
-    _kernel_file = "MIOpenLRNBwd.cl";
+    _kernel_file = "MIOpenLRNBwd.cpp";
 
     _l_wk.clear();
     _g_wk.clear();
@@ -394,6 +326,4 @@ int mlo_construct_norm::mloConstructBwd()
         _g_wk.push_back(static_cast<size_t>(_problem.GetInChannels()) * _problem.GetBatchSize());
         _kernel_name = "MIOpenLRNWithinChannelBwd";
     }
-
-    return (ret);
 }