[hipblaslt] Fix failures with dtl.yaml and xfp32.yaml on gfx950_mx_rebase (#4906)

nakajee · web-flow · commit 1c2fe0e3fa37 · 2026-02-26T13:54:14.000-07:00
## Motivation

Fix failures with dtl.yaml and xfp32.yaml on the gfx950_mx_rebase branch.

## Technical Details

- Fixed a merge issue with `if kernel["ScheduleIterAlg"] == 3`
- Added an int cast for a float-valued constant in asm
- Fixed an incorrect parameter for `calcLdsBlockSizePerPad()`
- Fixed an incorrect local read calculation caused by incorrectly applying MX logic to TF32 emulation
- Fixed an incorrect ShiftK code vreg caused by a missing if condition for TF32 emulation

## Test Plan

Ran dtl.yaml and xfp32.yaml on the gfx950_mx_rebase branch.

## Test Result

All passed except for a known issue (should be fixed with the latest develop branch).

## Submission Checklist

- [ ] Look over the contributing guidelines at https://github.com/ROCm/ROCm/blob/develop/CONTRIBUTING.md#pull-requests.
diff --git a/projects/hipblaslt/tensilelite/Tensile/Components/LocalRead.py b/projects/hipblaslt/tensilelite/Tensile/Components/LocalRead.py
@@ -252,7 +252,7 @@ def __call__(self, writer, kernel, bufferIdx, iui, epsi, tP):
         # fp64 TLU=1 reading 0.5element/lane/read..
         # for TLU=0 case, blockWidth and LRVW should match
         miInputPerGroup = kernel["MIInputPerThread%s"%tc]
-        if writer.states.asmCaps["HasMFMA_f8f6f4"] and ((tP["bpeDS"] * miInputPerGroup) > 24):
+        if writer.states.asmCaps["HasMFMA_f8f6f4"] and ((tP["bpeDS"] * miInputPerGroup) > 24) and not kernel["UseF32XEmulation"]:
           miInputPerGroup = int(16 / tP["bpeDS"])
         miInputGroup = kernel["MIInputPerThread%s"%tc] // miInputPerGroup
         numReadsPerUnroll = ceil(tP["bpeDS"] * miInputPerGroup / int(unrollBlockWidth * bpr))
diff --git a/projects/hipblaslt/tensilelite/Tensile/KernelWriter.py b/projects/hipblaslt/tensilelite/Tensile/KernelWriter.py
@@ -1872,7 +1872,7 @@ def calculateRangeAndUpdateCounter(itemCounter, writeCounters, length):
         localReads += (localReadsA + localReadsB + localReadsMXSA + localReadsMXSB)
 
         # some of localReads is interleaved after waitcnt in SIA3
-        if kernel["ScheduleIterAlg"] == 3 and self.states.numItersPLR and\
+        if scheduleIterAlg == 3 and self.states.numItersPLR and\
           (iteration < maxNumberReadIter or numPrefetchIter):
           if ((iteration < numReadsIterA and not dataAtIterA < maxDataAtIter) or numPrefetchIter) and (not kernel["DirectToVgprA"]):
             localReads -= self.states.numReadsPerIterA * readFactorA
@@ -4024,9 +4024,9 @@ def _initKernel(self, kernel, tensorParametersA, tensorParametersB):
       unitA = 1
       unitB = 1
       if ((not tluA) and (bpeGRA * asem < 4) and grvwa > 1):
-        unitA = 4 // (bpeGRA * asem)
+        unitA = int(4 / (bpeGRA * asem))
       if ((not tluB) and (bpeGRB * asem < 4) and grvwb > 1):
-        unitB = 4 // (bpeGRB * asem)
+        unitB = int(4 / (bpeGRB * asem))
       self.states.tailloopInNllmaxUnit = max(unitA, unitB)
 
     # Only assembly supports scheduling
diff --git a/projects/hipblaslt/tensilelite/Tensile/KernelWriterAssembly.py b/projects/hipblaslt/tensilelite/Tensile/KernelWriterAssembly.py
@@ -5282,7 +5282,7 @@ def generateFindTheLastElementLocation(tc):
                          comment="Calculate the remaining dimension along I/J direction."))
         imod.add(SSubU32(dst=sgpr(sTmp0), src0=sgpr(strSize), src1=sgpr(sTmp0), \
                          comment="Calculate the remaining dimension along I/J direction."))
-        imod.add(SMulI32(dst=sgpr(sTmp0), src0=sgpr(sTmp0), src1=tP["bpeGR"], \
+        imod.add(SMulI32(dst=sgpr(sTmp0), src0=sgpr(sTmp0), src1=int(tP["bpeGR"]), \
                          comment="In bytes"))
         imod.add(SAndB32(dst=sgpr(sTmp1), src0=sgpr("SizeL"), src1=(kernel["DepthU"] - 1), \
                          comment="Calculate the remaining dimension along L direction."))
@@ -6611,6 +6611,8 @@ def generateSrcStrForMFMA(self, kernel, tP, innerUnroll, vregSetIdx, vgprPerInpu
     iui_new_offset = iui%numReadsIterCoalesced*vgprPerInput
     ab_new = idxAB*vgprPerInput*numReadsIterCoalesced
     abStr = "Valu%s_X%u_I%u+%u+%u+%u" % (tc, vgprBuffer_new, iui_new, ab_new, vgprBuffer_new_offset, iui_new_offset)
+    if kernel["UseDirect32XEmulation"] and bk != None and (int(bk) % 8) < 4:
+      abStr = "Valu%c_T%u_I%u+%u+%u+%u" % (tc, vgprBuffer_new, iui_new, ab_new // 2, vgprBuffer_new_offset, iui_new_offset)
     if kernel["DirectToVgpr%s"%tc] and not (packDTV or convDTV):
       # overwrite aStr/bStr for DirectToVgpr (except for pack DTV case)
       numVgprPerBlock = statesTc.numVgprG2LAllocated
diff --git a/projects/hipblaslt/tensilelite/Tensile/SolutionStructs/Solution.py b/projects/hipblaslt/tensilelite/Tensile/SolutionStructs/Solution.py
@@ -2947,7 +2947,7 @@ def calSwizzlePackK(state, tc):
     auto_LdsBlockSizePerPadB_for_mix = 0
     if state["LdsBlockSizePerPadB"] == -1:
       auto_LdsBlockSizePerPadB_for_mix = 1
-    state["LdsBlockSizePerPadA"], state["LdsBlockSizePerPadB"] = calcLdsBlockSizePerPad(-1) # for MX datatypes, the lrvw argument is ignored 
+    state["LdsBlockSizePerPadA"], state["LdsBlockSizePerPadB"] = calcLdsBlockSizePerPad(state["LocalReadVectorWidth"])
 
     if state["LdsBlockSizePerPadMetadata"] == -1:
       state["LdsBlockSizePerPadMetadata"] = state["LdsBlockSizePerPadA"]