[llvm] [AMDGPU] Fixed folding of inline imm into dot w/o opsel (PR #73589)

Mon Nov 27 15:33:32 PST 2023

https://github.com/rampitec created https://github.com/llvm/llvm-project/pull/73589

A splat packed constant can be folded as an inline immediate but it shall use opsel. On gfx940 this code path can be skipped due to HW bug workaround and then it may be folded w/o opsel which is a bug. Fixed.

>From eeb6fb4f75d88ee6d4d1ca73114902fe8deb0204 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin at amd.com>
Date: Mon, 27 Nov 2023 15:17:24 -0800
Subject: [PATCH] [AMDGPU] Fixed folding of inline imm into dot w/o opsel

A splat packed constant can be folded as an inline immediate but
it shall use opsel. On gfx940 this code path can be skipped due
to HW bug workaround and then it may be folded w/o opsel which is
a bug. Fixed.
---
 llvm/lib/Target/AMDGPU/SIFoldOperands.cpp     | 4 +++-
 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot2.ll | 9 ++++++---
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 0ec0370e21dfc16..5f8e32c812292b2 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -205,9 +205,11 @@ bool SIFoldOperands::updateOperand(FoldCandidate &Fold) const {
   const uint64_t TSFlags = MI->getDesc().TSFlags;
   if (Fold.isImm()) {
     if (TSFlags & SIInstrFlags::IsPacked && !(TSFlags & SIInstrFlags::IsMAI) &&
-        (!ST->hasDOTOpSelHazard() || !(TSFlags & SIInstrFlags::IsDOT)) &&
         AMDGPU::isFoldableLiteralV216(Fold.ImmToFold,
                                       ST->hasInv2PiInlineImm())) {
+      if (ST->hasDOTOpSelHazard() && (TSFlags & SIInstrFlags::IsDOT))
+        return false; // Prevent further folding of this operand without opsel.
+
       // Set op_sel/op_sel_hi on this operand or bail out if op_sel is
       // already set.
       unsigned Opcode = MI->getOpcode();
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot2.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot2.ll
index 490ce706455cd6f..04c84df859b27fa 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot2.ll
@@ -1,6 +1,6 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX9,GFX906
-; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX9,GFX940
-; RUN: llc -march=amdgcn -mcpu=gfx940 -global-isel -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX9,GFX940
+; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX9,GFX940-SDAG
+; RUN: llc -march=amdgcn -mcpu=gfx940 -global-isel -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX9,GFX940-GISEL
 ; RUN: llc -march=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX10
 ; RUN: llc -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX10
 
@@ -43,7 +43,10 @@ entry:
 
 ; GCN-LABEL: {{^}}test_llvm_amdgcn_udot2_op_sel:
 ; GFX906: v_dot2_u32_u16 v{{[0-9]+}}, 1, v{{[0-9]+}}, s{{[0-9]+}} op_sel:[0,1,0] op_sel_hi:[0,0,1]{{$}}
-; GFX940: v_dot2_u32_u16 v{{[0-9]+}}, 1, v{{[0-9]+}}, s{{[0-9]+}}{{$}}
+; GFX940-SDAG: s_mov_b32 [[K:s[0-9]+]], 0x10001
+; GFX940-SDAG: v_dot2_u32_u16 v{{[0-9]+}}, [[K]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
+; GFX940-GISEL: v_mov_b32_e32 [[K:v[0-9]+]], 0x10001
+; GFX940-GISEL: v_dot2_u32_u16 v{{[0-9]+}}, [[K]], v{{[0-9]+}}, s{{[0-9]+}}{{$}}
 ; GFX10:  v_dot2_u32_u16 v{{[0-9]+}}, 1, v{{[0-9]+}}, s{{[0-9]+}} op_sel:[0,1,0] op_sel_hi:[0,0,1]{{$}}
 define amdgpu_kernel void @test_llvm_amdgcn_udot2_op_sel(
     ptr addrspace(1) %r,