[llvm] AMDGPU: Allow operand folding between loop body and its preheader (PR #137022)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Apr 23 10:17:39 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Akash Dutta (akadutta)
<details>
<summary>Changes</summary>
---
Full diff: https://github.com/llvm/llvm-project/pull/137022.diff
4 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/SIFoldOperands.cpp (+14-4)
- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.cpp (+88-11)
- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.h (+5)
- (added) llvm/test/CodeGen/AMDGPU/fold-redundant-sgpr-vgpr-rw-across-bb.ll (+114)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 1547142a8d5c6..fa4a5014632cd 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -23,6 +23,16 @@
#define DEBUG_TYPE "si-fold-operands"
using namespace llvm;
+static cl::opt<bool> SIFoldOperandsPreheader(
+ "amdgpu-si-fold-operands-preheader",
+ cl::desc("Enables operand folding between loop body and its preheader "),
+ cl::init(true));
+
+static cl::opt<int> SIFoldOperandsPreheaderThreshold(
+ "amdgpu-si-fold-operands-preheader-threshold", cl::init(100),
+ cl::desc("Threshold for operand folding hazard check. "
+ "Defaults to 100 MIs, upper limit 10000."));
+
namespace {
struct FoldCandidate {
@@ -1168,10 +1178,10 @@ void SIFoldOperandsImpl::foldOperand(
}
if (OpToFold.isReg() && TRI->isSGPRReg(*MRI, OpToFold.getReg())) {
- if (execMayBeModifiedBeforeUse(*MRI,
- UseMI->getOperand(UseOpIdx).getReg(),
- *OpToFold.getParent(),
- *UseMI))
+ if (checkIfExecMayBeModifiedBeforeUseAcrossBB(
+ *MRI, UseMI->getOperand(UseOpIdx).getReg(),
+ *OpToFold.getParent(), *UseMI, SIFoldOperandsPreheader,
+ SIFoldOperandsPreheaderThreshold))
return;
// %vgpr = COPY %sgpr0
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index e6d54860df221..c0aeb6619c701 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -9743,6 +9743,90 @@ MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
return nullptr;
}
+// Helper for checkIfExecMayBeModifiedBeforeUseAcrossBB and
+// execMayBeModifiedBeforeUse. Returns true if EXEC may be modified in the
+// straight-line sequence of instructions in [BeginIterator, EndIterator),
+// or if more than MaxInstScan instructions would have to be scanned.
+static bool execMayBeModifiedBeforeUseUtil(
+ const TargetRegisterInfo *TRI,
+ const MachineInstrBundleIterator<const MachineInstr> BeginIterator,
+ const MachineInstrBundleIterator<const MachineInstr> EndIterator,
+ const int MaxInstScan) {
+
+ int NumInst = 0;
+ for (auto I = BeginIterator; I != EndIterator; ++I) {
+ if (I->isDebugInstr())
+ continue;
+
+ if (++NumInst > MaxInstScan) {
+ dbgs() << "## maxinst\n";
+ return true;
+ }
+
+ if (I->modifiesRegister(AMDGPU::EXEC, TRI))
+ return true;
+ }
+ return false;
+}
+
+// Variant of execMayBeModifiedBeforeUse() where DefMI and UseMI may belong to
+// different basic blocks. The cross-block handling is currently limited to a
+// very simple case: DefMI in the preheader of the single-block loop that
+// contains UseMI.
+bool llvm::checkIfExecMayBeModifiedBeforeUseAcrossBB(
+ const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI,
+ const MachineInstr &UseMI, const bool SIFoldOperandsPreheader,
+ const int SIFoldOperandsPreheaderThreshold) {
+
+ assert(MRI.isSSA() && "Must be run on SSA");
+ auto *TRI = MRI.getTargetRegisterInfo();
+ auto *DefBB = DefMI.getParent();
+ const int MaxInstScan = (SIFoldOperandsPreheaderThreshold > 10000)
+ ? 10000
+ : SIFoldOperandsPreheaderThreshold;
+
+ // Check whether EXEC may be modified along any control-flow path between
+ // DefMI and UseMI, which may include the loop backedge:
+ // 1. UseBB must be the only successor of DefBB.
+ // 2. UseBB must be a single-block loop, i.e. its only predecessors are
+ // DefBB and UseBB itself.
+ // 3. Scan both blocks for instructions that modify EXEC.
+ auto *UseBB = UseMI.getParent();
+ if (UseBB != DefBB) {
+ if (SIFoldOperandsPreheader) {
+ if (!(DefBB->isSuccessor(UseBB) && (DefBB->succ_size() == 1)))
+ return true;
+
+ if (!((UseBB->pred_size() == 2) && UseBB->isPredecessor(UseBB) &&
+ UseBB->isPredecessor(DefBB)))
+ return true;
+
+ bool canExecBeModifiedBeforeUse = execMayBeModifiedBeforeUseUtil(
+ TRI, UseBB->begin(), UseBB->end(), MaxInstScan);
+ if (canExecBeModifiedBeforeUse)
+ return true;
+
+ // Also scan from just past DefMI to the end of DefBB: EXEC must not be
+ // clobbered in the remainder of the preheader either.
+ canExecBeModifiedBeforeUse = execMayBeModifiedBeforeUseUtil(
+ TRI, std::next(DefMI.getIterator()), DefBB->end(), MaxInstScan);
+ if (canExecBeModifiedBeforeUse)
+ return true;
+
+ return false;
+ }
+ return true;
+ } else {
+ // Stop scan at the use.
+ bool canExecBeModifiedBeforeUse = execMayBeModifiedBeforeUseUtil(
+ TRI, std::next(DefMI.getIterator()), UseMI.getIterator(), MaxInstScan);
+ if (canExecBeModifiedBeforeUse)
+ return true;
+
+ return false;
+ }
+}
+
bool llvm::execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI,
Register VReg,
const MachineInstr &DefMI,
@@ -9761,17 +9845,10 @@ bool llvm::execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI,
- int NumInst = 0;
// Stop scan at the use.
- auto E = UseMI.getIterator();
- for (auto I = std::next(DefMI.getIterator()); I != E; ++I) {
- if (I->isDebugInstr())
- continue;
-
- if (++NumInst > MaxInstScan)
- return true;
-
- if (I->modifiesRegister(AMDGPU::EXEC, TRI))
- return true;
- }
+ bool canExecBeModifiedBeforeUse = execMayBeModifiedBeforeUseUtil(
+ TRI, std::next(DefMI.getIterator()), UseMI.getIterator(), MaxInstScan);
+ if (canExecBeModifiedBeforeUse)
+ return true;
return false;
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index a3a54659d299a..f16cdbbab8192 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1531,6 +1531,11 @@ bool execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI,
const MachineInstr &DefMI,
const MachineInstr &UseMI);
+bool checkIfExecMayBeModifiedBeforeUseAcrossBB(
+ const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI,
+ const MachineInstr &UseMI, const bool SIFoldOperandsPreheader,
+ const int SIFoldOperandsPreheaderThreshold);
+
/// \brief Return false if EXEC is not changed between the def of \p VReg at \p
/// DefMI and all its uses. Should be run on SSA. Currently does not attempt to
/// track between blocks.
diff --git a/llvm/test/CodeGen/AMDGPU/fold-redundant-sgpr-vgpr-rw-across-bb.ll b/llvm/test/CodeGen/AMDGPU/fold-redundant-sgpr-vgpr-rw-across-bb.ll
new file mode 100644
index 0000000000000..2c0e3b8855fc4
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fold-redundant-sgpr-vgpr-rw-across-bb.ll
@@ -0,0 +1,114 @@
+; NOTE: Do not autogenerate
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s
+
+; ModuleID = '<stdin>'
+source_filename = "add.cpp"
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
+target triple = "amdgcn-amd-amdhsa"
+
+$main = comdat any
+
+; Function Attrs: convergent mustprogress nofree norecurse nounwind
+define protected amdgpu_kernel void @main(ptr addrspace(1) noundef %args.coerce, ptr addrspace(1) noundef %args.coerce2, ptr addrspace(1) noundef %args.coerce4, i32 noundef %args10, i32 noundef %args12) local_unnamed_addr #0 comdat {
+; GCN-LABEL: main:
+; Check that non-redundant readfirstlanes are kept,
+; GCN: v_readfirstlane_b32
+; and that all redundant readfirstlanes have been folded away.
+; GCN-NOT: v_readfirstlane_b32
+; GCN: s_endpgm
+entry:
+ %0 = tail call noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x()
+ %div1 = lshr i32 %0, 6
+ %rfl1 = tail call noundef i32 @llvm.amdgcn.readfirstlane.i32(i32 %div1)
+ %sub1 = add nsw i32 %args12, 1023
+ %div2 = sdiv i32 %sub1, 1024
+ %rfl2 = tail call i32 @llvm.amdgcn.readfirstlane.i32(i32 %div2)
+ %cmp24.i = icmp sgt i32 %rfl2, 0
+ br i1 %cmp24.i, label %for.body.lr.ph.i, label %add.exit
+
+for.body.lr.ph.i: ; preds = %entry
+ %pti1 = ptrtoint ptr addrspace(1) %args.coerce4 to i64
+ %pti2 = ptrtoint ptr addrspace(1) %args.coerce2 to i64
+ %pti3 = ptrtoint ptr addrspace(1) %args.coerce to i64
+ %lshr1 = lshr i32 %rfl1, 2
+ %wid1 = tail call noundef i32 @llvm.amdgcn.workgroup.id.x()
+ %add7 = add i32 %lshr1, %wid1
+ %mbl = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+ %mbh = tail call noundef i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbl)
+ %lshr2 = lshr i32 %mbh, 6
+ %add8 = add i32 %add7, %lshr2
+ %sub2 = shl i32 %mbh, 2
+ %mul1 = and i32 %sub2, 252
+ %sub3 = shl i32 %rfl1, 8
+ %mul2 = and i32 %sub3, 768
+ %add1 = or disjoint i32 %mul1, %mul2
+ %add2 = shl i32 %args12, 1
+ %mul3 = mul i32 %add2, %add8
+ %add3 = add nsw i32 %add1, %mul3
+ %zext1 = zext i32 %args12 to i64
+ %sub4 = shl nuw i64 %zext1, 32
+ %sext1 = add i64 %sub4, 4611686014132420608
+ %conv1 = lshr exact i64 %sext1, 32
+ %add4 = add nuw nsw i64 %conv1, 1
+ %zext2 = zext i32 %args10 to i64
+ %tmp.sroa = add nuw nsw i64 %zext2, 4294967295
+ %sub5 = add i64 %tmp.sroa, %sub4
+ %sext2 = mul i64 %sub5, %sub4
+ %conv2 = lshr exact i64 %sext2, 32
+ %add5 = add nuw nsw i64 %add4, %conv2
+ %conv3 = trunc i64 %add5 to i32
+ %mul4 = shl i32 %conv3, 2
+ %bc1 = bitcast i64 %pti3 to <2 x i32>
+ %ee1 = extractelement <2 x i32> %bc1, i64 0
+ %ee2 = extractelement <2 x i32> %bc1, i64 1
+ %bc2 = bitcast i64 %pti2 to <2 x i32>
+ %ee3 = extractelement <2 x i32> %bc2, i64 0
+ %ee4 = extractelement <2 x i32> %bc2, i64 1
+ %bc3 = bitcast i64 %pti1 to <2 x i32>
+ %ee5 = extractelement <2 x i32> %bc3, i64 0
+ %ee6 = extractelement <2 x i32> %bc3, i64 1
+ br label %for.body.i
+
+for.body.i: ; preds = %for.body.i, %for.body.lr.ph.i
+ %loopi = phi i32 [ 0, %for.body.lr.ph.i ], [ %inc.i, %for.body.i ]
+ %tmp1 = phi i32 [ %add3, %for.body.lr.ph.i ], [ %cnt, %for.body.i ]
+ %rfl3 = tail call i32 @llvm.amdgcn.readfirstlane.i32(i32 %ee1)
+ %rfl4 = tail call i32 @llvm.amdgcn.readfirstlane.i32(i32 %ee2)
+ %rfl5 = tail call i32 @llvm.amdgcn.readfirstlane.i32(i32 %mul4)
+ %ie1 = insertelement <4 x i32> <i32 poison, i32 poison, i32 poison, i32 131072>, i32 %rfl3, i64 0
+ %ie2 = insertelement <4 x i32> %ie1, i32 %rfl4, i64 1
+ %ie3 = insertelement <4 x i32> %ie2, i32 %rfl5, i64 2
+ %mul5 = shl i32 %tmp1, 2
+ %buffload1 = tail call contract noundef <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> noundef %ie3, i32 noundef %mul5, i32 noundef 0, i32 noundef 0) #6
+ %add6 = add nsw i32 %tmp1, %args12
+ %mul6 = shl i32 %add6, 2
+ %buffload2 = tail call contract noundef <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> noundef %ie3, i32 noundef %mul6, i32 noundef 0, i32 noundef 0) #6
+ %rfl6 = tail call i32 @llvm.amdgcn.readfirstlane.i32(i32 %ee3)
+ %rfl7 = tail call i32 @llvm.amdgcn.readfirstlane.i32(i32 %ee4)
+ %ie4 = insertelement <4 x i32> <i32 poison, i32 poison, i32 poison, i32 131072>, i32 %rfl6, i64 0
+ %ie5 = insertelement <4 x i32> %ie4, i32 %rfl7, i64 1
+ %ie6 = insertelement <4 x i32> %ie5, i32 %rfl5, i64 2
+ %buffload3 = tail call contract noundef <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> noundef %ie6, i32 noundef %mul5, i32 noundef 0, i32 noundef 0) #6
+ %buffload4 = tail call contract noundef <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> noundef %ie6, i32 noundef %mul6, i32 noundef 0, i32 noundef 0) #6
+ %rfl8 = tail call i32 @llvm.amdgcn.readfirstlane.i32(i32 %ee5)
+ %rfl9 = tail call i32 @llvm.amdgcn.readfirstlane.i32(i32 %ee6)
+ %ie7 = insertelement <4 x i32> <i32 poison, i32 poison, i32 poison, i32 131072>, i32 %rfl8, i64 0
+ %ie8 = insertelement <4 x i32> %ie7, i32 %rfl9, i64 1
+ %ie9 = insertelement <4 x i32> %ie8, i32 %rfl5, i64 2
+ %vec_add1 = fadd contract <4 x float> %buffload1, %buffload3
+ %vec_add2 = fadd contract <4 x float> %buffload2, %buffload4
+ tail call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> noundef %vec_add1, <4 x i32> noundef %ie9, i32 noundef %mul5, i32 noundef 0, i32 noundef 0) #6
+ tail call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> noundef %vec_add2, <4 x i32> noundef %ie9, i32 noundef %mul6, i32 noundef 0, i32 noundef 0) #6
+ %cnt = add nsw i32 %tmp1, 1024
+ %inc.i = add nuw nsw i32 %loopi, 1
+ %exitcond.not.i = icmp eq i32 %inc.i, %rfl2
+ br i1 %exitcond.not.i, label %add.exit, label %for.body.i, !llvm.loop !6
+
+ add.exit: ; preds = %for.body.i, %entry
+ ret void
+}
+
+; Function Attrs: convergent mustprogress nocallback nofree nounwind willreturn memory(none)
+declare i32 @llvm.amdgcn.readfirstlane.i32(i32) #1
+!6 = distinct !{!6, !7}
+!7 = !{!"llvm.loop.mustprogress"}
``````````
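The two `cl::opt` flags added in SIFoldOperands.cpp gate the new cross-block fold and bound how far the hazard scan may walk. For local experimentation they can be passed to `llc` directly; a RUN line in the style of the new test (the threshold value here is only an example, not from the patch) would look like:

```llvm
; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs \
; RUN:   -amdgpu-si-fold-operands-preheader=true \
; RUN:   -amdgpu-si-fold-operands-preheader-threshold=200 < %s | FileCheck --check-prefix=GCN %s
```

The boolean flag defaults to true and the threshold to 100; values above 10000 are clamped inside `checkIfExecMayBeModifiedBeforeUseAcrossBB`.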
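For reviewers skimming the change, this is the only cross-block shape the new check accepts: the def lives in a block whose sole successor is a single-block loop, and that loop's only predecessors are the def block and itself. A minimal IR sketch of that shape (not taken from the patch; the function name and loop body are illustrative):

```llvm
define amdgpu_kernel void @preheader_then_loop(ptr addrspace(1) %out, i32 %n, i32 %v) {
entry:                                   ; preheader: its only successor is %loop
  %u = tail call i32 @llvm.amdgcn.readfirstlane.i32(i32 %v)
  br label %loop

loop:                                    ; single-block loop: preds are %entry and %loop
  %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
  %gep = getelementptr inbounds i32, ptr addrspace(1) %out, i32 %i
  store i32 %u, ptr addrspace(1) %gep    ; use of the preheader def inside the loop
  %i.next = add nuw nsw i32 %i, 1
  %done = icmp eq i32 %i.next, %n
  br i1 %done, label %exit, label %loop

exit:
  ret void
}

declare i32 @llvm.amdgcn.readfirstlane.i32(i32)
```

Any other shape (extra successors of the def block, additional loop predecessors, or an EXEC clobber found while scanning either block) makes `checkIfExecMayBeModifiedBeforeUseAcrossBB` return true, and the fold is skipped.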
</details>
https://github.com/llvm/llvm-project/pull/137022
More information about the llvm-commits
mailing list