[llvm] [AMDGPU][LRO] LRO fix PHI same-BB filter; treat i8/i16 binops as profitable (PR #155800)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Aug 28 02:28:05 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: None (michaelselehov)
<details>
<summary>Changes</summary>
Fix a bug in `isCoercionProfitable` where the same-block filter checked the def (II) instead of the user (CII), pruning valid paths. Also allow same-BB non-lookthrough users when the def is a PHI, so loop headers can be coerced across the backedge.
Extend `isOpLegal` to treat 8/16-bit vector add/sub/and/or/xor as profitable on SDWA targets (stores and intrinsics remain profitable). This repacks loop-carried values to i32 across BBs and restores SDWA lowering instead of scattered lshr/lshl/or sequences.
Testing:
- Local: `check-llvm-codegen-amdgpu` is green (4314/4320 passed, 6 XFAIL).
- Additional: validated in AMD internal CI
---
Full diff: https://github.com/llvm/llvm-project/pull/155800.diff
2 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp (+37-2)
- (added) llvm/test/CodeGen/AMDGPU/lro-coerce-v4i8-phi-loop.ll (+67)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
index 38718c43a61dd..e4866405c6ad4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -126,7 +126,37 @@ class LiveRegOptimizer {
return LK.first != TargetLoweringBase::TypeLegal;
}
- bool isOpLegal(Instruction *I) { return isa<StoreInst, IntrinsicInst>(I); }
+  // Returns true when coercing a live range into I is considered profitable:
+  // intrinsic users, stores, and (on SDWA-capable subtargets) 8/16-bit
+  // integer-vector binops.
+  bool isOpLegal(Instruction *I) {
+    if (isa<IntrinsicInst>(I))
+      return true; // FIXME: narrow to known native intrinsics (DOT/MFMA/tbuffer) or use TTI cost.
+
+    // Any store is a profitable sink (prevents flip-flopping)
+    if (isa<StoreInst>(I))
+      return true;
+
+    // Treat small-int vector binops as profitable when SDWA is available.
+    // We explicitly gate to 8/16-bit to avoid i1 vectors and keep behavior
+    // tight. Both widths require SDWA: note the parenthesization — without
+    // it, '&&' binds tighter than '||' and i8 would bypass the SDWA check.
+    if (auto *BO = dyn_cast<BinaryOperator>(I)) {
+      if (auto *VTy = dyn_cast<VectorType>(BO->getType())) {
+        Type *Elt = VTy->getElementType();
+        if ((Elt->isIntegerTy(8) || Elt->isIntegerTy(16)) && ST.hasSDWA()) {
+          switch (BO->getOpcode()) {
+          case Instruction::Add:
+          case Instruction::Sub:
+          case Instruction::And:
+          case Instruction::Or:
+          case Instruction::Xor:
+            return true;
+          default:
+            break;
+          }
+        }
+      }
+    }
+
+    return false;
+  }
bool isCoercionProfitable(Instruction *II) {
SmallPtrSet<Instruction *, 4> CVisited;
@@ -150,7 +180,12 @@ class LiveRegOptimizer {
if (!CVisited.insert(CII).second)
continue;
- if (CII->getParent() == II->getParent() && !IsLookThru(II))
+ // Allow same-BB non-lookthrough users when the def is a PHI:
+ // loop headers frequently consume the carried value in the header block
+ // (e.g. byte-wise vector binops). We *do* want to coerce across the backedge
+ // in that common case to enable packed i32 + SDWA lowering.
+ if (CII->getParent() == II->getParent() && !IsLookThru(CII) &&
+ !isa<PHINode>(II))
continue;
if (isOpLegal(CII))
diff --git a/llvm/test/CodeGen/AMDGPU/lro-coerce-v4i8-phi-loop.ll b/llvm/test/CodeGen/AMDGPU/lro-coerce-v4i8-phi-loop.ll
new file mode 100644
index 0000000000000..a37aaf154520b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lro-coerce-v4i8-phi-loop.ll
@@ -0,0 +1,67 @@
+; REQUIRES: amdgpu-registered-target
+; RUN: opt -S -passes=amdgpu-late-codegenprepare \
+; RUN: -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a %s | FileCheck %s
+
+; Purpose:
+; - Input has a loop-carried PHI of type <4 x i8> and byte-wise adds in the
+; loop header (same basic block as the PHI).
+; - After amdgpu-late-codegenprepare, the PHI must be coerced to i32 across
+; the backedge, and a single dominating "bitcast i32 -> <4 x i8>" must be
+; placed in the header (enabling SDWA-friendly lowering later).
+;
+; What we check:
+; - PHI is i32 (no loop-carried <4 x i8> PHI remains).
+; - A header-local bitcast i32 -> <4 x i8> exists and feeds the vector add.
+; - The loopexit produces a bitcast <4 x i8> -> i32 for the backedge.
+
+target triple = "amdgcn-amd-amdhsa"
+
+; NOTE(review): typed-pointer syntax (`i8*`) — modern opt defaults to opaque
+; pointers (`ptr`); confirm this parses on current ToT before landing.
+; NOTE(review): %p is never used in the body — intentional? (kept to give the
+; kernel a realistic signature, presumably).
+define amdgpu_kernel void @lro_coerce_v4i8_phi(i8* nocapture %p, i32 %n) #0 {
+entry:
+  br label %loop
+
+loop:
+  ; Loop index
+  %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
+
+  ; Loop-carried accumulator in vector-of-bytes form (problematic on input).
+  %acc = phi <4 x i8> [ zeroinitializer, %entry ], [ %acc.next, %loop ]
+
+  ; Make up four i8 values derived from %i to avoid memory noise.
+  %i0 = trunc i32 %i to i8
+  %i1i = add i32 %i, 1
+  %i1 = trunc i32 %i1i to i8
+  %i2i = add i32 %i, 2
+  %i2 = trunc i32 %i2i to i8
+  %i3i = add i32 %i, 3
+  %i3 = trunc i32 %i3i to i8
+
+  ; Pack them into <4 x i8>.
+  ; NOTE(review): new tests usually prefer `poison` over `undef` for the
+  ; insertelement seed — consider updating.
+  %v01 = insertelement <4 x i8> undef, i8 %i0, i32 0
+  %v02 = insertelement <4 x i8> %v01, i8 %i1, i32 1
+  %v03 = insertelement <4 x i8> %v02, i8 %i2, i32 2
+  %v = insertelement <4 x i8> %v03, i8 %i3, i32 3
+
+  ; Byte-wise add in the same block as the PHI (this must make coercion profitable).
+  %acc.next = add <4 x i8> %acc, %v
+
+  ; Loop control.
+  %i.next = add i32 %i, 4
+  %cond = icmp slt i32 %i.next, %n
+  br i1 %cond, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+attributes #0 = { "target-cpu"="gfx90a" }
+
+; CHECK-LABEL: define amdgpu_kernel void @lro_coerce_v4i8_phi(
+; CHECK: loop:
+; CHECK: %i = phi i32
+; CHECK-NOT: phi <4 x i8>
+; CHECK: %[[ACCI32:[^ ]+]] = phi i32
+; CHECK-NEXT: %[[HDRCAST:[^ ]+]] = bitcast i32 %[[ACCI32]] to <4 x i8>
+; CHECK: add <4 x i8> %[[HDRCAST]],
+; CHECK: br i1
+
``````````
</details>
https://github.com/llvm/llvm-project/pull/155800
More information about the llvm-commits
mailing list