[llvm] [AMDGPU] LiveRegOptimizer: fix PHI same-BB filter; consider i8/i16 binops on SDWA (PR #155800)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Sep 4 04:34:01 PDT 2025
https://github.com/michaelselehov updated https://github.com/llvm/llvm-project/pull/155800
>From 57301a35e14dd1ee7dac102a2c57ef5c0d40966e Mon Sep 17 00:00:00 2001
From: Michael Selehov <michael.selehov at amd.com>
Date: Wed, 27 Aug 2025 05:42:17 -0500
Subject: [PATCH 1/5] LRO fix PHI same-BB filter; treat i8/i16 binops
profitable
Fix a bug in isCoercionProfitable where the same-block filter checked
the def (II) instead of the user (CII), pruning valid paths. Also allow
same-BB non-lookthrough users when the def is a PHI, so loop headers
can be coerced across the backedge.
Extend isOpLegal to treat 8/16-bit vector add/sub/and/or/xor as
profitable on SDWA targets (stores and intrinsics remain profitable).
This repacks loop-carried values to i32 across BBs and restores SDWA
lowering instead of scattered lshr/lshl/or sequences.
---
.../AMDGPU/AMDGPULateCodeGenPrepare.cpp | 39 ++++++++++-
.../AMDGPU/lro-coerce-v4i8-phi-loop.ll | 67 +++++++++++++++++++
2 files changed, 104 insertions(+), 2 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/lro-coerce-v4i8-phi-loop.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
index 38718c43a61dd..e4866405c6ad4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -126,7 +126,37 @@ class LiveRegOptimizer {
return LK.first != TargetLoweringBase::TypeLegal;
}
- bool isOpLegal(Instruction *I) { return isa<StoreInst, IntrinsicInst>(I); }
+ bool isOpLegal(Instruction *I) {
+ if (isa<IntrinsicInst>(I))
+ return true; // FIXME: narrow to known native intrinsics (DOT/MFMA/tbuffer) or use TTI cost.
+
+ // Any store is a profitable sink (prevents flip-flopping)
+ if (isa<StoreInst>(I))
+ return true;
+
+ // Treat small-int vector binops as profitable when SDWA is available
+ if (auto *BO = dyn_cast<BinaryOperator>(I)) {
+ if (auto *VTy = dyn_cast<VectorType>(BO->getType())) {
+ Type *Elt = VTy->getElementType();
+ // Treat small-int vector binops as profitable when SDWA is available.
+ // We explicitly gate to 8/16-bit to avoid i1 vectors and keep behavior tight.
+ if ((Elt->isIntegerTy(8) || Elt->isIntegerTy(16)) && ST.hasSDWA()) {
+ switch (BO->getOpcode()) {
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ return true;
+ default:
+ break;
+ }
+ }
+ }
+ }
+
+ return false;
+ }
bool isCoercionProfitable(Instruction *II) {
SmallPtrSet<Instruction *, 4> CVisited;
@@ -150,7 +180,12 @@ class LiveRegOptimizer {
if (!CVisited.insert(CII).second)
continue;
- if (CII->getParent() == II->getParent() && !IsLookThru(II))
+ // Allow same-BB non-lookthrough users when the def is a PHI:
+ // loop headers frequently consume the carried value in the header block
+ // (e.g. byte-wise vector binops). We *do* want to coerce across the backedge
+ // in that common case to enable packed i32 + SDWA lowering.
+ if (CII->getParent() == II->getParent() && !IsLookThru(CII) &&
+ !isa<PHINode>(II))
continue;
if (isOpLegal(CII))
diff --git a/llvm/test/CodeGen/AMDGPU/lro-coerce-v4i8-phi-loop.ll b/llvm/test/CodeGen/AMDGPU/lro-coerce-v4i8-phi-loop.ll
new file mode 100644
index 0000000000000..a37aaf154520b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lro-coerce-v4i8-phi-loop.ll
@@ -0,0 +1,67 @@
+; REQUIRES: amdgpu-registered-target
+; RUN: opt -S -passes=amdgpu-late-codegenprepare \
+; RUN: -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a %s | FileCheck %s
+
+; Purpose:
+; - Input has a loop-carried PHI of type <4 x i8> and byte-wise adds in the
+; loop header (same basic block as the PHI).
+; - After amdgpu-late-codegenprepare, the PHI must be coerced to i32 across
+; the backedge, and a single dominating "bitcast i32 -> <4 x i8>" must be
+; placed in the header (enabling SDWA-friendly lowering later).
+;
+; What we check:
+; - PHI is i32 (no loop-carried <4 x i8> PHI remains).
+; - A header-local bitcast i32 -> <4 x i8> exists and feeds the vector add.
+; - The loopexit produces a bitcast <4 x i8> -> i32 for the backedge.
+
+target triple = "amdgcn-amd-amdhsa"
+
+define amdgpu_kernel void @lro_coerce_v4i8_phi(i8* nocapture %p, i32 %n) #0 {
+entry:
+ br label %loop
+
+loop:
+ ; Loop index
+ %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
+
+ ; Loop-carried accumulator in vector-of-bytes form (problematic on input).
+ %acc = phi <4 x i8> [ zeroinitializer, %entry ], [ %acc.next, %loop ]
+
+ ; Make up four i8 values derived from %i to avoid memory noise.
+ %i0 = trunc i32 %i to i8
+ %i1i = add i32 %i, 1
+ %i1 = trunc i32 %i1i to i8
+ %i2i = add i32 %i, 2
+ %i2 = trunc i32 %i2i to i8
+ %i3i = add i32 %i, 3
+ %i3 = trunc i32 %i3i to i8
+
+ ; Pack them into <4 x i8>.
+ %v01 = insertelement <4 x i8> undef, i8 %i0, i32 0
+ %v02 = insertelement <4 x i8> %v01, i8 %i1, i32 1
+ %v03 = insertelement <4 x i8> %v02, i8 %i2, i32 2
+ %v = insertelement <4 x i8> %v03, i8 %i3, i32 3
+
+ ; Byte-wise add in the same block as the PHI (this must make coercion profitable).
+ %acc.next = add <4 x i8> %acc, %v
+
+ ; Loop control.
+ %i.next = add i32 %i, 4
+ %cond = icmp slt i32 %i.next, %n
+ br i1 %cond, label %loop, label %exit
+
+exit:
+ ret void
+}
+
+attributes #0 = { "target-cpu"="gfx90a" }
+
+; CHECK-LABEL: define amdgpu_kernel void @lro_coerce_v4i8_phi(
+; CHECK: loop:
+; CHECK: %i = phi i32
+; CHECK-NOT: phi <4 x i8>
+; CHECK: %[[ACCI32:[^ ]+]] = phi i32
+; CHECK-NEXT: %[[HDRCAST:[^ ]+]] = bitcast i32 %[[ACCI32]] to <4 x i8>
+; CHECK: add <4 x i8> %[[HDRCAST]],
+; CHECK: br i1
+
>From 7e1412ff48f919e1d54aa0385df745c54490a258 Mon Sep 17 00:00:00 2001
From: Michael Selehov <michael.selehov at amd.com>
Date: Thu, 28 Aug 2025 05:10:50 -0500
Subject: [PATCH 2/5] Fix clang-format
---
llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp | 10 ++++++----
1 file changed, 6 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
index e4866405c6ad4..910da2be89cbe 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -128,7 +128,8 @@ class LiveRegOptimizer {
bool isOpLegal(Instruction *I) {
if (isa<IntrinsicInst>(I))
- return true; // FIXME: narrow to known native intrinsics (DOT/MFMA/tbuffer) or use TTI cost.
+ return true; // FIXME: narrow to known native intrinsics
+ // (DOT/MFMA/tbuffer) or use TTI cost.
// Any store is a profitable sink (prevents flip-flopping)
if (isa<StoreInst>(I))
@@ -139,7 +140,8 @@ class LiveRegOptimizer {
if (auto *VTy = dyn_cast<VectorType>(BO->getType())) {
Type *Elt = VTy->getElementType();
// Treat small-int vector binops as profitable when SDWA is available.
- // We explicitly gate to 8/16-bit to avoid i1 vectors and keep behavior tight.
+ // We explicitly gate to 8/16-bit to avoid i1 vectors and keep behavior
+ // tight.
if ((Elt->isIntegerTy(8) || Elt->isIntegerTy(16)) && ST.hasSDWA()) {
switch (BO->getOpcode()) {
case Instruction::Add:
@@ -182,8 +184,8 @@ class LiveRegOptimizer {
// Allow same-BB non-lookthrough users when the def is a PHI:
// loop headers frequently consume the carried value in the header block
- // (e.g. byte-wise vector binops). We *do* want to coerce across the backedge
- // in that common case to enable packed i32 + SDWA lowering.
+ // (e.g. byte-wise vector binops). We *do* want to coerce across the
+ // backedge in that common case to enable packed i32 + SDWA lowering.
if (CII->getParent() == II->getParent() && !IsLookThru(CII) &&
!isa<PHINode>(II))
continue;
>From 555dadacbcd91bbafeea2b898b230108a147485b Mon Sep 17 00:00:00 2001
From: Michael Selehov <michael.selehov at amd.com>
Date: Thu, 28 Aug 2025 05:17:28 -0500
Subject: [PATCH 3/5] Fix undef in test
---
llvm/test/CodeGen/AMDGPU/lro-coerce-v4i8-phi-loop.ll | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/lro-coerce-v4i8-phi-loop.ll b/llvm/test/CodeGen/AMDGPU/lro-coerce-v4i8-phi-loop.ll
index a37aaf154520b..f880b8d7d20b3 100644
--- a/llvm/test/CodeGen/AMDGPU/lro-coerce-v4i8-phi-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/lro-coerce-v4i8-phi-loop.ll
@@ -37,10 +37,10 @@ loop:
%i3 = trunc i32 %i3i to i8
; Pack them into <4 x i8>.
- %v01 = insertelement <4 x i8> undef, i8 %i0, i32 0
- %v02 = insertelement <4 x i8> %v01, i8 %i1, i32 1
- %v03 = insertelement <4 x i8> %v02, i8 %i2, i32 2
- %v = insertelement <4 x i8> %v03, i8 %i3, i32 3
+ %v01 = insertelement <4 x i8> zeroinitializer, i8 %i0, i32 0
+ %v02 = insertelement <4 x i8> %v01, i8 %i1, i32 1
+ %v03 = insertelement <4 x i8> %v02, i8 %i2, i32 2
+ %v = insertelement <4 x i8> %v03, i8 %i3, i32 3
; Byte-wise add in the same block as the PHI (this must make coercion profitable).
%acc.next = add <4 x i8> %acc, %v
>From 58630e37506ae70fbcb9c7aeecd7a48751ac5ed7 Mon Sep 17 00:00:00 2001
From: Michael Selehov <michael.selehov at amd.com>
Date: Thu, 28 Aug 2025 05:30:14 -0500
Subject: [PATCH 4/5] Fix reviewer comments in test
---
llvm/test/CodeGen/AMDGPU/lro-coerce-v4i8-phi-loop.ll | 6 +-----
1 file changed, 1 insertion(+), 5 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/lro-coerce-v4i8-phi-loop.ll b/llvm/test/CodeGen/AMDGPU/lro-coerce-v4i8-phi-loop.ll
index f880b8d7d20b3..dd534eb063315 100644
--- a/llvm/test/CodeGen/AMDGPU/lro-coerce-v4i8-phi-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/lro-coerce-v4i8-phi-loop.ll
@@ -14,9 +14,7 @@
; - A header-local bitcast i32 -> <4 x i8> exists and feeds the vector add.
; - The loopexit produces a bitcast <4 x i8> -> i32 for the backedge.
-target triple = "amdgcn-amd-amdhsa"
-
-define amdgpu_kernel void @lro_coerce_v4i8_phi(i8* nocapture %p, i32 %n) #0 {
+define amdgpu_kernel void @lro_coerce_v4i8_phi(ptr nocapture %p, i32 %n) {
entry:
br label %loop
@@ -54,8 +52,6 @@ exit:
ret void
}
-attributes #0 = { "target-cpu"="gfx90a" }
-
; CHECK-LABEL: define amdgpu_kernel void @lro_coerce_v4i8_phi(
; CHECK: loop:
; CHECK: %i = phi i32
>From e49e80484de3d4985e2f2e1859dcf42ab0fcc532 Mon Sep 17 00:00:00 2001
From: Michael Selehov <michael.selehov at amd.com>
Date: Thu, 4 Sep 2025 06:32:53 -0500
Subject: [PATCH 5/5] Fixed duplicate comment
---
llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp | 1 -
1 file changed, 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
index 910da2be89cbe..65ae2060a7dd2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -135,7 +135,6 @@ class LiveRegOptimizer {
if (isa<StoreInst>(I))
return true;
- // Treat small-int vector binops as profitable when SDWA is available
if (auto *BO = dyn_cast<BinaryOperator>(I)) {
if (auto *VTy = dyn_cast<VectorType>(BO->getType())) {
Type *Elt = VTy->getElementType();
More information about the llvm-commits
mailing list