[llvm] [AMDGPU] LiveRegOptimizer: fix PHI same-BB filter; consider i8/i16 binops on SDWA (PR #155800)

via llvm-commits llvm-commits at lists.llvm.org
Thu Sep 4 04:34:01 PDT 2025


https://github.com/michaelselehov updated https://github.com/llvm/llvm-project/pull/155800

>From 57301a35e14dd1ee7dac102a2c57ef5c0d40966e Mon Sep 17 00:00:00 2001
From: Michael Selehov <michael.selehov at amd.com>
Date: Wed, 27 Aug 2025 05:42:17 -0500
Subject: [PATCH 1/5] LRO fix PHI same-BB filter; treat i8/i16 binops
 profitable

Fix a bug in isCoercionProfitable where the same-block filter checked
the def (II) instead of the user (CII), pruning valid paths. Also allow
same-BB non-lookthrough users when the def is a PHI, so loop headers
can be coerced across the backedge.

Extend isOpLegal to treat 8/16-bit vector add/sub/and/or/xor as
profitable on SDWA targets (stores and intrinsics remain profitable).
This repacks loop-carried values to i32 across BBs and restores SDWA
lowering instead of scattered lshr/lshl/or sequences.
---
 .../AMDGPU/AMDGPULateCodeGenPrepare.cpp       | 39 ++++++++++-
 .../AMDGPU/lro-coerce-v4i8-phi-loop.ll        | 67 +++++++++++++++++++
 2 files changed, 104 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/lro-coerce-v4i8-phi-loop.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
index 38718c43a61dd..e4866405c6ad4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -126,7 +126,37 @@ class LiveRegOptimizer {
     return LK.first != TargetLoweringBase::TypeLegal;
   }
 
-  bool isOpLegal(Instruction *I) { return isa<StoreInst, IntrinsicInst>(I); }
+  bool isOpLegal(Instruction *I) {
+    if (isa<IntrinsicInst>(I))
+      return true; // FIXME: narrow to known native intrinsics (DOT/MFMA/tbuffer) or use TTI cost.
+
+    // Any store is a profitable sink (prevents flip-flopping)
+    if (isa<StoreInst>(I))
+      return true;
+
+    // Treat small-int vector binops as profitable when SDWA is available
+    if (auto *BO = dyn_cast<BinaryOperator>(I)) {
+      if (auto *VTy = dyn_cast<VectorType>(BO->getType())) {
+        Type *Elt = VTy->getElementType();
+        // Treat small-int vector binops as profitable when SDWA is available.
+        // We explicitly gate to 8/16-bit to avoid i1 vectors and keep behavior tight.
+        if ((Elt->isIntegerTy(8) || Elt->isIntegerTy(16)) && ST.hasSDWA()) {
+          switch (BO->getOpcode()) {
+          case Instruction::Add:
+          case Instruction::Sub:
+          case Instruction::And:
+          case Instruction::Or:
+          case Instruction::Xor:
+            return true;
+          default:
+            break;
+          }
+        }
+      }
+    }
+
+    return false;
+  }
 
   bool isCoercionProfitable(Instruction *II) {
     SmallPtrSet<Instruction *, 4> CVisited;
@@ -150,7 +180,12 @@ class LiveRegOptimizer {
       if (!CVisited.insert(CII).second)
         continue;
 
-      if (CII->getParent() == II->getParent() && !IsLookThru(II))
+      // Allow same-BB non-lookthrough users when the def is a PHI:
+      // loop headers frequently consume the carried value in the header block
+      // (e.g. byte-wise vector binops). We *do* want to coerce across the backedge
+      // in that common case to enable packed i32 + SDWA lowering.
+      if (CII->getParent() == II->getParent() && !IsLookThru(CII) &&
+          !isa<PHINode>(II))
         continue;
 
       if (isOpLegal(CII))
diff --git a/llvm/test/CodeGen/AMDGPU/lro-coerce-v4i8-phi-loop.ll b/llvm/test/CodeGen/AMDGPU/lro-coerce-v4i8-phi-loop.ll
new file mode 100644
index 0000000000000..a37aaf154520b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lro-coerce-v4i8-phi-loop.ll
@@ -0,0 +1,67 @@
+; REQUIRES: amdgpu-registered-target
+; RUN: opt -S -passes=amdgpu-late-codegenprepare \
+; RUN:   -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a %s | FileCheck %s
+
+; Purpose:
+;  - Input has a loop-carried PHI of type <4 x i8> and byte-wise adds in the
+;    loop header (same basic block as the PHI).
+;  - After amdgpu-late-codegenprepare, the PHI must be coerced to i32 across
+;    the backedge, and a single dominating "bitcast i32 -> <4 x i8>" must be
+;    placed in the header (enabling SDWA-friendly lowering later).
+;
+; What we check:
+;  - PHI is i32 (no loop-carried <4 x i8> PHI remains).
+;  - A header-local bitcast i32 -> <4 x i8> exists and feeds the vector add.
+;  - The loop block itself produces a bitcast <4 x i8> -> i32 feeding the backedge.
+
+target triple = "amdgcn-amd-amdhsa"
+
+define amdgpu_kernel void @lro_coerce_v4i8_phi(i8* nocapture %p, i32 %n) #0 {
+entry:
+  br label %loop
+
+loop:
+  ; Loop index
+  %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
+
+  ; Loop-carried accumulator in vector-of-bytes form (problematic on input).
+  %acc = phi <4 x i8> [ zeroinitializer, %entry ], [ %acc.next, %loop ]
+
+  ; Make up four i8 values derived from %i to avoid memory noise.
+  %i0 = trunc i32 %i to i8
+  %i1i = add i32 %i, 1
+  %i1 = trunc i32 %i1i to i8
+  %i2i = add i32 %i, 2
+  %i2 = trunc i32 %i2i to i8
+  %i3i = add i32 %i, 3
+  %i3 = trunc i32 %i3i to i8
+
+  ; Pack them into <4 x i8>.
+  %v01 = insertelement <4 x i8> undef, i8 %i0, i32 0
+  %v02 = insertelement <4 x i8> %v01,  i8 %i1, i32 1
+  %v03 = insertelement <4 x i8> %v02,  i8 %i2, i32 2
+  %v   = insertelement <4 x i8> %v03,  i8 %i3, i32 3
+
+  ; Byte-wise add in the same block as the PHI (this must make coercion profitable).
+  %acc.next = add <4 x i8> %acc, %v
+
+  ; Loop control.
+  %i.next = add i32 %i, 4
+  %cond = icmp slt i32 %i.next, %n
+  br i1 %cond, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+attributes #0 = { "target-cpu"="gfx90a" }
+
+; CHECK-LABEL: define amdgpu_kernel void @lro_coerce_v4i8_phi(
+; CHECK: loop:
+; CHECK: %i = phi i32
+; CHECK-NOT: phi <4 x i8>
+; CHECK: %[[ACCI32:[^ ]+]] = phi i32
+; CHECK-NEXT: %[[HDRCAST:[^ ]+]] = bitcast i32 %[[ACCI32]] to <4 x i8>
+; CHECK: add <4 x i8> %[[HDRCAST]],
+; CHECK: br i1
+

>From 7e1412ff48f919e1d54aa0385df745c54490a258 Mon Sep 17 00:00:00 2001
From: Michael Selehov <michael.selehov at amd.com>
Date: Thu, 28 Aug 2025 05:10:50 -0500
Subject: [PATCH 2/5] Fix clang-format

---
 llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
index e4866405c6ad4..910da2be89cbe 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -128,7 +128,8 @@ class LiveRegOptimizer {
 
   bool isOpLegal(Instruction *I) {
     if (isa<IntrinsicInst>(I))
-      return true; // FIXME: narrow to known native intrinsics (DOT/MFMA/tbuffer) or use TTI cost.
+      return true; // FIXME: narrow to known native intrinsics
+                   // (DOT/MFMA/tbuffer) or use TTI cost.
 
     // Any store is a profitable sink (prevents flip-flopping)
     if (isa<StoreInst>(I))
@@ -139,7 +140,8 @@ class LiveRegOptimizer {
       if (auto *VTy = dyn_cast<VectorType>(BO->getType())) {
         Type *Elt = VTy->getElementType();
         // Treat small-int vector binops as profitable when SDWA is available.
-        // We explicitly gate to 8/16-bit to avoid i1 vectors and keep behavior tight.
+        // We explicitly gate to 8/16-bit to avoid i1 vectors and keep behavior
+        // tight.
         if ((Elt->isIntegerTy(8) || Elt->isIntegerTy(16)) && ST.hasSDWA()) {
           switch (BO->getOpcode()) {
           case Instruction::Add:
@@ -182,8 +184,8 @@ class LiveRegOptimizer {
 
       // Allow same-BB non-lookthrough users when the def is a PHI:
       // loop headers frequently consume the carried value in the header block
-      // (e.g. byte-wise vector binops). We *do* want to coerce across the backedge
-      // in that common case to enable packed i32 + SDWA lowering.
+      // (e.g. byte-wise vector binops). We *do* want to coerce across the
+      // backedge in that common case to enable packed i32 + SDWA lowering.
       if (CII->getParent() == II->getParent() && !IsLookThru(CII) &&
           !isa<PHINode>(II))
         continue;

>From 555dadacbcd91bbafeea2b898b230108a147485b Mon Sep 17 00:00:00 2001
From: Michael Selehov <michael.selehov at amd.com>
Date: Thu, 28 Aug 2025 05:17:28 -0500
Subject: [PATCH 3/5] Fix undef in test

---
 llvm/test/CodeGen/AMDGPU/lro-coerce-v4i8-phi-loop.ll | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/lro-coerce-v4i8-phi-loop.ll b/llvm/test/CodeGen/AMDGPU/lro-coerce-v4i8-phi-loop.ll
index a37aaf154520b..f880b8d7d20b3 100644
--- a/llvm/test/CodeGen/AMDGPU/lro-coerce-v4i8-phi-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/lro-coerce-v4i8-phi-loop.ll
@@ -37,10 +37,10 @@ loop:
   %i3 = trunc i32 %i3i to i8
 
   ; Pack them into <4 x i8>.
-  %v01 = insertelement <4 x i8> undef, i8 %i0, i32 0
-  %v02 = insertelement <4 x i8> %v01,  i8 %i1, i32 1
-  %v03 = insertelement <4 x i8> %v02,  i8 %i2, i32 2
-  %v   = insertelement <4 x i8> %v03,  i8 %i3, i32 3
+  %v01 = insertelement <4 x i8> zeroinitializer, i8 %i0, i32 0
+  %v02 = insertelement <4 x i8> %v01, i8 %i1, i32 1
+  %v03 = insertelement <4 x i8> %v02, i8 %i2, i32 2
+  %v   = insertelement <4 x i8> %v03, i8 %i3, i32 3
 
   ; Byte-wise add in the same block as the PHI (this must make coercion profitable).
   %acc.next = add <4 x i8> %acc, %v

>From 58630e37506ae70fbcb9c7aeecd7a48751ac5ed7 Mon Sep 17 00:00:00 2001
From: Michael Selehov <michael.selehov at amd.com>
Date: Thu, 28 Aug 2025 05:30:14 -0500
Subject: [PATCH 4/5] Fix reviewer comments in test

---
 llvm/test/CodeGen/AMDGPU/lro-coerce-v4i8-phi-loop.ll | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/lro-coerce-v4i8-phi-loop.ll b/llvm/test/CodeGen/AMDGPU/lro-coerce-v4i8-phi-loop.ll
index f880b8d7d20b3..dd534eb063315 100644
--- a/llvm/test/CodeGen/AMDGPU/lro-coerce-v4i8-phi-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/lro-coerce-v4i8-phi-loop.ll
@@ -14,9 +14,7 @@
 ;  - A header-local bitcast i32 -> <4 x i8> exists and feeds the vector add.
 ;  - The loopexit produces a bitcast <4 x i8> -> i32 for the backedge.
 
-target triple = "amdgcn-amd-amdhsa"
-
-define amdgpu_kernel void @lro_coerce_v4i8_phi(i8* nocapture %p, i32 %n) #0 {
+define amdgpu_kernel void @lro_coerce_v4i8_phi(ptr nocapture %p, i32 %n) {
 entry:
   br label %loop
 
@@ -54,8 +52,6 @@ exit:
   ret void
 }
 
-attributes #0 = { "target-cpu"="gfx90a" }
-
 ; CHECK-LABEL: define amdgpu_kernel void @lro_coerce_v4i8_phi(
 ; CHECK: loop:
 ; CHECK: %i = phi i32

>From e49e80484de3d4985e2f2e1859dcf42ab0fcc532 Mon Sep 17 00:00:00 2001
From: Michael Selehov <michael.selehov at amd.com>
Date: Thu, 4 Sep 2025 06:32:53 -0500
Subject: [PATCH 5/5] Fixed duplicate comment

---
 llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
index 910da2be89cbe..65ae2060a7dd2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -135,7 +135,6 @@ class LiveRegOptimizer {
     if (isa<StoreInst>(I))
       return true;
 
-    // Treat small-int vector binops as profitable when SDWA is available
     if (auto *BO = dyn_cast<BinaryOperator>(I)) {
       if (auto *VTy = dyn_cast<VectorType>(BO->getType())) {
         Type *Elt = VTy->getElementType();



More information about the llvm-commits mailing list