[llvm] [AMDGPU] LRO: allow same-BB non-lookthrough users for PHI (PR #160909)

Mon Sep 29 03:24:46 PDT 2025

https://github.com/michaelselehov updated https://github.com/llvm/llvm-project/pull/160909

>From 1b2cda771bfb3881e74745f464451d14934884db Mon Sep 17 00:00:00 2001
From: Michael Selehov <michael.selehov at amd.com>
Date: Fri, 26 Sep 2025 10:10:54 -0500
Subject: [PATCH 1/2] [AMDGPU] LRO: allow same-BB non-lookthrough users for PHI
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Loop headers frequently consume the loop-carried value in the header
block via non-lookthrough ops (e.g. byte-wise vector binops). LRO's
same-BB filter currently prunes these users, so the loop-carried PHI
is not coerced to i32 and the intended packed form is lost.

Relax the filter: when the def is a PHI, allow same-BB non-lookthrough
users. Also fix the check to look at the user (CII) rather than the
def (II) so the walk does not terminate prematurely.
---
 llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
index 38718c43a61dd..7504f1a8cea09 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -150,7 +150,10 @@ class LiveRegOptimizer {
       if (!CVisited.insert(CII).second)
         continue;
 
-      if (CII->getParent() == II->getParent() && !IsLookThru(II))
+      // The same-BB filter must test the *user* (CII), not the def (II), and
+      // must allow non-lookthrough users when the def is a PHI (loop header).
+      if (CII->getParent() == II->getParent() && !IsLookThru(CII) &&
+          !isa<PHINode>(II))
         continue;
 
       if (isOpLegal(CII))

>From 70d9547ce67da9d19050a3aba68ae30c370e2210 Mon Sep 17 00:00:00 2001
From: Michael Selehov <michael.selehov at amd.com>
Date: Fri, 26 Sep 2025 11:21:27 -0500
Subject: [PATCH 2/2] Added lit test

---
 .../lro-phi-samebb-nonlookthrough-store.ll    | 46 +++++++++++++++++++
 1 file changed, 46 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/lro-phi-samebb-nonlookthrough-store.ll

diff --git a/llvm/test/CodeGen/AMDGPU/lro-phi-samebb-nonlookthrough-store.ll b/llvm/test/CodeGen/AMDGPU/lro-phi-samebb-nonlookthrough-store.ll
new file mode 100644
index 0000000000000..b508f739e7fd3
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lro-phi-samebb-nonlookthrough-store.ll
@@ -0,0 +1,46 @@
+; RUN: opt -S -passes=amdgpu-late-codegenprepare \
+; RUN:   -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a %s | FileCheck %s
+
+; Goal: With a loop-header PHI of an illegal vector type and a same-BB
+; non-lookthrough user (vector add) in the header, LRO should still coerce
+; the PHI to i32 because a profitable sink (store) exists in another BB.
+
+define amdgpu_kernel void @phi_samebb_nonlookthrough_store(
+    ptr addrspace(1) %out, <4 x i8> %v, i1 %exit) {
+; CHECK-LABEL: @phi_samebb_nonlookthrough_store(
+entry:
+  br label %loop
+
+loop:                                             ; preds = %entry, %loop
+  ; Loop-carried PHI in illegal vector type.
+  %acc = phi <4 x i8> [ zeroinitializer, %entry ], [ %acc.next, %loop ]
+
+  ; Same-BB non-lookthrough use in header.
+  %acc.next = add <4 x i8> %acc, %v
+
+  ; Make it a real loop: either iterate or exit to the sink block.
+  br i1 %exit, label %store, label %loop
+
+store:                                            ; preds = %loop
+  ; The across-BB sink: storing the PHI coerced to i32.
+  %acc.bc = bitcast <4 x i8> %acc to i32
+  store i32 %acc.bc, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+; After AMDGPULateCodeGenPrepare we expect:
+;  - PHI is coerced to i32
+;  - A header bitcast materializes for the add
+; This proves the same-BB non-lookthrough user (add) did not get pruned
+; when the def is a PHI.
+
+; CHECK: loop:
+; CHECK:   %[[ACC_TC:[^ ]+]] = phi i32
+; CHECK:   %[[ACC_TC_BC:[^ ]+]] = bitcast i32 %[[ACC_TC]] to <4 x i8>
+; CHECK:   %[[ACC_NEXT:[^ ]+]] = add <4 x i8> %[[ACC_TC_BC]], %v
+; CHECK:   br i1 %exit, label %store, label %loop
+; CHECK: store:
+; CHECK:   %[[ACC_TC_BC2:[^ ]+]] = bitcast i32 %[[ACC_TC]] to <4 x i8>
+; CHECK:   %[[ST_I32:[^ ]+]] = bitcast <4 x i8> %[[ACC_TC_BC2]] to i32
+; CHECK:   store i32 %[[ST_I32]],
+