[llvm] [AMDGPU] LiveRegOptimizer: fix PHI same-BB filter; consider i8/i16 binops on SDWA (PR #155800)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Sep 30 05:38:09 PDT 2025
https://github.com/michaelselehov updated https://github.com/llvm/llvm-project/pull/155800
>From 57301a35e14dd1ee7dac102a2c57ef5c0d40966e Mon Sep 17 00:00:00 2001
From: Michael Selehov <michael.selehov at amd.com>
Date: Wed, 27 Aug 2025 05:42:17 -0500
Subject: [PATCH 01/10] LRO: fix PHI same-BB filter; treat i8/i16 binops as
 profitable
Fix a bug in isCoercionProfitable where the same-block filter checked
the def (II) instead of the user (CII), pruning valid paths. Also allow
same-BB non-lookthrough users when the def is a PHI, so loop headers
can be coerced across the backedge.
Extend isOpLegal to treat 8/16-bit vector add/sub/and/or/xor as
profitable on SDWA targets (stores and intrinsics remain profitable).
This repacks loop-carried values to i32 across BBs and restores SDWA
lowering instead of scattered lshr/shl/or sequences.
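For illustration, the intended reshaping looks roughly like this (value
names are made up; the test below checks the actual output):

Before, a loop-carried vector-of-bytes PHI:

  loop:
    %acc = phi <4 x i8> [ zeroinitializer, %entry ], [ %acc.next, %loop ]
    %acc.next = add <4 x i8> %acc, %v

After coercion, the value crosses the backedge packed as i32, with a
single dominating bitcast in the header:

  loop:
    %acc.i32 = phi i32 [ 0, %entry ], [ %acc.next.i32, %loop ]
    %acc = bitcast i32 %acc.i32 to <4 x i8>
    %acc.next = add <4 x i8> %acc, %v
    %acc.next.i32 = bitcast <4 x i8> %acc.next to i32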
---
.../AMDGPU/AMDGPULateCodeGenPrepare.cpp | 39 ++++++++++-
.../AMDGPU/lro-coerce-v4i8-phi-loop.ll | 67 +++++++++++++++++++
2 files changed, 104 insertions(+), 2 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/lro-coerce-v4i8-phi-loop.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
index 38718c43a61dd..e4866405c6ad4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -126,7 +126,37 @@ class LiveRegOptimizer {
return LK.first != TargetLoweringBase::TypeLegal;
}
- bool isOpLegal(Instruction *I) { return isa<StoreInst, IntrinsicInst>(I); }
+ bool isOpLegal(Instruction *I) {
+ if (auto *Intr = dyn_cast<IntrinsicInst>(I))
+ return true; // FIXME: narrow to known native intrinsics (DOT/MFMA/tbuffer) or use TTI cost.
+
+ // Any store is a profitable sink (prevents flip-flopping)
+ if (isa<StoreInst>(I))
+ return true;
+
+ // Treat small-int vector binops as profitable when SDWA is available
+ if (auto *BO = dyn_cast<BinaryOperator>(I)) {
+ if (auto *VTy = dyn_cast<VectorType>(BO->getType())) {
+ Type *Elt = VTy->getElementType();
+ // Treat small-int vector binops as profitable when SDWA is available.
+ // We explicitly gate to 8/16-bit to avoid i1 vectors and keep behavior tight.
+ if ((Elt->isIntegerTy(8) || (Elt->isIntegerTy(16)) && ST.hasSDWA())) {
+ switch (BO->getOpcode()) {
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ return true;
+ default:
+ break;
+ }
+ }
+ }
+ }
+
+ return false;
+ }
bool isCoercionProfitable(Instruction *II) {
SmallPtrSet<Instruction *, 4> CVisited;
@@ -150,7 +180,12 @@ class LiveRegOptimizer {
if (!CVisited.insert(CII).second)
continue;
- if (CII->getParent() == II->getParent() && !IsLookThru(II))
+ // Allow same-BB non-lookthrough users when the def is a PHI:
+ // loop headers frequently consume the carried value in the header block
+ // (e.g. byte-wise vector binops). We *do* want to coerce across the backedge
+ // in that common case to enable packed i32 + SDWA lowering.
+ if (CII->getParent() == II->getParent() && !IsLookThru(CII) &&
+ !isa<PHINode>(II))
continue;
if (isOpLegal(CII))
diff --git a/llvm/test/CodeGen/AMDGPU/lro-coerce-v4i8-phi-loop.ll b/llvm/test/CodeGen/AMDGPU/lro-coerce-v4i8-phi-loop.ll
new file mode 100644
index 0000000000000..a37aaf154520b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lro-coerce-v4i8-phi-loop.ll
@@ -0,0 +1,67 @@
+; REQUIRES: amdgpu-registered-target
+; RUN: opt -S -passes=amdgpu-late-codegenprepare \
+; RUN: -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a %s | FileCheck %s
+
+; Purpose:
+; - Input has a loop-carried PHI of type <4 x i8> and byte-wise adds in the
+; loop header (same basic block as the PHI).
+; - After amdgpu-late-codegenprepare, the PHI must be coerced to i32 across
+; the backedge, and a single dominating "bitcast i32 -> <4 x i8>" must be
+; placed in the header (enabling SDWA-friendly lowering later).
+;
+; What we check:
+; - PHI is i32 (no loop-carried <4 x i8> PHI remains).
+; - A header-local bitcast i32 -> <4 x i8> exists and feeds the vector add.
+; - The loopexit produces a bitcast <4 x i8> -> i32 for the backedge.
+
+target triple = "amdgcn-amd-amdhsa"
+
+define amdgpu_kernel void @lro_coerce_v4i8_phi(i8* nocapture %p, i32 %n) #0 {
+entry:
+ br label %loop
+
+loop:
+ ; Loop index
+ %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
+
+ ; Loop-carried accumulator in vector-of-bytes form (problematic on input).
+ %acc = phi <4 x i8> [ zeroinitializer, %entry ], [ %acc.next, %loop ]
+
+ ; Make up four i8 values derived from %i to avoid memory noise.
+ %i0 = trunc i32 %i to i8
+ %i1i = add i32 %i, 1
+ %i1 = trunc i32 %i1i to i8
+ %i2i = add i32 %i, 2
+ %i2 = trunc i32 %i2i to i8
+ %i3i = add i32 %i, 3
+ %i3 = trunc i32 %i3i to i8
+
+ ; Pack them into <4 x i8>.
+ %v01 = insertelement <4 x i8> undef, i8 %i0, i32 0
+ %v02 = insertelement <4 x i8> %v01, i8 %i1, i32 1
+ %v03 = insertelement <4 x i8> %v02, i8 %i2, i32 2
+ %v = insertelement <4 x i8> %v03, i8 %i3, i32 3
+
+ ; Byte-wise add in the same block as the PHI (this must make coercion profitable).
+ %acc.next = add <4 x i8> %acc, %v
+
+ ; Loop control.
+ %i.next = add i32 %i, 4
+ %cond = icmp slt i32 %i.next, %n
+ br i1 %cond, label %loop, label %exit
+
+exit:
+ ret void
+}
+
+attributes #0 = { "target-cpu"="gfx90a" }
+
+; CHECK-LABEL: define amdgpu_kernel void @lro_coerce_v4i8_phi(
+; CHECK: loop:
+; CHECK: %i = phi i32
+; CHECK-NOT: phi <4 x i8>
+; CHECK: %[[ACCI32:[^ ]+]] = phi i32
+; CHECK-NEXT: %[[HDRCAST:[^ ]+]] = bitcast i32 %[[ACCI32]] to <4 x i8>
+; CHECK: add <4 x i8> %[[HDRCAST]],
+; CHECK: br i1
+
>From 7e1412ff48f919e1d54aa0385df745c54490a258 Mon Sep 17 00:00:00 2001
From: Michael Selehov <michael.selehov at amd.com>
Date: Thu, 28 Aug 2025 05:10:50 -0500
Subject: [PATCH 02/10] Fix clang-format
---
llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp | 10 ++++++----
1 file changed, 6 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
index e4866405c6ad4..910da2be89cbe 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -128,7 +128,8 @@ class LiveRegOptimizer {
bool isOpLegal(Instruction *I) {
if (auto *Intr = dyn_cast<IntrinsicInst>(I))
- return true; // FIXME: narrow to known native intrinsics (DOT/MFMA/tbuffer) or use TTI cost.
+ return true; // FIXME: narrow to known native intrinsics
+ // (DOT/MFMA/tbuffer) or use TTI cost.
// Any store is a profitable sink (prevents flip-flopping)
if (isa<StoreInst>(I))
@@ -139,7 +140,8 @@ class LiveRegOptimizer {
if (auto *VTy = dyn_cast<VectorType>(BO->getType())) {
Type *Elt = VTy->getElementType();
// Treat small-int vector binops as profitable when SDWA is available.
- // We explicitly gate to 8/16-bit to avoid i1 vectors and keep behavior tight.
+ // We explicitly gate to 8/16-bit to avoid i1 vectors and keep behavior
+ // tight.
if ((Elt->isIntegerTy(8) || (Elt->isIntegerTy(16)) && ST.hasSDWA())) {
switch (BO->getOpcode()) {
case Instruction::Add:
@@ -182,8 +184,8 @@ class LiveRegOptimizer {
// Allow same-BB non-lookthrough users when the def is a PHI:
// loop headers frequently consume the carried value in the header block
- // (e.g. byte-wise vector binops). We *do* want to coerce across the backedge
- // in that common case to enable packed i32 + SDWA lowering.
+ // (e.g. byte-wise vector binops). We *do* want to coerce across the
+ // backedge in that common case to enable packed i32 + SDWA lowering.
if (CII->getParent() == II->getParent() && !IsLookThru(CII) &&
!isa<PHINode>(II))
continue;
>From 555dadacbcd91bbafeea2b898b230108a147485b Mon Sep 17 00:00:00 2001
From: Michael Selehov <michael.selehov at amd.com>
Date: Thu, 28 Aug 2025 05:17:28 -0500
Subject: [PATCH 03/10] Fix undef in test
---
llvm/test/CodeGen/AMDGPU/lro-coerce-v4i8-phi-loop.ll | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/lro-coerce-v4i8-phi-loop.ll b/llvm/test/CodeGen/AMDGPU/lro-coerce-v4i8-phi-loop.ll
index a37aaf154520b..f880b8d7d20b3 100644
--- a/llvm/test/CodeGen/AMDGPU/lro-coerce-v4i8-phi-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/lro-coerce-v4i8-phi-loop.ll
@@ -37,10 +37,10 @@ loop:
%i3 = trunc i32 %i3i to i8
; Pack them into <4 x i8>.
- %v01 = insertelement <4 x i8> undef, i8 %i0, i32 0
- %v02 = insertelement <4 x i8> %v01, i8 %i1, i32 1
- %v03 = insertelement <4 x i8> %v02, i8 %i2, i32 2
- %v = insertelement <4 x i8> %v03, i8 %i3, i32 3
+ %v01 = insertelement <4 x i8> zeroinitializer, i8 %i0, i32 0
+ %v02 = insertelement <4 x i8> %v01, i8 %i1, i32 1
+ %v03 = insertelement <4 x i8> %v02, i8 %i2, i32 2
+ %v = insertelement <4 x i8> %v03, i8 %i3, i32 3
; Byte-wise add in the same block as the PHI (this must make coercion profitable).
%acc.next = add <4 x i8> %acc, %v
>From 58630e37506ae70fbcb9c7aeecd7a48751ac5ed7 Mon Sep 17 00:00:00 2001
From: Michael Selehov <michael.selehov at amd.com>
Date: Thu, 28 Aug 2025 05:30:14 -0500
Subject: [PATCH 04/10] Address reviewer comments in test
---
llvm/test/CodeGen/AMDGPU/lro-coerce-v4i8-phi-loop.ll | 6 +-----
1 file changed, 1 insertion(+), 5 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/lro-coerce-v4i8-phi-loop.ll b/llvm/test/CodeGen/AMDGPU/lro-coerce-v4i8-phi-loop.ll
index f880b8d7d20b3..dd534eb063315 100644
--- a/llvm/test/CodeGen/AMDGPU/lro-coerce-v4i8-phi-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/lro-coerce-v4i8-phi-loop.ll
@@ -14,9 +14,7 @@
; - A header-local bitcast i32 -> <4 x i8> exists and feeds the vector add.
; - The loopexit produces a bitcast <4 x i8> -> i32 for the backedge.
-target triple = "amdgcn-amd-amdhsa"
-
-define amdgpu_kernel void @lro_coerce_v4i8_phi(i8* nocapture %p, i32 %n) #0 {
+define amdgpu_kernel void @lro_coerce_v4i8_phi(ptr nocapture %p, i32 %n) {
entry:
br label %loop
@@ -54,8 +52,6 @@ exit:
ret void
}
-attributes #0 = { "target-cpu"="gfx90a" }
-
; CHECK-LABEL: define amdgpu_kernel void @lro_coerce_v4i8_phi(
; CHECK: loop:
; CHECK: %i = phi i32
>From e49e80484de3d4985e2f2e1859dcf42ab0fcc532 Mon Sep 17 00:00:00 2001
From: Michael Selehov <michael.selehov at amd.com>
Date: Thu, 4 Sep 2025 06:32:53 -0500
Subject: [PATCH 05/10] Fixed duplicate comment
---
llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp | 1 -
1 file changed, 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
index 910da2be89cbe..65ae2060a7dd2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -135,7 +135,6 @@ class LiveRegOptimizer {
if (isa<StoreInst>(I))
return true;
- // Treat small-int vector binops as profitable when SDWA is available
if (auto *BO = dyn_cast<BinaryOperator>(I)) {
if (auto *VTy = dyn_cast<VectorType>(BO->getType())) {
Type *Elt = VTy->getElementType();
>From 34bc9a5a5e99b48cf9c9adebc36f6ff0fb493c23 Mon Sep 17 00:00:00 2001
From: Michael Selehov <michael.selehov at amd.com>
Date: Thu, 4 Sep 2025 08:31:33 -0500
Subject: [PATCH 06/10] Fix -Werror warnings
---
llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
index 65ae2060a7dd2..fb19b5aa5a210 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -127,7 +127,7 @@ class LiveRegOptimizer {
}
bool isOpLegal(Instruction *I) {
- if (auto *Intr = dyn_cast<IntrinsicInst>(I))
+ if (dyn_cast<IntrinsicInst>(I))
return true; // FIXME: narrow to known native intrinsics
// (DOT/MFMA/tbuffer) or use TTI cost.
@@ -141,7 +141,7 @@ class LiveRegOptimizer {
// Treat small-int vector binops as profitable when SDWA is available.
// We explicitly gate to 8/16-bit to avoid i1 vectors and keep behavior
// tight.
- if ((Elt->isIntegerTy(8) || (Elt->isIntegerTy(16)) && ST.hasSDWA())) {
+ if (Elt->isIntegerTy(8) || (Elt->isIntegerTy(16) && ST.hasSDWA())) {
switch (BO->getOpcode()) {
case Instruction::Add:
case Instruction::Sub:
>From 10e5f32863eba6921c0aee987c2f98f5e8a1b943 Mon Sep 17 00:00:00 2001
From: Michael Selehov <michael.selehov at amd.com>
Date: Thu, 4 Sep 2025 08:38:37 -0500
Subject: [PATCH 07/10] Require SDWA for both i8 and i16, and keep vectors
within 32 bits
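For example, under this gate (assuming SDWA is available; the types are
illustrative):

  <4 x i8>  : 4 x 8  = 32 bits -> profitable
  <2 x i16> : 2 x 16 = 32 bits -> profitable
  <8 x i8>  : 8 x 8  = 64 bits -> rejected, does not fit one 32-bit register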
---
llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
index fb19b5aa5a210..1c692f9600f80 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -141,7 +141,10 @@ class LiveRegOptimizer {
// Treat small-int vector binops as profitable when SDWA is available.
// We explicitly gate to 8/16-bit to avoid i1 vectors and keep behavior
// tight.
- if (Elt->isIntegerTy(8) || (Elt->isIntegerTy(16) && ST.hasSDWA())) {
+ // Require SDWA for both i8 and i16, and keep vectors within 32 bits.
+ std::optional<unsigned> Bits = VTy->getPrimitiveSizeInBits();
+ if (ST.hasSDWA() && Bits && Bits->get() <= 32 &&
+ (Elt->isIntegerTy(8) || Elt->isIntegerTy(16))) {
switch (BO->getOpcode()) {
case Instruction::Add:
case Instruction::Sub:
>From 63cd09ef93d1ff48797c5c153e6824e18887b874 Mon Sep 17 00:00:00 2001
From: Michael Selehov <michael.selehov at amd.com>
Date: Thu, 4 Sep 2025 09:09:40 -0500
Subject: [PATCH 08/10] Fix compilation error
---
llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
index 1c692f9600f80..af23e206e170a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -143,7 +143,7 @@ class LiveRegOptimizer {
// tight.
// Require SDWA for both i8 and i16, and keep vectors within 32 bits.
std::optional<unsigned> Bits = VTy->getPrimitiveSizeInBits();
- if (ST.hasSDWA() && Bits && Bits->get() <= 32 &&
+ if (ST.hasSDWA() && Bits && *Bits <= 32 &&
(Elt->isIntegerTy(8) || Elt->isIntegerTy(16))) {
switch (BO->getOpcode()) {
case Instruction::Add:
>From a5f189c17e41a21138c4aed628ec42d1068a951e Mon Sep 17 00:00:00 2001
From: Michael Selehov <michael.selehov at amd.com>
Date: Fri, 26 Sep 2025 09:56:32 -0500
Subject: [PATCH 09/10] Use bit-width * NumElements
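A minimal sketch of the recomputed check, using EB/EC as named in the
diff below (example values are illustrative):

  unsigned EB = IT->getBitWidth();    // element bits, e.g. 8 for <4 x i8>
  unsigned EC = VT->getNumElements(); // element count, e.g. 4
  // EC * EB = 32 <= 32, so the whole binop fits in one 32-bit VGPR.

Computing element count times element width on a FixedVectorType also
drops the std::optional/TypeSize detour from the previous revision and
restricts the check to fixed-width vectors.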
---
.../AMDGPU/AMDGPULateCodeGenPrepare.cpp | 44 ++++++++++---------
1 file changed, 23 insertions(+), 21 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
index af23e206e170a..ed3b41e668b86 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -126,34 +126,36 @@ class LiveRegOptimizer {
return LK.first != TargetLoweringBase::TypeLegal;
}
- bool isOpLegal(Instruction *I) {
+ bool isOpLegal(const Instruction *I) {
if (dyn_cast<IntrinsicInst>(I))
- return true; // FIXME: narrow to known native intrinsics
- // (DOT/MFMA/tbuffer) or use TTI cost.
+ return true;
+/* if (const auto *II = dyn_cast<IntrinsicInst>(I)) {
+ Intrinsic::ID ID = II->getIntrinsicID();
+ if (Intrinsic::isTargetIntrinsic(ID))
+ return true; // FIXME: optionally narrow to specific amdgcn intrinsics
+ }*/
// Any store is a profitable sink (prevents flip-flopping)
if (isa<StoreInst>(I))
return true;
if (auto *BO = dyn_cast<BinaryOperator>(I)) {
- if (auto *VTy = dyn_cast<VectorType>(BO->getType())) {
- Type *Elt = VTy->getElementType();
- // Treat small-int vector binops as profitable when SDWA is available.
- // We explicitly gate to 8/16-bit to avoid i1 vectors and keep behavior
- // tight.
- // Require SDWA for both i8 and i16, and keep vectors within 32 bits.
- std::optional<unsigned> Bits = VTy->getPrimitiveSizeInBits();
- if (ST.hasSDWA() && Bits && *Bits <= 32 &&
- (Elt->isIntegerTy(8) || Elt->isIntegerTy(16))) {
- switch (BO->getOpcode()) {
- case Instruction::Add:
- case Instruction::Sub:
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor:
- return true;
- default:
- break;
+ if (auto *VT = dyn_cast<FixedVectorType>(BO->getType())) {
+ if (const auto *IT = dyn_cast<IntegerType>(VT->getElementType())) {
+ unsigned EB = IT->getBitWidth();
+ unsigned EC = VT->getNumElements();
+ // Check for SDWA-compatible operation
+ if ((EB == 8 || EB == 16) && ST.hasSDWA() && EC * EB <= 32) {
+ switch (BO->getOpcode()) {
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ return true;
+ default:
+ break;
+ }
}
}
}
>From fbda8587e4efd413ebd184be6a68b7ae23f2c52f Mon Sep 17 00:00:00 2001
From: Michael Selehov <michael.selehov at amd.com>
Date: Tue, 30 Sep 2025 07:34:51 -0500
Subject: [PATCH 10/10] Removed phi-node part (merged with PR#160909)
---
llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp | 9 ++-------
1 file changed, 2 insertions(+), 7 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
index 6800578797be4..1c86778935e42 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -127,13 +127,8 @@ class LiveRegOptimizer {
}
bool isOpLegal(const Instruction *I) {
- if (dyn_cast<IntrinsicInst>(I))
- return true;
-/* if (const auto *II = dyn_cast<IntrinsicInst>(I)) {
- Intrinsic::ID ID = II->getIntrinsicID();
- if (Intrinsic::isTargetIntrinsic(ID))
- return true; // FIXME: optionally narrow to specific amdgcn intrinsics
- }*/
+ if (isa<IntrinsicInst>(I))
+ return true;
// Any store is a profitable sink (prevents flip-flopping)
if (isa<StoreInst>(I))