[llvm] [AMDGPU] LiveRegOptimizer: fix PHI same-BB filter; consider i8/i16 binops on SDWA (PR #155800)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Sep 30 05:38:09 PDT 2025
https://github.com/michaelselehov updated https://github.com/llvm/llvm-project/pull/155800
>From 57301a35e14dd1ee7dac102a2c57ef5c0d40966e Mon Sep 17 00:00:00 2001
From: Michael Selehov <michael.selehov at amd.com>
Date: Wed, 27 Aug 2025 05:42:17 -0500
Subject: [PATCH 01/10] LRO: fix PHI same-BB filter; treat i8/i16 binops as
 profitable
Fix a bug in isCoercionProfitable where the same-block filter checked
the def (II) instead of the user (CII), pruning valid paths. Also allow
same-BB non-lookthrough users when the def is a PHI, so loop headers
can be coerced across the backedge.
Extend isOpLegal to treat 8/16-bit vector add/sub/and/or/xor as
profitable on SDWA targets (stores and intrinsics remain profitable).
This repacks loop-carried values to i32 across BBs and restores SDWA
lowering instead of scattered lshr/shl/or sequences.
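For illustration, the intended reshaping looks roughly like this (value
names are made up; the test below checks the actual output):

Before, a loop-carried vector-of-bytes PHI:

  loop:
    %acc = phi <4 x i8> [ zeroinitializer, %entry ], [ %acc.next, %loop ]
    %acc.next = add <4 x i8> %acc, %v

After coercion, the value crosses the backedge packed as i32, with a
single dominating bitcast in the header:

  loop:
    %acc.i32 = phi i32 [ 0, %entry ], [ %acc.next.i32, %loop ]
    %acc = bitcast i32 %acc.i32 to <4 x i8>
    %acc.next = add <4 x i8> %acc, %v
    %acc.next.i32 = bitcast <4 x i8> %acc.next to i32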
---
.../AMDGPU/AMDGPULateCodeGenPrepare.cpp | 39 ++++++++++-
.../AMDGPU/lro-coerce-v4i8-phi-loop.ll | 67 +++++++++++++++++++
2 files changed, 104 insertions(+), 2 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/lro-coerce-v4i8-phi-loop.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
index 38718c43a61dd..e4866405c6ad4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -126,7 +126,37 @@ class LiveRegOptimizer {
return LK.first != TargetLoweringBase::TypeLegal;
}
- bool isOpLegal(Instruction *I) { return isa<StoreInst, IntrinsicInst>(I); }
+ bool isOpLegal(Instruction *I) {
+ if (auto *Intr = dyn_cast<IntrinsicInst>(I))
+ return true; // FIXME: narrow to known native intrinsics (DOT/MFMA/tbuffer) or use TTI cost.
+
+ // Any store is a profitable sink (prevents flip-flopping)
+ if (isa<StoreInst>(I))
+ return true;
+
+ // Treat small-int vector binops as profitable when SDWA is available
+ if (auto *BO = dyn_cast<BinaryOperator>(I)) {
+ if (auto *VTy = dyn_cast<VectorType>(BO->getType())) {
+ Type *Elt = VTy->getElementType();
+ // Treat small-int vector binops as profitable when SDWA is available.
+ // We explicitly gate to 8/16-bit to avoid i1 vectors and keep behavior tight.
+ if ((Elt->isIntegerTy(8) || (Elt->isIntegerTy(16)) && ST.hasSDWA())) {
+ switch (BO->getOpcode()) {
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ return true;
+ default:
+ break;
+ }
+ }
+ }
+ }
+
+ return false;
+ }
bool isCoercionProfitable(Instruction *II) {
SmallPtrSet<Instruction *, 4> CVisited;
@@ -150,7 +180,12 @@ class LiveRegOptimizer {
if (!CVisited.insert(CII).second)
continue;
- if (CII->getParent() == II->getParent() && !IsLookThru(II))
+ // Allow same-BB non-lookthrough users when the def is a PHI:
+ // loop headers frequently consume the carried value in the header block
+ // (e.g. byte-wise vector binops). We *do* want to coerce across the backedge
+ // in that common case to enable packed i32 + SDWA lowering.
+ if (CII->getParent() == II->getParent() && !IsLookThru(CII) &&
+ !isa<PHINode>(II))
continue;
if (isOpLegal(CII))
diff --git a/llvm/test/CodeGen/AMDGPU/lro-coerce-v4i8-phi-loop.ll b/llvm/test/CodeGen/AMDGPU/lro-coerce-v4i8-phi-loop.ll
new file mode 100644
index 0000000000000..a37aaf154520b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lro-coerce-v4i8-phi-loop.ll
@@ -0,0 +1,67 @@
+; REQUIRES: amdgpu-registered-target
+; RUN: opt -S -passes=amdgpu-late-codegenprepare \
+; RUN: -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a %s | FileCheck %s
+
+; Purpose:
+; - Input has a loop-carried PHI of type <4 x i8> and byte-wise adds in the
+; loop header (same basic block as the PHI).
+; - After amdgpu-late-codegenprepare, the PHI must be coerced to i32 across
+; the backedge, and a single dominating "bitcast i32 -> <4 x i8>" must be
+; placed in the header (enabling SDWA-friendly lowering later).
+;
+; What we check:
+; - PHI is i32 (no loop-carried <4 x i8> PHI remains).
+; - A header-local bitcast i32 -> <4 x i8> exists and feeds the vector add.
+; - The loopexit produces a bitcast <4 x i8> -> i32 for the backedge.
+
+target triple = "amdgcn-amd-amdhsa"
+
+define amdgpu_kernel void @lro_coerce_v4i8_phi(i8* nocapture %p, i32 %n) #0 {
+entry:
+ br label %loop
+
+loop:
+ ; Loop index
+ %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
+
+ ; Loop-carried accumulator in vector-of-bytes form (problematic on input).
+ %acc = phi <4 x i8> [ zeroinitializer, %entry ], [ %acc.next, %loop ]
+
+ ; Make up four i8 values derived from %i to avoid memory noise.
+ %i0 = trunc i32 %i to i8
+ %i1i = add i32 %i, 1
+ %i1 = trunc i32 %i1i to i8
+ %i2i = add i32 %i, 2
+ %i2 = trunc i32 %i2i to i8
+ %i3i = add i32 %i, 3
+ %i3 = trunc i32 %i3i to i8
+
+ ; Pack them into <4 x i8>.
+ %v01 = insertelement <4 x i8> undef, i8 %i0, i32 0
+ %v02 = insertelement <4 x i8> %v01, i8 %i1, i32 1
+ %v03 = insertelement <4 x i8> %v02, i8 %i2, i32 2
+ %v = insertelement <4 x i8> %v03, i8 %i3, i32 3
+
+ ; Byte-wise add in the same block as the PHI (this must make coercion profitable).
+ %acc.next = add <4 x i8> %acc, %v
+
+ ; Loop control.
+ %i.next = add i32 %i, 4
+ %cond = icmp slt i32 %i.next, %n
+ br i1 %cond, label %loop, label %exit
+
+exit:
+ ret void
+}
+
+attributes #0 = { "target-cpu"="gfx90a" }
+
+; CHECK-LABEL: define amdgpu_kernel void @lro_coerce_v4i8_phi(
+; CHECK: loop:
+; CHECK: %i = phi i32
+; CHECK-NOT: phi <4 x i8>
+; CHECK: %[[ACCI32:[^ ]+]] = phi i32
+; CHECK-NEXT: %[[HDRCAST:[^ ]+]] = bitcast i32 %[[ACCI32]] to <4 x i8>
+; CHECK: add <4 x i8> %[[HDRCAST]],
+; CHECK: br i1
+
>From 7e1412ff48f919e1d54aa0385df745c54490a258 Mon Sep 17 00:00:00 2001
From: Michael Selehov <michael.selehov at amd.com>
Date: Thu, 28 Aug 2025 05:10:50 -0500
Subject: [PATCH 02/10] Fix clang-format
---
llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp | 10 ++++++----
1 file changed, 6 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
index e4866405c6ad4..910da2be89cbe 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -128,7 +128,8 @@ class LiveRegOptimizer {
bool isOpLegal(Instruction *I) {
if (auto *Intr = dyn_cast<IntrinsicInst>(I))
- return true; // FIXME: narrow to known native intrinsics (DOT/MFMA/tbuffer) or use TTI cost.
+ return true; // FIXME: narrow to known native intrinsics
+ // (DOT/MFMA/tbuffer) or use TTI cost.
// Any store is a profitable sink (prevents flip-flopping)
if (isa<StoreInst>(I))
@@ -139,7 +140,8 @@ class LiveRegOptimizer {
if (auto *VTy = dyn_cast<VectorType>(BO->getType())) {
Type *Elt = VTy->getElementType();
// Treat small-int vector binops as profitable when SDWA is available.
- // We explicitly gate to 8/16-bit to avoid i1 vectors and keep behavior tight.
+ // We explicitly gate to 8/16-bit to avoid i1 vectors and keep behavior
+ // tight.
if ((Elt->isIntegerTy(8) || (Elt->isIntegerTy(16)) && ST.hasSDWA())) {
switch (BO->getOpcode()) {
case Instruction::Add:
@@ -182,8 +184,8 @@ class LiveRegOptimizer {
// Allow same-BB non-lookthrough users when the def is a PHI:
// loop headers frequently consume the carried value in the header block
- // (e.g. byte-wise vector binops). We *do* want to coerce across the backedge
- // in that common case to enable packed i32 + SDWA lowering.
+ // (e.g. byte-wise vector binops). We *do* want to coerce across the
+ // backedge in that common case to enable packed i32 + SDWA lowering.
if (CII->getParent() == II->getParent() && !IsLookThru(CII) &&
!isa<PHINode>(II))
continue;
>From 555dadacbcd91bbafeea2b898b230108a147485b Mon Sep 17 00:00:00 2001
From: Michael Selehov <michael.selehov at amd.com>
Date: Thu, 28 Aug 2025 05:17:28 -0500
Subject: [PATCH 03/10] Fix undef in test
---
llvm/test/CodeGen/AMDGPU/lro-coerce-v4i8-phi-loop.ll | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/lro-coerce-v4i8-phi-loop.ll b/llvm/test/CodeGen/AMDGPU/lro-coerce-v4i8-phi-loop.ll
index a37aaf154520b..f880b8d7d20b3 100644
--- a/llvm/test/CodeGen/AMDGPU/lro-coerce-v4i8-phi-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/lro-coerce-v4i8-phi-loop.ll
@@ -37,10 +37,10 @@ loop:
%i3 = trunc i32 %i3i to i8
; Pack them into <4 x i8>.
- %v01 = insertelement <4 x i8> undef, i8 %i0, i32 0
- %v02 = insertelement <4 x i8> %v01, i8 %i1, i32 1
- %v03 = insertelement <4 x i8> %v02, i8 %i2, i32 2
- %v = insertelement <4 x i8> %v03, i8 %i3, i32 3
+ %v01 = insertelement <4 x i8> zeroinitializer, i8 %i0, i32 0
+ %v02 = insertelement <4 x i8> %v01, i8 %i1, i32 1
+ %v03 = insertelement <4 x i8> %v02, i8 %i2, i32 2
+ %v = insertelement <4 x i8> %v03, i8 %i3, i32 3
; Byte-wise add in the same block as the PHI (this must make coercion profitable).
%acc.next = add <4 x i8> %acc, %v
>From 58630e37506ae70fbcb9c7aeecd7a48751ac5ed7 Mon Sep 17 00:00:00 2001
From: Michael Selehov <michael.selehov at amd.com>
Date: Thu, 28 Aug 2025 05:30:14 -0500
Subject: [PATCH 04/10] Address reviewer comments in test
---
llvm/test/CodeGen/AMDGPU/lro-coerce-v4i8-phi-loop.ll | 6 +-----
1 file changed, 1 insertion(+), 5 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/lro-coerce-v4i8-phi-loop.ll b/llvm/test/CodeGen/AMDGPU/lro-coerce-v4i8-phi-loop.ll
index f880b8d7d20b3..dd534eb063315 100644
--- a/llvm/test/CodeGen/AMDGPU/lro-coerce-v4i8-phi-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/lro-coerce-v4i8-phi-loop.ll
@@ -14,9 +14,7 @@
; - A header-local bitcast i32 -> <4 x i8> exists and feeds the vector add.
; - The loopexit produces a bitcast <4 x i8> -> i32 for the backedge.
-target triple = "amdgcn-amd-amdhsa"
-
-define amdgpu_kernel void @lro_coerce_v4i8_phi(i8* nocapture %p, i32 %n) #0 {
+define amdgpu_kernel void @lro_coerce_v4i8_phi(ptr nocapture %p, i32 %n) {
entry:
br label %loop
@@ -54,8 +52,6 @@ exit:
ret void
}
-attributes #0 = { "target-cpu"="gfx90a" }
-
; CHECK-LABEL: define amdgpu_kernel void @lro_coerce_v4i8_phi(
; CHECK: loop:
; CHECK: %i = phi i32
>From e49e80484de3d4985e2f2e1859dcf42ab0fcc532 Mon Sep 17 00:00:00 2001
From: Michael Selehov <michael.selehov at amd.com>
Date: Thu, 4 Sep 2025 06:32:53 -0500
Subject: [PATCH 05/10] Fixed duplicate comment
---
llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp | 1 -
1 file changed, 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
index 910da2be89cbe..65ae2060a7dd2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -135,7 +135,6 @@ class LiveRegOptimizer {
if (isa<StoreInst>(I))
return true;
- // Treat small-int vector binops as profitable when SDWA is available
if (auto *BO = dyn_cast<BinaryOperator>(I)) {
if (auto *VTy = dyn_cast<VectorType>(BO->getType())) {
Type *Elt = VTy->getElementType();
>From 34bc9a5a5e99b48cf9c9adebc36f6ff0fb493c23 Mon Sep 17 00:00:00 2001
From: Michael Selehov <michael.selehov at amd.com>
Date: Thu, 4 Sep 2025 08:31:33 -0500
Subject: [PATCH 06/10] Fix -Werror warnings
---
llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
index 65ae2060a7dd2..fb19b5aa5a210 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -127,7 +127,7 @@ class LiveRegOptimizer {
}
bool isOpLegal(Instruction *I) {
- if (auto *Intr = dyn_cast<IntrinsicInst>(I))
+ if (dyn_cast<IntrinsicInst>(I))
return true; // FIXME: narrow to known native intrinsics
// (DOT/MFMA/tbuffer) or use TTI cost.
@@ -141,7 +141,7 @@ class LiveRegOptimizer {
// Treat small-int vector binops as profitable when SDWA is available.
// We explicitly gate to 8/16-bit to avoid i1 vectors and keep behavior
// tight.
- if ((Elt->isIntegerTy(8) || (Elt->isIntegerTy(16)) && ST.hasSDWA())) {
+ if (Elt->isIntegerTy(8) || (Elt->isIntegerTy(16) && ST.hasSDWA())) {
switch (BO->getOpcode()) {
case Instruction::Add:
case Instruction::Sub:
>From 10e5f32863eba6921c0aee987c2f98f5e8a1b943 Mon Sep 17 00:00:00 2001
From: Michael Selehov <michael.selehov at amd.com>
Date: Thu, 4 Sep 2025 08:38:37 -0500
Subject: [PATCH 07/10] Require SDWA for both i8 and i16, and keep vectors
within 32 bits
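For example, under this gate (assuming SDWA is available; the types are
illustrative):

  <4 x i8>  : 4 x 8  = 32 bits -> profitable
  <2 x i16> : 2 x 16 = 32 bits -> profitable
  <8 x i8>  : 8 x 8  = 64 bits -> rejected, does not fit one 32-bit register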
---
llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
index fb19b5aa5a210..1c692f9600f80 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -141,7 +141,10 @@ class LiveRegOptimizer {
// Treat small-int vector binops as profitable when SDWA is available.
// We explicitly gate to 8/16-bit to avoid i1 vectors and keep behavior
// tight.
- if (Elt->isIntegerTy(8) || (Elt->isIntegerTy(16) && ST.hasSDWA())) {
+ // Require SDWA for both i8 and i16, and keep vectors within 32 bits.
+ std::optional<unsigned> Bits = VTy->getPrimitiveSizeInBits();
+ if (ST.hasSDWA() && Bits && Bits->get() <= 32 &&
+ (Elt->isIntegerTy(8) || Elt->isIntegerTy(16))) {
switch (BO->getOpcode()) {
case Instruction::Add:
case Instruction::Sub:
>From 63cd09ef93d1ff48797c5c153e6824e18887b874 Mon Sep 17 00:00:00 2001
From: Michael Selehov <michael.selehov at amd.com>
Date: Thu, 4 Sep 2025 09:09:40 -0500
Subject: [PATCH 08/10] Fix compilation error
---
llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
index 1c692f9600f80..af23e206e170a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -143,7 +143,7 @@ class LiveRegOptimizer {
// tight.
// Require SDWA for both i8 and i16, and keep vectors within 32 bits.
std::optional<unsigned> Bits = VTy->getPrimitiveSizeInBits();
- if (ST.hasSDWA() && Bits && Bits->get() <= 32 &&
+ if (ST.hasSDWA() && Bits && *Bits <= 32 &&
(Elt->isIntegerTy(8) || Elt->isIntegerTy(16))) {
switch (BO->getOpcode()) {
case Instruction::Add:
>From a5f189c17e41a21138c4aed628ec42d1068a951e Mon Sep 17 00:00:00 2001
From: Michael Selehov <michael.selehov at amd.com>
Date: Fri, 26 Sep 2025 09:56:32 -0500
Subject: [PATCH 09/10] Use bit-width * NumElements
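A minimal sketch of the recomputed check, using EB/EC as named in the
diff below (example values are illustrative):

  unsigned EB = IT->getBitWidth();    // element bits, e.g. 8 for <4 x i8>
  unsigned EC = VT->getNumElements(); // element count, e.g. 4
  // EC * EB = 32 <= 32, so the whole binop fits in one 32-bit VGPR.

Computing element count times element width on a FixedVectorType also
drops the std::optional/TypeSize detour from the previous revision and
restricts the check to fixed-width vectors.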
---
.../AMDGPU/AMDGPULateCodeGenPrepare.cpp | 44 ++++++++++---------
1 file changed, 23 insertions(+), 21 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
index af23e206e170a..ed3b41e668b86 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -126,34 +126,36 @@ class LiveRegOptimizer {
return LK.first != TargetLoweringBase::TypeLegal;
}
- bool isOpLegal(Instruction *I) {
+ bool isOpLegal(const Instruction *I) {
if (dyn_cast<IntrinsicInst>(I))
- return true; // FIXME: narrow to known native intrinsics
- // (DOT/MFMA/tbuffer) or use TTI cost.
+ return true;
+/* if (const auto *II = dyn_cast<IntrinsicInst>(I)) {
+ Intrinsic::ID ID = II->getIntrinsicID();
+ if (Intrinsic::isTargetIntrinsic(ID))
+ return true; // FIXME: optionally narrow to specific amdgcn intrinsics
+ }*/
// Any store is a profitable sink (prevents flip-flopping)
if (isa<StoreInst>(I))
return true;
if (auto *BO = dyn_cast<BinaryOperator>(I)) {
- if (auto *VTy = dyn_cast<VectorType>(BO->getType())) {
- Type *Elt = VTy->getElementType();
- // Treat small-int vector binops as profitable when SDWA is available.
- // We explicitly gate to 8/16-bit to avoid i1 vectors and keep behavior
- // tight.
- // Require SDWA for both i8 and i16, and keep vectors within 32 bits.
- std::optional<unsigned> Bits = VTy->getPrimitiveSizeInBits();
- if (ST.hasSDWA() && Bits && *Bits <= 32 &&
- (Elt->isIntegerTy(8) || Elt->isIntegerTy(16))) {
- switch (BO->getOpcode()) {
- case Instruction::Add:
- case Instruction::Sub:
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor:
- return true;
- default:
- break;
+ if (auto *VT = dyn_cast<FixedVectorType>(BO->getType())) {
+ if (const auto *IT = dyn_cast<IntegerType>(VT->getElementType())) {
+ unsigned EB = IT->getBitWidth();
+ unsigned EC = VT->getNumElements();
+ // Check for SDWA-compatible operation
+ if ((EB == 8 || EB == 16) && ST.hasSDWA() && EC * EB <= 32) {
+ switch (BO->getOpcode()) {
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ return true;
+ default:
+ break;
+ }
}
}
}
>From fbda8587e4efd413ebd184be6a68b7ae23f2c52f Mon Sep 17 00:00:00 2001
From: Michael Selehov <michael.selehov at amd.com>
Date: Tue, 30 Sep 2025 07:34:51 -0500
Subject: [PATCH 10/10] Removed phi-node part (merged with PR#160909)
---
llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp | 9 ++-------
1 file changed, 2 insertions(+), 7 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
index 6800578797be4..1c86778935e42 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -127,13 +127,8 @@ class LiveRegOptimizer {
}
bool isOpLegal(const Instruction *I) {
- if (dyn_cast<IntrinsicInst>(I))
- return true;
-/* if (const auto *II = dyn_cast<IntrinsicInst>(I)) {
- Intrinsic::ID ID = II->getIntrinsicID();
- if (Intrinsic::isTargetIntrinsic(ID))
- return true; // FIXME: optionally narrow to specific amdgcn intrinsics
- }*/
+ if (isa<IntrinsicInst>(I))
+ return true;
// Any store is a profitable sink (prevents flip-flopping)
if (isa<StoreInst>(I))