[llvm] [AMDGPU] Improve uniform argument handling in InstCombineIntrinsic (PR #105812)
Jay Foad via llvm-commits
llvm-commits at lists.llvm.org
Fri Aug 23 04:16:28 PDT 2024
https://github.com/jayfoad created https://github.com/llvm/llvm-project/pull/105812
Common up the handling of intrinsics that are no-ops on uniform arguments.
This catches a couple of new cases:
readlane (readlane x, y), z -> readlane x, y
(for any z; z does not have to equal y).
permlane64 (readfirstlane x) -> readfirstlane x
(and likewise for any other uniform argument to permlane64).
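For illustration, here is a minimal IR sketch of the two new folds. The function names are hypothetical; the intrinsic mangling follows the test file below:

declare i32 @llvm.amdgcn.readlane.i32(i32, i32)
declare i32 @llvm.amdgcn.readfirstlane.i32(i32)
declare i32 @llvm.amdgcn.permlane64.i32(i32)

; The inner readlane result is already uniform, so the outer readlane
; folds to it even though %z need not equal %y.
define i32 @fold_readlane_readlane(i32 %x, i32 %y, i32 %z) {
  %r0 = call i32 @llvm.amdgcn.readlane.i32(i32 %x, i32 %y)
  %r1 = call i32 @llvm.amdgcn.readlane.i32(i32 %r0, i32 %z)
  ret i32 %r1 ; now folds to: ret i32 %r0
}

; permlane64 of a uniform value is a no-op, so it folds to its input.
define i32 @fold_permlane64_uniform(i32 %x) {
  %u = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %x)
  %p = call i32 @llvm.amdgcn.permlane64.i32(i32 %u)
  ret i32 %p ; now folds to: ret i32 %u
}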
From afd6438094ff0cf7a34aa8b882a033bc1f285a60 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Fri, 23 Aug 2024 12:04:19 +0100
Subject: [PATCH] [AMDGPU] Improve uniform argument handling in
InstCombineIntrinsic
Common up the handling of intrinsics that are no-ops on uniform arguments.
This catches a couple of new cases:
readlane (readlane x, y), z -> readlane x, y
(for any z; z does not have to equal y).
permlane64 (readfirstlane x) -> readfirstlane x
(and likewise for any other uniform argument to permlane64).
---
.../AMDGPU/AMDGPUInstCombineIntrinsic.cpp | 58 +++++++------------
.../InstCombine/AMDGPU/amdgcn-intrinsics.ll | 19 +++++-
2 files changed, 37 insertions(+), 40 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index 9197404309663a..58d1cfae4a10e9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -440,6 +440,22 @@ static bool canContractSqrtToRsq(const FPMathOperator *SqrtOp) {
SqrtOp->getType()->isHalfTy();
}
+/// Return true if we can easily prove that use U is uniform.
+static bool isTriviallyUniform(const Use &U) {
+ Value *V = U.get();
+ if (isa<Constant>(V))
+ return true;
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ // If I and U are in different blocks then there is a possibility of
+ // temporal divergence.
+ if (I->getParent() != cast<Instruction>(U.getUser())->getParent())
+ return false;
+ if (const auto *II = dyn_cast<IntrinsicInst>(I))
+ return AMDGPU::isIntrinsicAlwaysUniform(II->getIntrinsicID());
+ }
+ return false;
+}
+
std::optional<Instruction *>
GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
Intrinsic::ID IID = II.getIntrinsicID();
@@ -1060,46 +1076,12 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
return IC.replaceOperand(II, 0, UndefValue::get(VDstIn->getType()));
}
case Intrinsic::amdgcn_permlane64:
- // A constant value is trivially uniform.
- if (Constant *C = dyn_cast<Constant>(II.getArgOperand(0))) {
- return IC.replaceInstUsesWith(II, C);
- }
- break;
case Intrinsic::amdgcn_readfirstlane:
case Intrinsic::amdgcn_readlane: {
- // A constant value is trivially uniform.
- if (Constant *C = dyn_cast<Constant>(II.getArgOperand(0))) {
- return IC.replaceInstUsesWith(II, C);
- }
-
- // The rest of these may not be safe if the exec may not be the same between
- // the def and use.
- Value *Src = II.getArgOperand(0);
- Instruction *SrcInst = dyn_cast<Instruction>(Src);
- if (SrcInst && SrcInst->getParent() != II.getParent())
- break;
-
- // readfirstlane (readfirstlane x) -> readfirstlane x
- // readlane (readfirstlane x), y -> readfirstlane x
- if (match(Src,
- PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readfirstlane>())) {
- return IC.replaceInstUsesWith(II, Src);
- }
-
- if (IID == Intrinsic::amdgcn_readfirstlane) {
- // readfirstlane (readlane x, y) -> readlane x, y
- if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>())) {
- return IC.replaceInstUsesWith(II, Src);
- }
- } else {
- // readlane (readlane x, y), y -> readlane x, y
- if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>(
- PatternMatch::m_Value(),
- PatternMatch::m_Specific(II.getArgOperand(1))))) {
- return IC.replaceInstUsesWith(II, Src);
- }
- }
-
+ // If the first argument is uniform these intrinsics return it unchanged.
+ const Use &Src = II.getArgOperandUse(0);
+ if (isTriviallyUniform(Src))
+ return IC.replaceInstUsesWith(II, Src.get());
break;
}
case Intrinsic::amdgcn_trig_preop: {
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
index 9cb79b26448658..f3a3b8c1dc5d8a 100644
--- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
@@ -2888,8 +2888,7 @@ define i32 @readlane_idempotent(i32 %arg, i32 %lane) {
define i32 @readlane_idempotent_different_lanes(i32 %arg, i32 %lane0, i32 %lane1) {
; CHECK-LABEL: @readlane_idempotent_different_lanes(
; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG:%.*]], i32 [[LANE0:%.*]])
-; CHECK-NEXT: [[READ1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[READ0]], i32 [[LANE1:%.*]])
-; CHECK-NEXT: ret i32 [[READ1]]
+; CHECK-NEXT: ret i32 [[READ0]]
;
%read0 = call i32 @llvm.amdgcn.readlane(i32 %arg, i32 %lane0)
%read1 = call i32 @llvm.amdgcn.readlane(i32 %read0, i32 %lane1)
@@ -3061,6 +3060,22 @@ define amdgpu_kernel void @permlanex16_fetch_invalid_bound_ctrl(ptr addrspace(1)
ret void
}
+; --------------------------------------------------------------------
+; llvm.amdgcn.permlane64
+; --------------------------------------------------------------------
+
+define amdgpu_kernel void @permlane64_uniform(ptr addrspace(1) %out, i32 %src0) {
+; CHECK-LABEL: @permlane64_uniform(
+; CHECK-NEXT: [[SRC1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[SRC0:%.*]])
+; CHECK-NEXT: store i32 [[SRC1]], ptr addrspace(1) [[OUT:%.*]], align 4
+; CHECK-NEXT: ret void
+;
+ %src1 = call i32 @llvm.amdgcn.readfirstlane(i32 %src0)
+ %res = call i32 @llvm.amdgcn.permlane64(i32 %src1)
+ store i32 %res, ptr addrspace(1) %out
+ ret void
+}
+
; --------------------------------------------------------------------
; llvm.amdgcn.image.sample a16
; --------------------------------------------------------------------
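A note on the same-block restriction in isTriviallyUniform: if the definition and the use are in different blocks, exec may differ between them (temporal divergence), so the combine conservatively bails. A hedged sketch of IR this patch deliberately leaves alone (hypothetical function name):

declare i32 @llvm.amdgcn.readfirstlane.i32(i32)
declare i32 @llvm.amdgcn.readlane.i32(i32, i32)

define i32 @no_fold_across_blocks(i32 %x, i1 %cc) {
entry:
  %u = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %x)
  br i1 %cc, label %then, label %exit

then:
  ; %u is defined in a different block, so isTriviallyUniform returns
  ; false for this use and the readlane is not folded.
  %r = call i32 @llvm.amdgcn.readlane.i32(i32 %u, i32 0)
  br label %exit

exit:
  %res = phi i32 [ %r, %then ], [ 0, %entry ]
  ret i32 %res
}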