[llvm] AMDGPU: Fold bitcasts into readfirstlane, readlane, and permlane64 (PR #128494)

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Tue Feb 25 23:24:40 PST 2025


https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/128494

From 0176ce4702134bcc1407f61393c3ba5da7a4b2c6 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Mon, 24 Feb 2025 17:15:53 +0700
Subject: [PATCH 1/5] AMDGPU: Fold bitcasts into readfirstlane, readlane, and
 permlane64

We should handle this for all of the readlane and DPP ops handled here.
---
 .../AMDGPU/AMDGPUInstCombineIntrinsic.cpp     | 16 ++++++
 .../AMDGPU/bitcast-fold-lane-ops.ll           | 52 +++++++++----------
 .../InstCombine/AMDGPU/permlane64.ll          |  6 +--
 3 files changed, 44 insertions(+), 30 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index ebc00e59584ac..15d79ed884e58 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -1128,9 +1128,25 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
         simplifyDemandedLaneMaskArg(IC, II, 1))
       return &II;
 
+    // readfirstlane.ty0 (bitcast ty1 x to ty0) -> bitcast (readfirstlane.ty1)
+    if (auto *BC = dyn_cast<BitCastInst>(Src); BC && BC->hasOneUse()) {
+      Value *BCSrc = BC->getOperand(0);
+
+      // TODO: Handle this for update_dpp, mov_dpp8, and all permlane variants.
+      if (isTypeLegal(BCSrc->getType())) {
+        SmallVector<Value *, 2> Args(II.args());
+        Args[0] = BCSrc;
+        CallInst *NewCall = IC.Builder.CreateIntrinsic(
+            II.getIntrinsicID(), {BCSrc->getType()}, Args);
+        NewCall->takeName(&II);
+        return new BitCastInst(NewCall, II.getType());
+      }
+    }
+
     return std::nullopt;
   }
   case Intrinsic::amdgcn_writelane: {
+    // TODO: Fold bitcast like readlane.
     if (simplifyDemandedLaneMaskArg(IC, II, 1))
       return &II;
     return std::nullopt;
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/bitcast-fold-lane-ops.ll b/llvm/test/Transforms/InstCombine/AMDGPU/bitcast-fold-lane-ops.ll
index d4dae239b1e7d..490086afb51b2 100644
--- a/llvm/test/Transforms/InstCombine/AMDGPU/bitcast-fold-lane-ops.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/bitcast-fold-lane-ops.ll
@@ -4,8 +4,8 @@
 define i32 @test_bitcast_f32_to_i32_readfirstlane(float %val) {
 ; CHECK-LABEL: define i32 @test_bitcast_f32_to_i32_readfirstlane(
 ; CHECK-SAME: float [[VAL:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT:    [[BITCAST:%.*]] = bitcast float [[VAL]] to i32
-; CHECK-NEXT:    [[RESULT:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[BITCAST]])
+; CHECK-NEXT:    [[RESULT1:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[VAL]])
+; CHECK-NEXT:    [[RESULT:%.*]] = bitcast float [[RESULT1]] to i32
 ; CHECK-NEXT:    ret i32 [[RESULT]]
 ;
   %bitcast = bitcast float %val to i32
@@ -16,9 +16,9 @@ define i32 @test_bitcast_f32_to_i32_readfirstlane(float %val) {
 define i32 @test_bitcast_f32_to_i32_readfirstlane_multi_use_store(float %val, ptr %use.ptr) {
 ; CHECK-LABEL: define i32 @test_bitcast_f32_to_i32_readfirstlane_multi_use_store(
 ; CHECK-SAME: float [[VAL:%.*]], ptr [[USE_PTR:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[BITCAST:%.*]] = bitcast float [[VAL]] to i32
 ; CHECK-NEXT:    store float [[VAL]], ptr [[USE_PTR]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[BITCAST]])
+; CHECK-NEXT:    [[RESULT:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[VAL]])
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float [[RESULT]] to i32
 ; CHECK-NEXT:    ret i32 [[TMP1]]
 ;
   %bitcast = bitcast float %val to i32
@@ -46,9 +46,7 @@ define i32 @test_bitcast_f32_to_i32_readfirstlane_multi_use_call(float %val) {
 define float @test_bitcast_f32_to_i32_readfirstlane_bitcast(float %val) {
 ; CHECK-LABEL: define float @test_bitcast_f32_to_i32_readfirstlane_bitcast(
 ; CHECK-SAME: float [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[BITCAST:%.*]] = bitcast float [[VAL]] to i32
-; CHECK-NEXT:    [[CALL:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[BITCAST]])
-; CHECK-NEXT:    [[RESULT:%.*]] = bitcast i32 [[CALL]] to float
+; CHECK-NEXT:    [[RESULT:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[VAL]])
 ; CHECK-NEXT:    ret float [[RESULT]]
 ;
   %bitcast = bitcast float %val to i32
@@ -60,8 +58,8 @@ define float @test_bitcast_f32_to_i32_readfirstlane_bitcast(float %val) {
 define i32 @test_bitcast_v2f16_to_i32_readfirstlane(<2 x half> %val) {
 ; CHECK-LABEL: define i32 @test_bitcast_v2f16_to_i32_readfirstlane(
 ; CHECK-SAME: <2 x half> [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[BITCAST:%.*]] = bitcast <2 x half> [[VAL]] to i32
-; CHECK-NEXT:    [[RESULT:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[BITCAST]])
+; CHECK-NEXT:    [[RESULT1:%.*]] = call <2 x half> @llvm.amdgcn.readfirstlane.v2f16(<2 x half> [[VAL]])
+; CHECK-NEXT:    [[RESULT:%.*]] = bitcast <2 x half> [[RESULT1]] to i32
 ; CHECK-NEXT:    ret i32 [[RESULT]]
 ;
   %bitcast = bitcast <2 x half> %val to i32
@@ -72,8 +70,8 @@ define i32 @test_bitcast_v2f16_to_i32_readfirstlane(<2 x half> %val) {
 define i32 @test_bitcast_v2bf16_to_i32_readfirstlane(<2 x bfloat> %val) {
 ; CHECK-LABEL: define i32 @test_bitcast_v2bf16_to_i32_readfirstlane(
 ; CHECK-SAME: <2 x bfloat> [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[BITCAST:%.*]] = bitcast <2 x bfloat> [[VAL]] to i32
-; CHECK-NEXT:    [[RESULT:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[BITCAST]])
+; CHECK-NEXT:    [[RESULT1:%.*]] = call <2 x bfloat> @llvm.amdgcn.readfirstlane.v2bf16(<2 x bfloat> [[VAL]])
+; CHECK-NEXT:    [[RESULT:%.*]] = bitcast <2 x bfloat> [[RESULT1]] to i32
 ; CHECK-NEXT:    ret i32 [[RESULT]]
 ;
   %bitcast = bitcast <2 x bfloat> %val to i32
@@ -84,8 +82,8 @@ define i32 @test_bitcast_v2bf16_to_i32_readfirstlane(<2 x bfloat> %val) {
 define i64 @test_bitcast_f64_to_i64_readfirstlane(double %val) {
 ; CHECK-LABEL: define i64 @test_bitcast_f64_to_i64_readfirstlane(
 ; CHECK-SAME: double [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[BITCAST:%.*]] = bitcast double [[VAL]] to i64
-; CHECK-NEXT:    [[RESULT:%.*]] = call i64 @llvm.amdgcn.readfirstlane.i64(i64 [[BITCAST]])
+; CHECK-NEXT:    [[RESULT1:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[VAL]])
+; CHECK-NEXT:    [[RESULT:%.*]] = bitcast double [[RESULT1]] to i64
 ; CHECK-NEXT:    ret i64 [[RESULT]]
 ;
   %bitcast = bitcast double %val to i64
@@ -96,8 +94,8 @@ define i64 @test_bitcast_f64_to_i64_readfirstlane(double %val) {
 define <2 x i32> @test_bitcast_f64_to_v2i32_readfirstlane(double %val) {
 ; CHECK-LABEL: define <2 x i32> @test_bitcast_f64_to_v2i32_readfirstlane(
 ; CHECK-SAME: double [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[BITCAST:%.*]] = bitcast double [[VAL]] to <2 x i32>
-; CHECK-NEXT:    [[RESULT:%.*]] = call <2 x i32> @llvm.amdgcn.readfirstlane.v2i32(<2 x i32> [[BITCAST]])
+; CHECK-NEXT:    [[RESULT1:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[VAL]])
+; CHECK-NEXT:    [[RESULT:%.*]] = bitcast double [[RESULT1]] to <2 x i32>
 ; CHECK-NEXT:    ret <2 x i32> [[RESULT]]
 ;
   %bitcast = bitcast double %val to <2 x i32>
@@ -108,8 +106,8 @@ define <2 x i32> @test_bitcast_f64_to_v2i32_readfirstlane(double %val) {
 define i64 @test_bitcast_v4i16_to_i64_readfirstlane(<4 x i16> %val) {
 ; CHECK-LABEL: define i64 @test_bitcast_v4i16_to_i64_readfirstlane(
 ; CHECK-SAME: <4 x i16> [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[BITCAST:%.*]] = bitcast <4 x i16> [[VAL]] to i64
-; CHECK-NEXT:    [[RESULT:%.*]] = call i64 @llvm.amdgcn.readfirstlane.i64(i64 [[BITCAST]])
+; CHECK-NEXT:    [[RESULT1:%.*]] = call <4 x i16> @llvm.amdgcn.readfirstlane.v4i16(<4 x i16> [[VAL]])
+; CHECK-NEXT:    [[RESULT:%.*]] = bitcast <4 x i16> [[RESULT1]] to i64
 ; CHECK-NEXT:    ret i64 [[RESULT]]
 ;
   %bitcast = bitcast <4 x i16> %val to i64
@@ -145,8 +143,8 @@ define i32 @test_bitcast_v8i4_to_i32_readfirstlane(<8 x i4> %val) {
 define float @test_bitcast_i32_to_f32_readfirstlane(i32 %val) {
 ; CHECK-LABEL: define float @test_bitcast_i32_to_f32_readfirstlane(
 ; CHECK-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[BITCAST:%.*]] = bitcast i32 [[VAL]] to float
-; CHECK-NEXT:    [[RESULT:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[BITCAST]])
+; CHECK-NEXT:    [[RESULT1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[VAL]])
+; CHECK-NEXT:    [[RESULT:%.*]] = bitcast i32 [[RESULT1]] to float
 ; CHECK-NEXT:    ret float [[RESULT]]
 ;
   %bitcast = bitcast i32 %val to float
@@ -157,8 +155,8 @@ define float @test_bitcast_i32_to_f32_readfirstlane(i32 %val) {
 define i16 @test_bitcast_f16_to_i16_readfirstlane(half %val) {
 ; CHECK-LABEL: define i16 @test_bitcast_f16_to_i16_readfirstlane(
 ; CHECK-SAME: half [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[BITCAST:%.*]] = bitcast half [[VAL]] to i16
-; CHECK-NEXT:    [[RESULT:%.*]] = call i16 @llvm.amdgcn.readfirstlane.i16(i16 [[BITCAST]])
+; CHECK-NEXT:    [[RESULT1:%.*]] = call half @llvm.amdgcn.readfirstlane.f16(half [[VAL]])
+; CHECK-NEXT:    [[RESULT:%.*]] = bitcast half [[RESULT1]] to i16
 ; CHECK-NEXT:    ret i16 [[RESULT]]
 ;
   %bitcast = bitcast half %val to i16
@@ -181,8 +179,8 @@ define i16 @test_bitcast_v2i8_to_i16_readfirstlane(<2 x i8> %val) {
 define <16 x i32> @test_bitcast_v16f32_to_v16i32_readfirstlane(<16 x float> %val) {
 ; CHECK-LABEL: define <16 x i32> @test_bitcast_v16f32_to_v16i32_readfirstlane(
 ; CHECK-SAME: <16 x float> [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[BITCAST:%.*]] = bitcast <16 x float> [[VAL]] to <16 x i32>
-; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i32> @llvm.amdgcn.readfirstlane.v16i32(<16 x i32> [[BITCAST]])
+; CHECK-NEXT:    [[RESULT:%.*]] = call <16 x float> @llvm.amdgcn.readfirstlane.v16f32(<16 x float> [[VAL]])
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x float> [[RESULT]] to <16 x i32>
 ; CHECK-NEXT:    ret <16 x i32> [[TMP1]]
 ;
   %bitcast = bitcast <16 x float> %val to <16 x i32>
@@ -193,8 +191,8 @@ define <16 x i32> @test_bitcast_v16f32_to_v16i32_readfirstlane(<16 x float> %val
 define <8 x i64> @test_bitcast_v16f32_to_v8i64_readfirstlane(<16 x float> %val) {
 ; CHECK-LABEL: define <8 x i64> @test_bitcast_v16f32_to_v8i64_readfirstlane(
 ; CHECK-SAME: <16 x float> [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[BITCAST:%.*]] = bitcast <16 x float> [[VAL]] to <8 x i64>
-; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i64> @llvm.amdgcn.readfirstlane.v8i64(<8 x i64> [[BITCAST]])
+; CHECK-NEXT:    [[RESULT:%.*]] = call <16 x float> @llvm.amdgcn.readfirstlane.v16f32(<16 x float> [[VAL]])
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x float> [[RESULT]] to <8 x i64>
 ; CHECK-NEXT:    ret <8 x i64> [[TMP1]]
 ;
   %bitcast = bitcast <16 x float> %val to <8 x i64>
@@ -205,8 +203,8 @@ define <8 x i64> @test_bitcast_v16f32_to_v8i64_readfirstlane(<16 x float> %val)
 define i32 @test_bitcast_f32_to_i32_readlane(float %val, i32 inreg %lane.index) {
 ; CHECK-LABEL: define i32 @test_bitcast_f32_to_i32_readlane(
 ; CHECK-SAME: float [[VAL:%.*]], i32 inreg [[LANE_INDEX:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[BITCAST:%.*]] = bitcast float [[VAL]] to i32
-; CHECK-NEXT:    [[RESULT:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[BITCAST]], i32 [[LANE_INDEX]])
+; CHECK-NEXT:    [[RESULT1:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[VAL]], i32 [[LANE_INDEX]])
+; CHECK-NEXT:    [[RESULT:%.*]] = bitcast float [[RESULT1]] to i32
 ; CHECK-NEXT:    ret i32 [[RESULT]]
 ;
   %bitcast = bitcast float %val to i32
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/permlane64.ll b/llvm/test/Transforms/InstCombine/AMDGPU/permlane64.ll
index 3908f0b778508..5dd209316d6cb 100644
--- a/llvm/test/Transforms/InstCombine/AMDGPU/permlane64.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/permlane64.ll
@@ -12,9 +12,9 @@ define i32 @test_constant() {
 
 define i32 @test_bitcast_f32_to_i32_permlane64(float %val) {
 ; CHECK-LABEL: @test_bitcast_f32_to_i32_permlane64(
-; CHECK-NEXT:    [[BITCAST:%.*]] = bitcast float [[VAL:%.*]] to i32
-; CHECK-NEXT:    [[RESULT:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[BITCAST]])
-; CHECK-NEXT:    ret i32 [[RESULT]]
+; CHECK-NEXT:    [[VAL:%.*]] = call float @llvm.amdgcn.permlane64.f32(float [[VAL1:%.*]])
+; CHECK-NEXT:    [[BITCAST:%.*]] = bitcast float [[VAL]] to i32
+; CHECK-NEXT:    ret i32 [[BITCAST]]
 ;
   %bitcast = bitcast float %val to i32
   %result = call i32 @llvm.amdgcn.permlane64.i32(i32 %bitcast)
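
In short: the combine sinks the lane intrinsic through the bitcast, so the
intrinsic operates on the source type and the bitcast moves after the call. A
minimal before/after sketch, distilled from the first readfirstlane test above
(value names are the ones FileCheck generated):

  ; Before
  %bitcast = bitcast float %val to i32
  %result = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %bitcast)
  ret i32 %result

  ; After
  %result1 = call float @llvm.amdgcn.readfirstlane.f32(float %val)
  %result = bitcast float %result1 to i32
  ret i32 %result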

From 7bd0159075b8509e77dc720d7ce75ebe50c7d6f6 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Tue, 25 Feb 2025 09:09:41 +0700
Subject: [PATCH 2/5] Make sure convergence tokens are preserved

---
 .../Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp   | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index 15d79ed884e58..12f467f21aa0f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -1134,12 +1134,14 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
 
       // TODO: Handle this for update_dpp, mov_dpp8, and all permlane variants.
       if (isTypeLegal(BCSrc->getType())) {
-        SmallVector<Value *, 2> Args(II.args());
-        Args[0] = BCSrc;
-        CallInst *NewCall = IC.Builder.CreateIntrinsic(
-            II.getIntrinsicID(), {BCSrc->getType()}, Args);
-        NewCall->takeName(&II);
-        return new BitCastInst(NewCall, II.getType());
+        Module *M = IC.Builder.GetInsertBlock()->getModule();
+        // Mutate the call in place to ensure operand bundles are preserved.
+        Function *Remangled =
+            Intrinsic::getOrInsertDeclaration(M, IID, {BCSrc->getType()});
+
+        II.setCalledFunction(Remangled);
+        IC.replaceOperand(II, 0, BCSrc);
+        return new BitCastInst(&II, II.getType());
       }
     }
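
Context for this fixup: these lane intrinsics are convergent and may carry a
"convergencectrl" operand bundle, and the initial patch rebuilt the call
without copying bundles. Whichever construction is used, the bundle has to
survive the retyping; at the IR level (mirroring the convergence-token tests
added at the end of this series) the result should be:

  ; Before
  %t = call token @llvm.experimental.convergence.entry()
  %bitcast = bitcast float %val to i32
  %result = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %bitcast) [ "convergencectrl"(token %t) ]

  ; After: the bundle carries over to the retyped call
  %t = call token @llvm.experimental.convergence.entry()
  %result1 = call float @llvm.amdgcn.readfirstlane.f32(float %val) [ "convergencectrl"(token %t) ]
  %result = bitcast float %result1 to i32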
 

From db5757a8b36d981455a54880bdc8a70f6629822d Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Tue, 25 Feb 2025 09:16:00 +0700
Subject: [PATCH 3/5] Revert "Make sure convergence tokens are preserved"

This reverts commit bf1798782624bd60b22d7d866773e0cd5b5b1c75.
---
 .../Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp   | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index 12f467f21aa0f..15d79ed884e58 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -1134,14 +1134,12 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
 
       // TODO: Handle this for update_dpp, mov_dpp8, and all permlane variants.
       if (isTypeLegal(BCSrc->getType())) {
-        Module *M = IC.Builder.GetInsertBlock()->getModule();
-        // Mutate the call in place to ensure operand bundles are preserved.
-        Function *Remangled =
-            Intrinsic::getOrInsertDeclaration(M, IID, {BCSrc->getType()});
-
-        II.setCalledFunction(Remangled);
-        IC.replaceOperand(II, 0, BCSrc);
-        return new BitCastInst(&II, II.getType());
+        SmallVector<Value *, 2> Args(II.args());
+        Args[0] = BCSrc;
+        CallInst *NewCall = IC.Builder.CreateIntrinsic(
+            II.getIntrinsicID(), {BCSrc->getType()}, Args);
+        NewCall->takeName(&II);
+        return new BitCastInst(NewCall, II.getType());
       }
     }
 

From 2b4c5903614ee3f18d80ae084a7ccb36234574f4 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Tue, 25 Feb 2025 09:18:18 +0700
Subject: [PATCH 4/5] Use bundle guard

---
 llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index 15d79ed884e58..9c5795ee78258 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -1134,7 +1134,15 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
 
       // TODO: Handle this for update_dpp, mov_dpp8, and all permlane variants.
       if (isTypeLegal(BCSrc->getType())) {
-        SmallVector<Value *, 2> Args(II.args());
+        // Make sure convergence tokens are preserved.
+        // TODO: CreateIntrinsic should allow directly copying bundles
+        SmallVector<OperandBundleDef, 2> OpBundles;
+        II.getOperandBundlesAsDefs(OpBundles);
+
+        IRBuilderBase::OperandBundlesGuard Guard(IC.Builder);
+        IC.Builder.setDefaultOperandBundles(OpBundles);
+
+        SmallVector<Value *, 3> Args(II.args());
         Args[0] = BCSrc;
         CallInst *NewCall = IC.Builder.CreateIntrinsic(
             II.getIntrinsicID(), {BCSrc->getType()}, Args);

From ad6c4d9af31e3515a2c9730f851fb36729a16ac9 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Tue, 25 Feb 2025 09:22:04 +0700
Subject: [PATCH 5/5] Using CallInst directly actually works

---
 llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp | 11 ++++++-----
 .../InstCombine/AMDGPU/bitcast-fold-lane-ops.ll       |  8 ++++----
 llvm/test/Transforms/InstCombine/AMDGPU/permlane64.ll |  6 +++---
 3 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index 9c5795ee78258..cb918e16f0f3b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -1134,18 +1134,19 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
 
       // TODO: Handle this for update_dpp, mov_dpp8, and all permlane variants.
       if (isTypeLegal(BCSrc->getType())) {
+        Module *M = IC.Builder.GetInsertBlock()->getModule();
+        Function *Remangled =
+            Intrinsic::getOrInsertDeclaration(M, IID, {BCSrc->getType()});
+
         // Make sure convergence tokens are preserved.
         // TODO: CreateIntrinsic should allow directly copying bundles
         SmallVector<OperandBundleDef, 2> OpBundles;
         II.getOperandBundlesAsDefs(OpBundles);
 
-        IRBuilderBase::OperandBundlesGuard Guard(IC.Builder);
-        IC.Builder.setDefaultOperandBundles(OpBundles);
-
         SmallVector<Value *, 3> Args(II.args());
         Args[0] = BCSrc;
-        CallInst *NewCall = IC.Builder.CreateIntrinsic(
-            II.getIntrinsicID(), {BCSrc->getType()}, Args);
+
+        CallInst *NewCall = IC.Builder.CreateCall(Remangled, Args, OpBundles);
         NewCall->takeName(&II);
         return new BitCastInst(NewCall, II.getType());
       }
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/bitcast-fold-lane-ops.ll b/llvm/test/Transforms/InstCombine/AMDGPU/bitcast-fold-lane-ops.ll
index 490086afb51b2..e458fbd712370 100644
--- a/llvm/test/Transforms/InstCombine/AMDGPU/bitcast-fold-lane-ops.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/bitcast-fold-lane-ops.ll
@@ -288,8 +288,8 @@ define i32 @test_bitcast_f32_to_i32_readfirstlane_convergencetoken(float %val) c
 ; CHECK-LABEL: define i32 @test_bitcast_f32_to_i32_readfirstlane_convergencetoken(
 ; CHECK-SAME: float [[VAL:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[T:%.*]] = call token @llvm.experimental.convergence.entry()
-; CHECK-NEXT:    [[BITCAST:%.*]] = bitcast float [[VAL]] to i32
-; CHECK-NEXT:    [[RESULT:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[BITCAST]]) [ "convergencectrl"(token [[T]]) ]
+; CHECK-NEXT:    [[RESULT1:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[VAL]]) [ "convergencectrl"(token [[T]]) ]
+; CHECK-NEXT:    [[RESULT:%.*]] = bitcast float [[RESULT1]] to i32
 ; CHECK-NEXT:    ret i32 [[RESULT]]
 ;
   %t = call token @llvm.experimental.convergence.entry()
@@ -302,8 +302,8 @@ define i32 @test_bitcast_f32_to_i32_readlane_convergencetoken(float %val, i32 in
 ; CHECK-LABEL: define i32 @test_bitcast_f32_to_i32_readlane_convergencetoken(
 ; CHECK-SAME: float [[VAL:%.*]], i32 inreg [[LANE_INDEX:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[T:%.*]] = call token @llvm.experimental.convergence.entry()
-; CHECK-NEXT:    [[BITCAST:%.*]] = bitcast float [[VAL]] to i32
-; CHECK-NEXT:    [[RESULT:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[BITCAST]], i32 [[LANE_INDEX]]) [ "convergencectrl"(token [[T]]) ]
+; CHECK-NEXT:    [[RESULT1:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[VAL]], i32 [[LANE_INDEX]]) [ "convergencectrl"(token [[T]]) ]
+; CHECK-NEXT:    [[RESULT:%.*]] = bitcast float [[RESULT1]] to i32
 ; CHECK-NEXT:    ret i32 [[RESULT]]
 ;
   %t = call token @llvm.experimental.convergence.entry()
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/permlane64.ll b/llvm/test/Transforms/InstCombine/AMDGPU/permlane64.ll
index 5dd209316d6cb..c480ecf6a8b31 100644
--- a/llvm/test/Transforms/InstCombine/AMDGPU/permlane64.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/permlane64.ll
@@ -24,9 +24,9 @@ define i32 @test_bitcast_f32_to_i32_permlane64(float %val) {
 define i32 @test_bitcast_f32_to_i32_permlane64_convergencetoken(float %val) convergent {
 ; CHECK-LABEL: @test_bitcast_f32_to_i32_permlane64_convergencetoken(
 ; CHECK-NEXT:    [[T:%.*]] = call token @llvm.experimental.convergence.entry()
-; CHECK-NEXT:    [[BITCAST:%.*]] = bitcast float [[VAL:%.*]] to i32
-; CHECK-NEXT:    [[RESULT:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[BITCAST]]) [ "convergencectrl"(token [[T]]) ]
-; CHECK-NEXT:    ret i32 [[RESULT]]
+; CHECK-NEXT:    [[VAL:%.*]] = call float @llvm.amdgcn.permlane64.f32(float [[VAL1:%.*]]) [ "convergencectrl"(token [[T]]) ]
+; CHECK-NEXT:    [[BITCAST:%.*]] = bitcast float [[VAL]] to i32
+; CHECK-NEXT:    ret i32 [[BITCAST]]
 ;
   %t = call token @llvm.experimental.convergence.entry()
   %bitcast = bitcast float %val to i32
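
For readers skimming the back-and-forth of patches 2 through 5, this is the
final shape of the combine, assembled from the hunks above (Src is the
intrinsic's first operand and IID its Intrinsic::ID, both defined in
surrounding code outside the quoted context):

  // readfirstlane.ty0 (bitcast ty1 x to ty0) -> bitcast (readfirstlane.ty1)
  if (auto *BC = dyn_cast<BitCastInst>(Src); BC && BC->hasOneUse()) {
    Value *BCSrc = BC->getOperand(0);

    // TODO: Handle this for update_dpp, mov_dpp8, and all permlane variants.
    if (isTypeLegal(BCSrc->getType())) {
      Module *M = IC.Builder.GetInsertBlock()->getModule();
      Function *Remangled =
          Intrinsic::getOrInsertDeclaration(M, IID, {BCSrc->getType()});

      // Make sure convergence tokens are preserved.
      // TODO: CreateIntrinsic should allow directly copying bundles
      SmallVector<OperandBundleDef, 2> OpBundles;
      II.getOperandBundlesAsDefs(OpBundles);

      SmallVector<Value *, 3> Args(II.args());
      Args[0] = BCSrc;

      CallInst *NewCall = IC.Builder.CreateCall(Remangled, Args, OpBundles);
      NewCall->takeName(&II);
      return new BitCastInst(NewCall, II.getType());
    }
  }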


