[llvm] [AMDGPU] InstCombine llvm.amdgcn.ds.bpermute with uniform arguments (PR #130133)
Jay Foad via llvm-commits
llvm-commits at lists.llvm.org
Thu Mar 6 08:29:22 PST 2025
https://github.com/jayfoad created https://github.com/llvm/llvm-project/pull/130133
Reland #129895 with a fix to avoid trying to combine bpermute of
bitcast.
From 31ceb2a59f494e705f376d728559091a2a0cd16d Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Thu, 6 Mar 2025 14:31:59 +0000
Subject: [PATCH] [AMDGPU] InstCombine llvm.amdgcn.ds.bpermute with uniform
arguments
Reland #129895 with a fix to avoid trying to combine bpermute of
bitcast.
---
.../AMDGPU/AMDGPUInstCombineIntrinsic.cpp | 27 +++++++++++--
.../InstCombine/AMDGPU/amdgcn-intrinsics.ll | 39 +++++++++++++++++++
.../AMDGPU/bitcast-fold-lane-ops.ll | 12 ++++++
3 files changed, 74 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index ebe740f884ea6..0e500cca8beb1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -1118,9 +1118,11 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
}
case Intrinsic::amdgcn_permlane64:
case Intrinsic::amdgcn_readfirstlane:
- case Intrinsic::amdgcn_readlane: {
- // If the first argument is uniform these intrinsics return it unchanged.
- const Use &Src = II.getArgOperandUse(0);
+ case Intrinsic::amdgcn_readlane:
+ case Intrinsic::amdgcn_ds_bpermute: {
+ // If the data argument is uniform these intrinsics return it unchanged.
+ unsigned SrcIdx = IID == Intrinsic::amdgcn_ds_bpermute ? 1 : 0;
+ const Use &Src = II.getArgOperandUse(SrcIdx);
if (isTriviallyUniform(Src))
return IC.replaceInstUsesWith(II, Src.get());
@@ -1129,7 +1131,8 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
return &II;
// readfirstlane.ty0 (bitcast ty1 x to ty0) -> bitcast (readfirstlane.ty1)
- if (auto *BC = dyn_cast<BitCastInst>(Src); BC && BC->hasOneUse()) {
+ if (auto *BC = dyn_cast<BitCastInst>(Src); BC && BC->hasOneUse() &&
+ IID != Intrinsic::amdgcn_ds_bpermute) {
Value *BCSrc = BC->getOperand(0);
// TODO: Handle this for update_dpp, mov_ddp8, and all permlane variants.
@@ -1152,6 +1155,22 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
}
}
+ // If the lane argument of bpermute is uniform, change it to readlane. This
+ // generates better code and can enable further optimizations because
+ // readlane is AlwaysUniform.
+ if (IID == Intrinsic::amdgcn_ds_bpermute) {
+ const Use &Lane = II.getArgOperandUse(0);
+ if (isTriviallyUniform(Lane)) {
+ Value *NewLane = IC.Builder.CreateLShr(Lane, 2);
+ Function *NewDecl = Intrinsic::getOrInsertDeclaration(
+ II.getModule(), Intrinsic::amdgcn_readlane, II.getType());
+ II.setCalledFunction(NewDecl);
+ II.setOperand(0, Src);
+ II.setOperand(1, NewLane);
+ return &II;
+ }
+ }
+
return std::nullopt;
}
case Intrinsic::amdgcn_writelane: {
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
index 3c190efca7acf..843b436aa1b0f 100644
--- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
@@ -6583,3 +6583,42 @@ define i32 @prng_poison_i32() {
%prng = call i32 @llvm.amdgcn.prng.b32(i32 poison)
ret i32 %prng
}
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.ds.bpermute
+; --------------------------------------------------------------------
+
+define amdgpu_kernel void @ds_bpermute_uniform_src(ptr addrspace(1) %out, i32 %lane) {
+; CHECK-LABEL: @ds_bpermute_uniform_src(
+; CHECK-NEXT: store i32 7, ptr addrspace(1) [[OUT:%.*]], align 4
+; CHECK-NEXT: ret void
+;
+ %v = call i32 @llvm.amdgcn.ds.bpermute(i32 %lane, i32 7)
+ store i32 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @ds_bpermute_constant_lane(ptr addrspace(1) %out, i32 %src) {
+; CHECK-LABEL: @ds_bpermute_constant_lane(
+; CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[SRC:%.*]], i32 7)
+; CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT:%.*]], align 4
+; CHECK-NEXT: ret void
+;
+ %v = call i32 @llvm.amdgcn.ds.bpermute(i32 28, i32 %src)
+ store i32 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @ds_bpermute_uniform_lane(ptr addrspace(1) %out, i32 %lanearg, i32 %src) {
+; CHECK-LABEL: @ds_bpermute_uniform_lane(
+; CHECK-NEXT: [[LANE:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[LANEARG:%.*]])
+; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[LANE]], 2
+; CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[SRC:%.*]], i32 [[TMP1]])
+; CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT:%.*]], align 4
+; CHECK-NEXT: ret void
+;
+ %lane = call i32 @llvm.amdgcn.readfirstlane(i32 %lanearg)
+ %v = call i32 @llvm.amdgcn.ds.bpermute(i32 %lane, i32 %src)
+ store i32 %v, ptr addrspace(1) %out
+ ret void
+}
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/bitcast-fold-lane-ops.ll b/llvm/test/Transforms/InstCombine/AMDGPU/bitcast-fold-lane-ops.ll
index e458fbd712370..02f50228339b1 100644
--- a/llvm/test/Transforms/InstCombine/AMDGPU/bitcast-fold-lane-ops.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/bitcast-fold-lane-ops.ll
@@ -311,3 +311,15 @@ define i32 @test_bitcast_f32_to_i32_readlane_convergencetoken(float %val, i32 in
%result = call i32 @llvm.amdgcn.readlane.i32(i32 %bitcast, i32 %lane.index) [ "convergencectrl"(token %t) ]
ret i32 %result
}
+
+define i32 @test_bitcast_f32_to_i32_ds_bpermute(float %val, i32 %addr) {
+; CHECK-LABEL: define i32 @test_bitcast_f32_to_i32_ds_bpermute(
+; CHECK-SAME: float [[VAL:%.*]], i32 [[ADDR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[BITCAST:%.*]] = bitcast float [[VAL]] to i32
+; CHECK-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.ds.bpermute(i32 [[ADDR]], i32 [[BITCAST]])
+; CHECK-NEXT: ret i32 [[RESULT]]
+;
+ %bitcast = bitcast float %val to i32
+ %result = call i32 @llvm.amdgcn.ds.bpermute(i32 %addr, i32 %bitcast)
+ ret i32 %result
+}
More information about the llvm-commits
mailing list