[llvm] [AMDGPU] Add new llvm.amdgcn.wave.shuffle intrinsic (PR #167372)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Nov 20 07:00:09 PST 2025
https://github.com/saxlungs updated https://github.com/llvm/llvm-project/pull/167372
>From 83f5dd6766a1861ff32042a61b9080037f175f6e Mon Sep 17 00:00:00 2001
From: Domenic Nutile <domenic.nutile at gmail.com>
Date: Mon, 10 Nov 2025 14:11:23 -0500
Subject: [PATCH 1/4] [AMDGPU] Add new llvm.amdgcn.subgroup.shuffle intrinsic
This intrinsic will be useful for implementing the SPIR-V OpGroupNonUniformShuffle operation.
Signed-off-by: Domenic Nutile <domenic.nutile at gmail.com>
---
llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 9 ++
llvm/lib/Target/AMDGPU/GCNSubtarget.h | 4 +
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 89 +++++++++++++++++++
.../AMDGPU/llvm.amdgcn.subgroup.shuffle.ll | 76 ++++++++++++++++
4 files changed, 178 insertions(+)
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.subgroup.shuffle.ll
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 8e35109061792..a41723e1e9db8 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2718,6 +2718,15 @@ def int_amdgcn_call_whole_wave:
llvm_vararg_ty], // The arguments to the callee.
[IntrConvergent]>;
+// <result>
+// llvm.amdgcn.subgroup.shuffle <value> <id>
+// value and result can be any scalar floating-point, integer,
+// or Boolean type, but must be the same type
+def int_amdgcn_subgroup_shuffle :
+ Intrinsic<[llvm_any_ty], // return types
+ [LLVMMatchType<0>, llvm_i32_ty], // arg types
+ [IntrConvergent, IntrNoMem, IntrNoFree, IntrWillReturn, IntrNoCallback]>; // flags
+
//===----------------------------------------------------------------------===//
// CI+ Intrinsics
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index cb27f474d78f3..8e225dc4735ad 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1894,6 +1894,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool requiresWaitsBeforeSystemScopeStores() const {
return RequiresWaitsBeforeSystemScopeStores;
}
+
+ bool supportsWaveWideBPermute() const {
+ return ((getGeneration() == AMDGPUSubtarget::GFX12) || isWave32());
+ }
};
class GCNUserSGPRUsageInfo {
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index e37d739fc25df..17c4295ca6c22 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -7280,6 +7280,93 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
return DAG.getBitcast(VT, UnrolledLaneOp);
}
+// Right now, only subgroup.shuffle is implemented, but other
+// future subgroup ops can use this function too
+static SDValue lowerSubgroupOp(const SITargetLowering &TLI, SDNode *N,
+ SelectionDAG &DAG) {
+ EVT VT = N->getValueType(0);
+ unsigned ValSize = VT.getSizeInBits();
+ unsigned IID = N->getConstantOperandVal(0);
+ SDLoc SL(N);
+
+ SDValue Value = N->getOperand(1);
+ SDValue Index = N->getOperand(2);
+
+ // ds_bpermute requires index to be multiplied by 4
+ SDValue ShiftAmount = DAG.getTargetConstant(2, SL, MVT::i32);
+ SDValue ShiftedIndex = DAG.getNode(ISD::SHL, SL, Index.getValueType(), Index,
+ ShiftAmount);
+
+  // The intrinsics used below require i32 operands
+ SDValue Value32 = Value;
+ if ((ValSize != 32) || (VT.isFloatingPoint()))
+ Value32 = DAG.getBitcast(MVT::i32, Value);
+
+ auto MakeIntrinsic = [&DAG, &SL](unsigned IID, MVT RetVT,
+ SmallVector<SDValue> IntrinArgs) -> SDValue {
+ SmallVector<SDValue> Operands(1);
+ Operands[0] = DAG.getTargetConstant(IID, SL, MVT::i32);
+ Operands.append(IntrinArgs);
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, RetVT, Operands);
+ };
+
+ switch (IID) {
+ case Intrinsic::amdgcn_subgroup_shuffle:
+ if (TLI.getSubtarget()->supportsWaveWideBPermute()) {
+ // If we can bpermute across the whole wave, then just do that
+ SDValue BPermute = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute,
+ MVT::i32, {ShiftedIndex, Value32});
+ return DAG.getBitcast(VT, BPermute);
+ } else {
+ assert(TLI.getSubtarget()->isWave64());
+
+ // Otherwise, we need to make use of whole wave mode
+ SDValue PoisonVal = DAG.getPOISON(Value32->getValueType(0));
+ SDValue PoisonIndex = DAG.getPOISON(ShiftedIndex->getValueType(0));
+
+ // Set inactive lanes to poison
+ SDValue WWMValue = MakeIntrinsic(Intrinsic::amdgcn_set_inactive,
+ MVT::i32, {Value32, PoisonVal});
+ SDValue WWMIndex = MakeIntrinsic(Intrinsic::amdgcn_set_inactive,
+ MVT::i32, {ShiftedIndex, PoisonIndex});
+
+ SDValue Swapped = MakeIntrinsic(Intrinsic::amdgcn_permlane64,
+ MVT::i32, {WWMValue});
+
+ // Get permutation of each half, then we'll select which one to use
+ SDValue BPermSameHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute,
+ MVT::i32, {WWMIndex, WWMValue});
+ SDValue BPermOtherHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute,
+ MVT::i32, {WWMIndex, Swapped});
+ SDValue BPermOtherHalfWWM = MakeIntrinsic(Intrinsic::amdgcn_wwm,
+ MVT::i32, {BPermOtherHalf});
+
+ // Select which side to take the permute from
+ SDValue ThreadIDMask = DAG.getTargetConstant(UINT32_MAX, SL, MVT::i32);
+ SDValue ThreadIDLo = MakeIntrinsic(Intrinsic::amdgcn_mbcnt_lo, MVT::i32,
+ {ThreadIDMask,
+ DAG.getTargetConstant(0, SL,
+ MVT::i32)});
+ SDValue ThreadID = MakeIntrinsic(Intrinsic::amdgcn_mbcnt_hi, MVT::i32,
+ {ThreadIDMask, ThreadIDLo});
+
+ SDValue SameOrOtherHalf = DAG.getNode(ISD::AND, SL, MVT::i32,
+ DAG.getNode(ISD::XOR, SL, MVT::i32,
+ ThreadID, Index),
+ DAG.getTargetConstant(32, SL,
+ MVT::i32));
+ SDValue UseSameHalf = DAG.getSetCC(SL, MVT::i1, SameOrOtherHalf,
+ DAG.getConstant(0, SL, MVT::i32),
+ ISD::SETEQ);
+ SDValue Result = DAG.getSelect(SL, MVT::i32, UseSameHalf,
+ BPermSameHalf, BPermOtherHalfWWM);
+ return DAG.getBitcast(VT, Result);
+ }
+ default:
+ return SDValue();
+ }
+}
+
void SITargetLowering::ReplaceNodeResults(SDNode *N,
SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const {
@@ -10187,6 +10274,8 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
Poisons.push_back(DAG.getPOISON(ValTy));
return DAG.getMergeValues(Poisons, SDLoc(Op));
}
+ case Intrinsic::amdgcn_subgroup_shuffle:
+ return lowerSubgroupOp(*this, Op.getNode(), DAG);
default:
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
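
As an aside for reviewers, here is a rough per-lane scalar model of what the wave64 fallback above computes. This is an illustrative sketch only: bpermute32 and waveShuffleModel are hypothetical helper names, and the model abstracts away the byte-addressed index (the shift by 2) and the exec-mask/WWM handling that the real lowering performs.

    #include <cstdint>

    // Hypothetical scalar model of the wave64 fallback above. ds_bpermute_b32
    // can only read within a lane's own 32-lane half, so the lowering computes
    // a same-half permute and an other-half permute (after v_permlane64_b32
    // swaps the halves) and selects between them per lane.
    static uint32_t bpermute32(const uint32_t vals[64], unsigned tid,
                               unsigned idx) {
      // Reads within the half containing tid; low 5 index bits pick the lane.
      return vals[(tid & 32) | (idx & 31)];
    }

    uint32_t waveShuffleModel(const uint32_t vals[64], unsigned tid,
                              unsigned idx) {
      uint32_t swapped[64];
      for (unsigned l = 0; l < 64; ++l)
        swapped[l] = vals[l ^ 32]; // v_permlane64_b32: exchange the two halves
      uint32_t sameHalf = bpermute32(vals, tid, idx);
      uint32_t otherHalf = bpermute32(swapped, tid, idx);
      // ((tid ^ idx) & 32) == 0 iff the requested lane is in the caller's half.
      return ((tid ^ idx) & 32) == 0 ? sameHalf : otherHalf;
    }

In both branches the result works out to vals[idx & 63], which is the wrap-around shuffle the intrinsic describes.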
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.subgroup.shuffle.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.subgroup.shuffle.ll
new file mode 100644
index 0000000000000..e31894c6cfa18
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.subgroup.shuffle.ll
@@ -0,0 +1,76 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12 %s
+
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11-64 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12-64 %s
+
+declare float @llvm.amdgcn.subgroup.shuffle.float(float, i32)
+
+define float @test_subgroup_shuffle_scalar(float %val, i32 %idx) {
+; GFX11-LABEL: test_subgroup_shuffle_scalar:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v1
+; GFX11-NEXT: ds_bpermute_b32 v0, v1, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_subgroup_shuffle_scalar:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1
+; GFX12-NEXT: ds_bpermute_b32 v0, v1, v0
+; GFX12-NEXT: s_wait_dscnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-64-LABEL: test_subgroup_shuffle_scalar:
+; GFX11-64: ; %bb.0: ; %entry
+; GFX11-64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-64-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX11-64-NEXT: scratch_store_b32 off, v2, s32 ; 4-byte Folded Spill
+; GFX11-64-NEXT: s_mov_b64 exec, s[0:1]
+; GFX11-64-NEXT: v_lshlrev_b32_e32 v3, 2, v1
+; GFX11-64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
+; GFX11-64-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $exec
+; GFX11-64-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX11-64-NEXT: v_permlane64_b32 v2, v0
+; GFX11-64-NEXT: ds_bpermute_b32 v2, v3, v2
+; GFX11-64-NEXT: s_mov_b64 exec, s[0:1]
+; GFX11-64-NEXT: v_mbcnt_lo_u32_b32 v4, -1, 0
+; GFX11-64-NEXT: ds_bpermute_b32 v0, v3, v0
+; GFX11-64-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v4
+; GFX11-64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-64-NEXT: v_xor_b32_e32 v1, v3, v1
+; GFX11-64-NEXT: s_waitcnt lgkmcnt(1)
+; GFX11-64-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-64-NEXT: v_and_b32_e32 v1, 32, v1
+; GFX11-64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX11-64-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-64-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX11-64-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX11-64-NEXT: scratch_load_b32 v2, off, s32 ; 4-byte Folded Reload
+; GFX11-64-NEXT: s_mov_b64 exec, s[0:1]
+; GFX11-64-NEXT: s_waitcnt vmcnt(0)
+; GFX11-64-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-64-LABEL: test_subgroup_shuffle_scalar:
+; GFX12-64: ; %bb.0: ; %entry
+; GFX12-64-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-64-NEXT: s_wait_expcnt 0x0
+; GFX12-64-NEXT: s_wait_samplecnt 0x0
+; GFX12-64-NEXT: s_wait_bvhcnt 0x0
+; GFX12-64-NEXT: s_wait_kmcnt 0x0
+; GFX12-64-NEXT: v_lshlrev_b32_e32 v1, 2, v1
+; GFX12-64-NEXT: ds_bpermute_b32 v0, v1, v0
+; GFX12-64-NEXT: s_wait_dscnt 0x0
+; GFX12-64-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %0 = tail call float @llvm.amdgcn.subgroup.shuffle(float %val, i32 %idx)
+ ret float %0
+}
>From 741566bef6e5ff882cf576fc68c13da2b81403e0 Mon Sep 17 00:00:00 2001
From: Domenic Nutile <domenic.nutile at gmail.com>
Date: Mon, 10 Nov 2025 17:51:57 -0500
Subject: [PATCH 2/4] PR feedback
Update test prefixes, refactor the lowering function to handle just subgroup shuffle, and apply clang-format.
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 110 +++++++--------
.../AMDGPU/llvm.amdgcn.subgroup.shuffle.ll | 126 +++++++++---------
2 files changed, 113 insertions(+), 123 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 17c4295ca6c22..3fd084cb8773e 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -7280,13 +7280,10 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
return DAG.getBitcast(VT, UnrolledLaneOp);
}
-// Right now, only subgroup.shuffle is implemented, but other
-// future subgroup ops can use this function too
-static SDValue lowerSubgroupOp(const SITargetLowering &TLI, SDNode *N,
- SelectionDAG &DAG) {
+static SDValue lowerSubgroupShuffle(const SITargetLowering &TLI, SDNode *N,
+ SelectionDAG &DAG) {
EVT VT = N->getValueType(0);
unsigned ValSize = VT.getSizeInBits();
- unsigned IID = N->getConstantOperandVal(0);
SDLoc SL(N);
SDValue Value = N->getOperand(1);
@@ -7310,60 +7307,53 @@ static SDValue lowerSubgroupOp(const SITargetLowering &TLI, SDNode *N,
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, RetVT, Operands);
};
- switch (IID) {
- case Intrinsic::amdgcn_subgroup_shuffle:
- if (TLI.getSubtarget()->supportsWaveWideBPermute()) {
- // If we can bpermute across the whole wave, then just do that
- SDValue BPermute = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute,
- MVT::i32, {ShiftedIndex, Value32});
- return DAG.getBitcast(VT, BPermute);
- } else {
- assert(TLI.getSubtarget()->isWave64());
-
- // Otherwise, we need to make use of whole wave mode
- SDValue PoisonVal = DAG.getPOISON(Value32->getValueType(0));
- SDValue PoisonIndex = DAG.getPOISON(ShiftedIndex->getValueType(0));
-
- // Set inactive lanes to poison
- SDValue WWMValue = MakeIntrinsic(Intrinsic::amdgcn_set_inactive,
- MVT::i32, {Value32, PoisonVal});
- SDValue WWMIndex = MakeIntrinsic(Intrinsic::amdgcn_set_inactive,
- MVT::i32, {ShiftedIndex, PoisonIndex});
-
- SDValue Swapped = MakeIntrinsic(Intrinsic::amdgcn_permlane64,
- MVT::i32, {WWMValue});
-
- // Get permutation of each half, then we'll select which one to use
- SDValue BPermSameHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute,
- MVT::i32, {WWMIndex, WWMValue});
- SDValue BPermOtherHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute,
- MVT::i32, {WWMIndex, Swapped});
- SDValue BPermOtherHalfWWM = MakeIntrinsic(Intrinsic::amdgcn_wwm,
- MVT::i32, {BPermOtherHalf});
-
- // Select which side to take the permute from
- SDValue ThreadIDMask = DAG.getTargetConstant(UINT32_MAX, SL, MVT::i32);
- SDValue ThreadIDLo = MakeIntrinsic(Intrinsic::amdgcn_mbcnt_lo, MVT::i32,
- {ThreadIDMask,
- DAG.getTargetConstant(0, SL,
- MVT::i32)});
- SDValue ThreadID = MakeIntrinsic(Intrinsic::amdgcn_mbcnt_hi, MVT::i32,
- {ThreadIDMask, ThreadIDLo});
-
- SDValue SameOrOtherHalf = DAG.getNode(ISD::AND, SL, MVT::i32,
- DAG.getNode(ISD::XOR, SL, MVT::i32,
- ThreadID, Index),
- DAG.getTargetConstant(32, SL,
- MVT::i32));
- SDValue UseSameHalf = DAG.getSetCC(SL, MVT::i1, SameOrOtherHalf,
- DAG.getConstant(0, SL, MVT::i32),
- ISD::SETEQ);
- SDValue Result = DAG.getSelect(SL, MVT::i32, UseSameHalf,
- BPermSameHalf, BPermOtherHalfWWM);
- return DAG.getBitcast(VT, Result);
- }
- default:
- return SDValue();
+ if (TLI.getSubtarget()->supportsWaveWideBPermute()) {
+ // If we can bpermute across the whole wave, then just do that
+ SDValue BPermute = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute, MVT::i32,
+ {ShiftedIndex, Value32});
+ return DAG.getBitcast(VT, BPermute);
+ } else {
+ assert(TLI.getSubtarget()->isWave64());
+
+ // Otherwise, we need to make use of whole wave mode
+ SDValue PoisonVal = DAG.getPOISON(Value32->getValueType(0));
+ SDValue PoisonIndex = DAG.getPOISON(ShiftedIndex->getValueType(0));
+
+ // Set inactive lanes to poison
+ SDValue WWMValue = MakeIntrinsic(Intrinsic::amdgcn_set_inactive, MVT::i32,
+ {Value32, PoisonVal});
+ SDValue WWMIndex = MakeIntrinsic(Intrinsic::amdgcn_set_inactive, MVT::i32,
+ {ShiftedIndex, PoisonIndex});
+
+ SDValue Swapped =
+ MakeIntrinsic(Intrinsic::amdgcn_permlane64, MVT::i32, {WWMValue});
+
+ // Get permutation of each half, then we'll select which one to use
+ SDValue BPermSameHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute,
+ MVT::i32, {WWMIndex, WWMValue});
+ SDValue BPermOtherHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute,
+ MVT::i32, {WWMIndex, Swapped});
+ SDValue BPermOtherHalfWWM =
+ MakeIntrinsic(Intrinsic::amdgcn_wwm, MVT::i32, {BPermOtherHalf});
+
+ // Select which side to take the permute from
+ SDValue ThreadIDMask = DAG.getTargetConstant(UINT32_MAX, SL, MVT::i32);
+ SDValue ThreadIDLo =
+ MakeIntrinsic(Intrinsic::amdgcn_mbcnt_lo, MVT::i32,
+ {ThreadIDMask, DAG.getTargetConstant(0, SL, MVT::i32)});
+ SDValue ThreadID = MakeIntrinsic(Intrinsic::amdgcn_mbcnt_hi, MVT::i32,
+ {ThreadIDMask, ThreadIDLo});
+
+ SDValue SameOrOtherHalf =
+ DAG.getNode(ISD::AND, SL, MVT::i32,
+ DAG.getNode(ISD::XOR, SL, MVT::i32, ThreadID, Index),
+ DAG.getTargetConstant(32, SL, MVT::i32));
+ SDValue UseSameHalf =
+ DAG.getSetCC(SL, MVT::i1, SameOrOtherHalf,
+ DAG.getConstant(0, SL, MVT::i32), ISD::SETEQ);
+ SDValue Result = DAG.getSelect(SL, MVT::i32, UseSameHalf, BPermSameHalf,
+ BPermOtherHalfWWM);
+ return DAG.getBitcast(VT, Result);
}
}
@@ -10275,7 +10265,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return DAG.getMergeValues(Poisons, SDLoc(Op));
}
case Intrinsic::amdgcn_subgroup_shuffle:
- return lowerSubgroupOp(*this, Op.getNode(), DAG);
+ return lowerSubgroupShuffle(*this, Op.getNode(), DAG);
default:
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.subgroup.shuffle.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.subgroup.shuffle.ll
index e31894c6cfa18..4572c0ff9a2f1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.subgroup.shuffle.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.subgroup.shuffle.ll
@@ -1,75 +1,75 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
-; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11-W32 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12-W32 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11-64 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12-64 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11-W64 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12-W64 %s
declare float @llvm.amdgcn.subgroup.shuffle.float(float, i32)
define float @test_subgroup_shuffle_scalar(float %val, i32 %idx) {
-; GFX11-LABEL: test_subgroup_shuffle_scalar:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v1
-; GFX11-NEXT: ds_bpermute_b32 v0, v1, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-W32-LABEL: test_subgroup_shuffle_scalar:
+; GFX11-W32: ; %bb.0: ; %entry
+; GFX11-W32-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-W32-NEXT: v_lshlrev_b32_e32 v1, 2, v1
+; GFX11-W32-NEXT: ds_bpermute_b32 v0, v1, v0
+; GFX11-W32-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-W32-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-LABEL: test_subgroup_shuffle_scalar:
-; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1
-; GFX12-NEXT: ds_bpermute_b32 v0, v1, v0
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-W32-LABEL: test_subgroup_shuffle_scalar:
+; GFX12-W32: ; %bb.0: ; %entry
+; GFX12-W32-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-W32-NEXT: s_wait_expcnt 0x0
+; GFX12-W32-NEXT: s_wait_samplecnt 0x0
+; GFX12-W32-NEXT: s_wait_bvhcnt 0x0
+; GFX12-W32-NEXT: s_wait_kmcnt 0x0
+; GFX12-W32-NEXT: v_lshlrev_b32_e32 v1, 2, v1
+; GFX12-W32-NEXT: ds_bpermute_b32 v0, v1, v0
+; GFX12-W32-NEXT: s_wait_dscnt 0x0
+; GFX12-W32-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-64-LABEL: test_subgroup_shuffle_scalar:
-; GFX11-64: ; %bb.0: ; %entry
-; GFX11-64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-64-NEXT: s_xor_saveexec_b64 s[0:1], -1
-; GFX11-64-NEXT: scratch_store_b32 off, v2, s32 ; 4-byte Folded Spill
-; GFX11-64-NEXT: s_mov_b64 exec, s[0:1]
-; GFX11-64-NEXT: v_lshlrev_b32_e32 v3, 2, v1
-; GFX11-64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
-; GFX11-64-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $exec
-; GFX11-64-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX11-64-NEXT: v_permlane64_b32 v2, v0
-; GFX11-64-NEXT: ds_bpermute_b32 v2, v3, v2
-; GFX11-64-NEXT: s_mov_b64 exec, s[0:1]
-; GFX11-64-NEXT: v_mbcnt_lo_u32_b32 v4, -1, 0
-; GFX11-64-NEXT: ds_bpermute_b32 v0, v3, v0
-; GFX11-64-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v4
-; GFX11-64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-64-NEXT: v_xor_b32_e32 v1, v3, v1
-; GFX11-64-NEXT: s_waitcnt lgkmcnt(1)
-; GFX11-64-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-64-NEXT: v_and_b32_e32 v1, 32, v1
-; GFX11-64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX11-64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-64-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX11-64-NEXT: s_xor_saveexec_b64 s[0:1], -1
-; GFX11-64-NEXT: scratch_load_b32 v2, off, s32 ; 4-byte Folded Reload
-; GFX11-64-NEXT: s_mov_b64 exec, s[0:1]
-; GFX11-64-NEXT: s_waitcnt vmcnt(0)
-; GFX11-64-NEXT: s_setpc_b64 s[30:31]
+; GFX11-W64-LABEL: test_subgroup_shuffle_scalar:
+; GFX11-W64: ; %bb.0: ; %entry
+; GFX11-W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-W64-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX11-W64-NEXT: scratch_store_b32 off, v2, s32 ; 4-byte Folded Spill
+; GFX11-W64-NEXT: s_mov_b64 exec, s[0:1]
+; GFX11-W64-NEXT: v_lshlrev_b32_e32 v3, 2, v1
+; GFX11-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
+; GFX11-W64-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $exec
+; GFX11-W64-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX11-W64-NEXT: v_permlane64_b32 v2, v0
+; GFX11-W64-NEXT: ds_bpermute_b32 v2, v3, v2
+; GFX11-W64-NEXT: s_mov_b64 exec, s[0:1]
+; GFX11-W64-NEXT: v_mbcnt_lo_u32_b32 v4, -1, 0
+; GFX11-W64-NEXT: ds_bpermute_b32 v0, v3, v0
+; GFX11-W64-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v4
+; GFX11-W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-W64-NEXT: v_xor_b32_e32 v1, v3, v1
+; GFX11-W64-NEXT: s_waitcnt lgkmcnt(1)
+; GFX11-W64-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-W64-NEXT: v_and_b32_e32 v1, 32, v1
+; GFX11-W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX11-W64-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-W64-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX11-W64-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX11-W64-NEXT: scratch_load_b32 v2, off, s32 ; 4-byte Folded Reload
+; GFX11-W64-NEXT: s_mov_b64 exec, s[0:1]
+; GFX11-W64-NEXT: s_waitcnt vmcnt(0)
+; GFX11-W64-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-64-LABEL: test_subgroup_shuffle_scalar:
-; GFX12-64: ; %bb.0: ; %entry
-; GFX12-64-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-64-NEXT: s_wait_expcnt 0x0
-; GFX12-64-NEXT: s_wait_samplecnt 0x0
-; GFX12-64-NEXT: s_wait_bvhcnt 0x0
-; GFX12-64-NEXT: s_wait_kmcnt 0x0
-; GFX12-64-NEXT: v_lshlrev_b32_e32 v1, 2, v1
-; GFX12-64-NEXT: ds_bpermute_b32 v0, v1, v0
-; GFX12-64-NEXT: s_wait_dscnt 0x0
-; GFX12-64-NEXT: s_setpc_b64 s[30:31]
+; GFX12-W64-LABEL: test_subgroup_shuffle_scalar:
+; GFX12-W64: ; %bb.0: ; %entry
+; GFX12-W64-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-W64-NEXT: s_wait_expcnt 0x0
+; GFX12-W64-NEXT: s_wait_samplecnt 0x0
+; GFX12-W64-NEXT: s_wait_bvhcnt 0x0
+; GFX12-W64-NEXT: s_wait_kmcnt 0x0
+; GFX12-W64-NEXT: v_lshlrev_b32_e32 v1, 2, v1
+; GFX12-W64-NEXT: ds_bpermute_b32 v0, v1, v0
+; GFX12-W64-NEXT: s_wait_dscnt 0x0
+; GFX12-W64-NEXT: s_setpc_b64 s[30:31]
entry:
%0 = tail call float @llvm.amdgcn.subgroup.shuffle(float %val, i32 %idx)
ret float %0
>From f3f133f48710db6dae46761b95152523a13c3f9a Mon Sep 17 00:00:00 2001
From: Domenic Nutile <domenic.nutile at gmail.com>
Date: Wed, 19 Nov 2025 12:22:59 -0500
Subject: [PATCH 3/4] PR feedback, implement in GISel
---
llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 16 +-
.../AMDGPU/AMDGPUInstructionSelector.cpp | 125 +++++++++++++++
.../Target/AMDGPU/AMDGPUInstructionSelector.h | 1 +
.../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 11 +-
llvm/lib/Target/AMDGPU/GCNSubtarget.h | 2 +-
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 104 ++++++-------
.../AMDGPU/llvm.amdgcn.subgroup.shuffle.ll | 76 ----------
.../AMDGPU/llvm.amdgcn.wave.shuffle.ll | 143 ++++++++++++++++++
8 files changed, 342 insertions(+), 136 deletions(-)
delete mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.subgroup.shuffle.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.shuffle.ll
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index a41723e1e9db8..bb4b324d2de44 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2719,13 +2719,15 @@ def int_amdgcn_call_whole_wave:
[IntrConvergent]>;
// <result>
-// llvm.amdgcn.subgroup.shuffle <value> <id>
-// value and result can be any scalar floating-point, integer,
-// or Boolean type, but must be the same type
-def int_amdgcn_subgroup_shuffle :
- Intrinsic<[llvm_any_ty], // return types
- [LLVMMatchType<0>, llvm_i32_ty], // arg types
- [IntrConvergent, IntrNoMem, IntrNoFree, IntrWillReturn, IntrNoCallback]>; // flags
+// llvm.amdgcn.wave.shuffle <value> <id>
+// value and result can be any 32-bit floating-point, integer,
+// or Boolean type, and must be the same type. Any index
+// value that's outside the valid range will wrap around,
+// and reading from an inactive lane will return 0.
+def int_amdgcn_wave_shuffle :
+ DefaultAttrsIntrinsic<[llvm_any_ty], // return types
+ [LLVMMatchType<0>, llvm_i32_ty], // arg types
+ [IntrConvergent, IntrNoMem, IntrNoFree, IntrWillReturn, IntrNoCallback]>; // flags
//===----------------------------------------------------------------------===//
// CI+ Intrinsics
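
As a semantic illustration of the comment above (a sketch under the stated wrap-around and inactive-lane rules, not normative; waveShuffleSemantics and execMask are hypothetical names):

    #include <cstdint>

    // Hypothetical per-lane model of llvm.amdgcn.wave.shuffle as documented
    // above: out-of-range indices wrap at the wave size, and a read from an
    // inactive lane returns 0.
    uint32_t waveShuffleSemantics(const uint32_t vals[], uint64_t execMask,
                                  unsigned waveSize, unsigned idx) {
      unsigned src = idx & (waveSize - 1);    // wrap out-of-range indices
      bool srcActive = (execMask >> src) & 1; // is the source lane active?
      return srcActive ? vals[src] : 0;       // inactive lanes read as 0
    }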
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 650df2a87506a..d6a59823526b8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1216,6 +1216,8 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
case Intrinsic::amdgcn_permlane16_swap:
case Intrinsic::amdgcn_permlane32_swap:
return selectPermlaneSwapIntrin(I, IntrinsicID);
+ case Intrinsic::amdgcn_wave_shuffle:
+ return selectWaveShuffleIntrin(I);
default:
return selectImpl(I, *CoverageInfo);
}
@@ -3852,6 +3854,129 @@ bool AMDGPUInstructionSelector::selectWaveAddress(MachineInstr &MI) const {
return true;
}
+bool AMDGPUInstructionSelector::selectWaveShuffleIntrin(
+ MachineInstr &MI) const {
+ assert(MI.getNumOperands() == 4);
+ MachineBasicBlock *MBB = MI.getParent();
+ const DebugLoc &DL = MI.getDebugLoc();
+
+ Register DstReg = MI.getOperand(0).getReg();
+ Register ValReg = MI.getOperand(2).getReg();
+ Register IdxReg = MI.getOperand(3).getReg();
+
+ const LLT DstTy = MRI->getType(DstReg);
+ unsigned DstSize = DstTy.getSizeInBits();
+ const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
+ const TargetRegisterClass *DstRC =
+ TRI.getRegClassForSizeOnBank(DstSize, *DstRB);
+
+ assert(DstTy == LLT::scalar(32));
+
+ // If we can bpermute across the whole wave, then just do that
+ if (Subtarget->supportsWaveWideBPermute()) {
+ Register ShiftIdxReg = MRI->createVirtualRegister(DstRC);
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), ShiftIdxReg)
+ .addImm(2)
+ .addReg(IdxReg);
+
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::DS_BPERMUTE_B32), DstReg)
+ .addReg(ShiftIdxReg)
+ .addReg(ValReg)
+ .addImm(0);
+ } else {
+ // Otherwise, we need to make use of whole wave mode
+ assert(Subtarget->isWave64());
+
+ // Set inactive lanes to poison
+ Register UndefValReg =
+ MRI->createVirtualRegister(TRI.getRegClass(AMDGPU::SReg_32RegClassID));
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefValReg);
+
+ Register UndefExecReg = MRI->createVirtualRegister(
+ TRI.getRegClass(AMDGPU::SReg_64_XEXECRegClassID));
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefExecReg);
+
+ Register PoisonValReg = MRI->createVirtualRegister(DstRC);
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SET_INACTIVE_B32), PoisonValReg)
+ .addImm(0)
+ .addReg(ValReg)
+ .addImm(0)
+ .addReg(UndefValReg)
+ .addReg(UndefExecReg);
+
+ // ds_bpermute requires index to be multiplied by 4
+ Register ShiftIdxReg = MRI->createVirtualRegister(DstRC);
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), ShiftIdxReg)
+ .addImm(2)
+ .addReg(IdxReg);
+
+ Register PoisonIdxReg = MRI->createVirtualRegister(DstRC);
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SET_INACTIVE_B32), PoisonIdxReg)
+ .addImm(0)
+ .addReg(ShiftIdxReg)
+ .addImm(0)
+ .addReg(UndefValReg)
+ .addReg(UndefExecReg);
+
+ // Get permutation of each half, then we'll select which one to use
+ Register SameSidePermReg = MRI->createVirtualRegister(DstRC);
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::DS_BPERMUTE_B32), SameSidePermReg)
+ .addReg(PoisonIdxReg)
+ .addReg(PoisonValReg)
+ .addImm(0);
+
+ Register SwappedValReg = MRI->createVirtualRegister(DstRC);
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_PERMLANE64_B32), SwappedValReg)
+ .addReg(PoisonValReg);
+
+ Register OppSidePermReg = MRI->createVirtualRegister(DstRC);
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::DS_BPERMUTE_B32), OppSidePermReg)
+ .addReg(PoisonIdxReg)
+ .addReg(SwappedValReg)
+ .addImm(0);
+
+ Register WWMSwapPermReg = MRI->createVirtualRegister(DstRC);
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::STRICT_WWM), WWMSwapPermReg)
+ .addReg(OppSidePermReg);
+
+ // Select which side to take the permute from
+ // We can get away with only using mbcnt_lo here since we're only
+ // trying to detect which side of 32 each lane is on, and mbcnt_lo
+ // returns 32 for lanes 32-63.
+ Register ThreadIDReg = MRI->createVirtualRegister(DstRC);
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MBCNT_LO_U32_B32_e64), ThreadIDReg)
+ .addImm(-1)
+ .addImm(0);
+
+ Register XORReg = MRI->createVirtualRegister(DstRC);
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_XOR_B32_e64), XORReg)
+ .addReg(ThreadIDReg)
+ .addReg(PoisonIdxReg);
+
+ Register ANDReg = MRI->createVirtualRegister(DstRC);
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_AND_B32_e64), ANDReg)
+ .addReg(XORReg)
+ .addImm(32);
+
+ Register CompareReg = MRI->createVirtualRegister(
+ TRI.getRegClass(AMDGPU::SReg_64_XEXECRegClassID));
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_CMP_EQ_U32_e64), CompareReg)
+ .addReg(ANDReg)
+ .addImm(0);
+
+ // Finally do the selection
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
+ .addImm(0)
+ .addReg(WWMSwapPermReg)
+ .addImm(0)
+ .addReg(SameSidePermReg)
+ .addReg(CompareReg);
+ }
+
+ MI.eraseFromParent();
+ return true;
+}
+
// Match BITOP3 operation and return a number of matched instructions plus
// truth table.
static std::pair<unsigned, uint8_t> BitOp3_Op(Register R,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index c760fe7ef99dd..627cce277ae38 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -156,6 +156,7 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
bool selectSBarrierSignalIsfirst(MachineInstr &I, Intrinsic::ID IID) const;
bool selectSGetBarrierState(MachineInstr &I, Intrinsic::ID IID) const;
bool selectSBarrierLeave(MachineInstr &I) const;
+ bool selectWaveShuffleIntrin(MachineInstr &I) const;
std::pair<Register, unsigned> selectVOP3ModsImpl(Register Src,
bool IsCanonicalizing = true,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 7ed026ee5f69e..7d838c58d607d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -5230,11 +5230,20 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[2] = AMDGPU::getValueMapping(regBankID, OpSize);
break;
}
- case Intrinsic::amdgcn_s_bitreplicate:
+ case Intrinsic::amdgcn_s_bitreplicate: {
Register MaskReg = MI.getOperand(2).getReg();
unsigned MaskBank = getRegBankID(MaskReg, MRI, AMDGPU::SGPRRegBankID);
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64);
OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, 32);
+ break;
+ }
+ case Intrinsic::amdgcn_wave_shuffle: {
+ unsigned OpSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
+ OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
+ OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
+ break;
+ }
}
break;
}
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 8e225dc4735ad..df98d473e16e2 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1896,7 +1896,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
}
bool supportsWaveWideBPermute() const {
- return ((getGeneration() == AMDGPUSubtarget::GFX12) || isWave32());
+ return getGeneration() == AMDGPUSubtarget::GFX12 || isWave32();
}
};
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 3fd084cb8773e..e271298527faa 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -7280,24 +7280,25 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
return DAG.getBitcast(VT, UnrolledLaneOp);
}
-static SDValue lowerSubgroupShuffle(const SITargetLowering &TLI, SDNode *N,
+static SDValue lowerWaveShuffle(const SITargetLowering &TLI, SDNode *N,
SelectionDAG &DAG) {
EVT VT = N->getValueType(0);
unsigned ValSize = VT.getSizeInBits();
+ assert(ValSize == 32);
SDLoc SL(N);
SDValue Value = N->getOperand(1);
SDValue Index = N->getOperand(2);
// ds_bpermute requires index to be multiplied by 4
- SDValue ShiftAmount = DAG.getTargetConstant(2, SL, MVT::i32);
+ SDValue ShiftAmount = DAG.getShiftAmountConstant(2, MVT::i32, SL);
SDValue ShiftedIndex = DAG.getNode(ISD::SHL, SL, Index.getValueType(), Index,
ShiftAmount);
// The intrinsics used below require i32 operands
- SDValue Value32 = Value;
- if ((ValSize != 32) || (VT.isFloatingPoint()))
- Value32 = DAG.getBitcast(MVT::i32, Value);
+ SDValue ValueI32 = Value;
+ if (VT.isFloatingPoint())
+ ValueI32 = DAG.getBitcast(MVT::i32, Value);
auto MakeIntrinsic = [&DAG, &SL](unsigned IID, MVT RetVT,
SmallVector<SDValue> IntrinArgs) -> SDValue {
@@ -7307,54 +7308,55 @@ static SDValue lowerSubgroupShuffle(const SITargetLowering &TLI, SDNode *N,
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, RetVT, Operands);
};
+ // If we can bpermute across the whole wave, then just do that
if (TLI.getSubtarget()->supportsWaveWideBPermute()) {
- // If we can bpermute across the whole wave, then just do that
SDValue BPermute = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute, MVT::i32,
- {ShiftedIndex, Value32});
+ {ShiftedIndex, ValueI32});
return DAG.getBitcast(VT, BPermute);
- } else {
- assert(TLI.getSubtarget()->isWave64());
-
- // Otherwise, we need to make use of whole wave mode
- SDValue PoisonVal = DAG.getPOISON(Value32->getValueType(0));
- SDValue PoisonIndex = DAG.getPOISON(ShiftedIndex->getValueType(0));
-
- // Set inactive lanes to poison
- SDValue WWMValue = MakeIntrinsic(Intrinsic::amdgcn_set_inactive, MVT::i32,
- {Value32, PoisonVal});
- SDValue WWMIndex = MakeIntrinsic(Intrinsic::amdgcn_set_inactive, MVT::i32,
- {ShiftedIndex, PoisonIndex});
-
- SDValue Swapped =
- MakeIntrinsic(Intrinsic::amdgcn_permlane64, MVT::i32, {WWMValue});
-
- // Get permutation of each half, then we'll select which one to use
- SDValue BPermSameHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute,
- MVT::i32, {WWMIndex, WWMValue});
- SDValue BPermOtherHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute,
- MVT::i32, {WWMIndex, Swapped});
- SDValue BPermOtherHalfWWM =
- MakeIntrinsic(Intrinsic::amdgcn_wwm, MVT::i32, {BPermOtherHalf});
-
- // Select which side to take the permute from
- SDValue ThreadIDMask = DAG.getTargetConstant(UINT32_MAX, SL, MVT::i32);
- SDValue ThreadIDLo =
- MakeIntrinsic(Intrinsic::amdgcn_mbcnt_lo, MVT::i32,
- {ThreadIDMask, DAG.getTargetConstant(0, SL, MVT::i32)});
- SDValue ThreadID = MakeIntrinsic(Intrinsic::amdgcn_mbcnt_hi, MVT::i32,
- {ThreadIDMask, ThreadIDLo});
-
- SDValue SameOrOtherHalf =
- DAG.getNode(ISD::AND, SL, MVT::i32,
- DAG.getNode(ISD::XOR, SL, MVT::i32, ThreadID, Index),
- DAG.getTargetConstant(32, SL, MVT::i32));
- SDValue UseSameHalf =
- DAG.getSetCC(SL, MVT::i1, SameOrOtherHalf,
- DAG.getConstant(0, SL, MVT::i32), ISD::SETEQ);
- SDValue Result = DAG.getSelect(SL, MVT::i32, UseSameHalf, BPermSameHalf,
- BPermOtherHalfWWM);
- return DAG.getBitcast(VT, Result);
}
+
+ assert(TLI.getSubtarget()->isWave64());
+
+ // Otherwise, we need to make use of whole wave mode
+ SDValue PoisonVal = DAG.getPOISON(ValueI32->getValueType(0));
+ SDValue PoisonIndex = DAG.getPOISON(ShiftedIndex->getValueType(0));
+
+ // Set inactive lanes to poison
+ SDValue WWMValue = MakeIntrinsic(Intrinsic::amdgcn_set_inactive, MVT::i32,
+ {ValueI32, PoisonVal});
+ SDValue WWMIndex = MakeIntrinsic(Intrinsic::amdgcn_set_inactive, MVT::i32,
+ {ShiftedIndex, PoisonIndex});
+
+ SDValue Swapped =
+ MakeIntrinsic(Intrinsic::amdgcn_permlane64, MVT::i32, {WWMValue});
+
+ // Get permutation of each half, then we'll select which one to use
+ SDValue BPermSameHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute,
+ MVT::i32, {WWMIndex, WWMValue});
+ SDValue BPermOtherHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute,
+ MVT::i32, {WWMIndex, Swapped});
+ SDValue BPermOtherHalfWWM =
+ MakeIntrinsic(Intrinsic::amdgcn_wwm, MVT::i32, {BPermOtherHalf});
+
+ // Select which side to take the permute from
+ SDValue ThreadIDMask = DAG.getAllOnesConstant(SL, MVT::i32);
+ // We can get away with only using mbcnt_lo here since we're only
+ // trying to detect which side of 32 each lane is on, and mbcnt_lo
+ // returns 32 for lanes 32-63.
+ SDValue ThreadID =
+ MakeIntrinsic(Intrinsic::amdgcn_mbcnt_lo, MVT::i32,
+ {ThreadIDMask, DAG.getTargetConstant(0, SL, MVT::i32)});
+
+ SDValue SameOrOtherHalf =
+ DAG.getNode(ISD::AND, SL, MVT::i32,
+ DAG.getNode(ISD::XOR, SL, MVT::i32, ThreadID, Index),
+ DAG.getTargetConstant(32, SL, MVT::i32));
+ SDValue UseSameHalf =
+ DAG.getSetCC(SL, MVT::i1, SameOrOtherHalf,
+ DAG.getConstant(0, SL, MVT::i32), ISD::SETEQ);
+ SDValue Result = DAG.getSelect(SL, MVT::i32, UseSameHalf, BPermSameHalf,
+ BPermOtherHalfWWM);
+ return DAG.getBitcast(VT, Result);
}
void SITargetLowering::ReplaceNodeResults(SDNode *N,
@@ -10264,8 +10266,8 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
Poisons.push_back(DAG.getPOISON(ValTy));
return DAG.getMergeValues(Poisons, SDLoc(Op));
}
- case Intrinsic::amdgcn_subgroup_shuffle:
- return lowerSubgroupShuffle(*this, Op.getNode(), DAG);
+ case Intrinsic::amdgcn_wave_shuffle:
+ return lowerWaveShuffle(*this, Op.getNode(), DAG);
default:
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
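
The mbcnt_lo simplification above is worth a worked example: with an all-ones mask, v_mbcnt_lo_u32_b32 counts the set mask bits below the lane within the low 32 bits only, so lane 5 gets 5 while lane 40 gets 32 (all 32 low bits lie below it). Bit 5 of the result therefore distinguishes the two halves, which is all the select needs. A rough model (mbcntLoModel is a hypothetical name; std::popcount stands in for the hardware count):

    #include <bit>
    #include <cstdint>

    // Hypothetical model of v_mbcnt_lo_u32_b32 with an all-ones mask: count
    // the set bits of the low 32 mask bits strictly below this lane. Lanes
    // 0-31 get their lane ID; lanes 32-63 always get 32, which is enough to
    // tell the halves apart without a second v_mbcnt_hi_u32_b32.
    unsigned mbcntLoModel(unsigned tid) {
      uint64_t below = (tid == 0) ? 0 : (~0ull >> (64 - tid)); // bits < tid
      return std::popcount(static_cast<uint32_t>(below & 0xffffffffu));
    }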
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.subgroup.shuffle.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.subgroup.shuffle.ll
deleted file mode 100644
index 4572c0ff9a2f1..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.subgroup.shuffle.ll
+++ /dev/null
@@ -1,76 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
-; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11-W32 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12-W32 %s
-
-; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11-W64 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12-W64 %s
-
-declare float @llvm.amdgcn.subgroup.shuffle.float(float, i32)
-
-define float @test_subgroup_shuffle_scalar(float %val, i32 %idx) {
-; GFX11-W32-LABEL: test_subgroup_shuffle_scalar:
-; GFX11-W32: ; %bb.0: ; %entry
-; GFX11-W32-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-W32-NEXT: v_lshlrev_b32_e32 v1, 2, v1
-; GFX11-W32-NEXT: ds_bpermute_b32 v0, v1, v0
-; GFX11-W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-W32-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-W32-LABEL: test_subgroup_shuffle_scalar:
-; GFX12-W32: ; %bb.0: ; %entry
-; GFX12-W32-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-W32-NEXT: s_wait_expcnt 0x0
-; GFX12-W32-NEXT: s_wait_samplecnt 0x0
-; GFX12-W32-NEXT: s_wait_bvhcnt 0x0
-; GFX12-W32-NEXT: s_wait_kmcnt 0x0
-; GFX12-W32-NEXT: v_lshlrev_b32_e32 v1, 2, v1
-; GFX12-W32-NEXT: ds_bpermute_b32 v0, v1, v0
-; GFX12-W32-NEXT: s_wait_dscnt 0x0
-; GFX12-W32-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-W64-LABEL: test_subgroup_shuffle_scalar:
-; GFX11-W64: ; %bb.0: ; %entry
-; GFX11-W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-W64-NEXT: s_xor_saveexec_b64 s[0:1], -1
-; GFX11-W64-NEXT: scratch_store_b32 off, v2, s32 ; 4-byte Folded Spill
-; GFX11-W64-NEXT: s_mov_b64 exec, s[0:1]
-; GFX11-W64-NEXT: v_lshlrev_b32_e32 v3, 2, v1
-; GFX11-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
-; GFX11-W64-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $exec
-; GFX11-W64-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX11-W64-NEXT: v_permlane64_b32 v2, v0
-; GFX11-W64-NEXT: ds_bpermute_b32 v2, v3, v2
-; GFX11-W64-NEXT: s_mov_b64 exec, s[0:1]
-; GFX11-W64-NEXT: v_mbcnt_lo_u32_b32 v4, -1, 0
-; GFX11-W64-NEXT: ds_bpermute_b32 v0, v3, v0
-; GFX11-W64-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v4
-; GFX11-W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-W64-NEXT: v_xor_b32_e32 v1, v3, v1
-; GFX11-W64-NEXT: s_waitcnt lgkmcnt(1)
-; GFX11-W64-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-W64-NEXT: v_and_b32_e32 v1, 32, v1
-; GFX11-W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX11-W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-W64-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX11-W64-NEXT: s_xor_saveexec_b64 s[0:1], -1
-; GFX11-W64-NEXT: scratch_load_b32 v2, off, s32 ; 4-byte Folded Reload
-; GFX11-W64-NEXT: s_mov_b64 exec, s[0:1]
-; GFX11-W64-NEXT: s_waitcnt vmcnt(0)
-; GFX11-W64-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-W64-LABEL: test_subgroup_shuffle_scalar:
-; GFX12-W64: ; %bb.0: ; %entry
-; GFX12-W64-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-W64-NEXT: s_wait_expcnt 0x0
-; GFX12-W64-NEXT: s_wait_samplecnt 0x0
-; GFX12-W64-NEXT: s_wait_bvhcnt 0x0
-; GFX12-W64-NEXT: s_wait_kmcnt 0x0
-; GFX12-W64-NEXT: v_lshlrev_b32_e32 v1, 2, v1
-; GFX12-W64-NEXT: ds_bpermute_b32 v0, v1, v0
-; GFX12-W64-NEXT: s_wait_dscnt 0x0
-; GFX12-W64-NEXT: s_setpc_b64 s[30:31]
-entry:
- %0 = tail call float @llvm.amdgcn.subgroup.shuffle(float %val, i32 %idx)
- ret float %0
-}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.shuffle.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.shuffle.ll
new file mode 100644
index 0000000000000..96039dc11c70b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.shuffle.ll
@@ -0,0 +1,143 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11-W32 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12-W32 %s
+
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11-W64 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12-W64 %s
+
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11-W32-GISEL %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12-W32-GISEL %s
+
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11-W64-GISEL %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12-W64-GISEL %s
+
+declare float @llvm.amdgcn.wave.shuffle.float(float, i32)
+
+define float @test_wave_shuffle_float(float %val, i32 %idx) {
+; GFX11-W32-LABEL: test_wave_shuffle_float:
+; GFX11-W32: ; %bb.0: ; %entry
+; GFX11-W32-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-W32-NEXT: v_lshlrev_b32_e32 v1, 2, v1
+; GFX11-W32-NEXT: ds_bpermute_b32 v0, v1, v0
+; GFX11-W32-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-W32-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-W32-LABEL: test_wave_shuffle_float:
+; GFX12-W32: ; %bb.0: ; %entry
+; GFX12-W32-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-W32-NEXT: s_wait_expcnt 0x0
+; GFX12-W32-NEXT: s_wait_samplecnt 0x0
+; GFX12-W32-NEXT: s_wait_bvhcnt 0x0
+; GFX12-W32-NEXT: s_wait_kmcnt 0x0
+; GFX12-W32-NEXT: v_lshlrev_b32_e32 v1, 2, v1
+; GFX12-W32-NEXT: ds_bpermute_b32 v0, v1, v0
+; GFX12-W32-NEXT: s_wait_dscnt 0x0
+; GFX12-W32-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-W64-LABEL: test_wave_shuffle_float:
+; GFX11-W64: ; %bb.0: ; %entry
+; GFX11-W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-W64-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX11-W64-NEXT: scratch_store_b32 off, v2, s32 ; 4-byte Folded Spill
+; GFX11-W64-NEXT: s_mov_b64 exec, s[0:1]
+; GFX11-W64-NEXT: v_lshlrev_b32_e32 v3, 2, v1
+; GFX11-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
+; GFX11-W64-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $exec
+; GFX11-W64-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX11-W64-NEXT: v_permlane64_b32 v2, v0
+; GFX11-W64-NEXT: ds_bpermute_b32 v2, v3, v2
+; GFX11-W64-NEXT: s_mov_b64 exec, s[0:1]
+; GFX11-W64-NEXT: ds_bpermute_b32 v0, v3, v0
+; GFX11-W64-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0
+; GFX11-W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-W64-NEXT: v_xor_b32_e32 v1, v3, v1
+; GFX11-W64-NEXT: s_waitcnt lgkmcnt(1)
+; GFX11-W64-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-W64-NEXT: v_and_b32_e32 v1, 32, v1
+; GFX11-W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX11-W64-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-W64-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX11-W64-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX11-W64-NEXT: scratch_load_b32 v2, off, s32 ; 4-byte Folded Reload
+; GFX11-W64-NEXT: s_mov_b64 exec, s[0:1]
+; GFX11-W64-NEXT: s_waitcnt vmcnt(0)
+; GFX11-W64-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-W64-LABEL: test_wave_shuffle_float:
+; GFX12-W64: ; %bb.0: ; %entry
+; GFX12-W64-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-W64-NEXT: s_wait_expcnt 0x0
+; GFX12-W64-NEXT: s_wait_samplecnt 0x0
+; GFX12-W64-NEXT: s_wait_bvhcnt 0x0
+; GFX12-W64-NEXT: s_wait_kmcnt 0x0
+; GFX12-W64-NEXT: v_lshlrev_b32_e32 v1, 2, v1
+; GFX12-W64-NEXT: ds_bpermute_b32 v0, v1, v0
+; GFX12-W64-NEXT: s_wait_dscnt 0x0
+; GFX12-W64-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-W32-GISEL-LABEL: test_wave_shuffle_float:
+; GFX11-W32-GISEL: ; %bb.0: ; %entry
+; GFX11-W32-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-W32-GISEL-NEXT: v_lshlrev_b32_e32 v1, 2, v1
+; GFX11-W32-GISEL-NEXT: ds_bpermute_b32 v0, v1, v0
+; GFX11-W32-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-W32-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-W32-GISEL-LABEL: test_wave_shuffle_float:
+; GFX12-W32-GISEL: ; %bb.0: ; %entry
+; GFX12-W32-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-W32-GISEL-NEXT: s_wait_expcnt 0x0
+; GFX12-W32-GISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12-W32-GISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12-W32-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-W32-GISEL-NEXT: v_lshlrev_b32_e32 v1, 2, v1
+; GFX12-W32-GISEL-NEXT: ds_bpermute_b32 v0, v1, v0
+; GFX12-W32-GISEL-NEXT: s_wait_dscnt 0x0
+; GFX12-W32-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-W64-GISEL-LABEL: test_wave_shuffle_float:
+; GFX11-W64-GISEL: ; %bb.0: ; %entry
+; GFX11-W64-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-W64-GISEL-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX11-W64-GISEL-NEXT: scratch_store_b32 off, v2, s32 ; 4-byte Folded Spill
+; GFX11-W64-GISEL-NEXT: s_mov_b64 exec, s[0:1]
+; GFX11-W64-GISEL-NEXT: v_lshlrev_b32_e32 v1, 2, v1
+; GFX11-W64-GISEL-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
+; GFX11-W64-GISEL-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $exec
+; GFX11-W64-GISEL-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX11-W64-GISEL-NEXT: v_permlane64_b32 v2, v0
+; GFX11-W64-GISEL-NEXT: ds_bpermute_b32 v2, v1, v2
+; GFX11-W64-GISEL-NEXT: s_mov_b64 exec, s[0:1]
+; GFX11-W64-GISEL-NEXT: ds_bpermute_b32 v0, v1, v0
+; GFX11-W64-GISEL-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0
+; GFX11-W64-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-W64-GISEL-NEXT: v_xor_b32_e32 v1, v3, v1
+; GFX11-W64-GISEL-NEXT: s_waitcnt lgkmcnt(1)
+; GFX11-W64-GISEL-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-W64-GISEL-NEXT: v_and_b32_e32 v1, 32, v1
+; GFX11-W64-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-W64-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX11-W64-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-W64-GISEL-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX11-W64-GISEL-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX11-W64-GISEL-NEXT: scratch_load_b32 v2, off, s32 ; 4-byte Folded Reload
+; GFX11-W64-GISEL-NEXT: s_mov_b64 exec, s[0:1]
+; GFX11-W64-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-W64-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-W64-GISEL-LABEL: test_wave_shuffle_float:
+; GFX12-W64-GISEL: ; %bb.0: ; %entry
+; GFX12-W64-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-W64-GISEL-NEXT: s_wait_expcnt 0x0
+; GFX12-W64-GISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12-W64-GISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12-W64-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-W64-GISEL-NEXT: v_lshlrev_b32_e32 v1, 2, v1
+; GFX12-W64-GISEL-NEXT: ds_bpermute_b32 v0, v1, v0
+; GFX12-W64-GISEL-NEXT: s_wait_dscnt 0x0
+; GFX12-W64-GISEL-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %0 = tail call float @llvm.amdgcn.wave.shuffle(float %val, i32 %idx)
+ ret float %0
+}
>From c2d8e0e26c50a4fe38d16e220a1a396a3c2b9991 Mon Sep 17 00:00:00 2001
From: saxlungs <152745038+saxlungs at users.noreply.github.com>
Date: Thu, 20 Nov 2025 09:59:58 -0500
Subject: [PATCH 4/4] Apply suggestions from code review
Co-authored-by: Jay Foad <jay.foad at gmail.com>
---
llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 2 +-
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 3 +--
2 files changed, 2 insertions(+), 3 deletions(-)
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index bb4b324d2de44..5039bf4fec850 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2727,7 +2727,7 @@ def int_amdgcn_call_whole_wave:
def int_amdgcn_wave_shuffle :
DefaultAttrsIntrinsic<[llvm_any_ty], // return types
[LLVMMatchType<0>, llvm_i32_ty], // arg types
- [IntrConvergent, IntrNoMem, IntrNoFree, IntrWillReturn, IntrNoCallback]>; // flags
+ [IntrConvergent, IntrNoMem]>; // flags
//===----------------------------------------------------------------------===//
// CI+ Intrinsics
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index e271298527faa..0ca6b94b6561e 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -7283,8 +7283,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
static SDValue lowerWaveShuffle(const SITargetLowering &TLI, SDNode *N,
SelectionDAG &DAG) {
EVT VT = N->getValueType(0);
- unsigned ValSize = VT.getSizeInBits();
- assert(ValSize == 32);
+ assert(VT.getSizeInBits() == 32);
SDLoc SL(N);
SDValue Value = N->getOperand(1);
More information about the llvm-commits mailing list