[llvm] [AMDGPU] Add DPP16 Row Share optimization for llvm.amdgcn.wave.shuffle (PR #177470)
Domenic Nutile via llvm-commits
llvm-commits at lists.llvm.org
Mon Feb 2 07:36:08 PST 2026
https://github.com/saxlungs updated https://github.com/llvm/llvm-project/pull/177470
>From 30a6e7136ec7662ee74624cd05d74059c5c78bbe Mon Sep 17 00:00:00 2001
From: Domenic Nutile <domenic.nutile at gmail.com>
Date: Thu, 22 Jan 2026 16:05:11 -0500
Subject: [PATCH 1/7] [AMDGPU] Add DPP16 Row Share optimization for
llvm.amdgcn.wave.shuffle
---
.../AMDGPU/AMDGPUInstCombineIntrinsic.cpp | 91 +++++++++++++
.../AMDGPU/llvm.amdgcn.wave.shuffle.ll | 122 ++++++++++++++++++
2 files changed, 213 insertions(+)
create mode 100644 llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.wave.shuffle.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index 2cd1902785546..7b39dc264a9d2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -553,6 +553,91 @@ static CallInst *rewriteCall(IRBuilderBase &B, CallInst &Old,
return NewCall;
}
+// Return true for sequences of instructions that effectively assign
+// each lane to its thread ID
+bool isThreadID(Value *V) {
+ // Case 1:
+ // wave32: mbcnt_lo(-1, 0)
+ // wave64: mbcnt_hi(-1, mbcnt_lo(-1, 0))
+ ConstantInt *HiMask, *LoMask, *Input;
+ auto W32Pred = m_Intrinsic<Intrinsic::amdgcn_mbcnt_lo>(m_ConstantInt(LoMask),
+ m_ConstantInt(Input));
+ auto W64Pred = m_Intrinsic<Intrinsic::amdgcn_mbcnt_hi>(
+ m_ConstantInt(HiMask), m_Intrinsic<Intrinsic::amdgcn_mbcnt_lo>(
+ m_ConstantInt(LoMask), m_ConstantInt(Input)));
+ if (match(V, W32Pred) && LoMask->getSExtValue() == -1 &&
+ Input->getZExtValue() == 0)
+ return true;
+ if (match(V, W64Pred) && HiMask->getSExtValue() == -1 &&
+ LoMask->getSExtValue() == -1 && Input->getZExtValue() == 0)
+ return true;
+
+ // Case 2:
+ // workitem.x()
+ CallInst *WIdX = dyn_cast<CallInst>(V);
+ if (WIdX && WIdX->getIntrinsicID() == Intrinsic::amdgcn_workitem_id_x)
+ return true;
+
+ return false;
+}
+
+// Attempt to capture situations where the index argument matches
+// a DPP pattern, and convert to a DPP-based mov
+std::optional<Instruction *> tryWaveShuffleDPP(InstCombiner &IC,
+ IntrinsicInst &II) {
+ Value *Val = II.getArgOperand(0);
+ Value *Idx = II.getArgOperand(1);
+ auto &B = IC.Builder;
+
+ // DPP16 Row Share 0: Idx = Tid & Mask where (Mask & 0x1F) = 0x10
+ Value *Tid;
+ ConstantInt *Mask;
+ auto RowShare0Pred = m_And(m_Value(Tid), m_ConstantInt(Mask));
+ if (match(Idx, RowShare0Pred) && isThreadID(Tid) &&
+ (Mask->getZExtValue() & 0x1F) == 0x10) {
+ CallInst *UpdateDPP = B.CreateIntrinsic(
+ Intrinsic::amdgcn_update_dpp, Val->getType(),
+ {B.getInt32(0), Val, B.getInt32(AMDGPU::DPP::ROW_SHR0), B.getInt32(0xF),
+ B.getInt32(0xF), B.getFalse()});
+ UpdateDPP->takeName(&II);
+ UpdateDPP->copyMetadata(II);
+ return IC.replaceInstUsesWith(II, UpdateDPP);
+ }
+
+ // DPP16 Row Share (0 < Row < 15): Idx = (Tid & Mask) | Row where (Mask &
+ // 0x1F) = 0x10
+ ConstantInt *Row;
+ auto RowSharePred =
+ m_Or(m_And(m_Value(Tid), m_ConstantInt(Mask)), m_ConstantInt(Row));
+ if (match(Idx, RowSharePred) && isThreadID(Tid) &&
+ (Mask->getZExtValue() & 0x1F) == 0x10 && Row->getZExtValue() < 15) {
+ CallInst *UpdateDPP = B.CreateIntrinsic(
+ Intrinsic::amdgcn_update_dpp, Val->getType(),
+ {B.getInt32(0), Val,
+ B.getInt32(AMDGPU::DPP::ROW_SHR0 | Row->getZExtValue()),
+ B.getInt32(0xF), B.getInt32(0xF), B.getFalse()});
+ UpdateDPP->takeName(&II);
+ UpdateDPP->copyMetadata(II);
+ return IC.replaceInstUsesWith(II, UpdateDPP);
+ }
+
+ // DPP16 Row Share 15: Idx = Tid | 0xF
+ auto RowShare15Pred = m_Or(m_Value(Tid), m_ConstantInt(Row));
+ if (match(Idx, RowShare15Pred) && isThreadID(Tid) &&
+ Row->getZExtValue() == 15) {
+ CallInst *UpdateDPP = B.CreateIntrinsic(
+ Intrinsic::amdgcn_update_dpp, Val->getType(),
+ {B.getInt32(0), Val, B.getInt32(AMDGPU::DPP::ROW_SHR_LAST),
+ B.getInt32(0xF), B.getInt32(0xF), B.getFalse()});
+ UpdateDPP->takeName(&II);
+ UpdateDPP->copyMetadata(II);
+ return IC.replaceInstUsesWith(II, UpdateDPP);
+ }
+
+ // No valid DPP detected
+ return std::nullopt;
+}
+
Instruction *
GCNTTIImpl::hoistLaneIntrinsicThroughOperand(InstCombiner &IC,
IntrinsicInst &II) const {
@@ -1759,6 +1844,12 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
NewII->copyMetadata(II);
return IC.eraseInstFromFunction(II);
}
+ case Intrinsic::amdgcn_wave_shuffle: {
+ if (!ST->hasDPP())
+ return std::nullopt;
+
+ return tryWaveShuffleDPP(IC, II);
+ }
}
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.wave.shuffle.ll b/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.wave.shuffle.ll
new file mode 100644
index 0000000000000..3161a0a55e274
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.wave.shuffle.ll
@@ -0,0 +1,122 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -passes=instcombine -S < %s | FileCheck %s
+
+define i32 @wave_shuffle_self_select(i32 %val) {
+; CHECK-LABEL: define i32 @wave_shuffle_self_select(
+; CHECK-SAME: i32 [[VAL:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[TID:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+; CHECK-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[TID]])
+; CHECK-NEXT: ret i32 [[RES]]
+;
+ %tid = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+ %res = tail call i32 @llvm.amdgcn.wave.shuffle(i32 %val, i32 %tid)
+ ret i32 %res
+}
+
+define i32 @wave_shuffle_dpp_row_share_0(i32 %val) {
+; CHECK-LABEL: define i32 @wave_shuffle_dpp_row_share_0(
+; CHECK-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 [[VAL]], i32 272, i32 15, i32 15, i1 false)
+; CHECK-NEXT: ret i32 [[RES]]
+;
+ %tid = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+ %masked = and i32 %tid, 65520 ; 0xFFF0
+ %share_0 = or i32 %masked, 0
+ %res = tail call i32 @llvm.amdgcn.wave.shuffle(i32 %val, i32 %share_0)
+ ret i32 %res
+}
+
+define i32 @wave_shuffle_dpp_row_share_7(i32 %val) {
+; CHECK-LABEL: define i32 @wave_shuffle_dpp_row_share_7(
+; CHECK-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 [[VAL]], i32 279, i32 15, i32 15, i1 false)
+; CHECK-NEXT: ret i32 [[RES]]
+;
+ %tid = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+ %masked = and i32 %tid, 65520 ; 0xFFF0
+ %share_7 = or i32 %masked, 7
+ %res = tail call i32 @llvm.amdgcn.wave.shuffle(i32 %val, i32 %share_7)
+ ret i32 %res
+}
+
+define i32 @wave_shuffle_dpp_row_share_7_no_mask(i32 %val) {
+; CHECK-LABEL: define i32 @wave_shuffle_dpp_row_share_7_no_mask(
+; CHECK-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TID:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+; CHECK-NEXT: [[SHARE_7:%.*]] = or i32 [[TID]], 7
+; CHECK-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[SHARE_7]])
+; CHECK-NEXT: ret i32 [[RES]]
+;
+ %tid = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+ %share_7 = or i32 %tid, 7
+ %res = tail call i32 @llvm.amdgcn.wave.shuffle(i32 %val, i32 %share_7)
+ ret i32 %res
+}
+
+define i32 @wave_shuffle_not_quite_row_share(i32 %val) {
+; CHECK-LABEL: define i32 @wave_shuffle_not_quite_row_share(
+; CHECK-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TID:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+; CHECK-NEXT: [[MASKED:%.*]] = and i32 [[TID]], 65280
+; CHECK-NEXT: [[SHARE_7:%.*]] = or disjoint i32 [[MASKED]], 55
+; CHECK-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[SHARE_7]])
+; CHECK-NEXT: ret i32 [[RES]]
+;
+ %tid = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+ %masked = and i32 %tid, 65280 ; 0xFF00
+ %or_res = or i32 %masked, 55 ; 0x37
+ %res = tail call i32 @llvm.amdgcn.wave.shuffle(i32 %val, i32 %or_res)
+ ret i32 %res
+}
+
+define i32 @wave_shuffle_workitem_row_share_14(i32 %val) {
+; CHECK-LABEL: define i32 @wave_shuffle_workitem_row_share_14(
+; CHECK-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 [[VAL]], i32 286, i32 15, i32 15, i1 false)
+; CHECK-NEXT: ret i32 [[RES]]
+;
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %masked = and i32 %tid, 65520 ; 0xFFF0
+ %share_14 = or i32 %masked, 14
+ %res = tail call i32 @llvm.amdgcn.wave.shuffle(i32 %val, i32 %share_14)
+ ret i32 %res
+}
+
+define i32 @wave_shuffle_workitem_row_share_14_no_mask(i32 %val) {
+; CHECK-LABEL: define i32 @wave_shuffle_workitem_row_share_14_no_mask(
+; CHECK-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TID:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[SHARE_14:%.*]] = or i32 [[TID]], 14
+; CHECK-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[SHARE_14]])
+; CHECK-NEXT: ret i32 [[RES]]
+;
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %share_14 = or i32 %tid, 14
+ %res = tail call i32 @llvm.amdgcn.wave.shuffle(i32 %val, i32 %share_14)
+ ret i32 %res
+}
+
+define i32 @wave_shuffle_workitem_row_share_15(i32 %val) {
+; CHECK-LABEL: define i32 @wave_shuffle_workitem_row_share_15(
+; CHECK-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 [[VAL]], i32 287, i32 15, i32 15, i1 false)
+; CHECK-NEXT: ret i32 [[RES]]
+;
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %masked = and i32 %tid, 65520 ; 0xFFF0
+ %share_15 = or i32 %masked, 15
+ %res = tail call i32 @llvm.amdgcn.wave.shuffle(i32 %val, i32 %share_15)
+ ret i32 %res
+}
+
+define i32 @wave_shuffle_workitem_row_share_15_no_mask(i32 %val) {
+; CHECK-LABEL: define i32 @wave_shuffle_workitem_row_share_15_no_mask(
+; CHECK-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 [[VAL]], i32 287, i32 15, i32 15, i1 false)
+; CHECK-NEXT: ret i32 [[RES]]
+;
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %share_15 = or i32 %tid, 15
+ %res = tail call i32 @llvm.amdgcn.wave.shuffle(i32 %val, i32 %share_15)
+ ret i32 %res
+}
>From a10413d49a3f78b3888d0c61a54cecbd75ca4ee5 Mon Sep 17 00:00:00 2001
From: Domenic Nutile <domenic.nutile at gmail.com>
Date: Thu, 22 Jan 2026 19:21:34 -0500
Subject: [PATCH 2/7] Improve w32 vs w64 handling, add more testing
---
.../AMDGPU/AMDGPUInstCombineIntrinsic.cpp | 50 ++--
.../AMDGPU/llvm.amdgcn.wave.shuffle.ll | 230 +++++++++++++-----
2 files changed, 202 insertions(+), 78 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index 7b39dc264a9d2..5c1d026796c70 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -555,7 +555,7 @@ static CallInst *rewriteCall(IRBuilderBase &B, CallInst &Old,
// Return true for sequences of instructions that effectively assign
// each lane to its thread ID
-bool isThreadID(Value *V) {
+bool isThreadID(const GCNSubtarget *ST, Value *V) {
// Case 1:
// wave32: mbcnt_lo(-1, 0)
// wave64: mbcnt_hi(-1, mbcnt_lo(-1, 0))
@@ -565,10 +565,10 @@ bool isThreadID(Value *V) {
auto W64Pred = m_Intrinsic<Intrinsic::amdgcn_mbcnt_hi>(
m_ConstantInt(HiMask), m_Intrinsic<Intrinsic::amdgcn_mbcnt_lo>(
m_ConstantInt(LoMask), m_ConstantInt(Input)));
- if (match(V, W32Pred) && LoMask->getSExtValue() == -1 &&
+ if (ST->isWave32() && match(V, W32Pred) && LoMask->getSExtValue() == -1 &&
Input->getZExtValue() == 0)
return true;
- if (match(V, W64Pred) && HiMask->getSExtValue() == -1 &&
+ if (ST->isWave64() && match(V, W64Pred) && HiMask->getSExtValue() == -1 &&
LoMask->getSExtValue() == -1 && Input->getZExtValue() == 0)
return true;
@@ -583,18 +583,24 @@ bool isThreadID(Value *V) {
// Attempt to capture situations where the index argument matches
// a DPP pattern, and convert to a DPP-based mov
-std::optional<Instruction *> tryWaveShuffleDPP(InstCombiner &IC,
- IntrinsicInst &II) {
+std::optional<Instruction *>
+tryWaveShuffleDPP(const GCNSubtarget *ST, InstCombiner &IC, IntrinsicInst &II) {
Value *Val = II.getArgOperand(0);
Value *Idx = II.getArgOperand(1);
auto &B = IC.Builder;
- // DPP16 Row Share 0: Idx = Tid & Mask where (Mask & 0x1F) = 0x10
+ // DPP16 Row Share 0: Idx = Tid & Mask
+ // wave32 requires Mask & 0x1F = 0x10
+ // wave64 requires Mask & 0x3F = 0x30
Value *Tid;
ConstantInt *Mask;
auto RowShare0Pred = m_And(m_Value(Tid), m_ConstantInt(Mask));
- if (match(Idx, RowShare0Pred) && isThreadID(Tid) &&
- (Mask->getZExtValue() & 0x1F) == 0x10) {
+ if (match(Idx, RowShare0Pred) && isThreadID(ST, Tid)) {
+ if (ST->isWave32() && (Mask->getZExtValue() & 0x1F) != 0x10)
+ return std::nullopt;
+ if (ST->isWave64() && (Mask->getZExtValue() & 0x3F) != 0x30)
+ return std::nullopt;
+
CallInst *UpdateDPP = B.CreateIntrinsic(
Intrinsic::amdgcn_update_dpp, Val->getType(),
{B.getInt32(0), Val, B.getInt32(AMDGPU::DPP::ROW_SHR0), B.getInt32(0xF),
@@ -604,17 +610,23 @@ std::optional<Instruction *> tryWaveShuffleDPP(InstCombiner &IC,
return IC.replaceInstUsesWith(II, UpdateDPP);
}
- // DPP16 Row Share (0 < Row < 15): Idx = (Tid & Mask) | Row where (Mask &
- // 0x1F) = 0x10
- ConstantInt *Row;
+ // DPP16 Row Share (0 < Row < 15): Idx = (Tid & Mask) | RowIdx
+ // wave32 requires Mask & 0x1F = 0x10
+ // wave64 requires Mask & 0x3F = 0x30
+ ConstantInt *RowIdx;
auto RowSharePred =
- m_Or(m_And(m_Value(Tid), m_ConstantInt(Mask)), m_ConstantInt(Row));
- if (match(Idx, RowSharePred) && isThreadID(Tid) &&
- (Mask->getZExtValue() & 0x1F) == 0x10 && Row->getZExtValue() < 15) {
+ m_Or(m_And(m_Value(Tid), m_ConstantInt(Mask)), m_ConstantInt(RowIdx));
+ if (match(Idx, RowSharePred) && isThreadID(ST, Tid) &&
+ RowIdx->getZExtValue() < 15 && RowIdx->getZExtValue() > 0) {
+ if (ST->isWave32() && (Mask->getZExtValue() & 0x1F) != 0x10)
+ return std::nullopt;
+ if (ST->isWave64() && (Mask->getZExtValue() & 0x3F) != 0x30)
+ return std::nullopt;
+
CallInst *UpdateDPP = B.CreateIntrinsic(
Intrinsic::amdgcn_update_dpp, Val->getType(),
{B.getInt32(0), Val,
- B.getInt32(AMDGPU::DPP::ROW_SHR0 | Row->getZExtValue()),
+ B.getInt32(AMDGPU::DPP::ROW_SHR0 | RowIdx->getZExtValue()),
B.getInt32(0xF), B.getInt32(0xF), B.getFalse()});
UpdateDPP->takeName(&II);
UpdateDPP->copyMetadata(II);
@@ -622,9 +634,9 @@ std::optional<Instruction *> tryWaveShuffleDPP(InstCombiner &IC,
}
// DPP16 Row Share 15: Idx = Tid | 0xF
- auto RowShare15Pred = m_Or(m_Value(Tid), m_ConstantInt(Row));
- if (match(Idx, RowShare15Pred) && isThreadID(Tid) &&
- Row->getZExtValue() == 15) {
+ auto RowShare15Pred = m_Or(m_Value(Tid), m_ConstantInt(RowIdx));
+ if (match(Idx, RowShare15Pred) && isThreadID(ST, Tid) &&
+ RowIdx->getZExtValue() == 15) {
CallInst *UpdateDPP = B.CreateIntrinsic(
Intrinsic::amdgcn_update_dpp, Val->getType(),
{B.getInt32(0), Val, B.getInt32(AMDGPU::DPP::ROW_SHR_LAST),
@@ -1848,7 +1860,7 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
if (!ST->hasDPP())
return std::nullopt;
- return tryWaveShuffleDPP(IC, II);
+ return tryWaveShuffleDPP(ST, IC, II);
}
}
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.wave.shuffle.ll b/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.wave.shuffle.ll
index 3161a0a55e274..1b826a5532640 100644
--- a/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.wave.shuffle.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.wave.shuffle.ll
@@ -1,79 +1,174 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -passes=instcombine -S < %s | FileCheck %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -passes=instcombine -S < %s | FileCheck -check-prefixes=CHECK-W32 %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+wavefrontsize64 -passes=instcombine -S < %s | FileCheck -check-prefixes=CHECK-W64 %s
-define i32 @wave_shuffle_self_select(i32 %val) {
-; CHECK-LABEL: define i32 @wave_shuffle_self_select(
-; CHECK-SAME: i32 [[VAL:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: [[TID:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
-; CHECK-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[TID]])
-; CHECK-NEXT: ret i32 [[RES]]
+define i32 @test_wave_shuffle_self_select(i32 %val) {
+; CHECK-W32-LABEL: define i32 @test_wave_shuffle_self_select(
+; CHECK-W32-SAME: i32 [[VAL:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-W32-NEXT: [[TID:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+; CHECK-W32-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[TID]])
+; CHECK-W32-NEXT: ret i32 [[RES]]
;
- %tid = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+; CHECK-W64-LABEL: define i32 @test_wave_shuffle_self_select(
+; CHECK-W64-SAME: i32 [[VAL:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-W64-NEXT: [[TID:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+; CHECK-W64-NEXT: [[TID1:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[TID]])
+; CHECK-W64-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[TID1]])
+; CHECK-W64-NEXT: ret i32 [[RES]]
+;
+ %lo = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+ %tid = tail call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
%res = tail call i32 @llvm.amdgcn.wave.shuffle(i32 %val, i32 %tid)
ret i32 %res
}
-define i32 @wave_shuffle_dpp_row_share_0(i32 %val) {
-; CHECK-LABEL: define i32 @wave_shuffle_dpp_row_share_0(
-; CHECK-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 [[VAL]], i32 272, i32 15, i32 15, i1 false)
-; CHECK-NEXT: ret i32 [[RES]]
+define i32 @test_wave_shuffle_dpp_row_share_0(i32 %val) {
+; CHECK-W32-LABEL: define i32 @test_wave_shuffle_dpp_row_share_0(
+; CHECK-W32-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-W32-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 [[VAL]], i32 272, i32 15, i32 15, i1 false)
+; CHECK-W32-NEXT: ret i32 [[RES]]
;
- %tid = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+; CHECK-W64-LABEL: define i32 @test_wave_shuffle_dpp_row_share_0(
+; CHECK-W64-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-W64-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 [[VAL]], i32 272, i32 15, i32 15, i1 false)
+; CHECK-W64-NEXT: ret i32 [[RES]]
+;
+ %lo = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+ %tid = tail call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
%masked = and i32 %tid, 65520 ; 0xFFF0
%share_0 = or i32 %masked, 0
%res = tail call i32 @llvm.amdgcn.wave.shuffle(i32 %val, i32 %share_0)
ret i32 %res
}
-define i32 @wave_shuffle_dpp_row_share_7(i32 %val) {
-; CHECK-LABEL: define i32 @wave_shuffle_dpp_row_share_7(
-; CHECK-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 [[VAL]], i32 279, i32 15, i32 15, i1 false)
-; CHECK-NEXT: ret i32 [[RES]]
+define i32 @test_wave_shuffle_dpp_row_share_7(i32 %val) {
+; CHECK-W32-LABEL: define i32 @test_wave_shuffle_dpp_row_share_7(
+; CHECK-W32-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-W32-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 [[VAL]], i32 279, i32 15, i32 15, i1 false)
+; CHECK-W32-NEXT: ret i32 [[RES]]
;
- %tid = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
- %masked = and i32 %tid, 65520 ; 0xFFF0
+; CHECK-W64-LABEL: define i32 @test_wave_shuffle_dpp_row_share_7(
+; CHECK-W64-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-W64-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 [[VAL]], i32 279, i32 15, i32 15, i1 false)
+; CHECK-W64-NEXT: ret i32 [[RES]]
+;
+ %lo = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+ %tid = tail call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
+ %masked = and i32 %tid, 48 ; 0x30
%share_7 = or i32 %masked, 7
%res = tail call i32 @llvm.amdgcn.wave.shuffle(i32 %val, i32 %share_7)
ret i32 %res
}
-define i32 @wave_shuffle_dpp_row_share_7_no_mask(i32 %val) {
-; CHECK-LABEL: define i32 @wave_shuffle_dpp_row_share_7_no_mask(
-; CHECK-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TID:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
-; CHECK-NEXT: [[SHARE_7:%.*]] = or i32 [[TID]], 7
-; CHECK-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[SHARE_7]])
-; CHECK-NEXT: ret i32 [[RES]]
+define i32 @test_wave_shuffle_dpp_row_share_7_no_mask(i32 %val) {
+; CHECK-W32-LABEL: define i32 @test_wave_shuffle_dpp_row_share_7_no_mask(
+; CHECK-W32-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-W32-NEXT: [[TID:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+; CHECK-W32-NEXT: [[SHARE_7:%.*]] = or i32 [[TID]], 7
+; CHECK-W32-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[SHARE_7]])
+; CHECK-W32-NEXT: ret i32 [[RES]]
;
- %tid = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+; CHECK-W64-LABEL: define i32 @test_wave_shuffle_dpp_row_share_7_no_mask(
+; CHECK-W64-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-W64-NEXT: [[TID:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+; CHECK-W64-NEXT: [[TID1:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[TID]])
+; CHECK-W64-NEXT: [[SHARE_7:%.*]] = or i32 [[TID1]], 7
+; CHECK-W64-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[SHARE_7]])
+; CHECK-W64-NEXT: ret i32 [[RES]]
+;
+ %lo = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+ %tid = tail call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
%share_7 = or i32 %tid, 7
%res = tail call i32 @llvm.amdgcn.wave.shuffle(i32 %val, i32 %share_7)
ret i32 %res
}
-define i32 @wave_shuffle_not_quite_row_share(i32 %val) {
-; CHECK-LABEL: define i32 @wave_shuffle_not_quite_row_share(
-; CHECK-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TID:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
-; CHECK-NEXT: [[MASKED:%.*]] = and i32 [[TID]], 65280
-; CHECK-NEXT: [[SHARE_7:%.*]] = or disjoint i32 [[MASKED]], 55
-; CHECK-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[SHARE_7]])
-; CHECK-NEXT: ret i32 [[RES]]
+; Doing both mbcnt.lo and mbcnt.hi works for both wave32 and wave64 because the
+; mbcnt.hi is optimized away for wave32. However, omitting mbcnt.hi should prevent
+; wave64 from optimizing to dpp.
+define i32 @test_wave_shuffle_dpp_row_share_7_lo_only(i32 %val) {
+; CHECK-W32-LABEL: define i32 @test_wave_shuffle_dpp_row_share_7_lo_only(
+; CHECK-W32-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-W32-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 [[VAL]], i32 279, i32 15, i32 15, i1 false)
+; CHECK-W32-NEXT: ret i32 [[RES]]
+;
+; CHECK-W64-LABEL: define i32 @test_wave_shuffle_dpp_row_share_7_lo_only(
+; CHECK-W64-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-W64-NEXT: [[TID:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+; CHECK-W64-NEXT: [[MASKED:%.*]] = and i32 [[TID]], 65520
+; CHECK-W64-NEXT: [[SHARE_7:%.*]] = or disjoint i32 [[MASKED]], 7
+; CHECK-W64-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[SHARE_7]])
+; CHECK-W64-NEXT: ret i32 [[RES]]
;
%tid = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+ %masked = and i32 %tid, 65520 ; 0xFFF0
+ %share_7 = or i32 %masked, 7
+ %res = tail call i32 @llvm.amdgcn.wave.shuffle(i32 %val, i32 %share_7)
+ ret i32 %res
+}
+
+; The mask requirements for wave32 and wave64 are slightly different since wave64
+; has 4 rows. This test has a mask that should only be valid for wave32 to be
+; optimized to dpp.
+define i32 @test_wave_shuffle_dpp_row_share_w32_mask(i32 %val) {
+; CHECK-W32-LABEL: define i32 @test_wave_shuffle_dpp_row_share_w32_mask(
+; CHECK-W32-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-W32-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 [[VAL]], i32 279, i32 15, i32 15, i1 false)
+; CHECK-W32-NEXT: ret i32 [[RES]]
+;
+; CHECK-W64-LABEL: define i32 @test_wave_shuffle_dpp_row_share_w32_mask(
+; CHECK-W64-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-W64-NEXT: [[LO:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+; CHECK-W64-NEXT: [[TID:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[LO]])
+; CHECK-W64-NEXT: [[MASKED:%.*]] = and i32 [[TID]], 16
+; CHECK-W64-NEXT: [[SHARE_7:%.*]] = or disjoint i32 [[MASKED]], 7
+; CHECK-W64-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[SHARE_7]])
+; CHECK-W64-NEXT: ret i32 [[RES]]
+;
+ %lo = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+ %tid = tail call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
+ %masked = and i32 %tid, 16 ; 0x10
+ %share_7 = or i32 %masked, 7
+ %res = tail call i32 @llvm.amdgcn.wave.shuffle(i32 %val, i32 %share_7)
+ ret i32 %res
+}
+
+define i32 @test_wave_shuffle_not_quite_row_share(i32 %val) {
+; CHECK-W32-LABEL: define i32 @test_wave_shuffle_not_quite_row_share(
+; CHECK-W32-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-W32-NEXT: [[TID:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+; CHECK-W32-NEXT: [[MASKED:%.*]] = and i32 [[TID]], 65280
+; CHECK-W32-NEXT: [[OR_RES:%.*]] = or disjoint i32 [[MASKED]], 55
+; CHECK-W32-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[OR_RES]])
+; CHECK-W32-NEXT: ret i32 [[RES]]
+;
+; CHECK-W64-LABEL: define i32 @test_wave_shuffle_not_quite_row_share(
+; CHECK-W64-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-W64-NEXT: [[TID:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+; CHECK-W64-NEXT: [[TID1:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[TID]])
+; CHECK-W64-NEXT: [[MASKED:%.*]] = and i32 [[TID1]], 65280
+; CHECK-W64-NEXT: [[OR_RES:%.*]] = or disjoint i32 [[MASKED]], 55
+; CHECK-W64-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[OR_RES]])
+; CHECK-W64-NEXT: ret i32 [[RES]]
+;
+ %lo = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+ %tid = tail call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
%masked = and i32 %tid, 65280 ; 0xFF00
%or_res = or i32 %masked, 55 ; 0x37
%res = tail call i32 @llvm.amdgcn.wave.shuffle(i32 %val, i32 %or_res)
ret i32 %res
}
-define i32 @wave_shuffle_workitem_row_share_14(i32 %val) {
-; CHECK-LABEL: define i32 @wave_shuffle_workitem_row_share_14(
-; CHECK-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 [[VAL]], i32 286, i32 15, i32 15, i1 false)
-; CHECK-NEXT: ret i32 [[RES]]
+define i32 @test_wave_shuffle_workitem_row_share_14(i32 %val) {
+; CHECK-W32-LABEL: define i32 @test_wave_shuffle_workitem_row_share_14(
+; CHECK-W32-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-W32-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 [[VAL]], i32 286, i32 15, i32 15, i1 false)
+; CHECK-W32-NEXT: ret i32 [[RES]]
+;
+; CHECK-W64-LABEL: define i32 @test_wave_shuffle_workitem_row_share_14(
+; CHECK-W64-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-W64-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 [[VAL]], i32 286, i32 15, i32 15, i1 false)
+; CHECK-W64-NEXT: ret i32 [[RES]]
;
%tid = tail call i32 @llvm.amdgcn.workitem.id.x()
%masked = and i32 %tid, 65520 ; 0xFFF0
@@ -82,13 +177,20 @@ define i32 @wave_shuffle_workitem_row_share_14(i32 %val) {
ret i32 %res
}
-define i32 @wave_shuffle_workitem_row_share_14_no_mask(i32 %val) {
-; CHECK-LABEL: define i32 @wave_shuffle_workitem_row_share_14_no_mask(
-; CHECK-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TID:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
-; CHECK-NEXT: [[SHARE_14:%.*]] = or i32 [[TID]], 14
-; CHECK-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[SHARE_14]])
-; CHECK-NEXT: ret i32 [[RES]]
+define i32 @test_wave_shuffle_workitem_row_share_14_no_mask(i32 %val) {
+; CHECK-W32-LABEL: define i32 @test_wave_shuffle_workitem_row_share_14_no_mask(
+; CHECK-W32-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-W32-NEXT: [[TID:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-W32-NEXT: [[SHARE_14:%.*]] = or i32 [[TID]], 14
+; CHECK-W32-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[SHARE_14]])
+; CHECK-W32-NEXT: ret i32 [[RES]]
+;
+; CHECK-W64-LABEL: define i32 @test_wave_shuffle_workitem_row_share_14_no_mask(
+; CHECK-W64-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-W64-NEXT: [[TID:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-W64-NEXT: [[SHARE_14:%.*]] = or i32 [[TID]], 14
+; CHECK-W64-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[SHARE_14]])
+; CHECK-W64-NEXT: ret i32 [[RES]]
;
%tid = tail call i32 @llvm.amdgcn.workitem.id.x()
%share_14 = or i32 %tid, 14
@@ -96,11 +198,16 @@ define i32 @wave_shuffle_workitem_row_share_14_no_mask(i32 %val) {
ret i32 %res
}
-define i32 @wave_shuffle_workitem_row_share_15(i32 %val) {
-; CHECK-LABEL: define i32 @wave_shuffle_workitem_row_share_15(
-; CHECK-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 [[VAL]], i32 287, i32 15, i32 15, i1 false)
-; CHECK-NEXT: ret i32 [[RES]]
+define i32 @test_wave_shuffle_workitem_row_share_15(i32 %val) {
+; CHECK-W32-LABEL: define i32 @test_wave_shuffle_workitem_row_share_15(
+; CHECK-W32-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-W32-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 [[VAL]], i32 287, i32 15, i32 15, i1 false)
+; CHECK-W32-NEXT: ret i32 [[RES]]
+;
+; CHECK-W64-LABEL: define i32 @test_wave_shuffle_workitem_row_share_15(
+; CHECK-W64-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-W64-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 [[VAL]], i32 287, i32 15, i32 15, i1 false)
+; CHECK-W64-NEXT: ret i32 [[RES]]
;
%tid = tail call i32 @llvm.amdgcn.workitem.id.x()
%masked = and i32 %tid, 65520 ; 0xFFF0
@@ -109,11 +216,16 @@ define i32 @wave_shuffle_workitem_row_share_15(i32 %val) {
ret i32 %res
}
-define i32 @wave_shuffle_workitem_row_share_15_no_mask(i32 %val) {
-; CHECK-LABEL: define i32 @wave_shuffle_workitem_row_share_15_no_mask(
-; CHECK-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 [[VAL]], i32 287, i32 15, i32 15, i1 false)
-; CHECK-NEXT: ret i32 [[RES]]
+define i32 @test_wave_shuffle_workitem_row_share_15_no_mask(i32 %val) {
+; CHECK-W32-LABEL: define i32 @test_wave_shuffle_workitem_row_share_15_no_mask(
+; CHECK-W32-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-W32-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 [[VAL]], i32 287, i32 15, i32 15, i1 false)
+; CHECK-W32-NEXT: ret i32 [[RES]]
+;
+; CHECK-W64-LABEL: define i32 @test_wave_shuffle_workitem_row_share_15_no_mask(
+; CHECK-W64-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-W64-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 [[VAL]], i32 287, i32 15, i32 15, i1 false)
+; CHECK-W64-NEXT: ret i32 [[RES]]
;
%tid = tail call i32 @llvm.amdgcn.workitem.id.x()
%share_15 = or i32 %tid, 15
>From 3a636beb5dc63566285b8e9b2b8d090ab8f9be40 Mon Sep 17 00:00:00 2001
From: Domenic Nutile <domenic.nutile at gmail.com>
Date: Tue, 27 Jan 2026 12:15:00 -0500
Subject: [PATCH 3/7] First round of PR feedback
Fix several issues raised in PR feedback; a few more things still need to be addressed.
---
.../AMDGPU/AMDGPUInstCombineIntrinsic.cpp | 136 +++++++++---------
.../AMDGPU/llvm.amdgcn.wave.shuffle.ll | 88 ++++++++++++
2 files changed, 156 insertions(+), 68 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index 5c1d026796c70..4f8b94ae386f7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -555,27 +555,24 @@ static CallInst *rewriteCall(IRBuilderBase &B, CallInst &Old,
// Return true for sequences of instructions that effectively assign
// each lane to its thread ID
-bool isThreadID(const GCNSubtarget *ST, Value *V) {
+static bool isThreadID(const GCNSubtarget &ST, Value *V) {
// Case 1:
// wave32: mbcnt_lo(-1, 0)
// wave64: mbcnt_hi(-1, mbcnt_lo(-1, 0))
- ConstantInt *HiMask, *LoMask, *Input;
- auto W32Pred = m_Intrinsic<Intrinsic::amdgcn_mbcnt_lo>(m_ConstantInt(LoMask),
- m_ConstantInt(Input));
+ auto W32Pred = m_Intrinsic<Intrinsic::amdgcn_mbcnt_lo>(m_ConstantInt<-1>(),
+ m_ConstantInt<0>());
auto W64Pred = m_Intrinsic<Intrinsic::amdgcn_mbcnt_hi>(
- m_ConstantInt(HiMask), m_Intrinsic<Intrinsic::amdgcn_mbcnt_lo>(
- m_ConstantInt(LoMask), m_ConstantInt(Input)));
- if (ST->isWave32() && match(V, W32Pred) && LoMask->getSExtValue() == -1 &&
- Input->getZExtValue() == 0)
+ m_ConstantInt<-1>(), m_Intrinsic<Intrinsic::amdgcn_mbcnt_lo>(
+ m_ConstantInt<-1>(), m_ConstantInt<0>()));
+ if (ST.isWave32() && match(V, W32Pred))
return true;
- if (ST->isWave64() && match(V, W64Pred) && HiMask->getSExtValue() == -1 &&
- LoMask->getSExtValue() == -1 && Input->getZExtValue() == 0)
+ if (ST.isWave64() && match(V, W64Pred))
return true;
// Case 2:
// workitem.x()
- CallInst *WIdX = dyn_cast<CallInst>(V);
- if (WIdX && WIdX->getIntrinsicID() == Intrinsic::amdgcn_workitem_id_x)
+ auto WIdXPred = m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>();
+ if (match(V, WIdXPred))
return true;
return false;
@@ -583,67 +580,70 @@ bool isThreadID(const GCNSubtarget *ST, Value *V) {
// Attempt to capture situations where the index argument matches
// a DPP pattern, and convert to a DPP-based mov
-std::optional<Instruction *>
-tryWaveShuffleDPP(const GCNSubtarget *ST, InstCombiner &IC, IntrinsicInst &II) {
+static std::optional<Instruction *>
+tryWaveShuffleDPP(const GCNSubtarget &ST, InstCombiner &IC, IntrinsicInst &II) {
Value *Val = II.getArgOperand(0);
Value *Idx = II.getArgOperand(1);
auto &B = IC.Builder;
- // DPP16 Row Share 0: Idx = Tid & Mask
- // wave32 requires Mask & 0x1F = 0x10
- // wave64 requires Mask & 0x3F = 0x30
- Value *Tid;
- ConstantInt *Mask;
- auto RowShare0Pred = m_And(m_Value(Tid), m_ConstantInt(Mask));
- if (match(Idx, RowShare0Pred) && isThreadID(ST, Tid)) {
- if (ST->isWave32() && (Mask->getZExtValue() & 0x1F) != 0x10)
- return std::nullopt;
- if (ST->isWave64() && (Mask->getZExtValue() & 0x3F) != 0x30)
- return std::nullopt;
+ // DPP16 Row Share requires GFX10 or later
+ if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
+ // DPP17 Row Share 0: Idx = Tid & Mask
+ // wave32 requires Mask & 0x1F = 0x10
+ // wave64 requires Mask & 0x3F = 0x30
+ Value *Tid;
+ uint64_t Mask;
+ auto RowShare0Pred = m_And(m_Value(Tid), m_ConstantInt(Mask));
+ if (match(Idx, RowShare0Pred) && isThreadID(ST, Tid)) {
+ if (ST.isWave32() && (Mask & 0x1F) != 0x10)
+ return std::nullopt;
+ if (ST.isWave64() && (Mask & 0x3F) != 0x30)
+ return std::nullopt;
- CallInst *UpdateDPP = B.CreateIntrinsic(
- Intrinsic::amdgcn_update_dpp, Val->getType(),
- {B.getInt32(0), Val, B.getInt32(AMDGPU::DPP::ROW_SHR0), B.getInt32(0xF),
- B.getInt32(0xF), B.getFalse()});
- UpdateDPP->takeName(&II);
- UpdateDPP->copyMetadata(II);
- return IC.replaceInstUsesWith(II, UpdateDPP);
- }
-
- // DPP16 Row Share (0 < Row < 15): Idx = (Tid & Mask) | RowIdx
- // wave32 requires Mask & 0x1F = 0x10
- // wave64 requires Mask & 0x3F = 0x30
- ConstantInt *RowIdx;
- auto RowSharePred =
- m_Or(m_And(m_Value(Tid), m_ConstantInt(Mask)), m_ConstantInt(RowIdx));
- if (match(Idx, RowSharePred) && isThreadID(ST, Tid) &&
- RowIdx->getZExtValue() < 15 && RowIdx->getZExtValue() > 0) {
- if (ST->isWave32() && (Mask->getZExtValue() & 0x1F) != 0x10)
- return std::nullopt;
- if (ST->isWave64() && (Mask->getZExtValue() & 0x3F) != 0x30)
- return std::nullopt;
+ CallInst *UpdateDPP = B.CreateIntrinsic(
+ Intrinsic::amdgcn_update_dpp, Val->getType(),
+ {B.getInt32(0), Val, B.getInt32(AMDGPU::DPP::ROW_SHR0),
+ B.getInt32(0xF), B.getInt32(0xF), B.getFalse()});
+ UpdateDPP->takeName(&II);
+ UpdateDPP->copyMetadata(II);
+ return IC.replaceInstUsesWith(II, UpdateDPP);
+ }
+
+ // DPP16 Row Share (0 < Row < 15): Idx = (Tid & Mask) | RowIdx
+ // wave32 requires Mask & 0x1F = 0x10
+ // wave64 requires Mask & 0x3F = 0x30
+ ConstantInt *RowIdx;
+ auto RowSharePred =
+ m_Or(m_And(m_Value(Tid), m_ConstantInt(Mask)), m_ConstantInt(RowIdx));
+ if (match(Idx, RowSharePred) && isThreadID(ST, Tid) &&
+ RowIdx->getZExtValue() < 15 && RowIdx->getZExtValue() > 0) {
+ if (ST.isWave32() && (Mask & 0x1F) != 0x10)
+ return std::nullopt;
+ if (ST.isWave64() && (Mask & 0x3F) != 0x30)
+ return std::nullopt;
- CallInst *UpdateDPP = B.CreateIntrinsic(
- Intrinsic::amdgcn_update_dpp, Val->getType(),
- {B.getInt32(0), Val,
- B.getInt32(AMDGPU::DPP::ROW_SHR0 | RowIdx->getZExtValue()),
- B.getInt32(0xF), B.getInt32(0xF), B.getFalse()});
- UpdateDPP->takeName(&II);
- UpdateDPP->copyMetadata(II);
- return IC.replaceInstUsesWith(II, UpdateDPP);
- }
-
- // DPP16 Row Share 15: Idx = Tid | 0xF
- auto RowShare15Pred = m_Or(m_Value(Tid), m_ConstantInt(RowIdx));
- if (match(Idx, RowShare15Pred) && isThreadID(ST, Tid) &&
- RowIdx->getZExtValue() == 15) {
- CallInst *UpdateDPP = B.CreateIntrinsic(
- Intrinsic::amdgcn_update_dpp, Val->getType(),
- {B.getInt32(0), Val, B.getInt32(AMDGPU::DPP::ROW_SHR_LAST),
- B.getInt32(0xF), B.getInt32(0xF), B.getFalse()});
- UpdateDPP->takeName(&II);
- UpdateDPP->copyMetadata(II);
- return IC.replaceInstUsesWith(II, UpdateDPP);
+ CallInst *UpdateDPP = B.CreateIntrinsic(
+ Intrinsic::amdgcn_update_dpp, Val->getType(),
+ {B.getInt32(0), Val,
+ B.getInt32(AMDGPU::DPP::ROW_SHR0 | RowIdx->getZExtValue()),
+ B.getInt32(0xF), B.getInt32(0xF), B.getFalse()});
+ UpdateDPP->takeName(&II);
+ UpdateDPP->copyMetadata(II);
+ return IC.replaceInstUsesWith(II, UpdateDPP);
+ }
+
+ // DPP16 Row Share 15: Idx = Tid | 0xF
+ auto RowShare15Pred = m_Or(m_Value(Tid), m_ConstantInt(RowIdx));
+ if (match(Idx, RowShare15Pred) && isThreadID(ST, Tid) &&
+ RowIdx->getZExtValue() == 15) {
+ CallInst *UpdateDPP = B.CreateIntrinsic(
+ Intrinsic::amdgcn_update_dpp, Val->getType(),
+ {B.getInt32(0), Val, B.getInt32(AMDGPU::DPP::ROW_SHR_LAST),
+ B.getInt32(0xF), B.getInt32(0xF), B.getFalse()});
+ UpdateDPP->takeName(&II);
+ UpdateDPP->copyMetadata(II);
+ return IC.replaceInstUsesWith(II, UpdateDPP);
+ }
}
// No valid DPP detected
@@ -1860,7 +1860,7 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
if (!ST->hasDPP())
return std::nullopt;
- return tryWaveShuffleDPP(ST, IC, II);
+ return tryWaveShuffleDPP(*ST, IC, II);
}
}
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.wave.shuffle.ll b/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.wave.shuffle.ll
index 1b826a5532640..143ec450c0b69 100644
--- a/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.wave.shuffle.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.wave.shuffle.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -passes=instcombine -S < %s | FileCheck -check-prefixes=CHECK-W32 %s
; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+wavefrontsize64 -passes=instcombine -S < %s | FileCheck -check-prefixes=CHECK-W64 %s
+; RUN: opt -mtriple=amdgcn-- -mattr=+dpp -passes=instcombine -S < %s | FileCheck -check-prefixes=CHECK-DPP %s
define i32 @test_wave_shuffle_self_select(i32 %val) {
; CHECK-W32-LABEL: define i32 @test_wave_shuffle_self_select(
@@ -15,6 +16,13 @@ define i32 @test_wave_shuffle_self_select(i32 %val) {
; CHECK-W64-NEXT: [[TID1:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[TID]])
; CHECK-W64-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[TID1]])
; CHECK-W64-NEXT: ret i32 [[RES]]
+;
+; CHECK-DPP-LABEL: define i32 @test_wave_shuffle_self_select(
+; CHECK-DPP-SAME: i32 [[VAL:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-DPP-NEXT: [[LO:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+; CHECK-DPP-NEXT: [[TID:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[LO]])
+; CHECK-DPP-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[TID]])
+; CHECK-DPP-NEXT: ret i32 [[RES]]
;
%lo = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
%tid = tail call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
@@ -32,6 +40,14 @@ define i32 @test_wave_shuffle_dpp_row_share_0(i32 %val) {
; CHECK-W64-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
; CHECK-W64-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 [[VAL]], i32 272, i32 15, i32 15, i1 false)
; CHECK-W64-NEXT: ret i32 [[RES]]
+;
+; CHECK-DPP-LABEL: define i32 @test_wave_shuffle_dpp_row_share_0(
+; CHECK-DPP-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-DPP-NEXT: [[LO:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+; CHECK-DPP-NEXT: [[TID:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[LO]])
+; CHECK-DPP-NEXT: [[MASKED:%.*]] = and i32 [[TID]], 65520
+; CHECK-DPP-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[MASKED]])
+; CHECK-DPP-NEXT: ret i32 [[RES]]
;
%lo = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
%tid = tail call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
@@ -51,6 +67,15 @@ define i32 @test_wave_shuffle_dpp_row_share_7(i32 %val) {
; CHECK-W64-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
; CHECK-W64-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 [[VAL]], i32 279, i32 15, i32 15, i1 false)
; CHECK-W64-NEXT: ret i32 [[RES]]
+;
+; CHECK-DPP-LABEL: define i32 @test_wave_shuffle_dpp_row_share_7(
+; CHECK-DPP-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-DPP-NEXT: [[LO:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+; CHECK-DPP-NEXT: [[TID:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[LO]])
+; CHECK-DPP-NEXT: [[MASKED:%.*]] = and i32 [[TID]], 48
+; CHECK-DPP-NEXT: [[SHARE_7:%.*]] = or disjoint i32 [[MASKED]], 7
+; CHECK-DPP-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[SHARE_7]])
+; CHECK-DPP-NEXT: ret i32 [[RES]]
;
%lo = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
%tid = tail call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
@@ -75,6 +100,14 @@ define i32 @test_wave_shuffle_dpp_row_share_7_no_mask(i32 %val) {
; CHECK-W64-NEXT: [[SHARE_7:%.*]] = or i32 [[TID1]], 7
; CHECK-W64-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[SHARE_7]])
; CHECK-W64-NEXT: ret i32 [[RES]]
+;
+; CHECK-DPP-LABEL: define i32 @test_wave_shuffle_dpp_row_share_7_no_mask(
+; CHECK-DPP-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-DPP-NEXT: [[LO:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+; CHECK-DPP-NEXT: [[TID:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[LO]])
+; CHECK-DPP-NEXT: [[SHARE_7:%.*]] = or i32 [[TID]], 7
+; CHECK-DPP-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[SHARE_7]])
+; CHECK-DPP-NEXT: ret i32 [[RES]]
;
%lo = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
%tid = tail call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
@@ -99,6 +132,14 @@ define i32 @test_wave_shuffle_dpp_row_share_7_lo_only(i32 %val) {
; CHECK-W64-NEXT: [[SHARE_7:%.*]] = or disjoint i32 [[MASKED]], 7
; CHECK-W64-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[SHARE_7]])
; CHECK-W64-NEXT: ret i32 [[RES]]
+;
+; CHECK-DPP-LABEL: define i32 @test_wave_shuffle_dpp_row_share_7_lo_only(
+; CHECK-DPP-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-DPP-NEXT: [[TID:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+; CHECK-DPP-NEXT: [[MASKED:%.*]] = and i32 [[TID]], 65520
+; CHECK-DPP-NEXT: [[SHARE_7:%.*]] = or disjoint i32 [[MASKED]], 7
+; CHECK-DPP-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[SHARE_7]])
+; CHECK-DPP-NEXT: ret i32 [[RES]]
;
%tid = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
%masked = and i32 %tid, 65520 ; 0xFFF0
@@ -124,6 +165,15 @@ define i32 @test_wave_shuffle_dpp_row_share_w32_mask(i32 %val) {
; CHECK-W64-NEXT: [[SHARE_7:%.*]] = or disjoint i32 [[MASKED]], 7
; CHECK-W64-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[SHARE_7]])
; CHECK-W64-NEXT: ret i32 [[RES]]
+;
+; CHECK-DPP-LABEL: define i32 @test_wave_shuffle_dpp_row_share_w32_mask(
+; CHECK-DPP-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-DPP-NEXT: [[LO:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+; CHECK-DPP-NEXT: [[TID:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[LO]])
+; CHECK-DPP-NEXT: [[MASKED:%.*]] = and i32 [[TID]], 16
+; CHECK-DPP-NEXT: [[SHARE_7:%.*]] = or disjoint i32 [[MASKED]], 7
+; CHECK-DPP-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[SHARE_7]])
+; CHECK-DPP-NEXT: ret i32 [[RES]]
;
%lo = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
%tid = tail call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
@@ -150,6 +200,15 @@ define i32 @test_wave_shuffle_not_quite_row_share(i32 %val) {
; CHECK-W64-NEXT: [[OR_RES:%.*]] = or disjoint i32 [[MASKED]], 55
; CHECK-W64-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[OR_RES]])
; CHECK-W64-NEXT: ret i32 [[RES]]
+;
+; CHECK-DPP-LABEL: define i32 @test_wave_shuffle_not_quite_row_share(
+; CHECK-DPP-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-DPP-NEXT: [[LO:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+; CHECK-DPP-NEXT: [[TID:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[LO]])
+; CHECK-DPP-NEXT: [[MASKED:%.*]] = and i32 [[TID]], 65280
+; CHECK-DPP-NEXT: [[OR_RES:%.*]] = or disjoint i32 [[MASKED]], 55
+; CHECK-DPP-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[OR_RES]])
+; CHECK-DPP-NEXT: ret i32 [[RES]]
;
%lo = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
%tid = tail call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
@@ -169,6 +228,14 @@ define i32 @test_wave_shuffle_workitem_row_share_14(i32 %val) {
; CHECK-W64-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
; CHECK-W64-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 [[VAL]], i32 286, i32 15, i32 15, i1 false)
; CHECK-W64-NEXT: ret i32 [[RES]]
+;
+; CHECK-DPP-LABEL: define i32 @test_wave_shuffle_workitem_row_share_14(
+; CHECK-DPP-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-DPP-NEXT: [[TID:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-DPP-NEXT: [[MASKED:%.*]] = and i32 [[TID]], 1008
+; CHECK-DPP-NEXT: [[SHARE_14:%.*]] = or disjoint i32 [[MASKED]], 14
+; CHECK-DPP-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[SHARE_14]])
+; CHECK-DPP-NEXT: ret i32 [[RES]]
;
%tid = tail call i32 @llvm.amdgcn.workitem.id.x()
%masked = and i32 %tid, 65520 ; 0xFFF0
@@ -191,6 +258,13 @@ define i32 @test_wave_shuffle_workitem_row_share_14_no_mask(i32 %val) {
; CHECK-W64-NEXT: [[SHARE_14:%.*]] = or i32 [[TID]], 14
; CHECK-W64-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[SHARE_14]])
; CHECK-W64-NEXT: ret i32 [[RES]]
+;
+; CHECK-DPP-LABEL: define i32 @test_wave_shuffle_workitem_row_share_14_no_mask(
+; CHECK-DPP-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-DPP-NEXT: [[TID:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-DPP-NEXT: [[SHARE_14:%.*]] = or i32 [[TID]], 14
+; CHECK-DPP-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[SHARE_14]])
+; CHECK-DPP-NEXT: ret i32 [[RES]]
;
%tid = tail call i32 @llvm.amdgcn.workitem.id.x()
%share_14 = or i32 %tid, 14
@@ -208,6 +282,13 @@ define i32 @test_wave_shuffle_workitem_row_share_15(i32 %val) {
; CHECK-W64-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
; CHECK-W64-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 [[VAL]], i32 287, i32 15, i32 15, i1 false)
; CHECK-W64-NEXT: ret i32 [[RES]]
+;
+; CHECK-DPP-LABEL: define i32 @test_wave_shuffle_workitem_row_share_15(
+; CHECK-DPP-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-DPP-NEXT: [[TID:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-DPP-NEXT: [[SHARE_15:%.*]] = or i32 [[TID]], 15
+; CHECK-DPP-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[SHARE_15]])
+; CHECK-DPP-NEXT: ret i32 [[RES]]
;
%tid = tail call i32 @llvm.amdgcn.workitem.id.x()
%masked = and i32 %tid, 65520 ; 0xFFF0
@@ -226,6 +307,13 @@ define i32 @test_wave_shuffle_workitem_row_share_15_no_mask(i32 %val) {
; CHECK-W64-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
; CHECK-W64-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 [[VAL]], i32 287, i32 15, i32 15, i1 false)
; CHECK-W64-NEXT: ret i32 [[RES]]
+;
+; CHECK-DPP-LABEL: define i32 @test_wave_shuffle_workitem_row_share_15_no_mask(
+; CHECK-DPP-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-DPP-NEXT: [[TID:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-DPP-NEXT: [[SHARE_15:%.*]] = or i32 [[TID]], 15
+; CHECK-DPP-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[SHARE_15]])
+; CHECK-DPP-NEXT: ret i32 [[RES]]
;
%tid = tail call i32 @llvm.amdgcn.workitem.id.x()
%share_15 = or i32 %tid, 15
>From f4bb45e0e33f221f7c14af7c283d4eed786f2441 Mon Sep 17 00:00:00 2001
From: Domenic Nutile <domenic.nutile at gmail.com>
Date: Tue, 27 Jan 2026 16:16:31 -0500
Subject: [PATCH 4/7] Further PR feedback updates
In this change, I simplified the logic in tryWaveShuffleDPP to cut down on duplicate code. I also added more context for the row share 0 test case, and added another test case to show the intended behavior being tested.
---
.../AMDGPU/AMDGPUInstCombineIntrinsic.cpp | 60 ++++++++-----------
.../AMDGPU/llvm.amdgcn.wave.shuffle.ll | 29 +++++++++
2 files changed, 55 insertions(+), 34 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index 4f8b94ae386f7..4605e91605371 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -588,57 +588,49 @@ tryWaveShuffleDPP(const GCNSubtarget &ST, InstCombiner &IC, IntrinsicInst &II) {
// DPP16 Row Share requires GFX10 or later
if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
- // DPP17 Row Share 0: Idx = Tid & Mask
- // wave32 requires Mask & 0x1F = 0x10
- // wave64 requires Mask & 0x3F = 0x30
Value *Tid;
uint64_t Mask;
+ uint64_t RowIdx = 0;
+ bool CanDPP16Optimize = false;
+
+ // DPP16 Row Share 0: Idx = Tid & Mask
auto RowShare0Pred = m_And(m_Value(Tid), m_ConstantInt(Mask));
+
+ // DPP16 Row Share (0 < Row < 15): Idx = (Tid & Mask) | RowIdx
+ auto RowSharePred =
+ m_Or(m_And(m_Value(Tid), m_ConstantInt(Mask)), m_ConstantInt(RowIdx));
+
+ // DPP16 Row Share 15: Idx = Tid | 0xF
+ auto RowShare15Pred = m_Or(m_Value(Tid), m_ConstantInt(RowIdx));
+
if (match(Idx, RowShare0Pred) && isThreadID(ST, Tid)) {
+ // wave32 requires Mask & 0x1F = 0x10
if (ST.isWave32() && (Mask & 0x1F) != 0x10)
return std::nullopt;
+ // wave64 requires Mask & 0x3F = 0x30
if (ST.isWave64() && (Mask & 0x3F) != 0x30)
return std::nullopt;
-
- CallInst *UpdateDPP = B.CreateIntrinsic(
- Intrinsic::amdgcn_update_dpp, Val->getType(),
- {B.getInt32(0), Val, B.getInt32(AMDGPU::DPP::ROW_SHR0),
- B.getInt32(0xF), B.getInt32(0xF), B.getFalse()});
- UpdateDPP->takeName(&II);
- UpdateDPP->copyMetadata(II);
- return IC.replaceInstUsesWith(II, UpdateDPP);
+ CanDPP16Optimize = true;
}
-
- // DPP16 Row Share (0 < Row < 15): Idx = (Tid & Mask) | RowIdx
- // wave32 requires Mask & 0x1F = 0x10
- // wave64 requires Mask & 0x3F = 0x30
- ConstantInt *RowIdx;
- auto RowSharePred =
- m_Or(m_And(m_Value(Tid), m_ConstantInt(Mask)), m_ConstantInt(RowIdx));
- if (match(Idx, RowSharePred) && isThreadID(ST, Tid) &&
- RowIdx->getZExtValue() < 15 && RowIdx->getZExtValue() > 0) {
+ else if (match(Idx, RowSharePred) && isThreadID(ST, Tid) &&
+ RowIdx < 15 && RowIdx > 0) {
+ // wave32 requires Mask & 0x1F = 0x10
if (ST.isWave32() && (Mask & 0x1F) != 0x10)
return std::nullopt;
+ // wave64 requires Mask & 0x3F = 0x30
if (ST.isWave64() && (Mask & 0x3F) != 0x30)
return std::nullopt;
-
- CallInst *UpdateDPP = B.CreateIntrinsic(
- Intrinsic::amdgcn_update_dpp, Val->getType(),
- {B.getInt32(0), Val,
- B.getInt32(AMDGPU::DPP::ROW_SHR0 | RowIdx->getZExtValue()),
- B.getInt32(0xF), B.getInt32(0xF), B.getFalse()});
- UpdateDPP->takeName(&II);
- UpdateDPP->copyMetadata(II);
- return IC.replaceInstUsesWith(II, UpdateDPP);
+ CanDPP16Optimize = true;
+ }
+ else if (match(Idx, RowShare15Pred) && isThreadID(ST, Tid) &&
+ RowIdx == 15) {
+ CanDPP16Optimize = true;
}
- // DPP16 Row Share 15: Idx = Tid | 0xF
- auto RowShare15Pred = m_Or(m_Value(Tid), m_ConstantInt(RowIdx));
- if (match(Idx, RowShare15Pred) && isThreadID(ST, Tid) &&
- RowIdx->getZExtValue() == 15) {
+ if (CanDPP16Optimize) {
CallInst *UpdateDPP = B.CreateIntrinsic(
Intrinsic::amdgcn_update_dpp, Val->getType(),
- {B.getInt32(0), Val, B.getInt32(AMDGPU::DPP::ROW_SHR_LAST),
+ {B.getInt32(0), Val, B.getInt32(AMDGPU::DPP::ROW_SHR0 | RowIdx),
B.getInt32(0xF), B.getInt32(0xF), B.getFalse()});
UpdateDPP->takeName(&II);
UpdateDPP->copyMetadata(II);
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.wave.shuffle.ll b/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.wave.shuffle.ll
index 143ec450c0b69..0a18260bccd13 100644
--- a/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.wave.shuffle.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.wave.shuffle.ll
@@ -30,6 +30,9 @@ define i32 @test_wave_shuffle_self_select(i32 %val) {
ret i32 %res
}
+; In the Row Share 0 case, the logic is the same with and without the or.
+; In fact, the or will likely be optimized out before reaching the DPP
+; optimization step anyway. So this case should work with or without the or
define i32 @test_wave_shuffle_dpp_row_share_0(i32 %val) {
; CHECK-W32-LABEL: define i32 @test_wave_shuffle_dpp_row_share_0(
; CHECK-W32-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
@@ -57,6 +60,32 @@ define i32 @test_wave_shuffle_dpp_row_share_0(i32 %val) {
ret i32 %res
}
+define i32 @test_wave_shuffle_dpp_row_share_0_no_or(i32 %val) {
+; CHECK-W32-LABEL: define i32 @test_wave_shuffle_dpp_row_share_0_no_or(
+; CHECK-W32-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-W32-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 [[VAL]], i32 272, i32 15, i32 15, i1 false)
+; CHECK-W32-NEXT: ret i32 [[RES]]
+;
+; CHECK-W64-LABEL: define i32 @test_wave_shuffle_dpp_row_share_0_no_or(
+; CHECK-W64-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-W64-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 [[VAL]], i32 272, i32 15, i32 15, i1 false)
+; CHECK-W64-NEXT: ret i32 [[RES]]
+;
+; CHECK-DPP-LABEL: define i32 @test_wave_shuffle_dpp_row_share_0_no_or(
+; CHECK-DPP-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-DPP-NEXT: [[LO:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+; CHECK-DPP-NEXT: [[TID:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[LO]])
+; CHECK-DPP-NEXT: [[MASKED:%.*]] = and i32 [[TID]], 65520
+; CHECK-DPP-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[MASKED]])
+; CHECK-DPP-NEXT: ret i32 [[RES]]
+;
+ %lo = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+ %tid = tail call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
+ %masked = and i32 %tid, 65520 ; 0xFFF0
+ %res = tail call i32 @llvm.amdgcn.wave.shuffle(i32 %val, i32 %masked)
+ ret i32 %res
+}
+
define i32 @test_wave_shuffle_dpp_row_share_7(i32 %val) {
; CHECK-W32-LABEL: define i32 @test_wave_shuffle_dpp_row_share_7(
; CHECK-W32-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
>From 93b628e3336a5cad4e117a6503c5eeae6496b84d Mon Sep 17 00:00:00 2001
From: Domenic Nutile <domenic.nutile at gmail.com>
Date: Tue, 27 Jan 2026 16:20:19 -0500
Subject: [PATCH 5/7] Rename variable for clarity
---
llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index 4605e91605371..123ef493ec545 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -591,7 +591,7 @@ tryWaveShuffleDPP(const GCNSubtarget &ST, InstCombiner &IC, IntrinsicInst &II) {
Value *Tid;
uint64_t Mask;
uint64_t RowIdx = 0;
- bool CanDPP16Optimize = false;
+ bool CanDPP16RowShare = false;
// DPP16 Row Share 0: Idx = Tid & Mask
auto RowShare0Pred = m_And(m_Value(Tid), m_ConstantInt(Mask));
@@ -610,7 +610,7 @@ tryWaveShuffleDPP(const GCNSubtarget &ST, InstCombiner &IC, IntrinsicInst &II) {
// wave64 requires Mask & 0x3F = 0x30
if (ST.isWave64() && (Mask & 0x3F) != 0x30)
return std::nullopt;
- CanDPP16Optimize = true;
+ CanDPP16RowShare = true;
}
else if (match(Idx, RowSharePred) && isThreadID(ST, Tid) &&
RowIdx < 15 && RowIdx > 0) {
@@ -620,14 +620,14 @@ tryWaveShuffleDPP(const GCNSubtarget &ST, InstCombiner &IC, IntrinsicInst &II) {
// wave64 requires Mask & 0x3F = 0x30
if (ST.isWave64() && (Mask & 0x3F) != 0x30)
return std::nullopt;
- CanDPP16Optimize = true;
+ CanDPP16RowShare = true;
}
else if (match(Idx, RowShare15Pred) && isThreadID(ST, Tid) &&
RowIdx == 15) {
- CanDPP16Optimize = true;
+ CanDPP16RowShare = true;
}
- if (CanDPP16Optimize) {
+ if (CanDPP16RowShare) {
CallInst *UpdateDPP = B.CreateIntrinsic(
Intrinsic::amdgcn_update_dpp, Val->getType(),
{B.getInt32(0), Val, B.getInt32(AMDGPU::DPP::ROW_SHR0 | RowIdx),
>From 9daf585fdc9f28c9925e8e05c4dff632a0db0426 Mon Sep 17 00:00:00 2001
From: Domenic Nutile <domenic.nutile at gmail.com>
Date: Tue, 27 Jan 2026 16:26:06 -0500
Subject: [PATCH 6/7] Clang format fix
---
llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp | 10 ++++------
1 file changed, 4 insertions(+), 6 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index 123ef493ec545..e64e9a454b9ef 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -611,9 +611,8 @@ tryWaveShuffleDPP(const GCNSubtarget &ST, InstCombiner &IC, IntrinsicInst &II) {
if (ST.isWave64() && (Mask & 0x3F) != 0x30)
return std::nullopt;
CanDPP16RowShare = true;
- }
- else if (match(Idx, RowSharePred) && isThreadID(ST, Tid) &&
- RowIdx < 15 && RowIdx > 0) {
+ } else if (match(Idx, RowSharePred) && isThreadID(ST, Tid) && RowIdx < 15 &&
+ RowIdx > 0) {
// wave32 requires Mask & 0x1F = 0x10
if (ST.isWave32() && (Mask & 0x1F) != 0x10)
return std::nullopt;
@@ -621,9 +620,8 @@ tryWaveShuffleDPP(const GCNSubtarget &ST, InstCombiner &IC, IntrinsicInst &II) {
if (ST.isWave64() && (Mask & 0x3F) != 0x30)
return std::nullopt;
CanDPP16RowShare = true;
- }
- else if (match(Idx, RowShare15Pred) && isThreadID(ST, Tid) &&
- RowIdx == 15) {
+ } else if (match(Idx, RowShare15Pred) && isThreadID(ST, Tid) &&
+ RowIdx == 15) {
CanDPP16RowShare = true;
}
>From e3c0095a99af2ea067221af254b100c4498ad1db Mon Sep 17 00:00:00 2001
From: Domenic Nutile <domenic.nutile at gmail.com>
Date: Mon, 2 Feb 2026 10:33:49 -0500
Subject: [PATCH 7/7] Code deduplication, testing clarity
Address further PR feedback to make the code clearer and reduce duplication. Change the DPP call to use a poison value for the out-of-bounds case. Update the negative tests for clarity.
---
.../AMDGPU/AMDGPUInstCombineIntrinsic.cpp | 34 +--
llvm/lib/Target/AMDGPU/GCNSubtarget.h | 2 +
.../AMDGPU/llvm.amdgcn.wave.shuffle.ll | 198 +++++++++---------
3 files changed, 119 insertions(+), 115 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index e64e9a454b9ef..1d6ab3f54fdfc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -586,13 +586,18 @@ tryWaveShuffleDPP(const GCNSubtarget &ST, InstCombiner &IC, IntrinsicInst &II) {
Value *Idx = II.getArgOperand(1);
auto &B = IC.Builder;
- // DPP16 Row Share requires GFX10 or later
- if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
+ // DPP16 Row Share requires known wave size, architecture support
+ if (ST.isWaveSizeKnown() && ST.hasDPPRowShare()) {
Value *Tid;
uint64_t Mask;
- uint64_t RowIdx = 0;
+ uint64_t RowIdx;
bool CanDPP16RowShare = false;
+ // wave32 requires Mask & 0x1F == 0x10
+ // wave64 requires Mask & 0x3F == 0x30
+ uint64_t MaskCheck = (1UL << ST.getWavefrontSizeLog2()) - 1;
+ uint64_t MaskTarget = MaskCheck & 0xF0;
+
// DPP16 Row Share 0: Idx = Tid & Mask
auto RowShare0Pred = m_And(m_Value(Tid), m_ConstantInt(Mask));
@@ -601,34 +606,29 @@ tryWaveShuffleDPP(const GCNSubtarget &ST, InstCombiner &IC, IntrinsicInst &II) {
m_Or(m_And(m_Value(Tid), m_ConstantInt(Mask)), m_ConstantInt(RowIdx));
// DPP16 Row Share 15: Idx = Tid | 0xF
- auto RowShare15Pred = m_Or(m_Value(Tid), m_ConstantInt(RowIdx));
+ auto RowShare15Pred = m_Or(m_Value(Tid), m_ConstantInt<0xF>());
if (match(Idx, RowShare0Pred) && isThreadID(ST, Tid)) {
- // wave32 requires Mask & 0x1F = 0x10
- if (ST.isWave32() && (Mask & 0x1F) != 0x10)
- return std::nullopt;
- // wave64 requires Mask & 0x3F = 0x30
- if (ST.isWave64() && (Mask & 0x3F) != 0x30)
+ if ((Mask & MaskCheck) != MaskTarget)
return std::nullopt;
+
+ RowIdx = 0;
CanDPP16RowShare = true;
} else if (match(Idx, RowSharePred) && isThreadID(ST, Tid) && RowIdx < 15 &&
RowIdx > 0) {
- // wave32 requires Mask & 0x1F = 0x10
- if (ST.isWave32() && (Mask & 0x1F) != 0x10)
- return std::nullopt;
- // wave64 requires Mask & 0x3F = 0x30
- if (ST.isWave64() && (Mask & 0x3F) != 0x30)
+ if ((Mask & MaskCheck) != MaskTarget)
return std::nullopt;
+
CanDPP16RowShare = true;
- } else if (match(Idx, RowShare15Pred) && isThreadID(ST, Tid) &&
- RowIdx == 15) {
+ } else if (match(Idx, RowShare15Pred) && isThreadID(ST, Tid)) {
+ RowIdx = 15;
CanDPP16RowShare = true;
}
if (CanDPP16RowShare) {
CallInst *UpdateDPP = B.CreateIntrinsic(
Intrinsic::amdgcn_update_dpp, Val->getType(),
- {B.getInt32(0), Val, B.getInt32(AMDGPU::DPP::ROW_SHR0 | RowIdx),
+ {PoisonValue::get(Val->getType()), Val, B.getInt32(AMDGPU::DPP::ROW_SHARE0 | RowIdx),
B.getInt32(0xF), B.getInt32(0xF), B.getFalse()});
UpdateDPP->takeName(&II);
UpdateDPP->copyMetadata(II);
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index d27a7384a7da1..502d9391f02b1 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -475,6 +475,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
return HasDPP && getGeneration() < GFX10;
}
+ bool hasDPPRowShare() const { return HasDPP && (HasGFX90AInsts || getGeneration() >= GFX10); }
+
// Has V_PK_MOV_B32 opcode
bool hasPkMovB32() const { return HasGFX90AInsts; }
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.wave.shuffle.ll b/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.wave.shuffle.ll
index 0a18260bccd13..c38f21c02276b 100644
--- a/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.wave.shuffle.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.wave.shuffle.ll
@@ -1,7 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -passes=instcombine -S < %s | FileCheck -check-prefixes=CHECK-W32 %s
; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+wavefrontsize64 -passes=instcombine -S < %s | FileCheck -check-prefixes=CHECK-W64 %s
-; RUN: opt -mtriple=amdgcn-- -mattr=+dpp -passes=instcombine -S < %s | FileCheck -check-prefixes=CHECK-DPP %s
+
+; DPP16 Row Share optimization depends on knowing the wavefront size, this run should skip the optimization
+; RUN: opt -mtriple=amdgcn-- -mattr=+dpp -passes=instcombine -S < %s | FileCheck -check-prefixes=CHECK-NO-WAVE-SIZE %s
define i32 @test_wave_shuffle_self_select(i32 %val) {
; CHECK-W32-LABEL: define i32 @test_wave_shuffle_self_select(
@@ -17,12 +19,12 @@ define i32 @test_wave_shuffle_self_select(i32 %val) {
; CHECK-W64-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[TID1]])
; CHECK-W64-NEXT: ret i32 [[RES]]
;
-; CHECK-DPP-LABEL: define i32 @test_wave_shuffle_self_select(
-; CHECK-DPP-SAME: i32 [[VAL:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-DPP-NEXT: [[LO:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
-; CHECK-DPP-NEXT: [[TID:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[LO]])
-; CHECK-DPP-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[TID]])
-; CHECK-DPP-NEXT: ret i32 [[RES]]
+; CHECK-NO-WAVE-SIZE-LABEL: define i32 @test_wave_shuffle_self_select(
+; CHECK-NO-WAVE-SIZE-SAME: i32 [[VAL:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NO-WAVE-SIZE-NEXT: [[LO:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+; CHECK-NO-WAVE-SIZE-NEXT: [[TID:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[LO]])
+; CHECK-NO-WAVE-SIZE-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[TID]])
+; CHECK-NO-WAVE-SIZE-NEXT: ret i32 [[RES]]
;
%lo = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
%tid = tail call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
@@ -36,21 +38,21 @@ define i32 @test_wave_shuffle_self_select(i32 %val) {
define i32 @test_wave_shuffle_dpp_row_share_0(i32 %val) {
; CHECK-W32-LABEL: define i32 @test_wave_shuffle_dpp_row_share_0(
; CHECK-W32-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-W32-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 [[VAL]], i32 272, i32 15, i32 15, i1 false)
+; CHECK-W32-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 [[VAL]], i32 336, i32 15, i32 15, i1 false)
; CHECK-W32-NEXT: ret i32 [[RES]]
;
; CHECK-W64-LABEL: define i32 @test_wave_shuffle_dpp_row_share_0(
; CHECK-W64-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-W64-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 [[VAL]], i32 272, i32 15, i32 15, i1 false)
+; CHECK-W64-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 [[VAL]], i32 336, i32 15, i32 15, i1 false)
; CHECK-W64-NEXT: ret i32 [[RES]]
;
-; CHECK-DPP-LABEL: define i32 @test_wave_shuffle_dpp_row_share_0(
-; CHECK-DPP-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-DPP-NEXT: [[LO:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
-; CHECK-DPP-NEXT: [[TID:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[LO]])
-; CHECK-DPP-NEXT: [[MASKED:%.*]] = and i32 [[TID]], 65520
-; CHECK-DPP-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[MASKED]])
-; CHECK-DPP-NEXT: ret i32 [[RES]]
+; CHECK-NO-WAVE-SIZE-LABEL: define i32 @test_wave_shuffle_dpp_row_share_0(
+; CHECK-NO-WAVE-SIZE-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-NO-WAVE-SIZE-NEXT: [[LO:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+; CHECK-NO-WAVE-SIZE-NEXT: [[TID:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[LO]])
+; CHECK-NO-WAVE-SIZE-NEXT: [[MASKED:%.*]] = and i32 [[TID]], 65520
+; CHECK-NO-WAVE-SIZE-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[MASKED]])
+; CHECK-NO-WAVE-SIZE-NEXT: ret i32 [[RES]]
;
%lo = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
%tid = tail call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
@@ -63,21 +65,21 @@ define i32 @test_wave_shuffle_dpp_row_share_0(i32 %val) {
define i32 @test_wave_shuffle_dpp_row_share_0_no_or(i32 %val) {
; CHECK-W32-LABEL: define i32 @test_wave_shuffle_dpp_row_share_0_no_or(
; CHECK-W32-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-W32-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 [[VAL]], i32 272, i32 15, i32 15, i1 false)
+; CHECK-W32-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 [[VAL]], i32 336, i32 15, i32 15, i1 false)
; CHECK-W32-NEXT: ret i32 [[RES]]
;
; CHECK-W64-LABEL: define i32 @test_wave_shuffle_dpp_row_share_0_no_or(
; CHECK-W64-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-W64-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 [[VAL]], i32 272, i32 15, i32 15, i1 false)
+; CHECK-W64-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 [[VAL]], i32 336, i32 15, i32 15, i1 false)
; CHECK-W64-NEXT: ret i32 [[RES]]
;
-; CHECK-DPP-LABEL: define i32 @test_wave_shuffle_dpp_row_share_0_no_or(
-; CHECK-DPP-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-DPP-NEXT: [[LO:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
-; CHECK-DPP-NEXT: [[TID:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[LO]])
-; CHECK-DPP-NEXT: [[MASKED:%.*]] = and i32 [[TID]], 65520
-; CHECK-DPP-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[MASKED]])
-; CHECK-DPP-NEXT: ret i32 [[RES]]
+; CHECK-NO-WAVE-SIZE-LABEL: define i32 @test_wave_shuffle_dpp_row_share_0_no_or(
+; CHECK-NO-WAVE-SIZE-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-NO-WAVE-SIZE-NEXT: [[LO:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+; CHECK-NO-WAVE-SIZE-NEXT: [[TID:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[LO]])
+; CHECK-NO-WAVE-SIZE-NEXT: [[MASKED:%.*]] = and i32 [[TID]], 65520
+; CHECK-NO-WAVE-SIZE-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[MASKED]])
+; CHECK-NO-WAVE-SIZE-NEXT: ret i32 [[RES]]
;
%lo = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
%tid = tail call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
@@ -89,22 +91,22 @@ define i32 @test_wave_shuffle_dpp_row_share_0_no_or(i32 %val) {
define i32 @test_wave_shuffle_dpp_row_share_7(i32 %val) {
; CHECK-W32-LABEL: define i32 @test_wave_shuffle_dpp_row_share_7(
; CHECK-W32-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-W32-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 [[VAL]], i32 279, i32 15, i32 15, i1 false)
+; CHECK-W32-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 [[VAL]], i32 343, i32 15, i32 15, i1 false)
; CHECK-W32-NEXT: ret i32 [[RES]]
;
; CHECK-W64-LABEL: define i32 @test_wave_shuffle_dpp_row_share_7(
; CHECK-W64-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-W64-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 [[VAL]], i32 279, i32 15, i32 15, i1 false)
+; CHECK-W64-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 [[VAL]], i32 343, i32 15, i32 15, i1 false)
; CHECK-W64-NEXT: ret i32 [[RES]]
;
-; CHECK-DPP-LABEL: define i32 @test_wave_shuffle_dpp_row_share_7(
-; CHECK-DPP-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-DPP-NEXT: [[LO:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
-; CHECK-DPP-NEXT: [[TID:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[LO]])
-; CHECK-DPP-NEXT: [[MASKED:%.*]] = and i32 [[TID]], 48
-; CHECK-DPP-NEXT: [[SHARE_7:%.*]] = or disjoint i32 [[MASKED]], 7
-; CHECK-DPP-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[SHARE_7]])
-; CHECK-DPP-NEXT: ret i32 [[RES]]
+; CHECK-NO-WAVE-SIZE-LABEL: define i32 @test_wave_shuffle_dpp_row_share_7(
+; CHECK-NO-WAVE-SIZE-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-NO-WAVE-SIZE-NEXT: [[LO:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+; CHECK-NO-WAVE-SIZE-NEXT: [[TID:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[LO]])
+; CHECK-NO-WAVE-SIZE-NEXT: [[MASKED:%.*]] = and i32 [[TID]], 48
+; CHECK-NO-WAVE-SIZE-NEXT: [[SHARE_7:%.*]] = or disjoint i32 [[MASKED]], 7
+; CHECK-NO-WAVE-SIZE-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[SHARE_7]])
+; CHECK-NO-WAVE-SIZE-NEXT: ret i32 [[RES]]
;
%lo = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
%tid = tail call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
@@ -130,13 +132,13 @@ define i32 @test_wave_shuffle_dpp_row_share_7_no_mask(i32 %val) {
; CHECK-W64-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[SHARE_7]])
; CHECK-W64-NEXT: ret i32 [[RES]]
;
-; CHECK-DPP-LABEL: define i32 @test_wave_shuffle_dpp_row_share_7_no_mask(
-; CHECK-DPP-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-DPP-NEXT: [[LO:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
-; CHECK-DPP-NEXT: [[TID:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[LO]])
-; CHECK-DPP-NEXT: [[SHARE_7:%.*]] = or i32 [[TID]], 7
-; CHECK-DPP-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[SHARE_7]])
-; CHECK-DPP-NEXT: ret i32 [[RES]]
+; CHECK-NO-WAVE-SIZE-LABEL: define i32 @test_wave_shuffle_dpp_row_share_7_no_mask(
+; CHECK-NO-WAVE-SIZE-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-NO-WAVE-SIZE-NEXT: [[LO:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+; CHECK-NO-WAVE-SIZE-NEXT: [[TID:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[LO]])
+; CHECK-NO-WAVE-SIZE-NEXT: [[SHARE_7:%.*]] = or i32 [[TID]], 7
+; CHECK-NO-WAVE-SIZE-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[SHARE_7]])
+; CHECK-NO-WAVE-SIZE-NEXT: ret i32 [[RES]]
;
%lo = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
%tid = tail call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
@@ -151,7 +153,7 @@ define i32 @test_wave_shuffle_dpp_row_share_7_no_mask(i32 %val) {
define i32 @test_wave_shuffle_dpp_row_share_7_lo_only(i32 %val) {
; CHECK-W32-LABEL: define i32 @test_wave_shuffle_dpp_row_share_7_lo_only(
; CHECK-W32-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-W32-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 [[VAL]], i32 279, i32 15, i32 15, i1 false)
+; CHECK-W32-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 [[VAL]], i32 343, i32 15, i32 15, i1 false)
; CHECK-W32-NEXT: ret i32 [[RES]]
;
; CHECK-W64-LABEL: define i32 @test_wave_shuffle_dpp_row_share_7_lo_only(
@@ -162,13 +164,13 @@ define i32 @test_wave_shuffle_dpp_row_share_7_lo_only(i32 %val) {
; CHECK-W64-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[SHARE_7]])
; CHECK-W64-NEXT: ret i32 [[RES]]
;
-; CHECK-DPP-LABEL: define i32 @test_wave_shuffle_dpp_row_share_7_lo_only(
-; CHECK-DPP-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-DPP-NEXT: [[TID:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
-; CHECK-DPP-NEXT: [[MASKED:%.*]] = and i32 [[TID]], 65520
-; CHECK-DPP-NEXT: [[SHARE_7:%.*]] = or disjoint i32 [[MASKED]], 7
-; CHECK-DPP-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[SHARE_7]])
-; CHECK-DPP-NEXT: ret i32 [[RES]]
+; CHECK-NO-WAVE-SIZE-LABEL: define i32 @test_wave_shuffle_dpp_row_share_7_lo_only(
+; CHECK-NO-WAVE-SIZE-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-NO-WAVE-SIZE-NEXT: [[TID:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+; CHECK-NO-WAVE-SIZE-NEXT: [[MASKED:%.*]] = and i32 [[TID]], 65520
+; CHECK-NO-WAVE-SIZE-NEXT: [[SHARE_7:%.*]] = or disjoint i32 [[MASKED]], 7
+; CHECK-NO-WAVE-SIZE-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[SHARE_7]])
+; CHECK-NO-WAVE-SIZE-NEXT: ret i32 [[RES]]
;
%tid = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
%masked = and i32 %tid, 65520 ; 0xFFF0
@@ -183,7 +185,7 @@ define i32 @test_wave_shuffle_dpp_row_share_7_lo_only(i32 %val) {
define i32 @test_wave_shuffle_dpp_row_share_w32_mask(i32 %val) {
; CHECK-W32-LABEL: define i32 @test_wave_shuffle_dpp_row_share_w32_mask(
; CHECK-W32-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-W32-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 [[VAL]], i32 279, i32 15, i32 15, i1 false)
+; CHECK-W32-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 [[VAL]], i32 343, i32 15, i32 15, i1 false)
; CHECK-W32-NEXT: ret i32 [[RES]]
;
; CHECK-W64-LABEL: define i32 @test_wave_shuffle_dpp_row_share_w32_mask(
@@ -195,14 +197,14 @@ define i32 @test_wave_shuffle_dpp_row_share_w32_mask(i32 %val) {
; CHECK-W64-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[SHARE_7]])
; CHECK-W64-NEXT: ret i32 [[RES]]
;
-; CHECK-DPP-LABEL: define i32 @test_wave_shuffle_dpp_row_share_w32_mask(
-; CHECK-DPP-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-DPP-NEXT: [[LO:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
-; CHECK-DPP-NEXT: [[TID:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[LO]])
-; CHECK-DPP-NEXT: [[MASKED:%.*]] = and i32 [[TID]], 16
-; CHECK-DPP-NEXT: [[SHARE_7:%.*]] = or disjoint i32 [[MASKED]], 7
-; CHECK-DPP-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[SHARE_7]])
-; CHECK-DPP-NEXT: ret i32 [[RES]]
+; CHECK-NO-WAVE-SIZE-LABEL: define i32 @test_wave_shuffle_dpp_row_share_w32_mask(
+; CHECK-NO-WAVE-SIZE-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-NO-WAVE-SIZE-NEXT: [[LO:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+; CHECK-NO-WAVE-SIZE-NEXT: [[TID:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[LO]])
+; CHECK-NO-WAVE-SIZE-NEXT: [[MASKED:%.*]] = and i32 [[TID]], 16
+; CHECK-NO-WAVE-SIZE-NEXT: [[SHARE_7:%.*]] = or disjoint i32 [[MASKED]], 7
+; CHECK-NO-WAVE-SIZE-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[SHARE_7]])
+; CHECK-NO-WAVE-SIZE-NEXT: ret i32 [[RES]]
;
%lo = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
%tid = tail call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
@@ -230,14 +232,14 @@ define i32 @test_wave_shuffle_not_quite_row_share(i32 %val) {
; CHECK-W64-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[OR_RES]])
; CHECK-W64-NEXT: ret i32 [[RES]]
;
-; CHECK-DPP-LABEL: define i32 @test_wave_shuffle_not_quite_row_share(
-; CHECK-DPP-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-DPP-NEXT: [[LO:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
-; CHECK-DPP-NEXT: [[TID:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[LO]])
-; CHECK-DPP-NEXT: [[MASKED:%.*]] = and i32 [[TID]], 65280
-; CHECK-DPP-NEXT: [[OR_RES:%.*]] = or disjoint i32 [[MASKED]], 55
-; CHECK-DPP-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[OR_RES]])
-; CHECK-DPP-NEXT: ret i32 [[RES]]
+; CHECK-NO-WAVE-SIZE-LABEL: define i32 @test_wave_shuffle_not_quite_row_share(
+; CHECK-NO-WAVE-SIZE-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-NO-WAVE-SIZE-NEXT: [[LO:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+; CHECK-NO-WAVE-SIZE-NEXT: [[TID:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[LO]])
+; CHECK-NO-WAVE-SIZE-NEXT: [[MASKED:%.*]] = and i32 [[TID]], 65280
+; CHECK-NO-WAVE-SIZE-NEXT: [[OR_RES:%.*]] = or disjoint i32 [[MASKED]], 55
+; CHECK-NO-WAVE-SIZE-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[OR_RES]])
+; CHECK-NO-WAVE-SIZE-NEXT: ret i32 [[RES]]
;
%lo = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
%tid = tail call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
@@ -250,21 +252,21 @@ define i32 @test_wave_shuffle_not_quite_row_share(i32 %val) {
define i32 @test_wave_shuffle_workitem_row_share_14(i32 %val) {
; CHECK-W32-LABEL: define i32 @test_wave_shuffle_workitem_row_share_14(
; CHECK-W32-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-W32-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 [[VAL]], i32 286, i32 15, i32 15, i1 false)
+; CHECK-W32-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 [[VAL]], i32 350, i32 15, i32 15, i1 false)
; CHECK-W32-NEXT: ret i32 [[RES]]
;
; CHECK-W64-LABEL: define i32 @test_wave_shuffle_workitem_row_share_14(
; CHECK-W64-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-W64-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 [[VAL]], i32 286, i32 15, i32 15, i1 false)
+; CHECK-W64-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 [[VAL]], i32 350, i32 15, i32 15, i1 false)
; CHECK-W64-NEXT: ret i32 [[RES]]
;
-; CHECK-DPP-LABEL: define i32 @test_wave_shuffle_workitem_row_share_14(
-; CHECK-DPP-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-DPP-NEXT: [[TID:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
-; CHECK-DPP-NEXT: [[MASKED:%.*]] = and i32 [[TID]], 1008
-; CHECK-DPP-NEXT: [[SHARE_14:%.*]] = or disjoint i32 [[MASKED]], 14
-; CHECK-DPP-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[SHARE_14]])
-; CHECK-DPP-NEXT: ret i32 [[RES]]
+; CHECK-NO-WAVE-SIZE-LABEL: define i32 @test_wave_shuffle_workitem_row_share_14(
+; CHECK-NO-WAVE-SIZE-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-NO-WAVE-SIZE-NEXT: [[TID:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NO-WAVE-SIZE-NEXT: [[MASKED:%.*]] = and i32 [[TID]], 1008
+; CHECK-NO-WAVE-SIZE-NEXT: [[SHARE_14:%.*]] = or disjoint i32 [[MASKED]], 14
+; CHECK-NO-WAVE-SIZE-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[SHARE_14]])
+; CHECK-NO-WAVE-SIZE-NEXT: ret i32 [[RES]]
;
%tid = tail call i32 @llvm.amdgcn.workitem.id.x()
%masked = and i32 %tid, 65520 ; 0xFFF0
@@ -288,12 +290,12 @@ define i32 @test_wave_shuffle_workitem_row_share_14_no_mask(i32 %val) {
; CHECK-W64-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[SHARE_14]])
; CHECK-W64-NEXT: ret i32 [[RES]]
;
-; CHECK-DPP-LABEL: define i32 @test_wave_shuffle_workitem_row_share_14_no_mask(
-; CHECK-DPP-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-DPP-NEXT: [[TID:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
-; CHECK-DPP-NEXT: [[SHARE_14:%.*]] = or i32 [[TID]], 14
-; CHECK-DPP-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[SHARE_14]])
-; CHECK-DPP-NEXT: ret i32 [[RES]]
+; CHECK-NO-WAVE-SIZE-LABEL: define i32 @test_wave_shuffle_workitem_row_share_14_no_mask(
+; CHECK-NO-WAVE-SIZE-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-NO-WAVE-SIZE-NEXT: [[TID:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NO-WAVE-SIZE-NEXT: [[SHARE_14:%.*]] = or i32 [[TID]], 14
+; CHECK-NO-WAVE-SIZE-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[SHARE_14]])
+; CHECK-NO-WAVE-SIZE-NEXT: ret i32 [[RES]]
;
%tid = tail call i32 @llvm.amdgcn.workitem.id.x()
%share_14 = or i32 %tid, 14
@@ -304,20 +306,20 @@ define i32 @test_wave_shuffle_workitem_row_share_14_no_mask(i32 %val) {
define i32 @test_wave_shuffle_workitem_row_share_15(i32 %val) {
; CHECK-W32-LABEL: define i32 @test_wave_shuffle_workitem_row_share_15(
; CHECK-W32-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-W32-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 [[VAL]], i32 287, i32 15, i32 15, i1 false)
+; CHECK-W32-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 [[VAL]], i32 351, i32 15, i32 15, i1 false)
; CHECK-W32-NEXT: ret i32 [[RES]]
;
; CHECK-W64-LABEL: define i32 @test_wave_shuffle_workitem_row_share_15(
; CHECK-W64-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-W64-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 [[VAL]], i32 287, i32 15, i32 15, i1 false)
+; CHECK-W64-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 [[VAL]], i32 351, i32 15, i32 15, i1 false)
; CHECK-W64-NEXT: ret i32 [[RES]]
;
-; CHECK-DPP-LABEL: define i32 @test_wave_shuffle_workitem_row_share_15(
-; CHECK-DPP-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-DPP-NEXT: [[TID:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
-; CHECK-DPP-NEXT: [[SHARE_15:%.*]] = or i32 [[TID]], 15
-; CHECK-DPP-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[SHARE_15]])
-; CHECK-DPP-NEXT: ret i32 [[RES]]
+; CHECK-NO-WAVE-SIZE-LABEL: define i32 @test_wave_shuffle_workitem_row_share_15(
+; CHECK-NO-WAVE-SIZE-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-NO-WAVE-SIZE-NEXT: [[TID:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NO-WAVE-SIZE-NEXT: [[SHARE_15:%.*]] = or i32 [[TID]], 15
+; CHECK-NO-WAVE-SIZE-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[SHARE_15]])
+; CHECK-NO-WAVE-SIZE-NEXT: ret i32 [[RES]]
;
%tid = tail call i32 @llvm.amdgcn.workitem.id.x()
%masked = and i32 %tid, 65520 ; 0xFFF0
@@ -329,20 +331,20 @@ define i32 @test_wave_shuffle_workitem_row_share_15(i32 %val) {
define i32 @test_wave_shuffle_workitem_row_share_15_no_mask(i32 %val) {
; CHECK-W32-LABEL: define i32 @test_wave_shuffle_workitem_row_share_15_no_mask(
; CHECK-W32-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-W32-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 [[VAL]], i32 287, i32 15, i32 15, i1 false)
+; CHECK-W32-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 [[VAL]], i32 351, i32 15, i32 15, i1 false)
; CHECK-W32-NEXT: ret i32 [[RES]]
;
; CHECK-W64-LABEL: define i32 @test_wave_shuffle_workitem_row_share_15_no_mask(
; CHECK-W64-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-W64-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 [[VAL]], i32 287, i32 15, i32 15, i1 false)
+; CHECK-W64-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 [[VAL]], i32 351, i32 15, i32 15, i1 false)
; CHECK-W64-NEXT: ret i32 [[RES]]
;
-; CHECK-DPP-LABEL: define i32 @test_wave_shuffle_workitem_row_share_15_no_mask(
-; CHECK-DPP-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-DPP-NEXT: [[TID:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
-; CHECK-DPP-NEXT: [[SHARE_15:%.*]] = or i32 [[TID]], 15
-; CHECK-DPP-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[SHARE_15]])
-; CHECK-DPP-NEXT: ret i32 [[RES]]
+; CHECK-NO-WAVE-SIZE-LABEL: define i32 @test_wave_shuffle_workitem_row_share_15_no_mask(
+; CHECK-NO-WAVE-SIZE-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-NO-WAVE-SIZE-NEXT: [[TID:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NO-WAVE-SIZE-NEXT: [[SHARE_15:%.*]] = or i32 [[TID]], 15
+; CHECK-NO-WAVE-SIZE-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[SHARE_15]])
+; CHECK-NO-WAVE-SIZE-NEXT: ret i32 [[RES]]
;
%tid = tail call i32 @llvm.amdgcn.workitem.id.x()
%share_15 = or i32 %tid, 15
More information about the llvm-commits
mailing list