[llvm] AMDGPU/GlobalISel: RegBankLegalize rules for s_bitreplicate (PR #189138)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Apr 1 17:57:40 PDT 2026
https://github.com/vangthao95 updated https://github.com/llvm/llvm-project/pull/189138
>From 34bd386a295ad0948c2f3a248b0e3218b2b3d011 Mon Sep 17 00:00:00 2001
From: Vang Thao <Vang.Thao at amd.com>
Date: Fri, 27 Mar 2026 21:58:22 -0400
Subject: [PATCH 1/3] AMDGPU/GlobalISel: RegBankLegalize rules for
s_bitreplicate
---
llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp | 4 ++++
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.bitreplicate.ll | 2 +-
2 files changed, 5 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
index 198e52a6f9ae2..b4a2b94036159 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
@@ -1451,6 +1451,10 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
using namespace Intrinsic;
+ addRulesForIOpcs({amdgcn_s_bitreplicate}, Standard)
+ .Uni(S64, {{Sgpr64}, {IntrId, Sgpr32}})
+ .Div(S64, {{Sgpr64ToVgprDst}, {IntrId, SgprB32_ReadFirstLane}});
+
addRulesForIOpcs({amdgcn_s_getpc}).Any({{UniS64, _}, {{Sgpr64}, {None}}});
addRulesForIOpcs({amdgcn_s_getreg}).Any({{}, {{Sgpr32}, {IntrId, Imm}}});
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.bitreplicate.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.bitreplicate.ll
index 631fdc7406918..c075f9e624603 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.bitreplicate.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.bitreplicate.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -global-isel=1 < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefixes=GFX11 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -global-isel=0 < %s | FileCheck -check-prefixes=GFX11 %s
declare i64 @llvm.amdgcn.s.bitreplicate(i32)
>From 0009a5052bb41dd959f613d7c206f8de9ca357cf Mon Sep 17 00:00:00 2001
From: Vang Thao <Vang.Thao at amd.com>
Date: Tue, 31 Mar 2026 14:12:21 -0400
Subject: [PATCH 2/3] Switch to waterfall (WF) and fix the WF range for SALUs with a copy to VGPRs
---
.../AMDGPU/AMDGPURegBankLegalizeHelper.cpp | 16 +++++++--
.../AMDGPU/AMDGPURegBankLegalizeHelper.h | 3 ++
.../AMDGPU/AMDGPURegBankLegalizeRules.cpp | 2 +-
.../AMDGPU/llvm.amdgcn.bitreplicate.ll | 36 ++++++++++++++-----
4 files changed, 45 insertions(+), 12 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
index 46ff5342a7dd9..d9e0e2824a184 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
@@ -60,8 +60,9 @@ bool RegBankLegalizeHelper::findRuleAndApplyMapping(MachineInstr &MI) {
WaterfallInfo WFI;
unsigned OpIdx = 0;
+ OldNextMI = std::next(MI.getIterator());
if (!Mapping->DstOpMapping.empty()) {
- B.setInsertPt(*MI.getParent(), std::next(MI.getIterator()));
+ B.setInsertPt(*MI.getParent(), OldNextMI);
if (!applyMappingDst(MI, OpIdx, Mapping->DstOpMapping))
return false;
}
@@ -1696,8 +1697,19 @@ bool RegBankLegalizeHelper::applyMappingSrc(
if (RB != SgprRB) {
WFI.SgprWaterfallOperandRegs.insert(Reg);
if (!WFI.Start.isValid()) {
+ // Waterfall range [WFI.Start, WFI.End). Use OldNextMI so that
+ // any instructions inserted by applyMappingDst are included.
WFI.Start = MI.getIterator();
- WFI.End = std::next(MI.getIterator());
+ WFI.End = OldNextMI;
+
+ // Mark any COPY as exec-dependent since machine-sink may move
+ // it out of the loop body.
+ MCRegister ExecReg = ST.getRegisterInfo()->getExec();
+ for (auto It = std::next(MI.getIterator()); It != OldNextMI; ++It) {
+ if (It->isCopy())
+ It->addOperand(MachineOperand::CreateReg(ExecReg, /*isDef=*/false,
+ /*isImp=*/true));
+ }
}
}
break;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h
index 577c26e4bf02a..d5176c5811c69 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h
@@ -46,6 +46,9 @@ class RegBankLegalizeHelper {
MachineOptimizationRemarkEmitter MORE;
const RegBankLegalizeRules &RBLRules;
const bool IsWave32;
+ // The original next instruction after MI, saved before applyMappingDst may
+ // insert instructions. Used by Sgpr*_WF to set the waterfall range.
+ MachineBasicBlock::iterator OldNextMI;
const RegisterBank *SgprRB;
const RegisterBank *VgprRB;
const RegisterBank *VccRB;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
index b4a2b94036159..888f3c1fae9a3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
@@ -1453,7 +1453,7 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
addRulesForIOpcs({amdgcn_s_bitreplicate}, Standard)
.Uni(S64, {{Sgpr64}, {IntrId, Sgpr32}})
- .Div(S64, {{Sgpr64ToVgprDst}, {IntrId, SgprB32_ReadFirstLane}});
+ .Div(S64, {{Sgpr64ToVgprDst}, {IntrId, Sgpr32_WF}});
addRulesForIOpcs({amdgcn_s_getpc}).Any({{UniS64, _}, {{Sgpr64}, {None}}});
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.bitreplicate.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.bitreplicate.ll
index c075f9e624603..c83b199832f7d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.bitreplicate.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.bitreplicate.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefixes=GFX11 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -global-isel=0 < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -global-isel=0 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG %s
declare i64 @llvm.amdgcn.s.bitreplicate(i32)
@@ -76,13 +76,31 @@ entry:
}
define i64 @test_s_bitreplicate_vgpr(i32 %mask) {
-; GFX11-LABEL: test_s_bitreplicate_vgpr:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11-NEXT: s_bitreplicate_b64_b32 s[0:1], s0
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-GISEL-LABEL: test_s_bitreplicate_vgpr:
+; GFX11-GISEL: ; %bb.0: ; %entry
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, v0
+; GFX11-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-GISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s2, v2
+; GFX11-GISEL-NEXT: s_mov_b32 s1, exec_lo
+; GFX11-GISEL-NEXT: v_cmpx_eq_u32_e64 s2, v2
+; GFX11-GISEL-NEXT: s_bitreplicate_b64_b32 s[2:3], s2
+; GFX11-GISEL-NEXT: ; implicit-def: $vgpr2
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-GISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s1
+; GFX11-GISEL-NEXT: s_cbranch_execnz .LBB6_1
+; GFX11-GISEL-NEXT: ; %bb.2:
+; GFX11-GISEL-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: test_s_bitreplicate_vgpr:
+; GFX11-SDAG: ; %bb.0: ; %entry
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-SDAG-NEXT: s_bitreplicate_b64_b32 s[0:1], s0
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
entry:
%br = call i64 @llvm.amdgcn.s.bitreplicate(i32 %mask)
ret i64 %br
>From c47e49c423004f46960103d01aff29a1da7049ea Mon Sep 17 00:00:00 2001
From: Vang Thao <Vang.Thao at amd.com>
Date: Wed, 1 Apr 2026 18:22:11 -0400
Subject: [PATCH 3/3] Add a PHI with a tied implicit operand for VGPR defs in the waterfall loop
---
.../AMDGPU/AMDGPURegBankLegalizeHelper.cpp | 42 +++++++---
.../AMDGPU/GlobalISel/regbankselect-call.ll | 4 +-
.../regbankselect-waterfall-call.mir | 16 +++-
.../AMDGPU/llvm.amdgcn.bitreplicate.ll | 81 ++++++++++++++++++-
4 files changed, 126 insertions(+), 17 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
index d9e0e2824a184..a7688dd3d9da8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
@@ -202,6 +202,39 @@ bool RegBankLegalizeHelper::executeInWaterfallLoop(MachineIRBuilder &B,
auto NewEnd = BodyBB->end();
assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);
+ // Create loop-carried dependencies for VGPR defs that lack inherent exec
+ // dependency, preventing machine-sink from moving them out of the loop.
+ for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
+ if (MI.mayLoadOrStore() || MI.hasUnmodeledSideEffects())
+ continue;
+ for (unsigned I = 0, E = MI.getNumDefs(); I < E; ++I) {
+ if (!MI.getOperand(I).isReg())
+ continue;
+ Register DefReg = MI.getOperand(I).getReg();
+ if (!DefReg.isVirtual() || MRI.getRegBank(DefReg) != VgprRB)
+ continue;
+
+ LLT Ty = MRI.getType(DefReg);
+
+ B.setInsertPt(MBB, MBB.end());
+ Register InitReg = MRI.createVirtualRegister({VgprRB, Ty});
+ B.buildInstr(TargetOpcode::IMPLICIT_DEF).addDef(InitReg);
+
+ Register PhiReg = MRI.createVirtualRegister({VgprRB, Ty});
+ B.setInsertPt(*LoopBB, LoopBB->begin());
+ B.buildInstr(TargetOpcode::G_PHI)
+ .addDef(PhiReg)
+ .addReg(InitReg)
+ .addMBB(&MBB)
+ .addReg(DefReg)
+ .addMBB(BodyBB);
+
+ MI.addOperand(MachineOperand::CreateReg(PhiReg, /*isDef=*/false,
+ /*isImp=*/true));
+ MI.tieOperands(I, MI.getNumOperands() - 1);
+ }
+ }
+
B.setMBB(*LoopBB);
Register CondReg;
@@ -1701,15 +1734,6 @@ bool RegBankLegalizeHelper::applyMappingSrc(
// any instructions inserted by applyMappingDst are included.
WFI.Start = MI.getIterator();
WFI.End = OldNextMI;
-
- // Mark any COPY as exec-dependent since machine-sink may move
- // it out of the loop body.
- MCRegister ExecReg = ST.getRegisterInfo()->getExec();
- for (auto It = std::next(MI.getIterator()); It != OldNextMI; ++It) {
- if (It->isCopy())
- It->addOperand(MachineOperand::CreateReg(ExecReg, /*isDef=*/false,
- /*isImp=*/true));
- }
}
}
break;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-call.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-call.ll
index cb06627c87ad5..4960e6b28d736 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-call.ll
@@ -76,11 +76,13 @@ define amdgpu_ps i32 @test_divergent_indirect_call_p0_with_args(ptr %fptr, i32 %
; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(p0) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr(s32) = IMPLICIT_DEF
; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2:
; CHECK-NEXT: successors: %bb.3(0x80000000)
; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[PHI:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF1]](s32), %bb.1, %5(s32), %bb.3
; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[MV]](p0)
; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32)
; CHECK-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32)
@@ -97,7 +99,7 @@ define amdgpu_ps i32 @test_divergent_indirect_call_p0_with_args(ptr %fptr, i32 %
; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr(<4 x s32>) = COPY $private_rsrc_reg
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY3]](<4 x s32>)
; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[MV1]](p0), 0, csr_amdgpu_si_gfx_gfx90ainsts, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $vgpr0
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0, implicit [[PHI]](tied-def 0)(s32)
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-waterfall-call.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-waterfall-call.mir
index 5207d992ea74d..934f6f72452ad 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-waterfall-call.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-waterfall-call.mir
@@ -13,11 +13,13 @@ body: |
; CHECK-NEXT: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr(p0) = IMPLICIT_DEF
; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: .1:
; CHECK-NEXT: successors: %bb.2(0x80000000)
; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[PHI:%[0-9]+]]:vgpr(p0) = G_PHI [[DEF1]](p0), %bb.0, %2(p0), %bb.2
; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES %func_ptr(p0)
; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32)
; CHECK-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32)
@@ -31,7 +33,7 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
; CHECK-NEXT: %g_ptr:sgpr(p0) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY %g_ptr(p0)
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY %g_ptr(p0), implicit [[PHI]](tied-def 0)(p0)
; CHECK-NEXT: %func_ptr:vgpr(p0) = G_LOAD [[COPY]](p0) :: (load (p0))
; CHECK-NEXT: $sgpr2_sgpr3 = G_SI_CALL [[MV]](p0), 0, csr_amdgpu
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
@@ -66,11 +68,13 @@ body: |
; CHECK-NEXT: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr(p4) = IMPLICIT_DEF
; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: .1:
; CHECK-NEXT: successors: %bb.2(0x80000000)
; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[PHI:%[0-9]+]]:vgpr(p4) = G_PHI [[DEF1]](p4), %bb.0, %2(p4), %bb.2
; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES %func_ptr(p4)
; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32)
; CHECK-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32)
@@ -84,7 +88,7 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
; CHECK-NEXT: %g_ptr:sgpr(p4) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(p4) = COPY %g_ptr(p4)
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(p4) = COPY %g_ptr(p4), implicit [[PHI]](tied-def 0)(p4)
; CHECK-NEXT: %func_ptr:vgpr(p4) = G_LOAD [[COPY]](p4) :: (load (p4))
; CHECK-NEXT: $sgpr2_sgpr3 = G_SI_CALL [[MV]](p4), 0, csr_amdgpu
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
@@ -119,11 +123,13 @@ body: |
; CHECK-NEXT: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr(p0) = IMPLICIT_DEF
; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: .1:
; CHECK-NEXT: successors: %bb.2(0x80000000)
; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[PHI:%[0-9]+]]:vgpr(p0) = G_PHI [[DEF1]](p0), %bb.0, %2(p0), %bb.2
; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES %func_ptr(p0)
; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32)
; CHECK-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32)
@@ -137,7 +143,7 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
; CHECK-NEXT: %g_ptr:sgpr(p0) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY %g_ptr(p0)
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY %g_ptr(p0), implicit [[PHI]](tied-def 0)(p0)
; CHECK-NEXT: %func_ptr:vgpr(p0) = G_LOAD [[COPY]](p0) :: (load (p0))
; CHECK-NEXT: $sgpr2_sgpr3 = G_SI_CALL [[MV]](p0), 0, csr_amdgpu, implicit $sgpr4, implicit $sgpr5, implicit-def $vgpr0
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
@@ -172,11 +178,13 @@ body: |
; CHECK-NEXT: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr(p4) = IMPLICIT_DEF
; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: .1:
; CHECK-NEXT: successors: %bb.2(0x80000000)
; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[PHI:%[0-9]+]]:vgpr(p4) = G_PHI [[DEF1]](p4), %bb.0, %2(p4), %bb.2
; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES %func_ptr(p4)
; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32)
; CHECK-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32)
@@ -190,7 +198,7 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
; CHECK-NEXT: %g_ptr:sgpr(p4) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(p4) = COPY %g_ptr(p4)
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(p4) = COPY %g_ptr(p4), implicit [[PHI]](tied-def 0)(p4)
; CHECK-NEXT: %func_ptr:vgpr(p4) = G_LOAD [[COPY]](p4) :: (load (p4))
; CHECK-NEXT: $sgpr2_sgpr3 = G_SI_CALL [[MV]](p4), 0, csr_amdgpu, implicit $sgpr4, implicit $sgpr5, implicit-def $vgpr0
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.bitreplicate.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.bitreplicate.ll
index c83b199832f7d..85abbe44f58c1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.bitreplicate.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.bitreplicate.ll
@@ -75,21 +75,96 @@ entry:
ret void
}
+define amdgpu_cs void @test_s_bitreplicate_vgpr_store(i32 %mask, ptr addrspace(1) %out) {
+; GFX11-GISEL-LABEL: test_s_bitreplicate_vgpr_store:
+; GFX11-GISEL: ; %bb.0: ; %entry
+; GFX11-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-GISEL-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX11-GISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s2, v0
+; GFX11-GISEL-NEXT: s_mov_b32 s1, exec_lo
+; GFX11-GISEL-NEXT: v_cmpx_eq_u32_e64 s2, v0
+; GFX11-GISEL-NEXT: s_bitreplicate_b64_b32 s[2:3], s2
+; GFX11-GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v4, s3
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, s2
+; GFX11-GISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s1
+; GFX11-GISEL-NEXT: s_cbranch_execnz .LBB6_1
+; GFX11-GISEL-NEXT: ; %bb.2:
+; GFX11-GISEL-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-GISEL-NEXT: global_store_b64 v[1:2], v[3:4], off
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: test_s_bitreplicate_vgpr_store:
+; GFX11-SDAG: ; %bb.0: ; %entry
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-SDAG-NEXT: s_bitreplicate_b64_b32 s[0:1], s0
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v3, s0
+; GFX11-SDAG-NEXT: global_store_b64 v[1:2], v[3:4], off
+; GFX11-SDAG-NEXT: s_endpgm
+entry:
+ %br = call i64 @llvm.amdgcn.s.bitreplicate(i32 %mask)
+ store i64 %br, ptr addrspace(1) %out
+ ret void
+}
+
+define i64 @test_s_bitreplicate_vgpr_multi_use(i32 %mask) {
+; GFX11-GISEL-LABEL: test_s_bitreplicate_vgpr_multi_use:
+; GFX11-GISEL: ; %bb.0: ; %entry
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-GISEL-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX11-GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s2, v0
+; GFX11-GISEL-NEXT: s_mov_b32 s1, exec_lo
+; GFX11-GISEL-NEXT: v_cmpx_eq_u32_e64 s2, v0
+; GFX11-GISEL-NEXT: s_bitreplicate_b64_b32 s[2:3], s2
+; GFX11-GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, s3
+; GFX11-GISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s1
+; GFX11-GISEL-NEXT: s_cbranch_execnz .LBB7_1
+; GFX11-GISEL-NEXT: ; %bb.2:
+; GFX11-GISEL-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, v1, v2
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: test_s_bitreplicate_vgpr_multi_use:
+; GFX11-SDAG: ; %bb.0: ; %entry
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-SDAG-NEXT: s_bitreplicate_b64_b32 s[0:1], s0
+; GFX11-SDAG-NEXT: v_add_nc_u32_e64 v0, s0, s1
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %br = call i64 @llvm.amdgcn.s.bitreplicate(i32 %mask)
+ %lo = trunc i64 %br to i32
+ %hi = lshr i64 %br, 32
+ %hi32 = trunc i64 %hi to i32
+ %sum = add i32 %lo, %hi32
+ %result = zext i32 %sum to i64
+ ret i64 %result
+}
+
define i64 @test_s_bitreplicate_vgpr(i32 %mask) {
; GFX11-GISEL-LABEL: test_s_bitreplicate_vgpr:
; GFX11-GISEL: ; %bb.0: ; %entry
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, v0
; GFX11-GISEL-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-GISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX11-GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX11-GISEL-NEXT: v_readfirstlane_b32 s2, v2
; GFX11-GISEL-NEXT: s_mov_b32 s1, exec_lo
; GFX11-GISEL-NEXT: v_cmpx_eq_u32_e64 s2, v2
; GFX11-GISEL-NEXT: s_bitreplicate_b64_b32 s[2:3], s2
; GFX11-GISEL-NEXT: ; implicit-def: $vgpr2
-; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, s3
; GFX11-GISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s1
-; GFX11-GISEL-NEXT: s_cbranch_execnz .LBB6_1
+; GFX11-GISEL-NEXT: s_cbranch_execnz .LBB8_1
; GFX11-GISEL-NEXT: ; %bb.2:
; GFX11-GISEL-NEXT: s_mov_b32 exec_lo, s0
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
More information about the llvm-commits
mailing list