[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Update divergence lowering tests (PR #128702)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Tue Feb 25 03:48:07 PST 2025
llvmbot wrote:
@llvm/pr-subscribers-llvm-globalisel
@llvm/pr-subscribers-backend-amdgpu
Author: Petar Avramovic (petar-avramovic)
Changes:
In preparation for implementing temporal divergence lowering for
GlobalISel, switch the LLVM IR tests for AMDGPU divergence lowering
to the new register bank select (`-new-reg-bank-select`). This requires
adding a few simple RegBankLegalize rules for these tests to work.
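
For reference, the rules are added through the existing `RegBankLegalizeRules` builder interface. Below is a minimal sketch, excerpted from the diff further down, of the kind of rule this patch introduces (the surrounding constructor and enum definitions live in AMDGPURegBankLegalizeRules.cpp/.h):

```cpp
// Uniform s1 G_IMPLICIT_DEF is kept as an SGPR32 with a trunc mapping;
// divergent s1 G_FREEZE stays in the VCC lane-mask bank.
addRulesForGOpcs({G_IMPLICIT_DEF}).Any({{UniS1}, {{Sgpr32Trunc}, {}}});
addRulesForGOpcs({G_FREEZE}).Any({{DivS1}, {{Vcc}, {Vcc}}});
```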
---
Patch is 89.89 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/128702.diff
9 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp (+6)
- (modified) llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp (+27-1)
- (modified) llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp (+38-1)
- (modified) llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h (+5)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll (+45-52)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll (+177-191)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll (+222-196)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll (+143-161)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.ll (+20-20)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp
index 8d3e7829e10e1..eb2ece7bece51 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp
@@ -312,6 +312,12 @@ bool AMDGPURegBankLegalize::runOnMachineFunction(MachineFunction &MF) {
}
// Opcodes that also support S1.
+ if (Opc == G_FREEZE &&
+ MRI.getType(MI->getOperand(0).getReg()) != LLT::scalar(1)) {
+ RBLHelper.applyMappingTrivial(*MI);
+ continue;
+ }
+
if ((Opc == AMDGPU::G_CONSTANT || Opc == AMDGPU::G_FCONSTANT ||
Opc == AMDGPU::G_IMPLICIT_DEF)) {
Register Dst = MI->getOperand(0).getReg();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
index 3c007987b8494..3383175fc1bdb 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
@@ -134,6 +134,26 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI,
switch (Mapping.LoweringMethod) {
case DoNotLower:
return;
+ case VccExtToSel: {
+ LLT Ty = MRI.getType(MI.getOperand(0).getReg());
+ Register Src = MI.getOperand(1).getReg();
+ unsigned Opc = MI.getOpcode();
+ if (Ty == S32 || Ty == S16) {
+ auto True = B.buildConstant({VgprRB, Ty}, Opc == G_SEXT ? -1 : 1);
+ auto False = B.buildConstant({VgprRB, Ty}, 0);
+ B.buildSelect(MI.getOperand(0).getReg(), Src, True, False);
+ }
+ if (Ty == S64) {
+ auto True = B.buildConstant({VgprRB, S32}, Opc == G_SEXT ? -1 : 1);
+ auto False = B.buildConstant({VgprRB, S32}, 0);
+ auto Sel = B.buildSelect({VgprRB, S32}, Src, True, False);
+ B.buildMergeValues(
+ MI.getOperand(0).getReg(),
+ {Sel.getReg(0), Opc == G_SEXT ? Sel.getReg(0) : False.getReg(0)});
+ }
+ MI.eraseFromParent();
+ return;
+ }
case UniExtToSel: {
LLT Ty = MRI.getType(MI.getOperand(0).getReg());
auto True = B.buildConstant({SgprRB, Ty},
@@ -276,6 +296,8 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
case Sgpr64:
case Vgpr64:
return LLT::scalar(64);
+ case VgprP0:
+ return LLT::pointer(0, 64);
case SgprP1:
case VgprP1:
return LLT::pointer(1, 64);
@@ -383,6 +405,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
return SgprRB;
case Vgpr32:
case Vgpr64:
+ case VgprP0:
case VgprP1:
case VgprP3:
case VgprP4:
@@ -425,6 +448,7 @@ void RegBankLegalizeHelper::applyMappingDst(
case SgprV4S32:
case Vgpr32:
case Vgpr64:
+ case VgprP0:
case VgprP1:
case VgprP3:
case VgprP4:
@@ -555,6 +579,7 @@ void RegBankLegalizeHelper::applyMappingSrc(
// vgpr scalars, pointers and vectors
case Vgpr32:
case Vgpr64:
+ case VgprP0:
case VgprP1:
case VgprP3:
case VgprP4:
@@ -653,7 +678,8 @@ void RegBankLegalizeHelper::applyMappingPHI(MachineInstr &MI) {
// We accept all types that can fit in some register class.
// Uniform G_PHIs have all sgpr registers.
// Divergent G_PHIs have vgpr dst but inputs can be sgpr or vgpr.
- if (Ty == LLT::scalar(32) || Ty == LLT::pointer(4, 64)) {
+ if (Ty == LLT::scalar(32) || Ty == LLT::pointer(1, 64) ||
+ Ty == LLT::pointer(4, 64)) {
return;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
index f293b3aba7b79..fd40e765a4b3e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
@@ -50,6 +50,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID,
return MRI.getType(Reg) == LLT::scalar(32);
case S64:
return MRI.getType(Reg) == LLT::scalar(64);
+ case P0:
+ return MRI.getType(Reg) == LLT::pointer(0, 64);
case P1:
return MRI.getType(Reg) == LLT::pointer(1, 64);
case P3:
@@ -58,6 +60,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID,
return MRI.getType(Reg) == LLT::pointer(4, 64);
case P5:
return MRI.getType(Reg) == LLT::pointer(5, 32);
+ case V4S32:
+ return MRI.getType(Reg) == LLT::fixed_vector(4, 32);
case B32:
return MRI.getType(Reg).getSizeInBits() == 32;
case B64:
@@ -78,6 +82,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID,
return MRI.getType(Reg) == LLT::scalar(32) && MUI.isUniform(Reg);
case UniS64:
return MRI.getType(Reg) == LLT::scalar(64) && MUI.isUniform(Reg);
+ case UniP0:
+ return MRI.getType(Reg) == LLT::pointer(0, 64) && MUI.isUniform(Reg);
case UniP1:
return MRI.getType(Reg) == LLT::pointer(1, 64) && MUI.isUniform(Reg);
case UniP3:
@@ -104,6 +110,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID,
return MRI.getType(Reg) == LLT::scalar(32) && MUI.isDivergent(Reg);
case DivS64:
return MRI.getType(Reg) == LLT::scalar(64) && MUI.isDivergent(Reg);
+ case DivP0:
+ return MRI.getType(Reg) == LLT::pointer(0, 64) && MUI.isDivergent(Reg);
case DivP1:
return MRI.getType(Reg) == LLT::pointer(1, 64) && MUI.isDivergent(Reg);
case DivP3:
@@ -431,16 +439,21 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
addRulesForGOpcs({G_XOR, G_OR, G_AND}, StandardB)
.Any({{UniS1}, {{Sgpr32Trunc}, {Sgpr32AExt, Sgpr32AExt}}})
.Any({{DivS1}, {{Vcc}, {Vcc, Vcc}}})
+ .Div(B32, {{VgprB32}, {VgprB32, VgprB32}})
+ .Uni(B64, {{SgprB64}, {SgprB64, SgprB64}})
.Div(B64, {{VgprB64}, {VgprB64, VgprB64}, SplitTo32});
addRulesForGOpcs({G_SHL}, Standard)
+ .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
.Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr32}})
.Div(S64, {{Vgpr64}, {Vgpr64, Vgpr32}});
// Note: we only write S1 rules for G_IMPLICIT_DEF, G_CONSTANT, G_FCONSTANT
// and G_FREEZE here, rest is trivially regbankselected earlier
+ addRulesForGOpcs({G_IMPLICIT_DEF}).Any({{UniS1}, {{Sgpr32Trunc}, {}}});
addRulesForGOpcs({G_CONSTANT})
.Any({{UniS1, _}, {{Sgpr32Trunc}, {None}, UniCstExt}});
+ addRulesForGOpcs({G_FREEZE}).Any({{DivS1}, {{Vcc}, {Vcc}}});
addRulesForGOpcs({G_ICMP})
.Any({{UniS1, _, S32}, {{Sgpr32Trunc}, {None, Sgpr32, Sgpr32}}})
@@ -471,6 +484,7 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
addRulesForGOpcs({G_ZEXT, G_SEXT})
.Any({{UniS32, S1}, {{Sgpr32}, {Sgpr32AExtBoolInReg}, UniExtToSel}})
+ .Any({{DivS32, S1}, {{Vgpr32}, {Vcc}, VccExtToSel}})
.Any({{UniS64, S32}, {{Sgpr64}, {Sgpr32}, Ext32To64}})
.Any({{DivS64, S32}, {{Vgpr64}, {Vgpr32}, Ext32To64}});
@@ -525,9 +539,12 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
// clang-format off
addRulesForGOpcs({G_LOAD})
+ .Any({{DivB32, DivP0}, {{VgprB32}, {VgprP0}}})
+
.Any({{DivB32, DivP1}, {{VgprB32}, {VgprP1}}})
.Any({{{UniB256, UniP1}, isAlign4 && isUL}, {{SgprB256}, {SgprP1}}})
.Any({{{UniB512, UniP1}, isAlign4 && isUL}, {{SgprB512}, {SgprP1}}})
+ .Any({{{UniB32, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB32}, {SgprP1}}})
.Any({{{UniB256, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB256}, {VgprP1}, SplitLoad}})
.Any({{{UniB512, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB512}, {VgprP1}, SplitLoad}})
@@ -556,15 +573,26 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
// clang-format on
addRulesForGOpcs({G_AMDGPU_BUFFER_LOAD}, Vector)
+ .Div(S32, {{Vgpr32}, {SgprV4S32, Vgpr32, Vgpr32, Sgpr32}})
+ .Uni(S32, {{UniInVgprS32}, {SgprV4S32, Vgpr32, Vgpr32, Sgpr32}})
.Div(V4S32, {{VgprV4S32}, {SgprV4S32, Vgpr32, Vgpr32, Sgpr32}})
.Uni(V4S32, {{UniInVgprV4S32}, {SgprV4S32, Vgpr32, Vgpr32, Sgpr32}});
addRulesForGOpcs({G_STORE})
+ .Any({{S32, P0}, {{}, {Vgpr32, VgprP0}}})
.Any({{S32, P1}, {{}, {Vgpr32, VgprP1}}})
.Any({{S64, P1}, {{}, {Vgpr64, VgprP1}}})
.Any({{V4S32, P1}, {{}, {VgprV4S32, VgprP1}}});
- addRulesForGOpcs({G_PTR_ADD}).Any({{DivP1}, {{VgprP1}, {VgprP1, Vgpr64}}});
+ addRulesForGOpcs({G_AMDGPU_BUFFER_STORE})
+ .Any({{S32}, {{}, {Vgpr32, SgprV4S32, Vgpr32, Vgpr32, Sgpr32}}});
+
+ addRulesForGOpcs({G_PTR_ADD})
+ .Any({{UniP1}, {{SgprP1}, {SgprP1, Sgpr64}}})
+ .Any({{DivP1}, {{VgprP1}, {VgprP1, Vgpr64}}})
+ .Any({{DivP0}, {{VgprP0}, {VgprP0, Vgpr64}}});
+
+ addRulesForGOpcs({G_INTTOPTR}).Any({{UniP4}, {{SgprP4}, {Sgpr64}}});
addRulesForGOpcs({G_ABS}, Standard).Uni(S16, {{Sgpr32Trunc}, {Sgpr32SExt}});
@@ -580,15 +608,24 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
.Any({{UniS32, S32}, {{UniInVgprS32}, {Vgpr32}}}, !hasSALUFloat);
addRulesForGOpcs({G_UITOFP})
+ .Any({{DivS32, S32}, {{Vgpr32}, {Vgpr32}}})
.Any({{UniS32, S32}, {{Sgpr32}, {Sgpr32}}}, hasSALUFloat)
.Any({{UniS32, S32}, {{UniInVgprS32}, {Vgpr32}}}, !hasSALUFloat);
using namespace Intrinsic;
+ addRulesForIOpcs({amdgcn_s_getpc}).Any({{UniS64, _}, {{Sgpr64}, {None}}});
+
// This is an "intrinsic lane mask"; it was set to i32/i64 in llvm-ir.
addRulesForIOpcs({amdgcn_end_cf}).Any({{_, S32}, {{}, {None, Sgpr32}}});
addRulesForIOpcs({amdgcn_if_break}, Standard)
.Uni(S32, {{Sgpr32}, {IntrId, Vcc, Sgpr32}});
+ addRulesForIOpcs({amdgcn_mbcnt_lo, amdgcn_mbcnt_hi}, Standard)
+ .Div(S32, {{}, {Vgpr32, None, Vgpr32, Vgpr32}});
+
+ addRulesForIOpcs({amdgcn_readfirstlane})
+ .Any({{UniS32, _, DivS32}, {{}, {Sgpr32, None, Vgpr32}}});
+
} // end initialize rules
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
index 8280751e1dbdd..6bde7f2cd676d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
@@ -50,16 +50,19 @@ enum UniformityLLTOpPredicateID {
DivS64,
// pointers
+ P0,
P1,
P3,
P4,
P5,
+ UniP0,
UniP1,
UniP3,
UniP4,
UniP5,
+ DivP0,
DivP1,
DivP3,
DivP4,
@@ -124,6 +127,7 @@ enum RegBankLLTMappingApplyID {
// vgpr scalars, pointers, vectors and B-types
Vgpr32,
Vgpr64,
+ VgprP0,
VgprP1,
VgprP3,
VgprP4,
@@ -162,6 +166,7 @@ enum RegBankLLTMappingApplyID {
// vgpr. Lower it to two S32 vgpr ANDs.
enum LoweringMethodID {
DoNotLower,
+ VccExtToSel,
UniExtToSel,
VgprToVccCopy,
SplitTo32,
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll
index c5ded11c7d323..65c96a3db5bbf 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -new-reg-bank-select -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
; Divergent phis that don't require lowering using lane mask merging
@@ -101,27 +101,23 @@ define void @divergent_i1_phi_used_inside_loop(float %val, ptr %addr) {
; GFX10-LABEL: divergent_i1_phi_used_inside_loop:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s5, 0
-; GFX10-NEXT: v_mov_b32_e32 v3, 1
-; GFX10-NEXT: v_mov_b32_e32 v4, s5
-; GFX10-NEXT: ; implicit-def: $sgpr6
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: s_mov_b32 s5, 1
+; GFX10-NEXT: s_mov_b32 s6, 0
; GFX10-NEXT: .LBB2_1: ; %loop
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_xor_b32_e32 v3, 1, v3
-; GFX10-NEXT: v_cvt_f32_u32_e32 v5, v4
-; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v4
-; GFX10-NEXT: v_and_b32_e32 v6, 1, v3
-; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v5, v0
-; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, v6
-; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX10-NEXT: s_andn2_b32 s6, s6, exec_lo
-; GFX10-NEXT: s_and_b32 s4, exec_lo, s4
-; GFX10-NEXT: s_or_b32 s6, s6, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT: v_cvt_f32_u32_e32 v3, s6
+; GFX10-NEXT: s_xor_b32 s5, s5, 1
+; GFX10-NEXT: s_add_i32 s6, s6, 1
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v0
+; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB2_1
; GFX10-NEXT: ; %bb.2: ; %exit
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s6
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cmp_lg_u32 s5, 0
+; GFX10-NEXT: s_cselect_b32 s4, exec_lo, 0
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s4
; GFX10-NEXT: flat_store_dword v[1:2], v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -147,29 +143,25 @@ define void @divergent_i1_phi_used_inside_loop_bigger_loop_body(float %val, floa
; GFX10-LABEL: divergent_i1_phi_used_inside_loop_bigger_loop_body:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: v_cmp_lt_f32_e64 s5, 1.0, v1
+; GFX10-NEXT: v_cmp_lt_f32_e64 s4, 1.0, v1
; GFX10-NEXT: v_mov_b32_e32 v1, 0x3e8
-; GFX10-NEXT: v_mov_b32_e32 v8, s4
-; GFX10-NEXT: ; implicit-def: $sgpr6
+; GFX10-NEXT: s_mov_b32 s5, 0
+; GFX10-NEXT: s_mov_b32 s6, 0
; GFX10-NEXT: s_branch .LBB3_2
; GFX10-NEXT: .LBB3_1: ; %loop_body
; GFX10-NEXT: ; in Loop: Header=BB3_2 Depth=1
-; GFX10-NEXT: v_cvt_f32_u32_e32 v9, v8
-; GFX10-NEXT: s_xor_b32 s5, s5, -1
-; GFX10-NEXT: v_add_nc_u32_e32 v8, 1, v8
-; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v9, v0
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 s6, s6, exec_lo
-; GFX10-NEXT: s_and_b32 s7, exec_lo, s5
-; GFX10-NEXT: s_or_b32 s6, s6, s7
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: v_cvt_f32_u32_e32 v8, s6
+; GFX10-NEXT: s_xor_b32 s4, s4, exec_lo
+; GFX10-NEXT: s_add_i32 s6, s6, 1
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v8, v0
+; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execz .LBB3_6
; GFX10-NEXT: .LBB3_2: ; %loop_start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_cmpk_le_i32 s6, 0x3e8
; GFX10-NEXT: s_mov_b32 s7, 1
-; GFX10-NEXT: v_cmp_ge_i32_e32 vcc_lo, 0x3e8, v8
-; GFX10-NEXT: s_cbranch_vccz .LBB3_4
+; GFX10-NEXT: s_cbranch_scc0 .LBB3_4
; GFX10-NEXT: ; %bb.3: ; %else
; GFX10-NEXT: ; in Loop: Header=BB3_2 Depth=1
; GFX10-NEXT: s_mov_b32 s7, 0
@@ -177,7 +169,6 @@ define void @divergent_i1_phi_used_inside_loop_bigger_loop_body(float %val, floa
; GFX10-NEXT: .LBB3_4: ; %Flow
; GFX10-NEXT: ; in Loop: Header=BB3_2 Depth=1
; GFX10-NEXT: s_xor_b32 s7, s7, 1
-; GFX10-NEXT: s_and_b32 s7, s7, 1
; GFX10-NEXT: s_cmp_lg_u32 s7, 0
; GFX10-NEXT: s_cbranch_scc1 .LBB3_1
; GFX10-NEXT: ; %bb.5: ; %if
@@ -185,8 +176,8 @@ define void @divergent_i1_phi_used_inside_loop_bigger_loop_body(float %val, floa
; GFX10-NEXT: flat_store_dword v[4:5], v1
; GFX10-NEXT: s_branch .LBB3_1
; GFX10-NEXT: .LBB3_6: ; %exit
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s6
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s4
; GFX10-NEXT: flat_store_dword v[2:3], v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -234,45 +225,47 @@ define amdgpu_cs void @single_lane_execution_attribute(i32 inreg %.userdata0, <3
; GFX10-NEXT: s_mov_b32 s1, 0
; GFX10-NEXT: v_mbcnt_lo_u32_b32 v1, -1, 0
; GFX10-NEXT: s_or_b64 s[12:13], s[4:5], s[0:1]
-; GFX10-NEXT: s_mov_b32 s3, -1
; GFX10-NEXT: s_load_dwordx8 s[4:11], s[12:13], 0x0
; GFX10-NEXT: v_mbcnt_hi_u32_b32 v1, -1, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v1
-; GFX10-NEXT: v_xor_b32_e32 v3, 1, v1
-; GFX10-NEXT: v_and_b32_e32 v3, 1, v3
+; GFX10-NEXT: v_and_b32_e32 v3, 1, v1
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3
-; GFX10-NEXT: ; implicit-def: $vgpr3
+; GFX10-NEXT: s_xor_b32 s3, vcc_lo, exec_lo
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen
+; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s3
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v2
; GFX10-NEXT: s_cbranch_vccnz .LBB4_4
; GFX10-NEXT: ; %bb.1: ; %.preheader.preheader
-; GFX10-NEXT: v_mov_b32_e32 v3, s1
-; GFX10-NEXT: v_mov_b32_e32 v4, s1
+; GFX10-NEXT: s_mov_b32 s3, 0
; GFX10-NEXT: .LBB4_2: ; %.preheader
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: buffer_load_dword v5, v3, s[4:7], 0 offen
+; GFX10-NEXT: v_mov_b32_e32 v3, s1
; GFX10-NEXT: v_add_nc_u32_e32 v1, -1, v1
-; GFX10-NEXT: v_add_nc_u32_e32 v3, 4, v3
+; GFX10-NEXT: s_add_i32 s1, s1, 4
+; GFX10-NEXT: buffer_load_dword v3, v3, s[4:7], 0 offen
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_add_nc_u32_e32 v4, v5, v4
+; GFX10-NEXT: v_readfirstlane_b32 s12, v3
+; GFX10-NEXT: s_add_i32 s3, s12, s3
; GFX10-NEXT: s_cbranch_vccnz .LBB4_2
; GFX10-NEXT: ; %bb.3: ; %.preheader._crit_edge
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
-; GFX10-NEXT: s_mov_b32 s3, 0
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s3, v2
; GFX10-NEXT: s_or_b32 s1, s0, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s1
-; GFX10-NEXT: .LBB4_4: ; %Flow
-; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s3
+; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s1
+; GFX10-NEXT: s_branch .LBB4_6
+; GFX10-NEXT: .LBB4_4:
+; GFX10-NEXT: s_mov_b32 s1, exec_lo
+; GFX10-NEXT: ; implicit-def: $vgpr1
+; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s1
; GFX10-NEXT: s_cbranch_vccz .LBB4_6
; GFX10-NEXT: ; %bb.5: ; %.19
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
-; GFX10-NEXT: v_or_b32_e32 v3, 2, v1
+; GFX10-NEXT: v_or_b32_e32 v1, 2, v1
; GFX10-NEXT: .LBB4_6: ; %.22
; GFX10-NEXT: v_add_lshl_u32 v0, v0, s2, 2
-; GFX10-NEXT: buffer_store_dword v3, v0, s[8:11], 0 offen
+; GFX10-NEXT: buffer_store_dword v1, v0, s[8:11], 0 offen
; GFX10-NEXT: s_endpgm
.entry:
%.0 = call i64 @llvm.amdgcn.s.getpc()
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll
index 45a1b25f12ff1..b902c23a3982e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -new-reg-bank-select -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
; This file contains various tests that have divergent i1s used outside of
; the loop. These are lane masks in sgpr and need to have correct values in
@@ -14,31 +14,28 @@ define void @divergent_i1_phi_used_outside_loop(float %val, float %pre.cond.val,
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, 1.0, v1
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: v_mov_b32_e32 v1, s4
; GFX10-NEXT: s_andn2_b32 s5, s4, exec_lo
+; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: s_and_b32 s6, exec_lo, vcc_lo
; GFX10-NEXT: s_or_b32 s6, s5, s6
-; GFX10-NEXT: ; implicit-def: $sgpr5
+; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB0_1: ; %loop
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_cvt_f32_u32_e32 v...
[truncated]
``````````
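
To illustrate the new `VccExtToSel` lowering added in AMDGPURegBankLegalizeHelper.cpp: a `G_ZEXT`/`G_SEXT` of a divergent s1 (a VCC lane mask) is rewritten as a select between constants. A condensed sketch of the S32/S16 path from the hunk above, with the resulting generic MIR noted in comments (the register names in the comments are illustrative, not from the patch):

```cpp
// %dst:vgpr(s32) = G_ZEXT %src:vcc(s1)
// becomes
// %dst:vgpr(s32) = G_SELECT %src:vcc(s1), 1, 0   (-1 instead of 1 for G_SEXT)
auto True = B.buildConstant({VgprRB, Ty}, Opc == G_SEXT ? -1 : 1);
auto False = B.buildConstant({VgprRB, Ty}, 0);
B.buildSelect(MI.getOperand(0).getReg(), Src, True, False);
```

The S64 case in the patch performs the same select on the low 32 bits, then merges in either the select result (sign extension) or zero (zero extension) as the high half.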
https://github.com/llvm/llvm-project/pull/128702
More information about the llvm-branch-commits
mailing list