[llvm] AMDGPU/GlobalISel: Fix inst-selection of ballot (PR #109986)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Sep 25 07:03:04 PDT 2024
llvmbot wrote:
@llvm/pr-subscribers-backend-amdgpu
Author: Petar Avramovic (petar-avramovic)
Both the input and the output of ballot are lane masks:
the result is a lane mask with S32/S64 LLT and SGPR register bank;
the input is a lane mask with S1 LLT and VCC register bank.
Ballot copies the bits from the input lane mask for all active
lanes and writes 0 for all inactive lanes.
For a non-constant input, GlobalISel did not set the result bits
of inactive lanes to 0.
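
As an illustration (a minimal sketch based on the `non_compare` test
updated below; the expected instructions in the comments are taken from
this patch's wave32 CHECK lines), the ballot of a non-constant i1 must
AND the source lane mask with exec so inactive lanes read as 0, rather
than copying the input lane mask directly:

```llvm
; Ballot of a non-constant i1 in wave32.
define amdgpu_cs i32 @non_compare(i32 %x) {
  ; Selected after this patch as:
  ;   v_and_b32_e32    v0, 1, v0
  ;   v_cmp_ne_u32_e32 vcc_lo, 0, v0
  ;   s_and_b32        s0, vcc_lo, exec_lo  ; zero the inactive-lane bits
  %trunc = trunc i32 %x to i1
  %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %trunc)
  ret i32 %ballot
}
```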
---
Patch is 321.07 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/109986.diff
64 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp (+41-18)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll (+27-9)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll (+27-9)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll (+16)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.make.buffer.rsrc.ll (+2-1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.add.ll (+8-4)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.cmpswap.ll (+16-8)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.fadd.ll (+8-4)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.f16.ll (+6-3)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.ll (+4-2)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.ll (+24-12)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll (+12-6)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll (+8-4)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll (+26-16)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.atomic.add.ll (+4-2)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.atomic.cmpswap.ll (+8-4)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.ll (+8-4)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.format.f16.ll (+4-2)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.format.ll (+2-1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.ll (+12-6)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.store.format.f16.ll (+8-4)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.store.format.f32.ll (+4-2)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.store.ll (+13-8)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.load.f16.ll (+4-2)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.load.ll (+2-1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.store.f16.ll (+12-6)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.store.i8.ll (+12-6)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.store.ll (+10-5)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.f16.ll (+6-3)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.ll (+4-2)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.f16.ll (+18-9)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.i8.ll (+18-9)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.ll (+20-10)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll (+96-48)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.add.ll (+8-4)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.cmpswap.ll (+16-8)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd.ll (+8-4)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.f16.ll (+6-3)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.ll (+4-2)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll (+4-2)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f16.ll (+6-3)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f32.ll (+4-2)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.ll (+4-2)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.atomic.add.ll (+4-2)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.atomic.cmpswap.ll (+8-4)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.ll (+8-4)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.format.f16.ll (+4-2)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.format.ll (+2-1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.ll (+2-1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.store.format.f16.ll (+4-2)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.store.format.f32.ll (+2-1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.store.ll (+2-1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.tbuffer.load.f16.ll (+4-2)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.tbuffer.load.ll (+2-1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.f16.ll (+6-3)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.ll (+4-2)
- (modified) llvm/test/CodeGen/AMDGPU/indirect-call.ll (+22-14)
- (modified) llvm/test/CodeGen/AMDGPU/issue92561-restore-undef-scc-verifier-error.ll (+2-1)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll (+75-2)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll (+47)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.wave32.ll (+83-36)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll (+24-15)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w64.ll (+7-5)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index febf0711c7d574..af6081f34303a3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1429,34 +1429,57 @@ bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
std::optional<ValueAndVReg> Arg =
getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI);
- const auto BuildCopy = [&](Register SrcReg) {
- if (Size == STI.getWavefrontSize()) {
- BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
- .addReg(SrcReg);
- return;
- }
+ const auto BuildAnd = [&](unsigned Opcode, Register Dst, Register Src,
+ Register Exec) {
+ auto And = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst)
+ .addReg(Src)
+ .addReg(Exec)
+ .setOperandDead(3); // Dead scc
+ constrainSelectedInstRegOperands(*And, TII, TRI, RBI);
+ };
- // If emitting a i64 ballot in wave32, fill the upper bits with zeroes.
- Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
- BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg).addImm(0);
- BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
- .addReg(SrcReg)
+ const auto BuildREG_SEQUENCE = [&](Register Dst, Register Lo, Register Hi) {
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
+ .addReg(Lo)
.addImm(AMDGPU::sub0)
- .addReg(HiReg)
+ .addReg(Hi)
.addImm(AMDGPU::sub1);
};
if (Arg) {
- const int64_t Value = Arg->Value.getSExtValue();
+ const int64_t Value = Arg->Value.getZExtValue();
if (Value == 0) {
+ // DstReg(32or64) = S_MOV 0
unsigned Opcode = Is64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
- } else if (Value == -1) // all ones
- BuildCopy(IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC);
- else
+ } else if (Value == 1) {
+ if (Size == STI.getWavefrontSize()) {
+ // DstReg(32or64) = COPY EXEC
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
+ .addReg(TRI.getExec());
+ } else {
+ // DstReg(64) = REG_SEQUENCE EXEC_LO, 0
+ Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg).addImm(0);
+ BuildREG_SEQUENCE(DstReg, TRI.getExec(), HiReg);
+ }
+ } else
return false;
- } else
- BuildCopy(I.getOperand(2).getReg());
+ } else {
+ Register SrcReg = I.getOperand(2).getReg();
+ if (Size == STI.getWavefrontSize()) {
+ // DstReg(32or64) = AND SrcReg, EXEC
+ unsigned AndOpc = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
+ BuildAnd(AndOpc, DstReg, SrcReg, TRI.getExec());
+ } else {
+ // DstReg(64) = REG_SEQUENCE (AND SrcReg(32), EXEC_LO), 0
+ Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ BuildAnd(AMDGPU::S_AND_B32, LoReg, SrcReg, AMDGPU::EXEC_LO);
+ Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg).addImm(0);
+ BuildREG_SEQUENCE(DstReg, LoReg, HiReg);
+ }
+ }
I.eraseFromParent();
return true;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll
index de973481f82308..3d7b715a6dbc19 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll
@@ -138,6 +138,7 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) {
; CHECK-NEXT: s_and_b32 s4, s4, s5
; CHECK-NEXT: v_cmp_eq_u64_e64 s5, s[6:7], v[1:2]
; CHECK-NEXT: s_and_b32 s4, s4, s5
+; CHECK-NEXT: s_and_b32 s4, s4, exec_lo
; CHECK-NEXT: s_and_saveexec_b32 s4, s4
; CHECK-NEXT: v_writelane_b32 v0, s4, 13
; CHECK-NEXT: s_or_saveexec_b32 s21, -1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
index 96cab200b61cdb..f893ac8177b4a3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
@@ -33,7 +33,8 @@ define amdgpu_cs i32 @non_compare(i32 %x) {
; CHECK-LABEL: non_compare:
; CHECK: ; %bb.0:
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
-; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, v0
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo
; CHECK-NEXT: ; return to shader part epilog
%trunc = trunc i32 %x to i1
%ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %trunc)
@@ -45,7 +46,8 @@ define amdgpu_cs i32 @non_compare(i32 %x) {
define amdgpu_cs i32 @compare_ints(i32 %x, i32 %y) {
; CHECK-LABEL: compare_ints:
; CHECK: ; %bb.0:
-; CHECK-NEXT: v_cmp_eq_u32_e64 s0, v0, v1
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo
; CHECK-NEXT: ; return to shader part epilog
%cmp = icmp eq i32 %x, %y
%ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %cmp)
@@ -55,7 +57,8 @@ define amdgpu_cs i32 @compare_ints(i32 %x, i32 %y) {
define amdgpu_cs i32 @compare_int_with_constant(i32 %x) {
; CHECK-LABEL: compare_int_with_constant:
; CHECK: ; %bb.0:
-; CHECK-NEXT: v_cmp_le_i32_e64 s0, 0x63, v0
+; CHECK-NEXT: v_cmp_le_i32_e32 vcc_lo, 0x63, v0
+; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo
; CHECK-NEXT: ; return to shader part epilog
%cmp = icmp sge i32 %x, 99
%ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %cmp)
@@ -65,7 +68,8 @@ define amdgpu_cs i32 @compare_int_with_constant(i32 %x) {
define amdgpu_cs i32 @compare_floats(float %x, float %y) {
; CHECK-LABEL: compare_floats:
; CHECK: ; %bb.0:
-; CHECK-NEXT: v_cmp_gt_f32_e64 s0, v0, v1
+; CHECK-NEXT: v_cmp_gt_f32_e32 vcc_lo, v0, v1
+; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo
; CHECK-NEXT: ; return to shader part epilog
%cmp = fcmp ogt float %x, %y
%ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %cmp)
@@ -76,7 +80,8 @@ define amdgpu_cs i32 @ctpop_of_ballot(float %x, float %y) {
; CHECK-LABEL: ctpop_of_ballot:
; CHECK: ; %bb.0:
; CHECK-NEXT: v_cmp_gt_f32_e32 vcc_lo, v0, v1
-; CHECK-NEXT: s_bcnt1_i32_b32 s0, vcc_lo
+; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo
+; CHECK-NEXT: s_bcnt1_i32_b32 s0, s0
; CHECK-NEXT: ; return to shader part epilog
%cmp = fcmp ogt float %x, %y
%ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %cmp)
@@ -89,7 +94,8 @@ define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_non_compare(i32 %v) {
; CHECK: ; %bb.0:
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; CHECK-NEXT: s_cmp_eq_u32 vcc_lo, 0
+; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo
+; CHECK-NEXT: s_cmp_eq_u32 s0, 0
; CHECK-NEXT: s_cbranch_scc1 .LBB7_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
@@ -113,6 +119,7 @@ define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_non_compare(i32 inreg %v) {
; CHECK: ; %bb.0:
; CHECK-NEXT: s_and_b32 s0, 1, s0
; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
+; CHECK-NEXT: s_and_b32 s0, s0, exec_lo
; CHECK-NEXT: s_cmp_eq_u32 s0, 0
; CHECK-NEXT: s_cbranch_scc1 .LBB8_2
; CHECK-NEXT: ; %bb.1: ; %true
@@ -137,7 +144,8 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) {
; CHECK: ; %bb.0:
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; CHECK-NEXT: s_cmp_lg_u32 vcc_lo, 0
+; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cbranch_scc0 .LBB9_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
@@ -161,6 +169,7 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_non_compare(i32 inreg %v) {
; CHECK: ; %bb.0:
; CHECK-NEXT: s_and_b32 s0, 1, s0
; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
+; CHECK-NEXT: s_and_b32 s0, s0, exec_lo
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cbranch_scc0 .LBB10_2
; CHECK-NEXT: ; %bb.1: ; %false
@@ -184,7 +193,8 @@ define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_compare(i32 %v) {
; CHECK-LABEL: branch_divergent_ballot_ne_zero_compare:
; CHECK: ; %bb.0:
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
-; CHECK-NEXT: s_cmp_eq_u32 vcc_lo, 0
+; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo
+; CHECK-NEXT: s_cmp_eq_u32 s0, 0
; CHECK-NEXT: s_cbranch_scc1 .LBB11_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
@@ -210,6 +220,7 @@ define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_compare(i32 inreg %v) {
; CHECK-NEXT: s_cselect_b32 s0, 1, 0
; CHECK-NEXT: s_and_b32 s0, 1, s0
; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
+; CHECK-NEXT: s_and_b32 s0, s0, exec_lo
; CHECK-NEXT: s_cmp_eq_u32 s0, 0
; CHECK-NEXT: s_cbranch_scc1 .LBB12_2
; CHECK-NEXT: ; %bb.1: ; %true
@@ -233,7 +244,8 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_compare(i32 %v) {
; CHECK-LABEL: branch_divergent_ballot_eq_zero_compare:
; CHECK: ; %bb.0:
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
-; CHECK-NEXT: s_cmp_lg_u32 vcc_lo, 0
+; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cbranch_scc0 .LBB13_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
@@ -259,6 +271,7 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_compare(i32 inreg %v) {
; CHECK-NEXT: s_cselect_b32 s0, 1, 0
; CHECK-NEXT: s_and_b32 s0, 1, s0
; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
+; CHECK-NEXT: s_and_b32 s0, s0, exec_lo
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cbranch_scc0 .LBB14_2
; CHECK-NEXT: ; %bb.1: ; %false
@@ -284,6 +297,7 @@ define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_and(i32 %v1, i32 %v2) {
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
; CHECK-NEXT: v_cmp_lt_u32_e64 s0, 34, v1
; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0
+; CHECK-NEXT: s_and_b32 s0, s0, exec_lo
; CHECK-NEXT: s_cmp_eq_u32 s0, 0
; CHECK-NEXT: s_cbranch_scc1 .LBB15_2
; CHECK-NEXT: ; %bb.1: ; %true
@@ -315,6 +329,7 @@ define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_and(i32 inreg %v1, i32 inreg
; CHECK-NEXT: s_and_b32 s0, s0, s1
; CHECK-NEXT: s_and_b32 s0, 1, s0
; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
+; CHECK-NEXT: s_and_b32 s0, s0, exec_lo
; CHECK-NEXT: s_cmp_eq_u32 s0, 0
; CHECK-NEXT: s_cbranch_scc1 .LBB16_2
; CHECK-NEXT: ; %bb.1: ; %true
@@ -342,6 +357,7 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_and(i32 %v1, i32 %v2) {
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
; CHECK-NEXT: v_cmp_lt_u32_e64 s0, 34, v1
; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0
+; CHECK-NEXT: s_and_b32 s0, s0, exec_lo
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cbranch_scc0 .LBB17_2
; CHECK-NEXT: ; %bb.1: ; %false
@@ -373,6 +389,7 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_and(i32 inreg %v1, i32 inreg
; CHECK-NEXT: s_and_b32 s0, s0, s1
; CHECK-NEXT: s_and_b32 s0, 1, s0
; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
+; CHECK-NEXT: s_and_b32 s0, s0, exec_lo
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cbranch_scc0 .LBB18_2
; CHECK-NEXT: ; %bb.1: ; %false
@@ -401,6 +418,7 @@ define amdgpu_cs i32 @branch_uniform_ballot_sgt_N_compare(i32 inreg %v) {
; CHECK-NEXT: s_cselect_b32 s0, 1, 0
; CHECK-NEXT: s_and_b32 s0, 1, s0
; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
+; CHECK-NEXT: s_and_b32 s0, s0, exec_lo
; CHECK-NEXT: s_cmp_le_i32 s0, 22
; CHECK-NEXT: s_cbranch_scc1 .LBB19_2
; CHECK-NEXT: ; %bb.1: ; %true
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll
index a18f843440445c..72b65f087bdad5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll
@@ -34,7 +34,8 @@ define amdgpu_cs i64 @non_compare(i32 %x) {
; CHECK-LABEL: non_compare:
; CHECK: ; %bb.0:
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
-; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v0
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; CHECK-NEXT: s_and_b64 s[0:1], vcc, exec
; CHECK-NEXT: ; return to shader part epilog
%trunc = trunc i32 %x to i1
%ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %trunc)
@@ -46,7 +47,8 @@ define amdgpu_cs i64 @non_compare(i32 %x) {
define amdgpu_cs i64 @compare_ints(i32 %x, i32 %y) {
; CHECK-LABEL: compare_ints:
; CHECK: ; %bb.0:
-; CHECK-NEXT: v_cmp_eq_u32_e64 s[0:1], v0, v1
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; CHECK-NEXT: s_and_b64 s[0:1], vcc, exec
; CHECK-NEXT: ; return to shader part epilog
%cmp = icmp eq i32 %x, %y
%ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
@@ -57,7 +59,8 @@ define amdgpu_cs i64 @compare_int_with_constant(i32 %x) {
; CHECK-LABEL: compare_int_with_constant:
; CHECK: ; %bb.0:
; CHECK-NEXT: v_mov_b32_e32 v1, 0x63
-; CHECK-NEXT: v_cmp_ge_i32_e64 s[0:1], v0, v1
+; CHECK-NEXT: v_cmp_ge_i32_e32 vcc, v0, v1
+; CHECK-NEXT: s_and_b64 s[0:1], vcc, exec
; CHECK-NEXT: ; return to shader part epilog
%cmp = icmp sge i32 %x, 99
%ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
@@ -67,7 +70,8 @@ define amdgpu_cs i64 @compare_int_with_constant(i32 %x) {
define amdgpu_cs i64 @compare_floats(float %x, float %y) {
; CHECK-LABEL: compare_floats:
; CHECK: ; %bb.0:
-; CHECK-NEXT: v_cmp_gt_f32_e64 s[0:1], v0, v1
+; CHECK-NEXT: v_cmp_gt_f32_e32 vcc, v0, v1
+; CHECK-NEXT: s_and_b64 s[0:1], vcc, exec
; CHECK-NEXT: ; return to shader part epilog
%cmp = fcmp ogt float %x, %y
%ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
@@ -78,7 +82,8 @@ define amdgpu_cs i64 @ctpop_of_ballot(float %x, float %y) {
; CHECK-LABEL: ctpop_of_ballot:
; CHECK: ; %bb.0:
; CHECK-NEXT: v_cmp_gt_f32_e32 vcc, v0, v1
-; CHECK-NEXT: s_bcnt1_i32_b64 s0, vcc
+; CHECK-NEXT: s_and_b64 s[0:1], vcc, exec
+; CHECK-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
; CHECK-NEXT: s_mov_b32 s1, 0
; CHECK-NEXT: ; return to shader part epilog
%cmp = fcmp ogt float %x, %y
@@ -92,7 +97,8 @@ define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_non_compare(i32 %v) {
; CHECK: ; %bb.0:
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; CHECK-NEXT: s_cmp_eq_u64 vcc, 0
+; CHECK-NEXT: s_and_b64 s[0:1], vcc, exec
+; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 0
; CHECK-NEXT: s_cbranch_scc1 .LBB7_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
@@ -116,6 +122,7 @@ define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_non_compare(i32 inreg %v) {
; CHECK: ; %bb.0:
; CHECK-NEXT: s_and_b32 s0, 1, s0
; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
+; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], exec
; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 0
; CHECK-NEXT: s_cbranch_scc1 .LBB8_2
; CHECK-NEXT: ; %bb.1: ; %true
@@ -140,7 +147,8 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) {
; CHECK: ; %bb.0:
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; CHECK-NEXT: s_cmp_lg_u64 vcc, 0
+; CHECK-NEXT: s_and_b64 s[0:1], vcc, exec
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cbranch_scc0 .LBB9_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
@@ -164,6 +172,7 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_non_compare(i32 inreg %v) {
; CHECK: ; %bb.0:
; CHECK-NEXT: s_and_b32 s0, 1, s0
; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
+; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], exec
; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cbranch_scc0 .LBB10_2
; CHECK-NEXT: ; %bb.1: ; %false
@@ -187,7 +196,8 @@ define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_compare(i32 %v) {
; CHECK-LABEL: branch_divergent_ballot_ne_zero_compare:
; CHECK: ; %bb.0:
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 12, v0
-; CHECK-NEXT: s_cmp_eq_u64 vcc, 0
+; CHECK-NEXT: s_and_b64 s[0:1], vcc, exec
+; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 0
; CHECK-NEXT: s_cbranch_scc1 .LBB11_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
@@ -213,6 +223,7 @@ define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_compare(i32 inreg %v) {
; CHECK-NEXT: s_cselect_b32 s0, 1, 0
; CHECK-NEXT: s_and_b32 s0, 1, s0
; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
+; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], exec
; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 0
; CHECK-NEXT: s_cbranch_scc1 .LBB12_2
; CHECK-NEXT: ; %bb.1: ; %true
@@ -236,7 +247,8 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_compare(i32 %v) {
; CHECK-LABEL: branch_divergent_ballot_eq_zero_compare:
; CHECK: ; %bb.0:
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 12, v0
-; CHECK-NEXT: s_cmp_lg_u64 vcc, 0
+; CHECK-NEXT: s_and_b64 s[0:1], vcc, exec
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cbranch_scc0 .LBB13_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
@@ -262,6 +274,7 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_compare(i32 inreg %v) {
; CHECK-NEXT: s_cselect_b32 s0, 1, 0
; CHECK-NEXT: s_and_b32 s0, 1, s0
; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
+; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], exec
; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cbranch_scc0 .LBB14_2
; CHECK-NEXT: ; %bb.1: ; %false
@@ -287,6 +300,7 @@ define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_and(i32 %v1, i32 %v2) {
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 12, v0
; CHECK-NEXT: v_cmp_lt_u32_e64 s[0:1], 34, v1
; CHECK-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
+; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], exec
; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 0
; CHECK-NEXT: s_cbranch_scc1 .LBB15_2
; CHECK-NEXT: ; %bb.1: ; %true
@@ -318,6 +332,7 @@ define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_and(i32 inreg %v1, i32 inreg
; CHECK-NEXT: s_and_b32 s0, s0, s1
; CHECK-NEXT: s_and_b32 s0, 1, s0
; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
+; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], exec
; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 0
; CHECK-NEXT: s_cbranch_scc1 .LBB16_2
; CHECK-NEXT: ; %bb.1: ; %true
@@ -345,6 +360,7 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_and(i32 %v1, i32 %v2) {
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 12, v0
; CHECK-NEXT: v_cmp_lt_u32_e64 s[0:1], 34, v1
; CHECK-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
+; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], exec
; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cbranch_scc0 .LBB17_2
; CHECK-NEXT: ; %bb.1: ; %false
@@ -376,6 +392,7 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_and(i32 inreg %v1, i32 inreg
; CHECK-NEXT: s_and_b32 s0, s0, s1
; CHECK-NEXT: s_and_b32 s0, 1, s0
; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
+; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], exec
; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cbranch_scc0 .LBB18_2
; CHECK-NEXT: ; %bb.1: ; %false
@@ -404,6 +421,7 @@ define amdgpu_cs i32 @branch_uniform_ballot_sgt_N_compare(i32 inreg %v) {
; CHECK-NEXT: s_cselect_b32 s0, 1, 0
; CHECK-NEXT: s_and_b32 s0, 1, s0
; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
+; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], exec
; CHECK-NEXT: v_cmp_le_i64_e64 vcc, s[0:1], 22
; CHECK-NEXT: s_cbranch_vccnz .LBB19_2
; CHECK-NEXT: ; %bb.1: ; %true
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
index 614f59c564df64..4593607112ba45 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
@@ -166,6 +166,7 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_vgpr_descr(i32 %node_ptr,
; GFX1030-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[19:20]
; GFX1030-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[13:14]
; GFX1030-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX1030-NEXT: s_and...
[truncated]
``````````
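For the remaining non-constant case handled by the patch, an i64 ballot
in wave32 (a minimal sketch; the function name is illustrative, cf.
llvm.amdgcn.ballot.i64.wave32.ll in the file list above), the low half
is the exec-masked source lane mask and the high half is zeroed, then
both are combined with a REG_SEQUENCE:

```llvm
; i64 ballot in wave32: low 32 bits hold the exec-masked lane mask,
; high 32 bits are 0, i.e.
;   DstReg(64) = REG_SEQUENCE (AND SrcReg(32), EXEC_LO), 0
define amdgpu_cs i64 @ballot_i64_in_wave32(i32 %x) {
  %trunc = trunc i32 %x to i1
  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %trunc)
  ret i64 %ballot
}
```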
https://github.com/llvm/llvm-project/pull/109986