[llvm] AMDGPU/GlobalISel: Fix inst-selection of ballot (PR #109986)
Petar Avramovic via llvm-commits
llvm-commits at lists.llvm.org
Thu Oct 10 03:55:17 PDT 2024
https://github.com/petar-avramovic updated https://github.com/llvm/llvm-project/pull/109986
>From 0c0e21bf407bb4616e7283befec8ac0aec361ee3 Mon Sep 17 00:00:00 2001
From: Petar Avramovic <Petar.Avramovic at amd.com>
Date: Thu, 10 Oct 2024 12:54:52 +0200
Subject: [PATCH] AMDGPU/GlobalISel: Fix inst-selection of ballot
Both the input and the output of ballot are lane masks:
the result is a lane mask with S32/S64 LLT and SGPR bank,
the input is a lane mask with S1 LLT and VCC reg bank.
Ballot copies the bits from the input lane mask for all
active lanes and sets 0 for inactive lanes.
GlobalISel did not set 0 in the result for inactive lanes
when the input was not a constant.
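
For reference, the simplest shape of the affected case, mirroring the
non_compare test updated below (wave32):

  define amdgpu_cs i32 @non_compare(i32 %x) {
    %trunc = trunc i32 %x to i1
    ; Previously this selected a bare v_cmp writing the result SGPR;
    ; with this change the i1 lane mask is ANDed with exec, so bits of
    ; inactive lanes are 0:
    ;   v_cmp_ne_u32_e32 vcc_lo, 0, v0
    ;   s_and_b32        s0, vcc_lo, exec_lo
    %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %trunc)
    ret i32 %ballot
  }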
---
llvm/docs/AMDGPUUsage.rst | 6 ++
llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 2 +
.../AMDGPU/AMDGPUInstructionSelector.cpp | 97 ++++++++++++++-----
.../GlobalISel/llvm.amdgcn.ballot.i32.ll | 92 +++++++++++++++++-
.../GlobalISel/llvm.amdgcn.ballot.i64.ll | 60 +++++++++++-
.../CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll | 77 ++++++++++++++-
.../CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll | 47 +++++++++
.../AMDGPU/llvm.amdgcn.ballot.i64.wave32.ll | 20 ++--
8 files changed, 360 insertions(+), 41 deletions(-)
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 6ff3272422fe95..aba39762861dd8 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -1369,6 +1369,12 @@ The AMDGPU backend implements the following LLVM IR intrinsics.
sign-extended from the width of the underlying PC hardware register even on
processors where the s_getpc_b64 instruction returns a zero-extended value.
+ llvm.amdgcn.ballot Returns a bitfield (i32 or i64) containing the result of its i1 argument
+ in all active lanes, and zero in all inactive lanes.
+ Provides a way to convert an i1 in LLVM IR to an i32 or i64 lane mask - the
+ bitfield used by hardware to control active lanes when placed in the EXEC register.
+ For example, ballot(i1 true) returns the EXEC mask.
+
============================================== ==========================================================
.. TODO::
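
As a usage sketch for the llvm.amdgcn.ballot entry documented above
(hypothetical function, for illustration only), the lane mask returned by
ballot composes naturally with ctpop to count the active lanes for which a
condition holds:

  declare i64 @llvm.amdgcn.ballot.i64(i1)
  declare i64 @llvm.ctpop.i64(i64)

  define amdgpu_cs i64 @count_matching_lanes(i32 %v) {
    %cond = icmp eq i32 %v, 42                          ; per-lane i1
    %mask = call i64 @llvm.amdgcn.ballot.i64(i1 %cond)  ; lane mask, 0 in inactive lanes
    %cnt = call i64 @llvm.ctpop.i64(i64 %mask)          ; count of active lanes where %cond is true
    ret i64 %cnt
  }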
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 2738eb77b675ab..715f2cc917e21c 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2086,6 +2086,8 @@ def int_amdgcn_fcmp :
[IntrNoMem, IntrConvergent,
ImmArg<ArgIndex<2>>, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
+// Returns a bitfield (i32 or i64) containing the result of its i1 argument
+// in all active lanes, and zero in all inactive lanes.
def int_amdgcn_ballot :
Intrinsic<[llvm_anyint_ty], [llvm_i1_ty],
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 5be0a049cc5827..f4ac268fbfeffa 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1413,50 +1413,97 @@ bool AMDGPUInstructionSelector::selectIntrinsicCmp(MachineInstr &I) const {
return true;
}
+// Ballot has to zero the bits in the input lane mask that are zero in the
+// current exec; done as an AND with exec. For inputs produced by instructions
+// that implicitly use the same exec, e.g. compares in the same block, use a copy.
+bool isBallotCopy(Register Reg, MachineRegisterInfo &MRI,
+ MachineBasicBlock *MBB) {
+ MachineInstr *MI = MRI.getVRegDef(Reg);
+ // Look through copies, truncs and anyexts. TODO: just copies
+ while (MI->getOpcode() == AMDGPU::COPY ||
+ MI->getOpcode() == AMDGPU::G_TRUNC ||
+ MI->getOpcode() == AMDGPU::G_ANYEXT) {
+ Reg = MI->getOperand(1).getReg();
+ if (!Reg.isVirtual())
+ return false;
+ MI = MRI.getVRegDef(Reg);
+ }
+
+ // Lane mask generated by a compare that uses the same exec.
+ if (isa<GAnyCmp>(MI) && MI->getParent() == MBB)
+ return true;
+
+ Register LHS, RHS;
+ // Look through AND.
+ if (mi_match(Reg, MRI, m_GAnd(m_Reg(LHS), m_Reg(RHS))))
+ return isBallotCopy(LHS, MRI, MBB) || isBallotCopy(RHS, MRI, MBB);
+
+ return false;
+}
+
bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
MachineBasicBlock *BB = I.getParent();
const DebugLoc &DL = I.getDebugLoc();
Register DstReg = I.getOperand(0).getReg();
- const unsigned Size = MRI->getType(DstReg).getSizeInBits();
- const bool Is64 = Size == 64;
- const bool IsWave32 = (STI.getWavefrontSize() == 32);
+ Register SrcReg = I.getOperand(2).getReg();
+ const unsigned BallotSize = MRI->getType(DstReg).getSizeInBits();
+ const unsigned WaveSize = STI.getWavefrontSize();
// In the common case, the return type matches the wave size.
// However we also support emitting i64 ballots in wave32 mode.
- if (Size != STI.getWavefrontSize() && (!Is64 || !IsWave32))
+ if (BallotSize != WaveSize && (BallotSize != 64 || WaveSize != 32))
return false;
std::optional<ValueAndVReg> Arg =
- getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI);
+ getIConstantVRegValWithLookThrough(SrcReg, *MRI);
+
+ Register Dst = DstReg;
+ // i64 ballot on Wave32: use a new wave-size (i32) Dst for the ballot.
+ if (BallotSize != WaveSize) {
+ Dst = MRI->createVirtualRegister(TRI.getBoolRC());
+ }
- const auto BuildCopy = [&](Register SrcReg) {
- if (Size == STI.getWavefrontSize()) {
- BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
- .addReg(SrcReg);
- return;
+ if (Arg) {
+ const int64_t Value = Arg->Value.getZExtValue();
+ if (Value == 0) {
+ // Dst = S_MOV 0
+ unsigned Opcode = WaveSize == 64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
+ BuildMI(*BB, &I, DL, TII.get(Opcode), Dst).addImm(0);
+ } else {
+ // Dst = COPY EXEC
+ assert(Value == 1);
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst).addReg(TRI.getExec());
}
+ if (!RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI))
+ return false;
+ } else {
+ if (isBallotCopy(SrcReg, *MRI, BB)) {
+ // Dst = COPY SrcReg
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst).addReg(SrcReg);
+ if (!RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI))
+ return false;
+ } else {
+ // Dst = S_AND SrcReg, EXEC
+ unsigned AndOpc = WaveSize == 64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
+ auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), Dst)
+ .addReg(SrcReg)
+ .addReg(TRI.getExec())
+ .setOperandDead(3); // Dead scc
+ if (!constrainSelectedInstRegOperands(*And, TII, TRI, RBI))
+ return false;
+ }
+ }
- // If emitting a i64 ballot in wave32, fill the upper bits with zeroes.
+ // i64 ballot on Wave32: zero-extend i32 ballot to i64.
+ if (BallotSize != WaveSize) {
Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg).addImm(0);
BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
- .addReg(SrcReg)
+ .addReg(Dst)
.addImm(AMDGPU::sub0)
.addReg(HiReg)
.addImm(AMDGPU::sub1);
- };
-
- if (Arg) {
- const int64_t Value = Arg->Value.getSExtValue();
- if (Value == 0) {
- unsigned Opcode = Is64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
- BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
- } else if (Value == -1) // all ones
- BuildCopy(IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC);
- else
- return false;
- } else
- BuildCopy(I.getOperand(2).getReg());
+ }
I.eraseFromParent();
return true;
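
A condensed sketch (not verbatim MIR) of what the new selectBallot produces
for each kind of input (wave64 opcodes shown; wave32 uses the _B32 forms and
exec_lo):

  %b0 = call i64 @llvm.amdgcn.ballot.i64(i1 false)  ; S_MOV_B64 Dst, 0
  %b1 = call i64 @llvm.amdgcn.ballot.i64(i1 true)   ; COPY Dst, EXEC
  %b2 = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)   ; %cmp from a compare in the
                                                    ; same block: COPY Dst, %cmp
  %b3 = call i64 @llvm.amdgcn.ballot.i64(i1 %other) ; S_AND_B64 Dst, %other, EXEC
  ; i64 ballot on wave32: the wave-size result above is widened by a
  ; REG_SEQUENCE whose high half is an S_MOV_B32 0.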
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
index 96cab200b61cdb..46a6cde97970d4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -global-isel -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -global-isel -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GFX11 %s
declare i32 @llvm.amdgcn.ballot.i32(i1)
declare i32 @llvm.ctpop.i32(i32)
@@ -33,7 +33,8 @@ define amdgpu_cs i32 @non_compare(i32 %x) {
; CHECK-LABEL: non_compare:
; CHECK: ; %bb.0:
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
-; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, v0
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo
; CHECK-NEXT: ; return to shader part epilog
%trunc = trunc i32 %x to i1
%ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %trunc)
@@ -89,7 +90,8 @@ define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_non_compare(i32 %v) {
; CHECK: ; %bb.0:
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; CHECK-NEXT: s_cmp_eq_u32 vcc_lo, 0
+; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo
+; CHECK-NEXT: s_cmp_eq_u32 s0, 0
; CHECK-NEXT: s_cbranch_scc1 .LBB7_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
@@ -113,6 +115,7 @@ define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_non_compare(i32 inreg %v) {
; CHECK: ; %bb.0:
; CHECK-NEXT: s_and_b32 s0, 1, s0
; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
+; CHECK-NEXT: s_and_b32 s0, s0, exec_lo
; CHECK-NEXT: s_cmp_eq_u32 s0, 0
; CHECK-NEXT: s_cbranch_scc1 .LBB8_2
; CHECK-NEXT: ; %bb.1: ; %true
@@ -137,7 +140,8 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) {
; CHECK: ; %bb.0:
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; CHECK-NEXT: s_cmp_lg_u32 vcc_lo, 0
+; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cbranch_scc0 .LBB9_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
@@ -161,6 +165,7 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_non_compare(i32 inreg %v) {
; CHECK: ; %bb.0:
; CHECK-NEXT: s_and_b32 s0, 1, s0
; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
+; CHECK-NEXT: s_and_b32 s0, s0, exec_lo
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cbranch_scc0 .LBB10_2
; CHECK-NEXT: ; %bb.1: ; %false
@@ -419,3 +424,80 @@ true:
false:
ret i32 33
}
+
+; Input that is neither a constant nor a direct result of a compare.
+; Tests that inactive lanes are set to 0.
+define amdgpu_ps void @non_cst_non_compare_input(ptr addrspace(1) %out, i32 %tid, i32 %cond) {
+; GFX10-LABEL: non_cst_non_compare_input:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_and_b32 s0, 1, s0
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3
+; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
+; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX10-NEXT: s_xor_b32 s1, exec_lo, s1
+; GFX10-NEXT: ; %bb.1: ; %B
+; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 2, v2
+; GFX10-NEXT: s_andn2_b32 s0, s0, exec_lo
+; GFX10-NEXT: ; implicit-def: $vgpr2
+; GFX10-NEXT: s_and_b32 s2, exec_lo, vcc_lo
+; GFX10-NEXT: s_or_b32 s0, s0, s2
+; GFX10-NEXT: ; %bb.2: ; %Flow
+; GFX10-NEXT: s_andn2_saveexec_b32 s1, s1
+; GFX10-NEXT: ; %bb.3: ; %A
+; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 1, v2
+; GFX10-NEXT: s_andn2_b32 s0, s0, exec_lo
+; GFX10-NEXT: s_and_b32 s2, exec_lo, vcc_lo
+; GFX10-NEXT: s_or_b32 s0, s0, s2
+; GFX10-NEXT: ; %bb.4: ; %exit
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX10-NEXT: s_and_b32 s0, s0, exec_lo
+; GFX10-NEXT: v_mov_b32_e32 v2, s0
+; GFX10-NEXT: global_store_dword v[0:1], v2, off
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: non_cst_non_compare_input:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_and_b32 s0, 1, s0
+; GFX11-NEXT: s_mov_b32 s1, exec_lo
+; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
+; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v3
+; GFX11-NEXT: s_xor_b32 s1, exec_lo, s1
+; GFX11-NEXT: ; %bb.1: ; %B
+; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 2, v2
+; GFX11-NEXT: s_and_not1_b32 s0, s0, exec_lo
+; GFX11-NEXT: ; implicit-def: $vgpr2
+; GFX11-NEXT: s_and_b32 s2, exec_lo, vcc_lo
+; GFX11-NEXT: s_or_b32 s0, s0, s2
+; GFX11-NEXT: ; %bb.2: ; %Flow
+; GFX11-NEXT: s_and_not1_saveexec_b32 s1, s1
+; GFX11-NEXT: ; %bb.3: ; %A
+; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 1, v2
+; GFX11-NEXT: s_and_not1_b32 s0, s0, exec_lo
+; GFX11-NEXT: s_and_b32 s2, exec_lo, vcc_lo
+; GFX11-NEXT: s_or_b32 s0, s0, s2
+; GFX11-NEXT: ; %bb.4: ; %exit
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT: s_and_b32 s0, s0, exec_lo
+; GFX11-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-NEXT: global_store_b32 v[0:1], v2, off
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+entry:
+ %cmp = icmp eq i32 %cond, 0
+ br i1 %cmp, label %A, label %B
+
+A:
+ %val_A = icmp uge i32 %tid, 1
+ br label %exit
+
+B:
+ %val_B = icmp ult i32 %tid, 2
+ br label %exit
+
+exit:
+ %phi = phi i1 [ %val_A, %A ], [ %val_B, %B ]
+ %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %phi)
+ store i32 %ballot, ptr addrspace(1) %out
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll
index a18f843440445c..092a960ed7c561 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll
@@ -34,7 +34,8 @@ define amdgpu_cs i64 @non_compare(i32 %x) {
; CHECK-LABEL: non_compare:
; CHECK: ; %bb.0:
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
-; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v0
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; CHECK-NEXT: s_and_b64 s[0:1], vcc, exec
; CHECK-NEXT: ; return to shader part epilog
%trunc = trunc i32 %x to i1
%ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %trunc)
@@ -92,7 +93,8 @@ define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_non_compare(i32 %v) {
; CHECK: ; %bb.0:
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; CHECK-NEXT: s_cmp_eq_u64 vcc, 0
+; CHECK-NEXT: s_and_b64 s[0:1], vcc, exec
+; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 0
; CHECK-NEXT: s_cbranch_scc1 .LBB7_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
@@ -116,6 +118,7 @@ define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_non_compare(i32 inreg %v) {
; CHECK: ; %bb.0:
; CHECK-NEXT: s_and_b32 s0, 1, s0
; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
+; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], exec
; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 0
; CHECK-NEXT: s_cbranch_scc1 .LBB8_2
; CHECK-NEXT: ; %bb.1: ; %true
@@ -140,7 +143,8 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) {
; CHECK: ; %bb.0:
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; CHECK-NEXT: s_cmp_lg_u64 vcc, 0
+; CHECK-NEXT: s_and_b64 s[0:1], vcc, exec
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cbranch_scc0 .LBB9_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
@@ -164,6 +168,7 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_non_compare(i32 inreg %v) {
; CHECK: ; %bb.0:
; CHECK-NEXT: s_and_b32 s0, 1, s0
; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
+; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], exec
; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cbranch_scc0 .LBB10_2
; CHECK-NEXT: ; %bb.1: ; %false
@@ -422,3 +427,52 @@ true:
false:
ret i32 33
}
+
+; Input that is neither a constant nor a direct result of a compare.
+; Tests that inactive lanes are set to 0.
+define amdgpu_ps void @non_cst_non_compare_input(ptr addrspace(1) %out, i32 %tid, i32 %cond) {
+; CHECK-LABEL: non_cst_non_compare_input:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_and_b32 s0, 1, s0
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
+; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
+; CHECK-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; CHECK-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; CHECK-NEXT: ; %bb.1: ; %B
+; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 2, v2
+; CHECK-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
+; CHECK-NEXT: s_and_b64 s[4:5], exec, vcc
+; CHECK-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
+; CHECK-NEXT: ; implicit-def: $vgpr2
+; CHECK-NEXT: ; %bb.2: ; %Flow
+; CHECK-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3]
+; CHECK-NEXT: ; %bb.3: ; %A
+; CHECK-NEXT: v_cmp_le_u32_e32 vcc, 1, v2
+; CHECK-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
+; CHECK-NEXT: s_and_b64 s[4:5], exec, vcc
+; CHECK-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
+; CHECK-NEXT: ; %bb.4: ; %exit
+; CHECK-NEXT: s_or_b64 exec, exec, s[2:3]
+; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; CHECK-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; CHECK-NEXT: s_endpgm
+entry:
+ %cmp = icmp eq i32 %cond, 0
+ br i1 %cmp, label %A, label %B
+
+A:
+ %val_A = icmp uge i32 %tid, 1
+ br label %exit
+
+B:
+ %val_B = icmp ult i32 %tid, 2
+ br label %exit
+
+exit:
+ %phi = phi i1 [ %val_A, %A ], [ %val_B, %B ]
+ %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %phi)
+ store i64 %ballot, ptr addrspace(1) %out
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll
index 047b35b8c0f9d8..026a8d7da7080b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 < %s | FileCheck %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize32 < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 < %s | FileCheck -check-prefixes=CHECK,GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize32 < %s | FileCheck -check-prefixes=CHECK,GFX11 %s
declare i32 @llvm.amdgcn.ballot.i32(i1)
declare i32 @llvm.ctpop.i32(i32)
@@ -522,3 +522,76 @@ true:
false:
ret i32 33
}
+
+; Input that is neither a constant nor a direct result of a compare.
+; Tests that inactive lanes are set to 0.
+define amdgpu_ps void @non_cst_non_compare_input(ptr addrspace(1) %out, i32 %tid, i32 %cond) {
+; GFX10-LABEL: non_cst_non_compare_input:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3
+; GFX10-NEXT: ; implicit-def: $sgpr0
+; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX10-NEXT: s_xor_b32 s1, exec_lo, s1
+; GFX10-NEXT: ; %bb.1: ; %B
+; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 2, v2
+; GFX10-NEXT: ; implicit-def: $vgpr2
+; GFX10-NEXT: s_and_b32 s0, vcc_lo, exec_lo
+; GFX10-NEXT: ; %bb.2: ; %Flow
+; GFX10-NEXT: s_andn2_saveexec_b32 s1, s1
+; GFX10-NEXT: ; %bb.3: ; %A
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX10-NEXT: s_andn2_b32 s0, s0, exec_lo
+; GFX10-NEXT: s_and_b32 s2, vcc_lo, exec_lo
+; GFX10-NEXT: s_or_b32 s0, s0, s2
+; GFX10-NEXT: ; %bb.4: ; %exit
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v2
+; GFX10-NEXT: v_mov_b32_e32 v2, s0
+; GFX10-NEXT: global_store_dword v[0:1], v2, off
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: non_cst_non_compare_input:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_mov_b32 s1, exec_lo
+; GFX11-NEXT: ; implicit-def: $sgpr0
+; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v3
+; GFX11-NEXT: s_xor_b32 s1, exec_lo, s1
+; GFX11-NEXT: ; %bb.1: ; %B
+; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 2, v2
+; GFX11-NEXT: ; implicit-def: $vgpr2
+; GFX11-NEXT: s_and_b32 s0, vcc_lo, exec_lo
+; GFX11-NEXT: ; %bb.2: ; %Flow
+; GFX11-NEXT: s_and_not1_saveexec_b32 s1, s1
+; GFX11-NEXT: ; %bb.3: ; %A
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX11-NEXT: s_and_not1_b32 s0, s0, exec_lo
+; GFX11-NEXT: s_and_b32 s2, vcc_lo, exec_lo
+; GFX11-NEXT: s_or_b32 s0, s0, s2
+; GFX11-NEXT: ; %bb.4: ; %exit
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, v2
+; GFX11-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-NEXT: global_store_b32 v[0:1], v2, off
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+entry:
+ %cmp = icmp eq i32 %cond, 0
+ br i1 %cmp, label %A, label %B
+
+A:
+ %val_A = icmp uge i32 %tid, 1
+ br label %exit
+
+B:
+ %val_B = icmp ult i32 %tid, 2
+ br label %exit
+
+exit:
+ %phi = phi i1 [ %val_A, %A ], [ %val_B, %B ]
+ %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %phi)
+ store i32 %ballot, ptr addrspace(1) %out
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll
index 61f0f20f057043..c7597e98a6d583 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll
@@ -511,3 +511,50 @@ true:
false:
ret i32 33
}
+
+; Input that is neither a constant nor a direct result of a compare.
+; Tests that inactive lanes are set to 0.
+define amdgpu_ps void @non_cst_non_compare_input(ptr addrspace(1) %out, i32 %tid, i32 %cond) {
+; CHECK-LABEL: non_cst_non_compare_input:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
+; CHECK-NEXT: ; implicit-def: $sgpr0_sgpr1
+; CHECK-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; CHECK-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; CHECK-NEXT: ; %bb.1: ; %B
+; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 2, v2
+; CHECK-NEXT: s_and_b64 s[0:1], vcc, exec
+; CHECK-NEXT: ; implicit-def: $vgpr2
+; CHECK-NEXT: ; %bb.2: ; %Flow
+; CHECK-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3]
+; CHECK-NEXT: ; %bb.3: ; %A
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; CHECK-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
+; CHECK-NEXT: s_and_b64 s[4:5], vcc, exec
+; CHECK-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
+; CHECK-NEXT: ; %bb.4: ; %exit
+; CHECK-NEXT: s_or_b64 exec, exec, s[2:3]
+; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2
+; CHECK-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; CHECK-NEXT: s_endpgm
+entry:
+ %cmp = icmp eq i32 %cond, 0
+ br i1 %cmp, label %A, label %B
+
+A:
+ %val_A = icmp uge i32 %tid, 1
+ br label %exit
+
+B:
+ %val_B = icmp ult i32 %tid, 2
+ br label %exit
+
+exit:
+ %phi = phi i1 [ %val_A, %A ], [ %val_B, %B ]
+ %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %phi)
+ store i64 %ballot, ptr addrspace(1) %out
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.wave32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.wave32.ll
index 5dbfdf24ef36f7..fe69dc49062435 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.wave32.ll
@@ -40,12 +40,20 @@ define amdgpu_cs i64 @constant_true() {
; Test ballot of a non-comparison operation
define amdgpu_cs i64 @non_compare(i32 %x) {
-; CHECK-LABEL: non_compare:
-; CHECK: ; %bb.0:
-; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
-; CHECK-NEXT: s_mov_b32 s1, 0
-; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, v0
-; CHECK-NEXT: ; return to shader part epilog
+; DAGISEL-LABEL: non_compare:
+; DAGISEL: ; %bb.0:
+; DAGISEL-NEXT: v_and_b32_e32 v0, 1, v0
+; DAGISEL-NEXT: s_mov_b32 s1, 0
+; DAGISEL-NEXT: v_cmp_ne_u32_e64 s0, 0, v0
+; DAGISEL-NEXT: ; return to shader part epilog
+;
+; GISEL-LABEL: non_compare:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: v_and_b32_e32 v0, 1, v0
+; GISEL-NEXT: s_mov_b32 s1, 0
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GISEL-NEXT: s_and_b32 s0, vcc_lo, exec_lo
+; GISEL-NEXT: ; return to shader part epilog
%trunc = trunc i32 %x to i1
%ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %trunc)
ret i64 %ballot