[llvm] cfa2d0a - [AMDGPU] Wave32 CodeGen for amdgcn.ballot.i64
via llvm-commits
llvm-commits at lists.llvm.org
Thu Jul 13 06:21:03 PDT 2023
Author: pvanhout
Date: 2023-07-13T15:20:58+02:00
New Revision: cfa2d0a3aa0beb5422107dc9943cb0eae6d93896
URL: https://github.com/llvm/llvm-project/commit/cfa2d0a3aa0beb5422107dc9943cb0eae6d93896
DIFF: https://github.com/llvm/llvm-project/commit/cfa2d0a3aa0beb5422107dc9943cb0eae6d93896.diff
LOG: [AMDGPU] Wave32 CodeGen for amdgcn.ballot.i64
A recent addition to the device libs, `__ockl_dm_trim`, caused a series of
failures at O0 due to an i64 ballot intrinsic being inlined into a wave32 function.
The quick fix for this is to support codegen for this rare case.
A proper long-term fix for this type of issue is still being discussed.
Fixes SWDEV-408929, SWDEV-408957, SWDEV-409885, SWDEV-410193
Reviewed By: #amdgpu, arsenm
Differential Revision: https://reviews.llvm.org/D155050
Added:
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.wave32.ll
Modified:
llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
llvm/lib/Target/AMDGPU/VOPCInstructions.td
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index c00095eb82638f..2aa9d9a7f04eae 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1326,27 +1326,45 @@ bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
Register DstReg = I.getOperand(0).getReg();
const unsigned Size = MRI->getType(DstReg).getSizeInBits();
const bool Is64 = Size == 64;
+ const bool IsWave32 = (STI.getWavefrontSize() == 32);
- if (Size != STI.getWavefrontSize())
+ // In the common case, the return type matches the wave size.
+ // However we also support emitting i64 ballots in wave32 mode.
+ if (Size != STI.getWavefrontSize() && (!Is64 || !IsWave32))
return false;
std::optional<ValueAndVReg> Arg =
getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI);
+ // Write SrcReg into DstReg; an i64 result in wave32 gets a zeroed high half.
+ const auto BuildCopy = [&](Register SrcReg) {
+ if (Size == STI.getWavefrontSize()) {
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
+ .addReg(SrcReg);
+ return;
+ }
+
+ // If emitting an i64 ballot in wave32, fill the upper bits with zeroes.
+ Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg).addImm(0);
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
+ .addReg(SrcReg)
+ .addImm(AMDGPU::sub0)
+ .addReg(HiReg)
+ .addImm(AMDGPU::sub1);
+ };
+
if (Arg) {
const int64_t Value = Arg->Value.getSExtValue();
if (Value == 0) {
unsigned Opcode = Is64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
- } else if (Value == -1) { // all ones
- Register SrcReg = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
- BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(SrcReg);
- } else
+ } else if (Value == -1) // all ones
+ BuildCopy(IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC);
+ else
return false;
- } else {
- Register SrcReg = I.getOperand(2).getReg();
- BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(SrcReg);
- }
+ } else
+ BuildCopy(I.getOperand(2).getReg());
I.eraseFromParent();
return true;
diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
index 9e11775fa9f08e..f1c14460e891fc 100644
--- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
@@ -992,11 +992,18 @@ multiclass ICMP_Pattern <PatFrags cond, Instruction inst, ValueType vt> {
(i64 (COPY_TO_REGCLASS (inst $src0, $src1), SReg_64))
>;
- let WaveSizePredicate = isWave32 in
- def : GCNPat <
- (i32 (AMDGPUsetcc vt:$src0, vt:$src1, cond)),
- (i32 (COPY_TO_REGCLASS (inst $src0, $src1), SReg_32))
- >;
+ let WaveSizePredicate = isWave32 in {
+ def : GCNPat <
+ (i32 (AMDGPUsetcc vt:$src0, vt:$src1, cond)),
+ (i32 (COPY_TO_REGCLASS (inst $src0, $src1), SReg_32))
+ >;
+
+ // Support codegen of i64 setcc in wave32 mode.
+ def : GCNPat <
+ (i64 (AMDGPUsetcc vt:$src0, vt:$src1, cond)),
+ (i64 (REG_SEQUENCE SReg_64, (inst $src0, $src1), sub0, (S_MOV_B32 (i32 0)), sub1))
+ >;
+ }
}
defm : ICMP_Pattern <COND_EQ, V_CMP_EQ_U32_e64, i32>;
@@ -1056,13 +1063,22 @@ multiclass FCMP_Pattern <PatFrags cond, Instruction inst, ValueType vt> {
DSTCLAMP.NONE), SReg_64))
>;
- let WaveSizePredicate = isWave32 in
- def : GCNPat <
- (i32 (AMDGPUsetcc (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)),
- (vt (VOP3Mods vt:$src1, i32:$src1_modifiers)), cond)),
- (i32 (COPY_TO_REGCLASS (inst $src0_modifiers, $src0, $src1_modifiers, $src1,
- DSTCLAMP.NONE), SReg_32))
- >;
+ let WaveSizePredicate = isWave32 in {
+ def : GCNPat <
+ (i32 (AMDGPUsetcc (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)),
+ (vt (VOP3Mods vt:$src1, i32:$src1_modifiers)), cond)),
+ (i32 (COPY_TO_REGCLASS (inst $src0_modifiers, $src0, $src1_modifiers, $src1,
+ DSTCLAMP.NONE), SReg_32))
+ >;
+
+ def : GCNPat <
+ (i64 (AMDGPUsetcc (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)),
+ (vt (VOP3Mods vt:$src1, i32:$src1_modifiers)), cond)),
+ (i64 (REG_SEQUENCE SReg_64, (inst $src0_modifiers, $src0, $src1_modifiers, $src1,
+ DSTCLAMP.NONE), sub0,
+ (S_MOV_B32 (i32 0)), sub1))
+ >;
+ }
}
defm : FCMP_Pattern <COND_OEQ, V_CMP_EQ_F32_e64, f32>;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.wave32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.wave32.ll
new file mode 100644
index 00000000000000..109a345d7a2c88
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.wave32.ll
@@ -0,0 +1,106 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -global-isel=0 -mcpu=gfx1010 < %s | FileCheck %s --check-prefixes=CHECK,DAGISEL
+; RUN: llc -march=amdgcn -global-isel=0 -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck %s --check-prefixes=CHECK,DAGISEL
+; RUN: llc -march=amdgcn -global-isel -mcpu=gfx1010 < %s | FileCheck %s --check-prefixes=CHECK,GISEL
+; RUN: llc -march=amdgcn -global-isel -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck %s --check-prefixes=CHECK,GISEL
+
+declare i64 @llvm.amdgcn.ballot.i64(i1)
+declare i64 @llvm.ctpop.i64(i64)
+
+; Test ballot(0)
+
+define amdgpu_cs i64 @constant_false() {
+; CHECK-LABEL: constant_false:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_mov_b32 s0, 0
+; CHECK-NEXT: s_mov_b32 s1, 0
+; CHECK-NEXT: ; return to shader part epilog
+ %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 0)
+ ret i64 %ballot
+}
+
+; Test ballot(1)
+
+define amdgpu_cs i64 @constant_true() {
+; DAGISEL-LABEL: constant_true:
+; DAGISEL: ; %bb.0:
+; DAGISEL-NEXT: s_mov_b32 s0, exec_lo
+; DAGISEL-NEXT: s_mov_b32 s1, exec_hi
+; DAGISEL-NEXT: ; return to shader part epilog
+;
+; GISEL-LABEL: constant_true:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GISEL-NEXT: s_mov_b32 s1, 0
+; GISEL-NEXT: ; return to shader part epilog
+ %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 1)
+ ret i64 %ballot
+}
+
+; Test ballot of a non-comparison operation
+
+define amdgpu_cs i64 @non_compare(i32 %x) {
+; CHECK-LABEL: non_compare:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
+; CHECK-NEXT: s_mov_b32 s1, 0
+; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, v0
+; CHECK-NEXT: ; return to shader part epilog
+ %trunc = trunc i32 %x to i1
+ %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %trunc)
+ ret i64 %ballot
+}
+
+; Test ballot of comparisons
+
+define amdgpu_cs i64 @compare_ints(i32 %x, i32 %y) {
+; CHECK-LABEL: compare_ints:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: v_cmp_eq_u32_e64 s0, v0, v1
+; CHECK-NEXT: s_mov_b32 s1, 0
+; CHECK-NEXT: ; return to shader part epilog
+ %cmp = icmp eq i32 %x, %y
+ %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
+ ret i64 %ballot
+}
+
+define amdgpu_cs i64 @compare_int_with_constant(i32 %x) {
+; DAGISEL-LABEL: compare_int_with_constant:
+; DAGISEL: ; %bb.0:
+; DAGISEL-NEXT: v_cmp_lt_i32_e64 s0, 0x62, v0
+; DAGISEL-NEXT: s_mov_b32 s1, 0
+; DAGISEL-NEXT: ; return to shader part epilog
+;
+; GISEL-LABEL: compare_int_with_constant:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: v_cmp_le_i32_e64 s0, 0x63, v0
+; GISEL-NEXT: s_mov_b32 s1, 0
+; GISEL-NEXT: ; return to shader part epilog
+ %cmp = icmp sge i32 %x, 99
+ %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
+ ret i64 %ballot
+}
+
+define amdgpu_cs i64 @compare_floats(float %x, float %y) {
+; CHECK-LABEL: compare_floats:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: v_cmp_gt_f32_e64 s0, v0, v1
+; CHECK-NEXT: s_mov_b32 s1, 0
+; CHECK-NEXT: ; return to shader part epilog
+ %cmp = fcmp ogt float %x, %y
+ %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
+ ret i64 %ballot
+}
+
+define amdgpu_cs i64 @ctpop_of_ballot(float %x, float %y) {
+; CHECK-LABEL: ctpop_of_ballot:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: v_cmp_gt_f32_e64 s0, v0, v1
+; CHECK-NEXT: s_mov_b32 s1, 0
+; CHECK-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
+; CHECK-NEXT: ; return to shader part epilog
+ %cmp = fcmp ogt float %x, %y
+ %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
+ %bcnt = call i64 @llvm.ctpop.i64(i64 %ballot)
+ ret i64 %bcnt
+}
More information about the llvm-commits
mailing list