[llvm] 61df26c - AMDGPU: Add codegen test for ctpop(ballot(x))
Nicolai Hähnle via llvm-commits
llvm-commits at lists.llvm.org
Fri Apr 8 13:00:32 PDT 2022
Author: Nicolai Hähnle
Date: 2022-04-08T15:00:05-05:00
New Revision: 61df26c86cfec01317fba4f2509bc7469fff13d9
URL: https://github.com/llvm/llvm-project/commit/61df26c86cfec01317fba4f2509bc7469fff13d9
DIFF: https://github.com/llvm/llvm-project/commit/61df26c86cfec01317fba4f2509bc7469fff13d9.diff
LOG: AMDGPU: Add codegen test for ctpop(ballot(x))
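Highlights a gap in DAG-based ISel, which unnecessarily selects vector
instructions (v_bcnt_u32_b32) for the population count of a ballot instead of
a scalar s_bcnt1. GlobalISel already selects the scalar instruction.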
Added:
Modified:
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll
Removed:
################################################################################
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
index ce8c4da2f8d8e..4a1dfd2c07bd0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
@@ -2,6 +2,7 @@
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -global-isel -verify-machineinstrs < %s | FileCheck %s
declare i32 @llvm.amdgcn.ballot.i32(i1)
+declare i32 @llvm.ctpop.i32(i32)
; Test ballot(0)
@@ -69,3 +70,15 @@ define amdgpu_cs i32 @compare_floats(float %x, float %y) {
%ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %cmp)
ret i32 %ballot
}
+
+; Codegen test for ctpop(ballot(x)) with a 32-bit ballot: takes the ballot of
+; a float "ogt" compare and returns its population count. The expected
+; assembly below applies the scalar s_bcnt1_i32_b32 directly to the compare
+; result in vcc_lo.
+define amdgpu_cs i32 @ctpop_of_ballot(float %x, float %y) {
+; CHECK-LABEL: ctpop_of_ballot:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: v_cmp_gt_f32_e32 vcc_lo, v0, v1
+; CHECK-NEXT: s_bcnt1_i32_b32 s0, vcc_lo
+; CHECK-NEXT: ; return to shader part epilog
+ %cmp = fcmp ogt float %x, %y
+ %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %cmp)
+ %bcnt = call i32 @llvm.ctpop.i32(i32 %ballot)
+ ret i32 %bcnt
+}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll
index 5f5af2954ff56..9f83012f54575 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll
@@ -2,6 +2,7 @@
; RUN: llc -march=amdgcn -mcpu=gfx900 -global-isel -verify-machineinstrs < %s | FileCheck %s
declare i64 @llvm.amdgcn.ballot.i64(i1)
+declare i64 @llvm.ctpop.i64(i64)
; Test ballot(0)
@@ -72,3 +73,16 @@ define amdgpu_cs i64 @compare_floats(float %x, float %y) {
%ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
ret i64 %ballot
}
+
+; Codegen test for ctpop(ballot(x)) with a 64-bit ballot: takes the ballot of
+; a float "ogt" compare and returns its population count as an i64. The
+; expected assembly below applies the scalar s_bcnt1_i32_b64 to the full vcc
+; pair and writes 0 to s1 (presumably the high half of the i64 return value).
+define amdgpu_cs i64 @ctpop_of_ballot(float %x, float %y) {
+; CHECK-LABEL: ctpop_of_ballot:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: v_cmp_gt_f32_e32 vcc, v0, v1
+; CHECK-NEXT: s_bcnt1_i32_b64 s0, vcc
+; CHECK-NEXT: s_mov_b32 s1, 0
+; CHECK-NEXT: ; return to shader part epilog
+ %cmp = fcmp ogt float %x, %y
+ %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
+ %bcnt = call i64 @llvm.ctpop.i64(i64 %ballot)
+ ret i64 %bcnt
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll
index 22865433a01ed..6f0454ab2184f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll
@@ -2,6 +2,7 @@
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck %s
declare i32 @llvm.amdgcn.ballot.i32(i1)
+declare i32 @llvm.ctpop.i32(i32)
; Test ballot(0)
@@ -69,3 +70,22 @@ define amdgpu_cs i32 @compare_floats(float %x, float %y) {
%ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %cmp)
ret i32 %ballot
}
+
+; Codegen test for ctpop(ballot(x)) with a 32-bit ballot: takes the ballot of
+; an unsigned "ugt" integer compare and computes its population count. The
+; count is then multiplied by %x and returned bitcast to float (see the NOTE
+; inside the function for why).
+define amdgpu_cs float @ctpop_of_ballot(i32 %x, i32 %y) {
+; CHECK-LABEL: ctpop_of_ballot:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, v0, v1
+
+; TODO: The population count should be selected as a scalar s_bcnt1
+; instruction; the expected output below currently has the vector
+; v_bcnt_u32_b32 instead.
+; NOTE: The final mul is not part of what is being tested; it is cruft that
+; is only there to prevent a "bad VGPR to SGPR copy" error.
+
+; CHECK-NEXT: v_bcnt_u32_b32 v1, vcc_lo, 0
+; CHECK-NEXT: v_mul_lo_u32 v0, v0, v1
+; CHECK-NEXT: ; return to shader part epilog
+ %cmp = icmp ugt i32 %x, %y
+ %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %cmp)
+ %bcnt = call i32 @llvm.ctpop.i32(i32 %ballot)
+ %r.i = mul i32 %x, %bcnt
+ %r = bitcast i32 %r.i to float
+ ret float %r
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll
index 69066011a56c4..f57a601cf12c1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll
@@ -2,6 +2,7 @@
; RUN: llc -march=amdgcn -mcpu=gfx900 < %s | FileCheck %s
declare i64 @llvm.amdgcn.ballot.i64(i1)
+declare i64 @llvm.ctpop.i64(i64)
; Test ballot(0)
@@ -72,3 +73,24 @@ define amdgpu_cs i64 @compare_floats(float %x, float %y) {
%ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
ret i64 %ballot
}
+
+; Codegen test for ctpop(ballot(x)) with a 64-bit ballot: takes the ballot of
+; an unsigned "ugt" integer compare and computes its 64-bit population count.
+; The count is truncated to i32, multiplied by %x and returned bitcast to
+; float (see the NOTE inside the function for why).
+define amdgpu_cs float @ctpop_of_ballot(i32 %x, i32 %y) {
+; CHECK-LABEL: ctpop_of_ballot:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1
+
+; TODO: The population count should be selected as a scalar s_bcnt1
+; instruction; the expected output below currently has two vector
+; v_bcnt_u32_b32 instructions (one on vcc_lo, one on vcc_hi that also takes
+; the first one's result in v1 as an operand).
+; NOTE: The final mul is not part of what is being tested; it is cruft that
+; is only there to prevent a "bad VGPR to SGPR copy" error.
+
+; CHECK-NEXT: v_bcnt_u32_b32 v1, vcc_lo, 0
+; CHECK-NEXT: v_bcnt_u32_b32 v1, vcc_hi, v1
+; CHECK-NEXT: v_mul_lo_u32 v0, v0, v1
+; CHECK-NEXT: ; return to shader part epilog
+ %cmp = icmp ugt i32 %x, %y
+ %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
+ %bcnt = call i64 @llvm.ctpop.i64(i64 %ballot)
+ %bcnt.32 = trunc i64 %bcnt to i32
+ %r.i = mul i32 %x, %bcnt.32
+ %r = bitcast i32 %r.i to float
+ ret float %r
+}
More information about the llvm-commits
mailing list