[llvm] 61df26c - AMDGPU: Add codegen test for ctpop(ballot(x))

Fri Apr 8 13:00:32 PDT 2022

Author: Nicolai Hähnle
Date: 2022-04-08T15:00:05-05:00
New Revision: 61df26c86cfec01317fba4f2509bc7469fff13d9

URL: https://github.com/llvm/llvm-project/commit/61df26c86cfec01317fba4f2509bc7469fff13d9
DIFF: https://github.com/llvm/llvm-project/commit/61df26c86cfec01317fba4f2509bc7469fff13d9.diff

LOG: AMDGPU: Add codegen test for ctpop(ballot(x))

Highlights a gap in DAG-based ISel where we unnecessarily choose vector
instructions. GlobalISel already looks good.

Added: 
    

Modified: 
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll

Removed: 
    


################################################################################
diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
index ce8c4da2f8d8e..4a1dfd2c07bd0 100644

--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
@@ -2,6 +2,7 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -global-isel -verify-machineinstrs < %s | FileCheck %s
 
 declare i32 @llvm.amdgcn.ballot.i32(i1)
+declare i32 @llvm.ctpop.i32(i32)
 
 ; Test ballot(0)
 
@@ -69,3 +70,15 @@ define amdgpu_cs i32 @compare_floats(float %x, float %y) {
   %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %cmp)
   ret i32 %ballot
 }
+
+define amdgpu_cs i32 @ctpop_of_ballot(float %x, float %y) { ; codegen test: ctpop of a 32-bit ballot of (x > y)
+; CHECK-LABEL: ctpop_of_ballot:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v0, v1
+; CHECK-NEXT:    s_bcnt1_i32_b32 s0, vcc_lo
+; CHECK-NEXT:    ; return to shader part epilog
+  %cmp = fcmp ogt float %x, %y ; ordered greater-than; expected output compares with v_cmp_gt_f32 into vcc_lo
+  %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %cmp)
+  %bcnt = call i32 @llvm.ctpop.i32(i32 %ballot) ; expected output is a single scalar s_bcnt1_i32_b32 reading vcc_lo
+  ret i32 %bcnt
+}

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll
index 5f5af2954ff56..9f83012f54575 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll
@@ -2,6 +2,7 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx900 -global-isel -verify-machineinstrs < %s | FileCheck %s
 
 declare i64 @llvm.amdgcn.ballot.i64(i1)
+declare i64 @llvm.ctpop.i64(i64)
 
 ; Test ballot(0)
 
@@ -72,3 +73,16 @@ define amdgpu_cs i64 @compare_floats(float %x, float %y) {
   %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
   ret i64 %ballot
 }
+
+define amdgpu_cs i64 @ctpop_of_ballot(float %x, float %y) { ; codegen test: ctpop of a 64-bit ballot of (x > y)
+; CHECK-LABEL: ctpop_of_ballot:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v1
+; CHECK-NEXT:    s_bcnt1_i32_b64 s0, vcc
+; CHECK-NEXT:    s_mov_b32 s1, 0
+; CHECK-NEXT:    ; return to shader part epilog
+  %cmp = fcmp ogt float %x, %y ; ordered greater-than; expected output compares with v_cmp_gt_f32 into vcc
+  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
+  %bcnt = call i64 @llvm.ctpop.i64(i64 %ballot) ; expected output is a scalar s_bcnt1_i32_b64 on vcc
+  ret i64 %bcnt ; expected output has the count in s0 and writes 0 to s1
+}

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll
index 22865433a01ed..6f0454ab2184f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll
@@ -2,6 +2,7 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck %s
 
 declare i32 @llvm.amdgcn.ballot.i32(i1)
+declare i32 @llvm.ctpop.i32(i32)
 
 ; Test ballot(0)
 
@@ -69,3 +70,22 @@ define amdgpu_cs i32 @compare_floats(float %x, float %y) {
   %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %cmp)
   ret i32 %ballot
 }
+
+define amdgpu_cs float @ctpop_of_ballot(i32 %x, i32 %y) {
+; CHECK-LABEL: ctpop_of_ballot:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    v_cmp_gt_u32_e32 vcc_lo, v0, v1
+
+; TODO: This should use a scalar s_bcnt1 instruction.
+; NOTE: The final mul is cruft to prevent a "bad VGPR to SGPR copy" error.
+
+; CHECK-NEXT:    v_bcnt_u32_b32 v1, vcc_lo, 0
+; CHECK-NEXT:    v_mul_lo_u32 v0, v0, v1
+; CHECK-NEXT:    ; return to shader part epilog
+  %cmp = icmp ugt i32 %x, %y
+  %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %cmp)
+  %bcnt = call i32 @llvm.ctpop.i32(i32 %ballot)
+  %r.i = mul i32 %x, %bcnt ; cruft mul, see NOTE above
+  %r = bitcast i32 %r.i to float
+  ret float %r
+}

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll
index 69066011a56c4..f57a601cf12c1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll
@@ -2,6 +2,7 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx900 < %s | FileCheck %s
 
 declare i64 @llvm.amdgcn.ballot.i64(i1)
+declare i64 @llvm.ctpop.i64(i64)
 
 ; Test ballot(0)
 
@@ -72,3 +73,24 @@ define amdgpu_cs i64 @compare_floats(float %x, float %y) {
   %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
   ret i64 %ballot
 }
+
+define amdgpu_cs float @ctpop_of_ballot(i32 %x, i32 %y) {
+; CHECK-LABEL: ctpop_of_ballot:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    v_cmp_gt_u32_e32 vcc, v0, v1
+
+; TODO: This should use a scalar s_bcnt1 instruction.
+; NOTE: The final mul is cruft to prevent a "bad VGPR to SGPR copy" error.
+
+; CHECK-NEXT:    v_bcnt_u32_b32 v1, vcc_lo, 0
+; CHECK-NEXT:    v_bcnt_u32_b32 v1, vcc_hi, v1
+; CHECK-NEXT:    v_mul_lo_u32 v0, v0, v1
+; CHECK-NEXT:    ; return to shader part epilog
+  %cmp = icmp ugt i32 %x, %y
+  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
+  %bcnt = call i64 @llvm.ctpop.i64(i64 %ballot)
+  %bcnt.32 = trunc i64 %bcnt to i32
+  %r.i = mul i32 %x, %bcnt.32 ; cruft mul, see NOTE above
+  %r = bitcast i32 %r.i to float
+  ret float %r
+}