[llvm] 4df4922 - AMDGPU/SDAG: Custom SETCC (i.e. ballot) is always uniform

Mon Apr 11 12:04:50 PDT 2022

Author: Nicolai Hähnle
Date: 2022-04-11T14:04:21-05:00
New Revision: 4df4922da6cd9d4f58aa17bd3a54435d947570fd

URL: https://github.com/llvm/llvm-project/commit/4df4922da6cd9d4f58aa17bd3a54435d947570fd
DIFF: https://github.com/llvm/llvm-project/commit/4df4922da6cd9d4f58aa17bd3a54435d947570fd.diff

LOG: AMDGPU/SDAG: Custom SETCC (i.e. ballot) is always uniform

The AMDGPUISD::SETCC node is like ISD::SETCC, but returns a lane mask
instead of a per-lane boolean. The lane mask is uniform.

This improves instruction selection for code patterns like
ctpop(ballot(x)), which can now use an S_BCNT1_* instruction instead
of V_BCNT_*.

GlobalISel already selects scalar instructions (an earlier commit
added a test case).

Differential Revision: https://reviews.llvm.org/D123432

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 9f21fd942e288..96240661cbed1 100644

--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -846,6 +846,8 @@ bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode *N) const {
         AMDGPUAS::CONSTANT_ADDRESS_32BIT)
       return true;
     return false;
+  case AMDGPUISD::SETCC: // ballot-style instruction
+    return true;
   }
   return false;
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll
index 6f0454ab2184f..e63b18216dab2 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll
@@ -71,21 +71,14 @@ define amdgpu_cs i32 @compare_floats(float %x, float %y) {
   ret i32 %ballot
 }
 
-define amdgpu_cs float @ctpop_of_ballot(i32 %x, i32 %y) {
+define amdgpu_cs i32 @ctpop_of_ballot(float %x, float %y) {
 ; CHECK-LABEL: ctpop_of_ballot:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    v_cmp_gt_u32_e32 vcc_lo, v0, v1
-
-; TODO: This should use a scalar s_bcnt1 instruction.
-; NOTE: The final mul is cruft to prevent a "bad VGPR to SGPR copy" error
-
-; CHECK-NEXT:    v_bcnt_u32_b32 v1, vcc_lo, 0
-; CHECK-NEXT:    v_mul_lo_u32 v0, v0, v1
+; CHECK-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v0, v1
+; CHECK-NEXT:    s_bcnt1_i32_b32 s0, vcc_lo
 ; CHECK-NEXT:    ; return to shader part epilog
-  %cmp = icmp ugt i32 %x, %y
+  %cmp = fcmp ogt float %x, %y
   %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %cmp)
   %bcnt = call i32 @llvm.ctpop.i32(i32 %ballot)
-  %r.i = mul i32 %x, %bcnt
-  %r = bitcast i32 %r.i to float
-  ret float %r
+  ret i32 %bcnt
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll
index f57a601cf12c1..473159ec20e55 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll
@@ -74,23 +74,15 @@ define amdgpu_cs i64 @compare_floats(float %x, float %y) {
   ret i64 %ballot
 }
 
-define amdgpu_cs float @ctpop_of_ballot(i32 %x, i32 %y) {
+define amdgpu_cs i64 @ctpop_of_ballot(float %x, float %y) {
 ; CHECK-LABEL: ctpop_of_ballot:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    v_cmp_gt_u32_e32 vcc, v0, v1
-
-; TODO: This should use a scalar s_bcnt1 instruction.
-; NOTE: The final mul is cruft to prevent a "bad VGPR to SGPR copy" error
-
-; CHECK-NEXT:    v_bcnt_u32_b32 v1, vcc_lo, 0
-; CHECK-NEXT:    v_bcnt_u32_b32 v1, vcc_hi, v1 
-; CHECK-NEXT:    v_mul_lo_u32 v0, v0, v1
+; CHECK-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v1
+; CHECK-NEXT:    s_bcnt1_i32_b64 s0, vcc
+; CHECK-NEXT:    s_mov_b32 s1, 0
 ; CHECK-NEXT:    ; return to shader part epilog
-  %cmp = icmp ugt i32 %x, %y
+  %cmp = fcmp ogt float %x, %y
   %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
   %bcnt = call i64 @llvm.ctpop.i64(i64 %ballot)
-  %bcnt.32 = trunc i64 %bcnt to i32
-  %r.i = mul i32 %x, %bcnt.32
-  %r = bitcast i32 %r.i to float
-  ret float %r
+  ret i64 %bcnt
 }