[PATCH] D123432: AMDGPU/SDAG: Custom SETCC (i.e. ballot) is always uniform

Fri Apr 8 18:50:36 PDT 2022

nhaehnle created this revision.
nhaehnle added reviewers: arsenm, foad, critson, ruiling.
Herald added subscribers: hsmhsm, kerbowa, hiraditya, t-tye, tpr, dstuttard, yaxunl, jvesely, kzhuravl.
Herald added a project: All.
nhaehnle requested review of this revision.
Herald added a subscriber: wdng.
Herald added a project: LLVM.

The AMDGPUISD::SETCC node is like ISD::SETCC, but returns a lane mask
instead of a per-lane boolean. The lane mask is the same value in every
lane, so the result of the node is always uniform.

This improves instruction selection for code patterns like
ctpop(ballot(x)), which can now use a scalar S_BCNT1_* instruction
instead of the vector V_BCNT_* instructions.


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D123432

Files:
  llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll
  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll


Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll
===================================================================

--- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll
+++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll
@@ -74,23 +74,15 @@
   ret i64 %ballot
 }
 
-define amdgpu_cs float @ctpop_of_ballot(i32 %x, i32 %y) {
+define amdgpu_cs i64 @ctpop_of_ballot(float %x, float %y) {
 ; CHECK-LABEL: ctpop_of_ballot:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    v_cmp_gt_u32_e32 vcc, v0, v1
-
-; TODO: This should use a scalar s_bcnt1 instruction.
-; NOTE: The final mul is cruft to prevent a "bad VGPR to SGPR copy" error
-
-; CHECK-NEXT:    v_bcnt_u32_b32 v1, vcc_lo, 0
-; CHECK-NEXT:    v_bcnt_u32_b32 v1, vcc_hi, v1 
-; CHECK-NEXT:    v_mul_lo_u32 v0, v0, v1
+; CHECK-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v1
+; CHECK-NEXT:    s_bcnt1_i32_b64 s0, vcc
+; CHECK-NEXT:    s_mov_b32 s1, 0
 ; CHECK-NEXT:    ; return to shader part epilog
-  %cmp = icmp ugt i32 %x, %y
+  %cmp = fcmp ogt float %x, %y
   %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
   %bcnt = call i64 @llvm.ctpop.i64(i64 %ballot)
-  %bcnt.32 = trunc i64 %bcnt to i32
-  %r.i = mul i32 %x, %bcnt.32
-  %r = bitcast i32 %r.i to float
-  ret float %r
+  ret i64 %bcnt
 }
Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll
+++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll
@@ -71,21 +71,14 @@
   ret i32 %ballot
 }
 
-define amdgpu_cs float @ctpop_of_ballot(i32 %x, i32 %y) {
+define amdgpu_cs i32 @ctpop_of_ballot(float %x, float %y) {
 ; CHECK-LABEL: ctpop_of_ballot:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    v_cmp_gt_u32_e32 vcc_lo, v0, v1
-
-; TODO: This should use a scalar s_bcnt1 instruction.
-; NOTE: The final mul is cruft to prevent a "bad VGPR to SGPR copy" error
-
-; CHECK-NEXT:    v_bcnt_u32_b32 v1, vcc_lo, 0
-; CHECK-NEXT:    v_mul_lo_u32 v0, v0, v1
+; CHECK-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v0, v1
+; CHECK-NEXT:    s_bcnt1_i32_b32 s0, vcc_lo
 ; CHECK-NEXT:    ; return to shader part epilog
-  %cmp = icmp ugt i32 %x, %y
+  %cmp = fcmp ogt float %x, %y
   %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %cmp)
   %bcnt = call i32 @llvm.ctpop.i32(i32 %ballot)
-  %r.i = mul i32 %x, %bcnt
-  %r = bitcast i32 %r.i to float
-  ret float %r
+  ret i32 %bcnt
 }
Index: llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -846,6 +846,8 @@
         AMDGPUAS::CONSTANT_ADDRESS_32BIT)
       return true;
     return false;
+  case AMDGPUISD::SETCC: // ballot-style instruction
+    return true;
   }
   return false;
 }


-------------- next part --------------
A non-text attachment was scrubbed...
Name: D123432.421668.patch
Type: text/x-patch
Size: 2812 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20220409/beb2de22/attachment.bin>