[llvm] AMDGPU/GlobalISel: Fix inst-selection of ballot (PR #109986)

Thu Oct 10 03:55:17 PDT 2024

https://github.com/petar-avramovic updated https://github.com/llvm/llvm-project/pull/109986

>From 0c0e21bf407bb4616e7283befec8ac0aec361ee3 Mon Sep 17 00:00:00 2001
From: Petar Avramovic <Petar.Avramovic at amd.com>
Date: Thu, 10 Oct 2024 12:54:52 +0200
Subject: [PATCH] AMDGPU/GlobalISel: Fix inst-selection of ballot

Both input and output of ballot are lane-masks:
result is lane-mask with 'S32/S64 LLT and SGPR bank'
input is lane-mask with 'S1 LLT and VCC reg bank'.
Ballot copies bits from input lane-mask for
all active lanes and puts 0 for inactive lanes.
GlobalISel did not set 0 in result for inactive lanes
for non-constant input.
---
 llvm/docs/AMDGPUUsage.rst                     |  6 ++
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td      |  2 +
 .../AMDGPU/AMDGPUInstructionSelector.cpp      | 97 ++++++++++++++-----
 .../GlobalISel/llvm.amdgcn.ballot.i32.ll      | 92 +++++++++++++++++-
 .../GlobalISel/llvm.amdgcn.ballot.i64.ll      | 60 +++++++++++-
 .../CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll  | 77 ++++++++++++++-
 .../CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll  | 47 +++++++++
 .../AMDGPU/llvm.amdgcn.ballot.i64.wave32.ll   | 20 ++--
 8 files changed, 360 insertions(+), 41 deletions(-)

diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 6ff3272422fe95..aba39762861dd8 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -1369,6 +1369,12 @@ The AMDGPU backend implements the following LLVM IR intrinsics.
                                                    sign-extended from the width of the underlying PC hardware register even on
                                                    processors where the s_getpc_b64 instruction returns a zero-extended value.
 
+  llvm.amdgcn.ballot                               Returns a bitfield(i32 or i64) containing the result of its i1 argument
+                                                   in all active lanes, and zero in all inactive lanes.
+                                                   Provides a way to convert i1 in LLVM IR to i32 or i64 lane mask - bitfield
+                                                   used by hardware to control active lanes when used in EXEC register.
+                                                   For example, ballot(i1 true) return EXEC mask.
+
   ==============================================   ==========================================================
 
 .. TODO::
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 2738eb77b675ab..715f2cc917e21c 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2086,6 +2086,8 @@ def int_amdgcn_fcmp :
             [IntrNoMem, IntrConvergent,
              ImmArg<ArgIndex<2>>, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
 
+// Returns a bitfield(i32 or i64) containing the result of its i1 argument
+// in all active lanes, and zero in all inactive lanes.
 def int_amdgcn_ballot :
   Intrinsic<[llvm_anyint_ty], [llvm_i1_ty],
             [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 5be0a049cc5827..f4ac268fbfeffa 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1413,50 +1413,97 @@ bool AMDGPUInstructionSelector::selectIntrinsicCmp(MachineInstr &I) const {
   return true;
 }
 
+// Ballot has to zero bits in input lane-mask that are zero in current exec,
+// Done as AND with exec. For inputs that are results of instruction that
+// implicitly use same exec, for example compares in same basic block, use copy.
+bool isBallotCopy(Register Reg, MachineRegisterInfo &MRI,
+                  MachineBasicBlock *MBB) {
+  MachineInstr *MI = MRI.getVRegDef(Reg);
+  // Look through copies, truncs and anyext. TODO: just copies
+  while (MI->getOpcode() == AMDGPU::COPY ||
+         MI->getOpcode() == AMDGPU::G_TRUNC ||
+         MI->getOpcode() == AMDGPU::G_ANYEXT) {
+    Reg = MI->getOperand(1).getReg();
+    if (!Reg.isVirtual())
+      return false;
+    MI = MRI.getVRegDef(Reg);
+  }
+
+  // Lane mask generated using compare with same exec.
+  if (isa<GAnyCmp>(MI) && MI->getParent() == MBB)
+    return true;
+
+  Register LHS, RHS;
+  // Look through AND.
+  if (mi_match(Reg, MRI, m_GAnd(m_Reg(LHS), m_Reg(RHS))))
+    return isBallotCopy(LHS, MRI, MBB) || isBallotCopy(RHS, MRI, MBB);
+
+  return false;
+}
+
 bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
   MachineBasicBlock *BB = I.getParent();
   const DebugLoc &DL = I.getDebugLoc();
   Register DstReg = I.getOperand(0).getReg();
-  const unsigned Size = MRI->getType(DstReg).getSizeInBits();
-  const bool Is64 = Size == 64;
-  const bool IsWave32 = (STI.getWavefrontSize() == 32);
+  Register SrcReg = I.getOperand(2).getReg();
+  const unsigned BallotSize = MRI->getType(DstReg).getSizeInBits();
+  const unsigned WaveSize = STI.getWavefrontSize();
 
   // In the common case, the return type matches the wave size.
   // However we also support emitting i64 ballots in wave32 mode.
-  if (Size != STI.getWavefrontSize() && (!Is64 || !IsWave32))
+  if (BallotSize != WaveSize && (BallotSize != 64 || WaveSize != 32))
     return false;
 
   std::optional<ValueAndVReg> Arg =
-      getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI);
+      getIConstantVRegValWithLookThrough(SrcReg, *MRI);
+
+  Register Dst = DstReg;
+  // i64 ballot on Wave32: new Dst(i32) for WaveSize ballot.
+  if (BallotSize != WaveSize) {
+    Dst = MRI->createVirtualRegister(TRI.getBoolRC());
+  }
 
-  const auto BuildCopy = [&](Register SrcReg) {
-    if (Size == STI.getWavefrontSize()) {
-      BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
-          .addReg(SrcReg);
-      return;
+  if (Arg) {
+    const int64_t Value = Arg->Value.getZExtValue();
+    if (Value == 0) {
+      // Dst = S_MOV 0
+      unsigned Opcode = WaveSize == 64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
+      BuildMI(*BB, &I, DL, TII.get(Opcode), Dst).addImm(0);
+    } else {
+      // Dst = COPY EXEC
+      assert(Value == 1);
+      BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst).addReg(TRI.getExec());
     }
+    if (!RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI))
+      return false;
+  } else {
+    if (isBallotCopy(SrcReg, *MRI, BB)) {
+      // Dst = COPY SrcReg
+      BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst).addReg(SrcReg);
+      if (!RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI))
+        return false;
+    } else {
+      // Dst = S_AND SrcReg, EXEC
+      unsigned AndOpc = WaveSize == 64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
+      auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), Dst)
+                     .addReg(SrcReg)
+                     .addReg(TRI.getExec())
+                     .setOperandDead(3); // Dead scc
+      if (!constrainSelectedInstRegOperands(*And, TII, TRI, RBI))
+        return false;
+    }
+  }
 
-    // If emitting a i64 ballot in wave32, fill the upper bits with zeroes.
+  // i64 ballot on Wave32: zero-extend i32 ballot to i64.
+  if (BallotSize != WaveSize) {
     Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
     BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg).addImm(0);
     BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
-        .addReg(SrcReg)
+        .addReg(Dst)
         .addImm(AMDGPU::sub0)
         .addReg(HiReg)
         .addImm(AMDGPU::sub1);
-  };
-
-  if (Arg) {
-    const int64_t Value = Arg->Value.getSExtValue();
-    if (Value == 0) {
-      unsigned Opcode = Is64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
-      BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
-    } else if (Value == -1) // all ones
-      BuildCopy(IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC);
-    else
-      return false;
-  } else
-    BuildCopy(I.getOperand(2).getReg());
+  }
 
   I.eraseFromParent();
   return true;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
index 96cab200b61cdb..46a6cde97970d4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -global-isel -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -global-isel -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GFX11 %s
 
 declare i32 @llvm.amdgcn.ballot.i32(i1)
 declare i32 @llvm.ctpop.i32(i32)
@@ -33,7 +33,8 @@ define amdgpu_cs i32 @non_compare(i32 %x) {
 ; CHECK-LABEL: non_compare:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    v_and_b32_e32 v0, 1, v0
-; CHECK-NEXT:    v_cmp_ne_u32_e64 s0, 0, v0
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; CHECK-NEXT:    s_and_b32 s0, vcc_lo, exec_lo
 ; CHECK-NEXT:    ; return to shader part epilog
   %trunc = trunc i32 %x to i1
   %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %trunc)
@@ -89,7 +90,8 @@ define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_non_compare(i32 %v) {
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    v_and_b32_e32 v0, 1, v0
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; CHECK-NEXT:    s_cmp_eq_u32 vcc_lo, 0
+; CHECK-NEXT:    s_and_b32 s0, vcc_lo, exec_lo
+; CHECK-NEXT:    s_cmp_eq_u32 s0, 0
 ; CHECK-NEXT:    s_cbranch_scc1 .LBB7_2
 ; CHECK-NEXT:  ; %bb.1: ; %true
 ; CHECK-NEXT:    s_mov_b32 s0, 42
@@ -113,6 +115,7 @@ define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_non_compare(i32 inreg %v) {
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_and_b32 s0, 1, s0
 ; CHECK-NEXT:    v_cmp_ne_u32_e64 s0, 0, s0
+; CHECK-NEXT:    s_and_b32 s0, s0, exec_lo
 ; CHECK-NEXT:    s_cmp_eq_u32 s0, 0
 ; CHECK-NEXT:    s_cbranch_scc1 .LBB8_2
 ; CHECK-NEXT:  ; %bb.1: ; %true
@@ -137,7 +140,8 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) {
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    v_and_b32_e32 v0, 1, v0
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; CHECK-NEXT:    s_cmp_lg_u32 vcc_lo, 0
+; CHECK-NEXT:    s_and_b32 s0, vcc_lo, exec_lo
+; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
 ; CHECK-NEXT:    s_cbranch_scc0 .LBB9_2
 ; CHECK-NEXT:  ; %bb.1: ; %false
 ; CHECK-NEXT:    s_mov_b32 s0, 33
@@ -161,6 +165,7 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_non_compare(i32 inreg %v) {
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_and_b32 s0, 1, s0
 ; CHECK-NEXT:    v_cmp_ne_u32_e64 s0, 0, s0
+; CHECK-NEXT:    s_and_b32 s0, s0, exec_lo
 ; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
 ; CHECK-NEXT:    s_cbranch_scc0 .LBB10_2
 ; CHECK-NEXT:  ; %bb.1: ; %false
@@ -419,3 +424,80 @@ true:
 false:
   ret i32 33
 }
+
+; Input that is not constant or direct result of a compare.
+; Tests setting 0 to inactive lanes.
+define amdgpu_ps void @non_cst_non_compare_input(ptr addrspace(1) %out, i32 %tid, i32 %cond) {
+; GFX10-LABEL: non_cst_non_compare_input:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_and_b32 s0, 1, s0
+; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v3
+; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, s0
+; GFX10-NEXT:    s_and_saveexec_b32 s1, vcc_lo
+; GFX10-NEXT:    s_xor_b32 s1, exec_lo, s1
+; GFX10-NEXT:  ; %bb.1: ; %B
+; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 2, v2
+; GFX10-NEXT:    s_andn2_b32 s0, s0, exec_lo
+; GFX10-NEXT:    ; implicit-def: $vgpr2
+; GFX10-NEXT:    s_and_b32 s2, exec_lo, vcc_lo
+; GFX10-NEXT:    s_or_b32 s0, s0, s2
+; GFX10-NEXT:  ; %bb.2: ; %Flow
+; GFX10-NEXT:    s_andn2_saveexec_b32 s1, s1
+; GFX10-NEXT:  ; %bb.3: ; %A
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 1, v2
+; GFX10-NEXT:    s_andn2_b32 s0, s0, exec_lo
+; GFX10-NEXT:    s_and_b32 s2, exec_lo, vcc_lo
+; GFX10-NEXT:    s_or_b32 s0, s0, s2
+; GFX10-NEXT:  ; %bb.4: ; %exit
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX10-NEXT:    s_and_b32 s0, s0, exec_lo
+; GFX10-NEXT:    v_mov_b32_e32 v2, s0
+; GFX10-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: non_cst_non_compare_input:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_and_b32 s0, 1, s0
+; GFX11-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-NEXT:    v_cmp_ne_u32_e64 s0, 0, s0
+; GFX11-NEXT:    v_cmpx_ne_u32_e32 0, v3
+; GFX11-NEXT:    s_xor_b32 s1, exec_lo, s1
+; GFX11-NEXT:  ; %bb.1: ; %B
+; GFX11-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 2, v2
+; GFX11-NEXT:    s_and_not1_b32 s0, s0, exec_lo
+; GFX11-NEXT:    ; implicit-def: $vgpr2
+; GFX11-NEXT:    s_and_b32 s2, exec_lo, vcc_lo
+; GFX11-NEXT:    s_or_b32 s0, s0, s2
+; GFX11-NEXT:  ; %bb.2: ; %Flow
+; GFX11-NEXT:    s_and_not1_saveexec_b32 s1, s1
+; GFX11-NEXT:  ; %bb.3: ; %A
+; GFX11-NEXT:    v_cmp_le_u32_e32 vcc_lo, 1, v2
+; GFX11-NEXT:    s_and_not1_b32 s0, s0, exec_lo
+; GFX11-NEXT:    s_and_b32 s2, exec_lo, vcc_lo
+; GFX11-NEXT:    s_or_b32 s0, s0, s2
+; GFX11-NEXT:  ; %bb.4: ; %exit
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT:    s_and_b32 s0, s0, exec_lo
+; GFX11-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX11-NEXT:    s_nop 0
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
+entry:
+  %cmp = icmp eq i32 %cond, 0
+  br i1 %cmp, label %A, label %B
+
+A:
+  %val_A = icmp uge i32 %tid, 1
+  br label %exit
+
+B:
+  %val_B = icmp ult i32 %tid, 2
+  br label %exit
+
+exit:
+  %phi = phi i1 [ %val_A, %A ], [ %val_B, %B ]
+  %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %phi)
+  store i32 %ballot, ptr addrspace(1) %out
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll
index a18f843440445c..092a960ed7c561 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll
@@ -34,7 +34,8 @@ define amdgpu_cs i64 @non_compare(i32 %x) {
 ; CHECK-LABEL: non_compare:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    v_and_b32_e32 v0, 1, v0
-; CHECK-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v0
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; CHECK-NEXT:    s_and_b64 s[0:1], vcc, exec
 ; CHECK-NEXT:    ; return to shader part epilog
   %trunc = trunc i32 %x to i1
   %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %trunc)
@@ -92,7 +93,8 @@ define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_non_compare(i32 %v) {
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    v_and_b32_e32 v0, 1, v0
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; CHECK-NEXT:    s_cmp_eq_u64 vcc, 0
+; CHECK-NEXT:    s_and_b64 s[0:1], vcc, exec
+; CHECK-NEXT:    s_cmp_eq_u64 s[0:1], 0
 ; CHECK-NEXT:    s_cbranch_scc1 .LBB7_2
 ; CHECK-NEXT:  ; %bb.1: ; %true
 ; CHECK-NEXT:    s_mov_b32 s0, 42
@@ -116,6 +118,7 @@ define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_non_compare(i32 inreg %v) {
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_and_b32 s0, 1, s0
 ; CHECK-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, s0
+; CHECK-NEXT:    s_and_b64 s[0:1], s[0:1], exec
 ; CHECK-NEXT:    s_cmp_eq_u64 s[0:1], 0
 ; CHECK-NEXT:    s_cbranch_scc1 .LBB8_2
 ; CHECK-NEXT:  ; %bb.1: ; %true
@@ -140,7 +143,8 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) {
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    v_and_b32_e32 v0, 1, v0
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; CHECK-NEXT:    s_cmp_lg_u64 vcc, 0
+; CHECK-NEXT:    s_and_b64 s[0:1], vcc, exec
+; CHECK-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; CHECK-NEXT:    s_cbranch_scc0 .LBB9_2
 ; CHECK-NEXT:  ; %bb.1: ; %false
 ; CHECK-NEXT:    s_mov_b32 s0, 33
@@ -164,6 +168,7 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_non_compare(i32 inreg %v) {
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_and_b32 s0, 1, s0
 ; CHECK-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, s0
+; CHECK-NEXT:    s_and_b64 s[0:1], s[0:1], exec
 ; CHECK-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; CHECK-NEXT:    s_cbranch_scc0 .LBB10_2
 ; CHECK-NEXT:  ; %bb.1: ; %false
@@ -422,3 +427,52 @@ true:
 false:
   ret i32 33
 }
+
+; Input that is not constant or direct result of a compare.
+; Tests setting 0 to inactive lanes.
+define amdgpu_ps void @non_cst_non_compare_input(ptr addrspace(1) %out, i32 %tid, i32 %cond) {
+; CHECK-LABEL: non_cst_non_compare_input:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_and_b32 s0, 1, s0
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
+; CHECK-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, s0
+; CHECK-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; CHECK-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; CHECK-NEXT:  ; %bb.1: ; %B
+; CHECK-NEXT:    v_cmp_gt_u32_e32 vcc, 2, v2
+; CHECK-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
+; CHECK-NEXT:    s_and_b64 s[4:5], exec, vcc
+; CHECK-NEXT:    s_or_b64 s[0:1], s[0:1], s[4:5]
+; CHECK-NEXT:    ; implicit-def: $vgpr2
+; CHECK-NEXT:  ; %bb.2: ; %Flow
+; CHECK-NEXT:    s_andn2_saveexec_b64 s[2:3], s[2:3]
+; CHECK-NEXT:  ; %bb.3: ; %A
+; CHECK-NEXT:    v_cmp_le_u32_e32 vcc, 1, v2
+; CHECK-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
+; CHECK-NEXT:    s_and_b64 s[4:5], exec, vcc
+; CHECK-NEXT:    s_or_b64 s[0:1], s[0:1], s[4:5]
+; CHECK-NEXT:  ; %bb.4: ; %exit
+; CHECK-NEXT:    s_or_b64 exec, exec, s[2:3]
+; CHECK-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; CHECK-NEXT:    v_mov_b32_e32 v3, s1
+; CHECK-NEXT:    v_mov_b32_e32 v2, s0
+; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; CHECK-NEXT:    s_endpgm
+entry:
+  %cmp = icmp eq i32 %cond, 0
+  br i1 %cmp, label %A, label %B
+
+A:
+  %val_A = icmp uge i32 %tid, 1
+  br label %exit
+
+B:
+  %val_B = icmp ult i32 %tid, 2
+  br label %exit
+
+exit:
+  %phi = phi i1 [ %val_A, %A ], [ %val_B, %B ]
+  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %phi)
+  store i64 %ballot, ptr addrspace(1) %out
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll
index 047b35b8c0f9d8..026a8d7da7080b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 < %s | FileCheck %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize32 < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 < %s | FileCheck -check-prefixes=CHECK,GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize32 < %s | FileCheck -check-prefixes=CHECK,GFX11 %s
 
 declare i32 @llvm.amdgcn.ballot.i32(i1)
 declare i32 @llvm.ctpop.i32(i32)
@@ -522,3 +522,76 @@ true:
 false:
   ret i32 33
 }
+
+; Input that is not constant or direct result of a compare.
+; Tests setting 0 to inactive lanes.
+define amdgpu_ps void @non_cst_non_compare_input(ptr addrspace(1) %out, i32 %tid, i32 %cond) {
+; GFX10-LABEL: non_cst_non_compare_input:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v3
+; GFX10-NEXT:    ; implicit-def: $sgpr0
+; GFX10-NEXT:    s_and_saveexec_b32 s1, vcc_lo
+; GFX10-NEXT:    s_xor_b32 s1, exec_lo, s1
+; GFX10-NEXT:  ; %bb.1: ; %B
+; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 2, v2
+; GFX10-NEXT:    ; implicit-def: $vgpr2
+; GFX10-NEXT:    s_and_b32 s0, vcc_lo, exec_lo
+; GFX10-NEXT:  ; %bb.2: ; %Flow
+; GFX10-NEXT:    s_andn2_saveexec_b32 s1, s1
+; GFX10-NEXT:  ; %bb.3: ; %A
+; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX10-NEXT:    s_andn2_b32 s0, s0, exec_lo
+; GFX10-NEXT:    s_and_b32 s2, vcc_lo, exec_lo
+; GFX10-NEXT:    s_or_b32 s0, s0, s2
+; GFX10-NEXT:  ; %bb.4: ; %exit
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, v2
+; GFX10-NEXT:    v_mov_b32_e32 v2, s0
+; GFX10-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: non_cst_non_compare_input:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-NEXT:    ; implicit-def: $sgpr0
+; GFX11-NEXT:    v_cmpx_ne_u32_e32 0, v3
+; GFX11-NEXT:    s_xor_b32 s1, exec_lo, s1
+; GFX11-NEXT:  ; %bb.1: ; %B
+; GFX11-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 2, v2
+; GFX11-NEXT:    ; implicit-def: $vgpr2
+; GFX11-NEXT:    s_and_b32 s0, vcc_lo, exec_lo
+; GFX11-NEXT:  ; %bb.2: ; %Flow
+; GFX11-NEXT:    s_and_not1_saveexec_b32 s1, s1
+; GFX11-NEXT:  ; %bb.3: ; %A
+; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX11-NEXT:    s_and_not1_b32 s0, s0, exec_lo
+; GFX11-NEXT:    s_and_b32 s2, vcc_lo, exec_lo
+; GFX11-NEXT:    s_or_b32 s0, s0, s2
+; GFX11-NEXT:  ; %bb.4: ; %exit
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX11-NEXT:    v_cmp_ne_u32_e64 s0, 0, v2
+; GFX11-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX11-NEXT:    s_nop 0
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
+entry:
+  %cmp = icmp eq i32 %cond, 0
+  br i1 %cmp, label %A, label %B
+
+A:
+  %val_A = icmp uge i32 %tid, 1
+  br label %exit
+
+B:
+  %val_B = icmp ult i32 %tid, 2
+  br label %exit
+
+exit:
+  %phi = phi i1 [ %val_A, %A ], [ %val_B, %B ]
+  %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %phi)
+  store i32 %ballot, ptr addrspace(1) %out
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll
index 61f0f20f057043..c7597e98a6d583 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll
@@ -511,3 +511,50 @@ true:
 false:
   ret i32 33
 }
+
+; Input that is not constant or direct result of a compare.
+; Tests setting 0 to inactive lanes.
+define amdgpu_ps void @non_cst_non_compare_input(ptr addrspace(1) %out, i32 %tid, i32 %cond) {
+; CHECK-LABEL: non_cst_non_compare_input:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
+; CHECK-NEXT:    ; implicit-def: $sgpr0_sgpr1
+; CHECK-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; CHECK-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; CHECK-NEXT:  ; %bb.1: ; %B
+; CHECK-NEXT:    v_cmp_gt_u32_e32 vcc, 2, v2
+; CHECK-NEXT:    s_and_b64 s[0:1], vcc, exec
+; CHECK-NEXT:    ; implicit-def: $vgpr2
+; CHECK-NEXT:  ; %bb.2: ; %Flow
+; CHECK-NEXT:    s_andn2_saveexec_b64 s[2:3], s[2:3]
+; CHECK-NEXT:  ; %bb.3: ; %A
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
+; CHECK-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
+; CHECK-NEXT:    s_and_b64 s[4:5], vcc, exec
+; CHECK-NEXT:    s_or_b64 s[0:1], s[0:1], s[4:5]
+; CHECK-NEXT:  ; %bb.4: ; %exit
+; CHECK-NEXT:    s_or_b64 exec, exec, s[2:3]
+; CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; CHECK-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v2
+; CHECK-NEXT:    v_mov_b32_e32 v3, s1
+; CHECK-NEXT:    v_mov_b32_e32 v2, s0
+; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; CHECK-NEXT:    s_endpgm
+entry:
+  %cmp = icmp eq i32 %cond, 0
+  br i1 %cmp, label %A, label %B
+
+A:
+  %val_A = icmp uge i32 %tid, 1
+  br label %exit
+
+B:
+  %val_B = icmp ult i32 %tid, 2
+  br label %exit
+
+exit:
+  %phi = phi i1 [ %val_A, %A ], [ %val_B, %B ]
+  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %phi)
+  store i64 %ballot, ptr addrspace(1) %out
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.wave32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.wave32.ll
index 5dbfdf24ef36f7..fe69dc49062435 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.wave32.ll
@@ -40,12 +40,20 @@ define amdgpu_cs i64 @constant_true() {
 ; Test ballot of a non-comparison operation
 
 define amdgpu_cs i64 @non_compare(i32 %x) {
-; CHECK-LABEL: non_compare:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    v_and_b32_e32 v0, 1, v0
-; CHECK-NEXT:    s_mov_b32 s1, 0
-; CHECK-NEXT:    v_cmp_ne_u32_e64 s0, 0, v0
-; CHECK-NEXT:    ; return to shader part epilog
+; DAGISEL-LABEL: non_compare:
+; DAGISEL:       ; %bb.0:
+; DAGISEL-NEXT:    v_and_b32_e32 v0, 1, v0
+; DAGISEL-NEXT:    s_mov_b32 s1, 0
+; DAGISEL-NEXT:    v_cmp_ne_u32_e64 s0, 0, v0
+; DAGISEL-NEXT:    ; return to shader part epilog
+;
+; GISEL-LABEL: non_compare:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    v_and_b32_e32 v0, 1, v0
+; GISEL-NEXT:    s_mov_b32 s1, 0
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GISEL-NEXT:    s_and_b32 s0, vcc_lo, exec_lo
+; GISEL-NEXT:    ; return to shader part epilog
   %trunc = trunc i32 %x to i1
   %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %trunc)
   ret i64 %ballot