[clang] Improve selection of conditional branch on amdgcn.ballot!=0 condition in SelectionDAG. (PR #68714)
Valery Pykhtin via cfe-commits
cfe-commits at lists.llvm.org
Wed Oct 25 08:27:46 PDT 2023
https://github.com/vpykhtin updated https://github.com/llvm/llvm-project/pull/68714
>From 0b1413855051c9f9672a5fca7544783a47d44bc0 Mon Sep 17 00:00:00 2001
From: Valery Pykhtin <valery.pykhtin at gmail.com>
Date: Tue, 10 Oct 2023 14:13:26 +0200
Subject: [PATCH 1/6] [AMDGPU] Add tests for conditional branch on
amdgcn.ballot eq/ne zero.
---
.../GlobalISel/llvm.amdgcn.ballot.i32.ll | 310 ++++++++++++++++++
.../GlobalISel/llvm.amdgcn.ballot.i64.ll | 310 ++++++++++++++++++
.../CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll | 308 +++++++++++++++++
.../CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll | 308 +++++++++++++++++
4 files changed, 1236 insertions(+)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
index 8bd1be04650e005..6c12329930b8a2c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
@@ -83,3 +83,313 @@ define amdgpu_cs i32 @ctpop_of_ballot(float %x, float %y) {
%bcnt = call i32 @llvm.ctpop.i32(i32 %ballot)
ret i32 %bcnt
}
+
+define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_non_compare(i32 %v) {
+; CHECK-LABEL: branch_divergent_ballot_ne_zero_non_compare:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; CHECK-NEXT: s_cmp_eq_u32 vcc_lo, 0
+; CHECK-NEXT: s_cbranch_scc1 .LBB7_2
+; CHECK-NEXT: ; %bb.1: ; %true
+; CHECK-NEXT: s_mov_b32 s0, 42
+; CHECK-NEXT: s_branch .LBB7_3
+; CHECK-NEXT: .LBB7_2: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB7_3
+; CHECK-NEXT: .LBB7_3:
+ %c = trunc i32 %v to i1
+ %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
+ %ballot_ne_zero = icmp ne i32 %ballot, 0
+ br i1 %ballot_ne_zero, label %true, label %false
+true:
+ ret i32 42
+false:
+ ret i32 33
+}
+
+define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_non_compare(i32 inreg %v) {
+; CHECK-LABEL: branch_uniform_ballot_ne_zero_non_compare:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_and_b32 s0, 1, s0
+; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
+; CHECK-NEXT: s_cmp_eq_u32 s0, 0
+; CHECK-NEXT: s_cbranch_scc1 .LBB8_2
+; CHECK-NEXT: ; %bb.1: ; %true
+; CHECK-NEXT: s_mov_b32 s0, 42
+; CHECK-NEXT: s_branch .LBB8_3
+; CHECK-NEXT: .LBB8_2: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB8_3
+; CHECK-NEXT: .LBB8_3:
+ %c = trunc i32 %v to i1
+ %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
+ %ballot_ne_zero = icmp ne i32 %ballot, 0
+ br i1 %ballot_ne_zero, label %true, label %false
+true:
+ ret i32 42
+false:
+ ret i32 33
+}
+
+define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) {
+; CHECK-LABEL: branch_divergent_ballot_eq_zero_non_compare:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; CHECK-NEXT: s_cmp_lg_u32 vcc_lo, 0
+; CHECK-NEXT: s_cbranch_scc0 .LBB9_2
+; CHECK-NEXT: ; %bb.1: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB9_3
+; CHECK-NEXT: .LBB9_2: ; %true
+; CHECK-NEXT: s_mov_b32 s0, 42
+; CHECK-NEXT: s_branch .LBB9_3
+; CHECK-NEXT: .LBB9_3:
+ %c = trunc i32 %v to i1
+ %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
+ %ballot_eq_zero = icmp eq i32 %ballot, 0
+ br i1 %ballot_eq_zero, label %true, label %false
+true:
+ ret i32 42
+false:
+ ret i32 33
+}
+
+define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_non_compare(i32 inreg %v) {
+; CHECK-LABEL: branch_uniform_ballot_eq_zero_non_compare:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_and_b32 s0, 1, s0
+; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
+; CHECK-NEXT: s_cbranch_scc0 .LBB10_2
+; CHECK-NEXT: ; %bb.1: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB10_3
+; CHECK-NEXT: .LBB10_2: ; %true
+; CHECK-NEXT: s_mov_b32 s0, 42
+; CHECK-NEXT: s_branch .LBB10_3
+; CHECK-NEXT: .LBB10_3:
+ %c = trunc i32 %v to i1
+ %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
+ %ballot_eq_zero = icmp eq i32 %ballot, 0
+ br i1 %ballot_eq_zero, label %true, label %false
+true:
+ ret i32 42
+false:
+ ret i32 33
+}
+
+define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_compare(i32 %v) {
+; CHECK-LABEL: branch_divergent_ballot_ne_zero_compare:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
+; CHECK-NEXT: s_cmp_eq_u32 vcc_lo, 0
+; CHECK-NEXT: s_cbranch_scc1 .LBB11_2
+; CHECK-NEXT: ; %bb.1: ; %true
+; CHECK-NEXT: s_mov_b32 s0, 42
+; CHECK-NEXT: s_branch .LBB11_3
+; CHECK-NEXT: .LBB11_2: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB11_3
+; CHECK-NEXT: .LBB11_3:
+ %c = icmp ult i32 %v, 12
+ %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
+ %ballot_ne_zero = icmp ne i32 %ballot, 0
+ br i1 %ballot_ne_zero, label %true, label %false
+true:
+ ret i32 42
+false:
+ ret i32 33
+}
+
+define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_compare(i32 inreg %v) {
+; CHECK-LABEL: branch_uniform_ballot_ne_zero_compare:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_cmp_lt_u32 s0, 12
+; CHECK-NEXT: s_cselect_b32 s0, 1, 0
+; CHECK-NEXT: s_and_b32 s0, 1, s0
+; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
+; CHECK-NEXT: s_cmp_eq_u32 s0, 0
+; CHECK-NEXT: s_cbranch_scc1 .LBB12_2
+; CHECK-NEXT: ; %bb.1: ; %true
+; CHECK-NEXT: s_mov_b32 s0, 42
+; CHECK-NEXT: s_branch .LBB12_3
+; CHECK-NEXT: .LBB12_2: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB12_3
+; CHECK-NEXT: .LBB12_3:
+ %c = icmp ult i32 %v, 12
+ %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
+ %ballot_ne_zero = icmp ne i32 %ballot, 0
+ br i1 %ballot_ne_zero, label %true, label %false
+true:
+ ret i32 42
+false:
+ ret i32 33
+}
+
+define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_compare(i32 %v) {
+; CHECK-LABEL: branch_divergent_ballot_eq_zero_compare:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
+; CHECK-NEXT: s_cmp_lg_u32 vcc_lo, 0
+; CHECK-NEXT: s_cbranch_scc0 .LBB13_2
+; CHECK-NEXT: ; %bb.1: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB13_3
+; CHECK-NEXT: .LBB13_2: ; %true
+; CHECK-NEXT: s_mov_b32 s0, 42
+; CHECK-NEXT: s_branch .LBB13_3
+; CHECK-NEXT: .LBB13_3:
+ %c = icmp ult i32 %v, 12
+ %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
+ %ballot_eq_zero = icmp eq i32 %ballot, 0
+ br i1 %ballot_eq_zero, label %true, label %false
+true:
+ ret i32 42
+false:
+ ret i32 33
+}
+
+define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_compare(i32 inreg %v) {
+; CHECK-LABEL: branch_uniform_ballot_eq_zero_compare:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_cmp_lt_u32 s0, 12
+; CHECK-NEXT: s_cselect_b32 s0, 1, 0
+; CHECK-NEXT: s_and_b32 s0, 1, s0
+; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
+; CHECK-NEXT: s_cbranch_scc0 .LBB14_2
+; CHECK-NEXT: ; %bb.1: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB14_3
+; CHECK-NEXT: .LBB14_2: ; %true
+; CHECK-NEXT: s_mov_b32 s0, 42
+; CHECK-NEXT: s_branch .LBB14_3
+; CHECK-NEXT: .LBB14_3:
+ %c = icmp ult i32 %v, 12
+ %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
+ %ballot_eq_zero = icmp eq i32 %ballot, 0
+ br i1 %ballot_eq_zero, label %true, label %false
+true:
+ ret i32 42
+false:
+ ret i32 33
+}
+
+define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_and(i32 %v1, i32 %v2) {
+; CHECK-LABEL: branch_divergent_ballot_ne_zero_and:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
+; CHECK-NEXT: v_cmp_lt_u32_e64 s0, 34, v1
+; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0
+; CHECK-NEXT: s_cmp_eq_u32 s0, 0
+; CHECK-NEXT: s_cbranch_scc1 .LBB15_2
+; CHECK-NEXT: ; %bb.1: ; %true
+; CHECK-NEXT: s_mov_b32 s0, 42
+; CHECK-NEXT: s_branch .LBB15_3
+; CHECK-NEXT: .LBB15_2: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB15_3
+; CHECK-NEXT: .LBB15_3:
+ %v1c = icmp ult i32 %v1, 12
+ %v2c = icmp ugt i32 %v2, 34
+ %c = and i1 %v1c, %v2c
+ %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
+ %ballot_ne_zero = icmp ne i32 %ballot, 0
+ br i1 %ballot_ne_zero, label %true, label %false
+true:
+ ret i32 42
+false:
+ ret i32 33
+}
+
+define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_and(i32 inreg %v1, i32 inreg %v2) {
+; CHECK-LABEL: branch_uniform_ballot_ne_zero_and:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_cmp_lt_u32 s0, 12
+; CHECK-NEXT: s_cselect_b32 s0, 1, 0
+; CHECK-NEXT: s_cmp_gt_u32 s1, 34
+; CHECK-NEXT: s_cselect_b32 s1, 1, 0
+; CHECK-NEXT: s_and_b32 s0, s0, s1
+; CHECK-NEXT: s_and_b32 s0, 1, s0
+; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
+; CHECK-NEXT: s_cmp_eq_u32 s0, 0
+; CHECK-NEXT: s_cbranch_scc1 .LBB16_2
+; CHECK-NEXT: ; %bb.1: ; %true
+; CHECK-NEXT: s_mov_b32 s0, 42
+; CHECK-NEXT: s_branch .LBB16_3
+; CHECK-NEXT: .LBB16_2: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB16_3
+; CHECK-NEXT: .LBB16_3:
+ %v1c = icmp ult i32 %v1, 12
+ %v2c = icmp ugt i32 %v2, 34
+ %c = and i1 %v1c, %v2c
+ %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
+ %ballot_ne_zero = icmp ne i32 %ballot, 0
+ br i1 %ballot_ne_zero, label %true, label %false
+true:
+ ret i32 42
+false:
+ ret i32 33
+}
+
+define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_and(i32 %v1, i32 %v2) {
+; CHECK-LABEL: branch_divergent_ballot_eq_zero_and:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
+; CHECK-NEXT: v_cmp_lt_u32_e64 s0, 34, v1
+; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
+; CHECK-NEXT: s_cbranch_scc0 .LBB17_2
+; CHECK-NEXT: ; %bb.1: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB17_3
+; CHECK-NEXT: .LBB17_2: ; %true
+; CHECK-NEXT: s_mov_b32 s0, 42
+; CHECK-NEXT: s_branch .LBB17_3
+; CHECK-NEXT: .LBB17_3:
+ %v1c = icmp ult i32 %v1, 12
+ %v2c = icmp ugt i32 %v2, 34
+ %c = and i1 %v1c, %v2c
+ %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
+ %ballot_eq_zero = icmp eq i32 %ballot, 0
+ br i1 %ballot_eq_zero, label %true, label %false
+true:
+ ret i32 42
+false:
+ ret i32 33
+}
+
+define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_and(i32 inreg %v1, i32 inreg %v2) {
+; CHECK-LABEL: branch_uniform_ballot_eq_zero_and:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_cmp_lt_u32 s0, 12
+; CHECK-NEXT: s_cselect_b32 s0, 1, 0
+; CHECK-NEXT: s_cmp_gt_u32 s1, 34
+; CHECK-NEXT: s_cselect_b32 s1, 1, 0
+; CHECK-NEXT: s_and_b32 s0, s0, s1
+; CHECK-NEXT: s_and_b32 s0, 1, s0
+; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
+; CHECK-NEXT: s_cbranch_scc0 .LBB18_2
+; CHECK-NEXT: ; %bb.1: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB18_3
+; CHECK-NEXT: .LBB18_2: ; %true
+; CHECK-NEXT: s_mov_b32 s0, 42
+; CHECK-NEXT: s_branch .LBB18_3
+; CHECK-NEXT: .LBB18_3:
+ %v1c = icmp ult i32 %v1, 12
+ %v2c = icmp ugt i32 %v2, 34
+ %c = and i1 %v1c, %v2c
+ %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
+ %ballot_eq_zero = icmp eq i32 %ballot, 0
+ br i1 %ballot_eq_zero, label %true, label %false
+true:
+ ret i32 42
+false:
+ ret i32 33
+}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll
index 9f83012f5457509..ebb96ddc0603d68 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll
@@ -86,3 +86,313 @@ define amdgpu_cs i64 @ctpop_of_ballot(float %x, float %y) {
%bcnt = call i64 @llvm.ctpop.i64(i64 %ballot)
ret i64 %bcnt
}
+
+define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_non_compare(i32 %v) {
+; CHECK-LABEL: branch_divergent_ballot_ne_zero_non_compare:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; CHECK-NEXT: s_cmp_eq_u64 vcc, 0
+; CHECK-NEXT: s_cbranch_scc1 .LBB7_2
+; CHECK-NEXT: ; %bb.1: ; %true
+; CHECK-NEXT: s_mov_b32 s0, 42
+; CHECK-NEXT: s_branch .LBB7_3
+; CHECK-NEXT: .LBB7_2: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB7_3
+; CHECK-NEXT: .LBB7_3:
+ %c = trunc i32 %v to i1
+ %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
+ %ballot_ne_zero = icmp ne i64 %ballot, 0
+ br i1 %ballot_ne_zero, label %true, label %false
+true:
+ ret i32 42
+false:
+ ret i32 33
+}
+
+define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_non_compare(i32 inreg %v) {
+; CHECK-LABEL: branch_uniform_ballot_ne_zero_non_compare:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_and_b32 s0, 1, s0
+; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
+; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 0
+; CHECK-NEXT: s_cbranch_scc1 .LBB8_2
+; CHECK-NEXT: ; %bb.1: ; %true
+; CHECK-NEXT: s_mov_b32 s0, 42
+; CHECK-NEXT: s_branch .LBB8_3
+; CHECK-NEXT: .LBB8_2: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB8_3
+; CHECK-NEXT: .LBB8_3:
+ %c = trunc i32 %v to i1
+ %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
+ %ballot_ne_zero = icmp ne i64 %ballot, 0
+ br i1 %ballot_ne_zero, label %true, label %false
+true:
+ ret i32 42
+false:
+ ret i32 33
+}
+
+define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) {
+; CHECK-LABEL: branch_divergent_ballot_eq_zero_non_compare:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; CHECK-NEXT: s_cmp_lg_u64 vcc, 0
+; CHECK-NEXT: s_cbranch_scc0 .LBB9_2
+; CHECK-NEXT: ; %bb.1: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB9_3
+; CHECK-NEXT: .LBB9_2: ; %true
+; CHECK-NEXT: s_mov_b32 s0, 42
+; CHECK-NEXT: s_branch .LBB9_3
+; CHECK-NEXT: .LBB9_3:
+ %c = trunc i32 %v to i1
+ %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
+ %ballot_eq_zero = icmp eq i64 %ballot, 0
+ br i1 %ballot_eq_zero, label %true, label %false
+true:
+ ret i32 42
+false:
+ ret i32 33
+}
+
+define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_non_compare(i32 inreg %v) {
+; CHECK-LABEL: branch_uniform_ballot_eq_zero_non_compare:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_and_b32 s0, 1, s0
+; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
+; CHECK-NEXT: s_cbranch_scc0 .LBB10_2
+; CHECK-NEXT: ; %bb.1: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB10_3
+; CHECK-NEXT: .LBB10_2: ; %true
+; CHECK-NEXT: s_mov_b32 s0, 42
+; CHECK-NEXT: s_branch .LBB10_3
+; CHECK-NEXT: .LBB10_3:
+ %c = trunc i32 %v to i1
+ %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
+ %ballot_eq_zero = icmp eq i64 %ballot, 0
+ br i1 %ballot_eq_zero, label %true, label %false
+true:
+ ret i32 42
+false:
+ ret i32 33
+}
+
+define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_compare(i32 %v) {
+; CHECK-LABEL: branch_divergent_ballot_ne_zero_compare:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 12, v0
+; CHECK-NEXT: s_cmp_eq_u64 vcc, 0
+; CHECK-NEXT: s_cbranch_scc1 .LBB11_2
+; CHECK-NEXT: ; %bb.1: ; %true
+; CHECK-NEXT: s_mov_b32 s0, 42
+; CHECK-NEXT: s_branch .LBB11_3
+; CHECK-NEXT: .LBB11_2: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB11_3
+; CHECK-NEXT: .LBB11_3:
+ %c = icmp ult i32 %v, 12
+ %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
+ %ballot_ne_zero = icmp ne i64 %ballot, 0
+ br i1 %ballot_ne_zero, label %true, label %false
+true:
+ ret i32 42
+false:
+ ret i32 33
+}
+
+define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_compare(i32 inreg %v) {
+; CHECK-LABEL: branch_uniform_ballot_ne_zero_compare:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_cmp_lt_u32 s0, 12
+; CHECK-NEXT: s_cselect_b32 s0, 1, 0
+; CHECK-NEXT: s_and_b32 s0, 1, s0
+; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
+; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 0
+; CHECK-NEXT: s_cbranch_scc1 .LBB12_2
+; CHECK-NEXT: ; %bb.1: ; %true
+; CHECK-NEXT: s_mov_b32 s0, 42
+; CHECK-NEXT: s_branch .LBB12_3
+; CHECK-NEXT: .LBB12_2: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB12_3
+; CHECK-NEXT: .LBB12_3:
+ %c = icmp ult i32 %v, 12
+ %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
+ %ballot_ne_zero = icmp ne i64 %ballot, 0
+ br i1 %ballot_ne_zero, label %true, label %false
+true:
+ ret i32 42
+false:
+ ret i32 33
+}
+
+define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_compare(i32 %v) {
+; CHECK-LABEL: branch_divergent_ballot_eq_zero_compare:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 12, v0
+; CHECK-NEXT: s_cmp_lg_u64 vcc, 0
+; CHECK-NEXT: s_cbranch_scc0 .LBB13_2
+; CHECK-NEXT: ; %bb.1: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB13_3
+; CHECK-NEXT: .LBB13_2: ; %true
+; CHECK-NEXT: s_mov_b32 s0, 42
+; CHECK-NEXT: s_branch .LBB13_3
+; CHECK-NEXT: .LBB13_3:
+ %c = icmp ult i32 %v, 12
+ %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
+ %ballot_eq_zero = icmp eq i64 %ballot, 0
+ br i1 %ballot_eq_zero, label %true, label %false
+true:
+ ret i32 42
+false:
+ ret i32 33
+}
+
+define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_compare(i32 inreg %v) {
+; CHECK-LABEL: branch_uniform_ballot_eq_zero_compare:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_cmp_lt_u32 s0, 12
+; CHECK-NEXT: s_cselect_b32 s0, 1, 0
+; CHECK-NEXT: s_and_b32 s0, 1, s0
+; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
+; CHECK-NEXT: s_cbranch_scc0 .LBB14_2
+; CHECK-NEXT: ; %bb.1: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB14_3
+; CHECK-NEXT: .LBB14_2: ; %true
+; CHECK-NEXT: s_mov_b32 s0, 42
+; CHECK-NEXT: s_branch .LBB14_3
+; CHECK-NEXT: .LBB14_3:
+ %c = icmp ult i32 %v, 12
+ %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
+ %ballot_eq_zero = icmp eq i64 %ballot, 0
+ br i1 %ballot_eq_zero, label %true, label %false
+true:
+ ret i32 42
+false:
+ ret i32 33
+}
+
+define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_and(i32 %v1, i32 %v2) {
+; CHECK-LABEL: branch_divergent_ballot_ne_zero_and:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 12, v0
+; CHECK-NEXT: v_cmp_lt_u32_e64 s[0:1], 34, v1
+; CHECK-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
+; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 0
+; CHECK-NEXT: s_cbranch_scc1 .LBB15_2
+; CHECK-NEXT: ; %bb.1: ; %true
+; CHECK-NEXT: s_mov_b32 s0, 42
+; CHECK-NEXT: s_branch .LBB15_3
+; CHECK-NEXT: .LBB15_2: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB15_3
+; CHECK-NEXT: .LBB15_3:
+ %v1c = icmp ult i32 %v1, 12
+ %v2c = icmp ugt i32 %v2, 34
+ %c = and i1 %v1c, %v2c
+ %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
+ %ballot_ne_zero = icmp ne i64 %ballot, 0
+ br i1 %ballot_ne_zero, label %true, label %false
+true:
+ ret i32 42
+false:
+ ret i32 33
+}
+
+define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_and(i32 inreg %v1, i32 inreg %v2) {
+; CHECK-LABEL: branch_uniform_ballot_ne_zero_and:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_cmp_lt_u32 s0, 12
+; CHECK-NEXT: s_cselect_b32 s0, 1, 0
+; CHECK-NEXT: s_cmp_gt_u32 s1, 34
+; CHECK-NEXT: s_cselect_b32 s1, 1, 0
+; CHECK-NEXT: s_and_b32 s0, s0, s1
+; CHECK-NEXT: s_and_b32 s0, 1, s0
+; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
+; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 0
+; CHECK-NEXT: s_cbranch_scc1 .LBB16_2
+; CHECK-NEXT: ; %bb.1: ; %true
+; CHECK-NEXT: s_mov_b32 s0, 42
+; CHECK-NEXT: s_branch .LBB16_3
+; CHECK-NEXT: .LBB16_2: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB16_3
+; CHECK-NEXT: .LBB16_3:
+ %v1c = icmp ult i32 %v1, 12
+ %v2c = icmp ugt i32 %v2, 34
+ %c = and i1 %v1c, %v2c
+ %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
+ %ballot_ne_zero = icmp ne i64 %ballot, 0
+ br i1 %ballot_ne_zero, label %true, label %false
+true:
+ ret i32 42
+false:
+ ret i32 33
+}
+
+define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_and(i32 %v1, i32 %v2) {
+; CHECK-LABEL: branch_divergent_ballot_eq_zero_and:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 12, v0
+; CHECK-NEXT: v_cmp_lt_u32_e64 s[0:1], 34, v1
+; CHECK-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
+; CHECK-NEXT: s_cbranch_scc0 .LBB17_2
+; CHECK-NEXT: ; %bb.1: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB17_3
+; CHECK-NEXT: .LBB17_2: ; %true
+; CHECK-NEXT: s_mov_b32 s0, 42
+; CHECK-NEXT: s_branch .LBB17_3
+; CHECK-NEXT: .LBB17_3:
+ %v1c = icmp ult i32 %v1, 12
+ %v2c = icmp ugt i32 %v2, 34
+ %c = and i1 %v1c, %v2c
+ %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
+ %ballot_eq_zero = icmp eq i64 %ballot, 0
+ br i1 %ballot_eq_zero, label %true, label %false
+true:
+ ret i32 42
+false:
+ ret i32 33
+}
+
+define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_and(i32 inreg %v1, i32 inreg %v2) {
+; CHECK-LABEL: branch_uniform_ballot_eq_zero_and:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_cmp_lt_u32 s0, 12
+; CHECK-NEXT: s_cselect_b32 s0, 1, 0
+; CHECK-NEXT: s_cmp_gt_u32 s1, 34
+; CHECK-NEXT: s_cselect_b32 s1, 1, 0
+; CHECK-NEXT: s_and_b32 s0, s0, s1
+; CHECK-NEXT: s_and_b32 s0, 1, s0
+; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
+; CHECK-NEXT: s_cbranch_scc0 .LBB18_2
+; CHECK-NEXT: ; %bb.1: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB18_3
+; CHECK-NEXT: .LBB18_2: ; %true
+; CHECK-NEXT: s_mov_b32 s0, 42
+; CHECK-NEXT: s_branch .LBB18_3
+; CHECK-NEXT: .LBB18_3:
+ %v1c = icmp ult i32 %v1, 12
+ %v2c = icmp ugt i32 %v2, 34
+ %c = and i1 %v1c, %v2c
+ %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
+ %ballot_eq_zero = icmp eq i64 %ballot, 0
+ br i1 %ballot_eq_zero, label %true, label %false
+true:
+ ret i32 42
+false:
+ ret i32 33
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll
index 7e719e86b521fd8..a78fdfd94512fde 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll
@@ -83,3 +83,311 @@ define amdgpu_cs i32 @ctpop_of_ballot(float %x, float %y) {
%bcnt = call i32 @llvm.ctpop.i32(i32 %ballot)
ret i32 %bcnt
}
+
+define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_non_compare(i32 %v) {
+; CHECK-LABEL: branch_divergent_ballot_ne_zero_non_compare:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; CHECK-NEXT: s_cmp_eq_u32 vcc_lo, 0
+; CHECK-NEXT: s_cbranch_scc1 .LBB7_2
+; CHECK-NEXT: ; %bb.1: ; %true
+; CHECK-NEXT: s_mov_b32 s0, 42
+; CHECK-NEXT: s_branch .LBB7_3
+; CHECK-NEXT: .LBB7_2: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB7_3
+; CHECK-NEXT: .LBB7_3:
+ %c = trunc i32 %v to i1
+ %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
+ %ballot_ne_zero = icmp ne i32 %ballot, 0
+ br i1 %ballot_ne_zero, label %true, label %false
+true:
+ ret i32 42
+false:
+ ret i32 33
+}
+
+define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_non_compare(i32 inreg %v) {
+; CHECK-LABEL: branch_uniform_ballot_ne_zero_non_compare:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_and_b32 s0, s0, 1
+; CHECK-NEXT: v_cmp_ne_u32_e64 s0, s0, 0
+; CHECK-NEXT: s_cmp_eq_u32 s0, 0
+; CHECK-NEXT: s_cbranch_scc1 .LBB8_2
+; CHECK-NEXT: ; %bb.1: ; %true
+; CHECK-NEXT: s_mov_b32 s0, 42
+; CHECK-NEXT: s_branch .LBB8_3
+; CHECK-NEXT: .LBB8_2: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB8_3
+; CHECK-NEXT: .LBB8_3:
+ %c = trunc i32 %v to i1
+ %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
+ %ballot_ne_zero = icmp ne i32 %ballot, 0
+ br i1 %ballot_ne_zero, label %true, label %false
+true:
+ ret i32 42
+false:
+ ret i32 33
+}
+
+define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) {
+; CHECK-LABEL: branch_divergent_ballot_eq_zero_non_compare:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; CHECK-NEXT: s_cmp_lg_u32 vcc_lo, 0
+; CHECK-NEXT: s_cbranch_scc0 .LBB9_2
+; CHECK-NEXT: ; %bb.1: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB9_3
+; CHECK-NEXT: .LBB9_2: ; %true
+; CHECK-NEXT: s_mov_b32 s0, 42
+; CHECK-NEXT: s_branch .LBB9_3
+; CHECK-NEXT: .LBB9_3:
+ %c = trunc i32 %v to i1
+ %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
+ %ballot_eq_zero = icmp eq i32 %ballot, 0
+ br i1 %ballot_eq_zero, label %true, label %false
+true:
+ ret i32 42
+false:
+ ret i32 33
+}
+
+define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_non_compare(i32 inreg %v) {
+; CHECK-LABEL: branch_uniform_ballot_eq_zero_non_compare:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_and_b32 s0, s0, 1
+; CHECK-NEXT: v_cmp_ne_u32_e64 s0, s0, 0
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
+; CHECK-NEXT: s_cbranch_scc0 .LBB10_2
+; CHECK-NEXT: ; %bb.1: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB10_3
+; CHECK-NEXT: .LBB10_2: ; %true
+; CHECK-NEXT: s_mov_b32 s0, 42
+; CHECK-NEXT: s_branch .LBB10_3
+; CHECK-NEXT: .LBB10_3:
+ %c = trunc i32 %v to i1
+ %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
+ %ballot_eq_zero = icmp eq i32 %ballot, 0
+ br i1 %ballot_eq_zero, label %true, label %false
+true:
+ ret i32 42
+false:
+ ret i32 33
+}
+
+define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_compare(i32 %v) {
+; CHECK-LABEL: branch_divergent_ballot_ne_zero_compare:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
+; CHECK-NEXT: s_cmp_eq_u32 vcc_lo, 0
+; CHECK-NEXT: s_cbranch_scc1 .LBB11_2
+; CHECK-NEXT: ; %bb.1: ; %true
+; CHECK-NEXT: s_mov_b32 s0, 42
+; CHECK-NEXT: s_branch .LBB11_3
+; CHECK-NEXT: .LBB11_2: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB11_3
+; CHECK-NEXT: .LBB11_3:
+ %c = icmp ult i32 %v, 12
+ %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
+ %ballot_ne_zero = icmp ne i32 %ballot, 0
+ br i1 %ballot_ne_zero, label %true, label %false
+true:
+ ret i32 42
+false:
+ ret i32 33
+}
+
+define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_compare(i32 inreg %v) {
+; CHECK-LABEL: branch_uniform_ballot_ne_zero_compare:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: v_cmp_lt_u32_e64 s0, s0, 12
+; CHECK-NEXT: s_cmp_eq_u32 s0, 0
+; CHECK-NEXT: s_cbranch_scc1 .LBB12_2
+; CHECK-NEXT: ; %bb.1: ; %true
+; CHECK-NEXT: s_mov_b32 s0, 42
+; CHECK-NEXT: s_branch .LBB12_3
+; CHECK-NEXT: .LBB12_2: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB12_3
+; CHECK-NEXT: .LBB12_3:
+ %c = icmp ult i32 %v, 12
+ %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
+ %ballot_ne_zero = icmp ne i32 %ballot, 0
+ br i1 %ballot_ne_zero, label %true, label %false
+true:
+ ret i32 42
+false:
+ ret i32 33
+}
+
+define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_compare(i32 %v) {
+; CHECK-LABEL: branch_divergent_ballot_eq_zero_compare:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
+; CHECK-NEXT: s_cmp_lg_u32 vcc_lo, 0
+; CHECK-NEXT: s_cbranch_scc0 .LBB13_2
+; CHECK-NEXT: ; %bb.1: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB13_3
+; CHECK-NEXT: .LBB13_2: ; %true
+; CHECK-NEXT: s_mov_b32 s0, 42
+; CHECK-NEXT: s_branch .LBB13_3
+; CHECK-NEXT: .LBB13_3:
+ %c = icmp ult i32 %v, 12
+ %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
+ %ballot_eq_zero = icmp eq i32 %ballot, 0
+ br i1 %ballot_eq_zero, label %true, label %false
+true:
+ ret i32 42
+false:
+ ret i32 33
+}
+
+define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_compare(i32 inreg %v) {
+; CHECK-LABEL: branch_uniform_ballot_eq_zero_compare:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: v_cmp_lt_u32_e64 s0, s0, 12
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
+; CHECK-NEXT: s_cbranch_scc0 .LBB14_2
+; CHECK-NEXT: ; %bb.1: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB14_3
+; CHECK-NEXT: .LBB14_2: ; %true
+; CHECK-NEXT: s_mov_b32 s0, 42
+; CHECK-NEXT: s_branch .LBB14_3
+; CHECK-NEXT: .LBB14_3:
+ %c = icmp ult i32 %v, 12
+ %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
+ %ballot_eq_zero = icmp eq i32 %ballot, 0
+ br i1 %ballot_eq_zero, label %true, label %false
+true:
+ ret i32 42
+false:
+ ret i32 33
+}
+
+define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_and(i32 %v1, i32 %v2) {
+; CHECK-LABEL: branch_divergent_ballot_ne_zero_and:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
+; CHECK-NEXT: v_cmp_lt_u32_e64 s0, 34, v1
+; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; CHECK-NEXT: s_cmp_eq_u32 vcc_lo, 0
+; CHECK-NEXT: s_cbranch_scc1 .LBB15_2
+; CHECK-NEXT: ; %bb.1: ; %true
+; CHECK-NEXT: s_mov_b32 s0, 42
+; CHECK-NEXT: s_branch .LBB15_3
+; CHECK-NEXT: .LBB15_2: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB15_3
+; CHECK-NEXT: .LBB15_3:
+ %v1c = icmp ult i32 %v1, 12
+ %v2c = icmp ugt i32 %v2, 34
+ %c = and i1 %v1c, %v2c
+ %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
+ %ballot_ne_zero = icmp ne i32 %ballot, 0
+ br i1 %ballot_ne_zero, label %true, label %false
+true:
+ ret i32 42
+false:
+ ret i32 33
+}
+
+define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_and(i32 inreg %v1, i32 inreg %v2) {
+; CHECK-LABEL: branch_uniform_ballot_ne_zero_and:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_cmp_lt_u32 s0, 12
+; CHECK-NEXT: s_cselect_b32 s0, -1, 0
+; CHECK-NEXT: s_cmp_gt_u32 s1, 34
+; CHECK-NEXT: s_cselect_b32 s1, -1, 0
+; CHECK-NEXT: s_and_b32 s0, s0, s1
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; CHECK-NEXT: s_cmp_eq_u32 vcc_lo, 0
+; CHECK-NEXT: s_cbranch_scc1 .LBB16_2
+; CHECK-NEXT: ; %bb.1: ; %true
+; CHECK-NEXT: s_mov_b32 s0, 42
+; CHECK-NEXT: s_branch .LBB16_3
+; CHECK-NEXT: .LBB16_2: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB16_3
+; CHECK-NEXT: .LBB16_3:
+ %v1c = icmp ult i32 %v1, 12
+ %v2c = icmp ugt i32 %v2, 34
+ %c = and i1 %v1c, %v2c
+ %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
+ %ballot_ne_zero = icmp ne i32 %ballot, 0
+ br i1 %ballot_ne_zero, label %true, label %false
+true:
+ ret i32 42
+false:
+ ret i32 33
+}
+
+define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_and(i32 %v1, i32 %v2) {
+; CHECK-LABEL: branch_divergent_ballot_eq_zero_and:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
+; CHECK-NEXT: v_cmp_lt_u32_e64 s0, 34, v1
+; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; CHECK-NEXT: s_cmp_lg_u32 vcc_lo, 0
+; CHECK-NEXT: s_cbranch_scc0 .LBB17_2
+; CHECK-NEXT: ; %bb.1: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB17_3
+; CHECK-NEXT: .LBB17_2: ; %true
+; CHECK-NEXT: s_mov_b32 s0, 42
+; CHECK-NEXT: s_branch .LBB17_3
+; CHECK-NEXT: .LBB17_3:
+ %v1c = icmp ult i32 %v1, 12
+ %v2c = icmp ugt i32 %v2, 34
+ %c = and i1 %v1c, %v2c
+ %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
+ %ballot_eq_zero = icmp eq i32 %ballot, 0
+ br i1 %ballot_eq_zero, label %true, label %false
+true:
+ ret i32 42
+false:
+ ret i32 33
+}
+
+define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_and(i32 inreg %v1, i32 inreg %v2) {
+; CHECK-LABEL: branch_uniform_ballot_eq_zero_and:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_cmp_lt_u32 s0, 12
+; CHECK-NEXT: s_cselect_b32 s0, -1, 0
+; CHECK-NEXT: s_cmp_gt_u32 s1, 34
+; CHECK-NEXT: s_cselect_b32 s1, -1, 0
+; CHECK-NEXT: s_and_b32 s0, s0, s1
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; CHECK-NEXT: s_cmp_lg_u32 vcc_lo, 0
+; CHECK-NEXT: s_cbranch_scc0 .LBB18_2
+; CHECK-NEXT: ; %bb.1: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB18_3
+; CHECK-NEXT: .LBB18_2: ; %true
+; CHECK-NEXT: s_mov_b32 s0, 42
+; CHECK-NEXT: s_branch .LBB18_3
+; CHECK-NEXT: .LBB18_3:
+ %v1c = icmp ult i32 %v1, 12
+ %v2c = icmp ugt i32 %v2, 34
+ %c = and i1 %v1c, %v2c
+ %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
+ %ballot_eq_zero = icmp eq i32 %ballot, 0
+ br i1 %ballot_eq_zero, label %true, label %false
+true:
+ ret i32 42
+false:
+ ret i32 33
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll
index 473159ec20e5524..e6415f6982ac83f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll
@@ -86,3 +86,311 @@ define amdgpu_cs i64 @ctpop_of_ballot(float %x, float %y) {
%bcnt = call i64 @llvm.ctpop.i64(i64 %ballot)
ret i64 %bcnt
}
+
+define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_non_compare(i32 %v) {
+; CHECK-LABEL: branch_divergent_ballot_ne_zero_non_compare:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; CHECK-NEXT: s_cmp_eq_u64 vcc, 0
+; CHECK-NEXT: s_cbranch_scc1 .LBB7_2
+; CHECK-NEXT: ; %bb.1: ; %true
+; CHECK-NEXT: s_mov_b32 s0, 42
+; CHECK-NEXT: s_branch .LBB7_3
+; CHECK-NEXT: .LBB7_2: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB7_3
+; CHECK-NEXT: .LBB7_3:
+ %c = trunc i32 %v to i1
+ %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
+ %ballot_ne_zero = icmp ne i64 %ballot, 0
+ br i1 %ballot_ne_zero, label %true, label %false
+true:
+ ret i32 42
+false:
+ ret i32 33
+}
+
+define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_non_compare(i32 inreg %v) {
+; CHECK-LABEL: branch_uniform_ballot_ne_zero_non_compare:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_and_b32 s0, s0, 1
+; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], s0, 0
+; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 0
+; CHECK-NEXT: s_cbranch_scc1 .LBB8_2
+; CHECK-NEXT: ; %bb.1: ; %true
+; CHECK-NEXT: s_mov_b32 s0, 42
+; CHECK-NEXT: s_branch .LBB8_3
+; CHECK-NEXT: .LBB8_2: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB8_3
+; CHECK-NEXT: .LBB8_3:
+ %c = trunc i32 %v to i1
+ %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
+ %ballot_ne_zero = icmp ne i64 %ballot, 0
+ br i1 %ballot_ne_zero, label %true, label %false
+true:
+ ret i32 42
+false:
+ ret i32 33
+}
+
+define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) {
+; CHECK-LABEL: branch_divergent_ballot_eq_zero_non_compare:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; CHECK-NEXT: s_cmp_lg_u64 vcc, 0
+; CHECK-NEXT: s_cbranch_scc0 .LBB9_2
+; CHECK-NEXT: ; %bb.1: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB9_3
+; CHECK-NEXT: .LBB9_2: ; %true
+; CHECK-NEXT: s_mov_b32 s0, 42
+; CHECK-NEXT: s_branch .LBB9_3
+; CHECK-NEXT: .LBB9_3:
+ %c = trunc i32 %v to i1
+ %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
+ %ballot_eq_zero = icmp eq i64 %ballot, 0
+ br i1 %ballot_eq_zero, label %true, label %false
+true:
+ ret i32 42
+false:
+ ret i32 33
+}
+
+define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_non_compare(i32 inreg %v) {
+; CHECK-LABEL: branch_uniform_ballot_eq_zero_non_compare:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_and_b32 s0, s0, 1
+; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], s0, 0
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
+; CHECK-NEXT: s_cbranch_scc0 .LBB10_2
+; CHECK-NEXT: ; %bb.1: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB10_3
+; CHECK-NEXT: .LBB10_2: ; %true
+; CHECK-NEXT: s_mov_b32 s0, 42
+; CHECK-NEXT: s_branch .LBB10_3
+; CHECK-NEXT: .LBB10_3:
+ %c = trunc i32 %v to i1
+ %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
+ %ballot_eq_zero = icmp eq i64 %ballot, 0
+ br i1 %ballot_eq_zero, label %true, label %false
+true:
+ ret i32 42
+false:
+ ret i32 33
+}
+
+define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_compare(i32 %v) {
+; CHECK-LABEL: branch_divergent_ballot_ne_zero_compare:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 12, v0
+; CHECK-NEXT: s_cmp_eq_u64 vcc, 0
+; CHECK-NEXT: s_cbranch_scc1 .LBB11_2
+; CHECK-NEXT: ; %bb.1: ; %true
+; CHECK-NEXT: s_mov_b32 s0, 42
+; CHECK-NEXT: s_branch .LBB11_3
+; CHECK-NEXT: .LBB11_2: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB11_3
+; CHECK-NEXT: .LBB11_3:
+ %c = icmp ult i32 %v, 12
+ %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
+ %ballot_ne_zero = icmp ne i64 %ballot, 0
+ br i1 %ballot_ne_zero, label %true, label %false
+true:
+ ret i32 42
+false:
+ ret i32 33
+}
+
+define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_compare(i32 inreg %v) {
+; CHECK-LABEL: branch_uniform_ballot_ne_zero_compare:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, 12
+; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 0
+; CHECK-NEXT: s_cbranch_scc1 .LBB12_2
+; CHECK-NEXT: ; %bb.1: ; %true
+; CHECK-NEXT: s_mov_b32 s0, 42
+; CHECK-NEXT: s_branch .LBB12_3
+; CHECK-NEXT: .LBB12_2: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB12_3
+; CHECK-NEXT: .LBB12_3:
+ %c = icmp ult i32 %v, 12
+ %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
+ %ballot_ne_zero = icmp ne i64 %ballot, 0
+ br i1 %ballot_ne_zero, label %true, label %false
+true:
+ ret i32 42
+false:
+ ret i32 33
+}
+
+define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_compare(i32 %v) {
+; CHECK-LABEL: branch_divergent_ballot_eq_zero_compare:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 12, v0
+; CHECK-NEXT: s_cmp_lg_u64 vcc, 0
+; CHECK-NEXT: s_cbranch_scc0 .LBB13_2
+; CHECK-NEXT: ; %bb.1: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB13_3
+; CHECK-NEXT: .LBB13_2: ; %true
+; CHECK-NEXT: s_mov_b32 s0, 42
+; CHECK-NEXT: s_branch .LBB13_3
+; CHECK-NEXT: .LBB13_3:
+ %c = icmp ult i32 %v, 12
+ %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
+ %ballot_eq_zero = icmp eq i64 %ballot, 0
+ br i1 %ballot_eq_zero, label %true, label %false
+true:
+ ret i32 42
+false:
+ ret i32 33
+}
+
+define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_compare(i32 inreg %v) {
+; CHECK-LABEL: branch_uniform_ballot_eq_zero_compare:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, 12
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
+; CHECK-NEXT: s_cbranch_scc0 .LBB14_2
+; CHECK-NEXT: ; %bb.1: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB14_3
+; CHECK-NEXT: .LBB14_2: ; %true
+; CHECK-NEXT: s_mov_b32 s0, 42
+; CHECK-NEXT: s_branch .LBB14_3
+; CHECK-NEXT: .LBB14_3:
+ %c = icmp ult i32 %v, 12
+ %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
+ %ballot_eq_zero = icmp eq i64 %ballot, 0
+ br i1 %ballot_eq_zero, label %true, label %false
+true:
+ ret i32 42
+false:
+ ret i32 33
+}
+
+define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_and(i32 %v1, i32 %v2) {
+; CHECK-LABEL: branch_divergent_ballot_ne_zero_and:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 12, v0
+; CHECK-NEXT: v_cmp_lt_u32_e64 s[0:1], 34, v1
+; CHECK-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; CHECK-NEXT: s_cmp_eq_u64 vcc, 0
+; CHECK-NEXT: s_cbranch_scc1 .LBB15_2
+; CHECK-NEXT: ; %bb.1: ; %true
+; CHECK-NEXT: s_mov_b32 s0, 42
+; CHECK-NEXT: s_branch .LBB15_3
+; CHECK-NEXT: .LBB15_2: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB15_3
+; CHECK-NEXT: .LBB15_3:
+ %v1c = icmp ult i32 %v1, 12
+ %v2c = icmp ugt i32 %v2, 34
+ %c = and i1 %v1c, %v2c
+ %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
+ %ballot_ne_zero = icmp ne i64 %ballot, 0
+ br i1 %ballot_ne_zero, label %true, label %false
+true:
+ ret i32 42
+false:
+ ret i32 33
+}
+
+define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_and(i32 inreg %v1, i32 inreg %v2) {
+; CHECK-LABEL: branch_uniform_ballot_ne_zero_and:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_cmp_lt_u32 s0, 12
+; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
+; CHECK-NEXT: s_cmp_gt_u32 s1, 34
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1]
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; CHECK-NEXT: s_cmp_eq_u64 vcc, 0
+; CHECK-NEXT: s_cbranch_scc1 .LBB16_2
+; CHECK-NEXT: ; %bb.1: ; %true
+; CHECK-NEXT: s_mov_b32 s0, 42
+; CHECK-NEXT: s_branch .LBB16_3
+; CHECK-NEXT: .LBB16_2: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB16_3
+; CHECK-NEXT: .LBB16_3:
+ %v1c = icmp ult i32 %v1, 12
+ %v2c = icmp ugt i32 %v2, 34
+ %c = and i1 %v1c, %v2c
+ %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
+ %ballot_ne_zero = icmp ne i64 %ballot, 0
+ br i1 %ballot_ne_zero, label %true, label %false
+true:
+ ret i32 42
+false:
+ ret i32 33
+}
+
+define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_and(i32 %v1, i32 %v2) {
+; CHECK-LABEL: branch_divergent_ballot_eq_zero_and:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 12, v0
+; CHECK-NEXT: v_cmp_lt_u32_e64 s[0:1], 34, v1
+; CHECK-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; CHECK-NEXT: s_cmp_lg_u64 vcc, 0
+; CHECK-NEXT: s_cbranch_scc0 .LBB17_2
+; CHECK-NEXT: ; %bb.1: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB17_3
+; CHECK-NEXT: .LBB17_2: ; %true
+; CHECK-NEXT: s_mov_b32 s0, 42
+; CHECK-NEXT: s_branch .LBB17_3
+; CHECK-NEXT: .LBB17_3:
+ %v1c = icmp ult i32 %v1, 12
+ %v2c = icmp ugt i32 %v2, 34
+ %c = and i1 %v1c, %v2c
+ %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
+ %ballot_eq_zero = icmp eq i64 %ballot, 0
+ br i1 %ballot_eq_zero, label %true, label %false
+true:
+ ret i32 42
+false:
+ ret i32 33
+}
+
+define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_and(i32 inreg %v1, i32 inreg %v2) {
+; CHECK-LABEL: branch_uniform_ballot_eq_zero_and:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_cmp_lt_u32 s0, 12
+; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
+; CHECK-NEXT: s_cmp_gt_u32 s1, 34
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1]
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; CHECK-NEXT: s_cmp_lg_u64 vcc, 0
+; CHECK-NEXT: s_cbranch_scc0 .LBB18_2
+; CHECK-NEXT: ; %bb.1: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB18_3
+; CHECK-NEXT: .LBB18_2: ; %true
+; CHECK-NEXT: s_mov_b32 s0, 42
+; CHECK-NEXT: s_branch .LBB18_3
+; CHECK-NEXT: .LBB18_3:
+ %v1c = icmp ult i32 %v1, 12
+ %v2c = icmp ugt i32 %v2, 34
+ %c = and i1 %v1c, %v2c
+ %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
+ %ballot_eq_zero = icmp eq i64 %ballot, 0
+ br i1 %ballot_eq_zero, label %true, label %false
+true:
+ ret i32 42
+false:
+ ret i32 33
+}
>From 13ebb871f82fd4dfa9eff7ecae6c0d60d6ebab16 Mon Sep 17 00:00:00 2001
From: Valery Pykhtin <valery.pykhtin at gmail.com>
Date: Tue, 10 Oct 2023 14:35:16 +0200
Subject: [PATCH 2/6] [AMDGPU] Improve selection of conditional branch on
amdgcn.ballot!=0 in SelectionDAG.
---
llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 31 ++++++++++
llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h | 1 +
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 1 +
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 14 +++++
llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td | 6 ++
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 55 +++++++++++++++++-
llvm/lib/Target/AMDGPU/SIISelLowering.h | 1 +
.../CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll | 56 +++++++------------
.../CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll | 56 +++++++------------
9 files changed, 146 insertions(+), 75 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index b5ceaaa14b4fd5e..90addb12a81abcf 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -666,6 +666,9 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
case ISD::FP_EXTEND:
SelectFP_EXTEND(N);
return;
+ case AMDGPUISD::BRCONDZ:
+ SelectBRCONDZ(N);
+ return;
case AMDGPUISD::CVT_PKRTZ_F16_F32:
case AMDGPUISD::CVT_PKNORM_I16_F32:
case AMDGPUISD::CVT_PKNORM_U16_F32:
@@ -2306,6 +2309,34 @@ void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
VCC.getValue(0));
}
+void AMDGPUDAGToDAGISel::SelectBRCONDZ(SDNode *N) {
+ const SIRegisterInfo *TRI =
+ static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
+
+ SDValue Cond = N->getOperand(1);
+
+ // BRCONDZ condition is either AMDGPUISD::SETCC or i1 value that comes from
+ // ISD::SETCC node or logical combination of ISD::SETCCs therefore we don't
+ // need to AND the condition with execmask.
+
+ // TODO: AMDGPUISD::SETCC is always selected as V_CMP so use VCC condition.
+ // This might change later.
+ bool UseSCCBr = Cond->getOpcode() != AMDGPUISD::SETCC && !Cond->isDivergent();
+
+ auto CondCode = cast<CondCodeSDNode>(N->getOperand(3))->get();
+ assert(CondCode == ISD::SETEQ || CondCode == ISD::SETNE);
+
+ bool EqZero = CondCode == ISD::SETEQ;
+ unsigned BrOp =
+ UseSCCBr ? (EqZero ? AMDGPU::S_CBRANCH_SCC0 : AMDGPU::S_CBRANCH_SCC1)
+ : (EqZero ? AMDGPU::S_CBRANCH_VCCZ : AMDGPU::S_CBRANCH_VCCNZ);
+
+ SDValue CondCopy = CurDAG->getCopyToReg(
+ N->getOperand(0), SDLoc(N), UseSCCBr ? AMDGPU::SCC : TRI->getVCC(),
+ N->getOperand(1));
+ CurDAG->SelectNodeTo(N, BrOp, MVT::Other, N->getOperand(2), CondCopy);
+}
+
void AMDGPUDAGToDAGISel::SelectFP_EXTEND(SDNode *N) {
if (Subtarget->hasSALUFloatInsts() && N->getValueType(0) == MVT::f32 &&
!N->isDivergent()) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index a8a606f60a3faee..255ca62cb7a9100 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -272,6 +272,7 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
void SelectS_BFE(SDNode *N);
bool isCBranchSCC(const SDNode *N) const;
void SelectBRCOND(SDNode *N);
+ void SelectBRCONDZ(SDNode *N);
void SelectFMAD_FMA(SDNode *N);
void SelectFP_EXTEND(SDNode *N);
void SelectDSAppendConsume(SDNode *N, unsigned IntrID);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 607d59db7bcf709..a268a807679f473 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -5317,6 +5317,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(BUFFER_ATOMIC_FADD)
NODE_NAME_CASE(BUFFER_ATOMIC_FMIN)
NODE_NAME_CASE(BUFFER_ATOMIC_FMAX)
+ NODE_NAME_CASE(BRCONDZ)
case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index e971c85ee3f6e39..43e572dcc56423d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -422,6 +422,20 @@ enum NodeType : unsigned {
// This is SETCC with the full mask result which is used for a compare with a
// result bit per item in the wavefront.
SETCC,
+
+ // Conditional branch on comparison of CondWaveMask operand to zero.
+ // BRCONDZ CondWaveMask, BB, CondCode
+ // where:
+ // - CondWaveMask - is either:
+ // * the i32/i64 result of AMDGPUISD::SETCC node,
+ // * i1 value that comes from ISD::SETCC node or logical combination of
+ // ISD::SETCCs. For a divergent node this becomes a i32/i64 value after
+ // selection.
+ // - BB is the target basic block,
+ // - CondCode is either SETEQ or SETNE meaning that the branch should happen
+ // if the CondWaveMask is either equal to zero or not.
+ BRCONDZ,
+
SETREG,
DENORM_MODE,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index 2492a7be651f6d6..a6b4fa4937dff30 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -58,6 +58,11 @@ def AMDGPUIfBreakOp : SDTypeProfile<1, 2,
[SDTCisVT<0, i1>, SDTCisVT<1, i1>, SDTCisVT<2, i1>]
>;
+def AMDGPUBrcondzOp : SDTypeProfile<0, 3, [
+ // cond, bb, cc
+ SDTCisInt<0>, SDTCisVT<1, OtherVT>, SDTCisVT<2, OtherVT>
+]>;
+
//===----------------------------------------------------------------------===//
// AMDGPU DAG Nodes
//
@@ -65,6 +70,7 @@ def AMDGPUIfBreakOp : SDTypeProfile<1, 2,
def AMDGPUif : SDNode<"AMDGPUISD::IF", AMDGPUIfOp, [SDNPHasChain]>;
def AMDGPUelse : SDNode<"AMDGPUISD::ELSE", AMDGPUElseOp, [SDNPHasChain]>;
def AMDGPUloop : SDNode<"AMDGPUISD::LOOP", AMDGPULoopOp, [SDNPHasChain]>;
+def AMDGPUbrcondz: SDNode<"AMDGPUISD::BRCONDZ", AMDGPUBrcondzOp, [SDNPHasChain]>;
def callseq_start : SDNode<"ISD::CALLSEQ_START",
SDCallSeqStart<[ SDTCisVT<0, i32>, SDTCisVT<1, i32> ]>,
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 0a32844bdb01a09..5edf1446ca34dac 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -798,7 +798,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
ISD::SIGN_EXTEND_INREG,
ISD::EXTRACT_VECTOR_ELT,
ISD::INSERT_VECTOR_ELT,
- ISD::FCOPYSIGN});
+ ISD::FCOPYSIGN,
+ ISD::BRCOND});
if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
setTargetDAGCombine(ISD::FP_ROUND);
@@ -13584,6 +13585,56 @@ SDValue SITargetLowering::performClampCombine(SDNode *N,
return SDValue(CSrc, 0);
}
+SDValue SITargetLowering::performBRCondCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ if (!DCI.isAfterLegalizeDAG())
+ return SDValue(N, 0);
+
+ SDValue Cond = N->getOperand(1);
+ if (Cond.getOpcode() == ISD::SETCC &&
+ Cond->getOperand(0)->getOpcode() == AMDGPUISD::SETCC) {
+
+ // %VCMP = i32/i64 AMDGPUISD::SETCC ...
+ // %C = ISD::SETCC %VCMP, 0, setne/seteq
+ // BRCOND %BB, %C
+ // =>
+ // %VCMP = i32/i64 AMDGPUISD::SETCC ...
+ // BRCONDZ %BB, %VCMP, setne/seteq
+
+ auto CC = cast<CondCodeSDNode>(Cond->getOperand(2))->get();
+ auto *CRHS = dyn_cast<ConstantSDNode>(Cond->getOperand(1));
+ if ((CC == ISD::SETEQ || CC == ISD::SETNE) && CRHS && CRHS->isZero()) {
+
+ auto VCMP = Cond->getOperand(0);
+ auto VCMP_CC = cast<CondCodeSDNode>(VCMP.getOperand(2))->get();
+ auto *VCMP_CRHS = dyn_cast<ConstantSDNode>(VCMP.getOperand(1));
+ auto Src = VCMP;
+ if (VCMP_CC == ISD::SETNE && VCMP_CRHS && VCMP_CRHS->isZero()) {
+
+ // Special case for amdgcn.ballot:
+ // %VCMPSrc = ISD::SETCC or a logical combination of ISD::SETCCs
+ // %VCMP = i32/i64 AMDGPUISD::SETCC (ext %VCMPSrc), 0, setne
+ // %C = ISD::SETCC %VCMP, 0, setne/seteq
+ // BRCOND %BB, %C
+ // =>
+ // BRCONDZ %BB, %VCMPSrc, setne/seteq
+
+ auto VCMPSrc = VCMP.getOperand(0);
+ if (ISD::isExtOpcode(VCMPSrc->getOpcode())) // Skip extension.
+ VCMPSrc = VCMPSrc.getOperand(0);
+
+ if (isBoolSGPR(VCMPSrc))
+ Src = VCMPSrc;
+ }
+ return DCI.DAG.getNode(AMDGPUISD::BRCONDZ, SDLoc(N), N->getVTList(),
+ N->getOperand(0), // Chain
+ Src,
+ N->getOperand(2), // BB
+ DCI.DAG.getCondCode(CC)); // SETEQ|SETNE
+ }
+ }
+ return SDValue(N, 0);
+}
SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
@@ -13694,6 +13745,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
return performInsertVectorEltCombine(N, DCI);
case ISD::FP_ROUND:
return performFPRoundCombine(N, DCI);
+ case ISD::BRCOND:
+ return performBRCondCombine(N, DCI);
case ISD::LOAD: {
if (SDValue Widended = widenLoad(cast<LoadSDNode>(N), DCI))
return Widended;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index d717e12d29a514a..f03b83705d14083 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -220,6 +220,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue performCvtF32UByteNCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performClampCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performBRCondCombine(SDNode *N, DAGCombinerInfo &DCI) const;
bool isLegalFlatAddressingMode(const AddrMode &AM) const;
bool isLegalMUBUFAddressingMode(const AddrMode &AM) const;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll
index a78fdfd94512fde..6ac5a6b5e0611f8 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll
@@ -89,8 +89,7 @@ define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_non_compare(i32 %v) {
; CHECK: ; %bb.0:
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; CHECK-NEXT: s_cmp_eq_u32 vcc_lo, 0
-; CHECK-NEXT: s_cbranch_scc1 .LBB7_2
+; CHECK-NEXT: s_cbranch_vccz .LBB7_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB7_3
@@ -112,9 +111,8 @@ define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_non_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_ne_zero_non_compare:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_and_b32 s0, s0, 1
-; CHECK-NEXT: v_cmp_ne_u32_e64 s0, s0, 0
-; CHECK-NEXT: s_cmp_eq_u32 s0, 0
-; CHECK-NEXT: s_cbranch_scc1 .LBB8_2
+; CHECK-NEXT: v_cmp_ne_u32_e64 vcc_lo, s0, 0
+; CHECK-NEXT: s_cbranch_vccz .LBB8_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB8_3
@@ -137,8 +135,7 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) {
; CHECK: ; %bb.0:
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; CHECK-NEXT: s_cmp_lg_u32 vcc_lo, 0
-; CHECK-NEXT: s_cbranch_scc0 .LBB9_2
+; CHECK-NEXT: s_cbranch_vccz .LBB9_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
; CHECK-NEXT: s_branch .LBB9_3
@@ -160,9 +157,8 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_non_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_eq_zero_non_compare:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_and_b32 s0, s0, 1
-; CHECK-NEXT: v_cmp_ne_u32_e64 s0, s0, 0
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
-; CHECK-NEXT: s_cbranch_scc0 .LBB10_2
+; CHECK-NEXT: v_cmp_ne_u32_e64 vcc_lo, s0, 0
+; CHECK-NEXT: s_cbranch_vccz .LBB10_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
; CHECK-NEXT: s_branch .LBB10_3
@@ -184,8 +180,7 @@ define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_compare(i32 %v) {
; CHECK-LABEL: branch_divergent_ballot_ne_zero_compare:
; CHECK: ; %bb.0:
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
-; CHECK-NEXT: s_cmp_eq_u32 vcc_lo, 0
-; CHECK-NEXT: s_cbranch_scc1 .LBB11_2
+; CHECK-NEXT: s_cbranch_vccz .LBB11_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB11_3
@@ -206,9 +201,8 @@ false:
define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_ne_zero_compare:
; CHECK: ; %bb.0:
-; CHECK-NEXT: v_cmp_lt_u32_e64 s0, s0, 12
-; CHECK-NEXT: s_cmp_eq_u32 s0, 0
-; CHECK-NEXT: s_cbranch_scc1 .LBB12_2
+; CHECK-NEXT: v_cmp_lt_u32_e64 vcc_lo, s0, 12
+; CHECK-NEXT: s_cbranch_vccz .LBB12_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB12_3
@@ -230,8 +224,7 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_compare(i32 %v) {
; CHECK-LABEL: branch_divergent_ballot_eq_zero_compare:
; CHECK: ; %bb.0:
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
-; CHECK-NEXT: s_cmp_lg_u32 vcc_lo, 0
-; CHECK-NEXT: s_cbranch_scc0 .LBB13_2
+; CHECK-NEXT: s_cbranch_vccz .LBB13_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
; CHECK-NEXT: s_branch .LBB13_3
@@ -252,9 +245,8 @@ false:
define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_eq_zero_compare:
; CHECK: ; %bb.0:
-; CHECK-NEXT: v_cmp_lt_u32_e64 s0, s0, 12
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
-; CHECK-NEXT: s_cbranch_scc0 .LBB14_2
+; CHECK-NEXT: v_cmp_lt_u32_e64 vcc_lo, s0, 12
+; CHECK-NEXT: s_cbranch_vccz .LBB14_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
; CHECK-NEXT: s_branch .LBB14_3
@@ -277,11 +269,8 @@ define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_and(i32 %v1, i32 %v2) {
; CHECK: ; %bb.0:
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
; CHECK-NEXT: v_cmp_lt_u32_e64 s0, 34, v1
-; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0
-; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; CHECK-NEXT: s_cmp_eq_u32 vcc_lo, 0
-; CHECK-NEXT: s_cbranch_scc1 .LBB15_2
+; CHECK-NEXT: s_and_b32 vcc_lo, vcc_lo, s0
+; CHECK-NEXT: s_cbranch_vccz .LBB15_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB15_3
@@ -309,10 +298,8 @@ define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_and(i32 inreg %v1, i32 inreg
; CHECK-NEXT: s_cmp_gt_u32 s1, 34
; CHECK-NEXT: s_cselect_b32 s1, -1, 0
; CHECK-NEXT: s_and_b32 s0, s0, s1
-; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; CHECK-NEXT: s_cmp_eq_u32 vcc_lo, 0
-; CHECK-NEXT: s_cbranch_scc1 .LBB16_2
+; CHECK-NEXT: s_and_b32 s0, s0, exec_lo
+; CHECK-NEXT: s_cbranch_scc0 .LBB16_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB16_3
@@ -337,11 +324,8 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_and(i32 %v1, i32 %v2) {
; CHECK: ; %bb.0:
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
; CHECK-NEXT: v_cmp_lt_u32_e64 s0, 34, v1
-; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0
-; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; CHECK-NEXT: s_cmp_lg_u32 vcc_lo, 0
-; CHECK-NEXT: s_cbranch_scc0 .LBB17_2
+; CHECK-NEXT: s_and_b32 vcc_lo, vcc_lo, s0
+; CHECK-NEXT: s_cbranch_vccz .LBB17_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
; CHECK-NEXT: s_branch .LBB17_3
@@ -369,9 +353,7 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_and(i32 inreg %v1, i32 inreg
; CHECK-NEXT: s_cmp_gt_u32 s1, 34
; CHECK-NEXT: s_cselect_b32 s1, -1, 0
; CHECK-NEXT: s_and_b32 s0, s0, s1
-; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; CHECK-NEXT: s_cmp_lg_u32 vcc_lo, 0
+; CHECK-NEXT: s_and_b32 s0, s0, exec_lo
; CHECK-NEXT: s_cbranch_scc0 .LBB18_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll
index e6415f6982ac83f..9b8a43fddd78c96 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll
@@ -92,8 +92,7 @@ define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_non_compare(i32 %v) {
; CHECK: ; %bb.0:
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; CHECK-NEXT: s_cmp_eq_u64 vcc, 0
-; CHECK-NEXT: s_cbranch_scc1 .LBB7_2
+; CHECK-NEXT: s_cbranch_vccz .LBB7_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB7_3
@@ -115,9 +114,8 @@ define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_non_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_ne_zero_non_compare:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_and_b32 s0, s0, 1
-; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], s0, 0
-; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 0
-; CHECK-NEXT: s_cbranch_scc1 .LBB8_2
+; CHECK-NEXT: v_cmp_ne_u32_e64 vcc, s0, 0
+; CHECK-NEXT: s_cbranch_vccz .LBB8_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB8_3
@@ -140,8 +138,7 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) {
; CHECK: ; %bb.0:
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; CHECK-NEXT: s_cmp_lg_u64 vcc, 0
-; CHECK-NEXT: s_cbranch_scc0 .LBB9_2
+; CHECK-NEXT: s_cbranch_vccz .LBB9_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
; CHECK-NEXT: s_branch .LBB9_3
@@ -163,9 +160,8 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_non_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_eq_zero_non_compare:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_and_b32 s0, s0, 1
-; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], s0, 0
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
-; CHECK-NEXT: s_cbranch_scc0 .LBB10_2
+; CHECK-NEXT: v_cmp_ne_u32_e64 vcc, s0, 0
+; CHECK-NEXT: s_cbranch_vccz .LBB10_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
; CHECK-NEXT: s_branch .LBB10_3
@@ -187,8 +183,7 @@ define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_compare(i32 %v) {
; CHECK-LABEL: branch_divergent_ballot_ne_zero_compare:
; CHECK: ; %bb.0:
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 12, v0
-; CHECK-NEXT: s_cmp_eq_u64 vcc, 0
-; CHECK-NEXT: s_cbranch_scc1 .LBB11_2
+; CHECK-NEXT: s_cbranch_vccz .LBB11_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB11_3
@@ -209,9 +204,8 @@ false:
define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_ne_zero_compare:
; CHECK: ; %bb.0:
-; CHECK-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, 12
-; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 0
-; CHECK-NEXT: s_cbranch_scc1 .LBB12_2
+; CHECK-NEXT: v_cmp_lt_u32_e64 vcc, s0, 12
+; CHECK-NEXT: s_cbranch_vccz .LBB12_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB12_3
@@ -233,8 +227,7 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_compare(i32 %v) {
; CHECK-LABEL: branch_divergent_ballot_eq_zero_compare:
; CHECK: ; %bb.0:
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 12, v0
-; CHECK-NEXT: s_cmp_lg_u64 vcc, 0
-; CHECK-NEXT: s_cbranch_scc0 .LBB13_2
+; CHECK-NEXT: s_cbranch_vccz .LBB13_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
; CHECK-NEXT: s_branch .LBB13_3
@@ -255,9 +248,8 @@ false:
define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_eq_zero_compare:
; CHECK: ; %bb.0:
-; CHECK-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, 12
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
-; CHECK-NEXT: s_cbranch_scc0 .LBB14_2
+; CHECK-NEXT: v_cmp_lt_u32_e64 vcc, s0, 12
+; CHECK-NEXT: s_cbranch_vccz .LBB14_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
; CHECK-NEXT: s_branch .LBB14_3
@@ -280,11 +272,8 @@ define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_and(i32 %v1, i32 %v2) {
; CHECK: ; %bb.0:
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 12, v0
; CHECK-NEXT: v_cmp_lt_u32_e64 s[0:1], 34, v1
-; CHECK-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
-; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; CHECK-NEXT: s_cmp_eq_u64 vcc, 0
-; CHECK-NEXT: s_cbranch_scc1 .LBB15_2
+; CHECK-NEXT: s_and_b64 vcc, vcc, s[0:1]
+; CHECK-NEXT: s_cbranch_vccz .LBB15_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB15_3
@@ -312,10 +301,8 @@ define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_and(i32 inreg %v1, i32 inreg
; CHECK-NEXT: s_cmp_gt_u32 s1, 34
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1]
-; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; CHECK-NEXT: s_cmp_eq_u64 vcc, 0
-; CHECK-NEXT: s_cbranch_scc1 .LBB16_2
+; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; CHECK-NEXT: s_cbranch_scc0 .LBB16_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB16_3
@@ -340,11 +327,8 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_and(i32 %v1, i32 %v2) {
; CHECK: ; %bb.0:
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 12, v0
; CHECK-NEXT: v_cmp_lt_u32_e64 s[0:1], 34, v1
-; CHECK-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
-; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; CHECK-NEXT: s_cmp_lg_u64 vcc, 0
-; CHECK-NEXT: s_cbranch_scc0 .LBB17_2
+; CHECK-NEXT: s_and_b64 vcc, vcc, s[0:1]
+; CHECK-NEXT: s_cbranch_vccz .LBB17_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
; CHECK-NEXT: s_branch .LBB17_3
@@ -372,9 +356,7 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_and(i32 inreg %v1, i32 inreg
; CHECK-NEXT: s_cmp_gt_u32 s1, 34
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1]
-; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; CHECK-NEXT: s_cmp_lg_u64 vcc, 0
+; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], exec
; CHECK-NEXT: s_cbranch_scc0 .LBB18_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
>From a17e48fb33fbc5c3abf6aea9dd0a6d4ec5feb1ed Mon Sep 17 00:00:00 2001
From: Valery Pykhtin <valery.pykhtin at gmail.com>
Date: Sat, 14 Oct 2023 17:08:28 +0200
Subject: [PATCH 3/6] Revert "[AMDGPU] Improve selection of conditional branch
on amdgcn.ballot!=0 in SelectionDAG."
This reverts commit 13ebb871f82fd4dfa9eff7ecae6c0d60d6ebab16.
---
llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 31 ----------
llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h | 1 -
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 1 -
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 14 -----
llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td | 6 --
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 55 +-----------------
llvm/lib/Target/AMDGPU/SIISelLowering.h | 1 -
.../CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll | 56 ++++++++++++-------
.../CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll | 56 ++++++++++++-------
9 files changed, 75 insertions(+), 146 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 90addb12a81abcf..b5ceaaa14b4fd5e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -666,9 +666,6 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
case ISD::FP_EXTEND:
SelectFP_EXTEND(N);
return;
- case AMDGPUISD::BRCONDZ:
- SelectBRCONDZ(N);
- return;
case AMDGPUISD::CVT_PKRTZ_F16_F32:
case AMDGPUISD::CVT_PKNORM_I16_F32:
case AMDGPUISD::CVT_PKNORM_U16_F32:
@@ -2309,34 +2306,6 @@ void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
VCC.getValue(0));
}
-void AMDGPUDAGToDAGISel::SelectBRCONDZ(SDNode *N) {
- const SIRegisterInfo *TRI =
- static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
-
- SDValue Cond = N->getOperand(1);
-
- // BRCONDZ condition is either AMDGPUISD::SETCC or i1 value that comes from
- // ISD::SETCC node or logical combination of ISD::SETCCs therefore we don't
- // need to AND the condition with execmask.
-
- // TODO: AMDGPUISD::SETCC is always selected as V_CMP so use VCC condition.
- // This might change later.
- bool UseSCCBr = Cond->getOpcode() != AMDGPUISD::SETCC && !Cond->isDivergent();
-
- auto CondCode = cast<CondCodeSDNode>(N->getOperand(3))->get();
- assert(CondCode == ISD::SETEQ || CondCode == ISD::SETNE);
-
- bool EqZero = CondCode == ISD::SETEQ;
- unsigned BrOp =
- UseSCCBr ? (EqZero ? AMDGPU::S_CBRANCH_SCC0 : AMDGPU::S_CBRANCH_SCC1)
- : (EqZero ? AMDGPU::S_CBRANCH_VCCZ : AMDGPU::S_CBRANCH_VCCNZ);
-
- SDValue CondCopy = CurDAG->getCopyToReg(
- N->getOperand(0), SDLoc(N), UseSCCBr ? AMDGPU::SCC : TRI->getVCC(),
- N->getOperand(1));
- CurDAG->SelectNodeTo(N, BrOp, MVT::Other, N->getOperand(2), CondCopy);
-}
-
void AMDGPUDAGToDAGISel::SelectFP_EXTEND(SDNode *N) {
if (Subtarget->hasSALUFloatInsts() && N->getValueType(0) == MVT::f32 &&
!N->isDivergent()) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index 255ca62cb7a9100..a8a606f60a3faee 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -272,7 +272,6 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
void SelectS_BFE(SDNode *N);
bool isCBranchSCC(const SDNode *N) const;
void SelectBRCOND(SDNode *N);
- void SelectBRCONDZ(SDNode *N);
void SelectFMAD_FMA(SDNode *N);
void SelectFP_EXTEND(SDNode *N);
void SelectDSAppendConsume(SDNode *N, unsigned IntrID);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index a268a807679f473..607d59db7bcf709 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -5317,7 +5317,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(BUFFER_ATOMIC_FADD)
NODE_NAME_CASE(BUFFER_ATOMIC_FMIN)
NODE_NAME_CASE(BUFFER_ATOMIC_FMAX)
- NODE_NAME_CASE(BRCONDZ)
case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 43e572dcc56423d..e971c85ee3f6e39 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -422,20 +422,6 @@ enum NodeType : unsigned {
// This is SETCC with the full mask result which is used for a compare with a
// result bit per item in the wavefront.
SETCC,
-
- // Conditional branch on comparison of CondWaveMask operand to zero.
- // BRCONDZ CondWaveMask, BB, CondCode
- // where:
- // - CondWaveMask - is either:
- // * the i32/i64 result of AMDGPUISD::SETCC node,
- // * i1 value that comes from ISD::SETCC node or logical combination of
- // ISD::SETCCs. For a divergent node this becomes a i32/i64 value after
- // selection.
- // - BB is the target basic block,
- // - CondCode is either SETEQ or SETNE meaning that the branch should happen
- // if the CondWaveMask is either equal to zero or not.
- BRCONDZ,
-
SETREG,
DENORM_MODE,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index a6b4fa4937dff30..2492a7be651f6d6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -58,11 +58,6 @@ def AMDGPUIfBreakOp : SDTypeProfile<1, 2,
[SDTCisVT<0, i1>, SDTCisVT<1, i1>, SDTCisVT<2, i1>]
>;
-def AMDGPUBrcondzOp : SDTypeProfile<0, 3, [
- // cond, bb, cc
- SDTCisInt<0>, SDTCisVT<1, OtherVT>, SDTCisVT<2, OtherVT>
-]>;
-
//===----------------------------------------------------------------------===//
// AMDGPU DAG Nodes
//
@@ -70,7 +65,6 @@ def AMDGPUBrcondzOp : SDTypeProfile<0, 3, [
def AMDGPUif : SDNode<"AMDGPUISD::IF", AMDGPUIfOp, [SDNPHasChain]>;
def AMDGPUelse : SDNode<"AMDGPUISD::ELSE", AMDGPUElseOp, [SDNPHasChain]>;
def AMDGPUloop : SDNode<"AMDGPUISD::LOOP", AMDGPULoopOp, [SDNPHasChain]>;
-def AMDGPUbrcondz: SDNode<"AMDGPUISD::BRCONDZ", AMDGPUBrcondzOp, [SDNPHasChain]>;
def callseq_start : SDNode<"ISD::CALLSEQ_START",
SDCallSeqStart<[ SDTCisVT<0, i32>, SDTCisVT<1, i32> ]>,
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 5edf1446ca34dac..0a32844bdb01a09 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -798,8 +798,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
ISD::SIGN_EXTEND_INREG,
ISD::EXTRACT_VECTOR_ELT,
ISD::INSERT_VECTOR_ELT,
- ISD::FCOPYSIGN,
- ISD::BRCOND});
+ ISD::FCOPYSIGN});
if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
setTargetDAGCombine(ISD::FP_ROUND);
@@ -13585,56 +13584,6 @@ SDValue SITargetLowering::performClampCombine(SDNode *N,
return SDValue(CSrc, 0);
}
-SDValue SITargetLowering::performBRCondCombine(SDNode *N,
- DAGCombinerInfo &DCI) const {
- if (!DCI.isAfterLegalizeDAG())
- return SDValue(N, 0);
-
- SDValue Cond = N->getOperand(1);
- if (Cond.getOpcode() == ISD::SETCC &&
- Cond->getOperand(0)->getOpcode() == AMDGPUISD::SETCC) {
-
- // %VCMP = i32/i64 AMDGPUISD::SETCC ...
- // %C = ISD::SETCC %VCMP, 0, setne/seteq
- // BRCOND %BB, %C
- // =>
- // %VCMP = i32/i64 AMDGPUISD::SETCC ...
- // BRCONDZ %BB, %VCMP, setne/seteq
-
- auto CC = cast<CondCodeSDNode>(Cond->getOperand(2))->get();
- auto *CRHS = dyn_cast<ConstantSDNode>(Cond->getOperand(1));
- if ((CC == ISD::SETEQ || CC == ISD::SETNE) && CRHS && CRHS->isZero()) {
-
- auto VCMP = Cond->getOperand(0);
- auto VCMP_CC = cast<CondCodeSDNode>(VCMP.getOperand(2))->get();
- auto *VCMP_CRHS = dyn_cast<ConstantSDNode>(VCMP.getOperand(1));
- auto Src = VCMP;
- if (VCMP_CC == ISD::SETNE && VCMP_CRHS && VCMP_CRHS->isZero()) {
-
- // Special case for amdgcn.ballot:
- // %VCMPSrc = ISD::SETCC or a logical combination of ISD::SETCCs
- // %VCMP = i32/i64 AMDGPUISD::SETCC (ext %VCMPSrc), 0, setne
- // %C = ISD::SETCC %VCMP, 0, setne/seteq
- // BRCOND %BB, %C
- // =>
- // BRCONDZ %BB, %VCMPSrc, setne/seteq
-
- auto VCMPSrc = VCMP.getOperand(0);
- if (ISD::isExtOpcode(VCMPSrc->getOpcode())) // Skip extension.
- VCMPSrc = VCMPSrc.getOperand(0);
-
- if (isBoolSGPR(VCMPSrc))
- Src = VCMPSrc;
- }
- return DCI.DAG.getNode(AMDGPUISD::BRCONDZ, SDLoc(N), N->getVTList(),
- N->getOperand(0), // Chain
- Src,
- N->getOperand(2), // BB
- DCI.DAG.getCondCode(CC)); // SETEQ|SETNE
- }
- }
- return SDValue(N, 0);
-}
SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
@@ -13745,8 +13694,6 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
return performInsertVectorEltCombine(N, DCI);
case ISD::FP_ROUND:
return performFPRoundCombine(N, DCI);
- case ISD::BRCOND:
- return performBRCondCombine(N, DCI);
case ISD::LOAD: {
if (SDValue Widended = widenLoad(cast<LoadSDNode>(N), DCI))
return Widended;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index f03b83705d14083..d717e12d29a514a 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -220,7 +220,6 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue performCvtF32UByteNCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performClampCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const;
- SDValue performBRCondCombine(SDNode *N, DAGCombinerInfo &DCI) const;
bool isLegalFlatAddressingMode(const AddrMode &AM) const;
bool isLegalMUBUFAddressingMode(const AddrMode &AM) const;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll
index 6ac5a6b5e0611f8..a78fdfd94512fde 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll
@@ -89,7 +89,8 @@ define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_non_compare(i32 %v) {
; CHECK: ; %bb.0:
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; CHECK-NEXT: s_cbranch_vccz .LBB7_2
+; CHECK-NEXT: s_cmp_eq_u32 vcc_lo, 0
+; CHECK-NEXT: s_cbranch_scc1 .LBB7_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB7_3
@@ -111,8 +112,9 @@ define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_non_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_ne_zero_non_compare:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_and_b32 s0, s0, 1
-; CHECK-NEXT: v_cmp_ne_u32_e64 vcc_lo, s0, 0
-; CHECK-NEXT: s_cbranch_vccz .LBB8_2
+; CHECK-NEXT: v_cmp_ne_u32_e64 s0, s0, 0
+; CHECK-NEXT: s_cmp_eq_u32 s0, 0
+; CHECK-NEXT: s_cbranch_scc1 .LBB8_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB8_3
@@ -135,7 +137,8 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) {
; CHECK: ; %bb.0:
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; CHECK-NEXT: s_cbranch_vccz .LBB9_2
+; CHECK-NEXT: s_cmp_lg_u32 vcc_lo, 0
+; CHECK-NEXT: s_cbranch_scc0 .LBB9_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
; CHECK-NEXT: s_branch .LBB9_3
@@ -157,8 +160,9 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_non_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_eq_zero_non_compare:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_and_b32 s0, s0, 1
-; CHECK-NEXT: v_cmp_ne_u32_e64 vcc_lo, s0, 0
-; CHECK-NEXT: s_cbranch_vccz .LBB10_2
+; CHECK-NEXT: v_cmp_ne_u32_e64 s0, s0, 0
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
+; CHECK-NEXT: s_cbranch_scc0 .LBB10_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
; CHECK-NEXT: s_branch .LBB10_3
@@ -180,7 +184,8 @@ define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_compare(i32 %v) {
; CHECK-LABEL: branch_divergent_ballot_ne_zero_compare:
; CHECK: ; %bb.0:
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
-; CHECK-NEXT: s_cbranch_vccz .LBB11_2
+; CHECK-NEXT: s_cmp_eq_u32 vcc_lo, 0
+; CHECK-NEXT: s_cbranch_scc1 .LBB11_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB11_3
@@ -201,8 +206,9 @@ false:
define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_ne_zero_compare:
; CHECK: ; %bb.0:
-; CHECK-NEXT: v_cmp_lt_u32_e64 vcc_lo, s0, 12
-; CHECK-NEXT: s_cbranch_vccz .LBB12_2
+; CHECK-NEXT: v_cmp_lt_u32_e64 s0, s0, 12
+; CHECK-NEXT: s_cmp_eq_u32 s0, 0
+; CHECK-NEXT: s_cbranch_scc1 .LBB12_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB12_3
@@ -224,7 +230,8 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_compare(i32 %v) {
; CHECK-LABEL: branch_divergent_ballot_eq_zero_compare:
; CHECK: ; %bb.0:
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
-; CHECK-NEXT: s_cbranch_vccz .LBB13_2
+; CHECK-NEXT: s_cmp_lg_u32 vcc_lo, 0
+; CHECK-NEXT: s_cbranch_scc0 .LBB13_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
; CHECK-NEXT: s_branch .LBB13_3
@@ -245,8 +252,9 @@ false:
define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_eq_zero_compare:
; CHECK: ; %bb.0:
-; CHECK-NEXT: v_cmp_lt_u32_e64 vcc_lo, s0, 12
-; CHECK-NEXT: s_cbranch_vccz .LBB14_2
+; CHECK-NEXT: v_cmp_lt_u32_e64 s0, s0, 12
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
+; CHECK-NEXT: s_cbranch_scc0 .LBB14_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
; CHECK-NEXT: s_branch .LBB14_3
@@ -269,8 +277,11 @@ define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_and(i32 %v1, i32 %v2) {
; CHECK: ; %bb.0:
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
; CHECK-NEXT: v_cmp_lt_u32_e64 s0, 34, v1
-; CHECK-NEXT: s_and_b32 vcc_lo, vcc_lo, s0
-; CHECK-NEXT: s_cbranch_vccz .LBB15_2
+; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; CHECK-NEXT: s_cmp_eq_u32 vcc_lo, 0
+; CHECK-NEXT: s_cbranch_scc1 .LBB15_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB15_3
@@ -298,8 +309,10 @@ define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_and(i32 inreg %v1, i32 inreg
; CHECK-NEXT: s_cmp_gt_u32 s1, 34
; CHECK-NEXT: s_cselect_b32 s1, -1, 0
; CHECK-NEXT: s_and_b32 s0, s0, s1
-; CHECK-NEXT: s_and_b32 s0, s0, exec_lo
-; CHECK-NEXT: s_cbranch_scc0 .LBB16_2
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; CHECK-NEXT: s_cmp_eq_u32 vcc_lo, 0
+; CHECK-NEXT: s_cbranch_scc1 .LBB16_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB16_3
@@ -324,8 +337,11 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_and(i32 %v1, i32 %v2) {
; CHECK: ; %bb.0:
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
; CHECK-NEXT: v_cmp_lt_u32_e64 s0, 34, v1
-; CHECK-NEXT: s_and_b32 vcc_lo, vcc_lo, s0
-; CHECK-NEXT: s_cbranch_vccz .LBB17_2
+; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; CHECK-NEXT: s_cmp_lg_u32 vcc_lo, 0
+; CHECK-NEXT: s_cbranch_scc0 .LBB17_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
; CHECK-NEXT: s_branch .LBB17_3
@@ -353,7 +369,9 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_and(i32 inreg %v1, i32 inreg
; CHECK-NEXT: s_cmp_gt_u32 s1, 34
; CHECK-NEXT: s_cselect_b32 s1, -1, 0
; CHECK-NEXT: s_and_b32 s0, s0, s1
-; CHECK-NEXT: s_and_b32 s0, s0, exec_lo
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; CHECK-NEXT: s_cmp_lg_u32 vcc_lo, 0
; CHECK-NEXT: s_cbranch_scc0 .LBB18_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll
index 9b8a43fddd78c96..e6415f6982ac83f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll
@@ -92,7 +92,8 @@ define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_non_compare(i32 %v) {
; CHECK: ; %bb.0:
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; CHECK-NEXT: s_cbranch_vccz .LBB7_2
+; CHECK-NEXT: s_cmp_eq_u64 vcc, 0
+; CHECK-NEXT: s_cbranch_scc1 .LBB7_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB7_3
@@ -114,8 +115,9 @@ define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_non_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_ne_zero_non_compare:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_and_b32 s0, s0, 1
-; CHECK-NEXT: v_cmp_ne_u32_e64 vcc, s0, 0
-; CHECK-NEXT: s_cbranch_vccz .LBB8_2
+; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], s0, 0
+; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 0
+; CHECK-NEXT: s_cbranch_scc1 .LBB8_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB8_3
@@ -138,7 +140,8 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) {
; CHECK: ; %bb.0:
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; CHECK-NEXT: s_cbranch_vccz .LBB9_2
+; CHECK-NEXT: s_cmp_lg_u64 vcc, 0
+; CHECK-NEXT: s_cbranch_scc0 .LBB9_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
; CHECK-NEXT: s_branch .LBB9_3
@@ -160,8 +163,9 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_non_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_eq_zero_non_compare:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_and_b32 s0, s0, 1
-; CHECK-NEXT: v_cmp_ne_u32_e64 vcc, s0, 0
-; CHECK-NEXT: s_cbranch_vccz .LBB10_2
+; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], s0, 0
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
+; CHECK-NEXT: s_cbranch_scc0 .LBB10_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
; CHECK-NEXT: s_branch .LBB10_3
@@ -183,7 +187,8 @@ define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_compare(i32 %v) {
; CHECK-LABEL: branch_divergent_ballot_ne_zero_compare:
; CHECK: ; %bb.0:
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 12, v0
-; CHECK-NEXT: s_cbranch_vccz .LBB11_2
+; CHECK-NEXT: s_cmp_eq_u64 vcc, 0
+; CHECK-NEXT: s_cbranch_scc1 .LBB11_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB11_3
@@ -204,8 +209,9 @@ false:
define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_ne_zero_compare:
; CHECK: ; %bb.0:
-; CHECK-NEXT: v_cmp_lt_u32_e64 vcc, s0, 12
-; CHECK-NEXT: s_cbranch_vccz .LBB12_2
+; CHECK-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, 12
+; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 0
+; CHECK-NEXT: s_cbranch_scc1 .LBB12_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB12_3
@@ -227,7 +233,8 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_compare(i32 %v) {
; CHECK-LABEL: branch_divergent_ballot_eq_zero_compare:
; CHECK: ; %bb.0:
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 12, v0
-; CHECK-NEXT: s_cbranch_vccz .LBB13_2
+; CHECK-NEXT: s_cmp_lg_u64 vcc, 0
+; CHECK-NEXT: s_cbranch_scc0 .LBB13_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
; CHECK-NEXT: s_branch .LBB13_3
@@ -248,8 +255,9 @@ false:
define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_eq_zero_compare:
; CHECK: ; %bb.0:
-; CHECK-NEXT: v_cmp_lt_u32_e64 vcc, s0, 12
-; CHECK-NEXT: s_cbranch_vccz .LBB14_2
+; CHECK-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, 12
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
+; CHECK-NEXT: s_cbranch_scc0 .LBB14_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
; CHECK-NEXT: s_branch .LBB14_3
@@ -272,8 +280,11 @@ define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_and(i32 %v1, i32 %v2) {
; CHECK: ; %bb.0:
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 12, v0
; CHECK-NEXT: v_cmp_lt_u32_e64 s[0:1], 34, v1
-; CHECK-NEXT: s_and_b64 vcc, vcc, s[0:1]
-; CHECK-NEXT: s_cbranch_vccz .LBB15_2
+; CHECK-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; CHECK-NEXT: s_cmp_eq_u64 vcc, 0
+; CHECK-NEXT: s_cbranch_scc1 .LBB15_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB15_3
@@ -301,8 +312,10 @@ define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_and(i32 inreg %v1, i32 inreg
; CHECK-NEXT: s_cmp_gt_u32 s1, 34
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1]
-; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], exec
-; CHECK-NEXT: s_cbranch_scc0 .LBB16_2
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; CHECK-NEXT: s_cmp_eq_u64 vcc, 0
+; CHECK-NEXT: s_cbranch_scc1 .LBB16_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB16_3
@@ -327,8 +340,11 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_and(i32 %v1, i32 %v2) {
; CHECK: ; %bb.0:
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 12, v0
; CHECK-NEXT: v_cmp_lt_u32_e64 s[0:1], 34, v1
-; CHECK-NEXT: s_and_b64 vcc, vcc, s[0:1]
-; CHECK-NEXT: s_cbranch_vccz .LBB17_2
+; CHECK-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; CHECK-NEXT: s_cmp_lg_u64 vcc, 0
+; CHECK-NEXT: s_cbranch_scc0 .LBB17_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
; CHECK-NEXT: s_branch .LBB17_3
@@ -356,7 +372,9 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_and(i32 inreg %v1, i32 inreg
; CHECK-NEXT: s_cmp_gt_u32 s1, 34
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1]
-; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; CHECK-NEXT: s_cmp_lg_u64 vcc, 0
; CHECK-NEXT: s_cbranch_scc0 .LBB18_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
>From 8b836e298ea79bd3f1c37d1a0b24b393f9a2b2a4 Mon Sep 17 00:00:00 2001
From: Valery Pykhtin <valery.pykhtin at gmail.com>
Date: Sun, 15 Oct 2023 13:16:08 +0200
Subject: [PATCH 4/6] Simplified version without using DAG combiner.
---
llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 61 +++++++++++++-
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 2 +-
.../GlobalISel/llvm.amdgcn.ballot.i32.ll | 26 ++++++
.../GlobalISel/llvm.amdgcn.ballot.i64.ll | 26 ++++++
.../CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll | 79 ++++++++++---------
.../CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll | 79 ++++++++++---------
6 files changed, 196 insertions(+), 77 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index b5ceaaa14b4fd5e..82440dd2019ae7d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -2259,6 +2259,30 @@ bool AMDGPUDAGToDAGISel::isCBranchSCC(const SDNode *N) const {
return false;
}
+bool isBoolSGPR(SDValue V);
+
+static SDValue combineBallotPattern(SDValue VCMP) {
+ assert(VCMP->getOpcode() == AMDGPUISD::SETCC);
+ // Special case for amdgcn.ballot:
+ // %Cond = i1 (and/or combination of i1 ISD::SETCCs)
+ // %VCMP = i(WaveSize) AMDGPUISD::SETCC (ext %Cond), 0, setne ; lowered ballot
+ // =>
+ // Use i1 %Cond value instead of i(WaveSize) %VCMP.
+ // This is possible because divergent ISD::SETCC is selected as V_CMP and
+ // Cond becomes a i(WaveSize) full mask value.
+ auto VCMP_CC = cast<CondCodeSDNode>(VCMP.getOperand(2))->get();
+ auto *VCMP_CRHS = dyn_cast<ConstantSDNode>(VCMP.getOperand(1));
+ if (VCMP_CC == ISD::SETNE && VCMP_CRHS && VCMP_CRHS->isZero()) {
+ auto Cond = VCMP.getOperand(0);
+ if (ISD::isExtOpcode(Cond->getOpcode())) // Skip extension.
+ Cond = Cond.getOperand(0);
+
+ if (isBoolSGPR(Cond))
+ return Cond;
+ }
+ return SDValue();
+}
+
void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
SDValue Cond = N->getOperand(1);
@@ -2272,11 +2296,44 @@ void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
const SIRegisterInfo *TRI = ST->getRegisterInfo();
bool UseSCCBr = isCBranchSCC(N) && isUniformBr(N);
- unsigned BrOp = UseSCCBr ? AMDGPU::S_CBRANCH_SCC1 : AMDGPU::S_CBRANCH_VCCNZ;
+ bool AndExec = !UseSCCBr;
+ bool Negate = false;
+
+ if (Cond.getOpcode() == ISD::SETCC &&
+ Cond->getOperand(0)->getOpcode() == AMDGPUISD::SETCC) {
+ auto CC = cast<CondCodeSDNode>(Cond->getOperand(2))->get();
+ auto *CRHS = dyn_cast<ConstantSDNode>(Cond->getOperand(1));
+ if ((CC == ISD::SETEQ || CC == ISD::SETNE) && CRHS && CRHS->isZero()) {
+ // %VCMP = i(WaveSize) AMDGPUISD::SETCC ...
+ // %C = i1 ISD::SETCC %VCMP, 0, setne/seteq
+ // =>
+ // Use "i(WaveSize) %VCMP value in VCC register ne/eq zero" as the branch
+ // condition.
+ Negate = CC == ISD::SETEQ;
+ auto VCMP = Cond->getOperand(0);
+ if (auto BallotCond = combineBallotPattern(VCMP)) {
+ Cond = BallotCond;
+ UseSCCBr = !BallotCond->isDivergent();
+ } else {
+ // TODO: don't use SCC here assuming that AMDGPUISD::SETCC is always
+ // selected as V_CMP, but this may change for uniform condition.
+ Cond = VCMP;
+ UseSCCBr = false;
+ }
+ }
+ // Cond is either V_CMP resulted from AMDGPUISD::SETCC or a combination of
+ // V_CMPs resulted from ballot or ballot has uniform condition and SCC is
+ // used.
+ AndExec = false;
+ }
+
+ unsigned BrOp =
+ UseSCCBr ? (Negate ? AMDGPU::S_CBRANCH_SCC0 : AMDGPU::S_CBRANCH_SCC1)
+ : (Negate ? AMDGPU::S_CBRANCH_VCCZ : AMDGPU::S_CBRANCH_VCCNZ);
Register CondReg = UseSCCBr ? AMDGPU::SCC : TRI->getVCC();
SDLoc SL(N);
- if (!UseSCCBr) {
+ if (AndExec) {
// This is the case that we are selecting to S_CBRANCH_VCCNZ. We have not
// analyzed what generates the vcc value, so we do not know whether vcc
// bits for disabled lanes are 0. Thus we need to mask out bits for
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 0a32844bdb01a09..c800416157b06b2 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -10501,7 +10501,7 @@ SDValue SITargetLowering::splitBinaryBitConstantOp(
// Returns true if argument is a boolean value which is not serialized into
// memory or argument and does not require v_cndmask_b32 to be deserialized.
-static bool isBoolSGPR(SDValue V) {
+bool isBoolSGPR(SDValue V) {
if (V.getValueType() != MVT::i1)
return false;
switch (V.getOpcode()) {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
index 6c12329930b8a2c..34ff84d5aac36a4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
@@ -393,3 +393,29 @@ true:
false:
ret i32 33
}
+
+define amdgpu_cs i32 @branch_uniform_ballot_sgt_N_compare(i32 inreg %v) {
+; CHECK-LABEL: branch_uniform_ballot_sgt_N_compare:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_cmp_lt_u32 s0, 12
+; CHECK-NEXT: s_cselect_b32 s0, 1, 0
+; CHECK-NEXT: s_and_b32 s0, 1, s0
+; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
+; CHECK-NEXT: s_cmp_le_i32 s0, 22
+; CHECK-NEXT: s_cbranch_scc1 .LBB19_2
+; CHECK-NEXT: ; %bb.1: ; %true
+; CHECK-NEXT: s_mov_b32 s0, 42
+; CHECK-NEXT: s_branch .LBB19_3
+; CHECK-NEXT: .LBB19_2: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB19_3
+; CHECK-NEXT: .LBB19_3:
+ %c = icmp ult i32 %v, 12
+ %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
+ %bc = icmp sgt i32 %ballot, 22
+ br i1 %bc, label %true, label %false
+true:
+ ret i32 42
+false:
+ ret i32 33
+}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll
index ebb96ddc0603d68..d879779962dfcad 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll
@@ -396,3 +396,29 @@ true:
false:
ret i32 33
}
+
+define amdgpu_cs i32 @branch_uniform_ballot_sgt_N_compare(i32 inreg %v) {
+; CHECK-LABEL: branch_uniform_ballot_sgt_N_compare:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_cmp_lt_u32 s0, 12
+; CHECK-NEXT: s_cselect_b32 s0, 1, 0
+; CHECK-NEXT: s_and_b32 s0, 1, s0
+; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
+; CHECK-NEXT: v_cmp_le_i64_e64 vcc, s[0:1], 22
+; CHECK-NEXT: s_cbranch_vccnz .LBB19_2
+; CHECK-NEXT: ; %bb.1: ; %true
+; CHECK-NEXT: s_mov_b32 s0, 42
+; CHECK-NEXT: s_branch .LBB19_3
+; CHECK-NEXT: .LBB19_2: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB19_3
+; CHECK-NEXT: .LBB19_3:
+ %c = icmp ult i32 %v, 12
+ %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
+ %bc = icmp sgt i64 %ballot, 22
+ br i1 %bc, label %true, label %false
+true:
+ ret i32 42
+false:
+ ret i32 33
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll
index a78fdfd94512fde..951221cb342a098 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll
@@ -89,8 +89,7 @@ define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_non_compare(i32 %v) {
; CHECK: ; %bb.0:
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; CHECK-NEXT: s_cmp_eq_u32 vcc_lo, 0
-; CHECK-NEXT: s_cbranch_scc1 .LBB7_2
+; CHECK-NEXT: s_cbranch_vccz .LBB7_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB7_3
@@ -112,9 +111,8 @@ define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_non_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_ne_zero_non_compare:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_and_b32 s0, s0, 1
-; CHECK-NEXT: v_cmp_ne_u32_e64 s0, s0, 0
-; CHECK-NEXT: s_cmp_eq_u32 s0, 0
-; CHECK-NEXT: s_cbranch_scc1 .LBB8_2
+; CHECK-NEXT: v_cmp_ne_u32_e64 vcc_lo, s0, 0
+; CHECK-NEXT: s_cbranch_vccz .LBB8_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB8_3
@@ -137,8 +135,7 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) {
; CHECK: ; %bb.0:
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; CHECK-NEXT: s_cmp_lg_u32 vcc_lo, 0
-; CHECK-NEXT: s_cbranch_scc0 .LBB9_2
+; CHECK-NEXT: s_cbranch_vccz .LBB9_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
; CHECK-NEXT: s_branch .LBB9_3
@@ -160,9 +157,8 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_non_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_eq_zero_non_compare:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_and_b32 s0, s0, 1
-; CHECK-NEXT: v_cmp_ne_u32_e64 s0, s0, 0
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
-; CHECK-NEXT: s_cbranch_scc0 .LBB10_2
+; CHECK-NEXT: v_cmp_ne_u32_e64 vcc_lo, s0, 0
+; CHECK-NEXT: s_cbranch_vccz .LBB10_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
; CHECK-NEXT: s_branch .LBB10_3
@@ -184,8 +180,7 @@ define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_compare(i32 %v) {
; CHECK-LABEL: branch_divergent_ballot_ne_zero_compare:
; CHECK: ; %bb.0:
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
-; CHECK-NEXT: s_cmp_eq_u32 vcc_lo, 0
-; CHECK-NEXT: s_cbranch_scc1 .LBB11_2
+; CHECK-NEXT: s_cbranch_vccz .LBB11_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB11_3
@@ -206,9 +201,8 @@ false:
define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_ne_zero_compare:
; CHECK: ; %bb.0:
-; CHECK-NEXT: v_cmp_lt_u32_e64 s0, s0, 12
-; CHECK-NEXT: s_cmp_eq_u32 s0, 0
-; CHECK-NEXT: s_cbranch_scc1 .LBB12_2
+; CHECK-NEXT: v_cmp_lt_u32_e64 vcc_lo, s0, 12
+; CHECK-NEXT: s_cbranch_vccz .LBB12_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB12_3
@@ -230,8 +224,7 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_compare(i32 %v) {
; CHECK-LABEL: branch_divergent_ballot_eq_zero_compare:
; CHECK: ; %bb.0:
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
-; CHECK-NEXT: s_cmp_lg_u32 vcc_lo, 0
-; CHECK-NEXT: s_cbranch_scc0 .LBB13_2
+; CHECK-NEXT: s_cbranch_vccz .LBB13_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
; CHECK-NEXT: s_branch .LBB13_3
@@ -252,9 +245,8 @@ false:
define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_eq_zero_compare:
; CHECK: ; %bb.0:
-; CHECK-NEXT: v_cmp_lt_u32_e64 s0, s0, 12
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
-; CHECK-NEXT: s_cbranch_scc0 .LBB14_2
+; CHECK-NEXT: v_cmp_lt_u32_e64 vcc_lo, s0, 12
+; CHECK-NEXT: s_cbranch_vccz .LBB14_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
; CHECK-NEXT: s_branch .LBB14_3
@@ -277,11 +269,8 @@ define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_and(i32 %v1, i32 %v2) {
; CHECK: ; %bb.0:
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
; CHECK-NEXT: v_cmp_lt_u32_e64 s0, 34, v1
-; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0
-; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; CHECK-NEXT: s_cmp_eq_u32 vcc_lo, 0
-; CHECK-NEXT: s_cbranch_scc1 .LBB15_2
+; CHECK-NEXT: s_and_b32 vcc_lo, vcc_lo, s0
+; CHECK-NEXT: s_cbranch_vccz .LBB15_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB15_3
@@ -309,10 +298,8 @@ define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_and(i32 inreg %v1, i32 inreg
; CHECK-NEXT: s_cmp_gt_u32 s1, 34
; CHECK-NEXT: s_cselect_b32 s1, -1, 0
; CHECK-NEXT: s_and_b32 s0, s0, s1
-; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; CHECK-NEXT: s_cmp_eq_u32 vcc_lo, 0
-; CHECK-NEXT: s_cbranch_scc1 .LBB16_2
+; CHECK-NEXT: s_and_b32 s0, s0, exec_lo
+; CHECK-NEXT: s_cbranch_scc0 .LBB16_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB16_3
@@ -337,11 +324,8 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_and(i32 %v1, i32 %v2) {
; CHECK: ; %bb.0:
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
; CHECK-NEXT: v_cmp_lt_u32_e64 s0, 34, v1
-; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0
-; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; CHECK-NEXT: s_cmp_lg_u32 vcc_lo, 0
-; CHECK-NEXT: s_cbranch_scc0 .LBB17_2
+; CHECK-NEXT: s_and_b32 vcc_lo, vcc_lo, s0
+; CHECK-NEXT: s_cbranch_vccz .LBB17_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
; CHECK-NEXT: s_branch .LBB17_3
@@ -369,9 +353,7 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_and(i32 inreg %v1, i32 inreg
; CHECK-NEXT: s_cmp_gt_u32 s1, 34
; CHECK-NEXT: s_cselect_b32 s1, -1, 0
; CHECK-NEXT: s_and_b32 s0, s0, s1
-; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; CHECK-NEXT: s_cmp_lg_u32 vcc_lo, 0
+; CHECK-NEXT: s_and_b32 s0, s0, exec_lo
; CHECK-NEXT: s_cbranch_scc0 .LBB18_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
@@ -391,3 +373,26 @@ true:
false:
ret i32 33
}
+
+define amdgpu_cs i32 @branch_uniform_ballot_sgt_N_compare(i32 inreg %v) {
+; CHECK-LABEL: branch_uniform_ballot_sgt_N_compare:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: v_cmp_lt_u32_e64 s0, s0, 12
+; CHECK-NEXT: s_cmp_lt_i32 s0, 23
+; CHECK-NEXT: s_cbranch_scc1 .LBB19_2
+; CHECK-NEXT: ; %bb.1: ; %true
+; CHECK-NEXT: s_mov_b32 s0, 42
+; CHECK-NEXT: s_branch .LBB19_3
+; CHECK-NEXT: .LBB19_2: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB19_3
+; CHECK-NEXT: .LBB19_3:
+ %c = icmp ult i32 %v, 12
+ %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
+ %bc = icmp sgt i32 %ballot, 22
+ br i1 %bc, label %true, label %false
+true:
+ ret i32 42
+false:
+ ret i32 33
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll
index e6415f6982ac83f..b9dafcb9a1ca9a2 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll
@@ -92,8 +92,7 @@ define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_non_compare(i32 %v) {
; CHECK: ; %bb.0:
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; CHECK-NEXT: s_cmp_eq_u64 vcc, 0
-; CHECK-NEXT: s_cbranch_scc1 .LBB7_2
+; CHECK-NEXT: s_cbranch_vccz .LBB7_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB7_3
@@ -115,9 +114,8 @@ define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_non_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_ne_zero_non_compare:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_and_b32 s0, s0, 1
-; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], s0, 0
-; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 0
-; CHECK-NEXT: s_cbranch_scc1 .LBB8_2
+; CHECK-NEXT: v_cmp_ne_u32_e64 vcc, s0, 0
+; CHECK-NEXT: s_cbranch_vccz .LBB8_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB8_3
@@ -140,8 +138,7 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) {
; CHECK: ; %bb.0:
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; CHECK-NEXT: s_cmp_lg_u64 vcc, 0
-; CHECK-NEXT: s_cbranch_scc0 .LBB9_2
+; CHECK-NEXT: s_cbranch_vccz .LBB9_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
; CHECK-NEXT: s_branch .LBB9_3
@@ -163,9 +160,8 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_non_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_eq_zero_non_compare:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_and_b32 s0, s0, 1
-; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], s0, 0
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
-; CHECK-NEXT: s_cbranch_scc0 .LBB10_2
+; CHECK-NEXT: v_cmp_ne_u32_e64 vcc, s0, 0
+; CHECK-NEXT: s_cbranch_vccz .LBB10_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
; CHECK-NEXT: s_branch .LBB10_3
@@ -187,8 +183,7 @@ define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_compare(i32 %v) {
; CHECK-LABEL: branch_divergent_ballot_ne_zero_compare:
; CHECK: ; %bb.0:
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 12, v0
-; CHECK-NEXT: s_cmp_eq_u64 vcc, 0
-; CHECK-NEXT: s_cbranch_scc1 .LBB11_2
+; CHECK-NEXT: s_cbranch_vccz .LBB11_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB11_3
@@ -209,9 +204,8 @@ false:
define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_ne_zero_compare:
; CHECK: ; %bb.0:
-; CHECK-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, 12
-; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 0
-; CHECK-NEXT: s_cbranch_scc1 .LBB12_2
+; CHECK-NEXT: v_cmp_lt_u32_e64 vcc, s0, 12
+; CHECK-NEXT: s_cbranch_vccz .LBB12_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB12_3
@@ -233,8 +227,7 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_compare(i32 %v) {
; CHECK-LABEL: branch_divergent_ballot_eq_zero_compare:
; CHECK: ; %bb.0:
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 12, v0
-; CHECK-NEXT: s_cmp_lg_u64 vcc, 0
-; CHECK-NEXT: s_cbranch_scc0 .LBB13_2
+; CHECK-NEXT: s_cbranch_vccz .LBB13_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
; CHECK-NEXT: s_branch .LBB13_3
@@ -255,9 +248,8 @@ false:
define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_eq_zero_compare:
; CHECK: ; %bb.0:
-; CHECK-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, 12
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
-; CHECK-NEXT: s_cbranch_scc0 .LBB14_2
+; CHECK-NEXT: v_cmp_lt_u32_e64 vcc, s0, 12
+; CHECK-NEXT: s_cbranch_vccz .LBB14_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
; CHECK-NEXT: s_branch .LBB14_3
@@ -280,11 +272,8 @@ define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_and(i32 %v1, i32 %v2) {
; CHECK: ; %bb.0:
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 12, v0
; CHECK-NEXT: v_cmp_lt_u32_e64 s[0:1], 34, v1
-; CHECK-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
-; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; CHECK-NEXT: s_cmp_eq_u64 vcc, 0
-; CHECK-NEXT: s_cbranch_scc1 .LBB15_2
+; CHECK-NEXT: s_and_b64 vcc, vcc, s[0:1]
+; CHECK-NEXT: s_cbranch_vccz .LBB15_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB15_3
@@ -312,10 +301,8 @@ define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_and(i32 inreg %v1, i32 inreg
; CHECK-NEXT: s_cmp_gt_u32 s1, 34
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1]
-; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; CHECK-NEXT: s_cmp_eq_u64 vcc, 0
-; CHECK-NEXT: s_cbranch_scc1 .LBB16_2
+; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; CHECK-NEXT: s_cbranch_scc0 .LBB16_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB16_3
@@ -340,11 +327,8 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_and(i32 %v1, i32 %v2) {
; CHECK: ; %bb.0:
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 12, v0
; CHECK-NEXT: v_cmp_lt_u32_e64 s[0:1], 34, v1
-; CHECK-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
-; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; CHECK-NEXT: s_cmp_lg_u64 vcc, 0
-; CHECK-NEXT: s_cbranch_scc0 .LBB17_2
+; CHECK-NEXT: s_and_b64 vcc, vcc, s[0:1]
+; CHECK-NEXT: s_cbranch_vccz .LBB17_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
; CHECK-NEXT: s_branch .LBB17_3
@@ -372,9 +356,7 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_and(i32 inreg %v1, i32 inreg
; CHECK-NEXT: s_cmp_gt_u32 s1, 34
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1]
-; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; CHECK-NEXT: s_cmp_lg_u64 vcc, 0
+; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], exec
; CHECK-NEXT: s_cbranch_scc0 .LBB18_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
@@ -394,3 +376,26 @@ true:
false:
ret i32 33
}
+
+define amdgpu_cs i32 @branch_uniform_ballot_sgt_N_compare(i32 inreg %v) {
+; CHECK-LABEL: branch_uniform_ballot_sgt_N_compare:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, 12
+; CHECK-NEXT: v_cmp_lt_i64_e64 vcc, s[0:1], 23
+; CHECK-NEXT: s_cbranch_vccnz .LBB19_2
+; CHECK-NEXT: ; %bb.1: ; %true
+; CHECK-NEXT: s_mov_b32 s0, 42
+; CHECK-NEXT: s_branch .LBB19_3
+; CHECK-NEXT: .LBB19_2: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB19_3
+; CHECK-NEXT: .LBB19_3:
+ %c = icmp ult i32 %v, 12
+ %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
+ %bc = icmp sgt i64 %ballot, 22
+ br i1 %bc, label %true, label %false
+true:
+ ret i32 42
+false:
+ ret i32 33
+}
>From 50ab57303ac324ad726e53e19bcf100b0088d7af Mon Sep 17 00:00:00 2001
From: Valery Pykhtin <valery.pykhtin at gmail.com>
Date: Wed, 25 Oct 2023 16:00:48 +0200
Subject: [PATCH 5/6] per comments: * supported negated ballot * improved
comments
---
llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 27 +++--
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 4 +-
llvm/lib/Target/AMDGPU/SIISelLowering.h | 4 +
.../CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll | 112 ++++++++++++++++++
.../CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll | 112 ++++++++++++++++++
5 files changed, 247 insertions(+), 12 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 82440dd2019ae7d..028336510cb0b66 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -19,6 +19,7 @@
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "MCTargetDesc/R600MCTargetDesc.h"
#include "R600RegisterInfo.h"
+#include "SIISelLowering.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
@@ -2259,26 +2260,30 @@ bool AMDGPUDAGToDAGISel::isCBranchSCC(const SDNode *N) const {
return false;
}
-bool isBoolSGPR(SDValue V);
-
-static SDValue combineBallotPattern(SDValue VCMP) {
+static SDValue combineBallotPattern(SDValue VCMP, bool &Negate) {
assert(VCMP->getOpcode() == AMDGPUISD::SETCC);
// Special case for amdgcn.ballot:
// %Cond = i1 (and/or combination of i1 ISD::SETCCs)
- // %VCMP = i(WaveSize) AMDGPUISD::SETCC (ext %Cond), 0, setne ; lowered ballot
+ // %VCMP = i(WaveSize) AMDGPUISD::SETCC (ext %Cond), 0, setne/seteq
// =>
// Use i1 %Cond value instead of i(WaveSize) %VCMP.
// This is possible because divergent ISD::SETCC is selected as V_CMP and
// Cond becomes a i(WaveSize) full mask value.
+ // Note that ballot doesn't use SETEQ condition, but it's easy to support it
+ // here for completeness, so in this case Negate is set true on return.
auto VCMP_CC = cast<CondCodeSDNode>(VCMP.getOperand(2))->get();
auto *VCMP_CRHS = dyn_cast<ConstantSDNode>(VCMP.getOperand(1));
- if (VCMP_CC == ISD::SETNE && VCMP_CRHS && VCMP_CRHS->isZero()) {
+ if ((VCMP_CC == ISD::SETEQ || VCMP_CC == ISD::SETNE) && VCMP_CRHS &&
+ VCMP_CRHS->isZero()) {
+
auto Cond = VCMP.getOperand(0);
if (ISD::isExtOpcode(Cond->getOpcode())) // Skip extension.
Cond = Cond.getOperand(0);
- if (isBoolSGPR(Cond))
+ if (isBoolSGPR(Cond)) {
+ Negate = VCMP_CC == ISD::SETEQ;
return Cond;
+ }
}
return SDValue();
}
@@ -2306,14 +2311,18 @@ void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
if ((CC == ISD::SETEQ || CC == ISD::SETNE) && CRHS && CRHS->isZero()) {
// %VCMP = i(WaveSize) AMDGPUISD::SETCC ...
// %C = i1 ISD::SETCC %VCMP, 0, setne/seteq
+ // BRCOND i1 %C, %BB
// =>
- // Use "i(WaveSize) %VCMP value in VCC register ne/eq zero" as the branch
- // condition.
+ // %VCMP = i(WaveSize) AMDGPUISD::SETCC ...
+ // VCC = COPY i(WaveSize) %VCMP
+ // S_CBRANCH_VCCNZ/VCCZ %BB
Negate = CC == ISD::SETEQ;
auto VCMP = Cond->getOperand(0);
- if (auto BallotCond = combineBallotPattern(VCMP)) {
+ bool NegatedBallot = false;
+ if (auto BallotCond = combineBallotPattern(VCMP, NegatedBallot)) {
Cond = BallotCond;
UseSCCBr = !BallotCond->isDivergent();
+ Negate = Negate ^ NegatedBallot;
} else {
// TODO: don't use SCC here assuming that AMDGPUISD::SETCC is always
// selected as V_CMP, but this may change for uniform condition.
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index c800416157b06b2..2dce92daf07283d 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -10499,9 +10499,7 @@ SDValue SITargetLowering::splitBinaryBitConstantOp(
return SDValue();
}
-// Returns true if argument is a boolean value which is not serialized into
-// memory or argument and does not require v_cndmask_b32 to be deserialized.
-bool isBoolSGPR(SDValue V) {
+bool llvm::isBoolSGPR(SDValue V) {
if (V.getValueType() != MVT::i1)
return false;
switch (V.getOpcode()) {
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index d717e12d29a514a..746a88c5ea13a30 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -586,6 +586,10 @@ class SITargetLowering final : public AMDGPUTargetLowering {
getTargetMMOFlags(const Instruction &I) const override;
};
+// Returns true if argument is a boolean value which is not serialized into
+// memory or argument and does not require v_cndmask_b32 to be deserialized.
+bool isBoolSGPR(SDValue V);
+
} // End namespace llvm
#endif
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll
index 951221cb342a098..ac8521ab4fd55d7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll
@@ -396,3 +396,115 @@ true:
false:
ret i32 33
}
+
+declare i32 @llvm.amdgcn.icmp.i32(i1, i1, i32)
+
+define amdgpu_cs i32 @branch_divergent_simulated_negated_ballot_ne_zero_and(i32 %v1, i32 %v2) {
+; CHECK-LABEL: branch_divergent_simulated_negated_ballot_ne_zero_and:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
+; CHECK-NEXT: v_cmp_lt_u32_e64 s0, 34, v1
+; CHECK-NEXT: s_and_b32 vcc_lo, vcc_lo, s0
+; CHECK-NEXT: s_cbranch_vccnz .LBB20_2
+; CHECK-NEXT: ; %bb.1: ; %true
+; CHECK-NEXT: s_mov_b32 s0, 42
+; CHECK-NEXT: s_branch .LBB20_3
+; CHECK-NEXT: .LBB20_2: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB20_3
+; CHECK-NEXT: .LBB20_3:
+ %v1c = icmp ult i32 %v1, 12
+ %v2c = icmp ugt i32 %v2, 34
+ %c = and i1 %v1c, %v2c
+ %ballot = call i32 @llvm.amdgcn.icmp.i32(i1 %c, i1 0, i32 32) ; ICMP_EQ == 32
+ %ballot_ne_zero = icmp ne i32 %ballot, 0
+ br i1 %ballot_ne_zero, label %true, label %false
+true:
+ ret i32 42
+false:
+ ret i32 33
+}
+
+define amdgpu_cs i32 @branch_uniform_simulated_negated_ballot_ne_zero_and(i32 inreg %v1, i32 inreg %v2) {
+; CHECK-LABEL: branch_uniform_simulated_negated_ballot_ne_zero_and:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_cmp_lt_u32 s0, 12
+; CHECK-NEXT: s_cselect_b32 s0, -1, 0
+; CHECK-NEXT: s_cmp_gt_u32 s1, 34
+; CHECK-NEXT: s_cselect_b32 s1, -1, 0
+; CHECK-NEXT: s_and_b32 s0, s0, s1
+; CHECK-NEXT: s_and_b32 s0, s0, exec_lo
+; CHECK-NEXT: s_cbranch_scc1 .LBB21_2
+; CHECK-NEXT: ; %bb.1: ; %true
+; CHECK-NEXT: s_mov_b32 s0, 42
+; CHECK-NEXT: s_branch .LBB21_3
+; CHECK-NEXT: .LBB21_2: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB21_3
+; CHECK-NEXT: .LBB21_3:
+ %v1c = icmp ult i32 %v1, 12
+ %v2c = icmp ugt i32 %v2, 34
+ %c = and i1 %v1c, %v2c
+ %ballot = call i32 @llvm.amdgcn.icmp.i32(i1 %c, i1 0, i32 32) ; ICMP_EQ == 32
+ %ballot_ne_zero = icmp ne i32 %ballot, 0
+ br i1 %ballot_ne_zero, label %true, label %false
+true:
+ ret i32 42
+false:
+ ret i32 33
+}
+
+define amdgpu_cs i32 @branch_divergent_simulated_negated_ballot_eq_zero_and(i32 %v1, i32 %v2) {
+; CHECK-LABEL: branch_divergent_simulated_negated_ballot_eq_zero_and:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
+; CHECK-NEXT: v_cmp_lt_u32_e64 s0, 34, v1
+; CHECK-NEXT: s_and_b32 vcc_lo, vcc_lo, s0
+; CHECK-NEXT: s_cbranch_vccnz .LBB22_2
+; CHECK-NEXT: ; %bb.1: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB22_3
+; CHECK-NEXT: .LBB22_2: ; %true
+; CHECK-NEXT: s_mov_b32 s0, 42
+; CHECK-NEXT: s_branch .LBB22_3
+; CHECK-NEXT: .LBB22_3:
+ %v1c = icmp ult i32 %v1, 12
+ %v2c = icmp ugt i32 %v2, 34
+ %c = and i1 %v1c, %v2c
+ %ballot = call i32 @llvm.amdgcn.icmp.i32(i1 %c, i1 0, i32 32) ; ICMP_EQ == 32
+ %ballot_eq_zero = icmp eq i32 %ballot, 0
+ br i1 %ballot_eq_zero, label %true, label %false
+true:
+ ret i32 42
+false:
+ ret i32 33
+}
+
+define amdgpu_cs i32 @branch_uniform_simulated_negated_ballot_eq_zero_and(i32 inreg %v1, i32 inreg %v2) {
+; CHECK-LABEL: branch_uniform_simulated_negated_ballot_eq_zero_and:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_cmp_lt_u32 s0, 12
+; CHECK-NEXT: s_cselect_b32 s0, -1, 0
+; CHECK-NEXT: s_cmp_gt_u32 s1, 34
+; CHECK-NEXT: s_cselect_b32 s1, -1, 0
+; CHECK-NEXT: s_and_b32 s0, s0, s1
+; CHECK-NEXT: s_and_b32 s0, s0, exec_lo
+; CHECK-NEXT: s_cbranch_scc1 .LBB23_2
+; CHECK-NEXT: ; %bb.1: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB23_3
+; CHECK-NEXT: .LBB23_2: ; %true
+; CHECK-NEXT: s_mov_b32 s0, 42
+; CHECK-NEXT: s_branch .LBB23_3
+; CHECK-NEXT: .LBB23_3:
+ %v1c = icmp ult i32 %v1, 12
+ %v2c = icmp ugt i32 %v2, 34
+ %c = and i1 %v1c, %v2c
+ %ballot = call i32 @llvm.amdgcn.icmp.i32(i1 %c, i1 0, i32 32) ; ICMP_EQ == 32
+ %ballot_eq_zero = icmp eq i32 %ballot, 0
+ br i1 %ballot_eq_zero, label %true, label %false
+true:
+ ret i32 42
+false:
+ ret i32 33
+}
\ No newline at end of file
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll
index b9dafcb9a1ca9a2..4611c5385f59fc5 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll
@@ -399,3 +399,115 @@ true:
false:
ret i32 33
}
+
+declare i64 @llvm.amdgcn.icmp.i64(i1, i1, i32)
+
+define amdgpu_cs i32 @branch_divergent_simulated_negated_ballot_ne_zero_and(i32 %v1, i32 %v2) {
+; CHECK-LABEL: branch_divergent_simulated_negated_ballot_ne_zero_and:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 12, v0
+; CHECK-NEXT: v_cmp_lt_u32_e64 s[0:1], 34, v1
+; CHECK-NEXT: s_and_b64 vcc, vcc, s[0:1]
+; CHECK-NEXT: s_cbranch_vccnz .LBB0_2
+; CHECK-NEXT: ; %bb.1: ; %true
+; CHECK-NEXT: s_mov_b32 s0, 42
+; CHECK-NEXT: s_branch .LBB0_3
+; CHECK-NEXT: .LBB0_2: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB0_3
+; CHECK-NEXT: .LBB0_3:
+ %v1c = icmp ult i32 %v1, 12
+ %v2c = icmp ugt i32 %v2, 34
+ %c = and i1 %v1c, %v2c
+ %ballot = call i64 @llvm.amdgcn.icmp.i64(i1 %c, i1 0, i32 32) ; ICMP_EQ == 32
+ %ballot_ne_zero = icmp ne i64 %ballot, 0
+ br i1 %ballot_ne_zero, label %true, label %false
+true:
+ ret i32 42
+false:
+ ret i32 33
+}
+
+define amdgpu_cs i32 @branch_uniform_simulated_negated_ballot_ne_zero_and(i32 inreg %v1, i32 inreg %v2) {
+; CHECK-LABEL: branch_uniform_simulated_negated_ballot_ne_zero_and:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_cmp_lt_u32 s0, 12
+; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
+; CHECK-NEXT: s_cmp_gt_u32 s1, 34
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1]
+; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; CHECK-NEXT: s_cbranch_scc1 .LBB1_2
+; CHECK-NEXT: ; %bb.1: ; %true
+; CHECK-NEXT: s_mov_b32 s0, 42
+; CHECK-NEXT: s_branch .LBB1_3
+; CHECK-NEXT: .LBB1_2: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB1_3
+; CHECK-NEXT: .LBB1_3:
+ %v1c = icmp ult i32 %v1, 12
+ %v2c = icmp ugt i32 %v2, 34
+ %c = and i1 %v1c, %v2c
+ %ballot = call i64 @llvm.amdgcn.icmp.i64(i1 %c, i1 0, i32 32) ; ICMP_EQ == 32
+ %ballot_ne_zero = icmp ne i64 %ballot, 0
+ br i1 %ballot_ne_zero, label %true, label %false
+true:
+ ret i32 42
+false:
+ ret i32 33
+}
+
+define amdgpu_cs i32 @branch_divergent_simulated_negated_ballot_eq_zero_and(i32 %v1, i32 %v2) {
+; CHECK-LABEL: branch_divergent_simulated_negated_ballot_eq_zero_and:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 12, v0
+; CHECK-NEXT: v_cmp_lt_u32_e64 s[0:1], 34, v1
+; CHECK-NEXT: s_and_b64 vcc, vcc, s[0:1]
+; CHECK-NEXT: s_cbranch_vccnz .LBB2_2
+; CHECK-NEXT: ; %bb.1: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB2_3
+; CHECK-NEXT: .LBB2_2: ; %true
+; CHECK-NEXT: s_mov_b32 s0, 42
+; CHECK-NEXT: s_branch .LBB2_3
+; CHECK-NEXT: .LBB2_3:
+ %v1c = icmp ult i32 %v1, 12
+ %v2c = icmp ugt i32 %v2, 34
+ %c = and i1 %v1c, %v2c
+ %ballot = call i64 @llvm.amdgcn.icmp.i64(i1 %c, i1 0, i32 32) ; ICMP_EQ == 32
+ %ballot_eq_zero = icmp eq i64 %ballot, 0
+ br i1 %ballot_eq_zero, label %true, label %false
+true:
+ ret i32 42
+false:
+ ret i32 33
+}
+
+define amdgpu_cs i32 @branch_uniform_simulated_negated_ballot_eq_zero_and(i32 inreg %v1, i32 inreg %v2) {
+; CHECK-LABEL: branch_uniform_simulated_negated_ballot_eq_zero_and:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_cmp_lt_u32 s0, 12
+; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
+; CHECK-NEXT: s_cmp_gt_u32 s1, 34
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1]
+; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; CHECK-NEXT: s_cbranch_scc1 .LBB3_2
+; CHECK-NEXT: ; %bb.1: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB3_3
+; CHECK-NEXT: .LBB3_2: ; %true
+; CHECK-NEXT: s_mov_b32 s0, 42
+; CHECK-NEXT: s_branch .LBB3_3
+; CHECK-NEXT: .LBB3_3:
+ %v1c = icmp ult i32 %v1, 12
+ %v2c = icmp ugt i32 %v2, 34
+ %c = and i1 %v1c, %v2c
+ %ballot = call i64 @llvm.amdgcn.icmp.i64(i1 %c, i1 0, i32 32) ; ICMP_EQ == 32
+ %ballot_eq_zero = icmp eq i64 %ballot, 0
+ br i1 %ballot_eq_zero, label %true, label %false
+true:
+ ret i32 42
+false:
+ ret i32 33
+}
\ No newline at end of file
>From 73ba94def7d9b8cdc13a43bddd4a9d861a31ea2c Mon Sep 17 00:00:00 2001
From: Valery Pykhtin <valery.pykhtin at gmail.com>
Date: Wed, 25 Oct 2023 17:26:55 +0200
Subject: [PATCH 6/6] update test after merge
---
.../CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll | 40 +++++++++----------
1 file changed, 20 insertions(+), 20 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll
index 4611c5385f59fc5..217f930a643da76 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll
@@ -408,14 +408,14 @@ define amdgpu_cs i32 @branch_divergent_simulated_negated_ballot_ne_zero_and(i32
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 12, v0
; CHECK-NEXT: v_cmp_lt_u32_e64 s[0:1], 34, v1
; CHECK-NEXT: s_and_b64 vcc, vcc, s[0:1]
-; CHECK-NEXT: s_cbranch_vccnz .LBB0_2
+; CHECK-NEXT: s_cbranch_vccnz .LBB20_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
-; CHECK-NEXT: s_branch .LBB0_3
-; CHECK-NEXT: .LBB0_2: ; %false
+; CHECK-NEXT: s_branch .LBB20_3
+; CHECK-NEXT: .LBB20_2: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
-; CHECK-NEXT: s_branch .LBB0_3
-; CHECK-NEXT: .LBB0_3:
+; CHECK-NEXT: s_branch .LBB20_3
+; CHECK-NEXT: .LBB20_3:
%v1c = icmp ult i32 %v1, 12
%v2c = icmp ugt i32 %v2, 34
%c = and i1 %v1c, %v2c
@@ -437,14 +437,14 @@ define amdgpu_cs i32 @branch_uniform_simulated_negated_ballot_ne_zero_and(i32 in
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1]
; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], exec
-; CHECK-NEXT: s_cbranch_scc1 .LBB1_2
+; CHECK-NEXT: s_cbranch_scc1 .LBB21_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
-; CHECK-NEXT: s_branch .LBB1_3
-; CHECK-NEXT: .LBB1_2: ; %false
+; CHECK-NEXT: s_branch .LBB21_3
+; CHECK-NEXT: .LBB21_2: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
-; CHECK-NEXT: s_branch .LBB1_3
-; CHECK-NEXT: .LBB1_3:
+; CHECK-NEXT: s_branch .LBB21_3
+; CHECK-NEXT: .LBB21_3:
%v1c = icmp ult i32 %v1, 12
%v2c = icmp ugt i32 %v2, 34
%c = and i1 %v1c, %v2c
@@ -463,14 +463,14 @@ define amdgpu_cs i32 @branch_divergent_simulated_negated_ballot_eq_zero_and(i32
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 12, v0
; CHECK-NEXT: v_cmp_lt_u32_e64 s[0:1], 34, v1
; CHECK-NEXT: s_and_b64 vcc, vcc, s[0:1]
-; CHECK-NEXT: s_cbranch_vccnz .LBB2_2
+; CHECK-NEXT: s_cbranch_vccnz .LBB22_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
-; CHECK-NEXT: s_branch .LBB2_3
-; CHECK-NEXT: .LBB2_2: ; %true
+; CHECK-NEXT: s_branch .LBB22_3
+; CHECK-NEXT: .LBB22_2: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
-; CHECK-NEXT: s_branch .LBB2_3
-; CHECK-NEXT: .LBB2_3:
+; CHECK-NEXT: s_branch .LBB22_3
+; CHECK-NEXT: .LBB22_3:
%v1c = icmp ult i32 %v1, 12
%v2c = icmp ugt i32 %v2, 34
%c = and i1 %v1c, %v2c
@@ -492,14 +492,14 @@ define amdgpu_cs i32 @branch_uniform_simulated_negated_ballot_eq_zero_and(i32 in
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1]
; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], exec
-; CHECK-NEXT: s_cbranch_scc1 .LBB3_2
+; CHECK-NEXT: s_cbranch_scc1 .LBB23_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
-; CHECK-NEXT: s_branch .LBB3_3
-; CHECK-NEXT: .LBB3_2: ; %true
+; CHECK-NEXT: s_branch .LBB23_3
+; CHECK-NEXT: .LBB23_2: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
-; CHECK-NEXT: s_branch .LBB3_3
-; CHECK-NEXT: .LBB3_3:
+; CHECK-NEXT: s_branch .LBB23_3
+; CHECK-NEXT: .LBB23_3:
%v1c = icmp ult i32 %v1, 12
%v2c = icmp ugt i32 %v2, 34
%c = and i1 %v1c, %v2c
More information about the cfe-commits
mailing list