[llvm] [AMDGPU] (x or y) xor -1 -> x nor y (PR #130264)
Ana Mihajlovic via llvm-commits
llvm-commits at lists.llvm.org
Tue Mar 25 02:11:11 PDT 2025
https://github.com/mihajlovicana updated https://github.com/llvm/llvm-project/pull/130264
>From 9688df280b7ddff5d99b530d9fbfd6bc3bfeff83 Mon Sep 17 00:00:00 2001
From: Ana Mihajlovic <Ana.Mihajlovic at amd.com>
Date: Fri, 7 Mar 2025 10:16:54 +0100
Subject: [PATCH 1/4] [isel] (x or y) xor -1 -> x nor y
---
llvm/lib/Target/AMDGPU/SOPInstructions.td | 14 +++++
llvm/test/CodeGen/AMDGPU/isel-nor-32.ll | 68 +++++++++++++++++++++++
llvm/test/CodeGen/AMDGPU/isel-nor-64.ll | 68 +++++++++++++++++++++++
3 files changed, 150 insertions(+)
create mode 100644 llvm/test/CodeGen/AMDGPU/isel-nor-32.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/isel-nor-64.ll
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index eeac9c1ad1084..73f4655f735a2 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -1925,6 +1925,20 @@ def : ScalarNot2Pat<S_ORN2_B32, or, v2i16>;
def : ScalarNot2Pat<S_ORN2_B64, or, v4i16>;
def : ScalarNot2Pat<S_ORN2_B64, or, v2i32>;
+let WaveSizePredicate = isWave32 in { // Wave32 only: select the 32-bit scalar NOR.
+def : GCNPat<
+ (i1 (not (or_oneuse i1:$src0, i1:$src1))), // i1 "(x or y) xor -1"; or_oneuse restricts the match to an or with a single use
+ (S_NOR_B32 i1:$src0, i1:$src1) // replaces the or + not pair with one s_nor_b32
+>;
+}
+
+let WaveSizePredicate = isWave64 in { // Wave64 only: same fold, using the 64-bit opcode.
+def : GCNPat<
+ (i1 (not (or_oneuse i1:$src0, i1:$src1))), // identical source pattern to the Wave32 case above
+ (S_NOR_B64 i1:$src0, i1:$src1) // replaces the or + not pair with one s_nor_b64
+>;
+}
+
//===----------------------------------------------------------------------===//
// Target-specific instruction encodings.
//===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/AMDGPU/isel-nor-32.ll b/llvm/test/CodeGen/AMDGPU/isel-nor-32.ll
new file mode 100644
index 0000000000000..983a335568cc1
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/isel-nor-32.ll
@@ -0,0 +1,68 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr="+wavefrontsize32,-wavefrontsize64" -o - < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr="+wavefrontsize32,-wavefrontsize64" -o - < %s | FileCheck %s
+
+define amdgpu_ps void @divergent_i1_phi_if_else(ptr addrspace(1) %out, i32 %tid, i32 %a, i32 %b, i32 %c, i32 %d) {
+; CHECK-LABEL: divergent_i1_phi_if_else:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: v_cmp_le_u32_e64 s0, v3, v4
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; CHECK-NEXT: s_mov_b32 s2, s0
+; CHECK-NEXT: s_and_saveexec_b32 s1, s0
+; CHECK-NEXT: ; %bb.1: ; %C
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v5
+; CHECK-NEXT: s_and_not1_b32 s2, s0, exec_lo
+; CHECK-NEXT: s_and_b32 s3, vcc_lo, exec_lo
+; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-NEXT: s_or_b32 s2, s2, s3
+; CHECK-NEXT: ; %bb.2: ; %MergeCF
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; CHECK-NEXT: s_nor_b32 s1, s0, s2
+; CHECK-NEXT: ; implicit-def: $sgpr0
+; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; CHECK-NEXT: s_and_saveexec_b32 s2, s1
+; CHECK-NEXT: s_xor_b32 s1, exec_lo, s2
+; CHECK-NEXT: ; %bb.3: ; %B
+; CHECK-NEXT: v_cmp_gt_u32_e64 s0, 2, v2
+; CHECK-NEXT: ; implicit-def: $vgpr2
+; CHECK-NEXT: ; %bb.4: ; %Flow
+; CHECK-NEXT: s_and_not1_saveexec_b32 s1, s1
+; CHECK-NEXT: ; %bb.5: ; %A
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; CHECK-NEXT: s_and_not1_b32 s0, s0, exec_lo
+; CHECK-NEXT: s_and_b32 s2, vcc_lo, exec_lo
+; CHECK-NEXT: s_or_b32 s0, s0, s2
+; CHECK-NEXT: ; %bb.6: ; %exit
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; CHECK-NEXT: v_cndmask_b32_e64 v2, 2, 1, s0
+; CHECK-NEXT: global_store_b32 v[0:1], v2, off
+; CHECK-NEXT: s_endpgm
+entry:
+ %x = icmp ule i32 %a, %b
+ br i1 %x, label %C, label %MergeCF
+
+C:
+ %y = icmp eq i32 %a, %c
+ br label %MergeCF
+
+MergeCF:
+ %z = phi i1 [ %x, %entry ], [ %y, %C ]
+ %w = icmp ule i32 %a, %b
+ %cmp = or i1 %w, %z
+ br i1 %cmp, label %A, label %B
+
+A:
+ %val_A = icmp uge i32 %tid, 1
+ br label %exit
+
+B:
+ %val_B = icmp ult i32 %tid, 2
+ br label %exit
+
+exit:
+ %phi = phi i1 [ %val_A, %A ], [ %val_B, %B ]
+ %sel = select i1 %phi, i32 1, i32 2
+ store i32 %sel, ptr addrspace(1) %out
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/isel-nor-64.ll b/llvm/test/CodeGen/AMDGPU/isel-nor-64.ll
new file mode 100644
index 0000000000000..88e0a05556b78
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/isel-nor-64.ll
@@ -0,0 +1,68 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr="-wavefrontsize32,+wavefrontsize64" -o - < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr="-wavefrontsize32,+wavefrontsize64" -o - < %s | FileCheck %s
+
+define amdgpu_ps void @divergent_i1_phi_if_else(ptr addrspace(1) %out, i32 %tid, i32 %a, i32 %b, i32 %c, i32 %d) {
+; CHECK-LABEL: divergent_i1_phi_if_else:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: v_cmp_le_u32_e64 s[0:1], v3, v4
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; CHECK-NEXT: s_mov_b64 s[4:5], s[0:1]
+; CHECK-NEXT: s_and_saveexec_b64 s[2:3], s[0:1]
+; CHECK-NEXT: ; %bb.1: ; %C
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; CHECK-NEXT: s_and_not1_b64 s[4:5], s[0:1], exec
+; CHECK-NEXT: s_and_b64 s[6:7], vcc, exec
+; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7]
+; CHECK-NEXT: ; %bb.2: ; %MergeCF
+; CHECK-NEXT: s_or_b64 exec, exec, s[2:3]
+; CHECK-NEXT: s_nor_b64 s[2:3], s[0:1], s[4:5]
+; CHECK-NEXT: ; implicit-def: $sgpr0_sgpr1
+; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; CHECK-NEXT: s_and_saveexec_b64 s[4:5], s[2:3]
+; CHECK-NEXT: s_xor_b64 s[2:3], exec, s[4:5]
+; CHECK-NEXT: ; %bb.3: ; %B
+; CHECK-NEXT: v_cmp_gt_u32_e64 s[0:1], 2, v2
+; CHECK-NEXT: ; implicit-def: $vgpr2
+; CHECK-NEXT: ; %bb.4: ; %Flow
+; CHECK-NEXT: s_and_not1_saveexec_b64 s[2:3], s[2:3]
+; CHECK-NEXT: ; %bb.5: ; %A
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; CHECK-NEXT: s_and_not1_b64 s[0:1], s[0:1], exec
+; CHECK-NEXT: s_and_b64 s[4:5], vcc, exec
+; CHECK-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
+; CHECK-NEXT: ; %bb.6: ; %exit
+; CHECK-NEXT: s_or_b64 exec, exec, s[2:3]
+; CHECK-NEXT: v_cndmask_b32_e64 v2, 2, 1, s[0:1]
+; CHECK-NEXT: global_store_b32 v[0:1], v2, off
+; CHECK-NEXT: s_endpgm
+entry:
+ %x = icmp ule i32 %a, %b
+ br i1 %x, label %C, label %MergeCF
+
+C:
+ %y = icmp eq i32 %a, %c
+ br label %MergeCF
+
+MergeCF:
+ %z = phi i1 [ %x, %entry ], [ %y, %C ]
+ %w = icmp ule i32 %a, %b
+ %cmp = or i1 %w, %z
+ br i1 %cmp, label %A, label %B
+
+A:
+ %val_A = icmp uge i32 %tid, 1
+ br label %exit
+
+B:
+ %val_B = icmp ult i32 %tid, 2
+ br label %exit
+
+exit:
+ %phi = phi i1 [ %val_A, %A ], [ %val_B, %B ]
+ %sel = select i1 %phi, i32 1, i32 2
+ store i32 %sel, ptr addrspace(1) %out
+ ret void
+}
>From 848d09eb869740296ee2bd1e2f8b90b3684d05d4 Mon Sep 17 00:00:00 2001
From: Ana Mihajlovic <Ana.Mihajlovic at amd.com>
Date: Tue, 11 Mar 2025 14:04:21 +0100
Subject: [PATCH 2/4] update tests
---
llvm/test/CodeGen/AMDGPU/isel-nor-32.ll | 171 +++++++++++++++---------
llvm/test/CodeGen/AMDGPU/isel-nor-64.ll | 162 +++++++++++++---------
2 files changed, 210 insertions(+), 123 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/isel-nor-32.ll b/llvm/test/CodeGen/AMDGPU/isel-nor-32.ll
index 983a335568cc1..a0e48a35d6c97 100644
--- a/llvm/test/CodeGen/AMDGPU/isel-nor-32.ll
+++ b/llvm/test/CodeGen/AMDGPU/isel-nor-32.ll
@@ -1,68 +1,115 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr="+wavefrontsize32,-wavefrontsize64" -o - < %s | FileCheck %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr="+wavefrontsize32,-wavefrontsize64" -o - < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr="+wavefrontsize32,-wavefrontsize64" -o - < %s | FileCheck -check-prefix=GFX12W32 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr="+wavefrontsize32,-wavefrontsize64" -o - < %s | FileCheck -check-prefixes=GFX11W32 %s
-define amdgpu_ps void @divergent_i1_phi_if_else(ptr addrspace(1) %out, i32 %tid, i32 %a, i32 %b, i32 %c, i32 %d) {
-; CHECK-LABEL: divergent_i1_phi_if_else:
-; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: v_cmp_le_u32_e64 s0, v3, v4
-; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; CHECK-NEXT: s_mov_b32 s2, s0
-; CHECK-NEXT: s_and_saveexec_b32 s1, s0
-; CHECK-NEXT: ; %bb.1: ; %C
-; CHECK-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v5
-; CHECK-NEXT: s_and_not1_b32 s2, s0, exec_lo
-; CHECK-NEXT: s_and_b32 s3, vcc_lo, exec_lo
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_or_b32 s2, s2, s3
-; CHECK-NEXT: ; %bb.2: ; %MergeCF
-; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; CHECK-NEXT: s_nor_b32 s1, s0, s2
-; CHECK-NEXT: ; implicit-def: $sgpr0
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_saveexec_b32 s2, s1
-; CHECK-NEXT: s_xor_b32 s1, exec_lo, s2
-; CHECK-NEXT: ; %bb.3: ; %B
-; CHECK-NEXT: v_cmp_gt_u32_e64 s0, 2, v2
-; CHECK-NEXT: ; implicit-def: $vgpr2
-; CHECK-NEXT: ; %bb.4: ; %Flow
-; CHECK-NEXT: s_and_not1_saveexec_b32 s1, s1
-; CHECK-NEXT: ; %bb.5: ; %A
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 s0, s0, exec_lo
-; CHECK-NEXT: s_and_b32 s2, vcc_lo, exec_lo
-; CHECK-NEXT: s_or_b32 s0, s0, s2
-; CHECK-NEXT: ; %bb.6: ; %exit
-; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; CHECK-NEXT: v_cndmask_b32_e64 v2, 2, 1, s0
-; CHECK-NEXT: global_store_b32 v[0:1], v2, off
-; CHECK-NEXT: s_endpgm
-entry:
- %x = icmp ule i32 %a, %b
- br i1 %x, label %C, label %MergeCF
-
-C:
- %y = icmp eq i32 %a, %c
- br label %MergeCF
-
-MergeCF:
- %z = phi i1 [ %x, %entry ], [ %y, %C ]
- %w = icmp ule i32 %a, %b
- %cmp = or i1 %w, %z
- br i1 %cmp, label %A, label %B
-
-A:
- %val_A = icmp uge i32 %tid, 1
- br label %exit
+define amdgpu_ps i32 @test_w32(i32 %x, i32 %y) {
+; GFX12W32-LABEL: test_w32:
+; GFX12W32: ; %bb.0:
+; GFX12W32-NEXT: v_readfirstlane_b32 s0, v0
+; GFX12W32-NEXT: v_readfirstlane_b32 s1, v1
+; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12W32-NEXT: s_nor_b32 s0, s0, s1
+; GFX12W32-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12W32-NEXT: v_cmp_ne_u32_e64 s0, 0, v0
+; GFX12W32-NEXT: s_wait_alu 0xf1ff
+; GFX12W32-NEXT: ; return to shader part epilog
+;
+; GFX11W32-LABEL: test_w32:
+; GFX11W32: ; %bb.0:
+; GFX11W32-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11W32-NEXT: v_readfirstlane_b32 s1, v1
+; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11W32-NEXT: s_nor_b32 s0, s0, s1
+; GFX11W32-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11W32-NEXT: v_cmp_ne_u32_e64 s0, 0, v0
+; GFX11W32-NEXT: ; return to shader part epilog
+ %x.b = call i1 @llvm.amdgcn.inverse.ballot.i32(i32 %x)
+ %y.b = call i1 @llvm.amdgcn.inverse.ballot.i32(i32 %y)
+ %t = or i1 %x.b, %y.b
+ %t.1 = xor i1 %t, -1
+ %z = call i32 @llvm.amdgcn.ballot.i32(i1 %t.1)
+ ret i32 %z
+}
-B:
- %val_B = icmp ult i32 %tid, 2
- br label %exit
+define amdgpu_ps i32 @negative_test_w32(i32 %x, i32 %y) {
+; GFX12W32-LABEL: negative_test_w32:
+; GFX12W32: ; %bb.0:
+; GFX12W32-NEXT: v_readfirstlane_b32 s0, v0
+; GFX12W32-NEXT: v_readfirstlane_b32 s1, v1
+; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12W32-NEXT: s_or_b32 s0, s0, s1
+; GFX12W32-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX12W32-NEXT: s_xor_b32 s0, s0, -1
+; GFX12W32-NEXT: s_wait_alu 0xfffe
+; GFX12W32-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12W32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX12W32-NEXT: v_cmp_ne_u32_e64 s0, 0, v1
+; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12W32-NEXT: s_add_co_i32 s0, s0, vcc_lo
+; GFX12W32-NEXT: s_wait_alu 0xfffe
+; GFX12W32-NEXT: ; return to shader part epilog
+;
+; GFX11W32-LABEL: negative_test_w32:
+; GFX11W32: ; %bb.0:
+; GFX11W32-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11W32-NEXT: v_readfirstlane_b32 s1, v1
+; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11W32-NEXT: s_or_b32 s0, s0, s1
+; GFX11W32-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11W32-NEXT: s_xor_b32 s0, s0, -1
+; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11W32-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX11W32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11W32-NEXT: v_cmp_ne_u32_e64 s0, 0, v1
+; GFX11W32-NEXT: s_add_i32 s0, s0, vcc_lo
+; GFX11W32-NEXT: ; return to shader part epilog
+ %x.b = call i1 @llvm.amdgcn.inverse.ballot.i32(i32 %x)
+ %y.b = call i1 @llvm.amdgcn.inverse.ballot.i32(i32 %y)
+ %t = or i1 %x.b, %y.b
+ %t.1 = xor i1 %t, -1
+ %p.1 = xor i1 %t, -4
+ %z = call i32 @llvm.amdgcn.ballot.i32(i1 %t.1)
+ %q = call i32 @llvm.amdgcn.ballot.i32(i1 %p.1)
+ %r = add i32 %z, %q
+ ret i32 %r
+}
-exit:
- %phi = phi i1 [ %val_A, %A ], [ %val_B, %B ]
- %sel = select i1 %phi, i32 1, i32 2
- store i32 %sel, ptr addrspace(1) %out
+define amdgpu_ps void @test_vgpr_w32(<4 x i32> %x, <4 x i32> %y, ptr addrspace(1) %out) {
+; GFX12W32-LABEL: test_vgpr_w32:
+; GFX12W32: ; %bb.0:
+; GFX12W32-NEXT: v_or_b32_e32 v3, v3, v7
+; GFX12W32-NEXT: v_or_b32_e32 v2, v2, v6
+; GFX12W32-NEXT: v_or_b32_e32 v1, v1, v5
+; GFX12W32-NEXT: v_or_b32_e32 v0, v0, v4
+; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12W32-NEXT: v_not_b32_e32 v3, v3
+; GFX12W32-NEXT: v_not_b32_e32 v2, v2
+; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12W32-NEXT: v_not_b32_e32 v1, v1
+; GFX12W32-NEXT: v_not_b32_e32 v0, v0
+; GFX12W32-NEXT: global_store_b128 v[8:9], v[0:3], off
+; GFX12W32-NEXT: s_endpgm
+;
+; GFX11W32-LABEL: test_vgpr_w32:
+; GFX11W32: ; %bb.0:
+; GFX11W32-NEXT: v_or_b32_e32 v3, v3, v7
+; GFX11W32-NEXT: v_or_b32_e32 v2, v2, v6
+; GFX11W32-NEXT: v_or_b32_e32 v1, v1, v5
+; GFX11W32-NEXT: v_or_b32_e32 v0, v0, v4
+; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11W32-NEXT: v_not_b32_e32 v3, v3
+; GFX11W32-NEXT: v_not_b32_e32 v2, v2
+; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11W32-NEXT: v_not_b32_e32 v1, v1
+; GFX11W32-NEXT: v_not_b32_e32 v0, v0
+; GFX11W32-NEXT: global_store_b128 v[8:9], v[0:3], off
+; GFX11W32-NEXT: s_endpgm
+ %p = or <4 x i32> %x, %y
+ %q = xor <4 x i32> %p, <i32 -1, i32 -1, i32 -1, i32 -1>
+ store <4 x i32> %q, ptr addrspace(1) %out
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/isel-nor-64.ll b/llvm/test/CodeGen/AMDGPU/isel-nor-64.ll
index 88e0a05556b78..d533e07aca050 100644
--- a/llvm/test/CodeGen/AMDGPU/isel-nor-64.ll
+++ b/llvm/test/CodeGen/AMDGPU/isel-nor-64.ll
@@ -1,68 +1,108 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr="-wavefrontsize32,+wavefrontsize64" -o - < %s | FileCheck %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr="-wavefrontsize32,+wavefrontsize64" -o - < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr="-wavefrontsize32,+wavefrontsize64" -o - < %s | FileCheck -check-prefix=GFX12W64 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr="-wavefrontsize32,+wavefrontsize64" -o - < %s | FileCheck -check-prefix=GFX11W64 %s
-define amdgpu_ps void @divergent_i1_phi_if_else(ptr addrspace(1) %out, i32 %tid, i32 %a, i32 %b, i32 %c, i32 %d) {
-; CHECK-LABEL: divergent_i1_phi_if_else:
-; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: v_cmp_le_u32_e64 s[0:1], v3, v4
-; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; CHECK-NEXT: s_mov_b64 s[4:5], s[0:1]
-; CHECK-NEXT: s_and_saveexec_b64 s[2:3], s[0:1]
-; CHECK-NEXT: ; %bb.1: ; %C
-; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; CHECK-NEXT: s_and_not1_b64 s[4:5], s[0:1], exec
-; CHECK-NEXT: s_and_b64 s[6:7], vcc, exec
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7]
-; CHECK-NEXT: ; %bb.2: ; %MergeCF
-; CHECK-NEXT: s_or_b64 exec, exec, s[2:3]
-; CHECK-NEXT: s_nor_b64 s[2:3], s[0:1], s[4:5]
-; CHECK-NEXT: ; implicit-def: $sgpr0_sgpr1
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_saveexec_b64 s[4:5], s[2:3]
-; CHECK-NEXT: s_xor_b64 s[2:3], exec, s[4:5]
-; CHECK-NEXT: ; %bb.3: ; %B
-; CHECK-NEXT: v_cmp_gt_u32_e64 s[0:1], 2, v2
-; CHECK-NEXT: ; implicit-def: $vgpr2
-; CHECK-NEXT: ; %bb.4: ; %Flow
-; CHECK-NEXT: s_and_not1_saveexec_b64 s[2:3], s[2:3]
-; CHECK-NEXT: ; %bb.5: ; %A
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b64 s[0:1], s[0:1], exec
-; CHECK-NEXT: s_and_b64 s[4:5], vcc, exec
-; CHECK-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
-; CHECK-NEXT: ; %bb.6: ; %exit
-; CHECK-NEXT: s_or_b64 exec, exec, s[2:3]
-; CHECK-NEXT: v_cndmask_b32_e64 v2, 2, 1, s[0:1]
-; CHECK-NEXT: global_store_b32 v[0:1], v2, off
-; CHECK-NEXT: s_endpgm
-entry:
- %x = icmp ule i32 %a, %b
- br i1 %x, label %C, label %MergeCF
-C:
- %y = icmp eq i32 %a, %c
- br label %MergeCF
-
-MergeCF:
- %z = phi i1 [ %x, %entry ], [ %y, %C ]
- %w = icmp ule i32 %a, %b
- %cmp = or i1 %w, %z
- br i1 %cmp, label %A, label %B
-
-A:
- %val_A = icmp uge i32 %tid, 1
- br label %exit
+define amdgpu_ps i64 @test_w64(i64 inreg %x, i64 inreg %y) {
+; GFX12W64-LABEL: test_w64:
+; GFX12W64: ; %bb.0:
+; GFX12W64-NEXT: s_nor_b64 s[0:1], s[0:1], s[2:3]
+; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12W64-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX12W64-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v0
+; GFX12W64-NEXT: s_wait_alu 0xf1ff
+; GFX12W64-NEXT: ; return to shader part epilog
+;
+; GFX11W64-LABEL: test_w64:
+; GFX11W64: ; %bb.0:
+; GFX11W64-NEXT: s_nor_b64 s[0:1], s[0:1], s[2:3]
+; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11W64-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX11W64-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v0
+; GFX11W64-NEXT: ; return to shader part epilog
+ %x.b = call i1 @llvm.amdgcn.inverse.ballot.i64(i64 %x)
+ %y.b = call i1 @llvm.amdgcn.inverse.ballot.i64(i64 %y)
+ %t = or i1 %x.b, %y.b
+ %t.1 = xor i1 %t, -1
+ %z = call i64 @llvm.amdgcn.ballot.i64(i1 %t.1)
+ ret i64 %z
+}
-B:
- %val_B = icmp ult i32 %tid, 2
- br label %exit
+define amdgpu_ps i64 @negative_test_w64(i64 inreg %x, i64 inreg %y, ptr addrspace(1) %out) {
+; GFX12W64-LABEL: negative_test_w64:
+; GFX12W64: ; %bb.0:
+; GFX12W64-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12W64-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX12W64-NEXT: s_xor_b64 s[0:1], s[0:1], -1
+; GFX12W64-NEXT: s_wait_alu 0xfffe
+; GFX12W64-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; GFX12W64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12W64-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v1
+; GFX12W64-NEXT: s_add_nc_u64 s[0:1], s[0:1], vcc
+; GFX12W64-NEXT: s_wait_alu 0xfffe
+; GFX12W64-NEXT: ; return to shader part epilog
+;
+; GFX11W64-LABEL: negative_test_w64:
+; GFX11W64: ; %bb.0:
+; GFX11W64-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11W64-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX11W64-NEXT: s_xor_b64 s[0:1], s[0:1], -1
+; GFX11W64-NEXT: s_waitcnt_depctr 0xfffe
+; GFX11W64-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; GFX11W64-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v0
+; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11W64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
+; GFX11W64-NEXT: s_add_u32 s0, vcc_lo, s0
+; GFX11W64-NEXT: s_waitcnt_depctr 0xfffe
+; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11W64-NEXT: s_addc_u32 s1, vcc_hi, s1
+; GFX11W64-NEXT: ; return to shader part epilog
+ %x.b = call i1 @llvm.amdgcn.inverse.ballot.i64(i64 %x)
+ %y.b = call i1 @llvm.amdgcn.inverse.ballot.i64(i64 %y)
+ %t = or i1 %x.b, %y.b
+ %t.1 = xor i1 %t, -1
+ %p.1 = xor i1 %t, -4
+ %z = call i64 @llvm.amdgcn.ballot.i64(i1 %t.1)
+ %q = call i64 @llvm.amdgcn.ballot.i64(i1 %p.1)
+ %r = add i64 %z, %q
+ ret i64 %r
+}
-exit:
- %phi = phi i1 [ %val_A, %A ], [ %val_B, %B ]
- %sel = select i1 %phi, i32 1, i32 2
- store i32 %sel, ptr addrspace(1) %out
+define amdgpu_ps void @test_vgpr_w64(<4 x i32> %x, <4 x i32> %y, ptr addrspace(1) %out) {
+; GFX12W64-LABEL: test_vgpr_w64:
+; GFX12W64: ; %bb.0:
+; GFX12W64-NEXT: v_or_b32_e32 v3, v3, v7
+; GFX12W64-NEXT: v_or_b32_e32 v2, v2, v6
+; GFX12W64-NEXT: v_or_b32_e32 v1, v1, v5
+; GFX12W64-NEXT: v_or_b32_e32 v0, v0, v4
+; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12W64-NEXT: v_not_b32_e32 v3, v3
+; GFX12W64-NEXT: v_not_b32_e32 v2, v2
+; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12W64-NEXT: v_not_b32_e32 v1, v1
+; GFX12W64-NEXT: v_not_b32_e32 v0, v0
+; GFX12W64-NEXT: global_store_b128 v[8:9], v[0:3], off
+; GFX12W64-NEXT: s_endpgm
+;
+; GFX11W64-LABEL: test_vgpr_w64:
+; GFX11W64: ; %bb.0:
+; GFX11W64-NEXT: v_or_b32_e32 v3, v3, v7
+; GFX11W64-NEXT: v_or_b32_e32 v2, v2, v6
+; GFX11W64-NEXT: v_or_b32_e32 v1, v1, v5
+; GFX11W64-NEXT: v_or_b32_e32 v0, v0, v4
+; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11W64-NEXT: v_not_b32_e32 v3, v3
+; GFX11W64-NEXT: v_not_b32_e32 v2, v2
+; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11W64-NEXT: v_not_b32_e32 v1, v1
+; GFX11W64-NEXT: v_not_b32_e32 v0, v0
+; GFX11W64-NEXT: global_store_b128 v[8:9], v[0:3], off
+; GFX11W64-NEXT: s_endpgm
+ %p = or <4 x i32> %x, %y
+ %q = xor <4 x i32> %p, <i32 -1, i32 -1, i32 -1, i32 -1>
+ store <4 x i32> %q, ptr addrspace(1) %out
ret void
}
>From 776168b5880644bab1f3aa702a067e52bd916c9d Mon Sep 17 00:00:00 2001
From: Ana Mihajlovic <Ana.Mihajlovic at amd.com>
Date: Wed, 12 Mar 2025 17:32:10 +0100
Subject: [PATCH 3/4] update ballot test, update nor test
---
llvm/test/CodeGen/AMDGPU/isel-nor-32.ll | 115 ------------------------
llvm/test/CodeGen/AMDGPU/isel-nor-64.ll | 108 ----------------------
llvm/test/CodeGen/AMDGPU/nor-32.ll | 62 +++++++++++++
llvm/test/CodeGen/AMDGPU/nor-64.ll | 63 +++++++++++++
llvm/test/CodeGen/AMDGPU/nor.ll | 83 ++++++++++++++++-
5 files changed, 204 insertions(+), 227 deletions(-)
delete mode 100644 llvm/test/CodeGen/AMDGPU/isel-nor-32.ll
delete mode 100644 llvm/test/CodeGen/AMDGPU/isel-nor-64.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/nor-32.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/nor-64.ll
diff --git a/llvm/test/CodeGen/AMDGPU/isel-nor-32.ll b/llvm/test/CodeGen/AMDGPU/isel-nor-32.ll
deleted file mode 100644
index a0e48a35d6c97..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/isel-nor-32.ll
+++ /dev/null
@@ -1,115 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr="+wavefrontsize32,-wavefrontsize64" -o - < %s | FileCheck -check-prefix=GFX12W32 %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr="+wavefrontsize32,-wavefrontsize64" -o - < %s | FileCheck -check-prefixes=GFX11W32 %s
-
-define amdgpu_ps i32 @test_w32(i32 %x, i32 %y) {
-; GFX12W32-LABEL: test_w32:
-; GFX12W32: ; %bb.0:
-; GFX12W32-NEXT: v_readfirstlane_b32 s0, v0
-; GFX12W32-NEXT: v_readfirstlane_b32 s1, v1
-; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX12W32-NEXT: s_nor_b32 s0, s0, s1
-; GFX12W32-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12W32-NEXT: v_cmp_ne_u32_e64 s0, 0, v0
-; GFX12W32-NEXT: s_wait_alu 0xf1ff
-; GFX12W32-NEXT: ; return to shader part epilog
-;
-; GFX11W32-LABEL: test_w32:
-; GFX11W32: ; %bb.0:
-; GFX11W32-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11W32-NEXT: v_readfirstlane_b32 s1, v1
-; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11W32-NEXT: s_nor_b32 s0, s0, s1
-; GFX11W32-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11W32-NEXT: v_cmp_ne_u32_e64 s0, 0, v0
-; GFX11W32-NEXT: ; return to shader part epilog
- %x.b = call i1 @llvm.amdgcn.inverse.ballot.i32(i32 %x)
- %y.b = call i1 @llvm.amdgcn.inverse.ballot.i32(i32 %y)
- %t = or i1 %x.b, %y.b
- %t.1 = xor i1 %t, -1
- %z = call i32 @llvm.amdgcn.ballot.i32(i1 %t.1)
- ret i32 %z
-}
-
-define amdgpu_ps i32 @negative_test_w32(i32 %x, i32 %y) {
-; GFX12W32-LABEL: negative_test_w32:
-; GFX12W32: ; %bb.0:
-; GFX12W32-NEXT: v_readfirstlane_b32 s0, v0
-; GFX12W32-NEXT: v_readfirstlane_b32 s1, v1
-; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX12W32-NEXT: s_or_b32 s0, s0, s1
-; GFX12W32-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX12W32-NEXT: s_xor_b32 s0, s0, -1
-; GFX12W32-NEXT: s_wait_alu 0xfffe
-; GFX12W32-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
-; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12W32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX12W32-NEXT: v_cmp_ne_u32_e64 s0, 0, v1
-; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12W32-NEXT: s_add_co_i32 s0, s0, vcc_lo
-; GFX12W32-NEXT: s_wait_alu 0xfffe
-; GFX12W32-NEXT: ; return to shader part epilog
-;
-; GFX11W32-LABEL: negative_test_w32:
-; GFX11W32: ; %bb.0:
-; GFX11W32-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11W32-NEXT: v_readfirstlane_b32 s1, v1
-; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11W32-NEXT: s_or_b32 s0, s0, s1
-; GFX11W32-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX11W32-NEXT: s_xor_b32 s0, s0, -1
-; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11W32-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
-; GFX11W32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11W32-NEXT: v_cmp_ne_u32_e64 s0, 0, v1
-; GFX11W32-NEXT: s_add_i32 s0, s0, vcc_lo
-; GFX11W32-NEXT: ; return to shader part epilog
- %x.b = call i1 @llvm.amdgcn.inverse.ballot.i32(i32 %x)
- %y.b = call i1 @llvm.amdgcn.inverse.ballot.i32(i32 %y)
- %t = or i1 %x.b, %y.b
- %t.1 = xor i1 %t, -1
- %p.1 = xor i1 %t, -4
- %z = call i32 @llvm.amdgcn.ballot.i32(i1 %t.1)
- %q = call i32 @llvm.amdgcn.ballot.i32(i1 %p.1)
- %r = add i32 %z, %q
- ret i32 %r
-}
-
-define amdgpu_ps void @test_vgpr_w32(<4 x i32> %x, <4 x i32> %y, ptr addrspace(1) %out) {
-; GFX12W32-LABEL: test_vgpr_w32:
-; GFX12W32: ; %bb.0:
-; GFX12W32-NEXT: v_or_b32_e32 v3, v3, v7
-; GFX12W32-NEXT: v_or_b32_e32 v2, v2, v6
-; GFX12W32-NEXT: v_or_b32_e32 v1, v1, v5
-; GFX12W32-NEXT: v_or_b32_e32 v0, v0, v4
-; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12W32-NEXT: v_not_b32_e32 v3, v3
-; GFX12W32-NEXT: v_not_b32_e32 v2, v2
-; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12W32-NEXT: v_not_b32_e32 v1, v1
-; GFX12W32-NEXT: v_not_b32_e32 v0, v0
-; GFX12W32-NEXT: global_store_b128 v[8:9], v[0:3], off
-; GFX12W32-NEXT: s_endpgm
-;
-; GFX11W32-LABEL: test_vgpr_w32:
-; GFX11W32: ; %bb.0:
-; GFX11W32-NEXT: v_or_b32_e32 v3, v3, v7
-; GFX11W32-NEXT: v_or_b32_e32 v2, v2, v6
-; GFX11W32-NEXT: v_or_b32_e32 v1, v1, v5
-; GFX11W32-NEXT: v_or_b32_e32 v0, v0, v4
-; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11W32-NEXT: v_not_b32_e32 v3, v3
-; GFX11W32-NEXT: v_not_b32_e32 v2, v2
-; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11W32-NEXT: v_not_b32_e32 v1, v1
-; GFX11W32-NEXT: v_not_b32_e32 v0, v0
-; GFX11W32-NEXT: global_store_b128 v[8:9], v[0:3], off
-; GFX11W32-NEXT: s_endpgm
- %p = or <4 x i32> %x, %y
- %q = xor <4 x i32> %p, <i32 -1, i32 -1, i32 -1, i32 -1>
- store <4 x i32> %q, ptr addrspace(1) %out
- ret void
-}
diff --git a/llvm/test/CodeGen/AMDGPU/isel-nor-64.ll b/llvm/test/CodeGen/AMDGPU/isel-nor-64.ll
deleted file mode 100644
index d533e07aca050..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/isel-nor-64.ll
+++ /dev/null
@@ -1,108 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr="-wavefrontsize32,+wavefrontsize64" -o - < %s | FileCheck -check-prefix=GFX12W64 %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr="-wavefrontsize32,+wavefrontsize64" -o - < %s | FileCheck -check-prefix=GFX11W64 %s
-
-
-define amdgpu_ps i64 @test_w64(i64 inreg %x, i64 inreg %y) {
-; GFX12W64-LABEL: test_w64:
-; GFX12W64: ; %bb.0:
-; GFX12W64-NEXT: s_nor_b64 s[0:1], s[0:1], s[2:3]
-; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12W64-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; GFX12W64-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v0
-; GFX12W64-NEXT: s_wait_alu 0xf1ff
-; GFX12W64-NEXT: ; return to shader part epilog
-;
-; GFX11W64-LABEL: test_w64:
-; GFX11W64: ; %bb.0:
-; GFX11W64-NEXT: s_nor_b64 s[0:1], s[0:1], s[2:3]
-; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11W64-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; GFX11W64-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v0
-; GFX11W64-NEXT: ; return to shader part epilog
- %x.b = call i1 @llvm.amdgcn.inverse.ballot.i64(i64 %x)
- %y.b = call i1 @llvm.amdgcn.inverse.ballot.i64(i64 %y)
- %t = or i1 %x.b, %y.b
- %t.1 = xor i1 %t, -1
- %z = call i64 @llvm.amdgcn.ballot.i64(i1 %t.1)
- ret i64 %z
-}
-
-define amdgpu_ps i64 @negative_test_w64(i64 inreg %x, i64 inreg %y, ptr addrspace(1) %out) {
-; GFX12W64-LABEL: negative_test_w64:
-; GFX12W64: ; %bb.0:
-; GFX12W64-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
-; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12W64-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; GFX12W64-NEXT: s_xor_b64 s[0:1], s[0:1], -1
-; GFX12W64-NEXT: s_wait_alu 0xfffe
-; GFX12W64-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
-; GFX12W64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12W64-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v1
-; GFX12W64-NEXT: s_add_nc_u64 s[0:1], s[0:1], vcc
-; GFX12W64-NEXT: s_wait_alu 0xfffe
-; GFX12W64-NEXT: ; return to shader part epilog
-;
-; GFX11W64-LABEL: negative_test_w64:
-; GFX11W64: ; %bb.0:
-; GFX11W64-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
-; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11W64-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; GFX11W64-NEXT: s_xor_b64 s[0:1], s[0:1], -1
-; GFX11W64-NEXT: s_waitcnt_depctr 0xfffe
-; GFX11W64-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
-; GFX11W64-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v0
-; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11W64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
-; GFX11W64-NEXT: s_add_u32 s0, vcc_lo, s0
-; GFX11W64-NEXT: s_waitcnt_depctr 0xfffe
-; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W64-NEXT: s_addc_u32 s1, vcc_hi, s1
-; GFX11W64-NEXT: ; return to shader part epilog
- %x.b = call i1 @llvm.amdgcn.inverse.ballot.i64(i64 %x)
- %y.b = call i1 @llvm.amdgcn.inverse.ballot.i64(i64 %y)
- %t = or i1 %x.b, %y.b
- %t.1 = xor i1 %t, -1
- %p.1 = xor i1 %t, -4
- %z = call i64 @llvm.amdgcn.ballot.i64(i1 %t.1)
- %q = call i64 @llvm.amdgcn.ballot.i64(i1 %p.1)
- %r = add i64 %z, %q
- ret i64 %r
-}
-
-define amdgpu_ps void @test_vgpr_w64(<4 x i32> %x, <4 x i32> %y, ptr addrspace(1) %out) {
-; GFX12W64-LABEL: test_vgpr_w64:
-; GFX12W64: ; %bb.0:
-; GFX12W64-NEXT: v_or_b32_e32 v3, v3, v7
-; GFX12W64-NEXT: v_or_b32_e32 v2, v2, v6
-; GFX12W64-NEXT: v_or_b32_e32 v1, v1, v5
-; GFX12W64-NEXT: v_or_b32_e32 v0, v0, v4
-; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12W64-NEXT: v_not_b32_e32 v3, v3
-; GFX12W64-NEXT: v_not_b32_e32 v2, v2
-; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12W64-NEXT: v_not_b32_e32 v1, v1
-; GFX12W64-NEXT: v_not_b32_e32 v0, v0
-; GFX12W64-NEXT: global_store_b128 v[8:9], v[0:3], off
-; GFX12W64-NEXT: s_endpgm
-;
-; GFX11W64-LABEL: test_vgpr_w64:
-; GFX11W64: ; %bb.0:
-; GFX11W64-NEXT: v_or_b32_e32 v3, v3, v7
-; GFX11W64-NEXT: v_or_b32_e32 v2, v2, v6
-; GFX11W64-NEXT: v_or_b32_e32 v1, v1, v5
-; GFX11W64-NEXT: v_or_b32_e32 v0, v0, v4
-; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11W64-NEXT: v_not_b32_e32 v3, v3
-; GFX11W64-NEXT: v_not_b32_e32 v2, v2
-; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11W64-NEXT: v_not_b32_e32 v1, v1
-; GFX11W64-NEXT: v_not_b32_e32 v0, v0
-; GFX11W64-NEXT: global_store_b128 v[8:9], v[0:3], off
-; GFX11W64-NEXT: s_endpgm
- %p = or <4 x i32> %x, %y
- %q = xor <4 x i32> %p, <i32 -1, i32 -1, i32 -1, i32 -1>
- store <4 x i32> %q, ptr addrspace(1) %out
- ret void
-}
diff --git a/llvm/test/CodeGen/AMDGPU/nor-32.ll b/llvm/test/CodeGen/AMDGPU/nor-32.ll
new file mode 100644
index 0000000000000..208b34aeea25c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/nor-32.ll
@@ -0,0 +1,62 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr="+wavefrontsize32,-wavefrontsize64" -o - < %s | FileCheck -check-prefixes=SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr="+wavefrontsize32,-wavefrontsize64" -o - < %s | FileCheck -check-prefixes=GISEL %s
+
+; Use ballot for easy access to lanemask
+
+define amdgpu_ps i32 @test_nor(i32 inreg %a, i32 inreg %b) {
+; SDAG-LABEL: test_nor:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_nor_b32 s0, s0, s1
+; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; SDAG-NEXT: v_cmp_ne_u32_e64 s0, 0, v0
+; SDAG-NEXT: ; return to shader part epilog
+;
+; GISEL-LABEL: test_nor:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_nor_b32 s0, s0, s1
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GISEL-NEXT: s_and_b32 s0, s0, exec_lo
+; GISEL-NEXT: ; return to shader part epilog
+ %a.lanemask = call i1 @llvm.amdgcn.inverse.ballot.i32(i32 %a)
+ %b.lanemask = call i1 @llvm.amdgcn.inverse.ballot.i32(i32 %b)
+ %or = or i1 %a.lanemask, %b.lanemask
+ %xor = xor i1 %or, true
+ %r = call i32 @llvm.amdgcn.ballot.i32(i1 %xor)
+ ret i32 %r
+}
+
+define amdgpu_ps i32 @test_or_two_uses(i32 inreg %a, i32 inreg %b) {
+; SDAG-LABEL: test_or_two_uses:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_or_b32 s0, s0, s1
+; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; SDAG-NEXT: s_xor_b32 s0, s0, -1
+; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
+; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; SDAG-NEXT: v_cmp_ne_u32_e64 s0, 0, v0
+; SDAG-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
+; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; SDAG-NEXT: s_and_b32 s0, vcc_lo, s0
+; SDAG-NEXT: ; return to shader part epilog
+;
+; GISEL-LABEL: test_or_two_uses:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_or_b32 s0, s0, s1
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GISEL-NEXT: s_xor_b32 s1, s0, -1
+; GISEL-NEXT: s_and_b32 s0, s0, exec_lo
+; GISEL-NEXT: s_and_b32 s1, s1, exec_lo
+; GISEL-NEXT: s_and_b32 s0, s1, s0
+; GISEL-NEXT: ; return to shader part epilog
+ %a.lanemask = call i1 @llvm.amdgcn.inverse.ballot.i32(i32 %a)
+ %b.lanemask = call i1 @llvm.amdgcn.inverse.ballot.i32(i32 %b)
+ %or = or i1 %a.lanemask, %b.lanemask
+ %xor = xor i1 %or, true
+ %r0 = call i32 @llvm.amdgcn.ballot.i32(i1 %xor)
+ %r1 = call i32 @llvm.amdgcn.ballot.i32(i1 %or)
+ %r = and i32 %r0, %r1
+ ret i32 %r
+}
diff --git a/llvm/test/CodeGen/AMDGPU/nor-64.ll b/llvm/test/CodeGen/AMDGPU/nor-64.ll
new file mode 100644
index 0000000000000..a55c136f2e434
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/nor-64.ll
@@ -0,0 +1,63 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr="-wavefrontsize32,+wavefrontsize64" -o - < %s | FileCheck -check-prefixes=SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr="-wavefrontsize32,+wavefrontsize64" -o - < %s | FileCheck -check-prefixes=GISEL %s
+
+; Use ballot for easy access to lanemask
+
+define amdgpu_ps i64 @test_nor(i64 inreg %a, i64 inreg %b) {
+; SDAG-LABEL: test_nor:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_nor_b64 s[0:1], s[0:1], s[2:3]
+; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; SDAG-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v0
+; SDAG-NEXT: ; return to shader part epilog
+;
+; GISEL-LABEL: test_nor:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_nor_b64 s[0:1], s[0:1], s[2:3]
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GISEL-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; GISEL-NEXT: ; return to shader part epilog
+ %a.lanemask = call i1 @llvm.amdgcn.inverse.ballot.i64(i64 %a)
+ %b.lanemask = call i1 @llvm.amdgcn.inverse.ballot.i64(i64 %b)
+ %or = or i1 %a.lanemask, %b.lanemask
+ %xor = xor i1 %or, true
+ %r = call i64 @llvm.amdgcn.ballot.i64(i1 %xor)
+ ret i64 %r
+}
+
+define amdgpu_ps i64 @test_or_two_uses(i64 inreg %a, i64 inreg %b) {
+; SDAG-LABEL: test_or_two_uses:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; SDAG-NEXT: s_xor_b64 s[0:1], s[0:1], -1
+; SDAG-NEXT: s_waitcnt_depctr 0xfffe
+; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; SDAG-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v1
+; SDAG-NEXT: s_and_b64 s[0:1], s[0:1], vcc
+; SDAG-NEXT: s_waitcnt_depctr 0xfffe
+; SDAG-NEXT: ; return to shader part epilog
+;
+; GISEL-LABEL: test_or_two_uses:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GISEL-NEXT: s_xor_b64 s[2:3], s[0:1], -1
+; GISEL-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; GISEL-NEXT: s_and_b64 s[2:3], s[2:3], exec
+; GISEL-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1]
+; GISEL-NEXT: ; return to shader part epilog
+ %a.lanemask = call i1 @llvm.amdgcn.inverse.ballot.i64(i64 %a)
+ %b.lanemask = call i1 @llvm.amdgcn.inverse.ballot.i64(i64 %b)
+ %or = or i1 %a.lanemask, %b.lanemask
+ %xor = xor i1 %or, true
+ %r0 = call i64 @llvm.amdgcn.ballot.i64(i1 %xor)
+ %r1 = call i64 @llvm.amdgcn.ballot.i64(i1 %or)
+ %r = and i64 %r0, %r1
+ ret i64 %r
+}
diff --git a/llvm/test/CodeGen/AMDGPU/nor.ll b/llvm/test/CodeGen/AMDGPU/nor.ll
index 530a6e0145e82..059c27e89f743 100644
--- a/llvm/test/CodeGen/AMDGPU/nor.ll
+++ b/llvm/test/CodeGen/AMDGPU/nor.ll
@@ -1,7 +1,8 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx801 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck --check-prefixes=W64,GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=W64,GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx801 -verify-machineinstrs < %s | FileCheck --check-prefixes=W64,GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefixes=W64,GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=W32,GCN %s
; GCN-LABEL: {{^}}scalar_nor_i32_one_use
; GCN: s_nor_b32
@@ -81,3 +82,77 @@ entry:
%r = xor i64 %or, -1
ret i64 %r
}
+
+; GCN-LABEL: {{^}}test_nor_in_control_flow
+
+; W32-NOT: s_nor_b64
+; W32: s_nor_b32
+
+; W64-NOT: s_nor_b32
+; W64: s_nor_b64
+define amdgpu_ps void @test_nor_in_control_flow(ptr addrspace(1) %out, i32 %a) {
+
+entry:
+ %x = icmp ule i32 %a, 0
+ br i1 %x, label %If2, label %MergeCF
+
+If2:
+ %y = icmp ule i32 %a, 1
+ br label %MergeCF
+
+MergeCF:
+ %z = phi i1 [ %x, %entry ], [ %y, %If2 ]
+ %or = or i1 %x, %z
+ br i1 %or, label %If, label %Else
+
+If:
+ %val_A = icmp uge i32 %a, 3
+ br label %exit
+
+Else:
+ %val_B = icmp ult i32 %a, 4
+ br label %exit
+
+exit:
+ %phi = phi i1 [ %val_A, %If ], [ %val_B, %Else ]
+ store i1 %phi, ptr addrspace(1) %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_or_two_uses
+; GCN-NOT: s_nor_b64
+; GCN-NOT: s_nor_b32
+
+; W32: s_or_b32
+; W32: s_xor_b32
+
+; W64: s_or_b64
+; W64: s_xor_b64
+define amdgpu_ps void @test_or_two_uses(ptr addrspace(1) %out, i32 %a) {
+entry:
+ %x = icmp ule i32 %a, 0
+ br i1 %x, label %If2, label %MergeCF
+
+If2:
+ %y = icmp ule i32 %a, 1
+ br label %MergeCF
+
+MergeCF:
+ %z = phi i1 [ %x, %entry ], [ %y, %If2 ]
+ %or = or i1 %x, %z
+ br i1 %or, label %If, label %Else
+
+If:
+ %val_A = icmp uge i32 %a, 1
+ br label %exit
+
+Else:
+ %val_B = icmp ult i32 %a, 4
+ br label %exit
+
+exit:
+ %phi = phi i1 [ %val_A, %If ], [ %val_B, %Else ]
+ %or2 = or i1 %phi, %or
+ store i1 %or2, ptr addrspace(1) %out
+ ret void
+}
>From 8d942ae157f2b7efb56cff549ef9612f42518658 Mon Sep 17 00:00:00 2001
From: Ana Mihajlovic <Ana.Mihajlovic at amd.com>
Date: Tue, 25 Mar 2025 10:10:43 +0100
Subject: [PATCH 4/4] cleaner ballot test
---
llvm/test/CodeGen/AMDGPU/nor-32.ll | 62 ----------
llvm/test/CodeGen/AMDGPU/nor-64.ll | 63 -----------
.../CodeGen/AMDGPU/nor-divergent-lanemask.ll | 107 ++++++++++++++++++
llvm/test/CodeGen/AMDGPU/nor.ll | 85 +-------------
4 files changed, 112 insertions(+), 205 deletions(-)
delete mode 100644 llvm/test/CodeGen/AMDGPU/nor-32.ll
delete mode 100644 llvm/test/CodeGen/AMDGPU/nor-64.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/nor-divergent-lanemask.ll
diff --git a/llvm/test/CodeGen/AMDGPU/nor-32.ll b/llvm/test/CodeGen/AMDGPU/nor-32.ll
deleted file mode 100644
index 208b34aeea25c..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/nor-32.ll
+++ /dev/null
@@ -1,62 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr="+wavefrontsize32,-wavefrontsize64" -o - < %s | FileCheck -check-prefixes=SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr="+wavefrontsize32,-wavefrontsize64" -o - < %s | FileCheck -check-prefixes=GISEL %s
-
-; Use ballot for easy access to lanemask
-
-define amdgpu_ps i32 @test_nor(i32 inreg %a, i32 inreg %b) {
-; SDAG-LABEL: test_nor:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_nor_b32 s0, s0, s1
-; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
-; SDAG-NEXT: v_cmp_ne_u32_e64 s0, 0, v0
-; SDAG-NEXT: ; return to shader part epilog
-;
-; GISEL-LABEL: test_nor:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_nor_b32 s0, s0, s1
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GISEL-NEXT: s_and_b32 s0, s0, exec_lo
-; GISEL-NEXT: ; return to shader part epilog
- %a.lanemask = call i1 @llvm.amdgcn.inverse.ballot.i32(i32 %a)
- %b.lanemask = call i1 @llvm.amdgcn.inverse.ballot.i32(i32 %b)
- %or = or i1 %a.lanemask, %b.lanemask
- %xor = xor i1 %or, true
- %r = call i32 @llvm.amdgcn.ballot.i32(i1 %xor)
- ret i32 %r
-}
-
-define amdgpu_ps i32 @test_or_two_uses(i32 inreg %a, i32 inreg %b) {
-; SDAG-LABEL: test_or_two_uses:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_or_b32 s0, s0, s1
-; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
-; SDAG-NEXT: s_xor_b32 s0, s0, -1
-; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
-; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; SDAG-NEXT: v_cmp_ne_u32_e64 s0, 0, v0
-; SDAG-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
-; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; SDAG-NEXT: s_and_b32 s0, vcc_lo, s0
-; SDAG-NEXT: ; return to shader part epilog
-;
-; GISEL-LABEL: test_or_two_uses:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_or_b32 s0, s0, s1
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_xor_b32 s1, s0, -1
-; GISEL-NEXT: s_and_b32 s0, s0, exec_lo
-; GISEL-NEXT: s_and_b32 s1, s1, exec_lo
-; GISEL-NEXT: s_and_b32 s0, s1, s0
-; GISEL-NEXT: ; return to shader part epilog
- %a.lanemask = call i1 @llvm.amdgcn.inverse.ballot.i32(i32 %a)
- %b.lanemask = call i1 @llvm.amdgcn.inverse.ballot.i32(i32 %b)
- %or = or i1 %a.lanemask, %b.lanemask
- %xor = xor i1 %or, true
- %r0 = call i32 @llvm.amdgcn.ballot.i32(i1 %xor)
- %r1 = call i32 @llvm.amdgcn.ballot.i32(i1 %or)
- %r = and i32 %r0, %r1
- ret i32 %r
-}
diff --git a/llvm/test/CodeGen/AMDGPU/nor-64.ll b/llvm/test/CodeGen/AMDGPU/nor-64.ll
deleted file mode 100644
index a55c136f2e434..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/nor-64.ll
+++ /dev/null
@@ -1,63 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr="-wavefrontsize32,+wavefrontsize64" -o - < %s | FileCheck -check-prefixes=SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr="-wavefrontsize32,+wavefrontsize64" -o - < %s | FileCheck -check-prefixes=GISEL %s
-
-; Use ballot for easy access to lanemask
-
-define amdgpu_ps i64 @test_nor(i64 inreg %a, i64 inreg %b) {
-; SDAG-LABEL: test_nor:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_nor_b64 s[0:1], s[0:1], s[2:3]
-; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; SDAG-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v0
-; SDAG-NEXT: ; return to shader part epilog
-;
-; GISEL-LABEL: test_nor:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_nor_b64 s[0:1], s[0:1], s[2:3]
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GISEL-NEXT: s_and_b64 s[0:1], s[0:1], exec
-; GISEL-NEXT: ; return to shader part epilog
- %a.lanemask = call i1 @llvm.amdgcn.inverse.ballot.i64(i64 %a)
- %b.lanemask = call i1 @llvm.amdgcn.inverse.ballot.i64(i64 %b)
- %or = or i1 %a.lanemask, %b.lanemask
- %xor = xor i1 %or, true
- %r = call i64 @llvm.amdgcn.ballot.i64(i1 %xor)
- ret i64 %r
-}
-
-define amdgpu_ps i64 @test_or_two_uses(i64 inreg %a, i64 inreg %b) {
-; SDAG-LABEL: test_or_two_uses:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
-; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; SDAG-NEXT: s_xor_b64 s[0:1], s[0:1], -1
-; SDAG-NEXT: s_waitcnt_depctr 0xfffe
-; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
-; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; SDAG-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v1
-; SDAG-NEXT: s_and_b64 s[0:1], s[0:1], vcc
-; SDAG-NEXT: s_waitcnt_depctr 0xfffe
-; SDAG-NEXT: ; return to shader part epilog
-;
-; GISEL-LABEL: test_or_two_uses:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_xor_b64 s[2:3], s[0:1], -1
-; GISEL-NEXT: s_and_b64 s[0:1], s[0:1], exec
-; GISEL-NEXT: s_and_b64 s[2:3], s[2:3], exec
-; GISEL-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1]
-; GISEL-NEXT: ; return to shader part epilog
- %a.lanemask = call i1 @llvm.amdgcn.inverse.ballot.i64(i64 %a)
- %b.lanemask = call i1 @llvm.amdgcn.inverse.ballot.i64(i64 %b)
- %or = or i1 %a.lanemask, %b.lanemask
- %xor = xor i1 %or, true
- %r0 = call i64 @llvm.amdgcn.ballot.i64(i1 %xor)
- %r1 = call i64 @llvm.amdgcn.ballot.i64(i1 %or)
- %r = and i64 %r0, %r1
- ret i64 %r
-}
diff --git a/llvm/test/CodeGen/AMDGPU/nor-divergent-lanemask.ll b/llvm/test/CodeGen/AMDGPU/nor-divergent-lanemask.ll
new file mode 100644
index 0000000000000..c11945f808ccd
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/nor-divergent-lanemask.ll
@@ -0,0 +1,107 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr="-wavefrontsize32,+wavefrontsize64" -o - < %s | FileCheck -check-prefixes=SDAG-W64 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr="-wavefrontsize32,+wavefrontsize64" -o - < %s | FileCheck -check-prefixes=GISEL-W64 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr="+wavefrontsize32,-wavefrontsize64" -o - < %s | FileCheck -check-prefixes=SDAG-W32 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr="+wavefrontsize32,-wavefrontsize64" -o - < %s | FileCheck -check-prefixes=GISEL-W32 %s
+
+; Use inverse.ballot/ballot to move lane masks in and out of divergent i1 values.
+
+define amdgpu_ps i64 @test_nor(i64 inreg %a, i64 inreg %b) {
+; SDAG-W64-LABEL: test_nor:
+; SDAG-W64: ; %bb.0:
+; SDAG-W64-NEXT: s_nor_b64 s[0:1], s[0:1], s[2:3]
+; SDAG-W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; SDAG-W64-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; SDAG-W64-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v0
+; SDAG-W64-NEXT: ; return to shader part epilog
+;
+; GISEL-W64-LABEL: test_nor:
+; GISEL-W64: ; %bb.0:
+; GISEL-W64-NEXT: s_nor_b64 s[0:1], s[0:1], s[2:3]
+; GISEL-W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GISEL-W64-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; GISEL-W64-NEXT: ; return to shader part epilog
+;
+; SDAG-W32-LABEL: test_nor:
+; SDAG-W32: ; %bb.0:
+; SDAG-W32-NEXT: s_nor_b32 s0, s0, s2
+; SDAG-W32-NEXT: s_mov_b32 s1, 0
+; SDAG-W32-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; SDAG-W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; SDAG-W32-NEXT: v_cmp_ne_u32_e64 s0, 0, v0
+; SDAG-W32-NEXT: ; return to shader part epilog
+;
+; GISEL-W32-LABEL: test_nor:
+; GISEL-W32: ; %bb.0:
+; GISEL-W32-NEXT: s_nor_b32 s0, s0, s2
+; GISEL-W32-NEXT: s_mov_b32 s1, 0
+; GISEL-W32-NEXT: s_and_b32 s0, s0, exec_lo
+; GISEL-W32-NEXT: ; return to shader part epilog
+ %a.lanemask = call i1 @llvm.amdgcn.inverse.ballot.i64(i64 %a)
+ %b.lanemask = call i1 @llvm.amdgcn.inverse.ballot.i64(i64 %b)
+ %or = or i1 %a.lanemask, %b.lanemask
+ %xor = xor i1 %or, true
+ %r = call i64 @llvm.amdgcn.ballot.i64(i1 %xor)
+ ret i64 %r
+}
+
+define amdgpu_ps i64 @test_or_two_uses(i64 inreg %a, i64 inreg %b) {
+; SDAG-W64-LABEL: test_or_two_uses:
+; SDAG-W64: ; %bb.0:
+; SDAG-W64-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; SDAG-W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; SDAG-W64-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; SDAG-W64-NEXT: s_xor_b64 s[0:1], s[0:1], -1
+; SDAG-W64-NEXT: s_waitcnt_depctr 0xfffe
+; SDAG-W64-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; SDAG-W64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; SDAG-W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; SDAG-W64-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v1
+; SDAG-W64-NEXT: s_and_b64 s[0:1], s[0:1], vcc
+; SDAG-W64-NEXT: s_waitcnt_depctr 0xfffe
+; SDAG-W64-NEXT: ; return to shader part epilog
+;
+; GISEL-W64-LABEL: test_or_two_uses:
+; GISEL-W64: ; %bb.0:
+; GISEL-W64-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GISEL-W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GISEL-W64-NEXT: s_xor_b64 s[2:3], s[0:1], -1
+; GISEL-W64-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; GISEL-W64-NEXT: s_and_b64 s[2:3], s[2:3], exec
+; GISEL-W64-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1]
+; GISEL-W64-NEXT: ; return to shader part epilog
+;
+; SDAG-W32-LABEL: test_or_two_uses:
+; SDAG-W32: ; %bb.0:
+; SDAG-W32-NEXT: s_or_b32 s0, s0, s2
+; SDAG-W32-NEXT: s_mov_b32 s3, 0
+; SDAG-W32-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; SDAG-W32-NEXT: s_xor_b32 s0, s0, -1
+; SDAG-W32-NEXT: s_mov_b32 s1, s3
+; SDAG-W32-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
+; SDAG-W32-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; SDAG-W32-NEXT: v_cmp_ne_u32_e64 s0, 0, v0
+; SDAG-W32-NEXT: v_cmp_ne_u32_e64 s2, 0, v1
+; SDAG-W32-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1]
+; SDAG-W32-NEXT: ; return to shader part epilog
+;
+; GISEL-W32-LABEL: test_or_two_uses:
+; GISEL-W32: ; %bb.0:
+; GISEL-W32-NEXT: s_or_b32 s0, s0, s2
+; GISEL-W32-NEXT: s_mov_b32 s1, 0
+; GISEL-W32-NEXT: s_xor_b32 s4, s0, -1
+; GISEL-W32-NEXT: s_and_b32 s2, s0, exec_lo
+; GISEL-W32-NEXT: s_mov_b32 s3, s1
+; GISEL-W32-NEXT: s_and_b32 s0, s4, exec_lo
+; GISEL-W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GISEL-W32-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
+; GISEL-W32-NEXT: ; return to shader part epilog
+ %a.lanemask = call i1 @llvm.amdgcn.inverse.ballot.i64(i64 %a)
+ %b.lanemask = call i1 @llvm.amdgcn.inverse.ballot.i64(i64 %b)
+ %or = or i1 %a.lanemask, %b.lanemask
+ %xor = xor i1 %or, true
+ %r0 = call i64 @llvm.amdgcn.ballot.i64(i1 %xor)
+ %r1 = call i64 @llvm.amdgcn.ballot.i64(i1 %or)
+ %r = and i64 %r0, %r1
+ ret i64 %r
+}
diff --git a/llvm/test/CodeGen/AMDGPU/nor.ll b/llvm/test/CodeGen/AMDGPU/nor.ll
index 059c27e89f743..8135ac75bcda7 100644
--- a/llvm/test/CodeGen/AMDGPU/nor.ll
+++ b/llvm/test/CodeGen/AMDGPU/nor.ll
@@ -1,8 +1,7 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck --check-prefixes=W64,GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=W64,GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx801 -verify-machineinstrs < %s | FileCheck --check-prefixes=W64,GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefixes=W64,GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=W32,GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx801 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s
; GCN-LABEL: {{^}}scalar_nor_i32_one_use
; GCN: s_nor_b32
@@ -81,78 +80,4 @@ entry:
%or = or i64 %a, %b
%r = xor i64 %or, -1
ret i64 %r
-}
-
-; GCN-LABEL: {{^}}test_nor_in_control_flow
-
-; W32-NOT: s_nor_b64
-; W32: s_nor_b32
-
-; W64-NOT: s_nor_b32
-; W64: s_nor_b64
-define amdgpu_ps void @test_nor_in_control_flow(ptr addrspace(1) %out, i32 %a) {
-
-entry:
- %x = icmp ule i32 %a, 0
- br i1 %x, label %If2, label %MergeCF
-
-If2:
- %y = icmp ule i32 %a, 1
- br label %MergeCF
-
-MergeCF:
- %z = phi i1 [ %x, %entry ], [ %y, %If2 ]
- %or = or i1 %x, %z
- br i1 %or, label %If, label %Else
-
-If:
- %val_A = icmp uge i32 %a, 3
- br label %exit
-
-Else:
- %val_B = icmp ult i32 %a, 4
- br label %exit
-
-exit:
- %phi = phi i1 [ %val_A, %If ], [ %val_B, %Else ]
- store i1 %phi, ptr addrspace(1) %out
- ret void
-}
-
-; GCN-LABEL: {{^}}test_or_two_uses
-; GCN-NOT: s_nor_b64
-; GCN-NOT: s_nor_b32
-
-; W32: s_or_b32
-; W32: s_xor_b32
-
-; W64: s_or_b64
-; W64: s_xor_b64
-define amdgpu_ps void @test_or_two_uses(ptr addrspace(1) %out, i32 %a) {
-entry:
- %x = icmp ule i32 %a, 0
- br i1 %x, label %If2, label %MergeCF
-
-If2:
- %y = icmp ule i32 %a, 1
- br label %MergeCF
-
-MergeCF:
- %z = phi i1 [ %x, %entry ], [ %y, %If2 ]
- %or = or i1 %x, %z
- br i1 %or, label %If, label %Else
-
-If:
- %val_A = icmp uge i32 %a, 1
- br label %exit
-
-Else:
- %val_B = icmp ult i32 %a, 4
- br label %exit
-
-exit:
- %phi = phi i1 [ %val_A, %If ], [ %val_B, %Else ]
- %or2 = or i1 %phi, %or
- store i1 %or2, ptr addrspace(1) %out
- ret void
-}
+}
More information about the llvm-commits
mailing list