[llvm] [AMDGPU] (xor (cmp_eq x, 1), -1) -> cmp_neq x, 1 (PR #133698)
Ana Mihajlovic via llvm-commits
llvm-commits at lists.llvm.org
Mon Mar 31 07:35:07 PDT 2025
https://github.com/mihajlovicana updated https://github.com/llvm/llvm-project/pull/133698
>From 0a5c904e7be01e4bca7478fbc045a6bdc6a73cc2 Mon Sep 17 00:00:00 2001
From: Ana Mihajlovic <Ana.Mihajlovic at amd.com>
Date: Mon, 31 Mar 2025 13:00:40 +0200
Subject: [PATCH 1/2] [AMDGPU] (xor (cmp_eq x, 1), -1) -> cmp_neq x, 1
---
llvm/lib/Target/AMDGPU/SIInstructions.td | 12 ++
.../codegen-prepare-addrspacecast-non-null.ll | 16 +-
llvm/test/CodeGen/AMDGPU/extract-subvector.ll | 49 ++---
...-copies-phi-block-end-iterator-debugloc.ll | 3 +-
llvm/test/CodeGen/AMDGPU/function-args.ll | 14 +-
.../identical-subrange-spill-infloop.ll | 181 +++++++++---------
.../AMDGPU/kill-true-in-return-block.ll | 5 +-
.../AMDGPU/lds-global-non-entry-func.ll | 21 +-
.../test/CodeGen/AMDGPU/loop_exit_with_xor.ll | 12 +-
...p-var-out-of-divergent-loop-swdev407790.ll | 23 ++-
.../si-annotate-nested-control-flows.ll | 63 +++---
.../si-optimize-vgpr-live-range-dbg-instr.ll | 7 +-
12 files changed, 202 insertions(+), 204 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 9051db0c01ed1..f9024ae4f1cf6 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3123,6 +3123,18 @@ def IMMBitSelConst : SDNodeXForm<imm, [{
// v_cmp_ne_u32_e64 $a, 0, $a
// Handle the VALU case.
+def : GCNPat <
+ (i1 (xor (i1 (DivergentUnaryFrag<trunc> i32:$a)), -1)),
+ (V_CMP_NE_U32_e64 (V_AND_B32_e64 (i32 1), i32:$a),
+ (i32 1))
+>;
+
+def : GCNPat <
+ (i1 (xor (i1 (DivergentUnaryFrag<trunc> i64:$a)), -1)),
+ (V_CMP_NE_U32_e64 (V_AND_B32_e64 (i32 1),
+ (i32 (EXTRACT_SUBREG $a, sub0))), (i32 1))
+>;
+
def : GCNPat <
(i1 (DivergentUnaryFrag<trunc> (i32 (srl i32:$a, (i32 imm:$b))))),
(V_CMP_NE_U32_e64 (V_AND_B32_e64 (i32 (IMMBitSelConst $b)), $a),
diff --git a/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll b/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll
index 3216e71e6221a..861319bf560fa 100644
--- a/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll
+++ b/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll
@@ -207,22 +207,22 @@ define void @recursive_phis(i1 %cond, ptr addrspace(5) %ptr) {
; DAGISEL-ASM: ; %bb.0: ; %entry
; DAGISEL-ASM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; DAGISEL-ASM-NEXT: v_and_b32_e32 v0, 1, v0
-; DAGISEL-ASM-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; DAGISEL-ASM-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
+; DAGISEL-ASM-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0
; DAGISEL-ASM-NEXT: v_lshrrev_b32_e64 v0, 6, s32
-; DAGISEL-ASM-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; DAGISEL-ASM-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
; DAGISEL-ASM-NEXT: ; %bb.1: ; %then
; DAGISEL-ASM-NEXT: v_and_b32_e32 v0, 0xffff, v1
; DAGISEL-ASM-NEXT: ; %bb.2: ; %finallyendcf.split
-; DAGISEL-ASM-NEXT: s_or_b64 exec, exec, s[4:5]
-; DAGISEL-ASM-NEXT: s_xor_b64 s[6:7], vcc, -1
+; DAGISEL-ASM-NEXT: s_or_b64 exec, exec, s[6:7]
; DAGISEL-ASM-NEXT: s_mov_b64 s[4:5], 0
-; DAGISEL-ASM-NEXT: s_mov_b64 s[8:9], src_private_base
+; DAGISEL-ASM-NEXT: s_mov_b64 s[6:7], src_private_base
; DAGISEL-ASM-NEXT: v_mov_b32_e32 v2, 7
; DAGISEL-ASM-NEXT: .LBB7_3: ; %finally
; DAGISEL-ASM-NEXT: ; =>This Inner Loop Header: Depth=1
-; DAGISEL-ASM-NEXT: s_and_b64 s[10:11], exec, s[6:7]
-; DAGISEL-ASM-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5]
-; DAGISEL-ASM-NEXT: v_mov_b32_e32 v1, s9
+; DAGISEL-ASM-NEXT: s_and_b64 s[8:9], exec, vcc
+; DAGISEL-ASM-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5]
+; DAGISEL-ASM-NEXT: v_mov_b32_e32 v1, s7
; DAGISEL-ASM-NEXT: flat_store_dword v[0:1], v2
; DAGISEL-ASM-NEXT: s_waitcnt vmcnt(0)
; DAGISEL-ASM-NEXT: s_andn2_b64 exec, exec, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/extract-subvector.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector.ll
index 61c0b8b861d5b..41082821bafe3 100644
--- a/llvm/test/CodeGen/AMDGPU/extract-subvector.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract-subvector.ll
@@ -6,12 +6,11 @@ define <2 x i16> @extract_2xi16(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_and_b32_e32 v4, 1, v4
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
-; GCN-NEXT: s_xor_b64 s[4:5], vcc, -1
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4
; GCN-NEXT: ; implicit-def: $vgpr5
; GCN-NEXT: ; implicit-def: $vgpr4
-; GCN-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
-; GCN-NEXT: s_xor_b64 s[4:5], exec, s[6:7]
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN-NEXT: s_cbranch_execz .LBB0_2
; GCN-NEXT: ; %bb.1: ; %F
; GCN-NEXT: s_mov_b32 s10, 0
@@ -101,11 +100,10 @@ define <2 x i64> @extract_2xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_and_b32_e32 v4, 1, v4
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
-; GCN-NEXT: s_xor_b64 s[4:5], vcc, -1
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4
; GCN-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19
-; GCN-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
-; GCN-NEXT: s_xor_b64 s[4:5], exec, s[6:7]
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN-NEXT: s_cbranch_execz .LBB1_2
; GCN-NEXT: ; %bb.1: ; %F
; GCN-NEXT: s_mov_b32 s10, 0
@@ -172,11 +170,10 @@ define <4 x i64> @extract_4xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_and_b32_e32 v4, 1, v4
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
-; GCN-NEXT: s_xor_b64 s[4:5], vcc, -1
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4
; GCN-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19
-; GCN-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
-; GCN-NEXT: s_xor_b64 s[4:5], exec, s[6:7]
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN-NEXT: s_cbranch_execz .LBB2_2
; GCN-NEXT: ; %bb.1: ; %F
; GCN-NEXT: s_mov_b32 s10, 0
@@ -249,11 +246,10 @@ define <8 x i64> @extract_8xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_and_b32_e32 v4, 1, v4
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
-; GCN-NEXT: s_xor_b64 s[4:5], vcc, -1
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4
; GCN-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35
-; GCN-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
-; GCN-NEXT: s_xor_b64 s[4:5], exec, s[6:7]
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN-NEXT: s_cbranch_execz .LBB3_2
; GCN-NEXT: ; %bb.1: ; %F
; GCN-NEXT: s_mov_b32 s10, 0
@@ -353,11 +349,10 @@ define <2 x double> @extract_2xf64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_and_b32_e32 v4, 1, v4
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
-; GCN-NEXT: s_xor_b64 s[4:5], vcc, -1
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4
; GCN-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19
-; GCN-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
-; GCN-NEXT: s_xor_b64 s[4:5], exec, s[6:7]
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN-NEXT: s_cbranch_execz .LBB4_2
; GCN-NEXT: ; %bb.1: ; %F
; GCN-NEXT: s_mov_b32 s10, 0
@@ -424,11 +419,10 @@ define <4 x double> @extract_4xf64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_and_b32_e32 v4, 1, v4
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
-; GCN-NEXT: s_xor_b64 s[4:5], vcc, -1
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4
; GCN-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19
-; GCN-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
-; GCN-NEXT: s_xor_b64 s[4:5], exec, s[6:7]
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN-NEXT: s_cbranch_execz .LBB5_2
; GCN-NEXT: ; %bb.1: ; %F
; GCN-NEXT: s_mov_b32 s10, 0
@@ -501,11 +495,10 @@ define <8 x double> @extract_8xf64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_and_b32_e32 v4, 1, v4
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
-; GCN-NEXT: s_xor_b64 s[4:5], vcc, -1
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4
; GCN-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35
-; GCN-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
-; GCN-NEXT: s_xor_b64 s[4:5], exec, s[6:7]
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN-NEXT: s_cbranch_execz .LBB6_2
; GCN-NEXT: ; %bb.1: ; %F
; GCN-NEXT: s_mov_b32 s10, 0
diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-phi-block-end-iterator-debugloc.ll b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-phi-block-end-iterator-debugloc.ll
index 13184cf17a2e5..fd64ea3ae1c4b 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-phi-block-end-iterator-debugloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-phi-block-end-iterator-debugloc.ll
@@ -6,8 +6,7 @@ define i32 @rocrand_regression(ptr addrspace(1) %arg, i32 %arg0, i1 %cmp7) {
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_and_b32_e32 v0, 1, v3
-; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; CHECK-NEXT: s_xor_b64 s[4:5], vcc, -1
+; CHECK-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v0
; CHECK-NEXT: s_mov_b32 s8, 0
; CHECK-NEXT: .LBB0_1: ; %do.body
; CHECK-NEXT: ; =>This Loop Header: Depth=1
diff --git a/llvm/test/CodeGen/AMDGPU/function-args.ll b/llvm/test/CodeGen/AMDGPU/function-args.ll
index 8ee52a828de65..d0a3811314029 100644
--- a/llvm/test/CodeGen/AMDGPU/function-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-args.ll
@@ -102,9 +102,8 @@ define void @i1_arg_i1_use(i1 %arg) #0 {
; CIGFX89: ; %bb.0: ; %bb
; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CIGFX89-NEXT: v_and_b32_e32 v0, 1, v0
-; CIGFX89-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; CIGFX89-NEXT: s_xor_b64 s[6:7], vcc, -1
-; CIGFX89-NEXT: s_and_saveexec_b64 s[4:5], s[6:7]
+; CIGFX89-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0
+; CIGFX89-NEXT: s_and_saveexec_b64 s[4:5], vcc
; CIGFX89-NEXT: s_cbranch_execz .LBB3_2
; CIGFX89-NEXT: ; %bb.1: ; %bb1
; CIGFX89-NEXT: s_mov_b32 s7, 0xf000
@@ -120,15 +119,14 @@ define void @i1_arg_i1_use(i1 %arg) #0 {
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT: s_mov_b32 s2, -1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT: s_xor_b32 s1, vcc_lo, -1
-; GFX11-NEXT: s_and_saveexec_b32 s0, s1
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cmpx_ne_u32_e32 1, v0
; GFX11-NEXT: s_cbranch_execz .LBB3_2
; GFX11-NEXT: ; %bb.1: ; %bb1
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: .LBB3_2: ; %bb2
diff --git a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
index 8dbd6c5d133ea..56ceba258f471 100644
--- a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
+++ b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
@@ -11,37 +11,47 @@ define void @main(i1 %arg) #0 {
; CHECK-NEXT: s_mov_b64 exec, s[4:5]
; CHECK-NEXT: v_writelane_b32 v5, s30, 0
; CHECK-NEXT: v_writelane_b32 v5, s31, 1
-; CHECK-NEXT: v_writelane_b32 v5, s34, 2
-; CHECK-NEXT: v_writelane_b32 v5, s35, 3
-; CHECK-NEXT: v_writelane_b32 v5, s36, 4
-; CHECK-NEXT: v_writelane_b32 v5, s37, 5
-; CHECK-NEXT: v_writelane_b32 v5, s38, 6
+; CHECK-NEXT: v_writelane_b32 v5, s36, 2
+; CHECK-NEXT: v_writelane_b32 v5, s37, 3
+; CHECK-NEXT: v_writelane_b32 v5, s38, 4
+; CHECK-NEXT: v_writelane_b32 v5, s39, 5
+; CHECK-NEXT: v_writelane_b32 v5, s48, 6
+; CHECK-NEXT: v_writelane_b32 v5, s49, 7
+; CHECK-NEXT: v_writelane_b32 v5, s50, 8
+; CHECK-NEXT: v_writelane_b32 v5, s51, 9
+; CHECK-NEXT: v_writelane_b32 v5, s52, 10
+; CHECK-NEXT: v_writelane_b32 v5, s53, 11
+; CHECK-NEXT: v_writelane_b32 v5, s54, 12
+; CHECK-NEXT: v_writelane_b32 v5, s55, 13
; CHECK-NEXT: s_getpc_b64 s[24:25]
-; CHECK-NEXT: v_writelane_b32 v5, s39, 7
-; CHECK-NEXT: s_movk_i32 s20, 0xf0
-; CHECK-NEXT: s_mov_b32 s21, s24
-; CHECK-NEXT: v_writelane_b32 v5, s48, 8
-; CHECK-NEXT: s_load_dwordx16 s[4:19], s[20:21], 0x0
-; CHECK-NEXT: s_mov_b64 s[20:21], 0
-; CHECK-NEXT: v_writelane_b32 v5, s49, 9
-; CHECK-NEXT: s_load_dwordx4 s[20:23], s[20:21], 0x0
-; CHECK-NEXT: v_writelane_b32 v5, s50, 10
+; CHECK-NEXT: v_writelane_b32 v5, s64, 14
+; CHECK-NEXT: s_movk_i32 s4, 0xf0
+; CHECK-NEXT: s_mov_b32 s5, s24
+; CHECK-NEXT: v_writelane_b32 v5, s65, 15
+; CHECK-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
+; CHECK-NEXT: s_mov_b64 s[4:5], 0
+; CHECK-NEXT: v_writelane_b32 v5, s66, 16
+; CHECK-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0
+; CHECK-NEXT: v_writelane_b32 v5, s67, 17
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: s_movk_i32 s22, 0x130
-; CHECK-NEXT: s_mov_b32 s23, s24
-; CHECK-NEXT: v_writelane_b32 v5, s51, 11
-; CHECK-NEXT: s_load_dwordx16 s[36:51], s[22:23], 0x0
-; CHECK-NEXT: s_mov_b32 s28, 0
+; CHECK-NEXT: s_movk_i32 s6, 0x130
+; CHECK-NEXT: s_mov_b32 s7, s24
+; CHECK-NEXT: v_writelane_b32 v5, s68, 18
+; CHECK-NEXT: s_load_dwordx16 s[36:51], s[6:7], 0x0
+; CHECK-NEXT: v_writelane_b32 v5, s69, 19
+; CHECK-NEXT: v_writelane_b32 v5, s70, 20
+; CHECK-NEXT: s_mov_b32 s68, 0
; CHECK-NEXT: v_mov_b32_e32 v1, 0
-; CHECK-NEXT: v_mov_b32_e32 v2, s20
+; CHECK-NEXT: v_writelane_b32 v5, s71, 21
+; CHECK-NEXT: v_mov_b32_e32 v2, s4
; CHECK-NEXT: v_mov_b32_e32 v3, v1
-; CHECK-NEXT: s_mov_b32 s29, s28
-; CHECK-NEXT: s_mov_b32 s30, s28
-; CHECK-NEXT: s_mov_b32 s31, s28
-; CHECK-NEXT: image_sample_lz v3, v[2:3], s[12:19], s[28:31] dmask:0x1
+; CHECK-NEXT: s_mov_b32 s69, s68
+; CHECK-NEXT: s_mov_b32 s70, s68
+; CHECK-NEXT: s_mov_b32 s71, s68
+; CHECK-NEXT: image_sample_lz v3, v[2:3], s[16:23], s[68:71] dmask:0x1
; CHECK-NEXT: v_mov_b32_e32 v2, v1
; CHECK-NEXT: ; implicit-def: $vgpr6 : SGPR spill to VGPR lane
-; CHECK-NEXT: v_writelane_b32 v5, s52, 12
+; CHECK-NEXT: s_mov_b32 s6, 48
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_writelane_b32 v6, s36, 0
; CHECK-NEXT: v_writelane_b32 v6, s37, 1
@@ -49,57 +59,44 @@ define void @main(i1 %arg) #0 {
; CHECK-NEXT: v_writelane_b32 v6, s39, 3
; CHECK-NEXT: v_writelane_b32 v6, s40, 4
; CHECK-NEXT: v_writelane_b32 v6, s41, 5
-; CHECK-NEXT: image_sample_lz v4, v[1:2], s[36:43], s[28:31] dmask:0x1
+; CHECK-NEXT: image_sample_lz v4, v[1:2], s[36:43], s[68:71] dmask:0x1
; CHECK-NEXT: v_writelane_b32 v6, s42, 6
; CHECK-NEXT: v_writelane_b32 v6, s43, 7
; CHECK-NEXT: v_writelane_b32 v6, s44, 8
; CHECK-NEXT: v_writelane_b32 v6, s45, 9
-; CHECK-NEXT: v_writelane_b32 v5, s53, 13
; CHECK-NEXT: v_writelane_b32 v6, s46, 10
-; CHECK-NEXT: v_writelane_b32 v5, s54, 14
; CHECK-NEXT: v_writelane_b32 v6, s47, 11
-; CHECK-NEXT: v_writelane_b32 v5, s55, 15
; CHECK-NEXT: v_writelane_b32 v6, s48, 12
-; CHECK-NEXT: v_writelane_b32 v5, s64, 16
; CHECK-NEXT: v_writelane_b32 v6, s49, 13
-; CHECK-NEXT: v_writelane_b32 v5, s65, 17
; CHECK-NEXT: v_writelane_b32 v6, s50, 14
-; CHECK-NEXT: v_writelane_b32 v5, s66, 18
-; CHECK-NEXT: v_writelane_b32 v6, s51, 15
-; CHECK-NEXT: s_mov_b32 s40, 48
; CHECK-NEXT: s_movk_i32 s56, 0x1f0
-; CHECK-NEXT: s_movk_i32 s34, 0x2f0
-; CHECK-NEXT: s_mov_b32 s41, s24
+; CHECK-NEXT: s_movk_i32 s72, 0x2f0
; CHECK-NEXT: s_mov_b32 s57, s24
-; CHECK-NEXT: s_mov_b32 s35, s24
-; CHECK-NEXT: v_writelane_b32 v5, s67, 19
-; CHECK-NEXT: s_load_dwordx8 s[20:27], s[40:41], 0x0
-; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_mov_b32 s73, s24
+; CHECK-NEXT: v_writelane_b32 v6, s51, 15
+; CHECK-NEXT: s_load_dwordx8 s[24:31], s[6:7], 0x0
; CHECK-NEXT: s_load_dwordx16 s[36:51], s[56:57], 0x0
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
-; CHECK-NEXT: s_load_dwordx16 s[52:67], s[34:35], 0x0
-; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; CHECK-NEXT: v_writelane_b32 v5, s68, 20
-; CHECK-NEXT: s_xor_b64 s[72:73], vcc, -1
-; CHECK-NEXT: v_writelane_b32 v5, s69, 21
+; CHECK-NEXT: s_load_dwordx16 s[52:67], s[72:73], 0x0
+; CHECK-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v0
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mul_f32_e32 v0, v4, v3
-; CHECK-NEXT: s_and_saveexec_b64 vcc, s[72:73]
-; CHECK-NEXT: s_xor_b64 s[34:35], exec, vcc
+; CHECK-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
+; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
; CHECK-NEXT: s_cbranch_execz .LBB0_3
; CHECK-NEXT: ; %bb.1: ; %bb48
-; CHECK-NEXT: image_sample_lz v3, v[1:2], s[12:19], s[28:31] dmask:0x1
+; CHECK-NEXT: image_sample_lz v3, v[1:2], s[16:23], s[68:71] dmask:0x1
; CHECK-NEXT: v_mov_b32_e32 v2, 0
; CHECK-NEXT: s_and_b64 vcc, exec, -1
; CHECK-NEXT: .LBB0_2: ; %bb50
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: s_mov_b32 s29, s28
-; CHECK-NEXT: s_mov_b32 s30, s28
-; CHECK-NEXT: s_mov_b32 s31, s28
+; CHECK-NEXT: s_mov_b32 s69, s68
+; CHECK-NEXT: s_mov_b32 s70, s68
+; CHECK-NEXT: s_mov_b32 s71, s68
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: image_sample_lz v4, v[1:2], s[44:51], s[24:27] dmask:0x1
+; CHECK-NEXT: image_sample_lz v4, v[1:2], s[44:51], s[28:31] dmask:0x1
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: image_sample_lz v1, v[1:2], s[60:67], s[28:31] dmask:0x1
+; CHECK-NEXT: image_sample_lz v1, v[1:2], s[60:67], s[68:71] dmask:0x1
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_sub_f32_e32 v1, v1, v4
; CHECK-NEXT: v_mul_f32_e32 v1, v1, v0
@@ -107,11 +104,11 @@ define void @main(i1 %arg) #0 {
; CHECK-NEXT: s_mov_b64 vcc, vcc
; CHECK-NEXT: s_cbranch_vccnz .LBB0_2
; CHECK-NEXT: .LBB0_3: ; %Flow14
-; CHECK-NEXT: s_andn2_saveexec_b64 s[12:13], s[34:35]
+; CHECK-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7]
; CHECK-NEXT: s_cbranch_execz .LBB0_10
; CHECK-NEXT: ; %bb.4: ; %bb32
-; CHECK-NEXT: s_and_saveexec_b64 s[14:15], s[72:73]
-; CHECK-NEXT: s_xor_b64 s[14:15], exec, s[14:15]
+; CHECK-NEXT: s_and_saveexec_b64 s[16:17], s[4:5]
+; CHECK-NEXT: s_xor_b64 s[4:5], exec, s[16:17]
; CHECK-NEXT: s_cbranch_execz .LBB0_6
; CHECK-NEXT: ; %bb.5: ; %bb43
; CHECK-NEXT: s_mov_b32 s16, 0
@@ -120,12 +117,12 @@ define void @main(i1 %arg) #0 {
; CHECK-NEXT: v_mov_b32_e32 v3, s17
; CHECK-NEXT: s_mov_b32 s18, s16
; CHECK-NEXT: s_mov_b32 s19, s16
-; CHECK-NEXT: image_sample_lz v1, v[2:3], s[4:11], s[16:19] dmask:0x1
+; CHECK-NEXT: image_sample_lz v1, v[2:3], s[8:15], s[16:19] dmask:0x1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: s_mov_b64 s[4:5], s[36:37]
-; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39]
-; CHECK-NEXT: s_mov_b64 s[8:9], s[40:41]
-; CHECK-NEXT: s_mov_b64 s[10:11], s[42:43]
+; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37]
+; CHECK-NEXT: s_mov_b64 s[10:11], s[38:39]
+; CHECK-NEXT: s_mov_b64 s[12:13], s[40:41]
+; CHECK-NEXT: s_mov_b64 s[14:15], s[42:43]
; CHECK-NEXT: v_readlane_b32 s36, v6, 0
; CHECK-NEXT: v_readlane_b32 s44, v6, 8
; CHECK-NEXT: v_readlane_b32 s45, v6, 9
@@ -140,32 +137,32 @@ define void @main(i1 %arg) #0 {
; CHECK-NEXT: v_readlane_b32 s39, v6, 3
; CHECK-NEXT: v_readlane_b32 s40, v6, 4
; CHECK-NEXT: v_readlane_b32 s41, v6, 5
-; CHECK-NEXT: image_sample_lz v0, v[2:3], s[44:51], s[20:23] dmask:0x1
+; CHECK-NEXT: image_sample_lz v0, v[2:3], s[44:51], s[24:27] dmask:0x1
; CHECK-NEXT: v_readlane_b32 s42, v6, 6
; CHECK-NEXT: v_readlane_b32 s43, v6, 7
; CHECK-NEXT: v_mov_b32_e32 v2, 0
-; CHECK-NEXT: s_mov_b64 s[42:43], s[10:11]
+; CHECK-NEXT: s_mov_b64 s[42:43], s[14:15]
; CHECK-NEXT: v_mov_b32_e32 v3, v2
-; CHECK-NEXT: s_mov_b64 s[40:41], s[8:9]
-; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7]
-; CHECK-NEXT: s_mov_b64 s[36:37], s[4:5]
+; CHECK-NEXT: s_mov_b64 s[40:41], s[12:13]
+; CHECK-NEXT: s_mov_b64 s[38:39], s[10:11]
+; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9]
; CHECK-NEXT: s_waitcnt vmcnt(1)
; CHECK-NEXT: buffer_store_dwordx3 v[1:3], off, s[16:19], 0
; CHECK-NEXT: s_waitcnt vmcnt(1)
; CHECK-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0
; CHECK-NEXT: ; implicit-def: $vgpr0
; CHECK-NEXT: .LBB0_6: ; %Flow12
-; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[14:15]
+; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; CHECK-NEXT: s_cbranch_execz .LBB0_9
; CHECK-NEXT: ; %bb.7: ; %bb33.preheader
; CHECK-NEXT: s_mov_b32 s8, 0
-; CHECK-NEXT: s_mov_b32 s6, s8
-; CHECK-NEXT: s_mov_b32 s7, s8
-; CHECK-NEXT: v_mov_b32_e32 v1, s6
+; CHECK-NEXT: s_mov_b32 s12, s8
+; CHECK-NEXT: s_mov_b32 s13, s8
+; CHECK-NEXT: v_mov_b32_e32 v1, s12
; CHECK-NEXT: s_mov_b32 s9, s8
; CHECK-NEXT: s_mov_b32 s10, s8
; CHECK-NEXT: s_mov_b32 s11, s8
-; CHECK-NEXT: v_mov_b32_e32 v2, s7
+; CHECK-NEXT: v_mov_b32_e32 v2, s13
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: image_sample_lz v3, v[1:2], s[36:43], s[8:11] dmask:0x1
; CHECK-NEXT: image_sample_lz v4, v[1:2], s[52:59], s[8:11] dmask:0x1
@@ -183,28 +180,28 @@ define void @main(i1 %arg) #0 {
; CHECK-NEXT: .LBB0_9: ; %Flow13
; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
; CHECK-NEXT: .LBB0_10: ; %UnifiedReturnBlock
-; CHECK-NEXT: s_or_b64 exec, exec, s[12:13]
-; CHECK-NEXT: v_readlane_b32 s69, v5, 21
-; CHECK-NEXT: v_readlane_b32 s68, v5, 20
+; CHECK-NEXT: s_or_b64 exec, exec, s[6:7]
+; CHECK-NEXT: v_readlane_b32 s71, v5, 21
+; CHECK-NEXT: v_readlane_b32 s70, v5, 20
+; CHECK-NEXT: v_readlane_b32 s69, v5, 19
+; CHECK-NEXT: v_readlane_b32 s68, v5, 18
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_readlane_b32 s67, v5, 19
-; CHECK-NEXT: v_readlane_b32 s66, v5, 18
-; CHECK-NEXT: v_readlane_b32 s65, v5, 17
-; CHECK-NEXT: v_readlane_b32 s64, v5, 16
-; CHECK-NEXT: v_readlane_b32 s55, v5, 15
-; CHECK-NEXT: v_readlane_b32 s54, v5, 14
-; CHECK-NEXT: v_readlane_b32 s53, v5, 13
-; CHECK-NEXT: v_readlane_b32 s52, v5, 12
-; CHECK-NEXT: v_readlane_b32 s51, v5, 11
-; CHECK-NEXT: v_readlane_b32 s50, v5, 10
-; CHECK-NEXT: v_readlane_b32 s49, v5, 9
-; CHECK-NEXT: v_readlane_b32 s48, v5, 8
-; CHECK-NEXT: v_readlane_b32 s39, v5, 7
-; CHECK-NEXT: v_readlane_b32 s38, v5, 6
-; CHECK-NEXT: v_readlane_b32 s37, v5, 5
-; CHECK-NEXT: v_readlane_b32 s36, v5, 4
-; CHECK-NEXT: v_readlane_b32 s35, v5, 3
-; CHECK-NEXT: v_readlane_b32 s34, v5, 2
+; CHECK-NEXT: v_readlane_b32 s67, v5, 17
+; CHECK-NEXT: v_readlane_b32 s66, v5, 16
+; CHECK-NEXT: v_readlane_b32 s65, v5, 15
+; CHECK-NEXT: v_readlane_b32 s64, v5, 14
+; CHECK-NEXT: v_readlane_b32 s55, v5, 13
+; CHECK-NEXT: v_readlane_b32 s54, v5, 12
+; CHECK-NEXT: v_readlane_b32 s53, v5, 11
+; CHECK-NEXT: v_readlane_b32 s52, v5, 10
+; CHECK-NEXT: v_readlane_b32 s51, v5, 9
+; CHECK-NEXT: v_readlane_b32 s50, v5, 8
+; CHECK-NEXT: v_readlane_b32 s49, v5, 7
+; CHECK-NEXT: v_readlane_b32 s48, v5, 6
+; CHECK-NEXT: v_readlane_b32 s39, v5, 5
+; CHECK-NEXT: v_readlane_b32 s38, v5, 4
+; CHECK-NEXT: v_readlane_b32 s37, v5, 3
+; CHECK-NEXT: v_readlane_b32 s36, v5, 2
; CHECK-NEXT: v_readlane_b32 s31, v5, 1
; CHECK-NEXT: v_readlane_b32 s30, v5, 0
; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1
diff --git a/llvm/test/CodeGen/AMDGPU/kill-true-in-return-block.ll b/llvm/test/CodeGen/AMDGPU/kill-true-in-return-block.ll
index 021c845d5ea6b..d75e9932bcd82 100644
--- a/llvm/test/CodeGen/AMDGPU/kill-true-in-return-block.ll
+++ b/llvm/test/CodeGen/AMDGPU/kill-true-in-return-block.ll
@@ -7,9 +7,8 @@ define amdgpu_ps float @kill_true(i1 %.not) {
; CHECK-NEXT: s_mov_b64 s[0:1], exec
; CHECK-NEXT: s_wqm_b64 exec, exec
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
-; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; CHECK-NEXT: s_xor_b64 s[4:5], vcc, -1
-; CHECK-NEXT: s_and_saveexec_b64 s[2:3], s[4:5]
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0
+; CHECK-NEXT: s_and_saveexec_b64 s[2:3], vcc
; CHECK-NEXT: s_cbranch_execz .LBB0_2
; CHECK-NEXT: ; %bb.1: ; %if1
; CHECK-NEXT: s_mov_b32 s4, 0
diff --git a/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll b/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll
index e70dc8f7a6576..e64ec9956860d 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll
@@ -133,11 +133,10 @@ define void @func_uses_lds_multi(i1 %cond) {
; GFX8-SDAG: ; %bb.0: ; %entry
; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-SDAG-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX8-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX8-SDAG-NEXT: s_xor_b64 s[4:5], vcc, -1
+; GFX8-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0
; GFX8-SDAG-NEXT: s_mov_b32 m0, -1
-; GFX8-SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
-; GFX8-SDAG-NEXT: s_xor_b64 s[4:5], exec, s[6:7]
+; GFX8-SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX8-SDAG-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX8-SDAG-NEXT: s_cbranch_execz .LBB2_2
; GFX8-SDAG-NEXT: ; %bb.1: ; %bb1
; GFX8-SDAG-NEXT: v_mov_b32_e32 v0, 1
@@ -210,10 +209,9 @@ define void @func_uses_lds_multi(i1 %cond) {
; GFX9-SDAG: ; %bb.0: ; %entry
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-SDAG-NEXT: s_xor_b64 s[4:5], vcc, -1
-; GFX9-SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
-; GFX9-SDAG-NEXT: s_xor_b64 s[4:5], exec, s[6:7]
+; GFX9-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0
+; GFX9-SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-SDAG-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX9-SDAG-NEXT: s_cbranch_execz .LBB2_2
; GFX9-SDAG-NEXT: ; %bb.1: ; %bb1
; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 1
@@ -266,10 +264,9 @@ define void @func_uses_lds_multi(i1 %cond) {
; SDAG: ; %bb.0: ; %entry
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: v_and_b32_e32 v0, 1, v0
-; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; SDAG-NEXT: s_xor_b64 s[4:5], vcc, -1
-; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
-; SDAG-NEXT: s_xor_b64 s[4:5], exec, s[6:7]
+; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0
+; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; SDAG-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; SDAG-NEXT: s_cbranch_execz .LBB2_2
; SDAG-NEXT: ; %bb.1: ; %bb1
; SDAG-NEXT: v_mov_b32_e32 v0, 1
diff --git a/llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll b/llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll
index 308ca34058f59..e37dcf60506be 100644
--- a/llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll
+++ b/llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll
@@ -100,26 +100,26 @@ define void @break_cond_is_arg(i32 %arg, i1 %breakcond) {
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_and_b32_e32 v1, 1, v1
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
-; GCN-NEXT: s_xor_b64 s[4:5], vcc, -1
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1
; GCN-NEXT: s_mov_b32 s10, 1
; GCN-NEXT: s_mov_b64 s[6:7], 0
; GCN-NEXT: s_branch .LBB2_2
; GCN-NEXT: .LBB2_1: ; %endif
; GCN-NEXT: ; in Loop: Header=BB2_2 Depth=1
; GCN-NEXT: s_or_b64 exec, exec, s[8:9]
-; GCN-NEXT: s_and_b64 s[8:9], exec, s[4:5]
-; GCN-NEXT: s_or_b64 s[6:7], s[8:9], s[6:7]
+; GCN-NEXT: s_and_b64 s[4:5], exec, vcc
+; GCN-NEXT: s_or_b64 s[6:7], s[4:5], s[6:7]
; GCN-NEXT: s_add_i32 s10, s10, 1
; GCN-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GCN-NEXT: s_cbranch_execz .LBB2_4
; GCN-NEXT: .LBB2_2: ; %loop
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN-NEXT: v_cmp_gt_u32_e32 vcc, s10, v0
-; GCN-NEXT: s_and_saveexec_b64 s[8:9], vcc
+; GCN-NEXT: v_cmp_gt_u32_e64 s[4:5], s10, v0
+; GCN-NEXT: s_and_saveexec_b64 s[8:9], s[4:5]
; GCN-NEXT: s_cbranch_execz .LBB2_1
; GCN-NEXT: ; %bb.3: ; %then
; GCN-NEXT: ; in Loop: Header=BB2_2 Depth=1
+; GCN-NEXT: s_nop 2
; GCN-NEXT: buffer_store_dword v0, off, s[4:7], s4
; GCN-NEXT: s_branch .LBB2_1
; GCN-NEXT: .LBB2_4: ; %loopexit
diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll b/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll
index ba0f5cbf0a5f6..34a9624cb19eb 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll
@@ -10,10 +10,9 @@ define void @machinesink_loop_variable_out_of_divergent_loop(i32 %arg, i1 %cmp49
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_and_b32_e32 v1, 1, v1
; CHECK-NEXT: v_and_b32_e32 v3, 1, v3
-; CHECK-NEXT: s_mov_b32 s5, 0
-; CHECK-NEXT: v_cmp_eq_u32_e64 s4, 1, v1
+; CHECK-NEXT: s_mov_b32 s6, 0
+; CHECK-NEXT: v_cmp_ne_u32_e64 s4, 1, v1
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
-; CHECK-NEXT: s_xor_b32 s6, s4, -1
; CHECK-NEXT: s_inst_prefetch 0x1
; CHECK-NEXT: s_branch .LBB0_3
; CHECK-NEXT: .p2align 6
@@ -24,19 +23,19 @@ define void @machinesink_loop_variable_out_of_divergent_loop(i32 %arg, i1 %cmp49
; CHECK-NEXT: .LBB0_2: ; %Flow1
; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7
-; CHECK-NEXT: v_cmp_ne_u32_e64 s4, 0, v1
+; CHECK-NEXT: v_cmp_ne_u32_e64 s5, 0, v1
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; j lastloop entry
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: s_or_b32 s5, s4, s5
-; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
+; CHECK-NEXT: s_or_b32 s6, s5, s6
+; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s6
; CHECK-NEXT: s_cbranch_execz .LBB0_8
; CHECK-NEXT: .LBB0_3: ; %for.body33
; CHECK-NEXT: ; =>This Loop Header: Depth=1
; CHECK-NEXT: ; Child Loop BB0_6 Depth 2
; CHECK-NEXT: v_mov_b32_e32 v3, 0
; CHECK-NEXT: v_mov_b32_e32 v1, 0
-; CHECK-NEXT: s_and_saveexec_b32 s7, s6
+; CHECK-NEXT: s_and_saveexec_b32 s7, s4
; CHECK-NEXT: s_cbranch_execz .LBB0_2
; CHECK-NEXT: ; %bb.4: ; %for.body51.preheader
; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1
@@ -46,21 +45,21 @@ define void @machinesink_loop_variable_out_of_divergent_loop(i32 %arg, i1 %cmp49
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB0_5: ; %if.end118
; CHECK-NEXT: ; in Loop: Header=BB0_6 Depth=2
-; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5
; CHECK-NEXT: s_add_i32 s9, s9, 4
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; backedge
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: v_add_nc_u32_e32 v3, s9, v2
-; CHECK-NEXT: v_cmp_ge_u32_e64 s4, v3, v0
-; CHECK-NEXT: s_or_b32 s8, s4, s8
+; CHECK-NEXT: v_cmp_ge_u32_e64 s5, v3, v0
+; CHECK-NEXT: s_or_b32 s8, s5, s8
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
; CHECK-NEXT: s_cbranch_execz .LBB0_1
; CHECK-NEXT: .LBB0_6: ; %for.body51
; CHECK-NEXT: ; Parent Loop BB0_3 Depth=1
; CHECK-NEXT: ; => This Inner Loop Header: Depth=2
; CHECK-NEXT: v_mov_b32_e32 v1, 1
-; CHECK-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; CHECK-NEXT: s_and_saveexec_b32 s5, vcc_lo
; CHECK-NEXT: s_cbranch_execz .LBB0_5
; CHECK-NEXT: ; %bb.7: ; %if.then112
; CHECK-NEXT: ; in Loop: Header=BB0_6 Depth=2
@@ -71,7 +70,7 @@ define void @machinesink_loop_variable_out_of_divergent_loop(i32 %arg, i1 %cmp49
; CHECK-NEXT: s_branch .LBB0_5
; CHECK-NEXT: .LBB0_8: ; %for.body159.preheader
; CHECK-NEXT: s_inst_prefetch 0x2
-; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
; CHECK-NEXT: s_mov_b32 vcc_lo, exec_lo
; CHECK-NEXT: .LBB0_9: ; %for.body159
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll
index 13f8eff94f86b..34de1e48bfb59 100644
--- a/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: opt -mtriple=amdgcn-amd-amdhsa -p simplifycfg,amdgpu-unify-divergent-exit-nodes %s -S -o - | FileCheck %s --check-prefix=OPT
; RUN: llc -mtriple=amdgcn-amd-amdhsa %s -o - | FileCheck %s --check-prefix=ISA
@@ -15,34 +15,39 @@ define void @nested_inf_loop(i1 %0, i1 %1) {
; OPT-NEXT: ret void
;
; ISA-LABEL: nested_inf_loop:
-; ISA-NEXT: %bb.0: ; %BB
-; ISA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; ISA-NEXT: v_and_b32_e32 v1, 1, v1
-; ISA-NEXT: v_and_b32_e32 v0, 1, v0
-; ISA-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v1
-; ISA-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; ISA-NEXT: s_xor_b64 s[6:7], vcc, -1
-; ISA-NEXT: s_mov_b64 s[8:9], 0
-; ISA-NEXT: .LBB0_1: ; %BB1
-; ISA: s_and_b64 s[10:11], exec, s[6:7]
-; ISA-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9]
-; ISA-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; ISA-NEXT: s_cbranch_execnz .LBB0_1
-; ISA-NEXT: %bb.2: ; %BB2
-; ISA: s_or_b64 exec, exec, s[8:9]
-; ISA-NEXT: s_mov_b64 s[8:9], 0
-; ISA-NEXT: .LBB0_3: ; %BB4
-; ISA: s_and_b64 s[10:11], exec, s[4:5]
-; ISA-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9]
-; ISA-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; ISA-NEXT: s_cbranch_execnz .LBB0_3
-; ISA-NEXT: %bb.4: ; %loop.exit.guard
-; ISA: s_or_b64 exec, exec, s[8:9]
-; ISA-NEXT: s_mov_b64 vcc, 0
-; ISA-NEXT: s_mov_b64 s[8:9], 0
-; ISA-NEXT: s_branch .LBB0_1
-; ISA-NEXT: %bb.5: ; %DummyReturnBlock
-; ISA-NEXT: s_setpc_b64 s[30:31]
+; ISA: ; %bb.0: ; %BB
+; ISA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; ISA-NEXT: v_and_b32_e32 v1, 1, v1
+; ISA-NEXT: v_and_b32_e32 v0, 1, v0
+; ISA-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v1
+; ISA-NEXT: v_cmp_ne_u32_e64 s[6:7], 1, v0
+; ISA-NEXT: s_mov_b64 s[8:9], 0
+; ISA-NEXT: .LBB0_1: ; %BB1
+; ISA-NEXT: ; =>This Loop Header: Depth=1
+; ISA-NEXT: ; Child Loop BB0_3 Depth 2
+; ISA-NEXT: s_and_b64 s[10:11], exec, s[6:7]
+; ISA-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9]
+; ISA-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; ISA-NEXT: s_cbranch_execnz .LBB0_1
+; ISA-NEXT: ; %bb.2: ; %BB2
+; ISA-NEXT: ; in Loop: Header=BB0_1 Depth=1
+; ISA-NEXT: s_or_b64 exec, exec, s[8:9]
+; ISA-NEXT: s_mov_b64 s[8:9], 0
+; ISA-NEXT: .LBB0_3: ; %BB4
+; ISA-NEXT: ; Parent Loop BB0_1 Depth=1
+; ISA-NEXT: ; => This Inner Loop Header: Depth=2
+; ISA-NEXT: s_and_b64 s[10:11], exec, s[4:5]
+; ISA-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9]
+; ISA-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; ISA-NEXT: s_cbranch_execnz .LBB0_3
+; ISA-NEXT: ; %bb.4: ; %loop.exit.guard
+; ISA-NEXT: ; in Loop: Header=BB0_1 Depth=1
+; ISA-NEXT: s_or_b64 exec, exec, s[8:9]
+; ISA-NEXT: s_mov_b64 vcc, 0
+; ISA-NEXT: s_mov_b64 s[8:9], 0
+; ISA-NEXT: s_branch .LBB0_1
+; ISA-NEXT: ; %bb.5: ; %DummyReturnBlock
+; ISA-NEXT: s_setpc_b64 s[30:31]
BB:
br label %BB1
diff --git a/llvm/test/CodeGen/AMDGPU/si-optimize-vgpr-live-range-dbg-instr.ll b/llvm/test/CodeGen/AMDGPU/si-optimize-vgpr-live-range-dbg-instr.ll
index d34769ad0fcf0..761ff7786b98e 100644
--- a/llvm/test/CodeGen/AMDGPU/si-optimize-vgpr-live-range-dbg-instr.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-optimize-vgpr-live-range-dbg-instr.ll
@@ -14,10 +14,9 @@ define void @__omp_offloading_35_36570d3__ZN6openmc31process_advance_particle_ev
; GCN-NEXT: v_mov_b32_e32 v2, 0
; GCN-NEXT: global_load_dwordx2 v[1:2], v[1:2], off
; GCN-NEXT: v_and_b32_e32 v0, 1, v0
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT: s_xor_b64 s[4:5], vcc, -1
-; GCN-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
-; GCN-NEXT: s_xor_b64 s[4:5], exec, s[6:7]
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN-NEXT: s_cbranch_execnz .LBB0_3
; GCN-NEXT: ; %bb.1: ; %Flow
; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
>From 3ba0fb743100a16795faaad18407898e902f5769 Mon Sep 17 00:00:00 2001
From: Ana Mihajlovic <Ana.Mihajlovic at amd.com>
Date: Mon, 31 Mar 2025 16:34:36 +0200
Subject: [PATCH 2/2] changed trunc in pattern to oneuse
---
llvm/lib/Target/AMDGPU/SIInstructions.td | 4 ++--
llvm/lib/Target/AMDGPU/VOPInstructions.td | 13 +++++++++++++
.../codegen-prepare-addrspacecast-non-null.ll | 16 ++++++++--------
3 files changed, 23 insertions(+), 10 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index f9024ae4f1cf6..f770cf3014579 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3124,13 +3124,13 @@ def IMMBitSelConst : SDNodeXForm<imm, [{
// Handle the VALU case.
def : GCNPat <
- (i1 (xor (i1 (DivergentUnaryFrag<trunc> i32:$a)), -1)),
+ (i1 (xor (i1 (DivergentUnaryFrag_oneuse<trunc> i32:$a)), -1)),
(V_CMP_NE_U32_e64 (V_AND_B32_e64 (i32 1), i32:$a),
(i32 1))
>;
def : GCNPat <
- (i1 (xor (i1 (DivergentUnaryFrag<trunc> i64:$a)), -1)),
+ (i1 (xor (i1 (DivergentUnaryFrag_oneuse<trunc> i64:$a)), -1)),
(V_CMP_NE_U32_e64 (V_AND_B32_e64 (i32 1),
(i32 (EXTRACT_SUBREG $a, sub0))), (i32 1))
>;
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index d6ad01c8f9b35..86e30096f5423 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -1255,6 +1255,19 @@ class VOPPatGen<SDPatternOperator Op, VOPProfile P> {
list<dag> ret = [!con(Outs, (set Ins))];
}
+class DivergentUnaryFrag_oneuse<SDPatternOperator Op> : PatFrag <
+ (ops node:$src0),
+ (Op $src0),
+ [{ return N->isDivergent(); }]> {
+ // This check is unnecessary as it's captured by the result register
+ // bank constraint.
+ //
+ // FIXME: Should add a way for the emitter to recognize this is a
+ // trivially true predicate to eliminate the check.
+ let GISelPredicateCode = [{return true;}];
+ let HasOneUse = 1;
+}
+
class DivergentUnaryFrag<SDPatternOperator Op> : PatFrag <
(ops node:$src0),
(Op $src0),
diff --git a/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll b/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll
index 861319bf560fa..3216e71e6221a 100644
--- a/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll
+++ b/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll
@@ -207,22 +207,22 @@ define void @recursive_phis(i1 %cond, ptr addrspace(5) %ptr) {
; DAGISEL-ASM: ; %bb.0: ; %entry
; DAGISEL-ASM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; DAGISEL-ASM-NEXT: v_and_b32_e32 v0, 1, v0
-; DAGISEL-ASM-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
-; DAGISEL-ASM-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0
+; DAGISEL-ASM-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; DAGISEL-ASM-NEXT: v_lshrrev_b32_e64 v0, 6, s32
-; DAGISEL-ASM-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
+; DAGISEL-ASM-NEXT: s_and_saveexec_b64 s[4:5], vcc
; DAGISEL-ASM-NEXT: ; %bb.1: ; %then
; DAGISEL-ASM-NEXT: v_and_b32_e32 v0, 0xffff, v1
; DAGISEL-ASM-NEXT: ; %bb.2: ; %finallyendcf.split
-; DAGISEL-ASM-NEXT: s_or_b64 exec, exec, s[6:7]
+; DAGISEL-ASM-NEXT: s_or_b64 exec, exec, s[4:5]
+; DAGISEL-ASM-NEXT: s_xor_b64 s[6:7], vcc, -1
; DAGISEL-ASM-NEXT: s_mov_b64 s[4:5], 0
-; DAGISEL-ASM-NEXT: s_mov_b64 s[6:7], src_private_base
+; DAGISEL-ASM-NEXT: s_mov_b64 s[8:9], src_private_base
; DAGISEL-ASM-NEXT: v_mov_b32_e32 v2, 7
; DAGISEL-ASM-NEXT: .LBB7_3: ; %finally
; DAGISEL-ASM-NEXT: ; =>This Inner Loop Header: Depth=1
-; DAGISEL-ASM-NEXT: s_and_b64 s[8:9], exec, vcc
-; DAGISEL-ASM-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5]
-; DAGISEL-ASM-NEXT: v_mov_b32_e32 v1, s7
+; DAGISEL-ASM-NEXT: s_and_b64 s[10:11], exec, s[6:7]
+; DAGISEL-ASM-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5]
+; DAGISEL-ASM-NEXT: v_mov_b32_e32 v1, s9
; DAGISEL-ASM-NEXT: flat_store_dword v[0:1], v2
; DAGISEL-ASM-NEXT: s_waitcnt vmcnt(0)
; DAGISEL-ASM-NEXT: s_andn2_b64 exec, exec, s[4:5]
More information about the llvm-commits
mailing list