[llvm] WIP: [AMDGPU] Use s_cselect_b32 for uniform select of f32 values (PR #111688)
Jay Foad via llvm-commits
llvm-commits at lists.llvm.org
Wed Oct 9 07:30:15 PDT 2024
https://github.com/jayfoad created https://github.com/llvm/llvm-project/pull/111688
None
From 1467952e4009289a083126d773aa9506f3efc04a Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Wed, 9 Oct 2024 15:29:32 +0100
Subject: [PATCH] WIP: [AMDGPU] Use s_cselect_b32 for uniform select of f32 values
---
llvm/lib/Target/AMDGPU/SOPInstructions.td | 3 -
.../AMDGPU/bug-sdag-emitcopyfromreg.ll | 28 +-
.../CodeGen/AMDGPU/dagcombine-setcc-select.ll | 16 +-
.../CodeGen/AMDGPU/extract_vector_dynelt.ll | 14 +-
llvm/test/CodeGen/AMDGPU/fdiv.ll | 60 +-
.../CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll | 81 +-
llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll | 23 +-
llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll | 194 +--
.../CodeGen/AMDGPU/fneg-modifier-casting.ll | 49 +-
.../CodeGen/AMDGPU/indirect-addressing-si.ll | 943 +++++++-------
.../CodeGen/AMDGPU/insert_vector_dynelt.ll | 30 +-
llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll | 136 +-
llvm/test/CodeGen/AMDGPU/llvm.exp2.ll | 434 ++++---
llvm/test/CodeGen/AMDGPU/llvm.log.ll | 1138 +++++++++--------
llvm/test/CodeGen/AMDGPU/llvm.log10.ll | 1138 +++++++++--------
llvm/test/CodeGen/AMDGPU/llvm.log2.ll | 587 +++++----
llvm/test/CodeGen/AMDGPU/llvm.round.ll | 471 ++++---
llvm/test/CodeGen/AMDGPU/rsq.f32.ll | 31 +-
llvm/test/CodeGen/AMDGPU/skip-if-dead.ll | 84 +-
.../AMDGPU/splitkit-getsubrangeformask.ll | 4 +-
llvm/test/CodeGen/AMDGPU/v_cndmask.ll | 122 +-
llvm/test/CodeGen/AMDGPU/vselect.ll | 136 +-
llvm/test/CodeGen/AMDGPU/xor3-i1-const.ll | 4 +-
23 files changed, 2994 insertions(+), 2732 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 9da27a7c7ee7d6..71df0cf1bd17d8 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -1907,9 +1907,6 @@ let AddedComplexity = 20 in {
(S_CSELECT_B32 SSrc_b32:$src0, SSrc_b32:$src1)
>;
- // TODO: The predicate should not be necessary, but enabling this pattern for
- // all subtargets generates worse code in some cases.
- let OtherPredicates = [HasPseudoScalarTrans] in
def : GCNPat<
(f32 (UniformSelect f32:$src0, f32:$src1)),
(S_CSELECT_B32 SSrc_b32:$src0, SSrc_b32:$src1)
diff --git a/llvm/test/CodeGen/AMDGPU/bug-sdag-emitcopyfromreg.ll b/llvm/test/CodeGen/AMDGPU/bug-sdag-emitcopyfromreg.ll
index 07816f1ed6a650..7841112cf9260d 100644
--- a/llvm/test/CodeGen/AMDGPU/bug-sdag-emitcopyfromreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/bug-sdag-emitcopyfromreg.ll
@@ -9,35 +9,33 @@ define void @f(i32 %arg, ptr %ptr) {
; ISA-NEXT: s_mov_b64 s[4:5], 0
; ISA-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v0
; ISA-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; ISA-NEXT: v_mov_b32_e32 v6, 0
+; ISA-NEXT: v_mov_b32_e32 v5, 0
; ISA-NEXT: s_waitcnt lgkmcnt(0)
; ISA-NEXT: s_lshr_b32 s6, s5, 1
; ISA-NEXT: s_lshr_b32 s7, 1, s4
; ISA-NEXT: s_cmp_lg_u32 s4, 0
-; ISA-NEXT: s_cselect_b32 s4, -1, 0
-; ISA-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s4
-; ISA-NEXT: s_and_b32 s4, s4, exec_lo
; ISA-NEXT: s_cselect_b32 s4, s6, 0
; ISA-NEXT: s_cselect_b32 s6, s7, 0
; ISA-NEXT: s_cselect_b32 s5, s5, 0
-; ISA-NEXT: v_cvt_f32_i32_e32 v3, s4
-; ISA-NEXT: v_cvt_f32_ubyte0_e32 v4, s6
-; ISA-NEXT: v_cvt_f32_i32_e32 v5, s5
+; ISA-NEXT: v_cvt_f32_i32_e32 v0, s4
+; ISA-NEXT: v_cvt_f32_ubyte0_e32 v3, s6
+; ISA-NEXT: v_cvt_f32_i32_e32 v4, s5
; ISA-NEXT: s_mov_b32 s4, 0
+; ISA-NEXT: s_cselect_b32 s5, 1.0, 0
; ISA-NEXT: .LBB0_1: ; %bb14
; ISA-NEXT: ; =>This Inner Loop Header: Depth=1
-; ISA-NEXT: v_mov_b32_e32 v7, v6
-; ISA-NEXT: s_and_b32 s5, exec_lo, vcc_lo
-; ISA-NEXT: s_or_b32 s4, s5, s4
-; ISA-NEXT: v_add_f32_e32 v6, v7, v0
-; ISA-NEXT: v_add_f32_e64 v6, v6, |v3|
-; ISA-NEXT: v_add_f32_e32 v6, v6, v4
-; ISA-NEXT: v_add_f32_e32 v6, v6, v5
+; ISA-NEXT: v_mov_b32_e32 v6, v5
+; ISA-NEXT: s_and_b32 s6, exec_lo, vcc_lo
+; ISA-NEXT: s_or_b32 s4, s6, s4
+; ISA-NEXT: v_add_f32_e32 v5, s5, v6
+; ISA-NEXT: v_add_f32_e64 v5, v5, |v0|
+; ISA-NEXT: v_add_f32_e32 v5, v5, v3
+; ISA-NEXT: v_add_f32_e32 v5, v5, v4
; ISA-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; ISA-NEXT: s_cbranch_execnz .LBB0_1
; ISA-NEXT: ; %bb.2: ; %bb21
; ISA-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; ISA-NEXT: flat_store_dword v[1:2], v7
+; ISA-NEXT: flat_store_dword v[1:2], v6
; ISA-NEXT: s_waitcnt lgkmcnt(0)
; ISA-NEXT: s_setpc_b64 s[30:31]
bb:
diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-setcc-select.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-setcc-select.ll
index 8fa0068a237cd5..41d6c80a3ea449 100644
--- a/llvm/test/CodeGen/AMDGPU/dagcombine-setcc-select.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcombine-setcc-select.ll
@@ -7,7 +7,9 @@ define amdgpu_kernel void @eq_t(float %x) {
; GCN-NEXT: s_load_dword s0, s[2:3], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_cmp_lt_f32_e64 s[0:1], s0, 1.0
-; GCN-NEXT: v_cndmask_b32_e64 v0, 2.0, 4.0, s[0:1]
+; GCN-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; GCN-NEXT: s_cselect_b32 s0, 4.0, 2.0
+; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: flat_store_dword v[0:1], v0
; GCN-NEXT: s_endpgm
%c1 = fcmp olt float %x, 1.0
@@ -24,7 +26,9 @@ define amdgpu_kernel void @ne_t(float %x) {
; GCN-NEXT: s_load_dword s0, s[2:3], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_cmp_nlt_f32_e64 s[0:1], s0, 1.0
-; GCN-NEXT: v_cndmask_b32_e64 v0, 2.0, 4.0, s[0:1]
+; GCN-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; GCN-NEXT: s_cselect_b32 s0, 4.0, 2.0
+; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: flat_store_dword v[0:1], v0
; GCN-NEXT: s_endpgm
%c1 = fcmp olt float %x, 1.0
@@ -41,7 +45,9 @@ define amdgpu_kernel void @eq_f(float %x) {
; GCN-NEXT: s_load_dword s0, s[2:3], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_cmp_nlt_f32_e64 s[0:1], s0, 1.0
-; GCN-NEXT: v_cndmask_b32_e64 v0, 2.0, 4.0, s[0:1]
+; GCN-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; GCN-NEXT: s_cselect_b32 s0, 4.0, 2.0
+; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: flat_store_dword v[0:1], v0
; GCN-NEXT: s_endpgm
%c1 = fcmp olt float %x, 1.0
@@ -58,7 +64,9 @@ define amdgpu_kernel void @ne_f(float %x) {
; GCN-NEXT: s_load_dword s0, s[2:3], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_cmp_lt_f32_e64 s[0:1], s0, 1.0
-; GCN-NEXT: v_cndmask_b32_e64 v0, 2.0, 4.0, s[0:1]
+; GCN-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; GCN-NEXT: s_cselect_b32 s0, 4.0, 2.0
+; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: flat_store_dword v[0:1], v0
; GCN-NEXT: s_endpgm
%c1 = fcmp olt float %x, 1.0
diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
index 54ec7578700df8..eb918422c0c8cb 100644
--- a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
@@ -8,16 +8,14 @@ define amdgpu_kernel void @float4_extelt(ptr addrspace(1) %out, i32 %sel) {
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_eq_u32 s4, 1
-; GCN-NEXT: s_cselect_b64 s[2:3], -1, 0
+; GCN-NEXT: s_cselect_b32 s2, 1.0, 0
; GCN-NEXT: s_cmp_lg_u32 s4, 2
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s[2:3]
-; GCN-NEXT: s_cselect_b64 vcc, -1, 0
+; GCN-NEXT: s_cselect_b32 s2, s2, 2.0
; GCN-NEXT: s_cmp_lg_u32 s4, 3
-; GCN-NEXT: v_cndmask_b32_e32 v0, 2.0, v0, vcc
-; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: v_cndmask_b32_e32 v2, 4.0, v0, vcc
+; GCN-NEXT: s_cselect_b32 s2, s2, 4.0
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: flat_store_dword v[0:1], v2
; GCN-NEXT: s_endpgm
entry:
@@ -145,10 +143,10 @@ define amdgpu_kernel void @float2_extelt(ptr addrspace(1) %out, i32 %sel) {
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_eq_u32 s4, 1
-; GCN-NEXT: s_cselect_b64 s[2:3], -1, 0
+; GCN-NEXT: s_cselect_b32 s2, 1.0, 0
; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[2:3]
; GCN-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: flat_store_dword v[0:1], v2
; GCN-NEXT: s_endpgm
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.ll b/llvm/test/CodeGen/AMDGPU/fdiv.ll
index 93105e57a59187..c59035f8d228dc 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv.ll
@@ -336,18 +336,19 @@ define amdgpu_kernel void @s_fdiv_25ulp_f32(ptr addrspace(1) %out, float %a, flo
; GFX67: ; %bb.0: ; %entry
; GFX67-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; GFX67-NEXT: v_mov_b32_e32 v0, 0x6f800000
-; GFX67-NEXT: v_mov_b32_e32 v1, 0x2f800000
; GFX67-NEXT: s_mov_b32 s7, 0xf000
; GFX67-NEXT: s_mov_b32 s6, -1
; GFX67-NEXT: s_waitcnt lgkmcnt(0)
-; GFX67-NEXT: v_cmp_gt_f32_e64 vcc, |s3|, v0
-; GFX67-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc
-; GFX67-NEXT: v_mul_f32_e32 v1, s3, v0
-; GFX67-NEXT: v_rcp_f32_e32 v1, v1
+; GFX67-NEXT: v_cmp_gt_f32_e64 s[4:5], |s3|, v0
+; GFX67-NEXT: s_and_b64 s[4:5], s[4:5], exec
+; GFX67-NEXT: s_cselect_b32 s8, 0x2f800000, 1.0
+; GFX67-NEXT: v_mov_b32_e32 v0, s8
+; GFX67-NEXT: v_mul_f32_e32 v0, s3, v0
+; GFX67-NEXT: v_rcp_f32_e32 v0, v0
; GFX67-NEXT: s_mov_b32 s4, s0
; GFX67-NEXT: s_mov_b32 s5, s1
-; GFX67-NEXT: v_mul_f32_e32 v1, s2, v1
-; GFX67-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX67-NEXT: v_mul_f32_e32 v0, s2, v0
+; GFX67-NEXT: v_mul_f32_e32 v0, s8, v0
; GFX67-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX67-NEXT: s_endpgm
;
@@ -355,14 +356,15 @@ define amdgpu_kernel void @s_fdiv_25ulp_f32(ptr addrspace(1) %out, float %a, flo
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX8-NEXT: v_mov_b32_e32 v0, 0x6f800000
-; GFX8-NEXT: v_mov_b32_e32 v1, 0x2f800000
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_gt_f32_e64 vcc, |s3|, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc
-; GFX8-NEXT: v_mul_f32_e32 v1, s3, v0
-; GFX8-NEXT: v_rcp_f32_e32 v1, v1
-; GFX8-NEXT: v_mul_f32_e32 v1, s2, v1
-; GFX8-NEXT: v_mul_f32_e32 v2, v0, v1
+; GFX8-NEXT: v_cmp_gt_f32_e64 s[4:5], |s3|, v0
+; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec
+; GFX8-NEXT: s_cselect_b32 s4, 0x2f800000, 1.0
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mul_f32_e32 v0, s3, v0
+; GFX8-NEXT: v_rcp_f32_e32 v0, v0
+; GFX8-NEXT: v_mul_f32_e32 v0, s2, v0
+; GFX8-NEXT: v_mul_f32_e32 v2, s4, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_dword v[0:1], v2
@@ -371,30 +373,32 @@ define amdgpu_kernel void @s_fdiv_25ulp_f32(ptr addrspace(1) %out, float %a, flo
; GFX10-LABEL: s_fdiv_25ulp_f32:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX10-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_cmp_lt_f32_e64 s0, 0x6f800000, |s7|
-; GFX10-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x2f800000, s0
-; GFX10-NEXT: v_mul_f32_e32 v1, s7, v0
-; GFX10-NEXT: v_rcp_f32_e32 v1, v1
-; GFX10-NEXT: v_mul_f32_e32 v1, s6, v1
-; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX10-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX10-NEXT: s_and_b32 s0, s0, exec_lo
+; GFX10-NEXT: s_cselect_b32 s0, 0x2f800000, 1.0
+; GFX10-NEXT: v_mul_f32_e64 v0, s7, s0
+; GFX10-NEXT: v_rcp_f32_e32 v0, v0
+; GFX10-NEXT: v_mul_f32_e32 v0, s6, v0
+; GFX10-NEXT: v_mul_f32_e32 v0, s0, v0
+; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_fdiv_25ulp_f32:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
-; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_cmp_lt_f32_e64 s4, 0x6f800000, |s3|
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x2f800000, s4
-; GFX11-NEXT: v_mul_f32_e32 v1, s3, v0
-; GFX11-NEXT: v_rcp_f32_e32 v1, v1
+; GFX11-NEXT: s_and_b32 s4, s4, exec_lo
+; GFX11-NEXT: s_cselect_b32 s4, 0x2f800000, 1.0
+; GFX11-NEXT: v_mul_f32_e64 v0, s3, s4
+; GFX11-NEXT: v_rcp_f32_e32 v0, v0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f32_e32 v1, s2, v1
-; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX11-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-NEXT: v_mul_f32_e32 v0, s2, v0
+; GFX11-NEXT: v_mul_f32_e32 v0, s4, v0
+; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll b/llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll
index 8e43bd890a8fa4..b49b6bfa65e188 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll
@@ -371,25 +371,28 @@ define amdgpu_kernel void @div_v4_c_by_x_25ulp(ptr addrspace(1) %arg) {
; GCN-FLUSH: ; %bb.0:
; GCN-FLUSH-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN-FLUSH-NEXT: v_mov_b32_e32 v0, 0x6f800000
-; GCN-FLUSH-NEXT: v_mov_b32_e32 v1, 0x2f800000
; GCN-FLUSH-NEXT: v_mov_b32_e32 v4, 0
; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
; GCN-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-FLUSH-NEXT: v_cmp_gt_f32_e64 vcc, |s0|, v0
-; GCN-FLUSH-NEXT: v_cndmask_b32_e32 v3, 1.0, v1, vcc
-; GCN-FLUSH-NEXT: v_cmp_gt_f32_e64 vcc, |s3|, v0
-; GCN-FLUSH-NEXT: v_cndmask_b32_e32 v5, 1.0, v1, vcc
-; GCN-FLUSH-NEXT: v_mul_f32_e32 v0, s0, v3
-; GCN-FLUSH-NEXT: v_mul_f32_e32 v1, s3, v5
-; GCN-FLUSH-NEXT: v_rcp_f32_e32 v0, v0
-; GCN-FLUSH-NEXT: v_rcp_f32_e32 v6, v1
+; GCN-FLUSH-NEXT: v_cmp_gt_f32_e64 s[6:7], |s0|, v0
+; GCN-FLUSH-NEXT: v_cmp_gt_f32_e64 s[8:9], |s3|, v0
+; GCN-FLUSH-NEXT: s_and_b64 s[6:7], s[6:7], exec
; GCN-FLUSH-NEXT: v_rcp_f32_e32 v1, s1
+; GCN-FLUSH-NEXT: s_cselect_b32 s1, 0x2f800000, 1.0
+; GCN-FLUSH-NEXT: s_and_b64 s[6:7], s[8:9], exec
+; GCN-FLUSH-NEXT: s_cselect_b32 s6, 0x2f800000, 1.0
+; GCN-FLUSH-NEXT: v_mov_b32_e32 v0, s1
+; GCN-FLUSH-NEXT: v_mov_b32_e32 v2, s6
+; GCN-FLUSH-NEXT: v_mul_f32_e32 v0, s0, v0
+; GCN-FLUSH-NEXT: v_mul_f32_e32 v2, s3, v2
+; GCN-FLUSH-NEXT: v_rcp_f32_e32 v0, v0
+; GCN-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
; GCN-FLUSH-NEXT: v_rcp_f32_e64 v2, -s2
; GCN-FLUSH-NEXT: v_add_f32_e32 v0, v0, v0
-; GCN-FLUSH-NEXT: v_mul_f32_e32 v6, -2.0, v6
-; GCN-FLUSH-NEXT: v_mul_f32_e32 v0, v3, v0
-; GCN-FLUSH-NEXT: v_mul_f32_e32 v3, v5, v6
+; GCN-FLUSH-NEXT: v_mul_f32_e32 v3, -2.0, v3
+; GCN-FLUSH-NEXT: v_mul_f32_e32 v0, s1, v0
+; GCN-FLUSH-NEXT: v_mul_f32_e32 v3, s6, v3
; GCN-FLUSH-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5]
; GCN-FLUSH-NEXT: s_endpgm
%load = load <4 x float>, ptr addrspace(1) %arg, align 16
@@ -435,27 +438,30 @@ define amdgpu_kernel void @div_v4_c_by_minus_x_25ulp(ptr addrspace(1) %arg) {
; GCN-FLUSH: ; %bb.0:
; GCN-FLUSH-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN-FLUSH-NEXT: v_mov_b32_e32 v0, 0x6f800000
-; GCN-FLUSH-NEXT: v_mov_b32_e32 v2, 0x2f800000
; GCN-FLUSH-NEXT: v_mov_b32_e32 v4, 0
; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
; GCN-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-FLUSH-NEXT: v_cmp_gt_f32_e64 vcc, |s0|, v0
-; GCN-FLUSH-NEXT: v_cndmask_b32_e32 v3, 1.0, v2, vcc
-; GCN-FLUSH-NEXT: v_cmp_gt_f32_e64 vcc, |s3|, v0
-; GCN-FLUSH-NEXT: v_mul_f32_e32 v5, s0, v3
-; GCN-FLUSH-NEXT: v_mul_f32_e64 v6, -s0, v3
-; GCN-FLUSH-NEXT: v_cndmask_b32_e32 v7, 1.0, v2, vcc
-; GCN-FLUSH-NEXT: v_rcp_f32_e32 v5, v5
-; GCN-FLUSH-NEXT: v_rcp_f32_e32 v6, v6
-; GCN-FLUSH-NEXT: v_mul_f32_e32 v0, s3, v7
-; GCN-FLUSH-NEXT: v_rcp_f32_e32 v8, v0
+; GCN-FLUSH-NEXT: v_cmp_gt_f32_e64 s[6:7], |s0|, v0
+; GCN-FLUSH-NEXT: s_and_b64 s[6:7], s[6:7], exec
; GCN-FLUSH-NEXT: v_rcp_f32_e64 v1, -s1
+; GCN-FLUSH-NEXT: v_cmp_gt_f32_e64 s[8:9], |s3|, v0
+; GCN-FLUSH-NEXT: s_cselect_b32 s1, 0x2f800000, 1.0
+; GCN-FLUSH-NEXT: s_and_b64 s[6:7], s[8:9], exec
+; GCN-FLUSH-NEXT: v_mov_b32_e32 v0, s1
+; GCN-FLUSH-NEXT: s_cselect_b32 s6, 0x2f800000, 1.0
+; GCN-FLUSH-NEXT: v_mul_f32_e32 v2, s0, v0
+; GCN-FLUSH-NEXT: v_mul_f32_e64 v0, -s0, v0
+; GCN-FLUSH-NEXT: v_rcp_f32_e32 v2, v2
+; GCN-FLUSH-NEXT: v_rcp_f32_e32 v0, v0
+; GCN-FLUSH-NEXT: v_mov_b32_e32 v3, s6
+; GCN-FLUSH-NEXT: v_mul_f32_e32 v3, s3, v3
+; GCN-FLUSH-NEXT: v_rcp_f32_e32 v3, v3
+; GCN-FLUSH-NEXT: v_sub_f32_e32 v0, v0, v2
; GCN-FLUSH-NEXT: v_rcp_f32_e32 v2, s2
-; GCN-FLUSH-NEXT: v_sub_f32_e32 v0, v6, v5
-; GCN-FLUSH-NEXT: v_mul_f32_e32 v0, v3, v0
-; GCN-FLUSH-NEXT: v_add_f32_e32 v3, v8, v8
-; GCN-FLUSH-NEXT: v_mul_f32_e32 v3, v7, v3
+; GCN-FLUSH-NEXT: v_mul_f32_e32 v0, s1, v0
+; GCN-FLUSH-NEXT: v_add_f32_e32 v3, v3, v3
+; GCN-FLUSH-NEXT: v_mul_f32_e32 v3, s6, v3
; GCN-FLUSH-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5]
; GCN-FLUSH-NEXT: s_endpgm
%load = load <4 x float>, ptr addrspace(1) %arg, align 16
@@ -490,18 +496,19 @@ define amdgpu_kernel void @div_v_by_x_25ulp(ptr addrspace(1) %arg, float %num) {
; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN-FLUSH-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN-FLUSH-NEXT: v_mov_b32_e32 v0, 0x6f800000
-; GCN-FLUSH-NEXT: v_mov_b32_e32 v1, 0x2f800000
-; GCN-FLUSH-NEXT: v_mov_b32_e32 v2, 0
+; GCN-FLUSH-NEXT: v_mov_b32_e32 v1, 0
; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-FLUSH-NEXT: s_load_dword s2, s[0:1], 0x0
+; GCN-FLUSH-NEXT: s_load_dword s5, s[0:1], 0x0
; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-FLUSH-NEXT: v_cmp_gt_f32_e64 vcc, |s2|, v0
-; GCN-FLUSH-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc
-; GCN-FLUSH-NEXT: v_mul_f32_e32 v1, s2, v0
-; GCN-FLUSH-NEXT: v_rcp_f32_e32 v1, v1
-; GCN-FLUSH-NEXT: v_mul_f32_e32 v1, s4, v1
-; GCN-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v1
-; GCN-FLUSH-NEXT: global_store_dword v2, v0, s[0:1]
+; GCN-FLUSH-NEXT: v_cmp_gt_f32_e64 s[2:3], |s5|, v0
+; GCN-FLUSH-NEXT: s_and_b64 s[2:3], s[2:3], exec
+; GCN-FLUSH-NEXT: s_cselect_b32 s2, 0x2f800000, 1.0
+; GCN-FLUSH-NEXT: v_mov_b32_e32 v0, s2
+; GCN-FLUSH-NEXT: v_mul_f32_e32 v0, s5, v0
+; GCN-FLUSH-NEXT: v_rcp_f32_e32 v0, v0
+; GCN-FLUSH-NEXT: v_mul_f32_e32 v0, s4, v0
+; GCN-FLUSH-NEXT: v_mul_f32_e32 v0, s2, v0
+; GCN-FLUSH-NEXT: global_store_dword v1, v0, s[0:1]
; GCN-FLUSH-NEXT: s_endpgm
%load = load float, ptr addrspace(1) %arg, align 4
%div = fdiv float %num, %load, !fpmath !0
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
index b32630a97b3ad0..bb66f9e093f203 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
@@ -490,11 +490,13 @@ define amdgpu_ps half @fneg_fadd_0_f16(half inreg %tmp2, half inreg %tmp6, <4 x
; SI-SAFE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
; SI-SAFE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0
; SI-SAFE-NEXT: v_mad_f32 v0, v0, 0, 0
+; SI-SAFE-NEXT: v_xor_b32_e32 v2, 0x80000000, v0
; SI-SAFE-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1
-; SI-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v1, vcc
-; SI-SAFE-NEXT: v_mov_b32_e32 v1, 0x7fc00000
+; SI-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; SI-SAFE-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0
-; SI-SAFE-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc
+; SI-SAFE-NEXT: s_and_b64 s[0:1], vcc, exec
+; SI-SAFE-NEXT: s_cselect_b32 s0, 0, 0x7fc00000
+; SI-SAFE-NEXT: v_mov_b32_e32 v0, s0
; SI-SAFE-NEXT: ; return to shader part epilog
;
; SI-NSZ-LABEL: fneg_fadd_0_f16:
@@ -520,9 +522,10 @@ define amdgpu_ps half @fneg_fadd_0_f16(half inreg %tmp2, half inreg %tmp6, <4 x
; SI-NSZ-NEXT: v_mul_f32_e32 v0, 0x80000000, v0
; SI-NSZ-NEXT: v_cmp_nlt_f32_e64 vcc, -v0, v1
; SI-NSZ-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; SI-NSZ-NEXT: v_mov_b32_e32 v1, 0x7fc00000
; SI-NSZ-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0
-; SI-NSZ-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc
+; SI-NSZ-NEXT: s_and_b64 s[0:1], vcc, exec
+; SI-NSZ-NEXT: s_cselect_b32 s0, 0, 0x7fc00000
+; SI-NSZ-NEXT: v_mov_b32_e32 v0, s0
; SI-NSZ-NEXT: ; return to shader part epilog
;
; VI-SAFE-LABEL: fneg_fadd_0_f16:
@@ -599,18 +602,18 @@ define amdgpu_ps half @fneg_fadd_0_nsz_f16(half inreg %tmp2, half inreg %tmp6, <
; SI-SAFE: ; %bb.0: ; %.entry
; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, s0
; SI-SAFE-NEXT: s_brev_b32 s0, 1
-; SI-SAFE-NEXT: v_mov_b32_e32 v1, 0x7fc00000
; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-SAFE-NEXT: v_min_legacy_f32_e32 v0, 0, v0
; SI-SAFE-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v0
-; SI-SAFE-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc
+; SI-SAFE-NEXT: s_and_b64 s[0:1], vcc, exec
+; SI-SAFE-NEXT: s_cselect_b32 s0, 0, 0x7fc00000
+; SI-SAFE-NEXT: v_mov_b32_e32 v0, s0
; SI-SAFE-NEXT: ; return to shader part epilog
;
; SI-NSZ-LABEL: fneg_fadd_0_nsz_f16:
; SI-NSZ: ; %bb.0: ; %.entry
; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, s1
; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, s0
-; SI-NSZ-NEXT: v_mov_b32_e32 v2, 0x7fc00000
; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NSZ-NEXT: v_rcp_f32_e32 v0, v0
@@ -618,7 +621,9 @@ define amdgpu_ps half @fneg_fadd_0_nsz_f16(half inreg %tmp2, half inreg %tmp6, <
; SI-NSZ-NEXT: v_cmp_nlt_f32_e64 vcc, -v0, v1
; SI-NSZ-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; SI-NSZ-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0
-; SI-NSZ-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; SI-NSZ-NEXT: s_and_b64 s[0:1], vcc, exec
+; SI-NSZ-NEXT: s_cselect_b32 s0, 0, 0x7fc00000
+; SI-NSZ-NEXT: v_mov_b32_e32 v0, s0
; SI-NSZ-NEXT: ; return to shader part epilog
;
; VI-SAFE-LABEL: fneg_fadd_0_nsz_f16:
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
index 74e2b9ea714258..951df5dc37d7fd 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
@@ -193,12 +193,14 @@ define amdgpu_ps float @fneg_fadd_0_f32(float inreg %tmp2, float inreg %tmp6, <4
; SI-SAFE-NEXT: v_div_fmas_f32 v0, v0, v1, v3
; SI-SAFE-NEXT: v_div_fixup_f32 v0, v0, s1, 1.0
; SI-SAFE-NEXT: v_mad_f32 v0, v0, 0, 0
-; SI-SAFE-NEXT: v_mov_b32_e32 v1, s0
+; SI-SAFE-NEXT: v_xor_b32_e32 v1, 0x80000000, v0
+; SI-SAFE-NEXT: v_mov_b32_e32 v2, s0
; SI-SAFE-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v0
-; SI-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v1, vcc
-; SI-SAFE-NEXT: v_mov_b32_e32 v1, 0x7fc00000
+; SI-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; SI-SAFE-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0
-; SI-SAFE-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc
+; SI-SAFE-NEXT: s_and_b64 s[0:1], vcc, exec
+; SI-SAFE-NEXT: s_cselect_b32 s0, 0, 0x7fc00000
+; SI-SAFE-NEXT: v_mov_b32_e32 v0, s0
; SI-SAFE-NEXT: ; return to shader part epilog
;
; SI-NSZ-LABEL: fneg_fadd_0_f32:
@@ -217,12 +219,14 @@ define amdgpu_ps float @fneg_fadd_0_f32(float inreg %tmp2, float inreg %tmp6, <4
; SI-NSZ-NEXT: v_div_fmas_f32 v0, v0, v1, v3
; SI-NSZ-NEXT: v_div_fixup_f32 v0, v0, s1, 1.0
; SI-NSZ-NEXT: v_mul_f32_e32 v0, 0, v0
-; SI-NSZ-NEXT: v_mov_b32_e32 v1, s0
+; SI-NSZ-NEXT: v_xor_b32_e32 v1, 0x80000000, v0
+; SI-NSZ-NEXT: v_mov_b32_e32 v2, s0
; SI-NSZ-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v0
-; SI-NSZ-NEXT: v_cndmask_b32_e64 v0, -v0, v1, vcc
-; SI-NSZ-NEXT: v_mov_b32_e32 v1, 0x7fc00000
+; SI-NSZ-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; SI-NSZ-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0
-; SI-NSZ-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc
+; SI-NSZ-NEXT: s_and_b64 s[0:1], vcc, exec
+; SI-NSZ-NEXT: s_cselect_b32 s0, 0, 0x7fc00000
+; SI-NSZ-NEXT: v_mov_b32_e32 v0, s0
; SI-NSZ-NEXT: ; return to shader part epilog
;
; VI-SAFE-LABEL: fneg_fadd_0_f32:
@@ -239,14 +243,16 @@ define amdgpu_ps float @fneg_fadd_0_f32(float inreg %tmp2, float inreg %tmp6, <4
; VI-SAFE-NEXT: v_fma_f32 v0, -v0, v3, v1
; VI-SAFE-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; VI-SAFE-NEXT: v_div_fmas_f32 v0, v0, v2, v3
-; VI-SAFE-NEXT: v_mov_b32_e32 v2, s0
-; VI-SAFE-NEXT: v_mov_b32_e32 v1, 0x7fc00000
+; VI-SAFE-NEXT: v_mov_b32_e32 v1, s0
; VI-SAFE-NEXT: v_div_fixup_f32 v0, v0, s1, 1.0
; VI-SAFE-NEXT: v_mad_f32 v0, v0, 0, 0
+; VI-SAFE-NEXT: v_xor_b32_e32 v2, 0x80000000, v0
; VI-SAFE-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v0
-; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v2, vcc
+; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-SAFE-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0
-; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc
+; VI-SAFE-NEXT: s_and_b64 s[0:1], vcc, exec
+; VI-SAFE-NEXT: s_cselect_b32 s0, 0, 0x7fc00000
+; VI-SAFE-NEXT: v_mov_b32_e32 v0, s0
; VI-SAFE-NEXT: ; return to shader part epilog
;
; VI-NSZ-LABEL: fneg_fadd_0_f32:
@@ -263,14 +269,16 @@ define amdgpu_ps float @fneg_fadd_0_f32(float inreg %tmp2, float inreg %tmp6, <4
; VI-NSZ-NEXT: v_fma_f32 v0, -v0, v3, v1
; VI-NSZ-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; VI-NSZ-NEXT: v_div_fmas_f32 v0, v0, v2, v3
-; VI-NSZ-NEXT: v_mov_b32_e32 v2, s0
-; VI-NSZ-NEXT: v_mov_b32_e32 v1, 0x7fc00000
+; VI-NSZ-NEXT: v_mov_b32_e32 v1, s0
; VI-NSZ-NEXT: v_div_fixup_f32 v0, v0, s1, 1.0
; VI-NSZ-NEXT: v_mul_f32_e32 v0, 0, v0
+; VI-NSZ-NEXT: v_xor_b32_e32 v2, 0x80000000, v0
; VI-NSZ-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v0
-; VI-NSZ-NEXT: v_cndmask_b32_e64 v0, -v0, v2, vcc
+; VI-NSZ-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-NSZ-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0
-; VI-NSZ-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc
+; VI-NSZ-NEXT: s_and_b64 s[0:1], vcc, exec
+; VI-NSZ-NEXT: s_cselect_b32 s0, 0, 0x7fc00000
+; VI-NSZ-NEXT: v_mov_b32_e32 v0, s0
; VI-NSZ-NEXT: ; return to shader part epilog
.entry:
%tmp7 = fdiv float 1.000000e+00, %tmp6
@@ -293,9 +301,10 @@ define amdgpu_ps float @fneg_fadd_0_nsz_f32(float inreg %tmp2, float inreg %tmp6
; SI-SAFE: ; %bb.0: ; %.entry
; SI-SAFE-NEXT: v_min_legacy_f32_e64 v0, 0, s0
; SI-SAFE-NEXT: s_brev_b32 s0, 1
-; SI-SAFE-NEXT: v_mov_b32_e32 v1, 0x7fc00000
; SI-SAFE-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v0
-; SI-SAFE-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc
+; SI-SAFE-NEXT: s_and_b64 s[0:1], vcc, exec
+; SI-SAFE-NEXT: s_cselect_b32 s0, 0, 0x7fc00000
+; SI-SAFE-NEXT: v_mov_b32_e32 v0, s0
; SI-SAFE-NEXT: ; return to shader part epilog
;
; GCN-NSZ-LABEL: fneg_fadd_0_nsz_f32:
@@ -303,11 +312,13 @@ define amdgpu_ps float @fneg_fadd_0_nsz_f32(float inreg %tmp2, float inreg %tmp6
; GCN-NSZ-NEXT: v_rcp_f32_e32 v0, s1
; GCN-NSZ-NEXT: v_mov_b32_e32 v1, s0
; GCN-NSZ-NEXT: v_mul_f32_e32 v0, 0, v0
+; GCN-NSZ-NEXT: v_xor_b32_e32 v2, 0x80000000, v0
; GCN-NSZ-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v0
-; GCN-NSZ-NEXT: v_cndmask_b32_e64 v0, -v0, v1, vcc
-; GCN-NSZ-NEXT: v_mov_b32_e32 v1, 0x7fc00000
+; GCN-NSZ-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GCN-NSZ-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0
-; GCN-NSZ-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc
+; GCN-NSZ-NEXT: s_and_b64 s[0:1], vcc, exec
+; GCN-NSZ-NEXT: s_cselect_b32 s0, 0, 0x7fc00000
+; GCN-NSZ-NEXT: v_mov_b32_e32 v0, s0
; GCN-NSZ-NEXT: ; return to shader part epilog
;
; VI-SAFE-LABEL: fneg_fadd_0_nsz_f32:
@@ -316,11 +327,13 @@ define amdgpu_ps float @fneg_fadd_0_nsz_f32(float inreg %tmp2, float inreg %tmp6
; VI-SAFE-NEXT: v_mov_b32_e32 v1, s0
; VI-SAFE-NEXT: v_mul_f32_e32 v0, 0, v0
; VI-SAFE-NEXT: v_add_f32_e32 v0, 0, v0
+; VI-SAFE-NEXT: v_xor_b32_e32 v2, 0x80000000, v0
; VI-SAFE-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v0
-; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v1, vcc
-; VI-SAFE-NEXT: v_mov_b32_e32 v1, 0x7fc00000
+; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-SAFE-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0
-; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc
+; VI-SAFE-NEXT: s_and_b64 s[0:1], vcc, exec
+; VI-SAFE-NEXT: s_cselect_b32 s0, 0, 0x7fc00000
+; VI-SAFE-NEXT: v_mov_b32_e32 v0, s0
; VI-SAFE-NEXT: ; return to shader part epilog
.entry:
%tmp7 = fdiv afn float 1.000000e+00, %tmp6
@@ -2801,13 +2814,14 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_f32(float %arg, i1 %
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_bitcmp1_b32 s1, 0
-; SI-NEXT: v_mov_b32_e32 v0, s0
-; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
-; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[0:1]
-; SI-NEXT: v_cndmask_b32_e64 v2, -v0, 0, s[0:1]
+; SI-NEXT: s_and_b32 s1, 1, s1
+; SI-NEXT: s_cselect_b32 s0, 0, s0
+; SI-NEXT: s_xor_b32 s0, s0, 0x80000000
+; SI-NEXT: s_cmp_eq_u32 s1, 1
+; SI-NEXT: s_cselect_b32 s0, 0, s0
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: v_mov_b32_e32 v1, s3
+; SI-NEXT: v_mov_b32_e32 v2, s0
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_endpgm
;
@@ -2815,13 +2829,14 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_f32(float %arg, i1 %
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bitcmp1_b32 s1, 0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
-; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[0:1]
-; VI-NEXT: v_cndmask_b32_e64 v2, -v0, 0, s[0:1]
+; VI-NEXT: s_and_b32 s1, 1, s1
+; VI-NEXT: s_cselect_b32 s0, 0, s0
+; VI-NEXT: s_xor_b32 s0, s0, 0x80000000
+; VI-NEXT: s_cmp_eq_u32 s1, 1
+; VI-NEXT: s_cselect_b32 s0, 0, s0
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%i = select i1 %arg1, float 0.0, float %arg
@@ -3019,17 +3034,16 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_f64(double %arg, i1
; SI-NEXT: s_load_dword s4, s[2:3], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; SI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0xd
-; SI-NEXT: v_bfrev_b32_e32 v0, 1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_bitcmp1_b32 s4, 0
-; SI-NEXT: s_cselect_b64 s[4:5], -1, 0
-; SI-NEXT: v_mov_b32_e32 v1, s1
-; SI-NEXT: s_and_b64 s[6:7], s[4:5], exec
-; SI-NEXT: v_cndmask_b32_e64 v0, -v1, v0, s[4:5]
+; SI-NEXT: s_and_b32 s4, 1, s4
+; SI-NEXT: s_xor_b32 s1, s1, 0x80000000
+; SI-NEXT: s_cmp_eq_u32 s4, 1
+; SI-NEXT: s_cselect_b32 s1, 0x80000000, s1
; SI-NEXT: s_cselect_b32 s0, 0, s0
+; SI-NEXT: s_cselect_b32 s1, 0, s1
; SI-NEXT: v_mov_b32_e32 v2, s2
-; SI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[4:5]
; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_mov_b32_e32 v3, s3
; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; SI-NEXT: s_endpgm
@@ -3039,17 +3053,16 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_f64(double %arg, i1
; VI-NEXT: s_load_dword s4, s[2:3], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x34
-; VI-NEXT: v_bfrev_b32_e32 v0, 1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bitcmp1_b32 s4, 0
-; VI-NEXT: s_cselect_b64 s[4:5], -1, 0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_and_b64 s[6:7], s[4:5], exec
-; VI-NEXT: v_cndmask_b32_e64 v0, -v1, v0, s[4:5]
+; VI-NEXT: s_and_b32 s4, 1, s4
+; VI-NEXT: s_xor_b32 s1, s1, 0x80000000
+; VI-NEXT: s_cmp_eq_u32 s4, 1
+; VI-NEXT: s_cselect_b32 s1, 0x80000000, s1
; VI-NEXT: s_cselect_b32 s0, 0, s0
+; VI-NEXT: s_cselect_b32 s1, 0, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[4:5]
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
@@ -3087,7 +3100,8 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_f16(half %arg, i1 %a
; SI-NEXT: s_bitcmp1_b32 s4, 16
; SI-NEXT: s_cselect_b64 s[2:3], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[2:3]
-; SI-NEXT: v_cndmask_b32_e64 v0, -v0, 0, s[2:3]
+; SI-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[2:3]
; SI-NEXT: v_cvt_f16_f32_e32 v2, v0
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
@@ -3218,17 +3232,18 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_v2f32(<2 x float> %a
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
-; SI-NEXT: v_bfrev_b32_e32 v0, 1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_bitcmp1_b32 s6, 0
-; SI-NEXT: v_mov_b32_e32 v1, s4
-; SI-NEXT: s_cselect_b64 s[2:3], -1, 0
-; SI-NEXT: v_cndmask_b32_e64 v2, -v1, v0, s[2:3]
-; SI-NEXT: v_mov_b32_e32 v1, s5
-; SI-NEXT: v_cndmask_b32_e64 v0, -v1, v0, s[2:3]
-; SI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[2:3]
-; SI-NEXT: v_cndmask_b32_e64 v0, v2, 0, s[2:3]
+; SI-NEXT: s_and_b32 s2, 1, s6
+; SI-NEXT: s_xor_b32 s3, s5, 0x80000000
+; SI-NEXT: s_xor_b32 s4, s4, 0x80000000
+; SI-NEXT: s_cmp_eq_u32 s2, 1
+; SI-NEXT: s_cselect_b32 s2, 0x80000000, s4
+; SI-NEXT: s_cselect_b32 s3, 0x80000000, s3
+; SI-NEXT: s_cselect_b32 s3, 0, s3
+; SI-NEXT: s_cselect_b32 s2, 0, s2
; SI-NEXT: v_mov_b32_e32 v3, s1
+; SI-NEXT: v_mov_b32_e32 v0, s2
+; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_mov_b32_e32 v2, s0
; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; SI-NEXT: s_endpgm
@@ -3237,17 +3252,18 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_v2f32(<2 x float> %a
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
-; VI-NEXT: v_bfrev_b32_e32 v0, 1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bitcmp1_b32 s6, 0
-; VI-NEXT: v_mov_b32_e32 v1, s4
-; VI-NEXT: s_cselect_b64 s[2:3], -1, 0
-; VI-NEXT: v_cndmask_b32_e64 v2, -v1, v0, s[2:3]
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_cndmask_b32_e64 v0, -v1, v0, s[2:3]
-; VI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[2:3]
-; VI-NEXT: v_cndmask_b32_e64 v0, v2, 0, s[2:3]
+; VI-NEXT: s_and_b32 s2, 1, s6
+; VI-NEXT: s_xor_b32 s3, s5, 0x80000000
+; VI-NEXT: s_xor_b32 s4, s4, 0x80000000
+; VI-NEXT: s_cmp_eq_u32 s2, 1
+; VI-NEXT: s_cselect_b32 s2, 0x80000000, s4
+; VI-NEXT: s_cselect_b32 s3, 0x80000000, s3
+; VI-NEXT: s_cselect_b32 s3, 0, s3
+; VI-NEXT: s_cselect_b32 s2, 0, s2
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
@@ -3281,13 +3297,14 @@ define amdgpu_kernel void @s_fabs_select_infloop_regression_f32(float %arg, i1 %
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_bitcmp1_b32 s1, 0
-; SI-NEXT: v_mov_b32_e32 v0, s0
-; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
-; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[0:1]
-; SI-NEXT: v_cndmask_b32_e64 v2, |v0|, 0, s[0:1]
+; SI-NEXT: s_and_b32 s1, 1, s1
+; SI-NEXT: s_cselect_b32 s0, 0, s0
+; SI-NEXT: s_bitset0_b32 s0, 31
+; SI-NEXT: s_cmp_eq_u32 s1, 1
+; SI-NEXT: s_cselect_b32 s0, 0, s0
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: v_mov_b32_e32 v1, s3
+; SI-NEXT: v_mov_b32_e32 v2, s0
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_endpgm
;
@@ -3295,13 +3312,14 @@ define amdgpu_kernel void @s_fabs_select_infloop_regression_f32(float %arg, i1 %
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bitcmp1_b32 s1, 0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
-; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[0:1]
-; VI-NEXT: v_cndmask_b32_e64 v2, |v0|, 0, s[0:1]
+; VI-NEXT: s_and_b32 s1, 1, s1
+; VI-NEXT: s_cselect_b32 s0, 0, s0
+; VI-NEXT: s_bitset0_b32 s0, 31
+; VI-NEXT: s_cmp_eq_u32 s1, 1
+; VI-NEXT: s_cselect_b32 s0, 0, s0
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%i = select i1 %arg1, float 0.0, float %arg
@@ -3331,13 +3349,14 @@ define amdgpu_kernel void @s_fneg_fabs_select_infloop_regression(float %arg, i1
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_bitcmp1_b32 s1, 0
-; SI-NEXT: v_mov_b32_e32 v0, s0
-; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
-; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[0:1]
-; SI-NEXT: v_cndmask_b32_e64 v2, -|v0|, 0, s[0:1]
+; SI-NEXT: s_and_b32 s1, 1, s1
+; SI-NEXT: s_cselect_b32 s0, 0, s0
+; SI-NEXT: s_bitset1_b32 s0, 31
+; SI-NEXT: s_cmp_eq_u32 s1, 1
+; SI-NEXT: s_cselect_b32 s0, 0, s0
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: v_mov_b32_e32 v1, s3
+; SI-NEXT: v_mov_b32_e32 v2, s0
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_endpgm
;
@@ -3345,13 +3364,14 @@ define amdgpu_kernel void @s_fneg_fabs_select_infloop_regression(float %arg, i1
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bitcmp1_b32 s1, 0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
-; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[0:1]
-; VI-NEXT: v_cndmask_b32_e64 v2, -|v0|, 0, s[0:1]
+; VI-NEXT: s_and_b32 s1, 1, s1
+; VI-NEXT: s_cselect_b32 s0, 0, s0
+; VI-NEXT: s_bitset1_b32 s0, 31
+; VI-NEXT: s_cmp_eq_u32 s1, 1
+; VI-NEXT: s_cselect_b32 s0, 0, s0
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%i = select i1 %arg1, float 0.0, float %arg
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
index 98b17bbaa0a959..26b6afb0bbdf1e 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
@@ -1479,18 +1479,16 @@ define amdgpu_kernel void @multiple_uses_fneg_select_f64(double %x, double %y, i
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x6
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_bitcmp1_b32 s8, 0
-; GFX7-NEXT: s_cselect_b64 vcc, -1, 0
-; GFX7-NEXT: s_and_b64 s[6:7], vcc, exec
-; GFX7-NEXT: v_mov_b32_e32 v0, s3
-; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: s_and_b32 s6, 1, s8
+; GFX7-NEXT: s_cselect_b32 s7, s1, s3
+; GFX7-NEXT: s_xor_b32 s7, s7, 0x80000000
+; GFX7-NEXT: s_cmp_eq_u32 s6, 1
; GFX7-NEXT: s_cselect_b32 s1, s1, s3
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; GFX7-NEXT: s_cselect_b32 s0, s0, s2
-; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: s_cselect_b32 s1, s7, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s4
-; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, -v0, vcc
; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: v_mov_b32_e32 v3, s5
; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT: s_endpgm
@@ -1502,38 +1500,35 @@ define amdgpu_kernel void @multiple_uses_fneg_select_f64(double %x, double %y, i
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x18
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_bitcmp1_b32 s8, 0
-; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
-; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
-; GFX9-NEXT: v_mov_b32_e32 v0, s3
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: s_and_b32 s6, 1, s8
+; GFX9-NEXT: s_cselect_b32 s7, s1, s3
+; GFX9-NEXT: s_xor_b32 s7, s7, 0x80000000
+; GFX9-NEXT: s_cmp_eq_u32 s6, 1
; GFX9-NEXT: s_cselect_b32 s1, s1, s3
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; GFX9-NEXT: s_cselect_b32 s0, s0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -v0, vcc
+; GFX9-NEXT: s_cselect_b32 s1, s7, s1
; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: multiple_uses_fneg_select_f64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x2
-; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
; GFX11-NEXT: s_load_b32 s8, s[2:3], 0x10
+; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x18
-; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v0, s5
-; GFX11-NEXT: s_bitcmp1_b32 s8, 0
-; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, s7, v0, vcc_lo
-; GFX11-NEXT: s_and_b32 s2, vcc_lo, exec_lo
+; GFX11-NEXT: s_and_b32 s2, 1, s8
+; GFX11-NEXT: s_cselect_b32 s3, s5, s7
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_xor_b32 s3, s3, 0x80000000
+; GFX11-NEXT: s_cmp_eq_u32 s2, 1
; GFX11-NEXT: s_cselect_b32 s2, s5, s7
-; GFX11-NEXT: s_cselect_b32 s3, s4, s6
-; GFX11-NEXT: v_cndmask_b32_e64 v1, s2, -v0, vcc_lo
-; GFX11-NEXT: v_mov_b32_e32 v0, s3
+; GFX11-NEXT: s_cselect_b32 s4, s4, s6
+; GFX11-NEXT: s_cselect_b32 s2, s3, s2
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-NEXT: v_mov_b32_e32 v0, s4
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
index f1f4abe580c002..834e3b204cac6d 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
@@ -11,70 +11,43 @@
define amdgpu_kernel void @extract_w_offset(ptr addrspace(1) %out, i32 %in) {
; GENERIC-LABEL: extract_w_offset:
; GENERIC: ; %bb.0: ; %entry
-; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GENERIC-NEXT: s_load_dword s4, s[2:3], 0xb
+; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GENERIC-NEXT: s_mov_b32 s3, 0xf000
-; GENERIC-NEXT: s_mov_b32 s2, -1
-; GENERIC-NEXT: v_mov_b32_e32 v0, 0x40400000
-; GENERIC-NEXT: v_mov_b32_e32 v1, 0x40a00000
-; GENERIC-NEXT: v_mov_b32_e32 v2, 0x40c00000
-; GENERIC-NEXT: v_mov_b32_e32 v3, 0x40e00000
-; GENERIC-NEXT: v_mov_b32_e32 v4, 0x41000000
-; GENERIC-NEXT: v_mov_b32_e32 v5, 0x41100000
-; GENERIC-NEXT: v_mov_b32_e32 v6, 0x41200000
-; GENERIC-NEXT: v_mov_b32_e32 v7, 0x41300000
-; GENERIC-NEXT: v_mov_b32_e32 v8, 0x41400000
-; GENERIC-NEXT: v_mov_b32_e32 v9, 0x41500000
-; GENERIC-NEXT: v_mov_b32_e32 v10, 0x41600000
-; GENERIC-NEXT: v_mov_b32_e32 v11, 0x41700000
-; GENERIC-NEXT: v_mov_b32_e32 v12, 0x41800000
; GENERIC-NEXT: s_waitcnt lgkmcnt(0)
-; GENERIC-NEXT: s_add_i32 s6, s4, 1
-; GENERIC-NEXT: s_cmp_eq_u32 s6, 1
-; GENERIC-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e64 v13, 1.0, 2.0, s[4:5]
-; GENERIC-NEXT: s_cmp_lg_u32 s6, 2
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v13, vcc
-; GENERIC-NEXT: s_cmp_lg_u32 s6, 3
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v0, 4.0, v0, vcc
-; GENERIC-NEXT: s_cmp_lg_u32 s6, 4
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; GENERIC-NEXT: s_cmp_lg_u32 s6, 5
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; GENERIC-NEXT: s_cmp_lg_u32 s6, 6
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
-; GENERIC-NEXT: s_cmp_lg_u32 s6, 7
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
-; GENERIC-NEXT: s_cmp_lg_u32 s6, 8
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
-; GENERIC-NEXT: s_cmp_lg_u32 s6, 9
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
-; GENERIC-NEXT: s_cmp_lg_u32 s6, 10
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc
-; GENERIC-NEXT: s_cmp_lg_u32 s6, 11
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc
-; GENERIC-NEXT: s_cmp_lg_u32 s6, 12
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v0, v9, v0, vcc
-; GENERIC-NEXT: s_cmp_lg_u32 s6, 13
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc
-; GENERIC-NEXT: s_cmp_lg_u32 s6, 14
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v0, v11, v0, vcc
-; GENERIC-NEXT: s_cmp_lg_u32 s6, 15
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v0, v12, v0, vcc
+; GENERIC-NEXT: s_add_i32 s4, s4, 1
+; GENERIC-NEXT: s_cmp_eq_u32 s4, 1
+; GENERIC-NEXT: s_cselect_b32 s2, 2.0, 1.0
+; GENERIC-NEXT: s_cmp_lg_u32 s4, 2
+; GENERIC-NEXT: s_cselect_b32 s2, s2, 0x40400000
+; GENERIC-NEXT: s_cmp_lg_u32 s4, 3
+; GENERIC-NEXT: s_cselect_b32 s2, s2, 4.0
+; GENERIC-NEXT: s_cmp_lg_u32 s4, 4
+; GENERIC-NEXT: s_cselect_b32 s2, s2, 0x40a00000
+; GENERIC-NEXT: s_cmp_lg_u32 s4, 5
+; GENERIC-NEXT: s_cselect_b32 s2, s2, 0x40c00000
+; GENERIC-NEXT: s_cmp_lg_u32 s4, 6
+; GENERIC-NEXT: s_cselect_b32 s2, s2, 0x40e00000
+; GENERIC-NEXT: s_cmp_lg_u32 s4, 7
+; GENERIC-NEXT: s_cselect_b32 s2, s2, 0x41000000
+; GENERIC-NEXT: s_cmp_lg_u32 s4, 8
+; GENERIC-NEXT: s_cselect_b32 s2, s2, 0x41100000
+; GENERIC-NEXT: s_cmp_lg_u32 s4, 9
+; GENERIC-NEXT: s_cselect_b32 s2, s2, 0x41200000
+; GENERIC-NEXT: s_cmp_lg_u32 s4, 10
+; GENERIC-NEXT: s_cselect_b32 s2, s2, 0x41300000
+; GENERIC-NEXT: s_cmp_lg_u32 s4, 11
+; GENERIC-NEXT: s_cselect_b32 s2, s2, 0x41400000
+; GENERIC-NEXT: s_cmp_lg_u32 s4, 12
+; GENERIC-NEXT: s_cselect_b32 s2, s2, 0x41500000
+; GENERIC-NEXT: s_cmp_lg_u32 s4, 13
+; GENERIC-NEXT: s_cselect_b32 s2, s2, 0x41600000
+; GENERIC-NEXT: s_cmp_lg_u32 s4, 14
+; GENERIC-NEXT: s_cselect_b32 s2, s2, 0x41700000
+; GENERIC-NEXT: s_cmp_lg_u32 s4, 15
+; GENERIC-NEXT: s_cselect_b32 s4, s2, 0x41800000
+; GENERIC-NEXT: s_mov_b32 s2, -1
+; GENERIC-NEXT: v_mov_b32_e32 v0, s4
; GENERIC-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GENERIC-NEXT: s_endpgm
;
@@ -625,69 +598,42 @@ entry:
define amdgpu_kernel void @extract_wo_offset(ptr addrspace(1) %out, i32 %in) {
; GENERIC-LABEL: extract_wo_offset:
; GENERIC: ; %bb.0: ; %entry
+; GENERIC-NEXT: s_load_dword s4, s[2:3], 0xb
; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
-; GENERIC-NEXT: s_load_dword s6, s[2:3], 0xb
; GENERIC-NEXT: s_mov_b32 s3, 0xf000
-; GENERIC-NEXT: s_mov_b32 s2, -1
-; GENERIC-NEXT: v_mov_b32_e32 v0, 0x40400000
-; GENERIC-NEXT: v_mov_b32_e32 v1, 0x40a00000
-; GENERIC-NEXT: v_mov_b32_e32 v2, 0x40c00000
-; GENERIC-NEXT: v_mov_b32_e32 v3, 0x40e00000
-; GENERIC-NEXT: v_mov_b32_e32 v4, 0x41000000
-; GENERIC-NEXT: v_mov_b32_e32 v5, 0x41100000
-; GENERIC-NEXT: v_mov_b32_e32 v6, 0x41200000
-; GENERIC-NEXT: v_mov_b32_e32 v7, 0x41300000
-; GENERIC-NEXT: v_mov_b32_e32 v8, 0x41400000
-; GENERIC-NEXT: v_mov_b32_e32 v9, 0x41500000
-; GENERIC-NEXT: v_mov_b32_e32 v10, 0x41600000
-; GENERIC-NEXT: v_mov_b32_e32 v11, 0x41700000
-; GENERIC-NEXT: v_mov_b32_e32 v12, 0x41800000
; GENERIC-NEXT: s_waitcnt lgkmcnt(0)
-; GENERIC-NEXT: s_cmp_eq_u32 s6, 1
-; GENERIC-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e64 v13, 1.0, 2.0, s[4:5]
-; GENERIC-NEXT: s_cmp_lg_u32 s6, 2
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v13, vcc
-; GENERIC-NEXT: s_cmp_lg_u32 s6, 3
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v0, 4.0, v0, vcc
-; GENERIC-NEXT: s_cmp_lg_u32 s6, 4
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; GENERIC-NEXT: s_cmp_lg_u32 s6, 5
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; GENERIC-NEXT: s_cmp_lg_u32 s6, 6
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
-; GENERIC-NEXT: s_cmp_lg_u32 s6, 7
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
-; GENERIC-NEXT: s_cmp_lg_u32 s6, 8
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
-; GENERIC-NEXT: s_cmp_lg_u32 s6, 9
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
-; GENERIC-NEXT: s_cmp_lg_u32 s6, 10
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc
-; GENERIC-NEXT: s_cmp_lg_u32 s6, 11
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc
-; GENERIC-NEXT: s_cmp_lg_u32 s6, 12
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v0, v9, v0, vcc
-; GENERIC-NEXT: s_cmp_lg_u32 s6, 13
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc
-; GENERIC-NEXT: s_cmp_lg_u32 s6, 14
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v0, v11, v0, vcc
-; GENERIC-NEXT: s_cmp_lg_u32 s6, 15
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v0, v12, v0, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s4, 1
+; GENERIC-NEXT: s_cselect_b32 s2, 2.0, 1.0
+; GENERIC-NEXT: s_cmp_lg_u32 s4, 2
+; GENERIC-NEXT: s_cselect_b32 s2, s2, 0x40400000
+; GENERIC-NEXT: s_cmp_lg_u32 s4, 3
+; GENERIC-NEXT: s_cselect_b32 s2, s2, 4.0
+; GENERIC-NEXT: s_cmp_lg_u32 s4, 4
+; GENERIC-NEXT: s_cselect_b32 s2, s2, 0x40a00000
+; GENERIC-NEXT: s_cmp_lg_u32 s4, 5
+; GENERIC-NEXT: s_cselect_b32 s2, s2, 0x40c00000
+; GENERIC-NEXT: s_cmp_lg_u32 s4, 6
+; GENERIC-NEXT: s_cselect_b32 s2, s2, 0x40e00000
+; GENERIC-NEXT: s_cmp_lg_u32 s4, 7
+; GENERIC-NEXT: s_cselect_b32 s2, s2, 0x41000000
+; GENERIC-NEXT: s_cmp_lg_u32 s4, 8
+; GENERIC-NEXT: s_cselect_b32 s2, s2, 0x41100000
+; GENERIC-NEXT: s_cmp_lg_u32 s4, 9
+; GENERIC-NEXT: s_cselect_b32 s2, s2, 0x41200000
+; GENERIC-NEXT: s_cmp_lg_u32 s4, 10
+; GENERIC-NEXT: s_cselect_b32 s2, s2, 0x41300000
+; GENERIC-NEXT: s_cmp_lg_u32 s4, 11
+; GENERIC-NEXT: s_cselect_b32 s2, s2, 0x41400000
+; GENERIC-NEXT: s_cmp_lg_u32 s4, 12
+; GENERIC-NEXT: s_cselect_b32 s2, s2, 0x41500000
+; GENERIC-NEXT: s_cmp_lg_u32 s4, 13
+; GENERIC-NEXT: s_cselect_b32 s2, s2, 0x41600000
+; GENERIC-NEXT: s_cmp_lg_u32 s4, 14
+; GENERIC-NEXT: s_cselect_b32 s2, s2, 0x41700000
+; GENERIC-NEXT: s_cmp_lg_u32 s4, 15
+; GENERIC-NEXT: s_cselect_b32 s4, s2, 0x41800000
+; GENERIC-NEXT: s_mov_b32 s2, -1
+; GENERIC-NEXT: v_mov_b32_e32 v0, s4
; GENERIC-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GENERIC-NEXT: s_endpgm
;
@@ -1897,75 +1843,74 @@ define amdgpu_kernel void @insert_w_offset(ptr addrspace(1) %out, i32 %in) {
; GENERIC-NEXT: s_load_dword s4, s[2:3], 0xb
; GENERIC-NEXT: s_mov_b32 s3, 0xf000
; GENERIC-NEXT: s_mov_b32 s2, -1
-; GENERIC-NEXT: v_mov_b32_e32 v10, 0x41880000
-; GENERIC-NEXT: v_mov_b32_e32 v0, 0x40400000
-; GENERIC-NEXT: v_mov_b32_e32 v4, 0x41000000
-; GENERIC-NEXT: v_mov_b32_e32 v5, 0x40e00000
-; GENERIC-NEXT: v_mov_b32_e32 v8, 0x40c00000
-; GENERIC-NEXT: v_mov_b32_e32 v9, 0x40a00000
-; GENERIC-NEXT: v_mov_b32_e32 v11, 0x41400000
-; GENERIC-NEXT: v_mov_b32_e32 v12, 0x41300000
-; GENERIC-NEXT: v_mov_b32_e32 v13, 0x41200000
-; GENERIC-NEXT: v_mov_b32_e32 v14, 0x41100000
-; GENERIC-NEXT: v_mov_b32_e32 v15, 0x41800000
-; GENERIC-NEXT: v_mov_b32_e32 v16, 0x41700000
-; GENERIC-NEXT: v_mov_b32_e32 v17, 0x41600000
-; GENERIC-NEXT: v_mov_b32_e32 v18, 0x41500000
+; GENERIC-NEXT: s_mov_b32 s5, 0x40400000
+; GENERIC-NEXT: s_mov_b32 s6, 0x41000000
+; GENERIC-NEXT: s_mov_b32 s7, 0x40e00000
+; GENERIC-NEXT: s_mov_b32 s8, 0x40c00000
+; GENERIC-NEXT: s_mov_b32 s9, 0x40a00000
+; GENERIC-NEXT: s_mov_b32 s10, 0x41400000
+; GENERIC-NEXT: s_mov_b32 s11, 0x41300000
+; GENERIC-NEXT: s_mov_b32 s12, 0x41200000
+; GENERIC-NEXT: s_mov_b32 s13, 0x41100000
+; GENERIC-NEXT: s_mov_b32 s14, 0x41800000
+; GENERIC-NEXT: s_mov_b32 s15, 0x41700000
+; GENERIC-NEXT: s_mov_b32 s16, 0x41600000
+; GENERIC-NEXT: s_mov_b32 s17, 0x41500000
; GENERIC-NEXT: s_waitcnt lgkmcnt(0)
; GENERIC-NEXT: s_add_i32 s4, s4, 1
; GENERIC-NEXT: s_cmp_eq_u32 s4, 3
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v3, 4.0, v10, vcc
+; GENERIC-NEXT: s_cselect_b32 s18, 0x41880000, 4.0
; GENERIC-NEXT: s_cmp_eq_u32 s4, 2
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v2, v0, v10, vcc
+; GENERIC-NEXT: s_cselect_b32 s5, 0x41880000, s5
; GENERIC-NEXT: s_cmp_eq_u32 s4, 1
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v1, 2.0, v10, vcc
+; GENERIC-NEXT: v_mov_b32_e32 v3, s18
+; GENERIC-NEXT: s_cselect_b32 s18, 0x41880000, 2.0
; GENERIC-NEXT: s_cmp_eq_u32 s4, 0
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v0, 1.0, v10, vcc
+; GENERIC-NEXT: v_mov_b32_e32 v2, s5
+; GENERIC-NEXT: s_cselect_b32 s5, 0x41880000, 1.0
; GENERIC-NEXT: s_cmp_eq_u32 s4, 7
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v7, v4, v10, vcc
+; GENERIC-NEXT: v_mov_b32_e32 v1, s18
+; GENERIC-NEXT: s_cselect_b32 s6, 0x41880000, s6
; GENERIC-NEXT: s_cmp_eq_u32 s4, 6
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v6, v5, v10, vcc
+; GENERIC-NEXT: v_mov_b32_e32 v0, s5
+; GENERIC-NEXT: s_cselect_b32 s5, 0x41880000, s7
; GENERIC-NEXT: s_cmp_eq_u32 s4, 5
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc
+; GENERIC-NEXT: v_mov_b32_e32 v7, s6
+; GENERIC-NEXT: s_cselect_b32 s6, 0x41880000, s8
; GENERIC-NEXT: s_cmp_eq_u32 s4, 4
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v4, v9, v10, vcc
+; GENERIC-NEXT: v_mov_b32_e32 v6, s5
+; GENERIC-NEXT: s_cselect_b32 s5, 0x41880000, s9
; GENERIC-NEXT: s_cmp_eq_u32 s4, 11
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v8, v11, v10, vcc
+; GENERIC-NEXT: v_mov_b32_e32 v5, s6
+; GENERIC-NEXT: s_cselect_b32 s6, 0x41880000, s10
; GENERIC-NEXT: s_cmp_eq_u32 s4, 10
+; GENERIC-NEXT: v_mov_b32_e32 v4, s5
; GENERIC-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: s_waitcnt expcnt(0)
-; GENERIC-NEXT: v_cndmask_b32_e32 v7, v12, v10, vcc
+; GENERIC-NEXT: s_cselect_b32 s5, 0x41880000, s11
; GENERIC-NEXT: s_cmp_eq_u32 s4, 9
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v6, v13, v10, vcc
+; GENERIC-NEXT: s_waitcnt expcnt(0)
+; GENERIC-NEXT: v_mov_b32_e32 v7, s6
+; GENERIC-NEXT: s_cselect_b32 s6, 0x41880000, s12
; GENERIC-NEXT: s_cmp_eq_u32 s4, 8
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v5, v14, v10, vcc
+; GENERIC-NEXT: v_mov_b32_e32 v6, s5
+; GENERIC-NEXT: s_cselect_b32 s5, 0x41880000, s13
; GENERIC-NEXT: s_cmp_eq_u32 s4, 15
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v9, v15, v10, vcc
+; GENERIC-NEXT: v_mov_b32_e32 v5, s6
+; GENERIC-NEXT: s_cselect_b32 s6, 0x41880000, s14
; GENERIC-NEXT: s_cmp_eq_u32 s4, 14
-; GENERIC-NEXT: buffer_store_dwordx4 v[5:8], off, s[0:3], 0 offset:32
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: s_waitcnt expcnt(0)
-; GENERIC-NEXT: v_cndmask_b32_e32 v8, v16, v10, vcc
+; GENERIC-NEXT: v_mov_b32_e32 v4, s5
+; GENERIC-NEXT: s_cselect_b32 s5, 0x41880000, s15
; GENERIC-NEXT: s_cmp_eq_u32 s4, 13
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v7, v17, v10, vcc
+; GENERIC-NEXT: s_cselect_b32 s7, 0x41880000, s16
; GENERIC-NEXT: s_cmp_eq_u32 s4, 12
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v6, v18, v10, vcc
-; GENERIC-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:48
+; GENERIC-NEXT: s_cselect_b32 s4, 0x41880000, s17
+; GENERIC-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32
+; GENERIC-NEXT: s_waitcnt expcnt(0)
+; GENERIC-NEXT: v_mov_b32_e32 v7, s6
+; GENERIC-NEXT: v_mov_b32_e32 v6, s5
+; GENERIC-NEXT: v_mov_b32_e32 v5, s7
+; GENERIC-NEXT: v_mov_b32_e32 v4, s4
+; GENERIC-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:48
; GENERIC-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GENERIC-NEXT: s_endpgm
;
@@ -2252,76 +2197,75 @@ define amdgpu_kernel void @insert_unsigned_base_plus_offset(ptr addrspace(1) %ou
; GENERIC-NEXT: s_load_dword s4, s[2:3], 0xb
; GENERIC-NEXT: s_mov_b32 s3, 0xf000
; GENERIC-NEXT: s_mov_b32 s2, -1
-; GENERIC-NEXT: v_mov_b32_e32 v10, 0x41880000
-; GENERIC-NEXT: v_mov_b32_e32 v0, 0x40400000
-; GENERIC-NEXT: v_mov_b32_e32 v4, 0x41000000
-; GENERIC-NEXT: v_mov_b32_e32 v5, 0x40e00000
-; GENERIC-NEXT: v_mov_b32_e32 v8, 0x40c00000
-; GENERIC-NEXT: v_mov_b32_e32 v9, 0x40a00000
-; GENERIC-NEXT: v_mov_b32_e32 v11, 0x41400000
-; GENERIC-NEXT: v_mov_b32_e32 v12, 0x41300000
-; GENERIC-NEXT: v_mov_b32_e32 v13, 0x41200000
-; GENERIC-NEXT: v_mov_b32_e32 v14, 0x41100000
-; GENERIC-NEXT: v_mov_b32_e32 v15, 0x41800000
-; GENERIC-NEXT: v_mov_b32_e32 v16, 0x41700000
-; GENERIC-NEXT: v_mov_b32_e32 v17, 0x41600000
-; GENERIC-NEXT: v_mov_b32_e32 v18, 0x41500000
+; GENERIC-NEXT: s_mov_b32 s5, 0x40400000
+; GENERIC-NEXT: s_mov_b32 s6, 0x41000000
+; GENERIC-NEXT: s_mov_b32 s7, 0x40e00000
+; GENERIC-NEXT: s_mov_b32 s8, 0x40c00000
+; GENERIC-NEXT: s_mov_b32 s9, 0x40a00000
+; GENERIC-NEXT: s_mov_b32 s10, 0x41400000
+; GENERIC-NEXT: s_mov_b32 s11, 0x41300000
+; GENERIC-NEXT: s_mov_b32 s12, 0x41200000
+; GENERIC-NEXT: s_mov_b32 s13, 0x41100000
+; GENERIC-NEXT: s_mov_b32 s14, 0x41800000
+; GENERIC-NEXT: s_mov_b32 s15, 0x41700000
+; GENERIC-NEXT: s_mov_b32 s16, 0x41600000
+; GENERIC-NEXT: s_mov_b32 s17, 0x41500000
; GENERIC-NEXT: s_waitcnt lgkmcnt(0)
; GENERIC-NEXT: s_and_b32 s4, s4, 0xffff
; GENERIC-NEXT: s_add_i32 s4, s4, 1
; GENERIC-NEXT: s_cmp_eq_u32 s4, 3
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v3, 4.0, v10, vcc
+; GENERIC-NEXT: s_cselect_b32 s18, 0x41880000, 4.0
; GENERIC-NEXT: s_cmp_eq_u32 s4, 2
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v2, v0, v10, vcc
+; GENERIC-NEXT: s_cselect_b32 s5, 0x41880000, s5
; GENERIC-NEXT: s_cmp_eq_u32 s4, 1
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v1, 2.0, v10, vcc
+; GENERIC-NEXT: v_mov_b32_e32 v3, s18
+; GENERIC-NEXT: s_cselect_b32 s18, 0x41880000, 2.0
; GENERIC-NEXT: s_cmp_eq_u32 s4, 0
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v0, 1.0, v10, vcc
+; GENERIC-NEXT: v_mov_b32_e32 v2, s5
+; GENERIC-NEXT: s_cselect_b32 s5, 0x41880000, 1.0
; GENERIC-NEXT: s_cmp_eq_u32 s4, 7
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v7, v4, v10, vcc
+; GENERIC-NEXT: v_mov_b32_e32 v1, s18
+; GENERIC-NEXT: s_cselect_b32 s6, 0x41880000, s6
; GENERIC-NEXT: s_cmp_eq_u32 s4, 6
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v6, v5, v10, vcc
+; GENERIC-NEXT: v_mov_b32_e32 v0, s5
+; GENERIC-NEXT: s_cselect_b32 s5, 0x41880000, s7
; GENERIC-NEXT: s_cmp_eq_u32 s4, 5
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc
+; GENERIC-NEXT: v_mov_b32_e32 v7, s6
+; GENERIC-NEXT: s_cselect_b32 s6, 0x41880000, s8
; GENERIC-NEXT: s_cmp_eq_u32 s4, 4
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v4, v9, v10, vcc
+; GENERIC-NEXT: v_mov_b32_e32 v6, s5
+; GENERIC-NEXT: s_cselect_b32 s5, 0x41880000, s9
; GENERIC-NEXT: s_cmp_eq_u32 s4, 11
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v8, v11, v10, vcc
+; GENERIC-NEXT: v_mov_b32_e32 v5, s6
+; GENERIC-NEXT: s_cselect_b32 s6, 0x41880000, s10
; GENERIC-NEXT: s_cmp_eq_u32 s4, 10
+; GENERIC-NEXT: v_mov_b32_e32 v4, s5
; GENERIC-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: s_waitcnt expcnt(0)
-; GENERIC-NEXT: v_cndmask_b32_e32 v7, v12, v10, vcc
+; GENERIC-NEXT: s_cselect_b32 s5, 0x41880000, s11
; GENERIC-NEXT: s_cmp_eq_u32 s4, 9
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v6, v13, v10, vcc
+; GENERIC-NEXT: s_waitcnt expcnt(0)
+; GENERIC-NEXT: v_mov_b32_e32 v7, s6
+; GENERIC-NEXT: s_cselect_b32 s6, 0x41880000, s12
; GENERIC-NEXT: s_cmp_eq_u32 s4, 8
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v5, v14, v10, vcc
+; GENERIC-NEXT: v_mov_b32_e32 v6, s5
+; GENERIC-NEXT: s_cselect_b32 s5, 0x41880000, s13
; GENERIC-NEXT: s_cmp_eq_u32 s4, 15
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v9, v15, v10, vcc
+; GENERIC-NEXT: v_mov_b32_e32 v5, s6
+; GENERIC-NEXT: s_cselect_b32 s6, 0x41880000, s14
; GENERIC-NEXT: s_cmp_eq_u32 s4, 14
-; GENERIC-NEXT: buffer_store_dwordx4 v[5:8], off, s[0:3], 0 offset:32
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: s_waitcnt expcnt(0)
-; GENERIC-NEXT: v_cndmask_b32_e32 v8, v16, v10, vcc
+; GENERIC-NEXT: v_mov_b32_e32 v4, s5
+; GENERIC-NEXT: s_cselect_b32 s5, 0x41880000, s15
; GENERIC-NEXT: s_cmp_eq_u32 s4, 13
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v7, v17, v10, vcc
+; GENERIC-NEXT: s_cselect_b32 s7, 0x41880000, s16
; GENERIC-NEXT: s_cmp_eq_u32 s4, 12
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v6, v18, v10, vcc
-; GENERIC-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:48
+; GENERIC-NEXT: s_cselect_b32 s4, 0x41880000, s17
+; GENERIC-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32
+; GENERIC-NEXT: s_waitcnt expcnt(0)
+; GENERIC-NEXT: v_mov_b32_e32 v7, s6
+; GENERIC-NEXT: v_mov_b32_e32 v6, s5
+; GENERIC-NEXT: v_mov_b32_e32 v5, s7
+; GENERIC-NEXT: v_mov_b32_e32 v4, s4
+; GENERIC-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:48
; GENERIC-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GENERIC-NEXT: s_endpgm
;
@@ -2609,76 +2553,75 @@ define amdgpu_kernel void @insert_signed_base_plus_offset(ptr addrspace(1) %out,
; GENERIC-NEXT: s_load_dword s4, s[2:3], 0xb
; GENERIC-NEXT: s_mov_b32 s3, 0xf000
; GENERIC-NEXT: s_mov_b32 s2, -1
-; GENERIC-NEXT: v_mov_b32_e32 v10, 0x41880000
-; GENERIC-NEXT: v_mov_b32_e32 v0, 0x40400000
-; GENERIC-NEXT: v_mov_b32_e32 v4, 0x41000000
-; GENERIC-NEXT: v_mov_b32_e32 v5, 0x40e00000
-; GENERIC-NEXT: v_mov_b32_e32 v8, 0x40c00000
-; GENERIC-NEXT: v_mov_b32_e32 v9, 0x40a00000
-; GENERIC-NEXT: v_mov_b32_e32 v11, 0x41400000
-; GENERIC-NEXT: v_mov_b32_e32 v12, 0x41300000
-; GENERIC-NEXT: v_mov_b32_e32 v13, 0x41200000
-; GENERIC-NEXT: v_mov_b32_e32 v14, 0x41100000
-; GENERIC-NEXT: v_mov_b32_e32 v15, 0x41800000
-; GENERIC-NEXT: v_mov_b32_e32 v16, 0x41700000
-; GENERIC-NEXT: v_mov_b32_e32 v17, 0x41600000
-; GENERIC-NEXT: v_mov_b32_e32 v18, 0x41500000
+; GENERIC-NEXT: s_mov_b32 s5, 0x40400000
+; GENERIC-NEXT: s_mov_b32 s6, 0x41000000
+; GENERIC-NEXT: s_mov_b32 s7, 0x40e00000
+; GENERIC-NEXT: s_mov_b32 s8, 0x40c00000
+; GENERIC-NEXT: s_mov_b32 s9, 0x40a00000
+; GENERIC-NEXT: s_mov_b32 s10, 0x41400000
+; GENERIC-NEXT: s_mov_b32 s11, 0x41300000
+; GENERIC-NEXT: s_mov_b32 s12, 0x41200000
+; GENERIC-NEXT: s_mov_b32 s13, 0x41100000
+; GENERIC-NEXT: s_mov_b32 s14, 0x41800000
+; GENERIC-NEXT: s_mov_b32 s15, 0x41700000
+; GENERIC-NEXT: s_mov_b32 s16, 0x41600000
+; GENERIC-NEXT: s_mov_b32 s17, 0x41500000
; GENERIC-NEXT: s_waitcnt lgkmcnt(0)
; GENERIC-NEXT: s_sext_i32_i16 s4, s4
; GENERIC-NEXT: s_add_i32 s4, s4, 1
; GENERIC-NEXT: s_cmp_eq_u32 s4, 3
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v3, 4.0, v10, vcc
+; GENERIC-NEXT: s_cselect_b32 s18, 0x41880000, 4.0
; GENERIC-NEXT: s_cmp_eq_u32 s4, 2
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v2, v0, v10, vcc
+; GENERIC-NEXT: s_cselect_b32 s5, 0x41880000, s5
; GENERIC-NEXT: s_cmp_eq_u32 s4, 1
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v1, 2.0, v10, vcc
+; GENERIC-NEXT: v_mov_b32_e32 v3, s18
+; GENERIC-NEXT: s_cselect_b32 s18, 0x41880000, 2.0
; GENERIC-NEXT: s_cmp_eq_u32 s4, 0
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v0, 1.0, v10, vcc
+; GENERIC-NEXT: v_mov_b32_e32 v2, s5
+; GENERIC-NEXT: s_cselect_b32 s5, 0x41880000, 1.0
; GENERIC-NEXT: s_cmp_eq_u32 s4, 7
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v7, v4, v10, vcc
+; GENERIC-NEXT: v_mov_b32_e32 v1, s18
+; GENERIC-NEXT: s_cselect_b32 s6, 0x41880000, s6
; GENERIC-NEXT: s_cmp_eq_u32 s4, 6
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v6, v5, v10, vcc
+; GENERIC-NEXT: v_mov_b32_e32 v0, s5
+; GENERIC-NEXT: s_cselect_b32 s5, 0x41880000, s7
; GENERIC-NEXT: s_cmp_eq_u32 s4, 5
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc
+; GENERIC-NEXT: v_mov_b32_e32 v7, s6
+; GENERIC-NEXT: s_cselect_b32 s6, 0x41880000, s8
; GENERIC-NEXT: s_cmp_eq_u32 s4, 4
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v4, v9, v10, vcc
+; GENERIC-NEXT: v_mov_b32_e32 v6, s5
+; GENERIC-NEXT: s_cselect_b32 s5, 0x41880000, s9
; GENERIC-NEXT: s_cmp_eq_u32 s4, 11
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v8, v11, v10, vcc
+; GENERIC-NEXT: v_mov_b32_e32 v5, s6
+; GENERIC-NEXT: s_cselect_b32 s6, 0x41880000, s10
; GENERIC-NEXT: s_cmp_eq_u32 s4, 10
+; GENERIC-NEXT: v_mov_b32_e32 v4, s5
; GENERIC-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: s_waitcnt expcnt(0)
-; GENERIC-NEXT: v_cndmask_b32_e32 v7, v12, v10, vcc
+; GENERIC-NEXT: s_cselect_b32 s5, 0x41880000, s11
; GENERIC-NEXT: s_cmp_eq_u32 s4, 9
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v6, v13, v10, vcc
+; GENERIC-NEXT: s_waitcnt expcnt(0)
+; GENERIC-NEXT: v_mov_b32_e32 v7, s6
+; GENERIC-NEXT: s_cselect_b32 s6, 0x41880000, s12
; GENERIC-NEXT: s_cmp_eq_u32 s4, 8
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v5, v14, v10, vcc
+; GENERIC-NEXT: v_mov_b32_e32 v6, s5
+; GENERIC-NEXT: s_cselect_b32 s5, 0x41880000, s13
; GENERIC-NEXT: s_cmp_eq_u32 s4, 15
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v9, v15, v10, vcc
+; GENERIC-NEXT: v_mov_b32_e32 v5, s6
+; GENERIC-NEXT: s_cselect_b32 s6, 0x41880000, s14
; GENERIC-NEXT: s_cmp_eq_u32 s4, 14
-; GENERIC-NEXT: buffer_store_dwordx4 v[5:8], off, s[0:3], 0 offset:32
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: s_waitcnt expcnt(0)
-; GENERIC-NEXT: v_cndmask_b32_e32 v8, v16, v10, vcc
+; GENERIC-NEXT: v_mov_b32_e32 v4, s5
+; GENERIC-NEXT: s_cselect_b32 s5, 0x41880000, s15
; GENERIC-NEXT: s_cmp_eq_u32 s4, 13
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v7, v17, v10, vcc
+; GENERIC-NEXT: s_cselect_b32 s7, 0x41880000, s16
; GENERIC-NEXT: s_cmp_eq_u32 s4, 12
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v6, v18, v10, vcc
-; GENERIC-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:48
+; GENERIC-NEXT: s_cselect_b32 s4, 0x41880000, s17
+; GENERIC-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32
+; GENERIC-NEXT: s_waitcnt expcnt(0)
+; GENERIC-NEXT: v_mov_b32_e32 v7, s6
+; GENERIC-NEXT: v_mov_b32_e32 v6, s5
+; GENERIC-NEXT: v_mov_b32_e32 v5, s7
+; GENERIC-NEXT: v_mov_b32_e32 v4, s4
+; GENERIC-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:48
; GENERIC-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GENERIC-NEXT: s_endpgm
;
@@ -2973,74 +2916,73 @@ define amdgpu_kernel void @insert_wo_offset(ptr addrspace(1) %out, i32 %in) {
; GENERIC-NEXT: s_load_dword s4, s[2:3], 0xb
; GENERIC-NEXT: s_mov_b32 s3, 0xf000
; GENERIC-NEXT: s_mov_b32 s2, -1
-; GENERIC-NEXT: v_mov_b32_e32 v10, 0x41880000
-; GENERIC-NEXT: v_mov_b32_e32 v0, 0x40400000
-; GENERIC-NEXT: v_mov_b32_e32 v4, 0x41000000
-; GENERIC-NEXT: v_mov_b32_e32 v5, 0x40e00000
-; GENERIC-NEXT: v_mov_b32_e32 v8, 0x40c00000
-; GENERIC-NEXT: v_mov_b32_e32 v9, 0x40a00000
-; GENERIC-NEXT: v_mov_b32_e32 v11, 0x41400000
-; GENERIC-NEXT: v_mov_b32_e32 v12, 0x41300000
-; GENERIC-NEXT: v_mov_b32_e32 v13, 0x41200000
-; GENERIC-NEXT: v_mov_b32_e32 v14, 0x41100000
-; GENERIC-NEXT: v_mov_b32_e32 v15, 0x41800000
-; GENERIC-NEXT: v_mov_b32_e32 v16, 0x41700000
-; GENERIC-NEXT: v_mov_b32_e32 v17, 0x41600000
-; GENERIC-NEXT: v_mov_b32_e32 v18, 0x41500000
+; GENERIC-NEXT: s_mov_b32 s5, 0x40400000
+; GENERIC-NEXT: s_mov_b32 s6, 0x41000000
+; GENERIC-NEXT: s_mov_b32 s7, 0x40e00000
+; GENERIC-NEXT: s_mov_b32 s8, 0x40c00000
+; GENERIC-NEXT: s_mov_b32 s9, 0x40a00000
+; GENERIC-NEXT: s_mov_b32 s10, 0x41400000
+; GENERIC-NEXT: s_mov_b32 s11, 0x41300000
+; GENERIC-NEXT: s_mov_b32 s12, 0x41200000
+; GENERIC-NEXT: s_mov_b32 s13, 0x41100000
+; GENERIC-NEXT: s_mov_b32 s14, 0x41800000
+; GENERIC-NEXT: s_mov_b32 s15, 0x41700000
+; GENERIC-NEXT: s_mov_b32 s16, 0x41600000
+; GENERIC-NEXT: s_mov_b32 s17, 0x41500000
; GENERIC-NEXT: s_waitcnt lgkmcnt(0)
; GENERIC-NEXT: s_cmp_eq_u32 s4, 3
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v3, 4.0, v10, vcc
+; GENERIC-NEXT: s_cselect_b32 s18, 0x41880000, 4.0
; GENERIC-NEXT: s_cmp_eq_u32 s4, 2
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v2, v0, v10, vcc
+; GENERIC-NEXT: s_cselect_b32 s5, 0x41880000, s5
; GENERIC-NEXT: s_cmp_eq_u32 s4, 1
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v1, 2.0, v10, vcc
+; GENERIC-NEXT: v_mov_b32_e32 v3, s18
+; GENERIC-NEXT: s_cselect_b32 s18, 0x41880000, 2.0
; GENERIC-NEXT: s_cmp_eq_u32 s4, 0
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v0, 1.0, v10, vcc
+; GENERIC-NEXT: v_mov_b32_e32 v2, s5
+; GENERIC-NEXT: s_cselect_b32 s5, 0x41880000, 1.0
; GENERIC-NEXT: s_cmp_eq_u32 s4, 7
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v7, v4, v10, vcc
+; GENERIC-NEXT: v_mov_b32_e32 v1, s18
+; GENERIC-NEXT: s_cselect_b32 s6, 0x41880000, s6
; GENERIC-NEXT: s_cmp_eq_u32 s4, 6
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v6, v5, v10, vcc
+; GENERIC-NEXT: v_mov_b32_e32 v0, s5
+; GENERIC-NEXT: s_cselect_b32 s5, 0x41880000, s7
; GENERIC-NEXT: s_cmp_eq_u32 s4, 5
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc
+; GENERIC-NEXT: v_mov_b32_e32 v7, s6
+; GENERIC-NEXT: s_cselect_b32 s6, 0x41880000, s8
; GENERIC-NEXT: s_cmp_eq_u32 s4, 4
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v4, v9, v10, vcc
+; GENERIC-NEXT: v_mov_b32_e32 v6, s5
+; GENERIC-NEXT: s_cselect_b32 s5, 0x41880000, s9
; GENERIC-NEXT: s_cmp_eq_u32 s4, 11
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v8, v11, v10, vcc
+; GENERIC-NEXT: v_mov_b32_e32 v5, s6
+; GENERIC-NEXT: s_cselect_b32 s6, 0x41880000, s10
; GENERIC-NEXT: s_cmp_eq_u32 s4, 10
+; GENERIC-NEXT: v_mov_b32_e32 v4, s5
; GENERIC-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: s_waitcnt expcnt(0)
-; GENERIC-NEXT: v_cndmask_b32_e32 v7, v12, v10, vcc
+; GENERIC-NEXT: s_cselect_b32 s5, 0x41880000, s11
; GENERIC-NEXT: s_cmp_eq_u32 s4, 9
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v6, v13, v10, vcc
+; GENERIC-NEXT: s_waitcnt expcnt(0)
+; GENERIC-NEXT: v_mov_b32_e32 v7, s6
+; GENERIC-NEXT: s_cselect_b32 s6, 0x41880000, s12
; GENERIC-NEXT: s_cmp_eq_u32 s4, 8
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v5, v14, v10, vcc
+; GENERIC-NEXT: v_mov_b32_e32 v6, s5
+; GENERIC-NEXT: s_cselect_b32 s5, 0x41880000, s13
; GENERIC-NEXT: s_cmp_eq_u32 s4, 15
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v9, v15, v10, vcc
+; GENERIC-NEXT: v_mov_b32_e32 v5, s6
+; GENERIC-NEXT: s_cselect_b32 s6, 0x41880000, s14
; GENERIC-NEXT: s_cmp_eq_u32 s4, 14
-; GENERIC-NEXT: buffer_store_dwordx4 v[5:8], off, s[0:3], 0 offset:32
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: s_waitcnt expcnt(0)
-; GENERIC-NEXT: v_cndmask_b32_e32 v8, v16, v10, vcc
+; GENERIC-NEXT: v_mov_b32_e32 v4, s5
+; GENERIC-NEXT: s_cselect_b32 s5, 0x41880000, s15
; GENERIC-NEXT: s_cmp_eq_u32 s4, 13
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v7, v17, v10, vcc
+; GENERIC-NEXT: s_cselect_b32 s7, 0x41880000, s16
; GENERIC-NEXT: s_cmp_eq_u32 s4, 12
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v6, v18, v10, vcc
-; GENERIC-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:48
+; GENERIC-NEXT: s_cselect_b32 s4, 0x41880000, s17
+; GENERIC-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32
+; GENERIC-NEXT: s_waitcnt expcnt(0)
+; GENERIC-NEXT: v_mov_b32_e32 v7, s6
+; GENERIC-NEXT: v_mov_b32_e32 v6, s5
+; GENERIC-NEXT: v_mov_b32_e32 v5, s7
+; GENERIC-NEXT: v_mov_b32_e32 v4, s4
+; GENERIC-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:48
; GENERIC-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GENERIC-NEXT: s_endpgm
;
@@ -6625,128 +6567,132 @@ define amdgpu_kernel void @insert_w_offset_multiple_in_block(ptr addrspace(1) %o
; GENERIC-NEXT: s_load_dword s4, s[2:3], 0xb
; GENERIC-NEXT: s_mov_b32 s3, 0xf000
; GENERIC-NEXT: s_mov_b32 s2, -1
-; GENERIC-NEXT: v_mov_b32_e32 v0, 0x41500000
-; GENERIC-NEXT: v_mov_b32_e32 v8, 0x41880000
-; GENERIC-NEXT: v_mov_b32_e32 v1, 0x41600000
-; GENERIC-NEXT: v_mov_b32_e32 v2, 0x41700000
-; GENERIC-NEXT: v_mov_b32_e32 v3, 0x41800000
-; GENERIC-NEXT: v_mov_b32_e32 v4, 0x41100000
-; GENERIC-NEXT: v_mov_b32_e32 v5, 0x41200000
-; GENERIC-NEXT: v_mov_b32_e32 v6, 0x41300000
-; GENERIC-NEXT: v_mov_b32_e32 v7, 0x41400000
-; GENERIC-NEXT: v_mov_b32_e32 v9, 0x40a00000
-; GENERIC-NEXT: v_mov_b32_e32 v10, 0x40c00000
-; GENERIC-NEXT: v_mov_b32_e32 v11, 0x40e00000
-; GENERIC-NEXT: v_mov_b32_e32 v12, 0x41000000
-; GENERIC-NEXT: v_mov_b32_e32 v15, 0x40400000
+; GENERIC-NEXT: s_mov_b32 s5, 0x41880000
+; GENERIC-NEXT: s_mov_b32 s6, 0x41600000
+; GENERIC-NEXT: s_mov_b32 s7, 0x41700000
+; GENERIC-NEXT: s_mov_b32 s8, 0x41800000
+; GENERIC-NEXT: s_mov_b32 s9, 0x41100000
+; GENERIC-NEXT: s_mov_b32 s10, 0x41200000
+; GENERIC-NEXT: s_mov_b32 s11, 0x41300000
+; GENERIC-NEXT: s_mov_b32 s12, 0x41400000
+; GENERIC-NEXT: s_mov_b32 s13, 0x40a00000
+; GENERIC-NEXT: s_mov_b32 s14, 0x40c00000
+; GENERIC-NEXT: s_mov_b32 s15, 0x40e00000
+; GENERIC-NEXT: s_mov_b32 s16, 0x41000000
+; GENERIC-NEXT: s_mov_b32 s17, 0x40400000
; GENERIC-NEXT: s_waitcnt lgkmcnt(0)
-; GENERIC-NEXT: s_add_i32 s5, s4, 1
-; GENERIC-NEXT: s_cmp_eq_u32 s5, 12
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc
-; GENERIC-NEXT: s_cmp_eq_u32 s5, 13
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc
-; GENERIC-NEXT: s_cmp_eq_u32 s5, 14
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc
-; GENERIC-NEXT: s_cmp_eq_u32 s5, 15
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc
-; GENERIC-NEXT: s_cmp_eq_u32 s5, 8
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc
-; GENERIC-NEXT: s_cmp_eq_u32 s5, 9
+; GENERIC-NEXT: s_add_i32 s18, s4, 1
+; GENERIC-NEXT: s_cmp_eq_u32 s18, 12
+; GENERIC-NEXT: s_cselect_b32 s5, s5, 0x41500000
+; GENERIC-NEXT: s_cmp_eq_u32 s18, 13
+; GENERIC-NEXT: s_cselect_b32 s6, 0x41880000, s6
+; GENERIC-NEXT: s_cmp_eq_u32 s18, 14
+; GENERIC-NEXT: v_mov_b32_e32 v0, s5
+; GENERIC-NEXT: s_cselect_b32 s7, 0x41880000, s7
+; GENERIC-NEXT: s_cmp_eq_u32 s18, 15
+; GENERIC-NEXT: v_mov_b32_e32 v1, s6
+; GENERIC-NEXT: s_cselect_b32 s8, 0x41880000, s8
+; GENERIC-NEXT: s_cmp_eq_u32 s18, 8
+; GENERIC-NEXT: v_mov_b32_e32 v2, s7
+; GENERIC-NEXT: s_cselect_b32 s9, 0x41880000, s9
+; GENERIC-NEXT: s_cmp_eq_u32 s18, 9
+; GENERIC-NEXT: v_mov_b32_e32 v3, s8
; GENERIC-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc
-; GENERIC-NEXT: s_cmp_eq_u32 s5, 10
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc
-; GENERIC-NEXT: s_cmp_eq_u32 s5, 11
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc
-; GENERIC-NEXT: s_cmp_eq_u32 s5, 4
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v9, v9, v8, vcc
-; GENERIC-NEXT: s_cmp_eq_u32 s5, 5
-; GENERIC-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v10, v10, v8, vcc
-; GENERIC-NEXT: s_cmp_eq_u32 s5, 6
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v11, v11, v8, vcc
-; GENERIC-NEXT: s_cmp_eq_u32 s5, 7
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v12, v12, v8, vcc
-; GENERIC-NEXT: s_cmp_eq_u32 s5, 0
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v13, 1.0, v8, vcc
-; GENERIC-NEXT: s_cmp_eq_u32 s5, 1
-; GENERIC-NEXT: buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:16
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v14, 2.0, v8, vcc
-; GENERIC-NEXT: s_cmp_eq_u32 s5, 2
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v15, v15, v8, vcc
-; GENERIC-NEXT: s_cmp_eq_u32 s5, 3
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v16, 4.0, v8, vcc
+; GENERIC-NEXT: s_cselect_b32 s10, 0x41880000, s10
+; GENERIC-NEXT: s_cmp_eq_u32 s18, 10
+; GENERIC-NEXT: s_waitcnt expcnt(0)
+; GENERIC-NEXT: v_mov_b32_e32 v0, s9
+; GENERIC-NEXT: s_cselect_b32 s11, 0x41880000, s11
+; GENERIC-NEXT: s_cmp_eq_u32 s18, 11
+; GENERIC-NEXT: v_mov_b32_e32 v1, s10
+; GENERIC-NEXT: s_cselect_b32 s12, 0x41880000, s12
+; GENERIC-NEXT: s_cmp_eq_u32 s18, 4
+; GENERIC-NEXT: v_mov_b32_e32 v2, s11
+; GENERIC-NEXT: s_cselect_b32 s13, 0x41880000, s13
+; GENERIC-NEXT: s_cmp_eq_u32 s18, 5
+; GENERIC-NEXT: v_mov_b32_e32 v3, s12
+; GENERIC-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
+; GENERIC-NEXT: s_cselect_b32 s14, 0x41880000, s14
+; GENERIC-NEXT: s_cmp_eq_u32 s18, 6
+; GENERIC-NEXT: s_waitcnt expcnt(0)
+; GENERIC-NEXT: v_mov_b32_e32 v0, s13
+; GENERIC-NEXT: s_cselect_b32 s15, 0x41880000, s15
+; GENERIC-NEXT: s_cmp_eq_u32 s18, 7
+; GENERIC-NEXT: v_mov_b32_e32 v1, s14
+; GENERIC-NEXT: s_cselect_b32 s16, 0x41880000, s16
+; GENERIC-NEXT: s_cmp_eq_u32 s18, 0
+; GENERIC-NEXT: v_mov_b32_e32 v2, s15
+; GENERIC-NEXT: s_cselect_b32 s19, 0x41880000, 1.0
+; GENERIC-NEXT: s_cmp_eq_u32 s18, 1
+; GENERIC-NEXT: v_mov_b32_e32 v3, s16
+; GENERIC-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
+; GENERIC-NEXT: s_cselect_b32 s20, 0x41880000, 2.0
+; GENERIC-NEXT: s_cmp_eq_u32 s18, 2
+; GENERIC-NEXT: s_waitcnt expcnt(0)
+; GENERIC-NEXT: v_mov_b32_e32 v0, s19
+; GENERIC-NEXT: s_cselect_b32 s17, 0x41880000, s17
+; GENERIC-NEXT: s_cmp_eq_u32 s18, 3
+; GENERIC-NEXT: v_mov_b32_e32 v1, s20
+; GENERIC-NEXT: s_cselect_b32 s18, 0x41880000, 4.0
; GENERIC-NEXT: s_add_i32 s4, s4, 2
+; GENERIC-NEXT: v_mov_b32_e32 v2, s17
; GENERIC-NEXT: s_cmp_lg_u32 s4, 3
-; GENERIC-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: s_waitcnt expcnt(0)
-; GENERIC-NEXT: v_cndmask_b32_e32 v16, v8, v16, vcc
+; GENERIC-NEXT: v_mov_b32_e32 v3, s18
+; GENERIC-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GENERIC-NEXT: s_cselect_b32 s18, s18, 0x41880000
; GENERIC-NEXT: s_cmp_lg_u32 s4, 2
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v15, v8, v15, vcc
+; GENERIC-NEXT: s_cselect_b32 s17, s17, 0x41880000
; GENERIC-NEXT: s_cmp_lg_u32 s4, 1
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v14, v8, v14, vcc
+; GENERIC-NEXT: s_waitcnt expcnt(0)
+; GENERIC-NEXT: v_mov_b32_e32 v3, s18
+; GENERIC-NEXT: s_cselect_b32 s18, s20, 0x41880000
; GENERIC-NEXT: s_cmp_lg_u32 s4, 0
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v13, v8, v13, vcc
+; GENERIC-NEXT: v_mov_b32_e32 v2, s17
+; GENERIC-NEXT: s_cselect_b32 s17, s19, 0x41880000
; GENERIC-NEXT: s_cmp_lg_u32 s4, 7
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v12, v8, v12, vcc
+; GENERIC-NEXT: v_mov_b32_e32 v1, s18
+; GENERIC-NEXT: s_cselect_b32 s16, s16, 0x41880000
; GENERIC-NEXT: s_cmp_lg_u32 s4, 6
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v11, v8, v11, vcc
+; GENERIC-NEXT: v_mov_b32_e32 v0, s17
+; GENERIC-NEXT: s_cselect_b32 s15, s15, 0x41880000
; GENERIC-NEXT: s_cmp_lg_u32 s4, 5
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v10, v8, v10, vcc
+; GENERIC-NEXT: v_mov_b32_e32 v7, s16
+; GENERIC-NEXT: s_cselect_b32 s14, s14, 0x41880000
; GENERIC-NEXT: s_cmp_lg_u32 s4, 4
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v9, v8, v9, vcc
+; GENERIC-NEXT: v_mov_b32_e32 v6, s15
+; GENERIC-NEXT: s_cselect_b32 s13, s13, 0x41880000
; GENERIC-NEXT: s_cmp_lg_u32 s4, 11
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc
+; GENERIC-NEXT: v_mov_b32_e32 v5, s14
+; GENERIC-NEXT: s_cselect_b32 s12, s12, 0x41880000
; GENERIC-NEXT: s_cmp_lg_u32 s4, 10
-; GENERIC-NEXT: buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:80
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc
+; GENERIC-NEXT: v_mov_b32_e32 v4, s13
+; GENERIC-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:80
+; GENERIC-NEXT: s_cselect_b32 s11, s11, 0x41880000
; GENERIC-NEXT: s_cmp_lg_u32 s4, 9
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc
+; GENERIC-NEXT: s_waitcnt expcnt(0)
+; GENERIC-NEXT: v_mov_b32_e32 v7, s12
+; GENERIC-NEXT: s_cselect_b32 s10, s10, 0x41880000
; GENERIC-NEXT: s_cmp_lg_u32 s4, 8
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
+; GENERIC-NEXT: v_mov_b32_e32 v6, s11
+; GENERIC-NEXT: s_cselect_b32 s9, s9, 0x41880000
; GENERIC-NEXT: s_cmp_lg_u32 s4, 15
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc
+; GENERIC-NEXT: v_mov_b32_e32 v5, s10
+; GENERIC-NEXT: s_cselect_b32 s8, s8, 0x41880000
; GENERIC-NEXT: s_cmp_lg_u32 s4, 14
-; GENERIC-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc
+; GENERIC-NEXT: v_mov_b32_e32 v4, s9
+; GENERIC-NEXT: s_cselect_b32 s7, s7, 0x41880000
; GENERIC-NEXT: s_cmp_lg_u32 s4, 13
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc
+; GENERIC-NEXT: s_cselect_b32 s6, s6, 0x41880000
; GENERIC-NEXT: s_cmp_lg_u32 s4, 12
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc
-; GENERIC-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
-; GENERIC-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:64
+; GENERIC-NEXT: s_cselect_b32 s4, s5, 0x41880000
+; GENERIC-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96
+; GENERIC-NEXT: s_waitcnt expcnt(0)
+; GENERIC-NEXT: v_mov_b32_e32 v7, s8
+; GENERIC-NEXT: v_mov_b32_e32 v6, s7
+; GENERIC-NEXT: v_mov_b32_e32 v5, s6
+; GENERIC-NEXT: v_mov_b32_e32 v4, s4
+; GENERIC-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:112
+; GENERIC-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
; GENERIC-NEXT: s_endpgm
;
; NOOPT-LABEL: insert_w_offset_multiple_in_block:
@@ -7712,19 +7658,17 @@ define amdgpu_kernel void @multi_same_block(i32 %arg) {
; GENERIC-LABEL: multi_same_block:
; GENERIC: ; %bb.0: ; %bb
; GENERIC-NEXT: s_load_dword s0, s[2:3], 0x9
-; GENERIC-NEXT: v_mov_b32_e32 v0, 0x41900000
-; GENERIC-NEXT: v_mov_b32_e32 v1, 0x41b0cccd
; GENERIC-NEXT: s_waitcnt lgkmcnt(0)
-; GENERIC-NEXT: s_add_i32 s2, s0, -16
-; GENERIC-NEXT: s_cmp_eq_u32 s2, 1
-; GENERIC-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e64 v0, v0, 4.0, s[0:1]
-; GENERIC-NEXT: s_cmp_eq_u32 s2, 5
-; GENERIC-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e64 v1, v1, -4.0, s[0:1]
+; GENERIC-NEXT: s_add_i32 s0, s0, -16
+; GENERIC-NEXT: s_cmp_eq_u32 s0, 1
+; GENERIC-NEXT: s_cselect_b32 s1, 4.0, 0x41900000
+; GENERIC-NEXT: s_cmp_eq_u32 s0, 5
+; GENERIC-NEXT: s_cselect_b32 s0, -4.0, 0x41b0cccd
+; GENERIC-NEXT: v_mov_b32_e32 v0, s1
; GENERIC-NEXT: s_mov_b32 m0, -1
; GENERIC-NEXT: ds_write_b32 v0, v0
-; GENERIC-NEXT: ds_write_b32 v0, v1
+; GENERIC-NEXT: v_mov_b32_e32 v0, s0
+; GENERIC-NEXT: ds_write_b32 v0, v0
; GENERIC-NEXT: s_endpgm
;
; NOOPT-LABEL: multi_same_block:
@@ -8689,85 +8633,68 @@ entry:
define amdgpu_kernel void @insertelement_v16f32_or_index(ptr addrspace(1) %out, <16 x float> %a, i32 %idx.in) nounwind {
; GENERIC-LABEL: insertelement_v16f32_or_index:
; GENERIC: ; %bb.0:
-; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
-; GENERIC-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x19
+; GENERIC-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x9
; GENERIC-NEXT: s_load_dword s20, s[2:3], 0x29
-; GENERIC-NEXT: s_mov_b32 s3, 0xf000
-; GENERIC-NEXT: s_mov_b32 s2, -1
-; GENERIC-NEXT: v_mov_b32_e32 v10, 0x40a00000
+; GENERIC-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x19
+; GENERIC-NEXT: s_mov_b32 s19, 0xf000
+; GENERIC-NEXT: s_mov_b32 s18, -1
; GENERIC-NEXT: s_waitcnt lgkmcnt(0)
; GENERIC-NEXT: s_lshl_b32 s20, s20, 2
-; GENERIC-NEXT: v_mov_b32_e32 v0, s7
-; GENERIC-NEXT: v_mov_b32_e32 v1, s6
-; GENERIC-NEXT: v_mov_b32_e32 v4, s5
-; GENERIC-NEXT: v_mov_b32_e32 v5, s4
-; GENERIC-NEXT: v_mov_b32_e32 v6, s11
-; GENERIC-NEXT: v_mov_b32_e32 v8, s10
-; GENERIC-NEXT: v_mov_b32_e32 v9, s9
-; GENERIC-NEXT: v_mov_b32_e32 v11, s8
-; GENERIC-NEXT: v_mov_b32_e32 v12, s15
-; GENERIC-NEXT: v_mov_b32_e32 v13, s14
-; GENERIC-NEXT: v_mov_b32_e32 v14, s13
-; GENERIC-NEXT: v_mov_b32_e32 v15, s12
-; GENERIC-NEXT: v_mov_b32_e32 v16, s19
-; GENERIC-NEXT: v_mov_b32_e32 v17, s18
-; GENERIC-NEXT: v_mov_b32_e32 v18, s17
-; GENERIC-NEXT: v_mov_b32_e32 v19, s16
-; GENERIC-NEXT: s_or_b32 s4, s20, 1
-; GENERIC-NEXT: s_cmp_lg_u32 s4, 3
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v3, v10, v0, vcc
-; GENERIC-NEXT: s_cmp_lg_u32 s4, 2
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v2, v10, v1, vcc
-; GENERIC-NEXT: s_cmp_lg_u32 s4, 1
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v1, v10, v4, vcc
-; GENERIC-NEXT: s_cmp_lg_u32 s4, 0
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v0, v10, v5, vcc
-; GENERIC-NEXT: s_cmp_lg_u32 s4, 7
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v7, v10, v6, vcc
-; GENERIC-NEXT: s_cmp_lg_u32 s4, 6
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v6, v10, v8, vcc
-; GENERIC-NEXT: s_cmp_lg_u32 s4, 5
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v5, v10, v9, vcc
-; GENERIC-NEXT: s_cmp_lg_u32 s4, 4
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v4, v10, v11, vcc
-; GENERIC-NEXT: s_cmp_lg_u32 s4, 11
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v8, v10, v12, vcc
-; GENERIC-NEXT: s_cmp_lg_u32 s4, 10
-; GENERIC-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: s_or_b32 s20, s20, 1
+; GENERIC-NEXT: s_cmp_lg_u32 s20, 3
+; GENERIC-NEXT: s_cselect_b32 s3, s3, 0x40a00000
+; GENERIC-NEXT: s_cmp_lg_u32 s20, 2
+; GENERIC-NEXT: s_cselect_b32 s2, s2, 0x40a00000
+; GENERIC-NEXT: s_cmp_lg_u32 s20, 1
+; GENERIC-NEXT: v_mov_b32_e32 v3, s3
+; GENERIC-NEXT: s_cselect_b32 s1, s1, 0x40a00000
+; GENERIC-NEXT: s_cmp_lg_u32 s20, 0
+; GENERIC-NEXT: v_mov_b32_e32 v2, s2
+; GENERIC-NEXT: s_cselect_b32 s0, s0, 0x40a00000
+; GENERIC-NEXT: s_cmp_lg_u32 s20, 7
+; GENERIC-NEXT: v_mov_b32_e32 v1, s1
+; GENERIC-NEXT: s_cselect_b32 s1, s7, 0x40a00000
+; GENERIC-NEXT: s_cmp_lg_u32 s20, 6
+; GENERIC-NEXT: v_mov_b32_e32 v0, s0
+; GENERIC-NEXT: s_cselect_b32 s0, s6, 0x40a00000
+; GENERIC-NEXT: s_cmp_lg_u32 s20, 5
+; GENERIC-NEXT: v_mov_b32_e32 v7, s1
+; GENERIC-NEXT: s_cselect_b32 s1, s5, 0x40a00000
+; GENERIC-NEXT: s_cmp_lg_u32 s20, 4
+; GENERIC-NEXT: v_mov_b32_e32 v6, s0
+; GENERIC-NEXT: s_cselect_b32 s0, s4, 0x40a00000
+; GENERIC-NEXT: s_cmp_lg_u32 s20, 11
+; GENERIC-NEXT: v_mov_b32_e32 v5, s1
+; GENERIC-NEXT: s_cselect_b32 s1, s11, 0x40a00000
+; GENERIC-NEXT: s_cmp_lg_u32 s20, 10
+; GENERIC-NEXT: v_mov_b32_e32 v4, s0
+; GENERIC-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0 offset:16
+; GENERIC-NEXT: s_cselect_b32 s0, s10, 0x40a00000
+; GENERIC-NEXT: s_cmp_lg_u32 s20, 9
; GENERIC-NEXT: s_waitcnt expcnt(0)
-; GENERIC-NEXT: v_cndmask_b32_e32 v7, v10, v13, vcc
-; GENERIC-NEXT: s_cmp_lg_u32 s4, 9
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v6, v10, v14, vcc
-; GENERIC-NEXT: s_cmp_lg_u32 s4, 8
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v5, v10, v15, vcc
-; GENERIC-NEXT: s_cmp_lg_u32 s4, 15
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v9, v10, v16, vcc
-; GENERIC-NEXT: s_cmp_lg_u32 s4, 14
-; GENERIC-NEXT: buffer_store_dwordx4 v[5:8], off, s[0:3], 0 offset:32
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_mov_b32_e32 v7, s1
+; GENERIC-NEXT: s_cselect_b32 s1, s9, 0x40a00000
+; GENERIC-NEXT: s_cmp_lg_u32 s20, 8
+; GENERIC-NEXT: v_mov_b32_e32 v6, s0
+; GENERIC-NEXT: s_cselect_b32 s0, s8, 0x40a00000
+; GENERIC-NEXT: s_cmp_lg_u32 s20, 15
+; GENERIC-NEXT: v_mov_b32_e32 v5, s1
+; GENERIC-NEXT: s_cselect_b32 s1, s15, 0x40a00000
+; GENERIC-NEXT: s_cmp_lg_u32 s20, 14
+; GENERIC-NEXT: v_mov_b32_e32 v4, s0
+; GENERIC-NEXT: s_cselect_b32 s0, s14, 0x40a00000
+; GENERIC-NEXT: s_cmp_lg_u32 s20, 13
+; GENERIC-NEXT: s_cselect_b32 s2, s13, 0x40a00000
+; GENERIC-NEXT: s_cmp_lg_u32 s20, 12
+; GENERIC-NEXT: s_cselect_b32 s3, s12, 0x40a00000
+; GENERIC-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0 offset:32
; GENERIC-NEXT: s_waitcnt expcnt(0)
-; GENERIC-NEXT: v_cndmask_b32_e32 v8, v10, v17, vcc
-; GENERIC-NEXT: s_cmp_lg_u32 s4, 13
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v7, v10, v18, vcc
-; GENERIC-NEXT: s_cmp_lg_u32 s4, 12
-; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT: v_cndmask_b32_e32 v6, v10, v19, vcc
-; GENERIC-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:48
-; GENERIC-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GENERIC-NEXT: v_mov_b32_e32 v7, s1
+; GENERIC-NEXT: v_mov_b32_e32 v6, s0
+; GENERIC-NEXT: v_mov_b32_e32 v5, s2
+; GENERIC-NEXT: v_mov_b32_e32 v4, s3
+; GENERIC-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0 offset:48
+; GENERIC-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0
; GENERIC-NEXT: s_endpgm
;
; NOOPT-LABEL: insertelement_v16f32_or_index:
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
index ea18e0d9eeefbd..5c96dca37aa19a 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
@@ -9,22 +9,18 @@ define amdgpu_kernel void @float4_inselt(ptr addrspace(1) %out, <4 x float> %vec
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_lg_u32 s8, 3
-; GCN-NEXT: v_mov_b32_e32 v0, s7
-; GCN-NEXT: s_cselect_b64 vcc, -1, 0
+; GCN-NEXT: s_cselect_b32 s2, s7, 1.0
; GCN-NEXT: s_cmp_lg_u32 s8, 2
-; GCN-NEXT: v_cndmask_b32_e32 v3, 1.0, v0, vcc
-; GCN-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NEXT: s_cselect_b64 vcc, -1, 0
+; GCN-NEXT: s_cselect_b32 s3, s6, 1.0
; GCN-NEXT: s_cmp_lg_u32 s8, 1
-; GCN-NEXT: v_cndmask_b32_e32 v2, 1.0, v0, vcc
-; GCN-NEXT: v_mov_b32_e32 v0, s5
-; GCN-NEXT: s_cselect_b64 vcc, -1, 0
+; GCN-NEXT: s_cselect_b32 s5, s5, 1.0
; GCN-NEXT: s_cmp_lg_u32 s8, 0
-; GCN-NEXT: v_cndmask_b32_e32 v1, 1.0, v0, vcc
-; GCN-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NEXT: s_cselect_b64 vcc, -1, 0
+; GCN-NEXT: s_cselect_b32 s4, s4, 1.0
; GCN-NEXT: v_mov_b32_e32 v5, s1
-; GCN-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v1, s5
+; GCN-NEXT: v_mov_b32_e32 v2, s3
+; GCN-NEXT: v_mov_b32_e32 v3, s2
; GCN-NEXT: v_mov_b32_e32 v4, s0
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: s_endpgm
@@ -89,14 +85,12 @@ define amdgpu_kernel void @float2_inselt(ptr addrspace(1) %out, <2 x float> %vec
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_lg_u32 s6, 1
-; GCN-NEXT: v_mov_b32_e32 v0, s5
-; GCN-NEXT: s_cselect_b64 vcc, -1, 0
+; GCN-NEXT: s_cselect_b32 s2, s5, 1.0
; GCN-NEXT: s_cmp_lg_u32 s6, 0
-; GCN-NEXT: v_cndmask_b32_e32 v1, 1.0, v0, vcc
-; GCN-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NEXT: s_cselect_b64 vcc, -1, 0
+; GCN-NEXT: s_cselect_b32 s3, s4, 1.0
; GCN-NEXT: v_mov_b32_e32 v3, s1
-; GCN-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc
+; GCN-NEXT: v_mov_b32_e32 v1, s2
+; GCN-NEXT: v_mov_b32_e32 v0, s3
; GCN-NEXT: v_mov_b32_e32 v2, s0
; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
index 213813a94fc859..61846a29b1daed 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -499,18 +499,15 @@ define amdgpu_kernel void @dynamic_insertelement_v2f32(ptr addrspace(1) %out, <2
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2
; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
-; SI-NEXT: v_mov_b32_e32 v0, 0x40a00000
; SI-NEXT: s_mov_b32 s7, 0x100f000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_cmp_lg_u32 s2, 1
-; SI-NEXT: v_mov_b32_e32 v1, s1
-; SI-NEXT: s_cselect_b64 vcc, -1, 0
+; SI-NEXT: s_cselect_b32 s1, s1, 0x40a00000
; SI-NEXT: s_cmp_lg_u32 s2, 0
-; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
-; SI-NEXT: v_mov_b32_e32 v2, s0
-; SI-NEXT: s_cselect_b64 vcc, -1, 0
-; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; SI-NEXT: s_cselect_b32 s0, s0, 0x40a00000
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
@@ -518,18 +515,15 @@ define amdgpu_kernel void @dynamic_insertelement_v2f32(ptr addrspace(1) %out, <2
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8
; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
-; VI-NEXT: v_mov_b32_e32 v0, 0x40a00000
; VI-NEXT: s_mov_b32 s7, 0x1100f000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_lg_u32 s2, 1
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_cselect_b64 vcc, -1, 0
+; VI-NEXT: s_cselect_b32 s1, s1, 0x40a00000
; VI-NEXT: s_cmp_lg_u32 s2, 0
-; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: s_cselect_b64 vcc, -1, 0
-; VI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; VI-NEXT: s_cselect_b32 s0, s0, 0x40a00000
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
%vecins = insertelement <2 x float> %a, float 5.000000e+00, i32 %b
@@ -541,49 +535,41 @@ define amdgpu_kernel void @dynamic_insertelement_v3f32(ptr addrspace(1) %out, <3
; SI-LABEL: dynamic_insertelement_v3f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s8, s[6:7], 0x8
-; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
-; SI-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x4
-; SI-NEXT: v_mov_b32_e32 v0, 0x40a00000
-; SI-NEXT: s_mov_b32 s3, 0x100f000
+; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x4
+; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
+; SI-NEXT: s_mov_b32 s7, 0x100f000
+; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_cmp_lg_u32 s8, 2
-; SI-NEXT: s_cselect_b64 vcc, -1, 0
-; SI-NEXT: v_mov_b32_e32 v1, s6
+; SI-NEXT: s_cselect_b32 s2, s2, 0x40a00000
; SI-NEXT: s_cmp_lg_u32 s8, 1
-; SI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
-; SI-NEXT: v_mov_b32_e32 v1, s5
-; SI-NEXT: s_cselect_b64 vcc, -1, 0
+; SI-NEXT: s_cselect_b32 s1, s1, 0x40a00000
; SI-NEXT: s_cmp_lg_u32 s8, 0
-; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
-; SI-NEXT: v_mov_b32_e32 v3, s4
-; SI-NEXT: s_cselect_b64 vcc, -1, 0
-; SI-NEXT: s_mov_b32 s2, -1
-; SI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
-; SI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
+; SI-NEXT: s_cselect_b32 s0, s0, 0x40a00000
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: v_mov_b32_e32 v1, s1
+; SI-NEXT: v_mov_b32_e32 v2, s2
+; SI-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: dynamic_insertelement_v3f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s8, s[6:7], 0x20
-; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x10
-; VI-NEXT: v_mov_b32_e32 v0, 0x40a00000
-; VI-NEXT: s_mov_b32 s3, 0x1100f000
+; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10
+; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s7, 0x1100f000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_lg_u32 s8, 2
-; VI-NEXT: s_cselect_b64 vcc, -1, 0
-; VI-NEXT: v_mov_b32_e32 v1, s6
+; VI-NEXT: s_cselect_b32 s2, s2, 0x40a00000
; VI-NEXT: s_cmp_lg_u32 s8, 1
-; VI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: s_cselect_b64 vcc, -1, 0
+; VI-NEXT: s_cselect_b32 s1, s1, 0x40a00000
; VI-NEXT: s_cmp_lg_u32 s8, 0
-; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s4
-; VI-NEXT: s_cselect_b64 vcc, -1, 0
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
-; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
+; VI-NEXT: s_cselect_b32 s0, s0, 0x40a00000
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0
; VI-NEXT: s_endpgm
%vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 %b
store <3 x float> %vecins, ptr addrspace(1) %out, align 16
@@ -594,57 +580,47 @@ define amdgpu_kernel void @dynamic_insertelement_v4f32(ptr addrspace(1) %out, <4
; SI-LABEL: dynamic_insertelement_v4f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s8, s[6:7], 0x8
-; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
-; SI-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x4
-; SI-NEXT: v_mov_b32_e32 v0, 0x40a00000
-; SI-NEXT: s_mov_b32 s3, 0x100f000
+; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x4
+; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
+; SI-NEXT: s_mov_b32 s7, 0x100f000
+; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_cmp_lg_u32 s8, 3
-; SI-NEXT: s_cselect_b64 vcc, -1, 0
-; SI-NEXT: v_mov_b32_e32 v1, s7
+; SI-NEXT: s_cselect_b32 s3, s3, 0x40a00000
; SI-NEXT: s_cmp_lg_u32 s8, 2
-; SI-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
-; SI-NEXT: v_mov_b32_e32 v1, s6
-; SI-NEXT: s_cselect_b64 vcc, -1, 0
+; SI-NEXT: s_cselect_b32 s2, s2, 0x40a00000
; SI-NEXT: s_cmp_lg_u32 s8, 1
-; SI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
-; SI-NEXT: v_mov_b32_e32 v1, s5
-; SI-NEXT: s_cselect_b64 vcc, -1, 0
+; SI-NEXT: s_cselect_b32 s1, s1, 0x40a00000
; SI-NEXT: s_cmp_lg_u32 s8, 0
-; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
-; SI-NEXT: v_mov_b32_e32 v4, s4
-; SI-NEXT: s_cselect_b64 vcc, -1, 0
-; SI-NEXT: s_mov_b32 s2, -1
-; SI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
-; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; SI-NEXT: s_cselect_b32 s0, s0, 0x40a00000
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: v_mov_b32_e32 v1, s1
+; SI-NEXT: v_mov_b32_e32 v2, s2
+; SI-NEXT: v_mov_b32_e32 v3, s3
+; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: dynamic_insertelement_v4f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s8, s[6:7], 0x20
-; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x10
-; VI-NEXT: v_mov_b32_e32 v0, 0x40a00000
-; VI-NEXT: s_mov_b32 s3, 0x1100f000
+; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10
+; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s7, 0x1100f000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_lg_u32 s8, 3
-; VI-NEXT: s_cselect_b64 vcc, -1, 0
-; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: s_cselect_b32 s3, s3, 0x40a00000
; VI-NEXT: s_cmp_lg_u32 s8, 2
-; VI-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v1, s6
-; VI-NEXT: s_cselect_b64 vcc, -1, 0
+; VI-NEXT: s_cselect_b32 s2, s2, 0x40a00000
; VI-NEXT: s_cmp_lg_u32 s8, 1
-; VI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: s_cselect_b64 vcc, -1, 0
+; VI-NEXT: s_cselect_b32 s1, s1, 0x40a00000
; VI-NEXT: s_cmp_lg_u32 s8, 0
-; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: s_cselect_b64 vcc, -1, 0
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
-; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VI-NEXT: s_cselect_b32 s0, s0, 0x40a00000
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT: s_endpgm
%vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 %b
store <4 x float> %vecins, ptr addrspace(1) %out, align 16
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
index 32b599e63c61d2..bb327b2370dbdb 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
@@ -15,17 +15,17 @@ define amdgpu_kernel void @s_exp2_f32(ptr addrspace(1) %out, float %in) {
; SI-SDAG-NEXT: s_load_dword s4, s[2:3], 0xb
; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000
-; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000
; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc
-; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42800000
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; SI-SDAG-NEXT: v_add_f32_e32 v1, s4, v1
-; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1
+; SI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; SI-SDAG-NEXT: s_cselect_b32 s2, 0x42800000, 0
+; SI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; SI-SDAG-NEXT: v_add_f32_e32 v0, s4, v0
+; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0
+; SI-SDAG-NEXT: s_cselect_b32 s4, 0x1f800000, 1.0
+; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000
; SI-SDAG-NEXT: s_mov_b32 s2, -1
-; SI-SDAG-NEXT: v_mul_f32_e32 v0, v1, v0
+; SI-SDAG-NEXT: v_mul_f32_e32 v0, s4, v0
; SI-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-SDAG-NEXT: s_endpgm
;
@@ -53,15 +53,15 @@ define amdgpu_kernel void @s_exp2_f32(ptr addrspace(1) %out, float %in) {
; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c
; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42800000
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; VI-SDAG-NEXT: v_add_f32_e32 v1, s4, v1
-; VI-SDAG-NEXT: v_exp_f32_e32 v1, v1
-; VI-SDAG-NEXT: v_mul_f32_e32 v2, v1, v0
+; VI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; VI-SDAG-NEXT: s_cselect_b32 s2, 0x42800000, 0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_add_f32_e32 v0, s4, v0
+; VI-SDAG-NEXT: v_exp_f32_e32 v0, v0
+; VI-SDAG-NEXT: s_cselect_b32 s2, 0x1f800000, 1.0
+; VI-SDAG-NEXT: v_mul_f32_e32 v2, s2, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -91,17 +91,17 @@ define amdgpu_kernel void @s_exp2_f32(ptr addrspace(1) %out, float %in) {
; GFX900-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c
; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000
-; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc
-; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42800000
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX900-SDAG-NEXT: v_add_f32_e32 v1, s4, v1
-; GFX900-SDAG-NEXT: v_exp_f32_e32 v1, v1
-; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, v1, v0
-; GFX900-SDAG-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX900-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; GFX900-SDAG-NEXT: s_cselect_b32 s2, 0x42800000, 0
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX900-SDAG-NEXT: v_add_f32_e32 v0, s4, v0
+; GFX900-SDAG-NEXT: v_exp_f32_e32 v0, v0
+; GFX900-SDAG-NEXT: s_cselect_b32 s2, 0x1f800000, 1.0
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, s2, v0
+; GFX900-SDAG-NEXT: global_store_dword v1, v0, s[0:1]
; GFX900-SDAG-NEXT: s_endpgm
;
; GFX900-GISEL-LABEL: s_exp2_f32:
@@ -175,25 +175,27 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000
-; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0x42800000
; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000
+; SI-SDAG-NEXT: s_mov_b32 s6, -1
; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc
+; SI-SDAG-NEXT: s_and_b64 s[8:9], vcc, exec
+; SI-SDAG-NEXT: s_cselect_b32 s5, 0x42800000, 0
; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc
-; SI-SDAG-NEXT: v_add_f32_e32 v4, s3, v4
-; SI-SDAG-NEXT: v_add_f32_e32 v1, s2, v1
-; SI-SDAG-NEXT: v_exp_f32_e32 v4, v4
-; SI-SDAG-NEXT: v_exp_f32_e32 v3, v1
-; SI-SDAG-NEXT: s_mov_b32 s6, -1
; SI-SDAG-NEXT: s_mov_b32 s4, s0
+; SI-SDAG-NEXT: s_cselect_b32 s0, 0x1f800000, 1.0
+; SI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; SI-SDAG-NEXT: s_and_b64 s[8:9], vcc, exec
+; SI-SDAG-NEXT: v_add_f32_e32 v1, s3, v1
+; SI-SDAG-NEXT: s_cselect_b32 s3, 0x42800000, 0
+; SI-SDAG-NEXT: v_mov_b32_e32 v0, s3
+; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1
+; SI-SDAG-NEXT: v_add_f32_e32 v0, s2, v0
+; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0
; SI-SDAG-NEXT: s_mov_b32 s5, s1
-; SI-SDAG-NEXT: v_mul_f32_e32 v1, v4, v2
-; SI-SDAG-NEXT: v_mul_f32_e32 v0, v3, v0
+; SI-SDAG-NEXT: v_mul_f32_e32 v1, s0, v1
+; SI-SDAG-NEXT: s_cselect_b32 s0, 0x1f800000, 1.0
+; SI-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0
; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-SDAG-NEXT: s_endpgm
;
@@ -223,25 +225,27 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
;
; VI-SDAG-LABEL: s_exp2_v2f32:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, 1.0, v1, vcc
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v2, vcc
-; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
-; VI-SDAG-NEXT: v_add_f32_e32 v4, s3, v4
-; VI-SDAG-NEXT: v_add_f32_e32 v1, s2, v1
-; VI-SDAG-NEXT: v_exp_f32_e32 v4, v4
+; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0
+; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s6, v0
+; VI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; VI-SDAG-NEXT: s_cselect_b32 s2, 0x1f800000, 1.0
+; VI-SDAG-NEXT: s_cselect_b32 s3, 0x42800000, 0
+; VI-SDAG-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; VI-SDAG-NEXT: s_cselect_b32 s0, 0x42800000, 0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; VI-SDAG-NEXT: v_add_f32_e32 v0, s7, v0
+; VI-SDAG-NEXT: v_add_f32_e32 v1, s6, v1
+; VI-SDAG-NEXT: v_exp_f32_e32 v0, v0
; VI-SDAG-NEXT: v_exp_f32_e32 v2, v1
-; VI-SDAG-NEXT: v_mul_f32_e32 v1, v4, v3
-; VI-SDAG-NEXT: v_mul_f32_e32 v0, v2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; VI-SDAG-NEXT: s_cselect_b32 s0, 0x1f800000, 1.0
+; VI-SDAG-NEXT: v_mul_f32_e32 v1, s2, v0
+; VI-SDAG-NEXT: v_mul_f32_e32 v0, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s4
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
; VI-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-SDAG-NEXT: s_endpgm
;
@@ -273,23 +277,25 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000
-; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000
-; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, 0
; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v3, 1.0, v1, vcc
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v2, vcc
-; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
-; GFX900-SDAG-NEXT: v_add_f32_e32 v4, s7, v4
+; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s6, v0
+; GFX900-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; GFX900-SDAG-NEXT: s_cselect_b32 s2, 0x1f800000, 1.0
+; GFX900-SDAG-NEXT: s_cselect_b32 s3, 0x42800000, 0
+; GFX900-SDAG-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; GFX900-SDAG-NEXT: s_cselect_b32 s0, 0x42800000, 0
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, s3
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; GFX900-SDAG-NEXT: v_add_f32_e32 v0, s7, v0
; GFX900-SDAG-NEXT: v_add_f32_e32 v1, s6, v1
-; GFX900-SDAG-NEXT: v_exp_f32_e32 v4, v4
+; GFX900-SDAG-NEXT: v_exp_f32_e32 v0, v0
; GFX900-SDAG-NEXT: v_exp_f32_e32 v2, v1
-; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, v4, v3
-; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, v2, v0
-; GFX900-SDAG-NEXT: global_store_dwordx2 v5, v[0:1], s[4:5]
+; GFX900-SDAG-NEXT: s_cselect_b32 s0, 0x1f800000, 1.0
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, s2, v0
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, s0, v2
+; GFX900-SDAG-NEXT: global_store_dwordx2 v3, v[0:1], s[4:5]
; GFX900-SDAG-NEXT: s_endpgm
;
; GFX900-GISEL-LABEL: s_exp2_v2f32:
@@ -384,29 +390,33 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd
; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000
-; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0x42800000
+; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; SI-SDAG-NEXT: s_mov_b32 s2, -1
; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc
+; SI-SDAG-NEXT: s_and_b64 s[8:9], vcc, exec
+; SI-SDAG-NEXT: s_cselect_b32 s8, 0x42800000, 0
; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v5, 1.0, v1, vcc
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v3, vcc
+; SI-SDAG-NEXT: s_cselect_b32 s7, 0x1f800000, 1.0
+; SI-SDAG-NEXT: v_mov_b32_e32 v1, s8
+; SI-SDAG-NEXT: s_and_b64 s[8:9], vcc, exec
+; SI-SDAG-NEXT: v_add_f32_e32 v1, s5, v1
+; SI-SDAG-NEXT: s_cselect_b32 s5, 0x42800000, 0
+; SI-SDAG-NEXT: v_mov_b32_e32 v2, s5
; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc
-; SI-SDAG-NEXT: v_add_f32_e32 v4, s5, v4
+; SI-SDAG-NEXT: s_cselect_b32 s8, 0x1f800000, 1.0
+; SI-SDAG-NEXT: v_add_f32_e32 v2, s4, v2
+; SI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec
+; SI-SDAG-NEXT: s_cselect_b32 s4, 0x42800000, 0
+; SI-SDAG-NEXT: v_mov_b32_e32 v0, s4
; SI-SDAG-NEXT: v_add_f32_e32 v0, s6, v0
-; SI-SDAG-NEXT: v_exp_f32_e32 v4, v4
-; SI-SDAG-NEXT: v_add_f32_e32 v6, s4, v6
+; SI-SDAG-NEXT: v_exp_f32_e32 v2, v2
; SI-SDAG-NEXT: v_exp_f32_e32 v3, v0
-; SI-SDAG-NEXT: v_exp_f32_e32 v6, v6
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v7, 1.0, v1, vcc
-; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000
-; SI-SDAG-NEXT: s_mov_b32 s2, -1
-; SI-SDAG-NEXT: v_mul_f32_e32 v1, v4, v2
-; SI-SDAG-NEXT: v_mul_f32_e32 v2, v3, v7
-; SI-SDAG-NEXT: v_mul_f32_e32 v0, v6, v5
+; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1
+; SI-SDAG-NEXT: s_cselect_b32 s4, 0x1f800000, 1.0
+; SI-SDAG-NEXT: v_mul_f32_e32 v0, s8, v2
+; SI-SDAG-NEXT: v_mul_f32_e32 v2, s4, v3
+; SI-SDAG-NEXT: v_mul_f32_e32 v1, s7, v1
; SI-SDAG-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:8
; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-SDAG-NEXT: s_endpgm
@@ -446,31 +456,35 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; VI-SDAG-LABEL: s_exp2_v3f32:
; VI-SDAG: ; %bb.0:
; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x42800000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc
-; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v5, 1.0, v1, vcc
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v3, vcc
+; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s5, v0
+; VI-SDAG-NEXT: s_and_b64 s[8:9], vcc, exec
+; VI-SDAG-NEXT: s_cselect_b32 s7, 0x1f800000, 1.0
+; VI-SDAG-NEXT: s_cselect_b32 s8, 0x42800000, 0
+; VI-SDAG-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s8
+; VI-SDAG-NEXT: s_cselect_b32 s0, 0x42800000, 0
; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc
-; VI-SDAG-NEXT: v_add_f32_e32 v4, s6, v4
-; VI-SDAG-NEXT: v_add_f32_e32 v1, s4, v1
-; VI-SDAG-NEXT: v_exp_f32_e32 v4, v4
-; VI-SDAG-NEXT: v_add_f32_e32 v6, s5, v6
-; VI-SDAG-NEXT: v_exp_f32_e32 v3, v1
-; VI-SDAG-NEXT: v_exp_f32_e32 v6, v6
-; VI-SDAG-NEXT: v_mul_f32_e32 v2, v4, v2
-; VI-SDAG-NEXT: v_mul_f32_e32 v0, v3, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v4, s1
-; VI-SDAG-NEXT: v_mul_f32_e32 v1, v6, v5
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s0
+; VI-SDAG-NEXT: v_add_f32_e32 v1, s6, v1
+; VI-SDAG-NEXT: s_cselect_b32 s6, 0x1f800000, 1.0
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; VI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec
+; VI-SDAG-NEXT: s_cselect_b32 s0, 0x42800000, 0
+; VI-SDAG-NEXT: v_add_f32_e32 v2, s5, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; VI-SDAG-NEXT: v_exp_f32_e32 v1, v1
+; VI-SDAG-NEXT: v_exp_f32_e32 v3, v2
+; VI-SDAG-NEXT: v_add_f32_e32 v0, s4, v0
+; VI-SDAG-NEXT: v_exp_f32_e32 v0, v0
+; VI-SDAG-NEXT: v_mul_f32_e32 v2, s7, v1
+; VI-SDAG-NEXT: v_mul_f32_e32 v1, s6, v3
+; VI-SDAG-NEXT: s_cselect_b32 s0, 0x1f800000, 1.0
+; VI-SDAG-NEXT: v_mov_b32_e32 v4, s3
+; VI-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s2
; VI-SDAG-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
; VI-SDAG-NEXT: s_endpgm
;
@@ -510,29 +524,33 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34
; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000
-; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, 0x42800000
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, 0
; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc
+; GFX900-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; GFX900-SDAG-NEXT: s_cselect_b32 s2, 0x42800000, 0
; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v5, 1.0, v1, vcc
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v3, vcc
+; GFX900-SDAG-NEXT: s_cselect_b32 s7, 0x1f800000, 1.0
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, s2
+; GFX900-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; GFX900-SDAG-NEXT: s_cselect_b32 s2, 0x42800000, 0
; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc
-; GFX900-SDAG-NEXT: v_add_f32_e32 v4, s6, v4
-; GFX900-SDAG-NEXT: v_add_f32_e32 v6, s5, v6
-; GFX900-SDAG-NEXT: v_add_f32_e32 v1, s4, v1
-; GFX900-SDAG-NEXT: v_exp_f32_e32 v4, v4
-; GFX900-SDAG-NEXT: v_exp_f32_e32 v6, v6
-; GFX900-SDAG-NEXT: v_exp_f32_e32 v3, v1
-; GFX900-SDAG-NEXT: v_mov_b32_e32 v7, 0
-; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, v4, v2
-; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, v6, v5
-; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, v3, v0
-; GFX900-SDAG-NEXT: global_store_dwordx3 v7, v[0:2], s[0:1]
+; GFX900-SDAG-NEXT: v_add_f32_e32 v1, s6, v1
+; GFX900-SDAG-NEXT: s_cselect_b32 s6, 0x1f800000, 1.0
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, s2
+; GFX900-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; GFX900-SDAG-NEXT: s_cselect_b32 s2, 0x42800000, 0
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX900-SDAG-NEXT: v_add_f32_e32 v2, s5, v2
+; GFX900-SDAG-NEXT: v_add_f32_e32 v0, s4, v0
+; GFX900-SDAG-NEXT: v_exp_f32_e32 v1, v1
+; GFX900-SDAG-NEXT: v_exp_f32_e32 v4, v2
+; GFX900-SDAG-NEXT: v_exp_f32_e32 v0, v0
+; GFX900-SDAG-NEXT: s_cselect_b32 s2, 0x1f800000, 1.0
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s7, v1
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, s6, v4
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, s2, v0
+; GFX900-SDAG-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1]
; GFX900-SDAG-NEXT: s_endpgm
;
; GFX900-GISEL-LABEL: s_exp2_v3f32:
@@ -656,38 +674,44 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
; SI-SDAG-LABEL: s_exp2_v4f32:
; SI-SDAG: ; %bb.0:
-; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd
+; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000
-; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0x42800000
; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; SI-SDAG-NEXT: s_mov_b32 s2, -1
; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc
+; SI-SDAG-NEXT: s_and_b64 s[8:9], vcc, exec
+; SI-SDAG-NEXT: s_cselect_b32 s8, 0x42800000, 0
+; SI-SDAG-NEXT: v_mov_b32_e32 v1, s8
+; SI-SDAG-NEXT: v_add_f32_e32 v1, s7, v1
+; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1
; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v5, 1.0, v1, vcc
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v3, vcc
+; SI-SDAG-NEXT: s_cselect_b32 s7, 0x1f800000, 1.0
+; SI-SDAG-NEXT: s_and_b64 s[8:9], vcc, exec
+; SI-SDAG-NEXT: v_mul_f32_e32 v3, s7, v1
+; SI-SDAG-NEXT: s_cselect_b32 s7, 0x42800000, 0
+; SI-SDAG-NEXT: v_mov_b32_e32 v1, s7
; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v7, 1.0, v1, vcc
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v8, 0, v3, vcc
+; SI-SDAG-NEXT: s_cselect_b32 s8, 0x1f800000, 1.0
+; SI-SDAG-NEXT: v_add_f32_e32 v1, s6, v1
+; SI-SDAG-NEXT: s_and_b64 s[6:7], vcc, exec
+; SI-SDAG-NEXT: s_cselect_b32 s6, 0x42800000, 0
; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc
-; SI-SDAG-NEXT: v_add_f32_e32 v4, s7, v4
-; SI-SDAG-NEXT: v_add_f32_e32 v6, s6, v6
-; SI-SDAG-NEXT: v_add_f32_e32 v8, s5, v8
-; SI-SDAG-NEXT: v_add_f32_e32 v1, s4, v1
-; SI-SDAG-NEXT: v_exp_f32_e32 v4, v4
-; SI-SDAG-NEXT: v_exp_f32_e32 v6, v6
-; SI-SDAG-NEXT: v_exp_f32_e32 v8, v8
-; SI-SDAG-NEXT: v_exp_f32_e32 v9, v1
-; SI-SDAG-NEXT: s_mov_b32 s2, -1
-; SI-SDAG-NEXT: v_mul_f32_e32 v3, v4, v2
-; SI-SDAG-NEXT: v_mul_f32_e32 v2, v6, v5
-; SI-SDAG-NEXT: v_mul_f32_e32 v1, v8, v7
-; SI-SDAG-NEXT: v_mul_f32_e32 v0, v9, v0
+; SI-SDAG-NEXT: s_cselect_b32 s9, 0x1f800000, 1.0
+; SI-SDAG-NEXT: v_mov_b32_e32 v2, s6
+; SI-SDAG-NEXT: s_and_b64 s[6:7], vcc, exec
+; SI-SDAG-NEXT: v_add_f32_e32 v2, s5, v2
+; SI-SDAG-NEXT: s_cselect_b32 s5, 0x42800000, 0
+; SI-SDAG-NEXT: v_mov_b32_e32 v0, s5
+; SI-SDAG-NEXT: v_add_f32_e32 v0, s4, v0
+; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1
+; SI-SDAG-NEXT: v_exp_f32_e32 v4, v2
+; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0
+; SI-SDAG-NEXT: s_cselect_b32 s4, 0x1f800000, 1.0
+; SI-SDAG-NEXT: v_mul_f32_e32 v2, s8, v1
+; SI-SDAG-NEXT: v_mul_f32_e32 v1, s9, v4
+; SI-SDAG-NEXT: v_mul_f32_e32 v0, s4, v0
; SI-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-SDAG-NEXT: s_endpgm
;
@@ -731,37 +755,43 @@ define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; VI-SDAG-LABEL: s_exp2_v4f32:
; VI-SDAG: ; %bb.0:
; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x42800000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc
-; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v5, 1.0, v1, vcc
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v3, vcc
+; VI-SDAG-NEXT: s_and_b64 s[8:9], vcc, exec
+; VI-SDAG-NEXT: s_cselect_b32 s9, 0x42800000, 0
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s9
+; VI-SDAG-NEXT: v_add_f32_e32 v1, s7, v1
+; VI-SDAG-NEXT: v_exp_f32_e32 v1, v1
+; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s6, v0
+; VI-SDAG-NEXT: s_cselect_b32 s8, 0x1f800000, 1.0
+; VI-SDAG-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; VI-SDAG-NEXT: s_cselect_b32 s0, 0x42800000, 0
; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v7, 1.0, v1, vcc
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v8, 0, v3, vcc
+; VI-SDAG-NEXT: s_cselect_b32 s7, 0x1f800000, 1.0
+; VI-SDAG-NEXT: v_mul_f32_e32 v3, s8, v1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; VI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec
+; VI-SDAG-NEXT: s_cselect_b32 s0, 0x42800000, 0
; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
-; VI-SDAG-NEXT: v_add_f32_e32 v4, s7, v4
-; VI-SDAG-NEXT: v_add_f32_e32 v6, s6, v6
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc
-; VI-SDAG-NEXT: v_exp_f32_e32 v4, v4
-; VI-SDAG-NEXT: v_exp_f32_e32 v6, v6
-; VI-SDAG-NEXT: v_add_f32_e32 v8, s5, v8
-; VI-SDAG-NEXT: v_add_f32_e32 v1, s4, v1
-; VI-SDAG-NEXT: v_exp_f32_e32 v8, v8
-; VI-SDAG-NEXT: v_exp_f32_e32 v9, v1
-; VI-SDAG-NEXT: v_mul_f32_e32 v3, v4, v2
-; VI-SDAG-NEXT: v_mul_f32_e32 v2, v6, v5
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s1
-; VI-SDAG-NEXT: v_mul_f32_e32 v1, v8, v7
-; VI-SDAG-NEXT: v_mul_f32_e32 v0, v9, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v4, s0
+; VI-SDAG-NEXT: v_add_f32_e32 v1, s6, v1
+; VI-SDAG-NEXT: s_cselect_b32 s6, 0x1f800000, 1.0
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; VI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec
+; VI-SDAG-NEXT: s_cselect_b32 s0, 0x42800000, 0
+; VI-SDAG-NEXT: v_add_f32_e32 v2, s5, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; VI-SDAG-NEXT: v_exp_f32_e32 v1, v1
+; VI-SDAG-NEXT: v_exp_f32_e32 v4, v2
+; VI-SDAG-NEXT: v_add_f32_e32 v0, s4, v0
+; VI-SDAG-NEXT: v_exp_f32_e32 v0, v0
+; VI-SDAG-NEXT: v_mul_f32_e32 v2, s7, v1
+; VI-SDAG-NEXT: v_mul_f32_e32 v1, s6, v4
+; VI-SDAG-NEXT: s_cselect_b32 s0, 0x1f800000, 1.0
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s3
+; VI-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v4, s2
; VI-SDAG-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-SDAG-NEXT: s_endpgm
;
@@ -805,37 +835,43 @@ define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; GFX900-SDAG-LABEL: s_exp2_v4f32:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34
-; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX900-SDAG-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24
; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000
-; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, 0x42800000
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, 0
; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v3, vcc
-; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v6, 1.0, v1, vcc
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v7, 0, v3, vcc
+; GFX900-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; GFX900-SDAG-NEXT: s_cselect_b32 s2, 0x42800000, 0
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, s2
+; GFX900-SDAG-NEXT: v_add_f32_e32 v1, s7, v1
+; GFX900-SDAG-NEXT: v_exp_f32_e32 v1, v1
+; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s6, v0
+; GFX900-SDAG-NEXT: s_cselect_b32 s2, 0x1f800000, 1.0
+; GFX900-SDAG-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; GFX900-SDAG-NEXT: s_cselect_b32 s0, 0x42800000, 0
; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v8, 1.0, v1, vcc
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v9, 0, v3, vcc
+; GFX900-SDAG-NEXT: s_cselect_b32 s3, 0x1f800000, 1.0
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, s2, v1
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; GFX900-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec
+; GFX900-SDAG-NEXT: s_cselect_b32 s0, 0x42800000, 0
; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc
-; GFX900-SDAG-NEXT: v_add_f32_e32 v5, s7, v5
-; GFX900-SDAG-NEXT: v_add_f32_e32 v7, s6, v7
-; GFX900-SDAG-NEXT: v_add_f32_e32 v9, s5, v9
-; GFX900-SDAG-NEXT: v_add_f32_e32 v1, s4, v1
-; GFX900-SDAG-NEXT: v_exp_f32_e32 v5, v5
-; GFX900-SDAG-NEXT: v_exp_f32_e32 v7, v7
-; GFX900-SDAG-NEXT: v_exp_f32_e32 v9, v9
-; GFX900-SDAG-NEXT: v_exp_f32_e32 v10, v1
-; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, 0
-; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, v5, v2
-; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, v7, v6
-; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, v9, v8
-; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, v10, v0
-; GFX900-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX900-SDAG-NEXT: s_cselect_b32 s2, 0x1f800000, 1.0
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; GFX900-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec
+; GFX900-SDAG-NEXT: s_cselect_b32 s0, 0x42800000, 0
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; GFX900-SDAG-NEXT: v_add_f32_e32 v1, s6, v1
+; GFX900-SDAG-NEXT: v_add_f32_e32 v2, s5, v2
+; GFX900-SDAG-NEXT: v_add_f32_e32 v0, s4, v0
+; GFX900-SDAG-NEXT: v_exp_f32_e32 v1, v1
+; GFX900-SDAG-NEXT: v_exp_f32_e32 v5, v2
+; GFX900-SDAG-NEXT: v_exp_f32_e32 v0, v0
+; GFX900-SDAG-NEXT: s_cselect_b32 s0, 0x1f800000, 1.0
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s3, v1
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, s2, v5
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0
+; GFX900-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9]
; GFX900-SDAG-NEXT: s_endpgm
;
; GFX900-GISEL-LABEL: s_exp2_v4f32:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log.ll b/llvm/test/CodeGen/AMDGPU/llvm.log.ll
index 7f4cf19e9b85b4..b251905c8a4f9b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log.ll
@@ -14,30 +14,30 @@
define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) {
; SI-SDAG-LABEL: s_log_f32:
; SI-SDAG: ; %bb.0:
-; SI-SDAG-NEXT: s_load_dword s0, s[2:3], 0xb
-; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9
+; SI-SDAG-NEXT: s_load_dword s6, s[2:3], 0xb
+; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
-; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000
-; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000
+; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; SI-SDAG-NEXT: s_mov_b32 s2, -1
; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc
-; SI-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0
+; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0
+; SI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec
+; SI-SDAG-NEXT: s_cselect_b32 s4, 0x4f800000, 1.0
+; SI-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; SI-SDAG-NEXT: v_mul_f32_e32 v0, s6, v0
; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
-; SI-SDAG-NEXT: s_mov_b32 s0, 0x3f317217
-; SI-SDAG-NEXT: s_mov_b32 s6, -1
+; SI-SDAG-NEXT: s_mov_b32 s4, 0x3f317217
+; SI-SDAG-NEXT: s_mov_b32 s5, 0x3377d1cf
+; SI-SDAG-NEXT: s_mov_b32 s6, 0x7f800000
; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0
-; SI-SDAG-NEXT: v_fma_f32 v2, v0, s0, -v1
-; SI-SDAG-NEXT: s_mov_b32 s0, 0x3377d1cf
-; SI-SDAG-NEXT: v_fma_f32 v2, v0, s0, v2
-; SI-SDAG-NEXT: s_mov_b32 s0, 0x7f800000
+; SI-SDAG-NEXT: v_fma_f32 v2, v0, s4, -v1
+; SI-SDAG-NEXT: v_fma_f32 v2, v0, s5, v2
; SI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2
-; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, s0
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[0:1]
-; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x41b17218
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
-; SI-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s6
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; SI-SDAG-NEXT: s_cselect_b32 s4, 0x41b17218, 0
+; SI-SDAG-NEXT: v_subrev_f32_e32 v0, s4, v0
+; SI-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-SDAG-NEXT: s_endpgm
;
; SI-GISEL-LABEL: s_log_f32:
@@ -70,16 +70,17 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) {
;
; VI-SDAG-LABEL: s_log_f32:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s0, s[2:3], 0x2c
+; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000
-; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc
-; VI-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0
+; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
+; VI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec
+; VI-SDAG-NEXT: s_cselect_b32 s0, 0x4f800000, 1.0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; VI-SDAG-NEXT: v_mul_f32_e32 v0, s4, v0
; VI-SDAG-NEXT: v_log_f32_e32 v0, v0
-; VI-SDAG-NEXT: s_mov_b32 s0, 0x7f800000
+; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; VI-SDAG-NEXT: s_mov_b32 s2, 0x7f800000
; VI-SDAG-NEXT: v_and_b32_e32 v1, 0xfffff000, v0
; VI-SDAG-NEXT: v_sub_f32_e32 v2, v0, v1
; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v1
@@ -89,13 +90,13 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) {
; VI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317000, v1
; VI-SDAG-NEXT: v_add_f32_e32 v2, v4, v2
; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2
-; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, s0
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x41b17218
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; VI-SDAG-NEXT: v_sub_f32_e32 v2, v0, v1
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s2
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; VI-SDAG-NEXT: s_cselect_b32 s2, 0x41b17218, 0
+; VI-SDAG-NEXT: v_subrev_f32_e32 v2, s2, v0
+; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
; VI-SDAG-NEXT: s_endpgm
;
@@ -132,29 +133,29 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) {
;
; GFX900-SDAG-LABEL: s_log_f32:
; GFX900-SDAG: ; %bb.0:
-; GFX900-SDAG-NEXT: s_load_dword s0, s[2:3], 0x2c
-; GFX900-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
-; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
-; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000
-; GFX900-SDAG-NEXT: s_mov_b32 s1, 0x3377d1cf
+; GFX900-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x800000
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc
-; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0
-; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0
-; GFX900-SDAG-NEXT: s_mov_b32 s0, 0x3f317217
-; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317217, v0
-; GFX900-SDAG-NEXT: v_fma_f32 v3, v0, s0, -v2
-; GFX900-SDAG-NEXT: v_fma_f32 v3, v0, s1, v3
-; GFX900-SDAG-NEXT: s_mov_b32 s0, 0x7f800000
+; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1
+; GFX900-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; GFX900-SDAG-NEXT: s_cselect_b32 s2, 0x4f800000, 1.0
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, s2
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, s4, v1
+; GFX900-SDAG-NEXT: v_log_f32_e32 v1, v1
+; GFX900-SDAG-NEXT: s_mov_b32 s2, 0x3f317217
+; GFX900-SDAG-NEXT: s_mov_b32 s3, 0x3377d1cf
+; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x7f800000
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317217, v1
+; GFX900-SDAG-NEXT: v_fma_f32 v3, v1, s2, -v2
+; GFX900-SDAG-NEXT: v_fma_f32 v3, v1, s3, v3
; GFX900-SDAG-NEXT: v_add_f32_e32 v2, v2, v3
-; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, s0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
-; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0x41b17218
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
-; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v2
-; GFX900-SDAG-NEXT: global_store_dword v1, v0, s[4:5]
+; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s4
+; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX900-SDAG-NEXT: s_cselect_b32 s2, 0x41b17218, 0
+; GFX900-SDAG-NEXT: v_subrev_f32_e32 v1, s2, v1
+; GFX900-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
; GFX900-SDAG-NEXT: s_endpgm
;
; GFX900-GISEL-LABEL: s_log_f32:
@@ -188,11 +189,13 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) {
; GFX1100-SDAG: ; %bb.0:
; GFX1100-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x2c
; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s0
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s4
-; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s1, 0x800000, s0
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1100-SDAG-NEXT: s_and_b32 s1, s1, exec_lo
+; GFX1100-SDAG-NEXT: s_cselect_b32 s1, 0x4f800000, 1.0
+; GFX1100-SDAG-NEXT: v_mul_f32_e64 v0, s0, s1
; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1100-SDAG-NEXT: s_cselect_b32 s2, 0x41b17218, 0
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
@@ -201,13 +204,12 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) {
; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2
-; GFX1100-SDAG-NEXT: v_dual_add_f32 v1, v1, v2 :: v_dual_mov_b32 v2, 0
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, s4
-; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX1100-SDAG-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-NEXT: v_dual_cndmask_b32 v0, v0, v1 :: v_dual_mov_b32 v1, 0
+; GFX1100-SDAG-NEXT: v_subrev_f32_e32 v0, s2, v0
; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-SDAG-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX1100-SDAG-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX1100-SDAG-NEXT: s_nop 0
; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-SDAG-NEXT: s_endpgm
@@ -316,43 +318,45 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) {
define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
; SI-SDAG-LABEL: s_log_v2f32:
; SI-SDAG: ; %bb.0:
-; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
+; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
-; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000
-; SI-SDAG-NEXT: s_mov_b32 s8, 0x3377d1cf
+; SI-SDAG-NEXT: s_mov_b32 s8, 0x3f317217
; SI-SDAG-NEXT: s_mov_b32 s9, 0x7f800000
+; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000
; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc
-; SI-SDAG-NEXT: v_mul_f32_e32 v2, s3, v2
-; SI-SDAG-NEXT: v_log_f32_e32 v2, v2
-; SI-SDAG-NEXT: s_mov_b32 s3, 0x3f317217
-; SI-SDAG-NEXT: s_mov_b32 s4, s0
-; SI-SDAG-NEXT: s_mov_b32 s5, s1
-; SI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3f317217, v2
-; SI-SDAG-NEXT: v_fma_f32 v4, v2, s3, -v3
-; SI-SDAG-NEXT: v_fma_f32 v4, v2, s8, v4
-; SI-SDAG-NEXT: v_add_f32_e32 v3, v3, v4
-; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v2|, s9
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1]
-; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s2, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, v1, s[0:1]
-; SI-SDAG-NEXT: v_mul_f32_e32 v0, s2, v0
+; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0
+; SI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec
+; SI-SDAG-NEXT: s_cselect_b32 s0, 0x4f800000, 1.0
+; SI-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; SI-SDAG-NEXT: v_mul_f32_e32 v1, s7, v1
+; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0
+; SI-SDAG-NEXT: v_log_f32_e32 v1, v1
+; SI-SDAG-NEXT: s_mov_b32 s0, s4
+; SI-SDAG-NEXT: s_mov_b32 s1, s5
+; SI-SDAG-NEXT: s_cselect_b32 s10, 0x41b17218, 0
+; SI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec
+; SI-SDAG-NEXT: s_cselect_b32 s4, 0x4f800000, 1.0
+; SI-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; SI-SDAG-NEXT: v_mul_f32_e32 v0, s6, v0
+; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317217, v1
; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
-; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0x41b17218
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc
-; SI-SDAG-NEXT: v_sub_f32_e32 v1, v2, v1
+; SI-SDAG-NEXT: s_mov_b32 s7, 0x3377d1cf
+; SI-SDAG-NEXT: v_fma_f32 v3, v1, s8, -v2
+; SI-SDAG-NEXT: v_fma_f32 v3, v1, s7, v3
+; SI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3
+; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s9
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317217, v0
-; SI-SDAG-NEXT: v_fma_f32 v4, v0, s3, -v2
-; SI-SDAG-NEXT: v_fma_f32 v4, v0, s8, v4
-; SI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4
+; SI-SDAG-NEXT: v_fma_f32 v3, v0, s8, -v2
+; SI-SDAG-NEXT: v_fma_f32 v3, v0, s7, v3
+; SI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3
; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s9
; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v3, s[0:1]
-; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; SI-SDAG-NEXT: s_mov_b32 s6, -1
-; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v2
-; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-SDAG-NEXT: s_cselect_b32 s4, 0x41b17218, 0
+; SI-SDAG-NEXT: s_mov_b32 s2, -1
+; SI-SDAG-NEXT: v_subrev_f32_e32 v1, s10, v1
+; SI-SDAG-NEXT: v_subrev_f32_e32 v0, s4, v0
+; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-SDAG-NEXT: s_endpgm
;
; SI-GISEL-LABEL: s_log_v2f32:
@@ -396,48 +400,50 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
;
; VI-SDAG-LABEL: s_log_v2f32:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000
-; VI-SDAG-NEXT: s_mov_b32 s2, 0x7f800000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc
-; VI-SDAG-NEXT: v_mul_f32_e32 v2, s7, v2
-; VI-SDAG-NEXT: v_log_f32_e32 v2, v2
-; VI-SDAG-NEXT: v_and_b32_e32 v3, 0xfffff000, v2
-; VI-SDAG-NEXT: v_sub_f32_e32 v4, v2, v3
-; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3805fdf4, v3
-; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3f317000, v4
-; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v4
-; VI-SDAG-NEXT: v_add_f32_e32 v4, v5, v4
-; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3f317000, v3
-; VI-SDAG-NEXT: v_add_f32_e32 v4, v6, v4
-; VI-SDAG-NEXT: v_add_f32_e32 v3, v3, v4
-; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v2|, s2
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1]
-; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s6, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, v1, s[0:1]
-; VI-SDAG-NEXT: v_mul_f32_e32 v0, s6, v0
-; VI-SDAG-NEXT: v_log_f32_e32 v0, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x41b17218
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc
-; VI-SDAG-NEXT: v_sub_f32_e32 v1, v2, v1
-; VI-SDAG-NEXT: v_and_b32_e32 v2, 0xfffff000, v0
-; VI-SDAG-NEXT: v_sub_f32_e32 v4, v0, v2
-; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3f317000, v4
-; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v4
-; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3805fdf4, v2
-; VI-SDAG-NEXT: v_add_f32_e32 v4, v6, v4
-; VI-SDAG-NEXT: v_add_f32_e32 v4, v5, v4
-; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317000, v2
-; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4
-; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s2
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v3, s[0:1]
-; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v2
+; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0
+; VI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec
+; VI-SDAG-NEXT: s_cselect_b32 s4, 0x4f800000, 1.0
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s4
+; VI-SDAG-NEXT: v_mul_f32_e32 v1, s3, v1
+; VI-SDAG-NEXT: v_log_f32_e32 v1, v1
+; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0
+; VI-SDAG-NEXT: s_cselect_b32 s6, 0x41b17218, 0
+; VI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec
+; VI-SDAG-NEXT: v_and_b32_e32 v0, 0xfffff000, v1
+; VI-SDAG-NEXT: v_sub_f32_e32 v2, v1, v0
+; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v0
+; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3f317000, v2
+; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3805fdf4, v2
+; VI-SDAG-NEXT: v_add_f32_e32 v2, v3, v2
+; VI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317000, v0
+; VI-SDAG-NEXT: v_add_f32_e32 v2, v4, v2
+; VI-SDAG-NEXT: s_cselect_b32 s4, 0x4f800000, 1.0
+; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v2
; VI-SDAG-NEXT: v_mov_b32_e32 v2, s4
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
+; VI-SDAG-NEXT: v_mul_f32_e32 v2, s2, v2
+; VI-SDAG-NEXT: v_log_f32_e32 v2, v2
+; VI-SDAG-NEXT: s_mov_b32 s3, 0x7f800000
+; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s3
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; VI-SDAG-NEXT: v_subrev_f32_e32 v1, s6, v0
+; VI-SDAG-NEXT: v_and_b32_e32 v0, 0xfffff000, v2
+; VI-SDAG-NEXT: v_sub_f32_e32 v3, v2, v0
+; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3f317000, v3
+; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v3
+; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3805fdf4, v0
+; VI-SDAG-NEXT: v_add_f32_e32 v3, v5, v3
+; VI-SDAG-NEXT: v_add_f32_e32 v3, v4, v3
+; VI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317000, v0
+; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v3
+; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v2|, s3
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-SDAG-NEXT: s_cselect_b32 s2, 0x41b17218, 0
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_subrev_f32_e32 v0, s2, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-SDAG-NEXT: s_endpgm
;
@@ -492,37 +498,39 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
-; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000
; GFX900-SDAG-NEXT: s_mov_b32 s2, 0x3f317217
; GFX900-SDAG-NEXT: s_mov_b32 s3, 0x3377d1cf
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0
; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v3, 1.0, v1, vcc
-; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, s7, v3
+; GFX900-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec
+; GFX900-SDAG-NEXT: s_cselect_b32 s0, 0x4f800000, 1.0
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, s7, v1
+; GFX900-SDAG-NEXT: v_log_f32_e32 v1, v1
+; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0
+; GFX900-SDAG-NEXT: s_cselect_b32 s8, 0x41b17218, 0
+; GFX900-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317217, v1
+; GFX900-SDAG-NEXT: v_fma_f32 v3, v1, s2, -v0
+; GFX900-SDAG-NEXT: v_fma_f32 v3, v1, s3, v3
+; GFX900-SDAG-NEXT: s_cselect_b32 s0, 0x4f800000, 1.0
+; GFX900-SDAG-NEXT: v_add_f32_e32 v0, v0, v3
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, s0
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, s6, v3
; GFX900-SDAG-NEXT: v_log_f32_e32 v3, v3
; GFX900-SDAG-NEXT: s_mov_b32 s7, 0x7f800000
-; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0
-; GFX900-SDAG-NEXT: v_mul_f32_e32 v4, 0x3f317217, v3
-; GFX900-SDAG-NEXT: v_fma_f32 v5, v3, s2, -v4
-; GFX900-SDAG-NEXT: v_fma_f32 v5, v3, s3, v5
-; GFX900-SDAG-NEXT: v_add_f32_e32 v4, v4, v5
-; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v3|, s7
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1]
-; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s6, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, v1, s[0:1]
-; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, s6, v0
-; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0
-; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, 0x41b17218
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc
-; GFX900-SDAG-NEXT: v_sub_f32_e32 v1, v3, v1
-; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, 0x3f317217, v0
-; GFX900-SDAG-NEXT: v_fma_f32 v5, v0, s2, -v3
-; GFX900-SDAG-NEXT: v_fma_f32 v5, v0, s3, v5
-; GFX900-SDAG-NEXT: v_add_f32_e32 v3, v3, v5
-; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s7
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, v4, s[0:1]
-; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v3
+; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s7
+; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX900-SDAG-NEXT: v_subrev_f32_e32 v1, s8, v0
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317217, v3
+; GFX900-SDAG-NEXT: v_fma_f32 v4, v3, s2, -v0
+; GFX900-SDAG-NEXT: v_fma_f32 v4, v3, s3, v4
+; GFX900-SDAG-NEXT: v_add_f32_e32 v0, v0, v4
+; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, s7
+; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX900-SDAG-NEXT: s_cselect_b32 s0, 0x41b17218, 0
+; GFX900-SDAG-NEXT: v_subrev_f32_e32 v0, s0, v0
; GFX900-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX900-SDAG-NEXT: s_endpgm
;
@@ -570,30 +578,34 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s3
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s5, 0x800000, s2
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s4
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s5
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_dual_mul_f32 v0, s3, v0 :: v_dual_mul_f32 v1, s2, v1
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1100-SDAG-NEXT: s_and_b32 s4, s4, exec_lo
+; GFX1100-SDAG-NEXT: s_cselect_b32 s4, 0x4f800000, 1.0
+; GFX1100-SDAG-NEXT: s_cselect_b32 s6, 0x41b17218, 0
+; GFX1100-SDAG-NEXT: v_mul_f32_e64 v0, s3, s4
+; GFX1100-SDAG-NEXT: s_and_b32 s5, s5, exec_lo
+; GFX1100-SDAG-NEXT: s_cselect_b32 s5, 0x4f800000, 1.0
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1100-SDAG-NEXT: v_mul_f32_e64 v1, s2, s5
; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1100-SDAG-NEXT: s_cselect_b32 s2, 0x41b17218, 0
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, v1
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
-; GFX1100-SDAG-NEXT: v_dual_mul_f32 v2, 0x3f317217, v0 :: v_dual_mul_f32 v3, 0x3f317217, v1
+; GFX1100-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317217, v0
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
+; GFX1100-SDAG-NEXT: v_mul_f32_e32 v3, 0x3f317217, v1
; GFX1100-SDAG-NEXT: v_fma_f32 v4, 0x3f317217, v0, -v2
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_fma_f32 v5, 0x3f317217, v1, -v3
; GFX1100-SDAG-NEXT: v_dual_fmac_f32 v4, 0x3377d1cf, v0 :: v_dual_fmac_f32 v5, 0x3377d1cf, v1
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_dual_add_f32 v2, v2, v4 :: v_dual_add_f32 v3, v3, v5
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x41b17218, s4
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 0x41b17218, s5
; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v1|
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_dual_cndmask_b32 v2, v1, v3 :: v_dual_mov_b32 v3, 0
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_dual_sub_f32 v1, v0, v4 :: v_dual_sub_f32 v0, v2, v5
+; GFX1100-SDAG-NEXT: v_dual_subrev_f32 v1, s6, v0 :: v_dual_subrev_f32 v0, s2, v2
; GFX1100-SDAG-NEXT: global_store_b64 v3, v[0:1], s[0:1]
; GFX1100-SDAG-NEXT: s_nop 0
; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -747,55 +759,59 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
; SI-SDAG-LABEL: s_log_v3f32:
; SI-SDAG: ; %bb.0:
-; SI-SDAG-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd
-; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9
+; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd
+; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
-; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000
+; SI-SDAG-NEXT: s_mov_b32 s10, 0x3377d1cf
+; SI-SDAG-NEXT: s_mov_b32 s11, 0x7f800000
; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SI-SDAG-NEXT: s_mov_b32 s11, 0x3377d1cf
-; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s9, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; SI-SDAG-NEXT: v_mul_f32_e32 v1, s9, v1
+; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0
+; SI-SDAG-NEXT: s_and_b64 s[8:9], vcc, exec
+; SI-SDAG-NEXT: s_cselect_b32 s7, 0x4f800000, 1.0
+; SI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; SI-SDAG-NEXT: v_mul_f32_e32 v1, s5, v1
; SI-SDAG-NEXT: v_log_f32_e32 v1, v1
-; SI-SDAG-NEXT: s_mov_b32 s9, 0x3f317217
-; SI-SDAG-NEXT: s_mov_b32 s12, 0x7f800000
-; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; SI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3f317217, v1
-; SI-SDAG-NEXT: v_fma_f32 v4, v1, s9, -v3
-; SI-SDAG-NEXT: v_fma_f32 v4, v1, s11, v4
-; SI-SDAG-NEXT: v_add_f32_e32 v3, v3, v4
-; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v1|, s12
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1]
-; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s8, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, v2, s[0:1]
-; SI-SDAG-NEXT: v_mul_f32_e32 v3, s8, v3
+; SI-SDAG-NEXT: s_mov_b32 s7, 0x3f317217
+; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
+; SI-SDAG-NEXT: s_cselect_b32 s5, 0x41b17218, 0
+; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317217, v1
+; SI-SDAG-NEXT: v_fma_f32 v3, v1, s7, -v2
+; SI-SDAG-NEXT: s_and_b64 s[8:9], vcc, exec
+; SI-SDAG-NEXT: v_fma_f32 v3, v1, s10, v3
+; SI-SDAG-NEXT: s_cselect_b32 s8, 0x4f800000, 1.0
+; SI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3
+; SI-SDAG-NEXT: v_mov_b32_e32 v3, s8
+; SI-SDAG-NEXT: v_mul_f32_e32 v3, s4, v3
; SI-SDAG-NEXT: v_log_f32_e32 v3, v3
-; SI-SDAG-NEXT: v_mov_b32_e32 v4, 0x41b17218
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc
-; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s10, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v2, vcc
-; SI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v5
-; SI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3f317217, v3
-; SI-SDAG-NEXT: v_mul_f32_e32 v0, s10, v0
-; SI-SDAG-NEXT: v_fma_f32 v6, v3, s9, -v5
-; SI-SDAG-NEXT: v_log_f32_e32 v2, v0
-; SI-SDAG-NEXT: v_fma_f32 v6, v3, s11, v6
-; SI-SDAG-NEXT: v_add_f32_e32 v5, v5, v6
-; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v3|, s12
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, v3, v5, s[2:3]
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, v4, s[0:1]
-; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v3
-; SI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3f317217, v2
-; SI-SDAG-NEXT: v_fma_f32 v5, v2, s9, -v3
-; SI-SDAG-NEXT: v_fma_f32 v5, v2, s11, v5
-; SI-SDAG-NEXT: v_add_f32_e32 v3, v3, v5
-; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v2|, s12
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1]
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc
-; SI-SDAG-NEXT: s_mov_b32 s6, -1
-; SI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v3
-; SI-SDAG-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:8
-; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s11
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0
+; SI-SDAG-NEXT: v_subrev_f32_e32 v1, s5, v1
+; SI-SDAG-NEXT: s_cselect_b32 s8, 0x41b17218, 0
+; SI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec
+; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317217, v3
+; SI-SDAG-NEXT: s_cselect_b32 s4, 0x4f800000, 1.0
+; SI-SDAG-NEXT: v_fma_f32 v4, v3, s7, -v2
+; SI-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; SI-SDAG-NEXT: v_fma_f32 v4, v3, s10, v4
+; SI-SDAG-NEXT: v_mul_f32_e32 v0, s6, v0
+; SI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4
+; SI-SDAG-NEXT: v_log_f32_e32 v4, v0
+; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, s11
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; SI-SDAG-NEXT: s_cselect_b32 s4, 0x41b17218, 0
+; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317217, v4
+; SI-SDAG-NEXT: v_fma_f32 v3, v4, s7, -v2
+; SI-SDAG-NEXT: v_fma_f32 v3, v4, s10, v3
+; SI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3
+; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v4|, s11
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; SI-SDAG-NEXT: s_mov_b32 s2, -1
+; SI-SDAG-NEXT: v_subrev_f32_e32 v2, s4, v2
+; SI-SDAG-NEXT: v_subrev_f32_e32 v0, s8, v0
+; SI-SDAG-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:8
+; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-SDAG-NEXT: s_endpgm
;
; SI-GISEL-LABEL: s_log_v3f32:
@@ -855,65 +871,69 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
; VI-SDAG: ; %bb.0:
; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000
-; VI-SDAG-NEXT: s_mov_b32 s8, 0x7f800000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc
-; VI-SDAG-NEXT: v_mul_f32_e32 v2, s6, v2
-; VI-SDAG-NEXT: v_log_f32_e32 v2, v2
-; VI-SDAG-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x24
-; VI-SDAG-NEXT: v_and_b32_e32 v3, 0xfffff000, v2
-; VI-SDAG-NEXT: v_sub_f32_e32 v4, v2, v3
-; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3805fdf4, v3
-; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3f317000, v4
-; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v4
-; VI-SDAG-NEXT: v_add_f32_e32 v4, v5, v4
-; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3f317000, v3
-; VI-SDAG-NEXT: v_add_f32_e32 v4, v6, v4
-; VI-SDAG-NEXT: v_add_f32_e32 v3, v3, v4
-; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v2|, s8
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1]
-; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s5, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, v1, s[0:1]
+; VI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec
+; VI-SDAG-NEXT: s_cselect_b32 s0, 0x4f800000, 1.0
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; VI-SDAG-NEXT: v_mul_f32_e32 v1, s6, v1
+; VI-SDAG-NEXT: v_log_f32_e32 v1, v1
+; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0
+; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; VI-SDAG-NEXT: s_cselect_b32 s7, 0x41b17218, 0
+; VI-SDAG-NEXT: v_and_b32_e32 v2, 0xfffff000, v1
+; VI-SDAG-NEXT: v_sub_f32_e32 v3, v1, v2
+; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v2
+; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3f317000, v3
+; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v3
+; VI-SDAG-NEXT: v_add_f32_e32 v3, v4, v3
+; VI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317000, v2
+; VI-SDAG-NEXT: v_add_f32_e32 v3, v5, v3
+; VI-SDAG-NEXT: s_cselect_b32 s2, 0x4f800000, 1.0
+; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s2
+; VI-SDAG-NEXT: s_mov_b32 s6, 0x7f800000
; VI-SDAG-NEXT: v_mul_f32_e32 v3, s5, v3
; VI-SDAG-NEXT: v_log_f32_e32 v3, v3
-; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x41b17218
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc
-; VI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v5
-; VI-SDAG-NEXT: v_and_b32_e32 v5, 0xfffff000, v3
+; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s6
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
-; VI-SDAG-NEXT: v_sub_f32_e32 v6, v3, v5
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc
-; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3f317000, v6
-; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3805fdf4, v6
-; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x3805fdf4, v5
+; VI-SDAG-NEXT: s_cselect_b32 s5, 0x41b17218, 0
+; VI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; VI-SDAG-NEXT: s_cselect_b32 s2, 0x4f800000, 1.0
+; VI-SDAG-NEXT: v_subrev_f32_e32 v2, s7, v1
+; VI-SDAG-NEXT: v_and_b32_e32 v1, 0xfffff000, v3
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_sub_f32_e32 v4, v3, v1
; VI-SDAG-NEXT: v_mul_f32_e32 v0, s4, v0
-; VI-SDAG-NEXT: v_add_f32_e32 v6, v8, v6
+; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3f317000, v4
+; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v4
+; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3805fdf4, v1
; VI-SDAG-NEXT: v_log_f32_e32 v0, v0
-; VI-SDAG-NEXT: v_add_f32_e32 v6, v7, v6
-; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3f317000, v5
-; VI-SDAG-NEXT: v_add_f32_e32 v5, v5, v6
-; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v3|, s8
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, v3, v5, s[2:3]
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, v4, s[0:1]
-; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v3
+; VI-SDAG-NEXT: v_add_f32_e32 v4, v6, v4
+; VI-SDAG-NEXT: v_add_f32_e32 v4, v5, v4
+; VI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317000, v1
+; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v4
+; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, s6
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; VI-SDAG-NEXT: v_and_b32_e32 v3, 0xfffff000, v0
-; VI-SDAG-NEXT: v_sub_f32_e32 v5, v0, v3
-; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3f317000, v5
-; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3805fdf4, v5
-; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3805fdf4, v3
-; VI-SDAG-NEXT: v_add_f32_e32 v5, v7, v5
-; VI-SDAG-NEXT: v_add_f32_e32 v5, v6, v5
+; VI-SDAG-NEXT: v_sub_f32_e32 v4, v0, v3
+; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3f317000, v4
+; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v4
+; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3805fdf4, v3
+; VI-SDAG-NEXT: v_add_f32_e32 v4, v6, v4
+; VI-SDAG-NEXT: v_add_f32_e32 v4, v5, v4
; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3f317000, v3
-; VI-SDAG-NEXT: v_add_f32_e32 v3, v3, v5
-; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, s8
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1]
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc
-; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v3
+; VI-SDAG-NEXT: v_add_f32_e32 v3, v3, v4
+; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s6
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
+; VI-SDAG-NEXT: s_cselect_b32 s2, 0x41b17218, 0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s6
-; VI-SDAG-NEXT: v_mov_b32_e32 v4, s7
+; VI-SDAG-NEXT: v_mov_b32_e32 v4, s1
+; VI-SDAG-NEXT: v_subrev_f32_e32 v1, s5, v1
+; VI-SDAG-NEXT: v_subrev_f32_e32 v0, s2, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s0
; VI-SDAG-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
; VI-SDAG-NEXT: s_endpgm
;
@@ -985,52 +1005,56 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
; GFX900-SDAG-LABEL: s_log_v3f32:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34
-; GFX900-SDAG-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24
+; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
-; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000
; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX900-SDAG-NEXT: s_mov_b32 s7, 0x3377d1cf
+; GFX900-SDAG-NEXT: s_mov_b32 s7, 0x3f317217
+; GFX900-SDAG-NEXT: s_mov_b32 s8, 0x7f800000
; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc
-; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s6, v2
-; GFX900-SDAG-NEXT: v_log_f32_e32 v2, v2
-; GFX900-SDAG-NEXT: s_mov_b32 s6, 0x3f317217
-; GFX900-SDAG-NEXT: s_mov_b32 s10, 0x7f800000
-; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-SDAG-NEXT: v_mul_f32_e32 v4, 0x3f317217, v2
-; GFX900-SDAG-NEXT: v_fma_f32 v5, v2, s6, -v4
-; GFX900-SDAG-NEXT: v_fma_f32 v5, v2, s7, v5
-; GFX900-SDAG-NEXT: v_add_f32_e32 v4, v4, v5
-; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v2|, s10
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1]
-; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s5, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v4, 1.0, v1, s[0:1]
+; GFX900-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; GFX900-SDAG-NEXT: s_cselect_b32 s2, 0x4f800000, 1.0
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, s2
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, s6, v1
+; GFX900-SDAG-NEXT: v_log_f32_e32 v1, v1
+; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0
+; GFX900-SDAG-NEXT: s_mov_b32 s6, 0x3377d1cf
+; GFX900-SDAG-NEXT: s_cselect_b32 s9, 0x41b17218, 0
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317217, v1
+; GFX900-SDAG-NEXT: v_fma_f32 v4, v1, s7, -v2
+; GFX900-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; GFX900-SDAG-NEXT: v_fma_f32 v4, v1, s6, v4
+; GFX900-SDAG-NEXT: s_cselect_b32 s2, 0x4f800000, 1.0
+; GFX900-SDAG-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, s2
+; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s8
; GFX900-SDAG-NEXT: v_mul_f32_e32 v4, s5, v4
-; GFX900-SDAG-NEXT: v_log_f32_e32 v4, v4
-; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0x41b17218
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v5, vcc
+; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc
-; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v2, v6
-; GFX900-SDAG-NEXT: v_mul_f32_e32 v6, 0x3f317217, v4
+; GFX900-SDAG-NEXT: v_log_f32_e32 v4, v4
+; GFX900-SDAG-NEXT: s_cselect_b32 s5, 0x41b17218, 0
+; GFX900-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; GFX900-SDAG-NEXT: s_cselect_b32 s2, 0x4f800000, 1.0
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, s2
; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, s4, v0
-; GFX900-SDAG-NEXT: v_fma_f32 v7, v4, s6, -v6
+; GFX900-SDAG-NEXT: v_subrev_f32_e32 v2, s9, v1
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v4
; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0
-; GFX900-SDAG-NEXT: v_fma_f32 v7, v4, s7, v7
-; GFX900-SDAG-NEXT: v_add_f32_e32 v6, v6, v7
-; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v4|, s10
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, v4, v6, s[2:3]
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, v5, s[0:1]
-; GFX900-SDAG-NEXT: v_sub_f32_e32 v1, v1, v4
+; GFX900-SDAG-NEXT: v_fma_f32 v5, v4, s7, -v1
+; GFX900-SDAG-NEXT: v_fma_f32 v5, v4, s6, v5
+; GFX900-SDAG-NEXT: v_add_f32_e32 v1, v1, v5
+; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v4|, s8
+; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
; GFX900-SDAG-NEXT: v_mul_f32_e32 v4, 0x3f317217, v0
-; GFX900-SDAG-NEXT: v_fma_f32 v6, v0, s6, -v4
-; GFX900-SDAG-NEXT: v_fma_f32 v6, v0, s7, v6
-; GFX900-SDAG-NEXT: v_add_f32_e32 v4, v4, v6
-; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, s10
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1]
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v5, vcc
-; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v4
-; GFX900-SDAG-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9]
+; GFX900-SDAG-NEXT: v_fma_f32 v5, v0, s7, -v4
+; GFX900-SDAG-NEXT: v_fma_f32 v5, v0, s6, v5
+; GFX900-SDAG-NEXT: v_add_f32_e32 v4, v4, v5
+; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s8
+; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX900-SDAG-NEXT: s_cselect_b32 s2, 0x41b17218, 0
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, 0
+; GFX900-SDAG-NEXT: v_subrev_f32_e32 v1, s5, v1
+; GFX900-SDAG-NEXT: v_subrev_f32_e32 v0, s2, v0
+; GFX900-SDAG-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1]
; GFX900-SDAG-NEXT: s_endpgm
;
; GFX900-GISEL-LABEL: s_log_v3f32:
@@ -1086,55 +1110,52 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
;
; GFX1100-SDAG-LABEL: s_log_v3f32:
; GFX1100-SDAG: ; %bb.0:
-; GFX1100-SDAG-NEXT: s_clause 0x1
; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x34
-; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s6
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s5
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s4
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s7
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s8
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s9
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x41b17218, s7
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 0x41b17218, s8
-; GFX1100-SDAG-NEXT: v_dual_mul_f32 v0, s6, v0 :: v_dual_mul_f32 v1, s5, v1
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, s6
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s1, 0x800000, s5
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s4
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX1100-SDAG-NEXT: s_and_b32 s0, s0, exec_lo
+; GFX1100-SDAG-NEXT: s_cselect_b32 s0, 0x4f800000, 1.0
+; GFX1100-SDAG-NEXT: s_cselect_b32 s8, 0x41b17218, 0
+; GFX1100-SDAG-NEXT: s_and_b32 s1, s1, exec_lo
+; GFX1100-SDAG-NEXT: s_cselect_b32 s1, 0x4f800000, 1.0
+; GFX1100-SDAG-NEXT: v_mul_f32_e64 v0, s6, s0
+; GFX1100-SDAG-NEXT: v_mul_f32_e64 v1, s5, s1
+; GFX1100-SDAG-NEXT: s_cselect_b32 s9, 0x41b17218, 0
+; GFX1100-SDAG-NEXT: s_and_b32 s7, s7, exec_lo
+; GFX1100-SDAG-NEXT: s_cselect_b32 s0, 0x4f800000, 1.0
; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
-; GFX1100-SDAG-NEXT: v_mul_f32_e32 v2, s4, v2
; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, v1
-; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
-; GFX1100-SDAG-NEXT: v_mul_f32_e32 v3, 0x3f317217, v0
+; GFX1100-SDAG-NEXT: v_mul_f32_e64 v2, s4, s0
+; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1100-SDAG-NEXT: s_cselect_b32 s2, 0x41b17218, 0
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v2, v2
-; GFX1100-SDAG-NEXT: v_mul_f32_e32 v4, 0x3f317217, v1
+; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-NEXT: v_dual_mul_f32 v3, 0x3f317217, v0 :: v_dual_mul_f32 v4, 0x3f317217, v1
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
; GFX1100-SDAG-NEXT: v_fma_f32 v6, 0x3f317217, v0, -v3
-; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
-; GFX1100-SDAG-NEXT: v_mul_f32_e32 v5, 0x3f317217, v2
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_fma_f32 v7, 0x3f317217, v1, -v4
-; GFX1100-SDAG-NEXT: v_fmac_f32_e32 v6, 0x3377d1cf, v0
+; GFX1100-SDAG-NEXT: v_dual_fmac_f32 v6, 0x3377d1cf, v0 :: v_dual_fmac_f32 v7, 0x3377d1cf, v1
+; GFX1100-SDAG-NEXT: v_mul_f32_e32 v5, 0x3f317217, v2
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1100-SDAG-NEXT: v_dual_add_f32 v3, v3, v6 :: v_dual_add_f32 v4, v4, v7
; GFX1100-SDAG-NEXT: v_fma_f32 v8, 0x3f317217, v2, -v5
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1100-SDAG-NEXT: v_fmac_f32_e32 v7, 0x3377d1cf, v1
-; GFX1100-SDAG-NEXT: v_add_f32_e32 v3, v3, v6
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 0x41b17218, s9
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1100-SDAG-NEXT: v_add_f32_e32 v4, v4, v7
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v1|
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_dual_fmac_f32 v8, 0x3377d1cf, v2 :: v_dual_cndmask_b32 v1, v1, v4
-; GFX1100-SDAG-NEXT: v_dual_add_f32 v5, v5, v8 :: v_dual_mov_b32 v4, 0
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1100-SDAG-NEXT: v_add_f32_e32 v5, v5, v8
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v2|
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1100-SDAG-NEXT: v_sub_f32_e32 v1, v1, v10
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v3, v2, v5, vcc_lo
-; GFX1100-SDAG-NEXT: v_sub_f32_e32 v2, v0, v9
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v3, v6
+; GFX1100-SDAG-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_subrev_f32 v1, s9, v1
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-NEXT: v_dual_cndmask_b32 v3, v2, v5 :: v_dual_subrev_f32 v2, s8, v0
+; GFX1100-SDAG-NEXT: v_subrev_f32_e32 v0, s2, v3
+; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX1100-SDAG-NEXT: global_store_b96 v4, v[0:2], s[0:1]
; GFX1100-SDAG-NEXT: s_nop 0
; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1353,66 +1374,72 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
; SI-SDAG-LABEL: s_log_v4f32:
; SI-SDAG: ; %bb.0:
-; SI-SDAG-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd
-; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9
+; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd
+; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
-; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000
-; SI-SDAG-NEXT: s_mov_b32 s12, 0x3377d1cf
+; SI-SDAG-NEXT: s_mov_b32 s10, 0x3f317217
+; SI-SDAG-NEXT: s_mov_b32 s11, 0x3377d1cf
; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s11, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc
-; SI-SDAG-NEXT: v_mul_f32_e32 v2, s11, v2
-; SI-SDAG-NEXT: v_log_f32_e32 v2, v2
-; SI-SDAG-NEXT: s_mov_b32 s11, 0x3f317217
-; SI-SDAG-NEXT: s_mov_b32 s13, 0x7f800000
-; SI-SDAG-NEXT: v_mov_b32_e32 v5, 0x41b17218
-; SI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3f317217, v2
-; SI-SDAG-NEXT: v_fma_f32 v4, v2, s11, -v3
-; SI-SDAG-NEXT: v_fma_f32 v4, v2, s12, v4
-; SI-SDAG-NEXT: v_add_f32_e32 v3, v3, v4
-; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v2|, s13
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1]
-; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s10, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, v1, s[0:1]
-; SI-SDAG-NEXT: v_mul_f32_e32 v3, s10, v3
+; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0
+; SI-SDAG-NEXT: s_and_b64 s[8:9], vcc, exec
+; SI-SDAG-NEXT: s_cselect_b32 s8, 0x4f800000, 1.0
+; SI-SDAG-NEXT: v_mov_b32_e32 v1, s8
+; SI-SDAG-NEXT: v_mul_f32_e32 v1, s7, v1
+; SI-SDAG-NEXT: v_log_f32_e32 v1, v1
+; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0
+; SI-SDAG-NEXT: s_cselect_b32 s7, 0x41b17218, 0
+; SI-SDAG-NEXT: s_and_b64 s[8:9], vcc, exec
+; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317217, v1
+; SI-SDAG-NEXT: v_fma_f32 v3, v1, s10, -v2
+; SI-SDAG-NEXT: v_fma_f32 v3, v1, s11, v3
+; SI-SDAG-NEXT: s_cselect_b32 s8, 0x4f800000, 1.0
+; SI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3
+; SI-SDAG-NEXT: v_mov_b32_e32 v3, s8
+; SI-SDAG-NEXT: v_mul_f32_e32 v3, s6, v3
; SI-SDAG-NEXT: v_log_f32_e32 v4, v3
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc
-; SI-SDAG-NEXT: v_sub_f32_e32 v3, v2, v3
-; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s9, v0
-; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317217, v4
-; SI-SDAG-NEXT: v_fma_f32 v6, v4, s11, -v2
-; SI-SDAG-NEXT: v_fma_f32 v6, v4, s12, v6
-; SI-SDAG-NEXT: v_add_f32_e32 v2, v2, v6
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v6, 1.0, v1, vcc
-; SI-SDAG-NEXT: v_mul_f32_e32 v6, s9, v6
-; SI-SDAG-NEXT: v_log_f32_e32 v6, v6
-; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v4|, s13
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[2:3]
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, v5, s[0:1]
-; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s8, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, v1, s[0:1]
-; SI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v4
-; SI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3f317217, v6
-; SI-SDAG-NEXT: v_mul_f32_e32 v0, s8, v0
-; SI-SDAG-NEXT: v_fma_f32 v7, v6, s11, -v4
+; SI-SDAG-NEXT: s_mov_b32 s12, 0x7f800000
+; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s12
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; SI-SDAG-NEXT: v_subrev_f32_e32 v3, s7, v1
+; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v4
+; SI-SDAG-NEXT: v_fma_f32 v2, v4, s10, -v1
+; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0
+; SI-SDAG-NEXT: v_fma_f32 v2, v4, s11, v2
+; SI-SDAG-NEXT: s_cselect_b32 s8, 0x41b17218, 0
+; SI-SDAG-NEXT: s_and_b64 s[6:7], vcc, exec
+; SI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2
+; SI-SDAG-NEXT: s_cselect_b32 s6, 0x4f800000, 1.0
+; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v4|, s12
+; SI-SDAG-NEXT: v_mov_b32_e32 v2, s6
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
+; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
+; SI-SDAG-NEXT: v_mul_f32_e32 v2, s5, v2
+; SI-SDAG-NEXT: s_cselect_b32 s5, 0x41b17218, 0
+; SI-SDAG-NEXT: s_and_b64 s[6:7], vcc, exec
+; SI-SDAG-NEXT: v_log_f32_e32 v5, v2
+; SI-SDAG-NEXT: s_cselect_b32 s6, 0x4f800000, 1.0
+; SI-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; SI-SDAG-NEXT: v_mul_f32_e32 v0, s4, v0
; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
-; SI-SDAG-NEXT: v_fma_f32 v7, v6, s12, v7
-; SI-SDAG-NEXT: v_add_f32_e32 v4, v4, v7
-; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v6|, s13
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, v6, v4, s[2:3]
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v5, vcc
-; SI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v4
+; SI-SDAG-NEXT: v_subrev_f32_e32 v2, s8, v1
+; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v5
+; SI-SDAG-NEXT: v_fma_f32 v4, v5, s10, -v1
+; SI-SDAG-NEXT: v_fma_f32 v4, v5, s11, v4
+; SI-SDAG-NEXT: v_add_f32_e32 v1, v1, v4
+; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v5|, s12
; SI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3f317217, v0
-; SI-SDAG-NEXT: v_fma_f32 v6, v0, s11, -v4
-; SI-SDAG-NEXT: v_fma_f32 v6, v0, s12, v6
-; SI-SDAG-NEXT: v_add_f32_e32 v4, v4, v6
-; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s13
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; SI-SDAG-NEXT: v_fma_f32 v5, v0, s10, -v4
+; SI-SDAG-NEXT: v_fma_f32 v5, v0, s11, v5
+; SI-SDAG-NEXT: v_add_f32_e32 v4, v4, v5
+; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s12
; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, v5, s[0:1]
-; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; SI-SDAG-NEXT: s_mov_b32 s6, -1
-; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v4
-; SI-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; SI-SDAG-NEXT: s_cselect_b32 s4, 0x41b17218, 0
+; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; SI-SDAG-NEXT: s_mov_b32 s2, -1
+; SI-SDAG-NEXT: v_subrev_f32_e32 v1, s5, v1
+; SI-SDAG-NEXT: v_subrev_f32_e32 v0, s4, v0
+; SI-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-SDAG-NEXT: s_endpgm
;
; SI-GISEL-LABEL: s_log_v4f32:
@@ -1482,82 +1509,89 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
; VI-SDAG-LABEL: s_log_v4f32:
; VI-SDAG: ; %bb.0:
; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34
-; VI-SDAG-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc
-; VI-SDAG-NEXT: v_mul_f32_e32 v2, s7, v2
-; VI-SDAG-NEXT: v_log_f32_e32 v2, v2
-; VI-SDAG-NEXT: s_mov_b32 s7, 0x7f800000
-; VI-SDAG-NEXT: v_and_b32_e32 v3, 0xfffff000, v2
-; VI-SDAG-NEXT: v_sub_f32_e32 v4, v2, v3
-; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3805fdf4, v3
-; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3f317000, v4
-; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v4
-; VI-SDAG-NEXT: v_add_f32_e32 v4, v5, v4
-; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3f317000, v3
-; VI-SDAG-NEXT: v_add_f32_e32 v4, v6, v4
-; VI-SDAG-NEXT: v_add_f32_e32 v3, v3, v4
-; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v2|, s7
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1]
-; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s6, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, v1, s[0:1]
+; VI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec
+; VI-SDAG-NEXT: s_cselect_b32 s0, 0x4f800000, 1.0
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; VI-SDAG-NEXT: v_mul_f32_e32 v1, s7, v1
+; VI-SDAG-NEXT: v_log_f32_e32 v1, v1
+; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0
+; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; VI-SDAG-NEXT: s_cselect_b32 s8, 0x41b17218, 0
+; VI-SDAG-NEXT: v_and_b32_e32 v2, 0xfffff000, v1
+; VI-SDAG-NEXT: v_sub_f32_e32 v3, v1, v2
+; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v2
+; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3f317000, v3
+; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v3
+; VI-SDAG-NEXT: v_add_f32_e32 v3, v4, v3
+; VI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317000, v2
+; VI-SDAG-NEXT: v_add_f32_e32 v3, v5, v3
+; VI-SDAG-NEXT: s_cselect_b32 s2, 0x4f800000, 1.0
+; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s2
; VI-SDAG-NEXT: v_mul_f32_e32 v3, s6, v3
; VI-SDAG-NEXT: v_log_f32_e32 v4, v3
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0x41b17218
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc
-; VI-SDAG-NEXT: v_sub_f32_e32 v3, v2, v3
-; VI-SDAG-NEXT: v_and_b32_e32 v2, 0xfffff000, v4
-; VI-SDAG-NEXT: v_sub_f32_e32 v6, v4, v2
-; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3f317000, v6
-; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3805fdf4, v6
-; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x3805fdf4, v2
-; VI-SDAG-NEXT: v_add_f32_e32 v6, v8, v6
-; VI-SDAG-NEXT: v_add_f32_e32 v6, v7, v6
-; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317000, v2
+; VI-SDAG-NEXT: s_mov_b32 s7, 0x7f800000
+; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s7
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; VI-SDAG-NEXT: v_subrev_f32_e32 v3, s8, v1
+; VI-SDAG-NEXT: v_and_b32_e32 v1, 0xfffff000, v4
+; VI-SDAG-NEXT: v_sub_f32_e32 v2, v4, v1
+; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3f317000, v2
+; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3805fdf4, v2
+; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3805fdf4, v1
; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0
-; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v6
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v6, 1.0, v1, vcc
-; VI-SDAG-NEXT: v_mul_f32_e32 v6, s5, v6
-; VI-SDAG-NEXT: v_log_f32_e32 v6, v6
-; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v4|, s7
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[2:3]
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, v5, s[0:1]
-; VI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v4
-; VI-SDAG-NEXT: v_and_b32_e32 v4, 0xfffff000, v6
-; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s4, v0
-; VI-SDAG-NEXT: v_sub_f32_e32 v7, v6, v4
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, v1, s[0:1]
-; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x3f317000, v7
-; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3805fdf4, v7
-; VI-SDAG-NEXT: v_mul_f32_e32 v9, 0x3805fdf4, v4
+; VI-SDAG-NEXT: v_add_f32_e32 v2, v6, v2
+; VI-SDAG-NEXT: s_cselect_b32 s6, 0x41b17218, 0
+; VI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; VI-SDAG-NEXT: v_add_f32_e32 v2, v5, v2
+; VI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317000, v1
+; VI-SDAG-NEXT: s_cselect_b32 s2, 0x4f800000, 1.0
+; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
+; VI-SDAG-NEXT: v_mul_f32_e32 v2, s5, v2
+; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v4|, s7
+; VI-SDAG-NEXT: v_log_f32_e32 v5, v2
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
+; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
+; VI-SDAG-NEXT: s_cselect_b32 s5, 0x41b17218, 0
+; VI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; VI-SDAG-NEXT: s_cselect_b32 s2, 0x4f800000, 1.0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_subrev_f32_e32 v2, s6, v1
+; VI-SDAG-NEXT: v_and_b32_e32 v1, 0xfffff000, v5
; VI-SDAG-NEXT: v_mul_f32_e32 v0, s4, v0
-; VI-SDAG-NEXT: v_add_f32_e32 v7, v9, v7
+; VI-SDAG-NEXT: v_sub_f32_e32 v4, v5, v1
; VI-SDAG-NEXT: v_log_f32_e32 v0, v0
-; VI-SDAG-NEXT: v_add_f32_e32 v7, v8, v7
-; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3f317000, v4
-; VI-SDAG-NEXT: v_add_f32_e32 v4, v4, v7
-; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v6|, s7
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, v6, v4, s[2:3]
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v5, vcc
-; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v4
+; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3f317000, v4
+; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v4
+; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3805fdf4, v1
+; VI-SDAG-NEXT: v_add_f32_e32 v4, v7, v4
+; VI-SDAG-NEXT: v_add_f32_e32 v4, v6, v4
+; VI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317000, v1
+; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v4
+; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v5|, s7
; VI-SDAG-NEXT: v_and_b32_e32 v4, 0xfffff000, v0
-; VI-SDAG-NEXT: v_sub_f32_e32 v6, v0, v4
-; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3f317000, v6
-; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3805fdf4, v6
-; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x3805fdf4, v4
-; VI-SDAG-NEXT: v_add_f32_e32 v6, v8, v6
-; VI-SDAG-NEXT: v_add_f32_e32 v6, v7, v6
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; VI-SDAG-NEXT: v_sub_f32_e32 v5, v0, v4
+; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3f317000, v5
+; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3805fdf4, v5
+; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3805fdf4, v4
+; VI-SDAG-NEXT: v_add_f32_e32 v5, v7, v5
+; VI-SDAG-NEXT: v_add_f32_e32 v5, v6, v5
; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3f317000, v4
-; VI-SDAG-NEXT: v_add_f32_e32 v4, v4, v6
+; VI-SDAG-NEXT: v_add_f32_e32 v4, v4, v5
; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s7
; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, v5, s[0:1]
-; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v4
-; VI-SDAG-NEXT: v_mov_b32_e32 v4, s8
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s9
+; VI-SDAG-NEXT: s_cselect_b32 s2, 0x41b17218, 0
+; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s1
+; VI-SDAG-NEXT: v_subrev_f32_e32 v1, s5, v1
+; VI-SDAG-NEXT: v_subrev_f32_e32 v0, s2, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v4, s0
; VI-SDAG-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-SDAG-NEXT: s_endpgm
;
@@ -1646,64 +1680,70 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
; GFX900-SDAG-LABEL: s_log_v4f32:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34
-; GFX900-SDAG-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24
+; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
-; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000
-; GFX900-SDAG-NEXT: s_mov_b32 s10, 0x3377d1cf
+; GFX900-SDAG-NEXT: s_mov_b32 s8, 0x3f317217
+; GFX900-SDAG-NEXT: s_mov_b32 s9, 0x7f800000
; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc
-; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s7, v2
-; GFX900-SDAG-NEXT: v_log_f32_e32 v2, v2
-; GFX900-SDAG-NEXT: s_mov_b32 s7, 0x3f317217
-; GFX900-SDAG-NEXT: s_mov_b32 s11, 0x7f800000
-; GFX900-SDAG-NEXT: v_mov_b32_e32 v6, 0x41b17218
-; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, 0x3f317217, v2
-; GFX900-SDAG-NEXT: v_fma_f32 v5, v2, s7, -v3
-; GFX900-SDAG-NEXT: v_fma_f32 v5, v2, s10, v5
-; GFX900-SDAG-NEXT: v_add_f32_e32 v3, v3, v5
-; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v2|, s11
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1]
-; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s6, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, v1, s[0:1]
+; GFX900-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; GFX900-SDAG-NEXT: s_cselect_b32 s2, 0x4f800000, 1.0
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, s2
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, s7, v1
+; GFX900-SDAG-NEXT: v_log_f32_e32 v1, v1
+; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0
+; GFX900-SDAG-NEXT: s_mov_b32 s7, 0x3377d1cf
+; GFX900-SDAG-NEXT: s_cselect_b32 s10, 0x41b17218, 0
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317217, v1
+; GFX900-SDAG-NEXT: v_fma_f32 v3, v1, s8, -v2
+; GFX900-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; GFX900-SDAG-NEXT: v_fma_f32 v3, v1, s7, v3
+; GFX900-SDAG-NEXT: s_cselect_b32 s2, 0x4f800000, 1.0
+; GFX900-SDAG-NEXT: v_add_f32_e32 v2, v2, v3
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, s2
; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, s6, v3
; GFX900-SDAG-NEXT: v_log_f32_e32 v5, v3
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v6, vcc
-; GFX900-SDAG-NEXT: v_sub_f32_e32 v3, v2, v3
+; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s9
+; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX900-SDAG-NEXT: v_subrev_f32_e32 v3, s10, v1
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v5
+; GFX900-SDAG-NEXT: v_fma_f32 v2, v5, s8, -v1
; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0
-; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317217, v5
-; GFX900-SDAG-NEXT: v_fma_f32 v7, v5, s7, -v2
-; GFX900-SDAG-NEXT: v_fma_f32 v7, v5, s10, v7
-; GFX900-SDAG-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v7, 1.0, v1, vcc
-; GFX900-SDAG-NEXT: v_mul_f32_e32 v7, s5, v7
-; GFX900-SDAG-NEXT: v_log_f32_e32 v7, v7
-; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v5|, s11
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[2:3]
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, v6, s[0:1]
-; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s4, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, v1, s[0:1]
-; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v2, v5
-; GFX900-SDAG-NEXT: v_mul_f32_e32 v5, 0x3f317217, v7
+; GFX900-SDAG-NEXT: v_fma_f32 v2, v5, s7, v2
+; GFX900-SDAG-NEXT: s_cselect_b32 s6, 0x41b17218, 0
+; GFX900-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; GFX900-SDAG-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX900-SDAG-NEXT: s_cselect_b32 s2, 0x4f800000, 1.0
+; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v5|, s9
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, s2
+; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s5, v2
+; GFX900-SDAG-NEXT: s_cselect_b32 s5, 0x41b17218, 0
+; GFX900-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; GFX900-SDAG-NEXT: v_log_f32_e32 v6, v2
+; GFX900-SDAG-NEXT: s_cselect_b32 s2, 0x4f800000, 1.0
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, s2
; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, s4, v0
-; GFX900-SDAG-NEXT: v_fma_f32 v8, v7, s7, -v5
; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0
-; GFX900-SDAG-NEXT: v_fma_f32 v8, v7, s10, v8
-; GFX900-SDAG-NEXT: v_add_f32_e32 v5, v5, v8
-; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v7|, s11
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, v7, v5, s[2:3]
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v6, vcc
-; GFX900-SDAG-NEXT: v_sub_f32_e32 v1, v1, v5
+; GFX900-SDAG-NEXT: v_subrev_f32_e32 v2, s6, v1
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v6
+; GFX900-SDAG-NEXT: v_fma_f32 v5, v6, s8, -v1
+; GFX900-SDAG-NEXT: v_fma_f32 v5, v6, s7, v5
+; GFX900-SDAG-NEXT: v_add_f32_e32 v1, v1, v5
+; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v6|, s9
; GFX900-SDAG-NEXT: v_mul_f32_e32 v5, 0x3f317217, v0
-; GFX900-SDAG-NEXT: v_fma_f32 v7, v0, s7, -v5
-; GFX900-SDAG-NEXT: v_fma_f32 v7, v0, s10, v7
-; GFX900-SDAG-NEXT: v_add_f32_e32 v5, v5, v7
-; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s11
+; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
+; GFX900-SDAG-NEXT: v_fma_f32 v6, v0, s8, -v5
+; GFX900-SDAG-NEXT: v_fma_f32 v6, v0, s7, v6
+; GFX900-SDAG-NEXT: v_add_f32_e32 v5, v5, v6
+; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s9
; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, v6, s[0:1]
+; GFX900-SDAG-NEXT: s_cselect_b32 s2, 0x41b17218, 0
; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, 0
-; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v5
-; GFX900-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9]
+; GFX900-SDAG-NEXT: v_subrev_f32_e32 v1, s5, v1
+; GFX900-SDAG-NEXT: v_subrev_f32_e32 v0, s2, v0
+; GFX900-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX900-SDAG-NEXT: s_endpgm
;
; GFX900-GISEL-LABEL: s_log_v4f32:
@@ -1771,61 +1811,63 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
;
; GFX1100-SDAG-LABEL: s_log_v4f32:
; GFX1100-SDAG: ; %bb.0:
-; GFX1100-SDAG-NEXT: s_clause 0x1
; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x34
-; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s7
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s6
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s10, 0x800000, s5
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s11, 0x800000, s4
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s8
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s9
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s10
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s11
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x41b17218, s8
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1100-SDAG-NEXT: v_dual_mul_f32 v0, s7, v0 :: v_dual_mul_f32 v1, s6, v1
-; GFX1100-SDAG-NEXT: v_dual_mul_f32 v2, s5, v2 :: v_dual_mul_f32 v3, s4, v3
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x41b17218, s9
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, s7
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s1, 0x800000, s6
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s5
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s4
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX1100-SDAG-NEXT: s_and_b32 s0, s0, exec_lo
+; GFX1100-SDAG-NEXT: s_cselect_b32 s0, 0x4f800000, 1.0
+; GFX1100-SDAG-NEXT: s_cselect_b32 s10, 0x41b17218, 0
+; GFX1100-SDAG-NEXT: s_and_b32 s1, s1, exec_lo
+; GFX1100-SDAG-NEXT: v_mul_f32_e64 v0, s7, s0
+; GFX1100-SDAG-NEXT: s_cselect_b32 s1, 0x4f800000, 1.0
+; GFX1100-SDAG-NEXT: s_cselect_b32 s11, 0x41b17218, 0
+; GFX1100-SDAG-NEXT: s_and_b32 s8, s8, exec_lo
+; GFX1100-SDAG-NEXT: s_cselect_b32 s0, 0x4f800000, 1.0
+; GFX1100-SDAG-NEXT: s_cselect_b32 s7, 0x41b17218, 0
+; GFX1100-SDAG-NEXT: s_and_b32 s8, s9, exec_lo
+; GFX1100-SDAG-NEXT: v_mul_f32_e64 v1, s6, s1
+; GFX1100-SDAG-NEXT: s_cselect_b32 s1, 0x4f800000, 1.0
+; GFX1100-SDAG-NEXT: v_mul_f32_e64 v2, s5, s0
+; GFX1100-SDAG-NEXT: v_mul_f32_e64 v3, s4, s1
; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, v1
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(TRANS32_DEP_3)
+; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX1100-SDAG-NEXT: v_log_f32_e32 v2, v2
; GFX1100-SDAG-NEXT: v_log_f32_e32 v3, v3
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v14, 0, 0x41b17218, s10
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v15, 0, 0x41b17218, s11
-; GFX1100-SDAG-NEXT: v_dual_mul_f32 v5, 0x3f317217, v0 :: v_dual_mul_f32 v6, 0x3f317217, v1
-; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
-; GFX1100-SDAG-NEXT: v_dual_mul_f32 v7, 0x3f317217, v2 :: v_dual_mul_f32 v8, 0x3f317217, v3
+; GFX1100-SDAG-NEXT: s_cselect_b32 s2, 0x41b17218, 0
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(TRANS32_DEP_3)
+; GFX1100-SDAG-NEXT: v_dual_mul_f32 v4, 0x3f317217, v0 :: v_dual_mul_f32 v5, 0x3f317217, v1
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
-; GFX1100-SDAG-NEXT: v_fma_f32 v10, 0x3f317217, v0, -v5
-; GFX1100-SDAG-NEXT: v_fma_f32 v11, 0x3f317217, v1, -v6
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1100-SDAG-NEXT: v_fma_f32 v12, 0x3f317217, v2, -v7
-; GFX1100-SDAG-NEXT: v_fma_f32 v13, 0x3f317217, v3, -v8
-; GFX1100-SDAG-NEXT: v_dual_fmac_f32 v10, 0x3377d1cf, v0 :: v_dual_fmac_f32 v11, 0x3377d1cf, v1
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1100-SDAG-NEXT: v_dual_fmac_f32 v12, 0x3377d1cf, v2 :: v_dual_fmac_f32 v13, 0x3377d1cf, v3
-; GFX1100-SDAG-NEXT: v_dual_add_f32 v5, v5, v10 :: v_dual_add_f32 v6, v6, v11
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1100-SDAG-NEXT: v_dual_add_f32 v7, v7, v12 :: v_dual_add_f32 v8, v8, v13
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo
+; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-NEXT: v_dual_mul_f32 v6, 0x3f317217, v2 :: v_dual_mul_f32 v7, 0x3f317217, v3
+; GFX1100-SDAG-NEXT: v_fma_f32 v8, 0x3f317217, v0, -v4
+; GFX1100-SDAG-NEXT: v_fma_f32 v9, 0x3f317217, v1, -v5
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1100-SDAG-NEXT: v_fma_f32 v10, 0x3f317217, v2, -v6
+; GFX1100-SDAG-NEXT: v_fma_f32 v11, 0x3f317217, v3, -v7
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1100-SDAG-NEXT: v_dual_fmac_f32 v8, 0x3377d1cf, v0 :: v_dual_fmac_f32 v9, 0x3377d1cf, v1
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v12, 0
+; GFX1100-SDAG-NEXT: v_dual_fmac_f32 v10, 0x3377d1cf, v2 :: v_dual_fmac_f32 v11, 0x3377d1cf, v3
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-NEXT: v_dual_add_f32 v4, v4, v8 :: v_dual_add_f32 v5, v5, v9
+; GFX1100-SDAG-NEXT: v_dual_add_f32 v7, v7, v11 :: v_dual_cndmask_b32 v0, v0, v4
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v1|
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1100-SDAG-NEXT: v_dual_add_f32 v6, v6, v10 :: v_dual_cndmask_b32 v1, v1, v5
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v2|
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v5, v2, v7, vcc_lo
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc_lo
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v3|
-; GFX1100-SDAG-NEXT: v_dual_mov_b32 v7, 0 :: v_dual_sub_f32 v2, v1, v9
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v6, v3, v8, vcc_lo
-; GFX1100-SDAG-NEXT: v_sub_f32_e32 v3, v0, v4
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc_lo
+; GFX1100-SDAG-NEXT: v_dual_subrev_f32 v3, s10, v0 :: v_dual_subrev_f32 v2, s11, v1
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1100-SDAG-NEXT: v_dual_sub_f32 v1, v5, v14 :: v_dual_sub_f32 v0, v6, v15
-; GFX1100-SDAG-NEXT: global_store_b128 v7, v[0:3], s[0:1]
+; GFX1100-SDAG-NEXT: v_dual_subrev_f32 v1, s7, v4 :: v_dual_subrev_f32 v0, s2, v5
+; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT: global_store_b128 v12, v[0:3], s[0:1]
; GFX1100-SDAG-NEXT: s_nop 0
; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-SDAG-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
index 1c64e6b76c9577..fdcacb2effd4d9 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
@@ -14,30 +14,30 @@
define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) {
; SI-SDAG-LABEL: s_log10_f32:
; SI-SDAG: ; %bb.0:
-; SI-SDAG-NEXT: s_load_dword s0, s[2:3], 0xb
-; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9
+; SI-SDAG-NEXT: s_load_dword s6, s[2:3], 0xb
+; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
-; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000
-; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000
+; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; SI-SDAG-NEXT: s_mov_b32 s2, -1
; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc
-; SI-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0
+; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0
+; SI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec
+; SI-SDAG-NEXT: s_cselect_b32 s4, 0x4f800000, 1.0
+; SI-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; SI-SDAG-NEXT: v_mul_f32_e32 v0, s6, v0
; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
-; SI-SDAG-NEXT: s_mov_b32 s0, 0x3e9a209a
-; SI-SDAG-NEXT: s_mov_b32 s6, -1
+; SI-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a
+; SI-SDAG-NEXT: s_mov_b32 s5, 0x3284fbcf
+; SI-SDAG-NEXT: s_mov_b32 s6, 0x7f800000
; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0
-; SI-SDAG-NEXT: v_fma_f32 v2, v0, s0, -v1
-; SI-SDAG-NEXT: s_mov_b32 s0, 0x3284fbcf
-; SI-SDAG-NEXT: v_fma_f32 v2, v0, s0, v2
-; SI-SDAG-NEXT: s_mov_b32 s0, 0x7f800000
+; SI-SDAG-NEXT: v_fma_f32 v2, v0, s4, -v1
+; SI-SDAG-NEXT: v_fma_f32 v2, v0, s5, v2
; SI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2
-; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, s0
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[0:1]
-; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x411a209b
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
-; SI-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s6
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; SI-SDAG-NEXT: s_cselect_b32 s4, 0x411a209b, 0
+; SI-SDAG-NEXT: v_subrev_f32_e32 v0, s4, v0
+; SI-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-SDAG-NEXT: s_endpgm
;
; SI-GISEL-LABEL: s_log10_f32:
@@ -70,16 +70,17 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) {
;
; VI-SDAG-LABEL: s_log10_f32:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s0, s[2:3], 0x2c
+; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000
-; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc
-; VI-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0
+; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
+; VI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec
+; VI-SDAG-NEXT: s_cselect_b32 s0, 0x4f800000, 1.0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; VI-SDAG-NEXT: v_mul_f32_e32 v0, s4, v0
; VI-SDAG-NEXT: v_log_f32_e32 v0, v0
-; VI-SDAG-NEXT: s_mov_b32 s0, 0x7f800000
+; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; VI-SDAG-NEXT: s_mov_b32 s2, 0x7f800000
; VI-SDAG-NEXT: v_and_b32_e32 v1, 0xfffff000, v0
; VI-SDAG-NEXT: v_sub_f32_e32 v2, v0, v1
; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v1
@@ -89,13 +90,13 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) {
; VI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a2000, v1
; VI-SDAG-NEXT: v_add_f32_e32 v2, v4, v2
; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2
-; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, s0
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x411a209b
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; VI-SDAG-NEXT: v_sub_f32_e32 v2, v0, v1
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s2
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; VI-SDAG-NEXT: s_cselect_b32 s2, 0x411a209b, 0
+; VI-SDAG-NEXT: v_subrev_f32_e32 v2, s2, v0
+; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
; VI-SDAG-NEXT: s_endpgm
;
@@ -132,29 +133,29 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) {
;
; GFX900-SDAG-LABEL: s_log10_f32:
; GFX900-SDAG: ; %bb.0:
-; GFX900-SDAG-NEXT: s_load_dword s0, s[2:3], 0x2c
-; GFX900-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
-; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
-; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000
-; GFX900-SDAG-NEXT: s_mov_b32 s1, 0x3284fbcf
+; GFX900-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x800000
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc
-; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0
-; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0
-; GFX900-SDAG-NEXT: s_mov_b32 s0, 0x3e9a209a
-; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209a, v0
-; GFX900-SDAG-NEXT: v_fma_f32 v3, v0, s0, -v2
-; GFX900-SDAG-NEXT: v_fma_f32 v3, v0, s1, v3
-; GFX900-SDAG-NEXT: s_mov_b32 s0, 0x7f800000
+; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1
+; GFX900-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; GFX900-SDAG-NEXT: s_cselect_b32 s2, 0x4f800000, 1.0
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, s2
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, s4, v1
+; GFX900-SDAG-NEXT: v_log_f32_e32 v1, v1
+; GFX900-SDAG-NEXT: s_mov_b32 s2, 0x3e9a209a
+; GFX900-SDAG-NEXT: s_mov_b32 s3, 0x3284fbcf
+; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x7f800000
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209a, v1
+; GFX900-SDAG-NEXT: v_fma_f32 v3, v1, s2, -v2
+; GFX900-SDAG-NEXT: v_fma_f32 v3, v1, s3, v3
; GFX900-SDAG-NEXT: v_add_f32_e32 v2, v2, v3
-; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, s0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
-; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0x411a209b
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
-; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v2
-; GFX900-SDAG-NEXT: global_store_dword v1, v0, s[4:5]
+; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s4
+; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX900-SDAG-NEXT: s_cselect_b32 s2, 0x411a209b, 0
+; GFX900-SDAG-NEXT: v_subrev_f32_e32 v1, s2, v1
+; GFX900-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
; GFX900-SDAG-NEXT: s_endpgm
;
; GFX900-GISEL-LABEL: s_log10_f32:
@@ -188,11 +189,13 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) {
; GFX1100-SDAG: ; %bb.0:
; GFX1100-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x2c
; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s0
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s4
-; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s1, 0x800000, s0
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1100-SDAG-NEXT: s_and_b32 s1, s1, exec_lo
+; GFX1100-SDAG-NEXT: s_cselect_b32 s1, 0x4f800000, 1.0
+; GFX1100-SDAG-NEXT: v_mul_f32_e64 v0, s0, s1
; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1100-SDAG-NEXT: s_cselect_b32 s2, 0x411a209b, 0
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
@@ -201,13 +204,12 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) {
; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2
-; GFX1100-SDAG-NEXT: v_dual_add_f32 v1, v1, v2 :: v_dual_mov_b32 v2, 0
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, s4
-; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX1100-SDAG-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-NEXT: v_dual_cndmask_b32 v0, v0, v1 :: v_dual_mov_b32 v1, 0
+; GFX1100-SDAG-NEXT: v_subrev_f32_e32 v0, s2, v0
; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-SDAG-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX1100-SDAG-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX1100-SDAG-NEXT: s_nop 0
; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-SDAG-NEXT: s_endpgm
@@ -316,43 +318,45 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) {
define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
; SI-SDAG-LABEL: s_log10_v2f32:
; SI-SDAG: ; %bb.0:
-; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
+; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
-; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000
-; SI-SDAG-NEXT: s_mov_b32 s8, 0x3284fbcf
+; SI-SDAG-NEXT: s_mov_b32 s8, 0x3e9a209a
; SI-SDAG-NEXT: s_mov_b32 s9, 0x7f800000
+; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000
; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc
-; SI-SDAG-NEXT: v_mul_f32_e32 v2, s3, v2
-; SI-SDAG-NEXT: v_log_f32_e32 v2, v2
-; SI-SDAG-NEXT: s_mov_b32 s3, 0x3e9a209a
-; SI-SDAG-NEXT: s_mov_b32 s4, s0
-; SI-SDAG-NEXT: s_mov_b32 s5, s1
-; SI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3e9a209a, v2
-; SI-SDAG-NEXT: v_fma_f32 v4, v2, s3, -v3
-; SI-SDAG-NEXT: v_fma_f32 v4, v2, s8, v4
-; SI-SDAG-NEXT: v_add_f32_e32 v3, v3, v4
-; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v2|, s9
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1]
-; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s2, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, v1, s[0:1]
-; SI-SDAG-NEXT: v_mul_f32_e32 v0, s2, v0
+; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0
+; SI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec
+; SI-SDAG-NEXT: s_cselect_b32 s0, 0x4f800000, 1.0
+; SI-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; SI-SDAG-NEXT: v_mul_f32_e32 v1, s7, v1
+; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0
+; SI-SDAG-NEXT: v_log_f32_e32 v1, v1
+; SI-SDAG-NEXT: s_mov_b32 s0, s4
+; SI-SDAG-NEXT: s_mov_b32 s1, s5
+; SI-SDAG-NEXT: s_cselect_b32 s10, 0x411a209b, 0
+; SI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec
+; SI-SDAG-NEXT: s_cselect_b32 s4, 0x4f800000, 1.0
+; SI-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; SI-SDAG-NEXT: v_mul_f32_e32 v0, s6, v0
+; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209a, v1
; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
-; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0x411a209b
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc
-; SI-SDAG-NEXT: v_sub_f32_e32 v1, v2, v1
+; SI-SDAG-NEXT: s_mov_b32 s7, 0x3284fbcf
+; SI-SDAG-NEXT: v_fma_f32 v3, v1, s8, -v2
+; SI-SDAG-NEXT: v_fma_f32 v3, v1, s7, v3
+; SI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3
+; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s9
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209a, v0
-; SI-SDAG-NEXT: v_fma_f32 v4, v0, s3, -v2
-; SI-SDAG-NEXT: v_fma_f32 v4, v0, s8, v4
-; SI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4
+; SI-SDAG-NEXT: v_fma_f32 v3, v0, s8, -v2
+; SI-SDAG-NEXT: v_fma_f32 v3, v0, s7, v3
+; SI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3
; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s9
; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v3, s[0:1]
-; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; SI-SDAG-NEXT: s_mov_b32 s6, -1
-; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v2
-; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-SDAG-NEXT: s_cselect_b32 s4, 0x411a209b, 0
+; SI-SDAG-NEXT: s_mov_b32 s2, -1
+; SI-SDAG-NEXT: v_subrev_f32_e32 v1, s10, v1
+; SI-SDAG-NEXT: v_subrev_f32_e32 v0, s4, v0
+; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-SDAG-NEXT: s_endpgm
;
; SI-GISEL-LABEL: s_log10_v2f32:
@@ -396,48 +400,50 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in)
;
; VI-SDAG-LABEL: s_log10_v2f32:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000
-; VI-SDAG-NEXT: s_mov_b32 s2, 0x7f800000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc
-; VI-SDAG-NEXT: v_mul_f32_e32 v2, s7, v2
-; VI-SDAG-NEXT: v_log_f32_e32 v2, v2
-; VI-SDAG-NEXT: v_and_b32_e32 v3, 0xfffff000, v2
-; VI-SDAG-NEXT: v_sub_f32_e32 v4, v2, v3
-; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x369a84fb, v3
-; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3e9a2000, v4
-; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v4
-; VI-SDAG-NEXT: v_add_f32_e32 v4, v5, v4
-; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3e9a2000, v3
-; VI-SDAG-NEXT: v_add_f32_e32 v4, v6, v4
-; VI-SDAG-NEXT: v_add_f32_e32 v3, v3, v4
-; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v2|, s2
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1]
-; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s6, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, v1, s[0:1]
-; VI-SDAG-NEXT: v_mul_f32_e32 v0, s6, v0
-; VI-SDAG-NEXT: v_log_f32_e32 v0, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x411a209b
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc
-; VI-SDAG-NEXT: v_sub_f32_e32 v1, v2, v1
-; VI-SDAG-NEXT: v_and_b32_e32 v2, 0xfffff000, v0
-; VI-SDAG-NEXT: v_sub_f32_e32 v4, v0, v2
-; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3e9a2000, v4
-; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v4
-; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x369a84fb, v2
-; VI-SDAG-NEXT: v_add_f32_e32 v4, v6, v4
-; VI-SDAG-NEXT: v_add_f32_e32 v4, v5, v4
-; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a2000, v2
-; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4
-; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s2
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v3, s[0:1]
-; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v2
+; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0
+; VI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec
+; VI-SDAG-NEXT: s_cselect_b32 s4, 0x4f800000, 1.0
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s4
+; VI-SDAG-NEXT: v_mul_f32_e32 v1, s3, v1
+; VI-SDAG-NEXT: v_log_f32_e32 v1, v1
+; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0
+; VI-SDAG-NEXT: s_cselect_b32 s6, 0x411a209b, 0
+; VI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec
+; VI-SDAG-NEXT: v_and_b32_e32 v0, 0xfffff000, v1
+; VI-SDAG-NEXT: v_sub_f32_e32 v2, v1, v0
+; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v0
+; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3e9a2000, v2
+; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x369a84fb, v2
+; VI-SDAG-NEXT: v_add_f32_e32 v2, v3, v2
+; VI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a2000, v0
+; VI-SDAG-NEXT: v_add_f32_e32 v2, v4, v2
+; VI-SDAG-NEXT: s_cselect_b32 s4, 0x4f800000, 1.0
+; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v2
; VI-SDAG-NEXT: v_mov_b32_e32 v2, s4
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
+; VI-SDAG-NEXT: v_mul_f32_e32 v2, s2, v2
+; VI-SDAG-NEXT: v_log_f32_e32 v2, v2
+; VI-SDAG-NEXT: s_mov_b32 s3, 0x7f800000
+; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s3
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; VI-SDAG-NEXT: v_subrev_f32_e32 v1, s6, v0
+; VI-SDAG-NEXT: v_and_b32_e32 v0, 0xfffff000, v2
+; VI-SDAG-NEXT: v_sub_f32_e32 v3, v2, v0
+; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3e9a2000, v3
+; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v3
+; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x369a84fb, v0
+; VI-SDAG-NEXT: v_add_f32_e32 v3, v5, v3
+; VI-SDAG-NEXT: v_add_f32_e32 v3, v4, v3
+; VI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a2000, v0
+; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v3
+; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v2|, s3
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-SDAG-NEXT: s_cselect_b32 s2, 0x411a209b, 0
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_subrev_f32_e32 v0, s2, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-SDAG-NEXT: s_endpgm
;
@@ -492,37 +498,39 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in)
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
-; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000
; GFX900-SDAG-NEXT: s_mov_b32 s2, 0x3e9a209a
; GFX900-SDAG-NEXT: s_mov_b32 s3, 0x3284fbcf
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0
; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v3, 1.0, v1, vcc
-; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, s7, v3
+; GFX900-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec
+; GFX900-SDAG-NEXT: s_cselect_b32 s0, 0x4f800000, 1.0
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, s7, v1
+; GFX900-SDAG-NEXT: v_log_f32_e32 v1, v1
+; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0
+; GFX900-SDAG-NEXT: s_cselect_b32 s8, 0x411a209b, 0
+; GFX900-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209a, v1
+; GFX900-SDAG-NEXT: v_fma_f32 v3, v1, s2, -v0
+; GFX900-SDAG-NEXT: v_fma_f32 v3, v1, s3, v3
+; GFX900-SDAG-NEXT: s_cselect_b32 s0, 0x4f800000, 1.0
+; GFX900-SDAG-NEXT: v_add_f32_e32 v0, v0, v3
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, s0
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, s6, v3
; GFX900-SDAG-NEXT: v_log_f32_e32 v3, v3
; GFX900-SDAG-NEXT: s_mov_b32 s7, 0x7f800000
-; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0
-; GFX900-SDAG-NEXT: v_mul_f32_e32 v4, 0x3e9a209a, v3
-; GFX900-SDAG-NEXT: v_fma_f32 v5, v3, s2, -v4
-; GFX900-SDAG-NEXT: v_fma_f32 v5, v3, s3, v5
-; GFX900-SDAG-NEXT: v_add_f32_e32 v4, v4, v5
-; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v3|, s7
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1]
-; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s6, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, v1, s[0:1]
-; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, s6, v0
-; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0
-; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, 0x411a209b
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc
-; GFX900-SDAG-NEXT: v_sub_f32_e32 v1, v3, v1
-; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, 0x3e9a209a, v0
-; GFX900-SDAG-NEXT: v_fma_f32 v5, v0, s2, -v3
-; GFX900-SDAG-NEXT: v_fma_f32 v5, v0, s3, v5
-; GFX900-SDAG-NEXT: v_add_f32_e32 v3, v3, v5
-; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s7
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, v4, s[0:1]
-; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v3
+; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s7
+; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX900-SDAG-NEXT: v_subrev_f32_e32 v1, s8, v0
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209a, v3
+; GFX900-SDAG-NEXT: v_fma_f32 v4, v3, s2, -v0
+; GFX900-SDAG-NEXT: v_fma_f32 v4, v3, s3, v4
+; GFX900-SDAG-NEXT: v_add_f32_e32 v0, v0, v4
+; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, s7
+; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX900-SDAG-NEXT: s_cselect_b32 s0, 0x411a209b, 0
+; GFX900-SDAG-NEXT: v_subrev_f32_e32 v0, s0, v0
; GFX900-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX900-SDAG-NEXT: s_endpgm
;
@@ -570,30 +578,34 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in)
; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s3
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s5, 0x800000, s2
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s4
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s5
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_dual_mul_f32 v0, s3, v0 :: v_dual_mul_f32 v1, s2, v1
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1100-SDAG-NEXT: s_and_b32 s4, s4, exec_lo
+; GFX1100-SDAG-NEXT: s_cselect_b32 s4, 0x4f800000, 1.0
+; GFX1100-SDAG-NEXT: s_cselect_b32 s6, 0x411a209b, 0
+; GFX1100-SDAG-NEXT: v_mul_f32_e64 v0, s3, s4
+; GFX1100-SDAG-NEXT: s_and_b32 s5, s5, exec_lo
+; GFX1100-SDAG-NEXT: s_cselect_b32 s5, 0x4f800000, 1.0
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1100-SDAG-NEXT: v_mul_f32_e64 v1, s2, s5
; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1100-SDAG-NEXT: s_cselect_b32 s2, 0x411a209b, 0
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, v1
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
-; GFX1100-SDAG-NEXT: v_dual_mul_f32 v2, 0x3e9a209a, v0 :: v_dual_mul_f32 v3, 0x3e9a209a, v1
+; GFX1100-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209a, v0
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
+; GFX1100-SDAG-NEXT: v_mul_f32_e32 v3, 0x3e9a209a, v1
; GFX1100-SDAG-NEXT: v_fma_f32 v4, 0x3e9a209a, v0, -v2
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_fma_f32 v5, 0x3e9a209a, v1, -v3
; GFX1100-SDAG-NEXT: v_dual_fmac_f32 v4, 0x3284fbcf, v0 :: v_dual_fmac_f32 v5, 0x3284fbcf, v1
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_dual_add_f32 v2, v2, v4 :: v_dual_add_f32 v3, v3, v5
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x411a209b, s4
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 0x411a209b, s5
; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v1|
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_dual_cndmask_b32 v2, v1, v3 :: v_dual_mov_b32 v3, 0
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_dual_sub_f32 v1, v0, v4 :: v_dual_sub_f32 v0, v2, v5
+; GFX1100-SDAG-NEXT: v_dual_subrev_f32 v1, s6, v0 :: v_dual_subrev_f32 v0, s2, v2
; GFX1100-SDAG-NEXT: global_store_b64 v3, v[0:1], s[0:1]
; GFX1100-SDAG-NEXT: s_nop 0
; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -747,55 +759,59 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in)
define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
; SI-SDAG-LABEL: s_log10_v3f32:
; SI-SDAG: ; %bb.0:
-; SI-SDAG-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd
-; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9
+; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd
+; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
-; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000
+; SI-SDAG-NEXT: s_mov_b32 s10, 0x3284fbcf
+; SI-SDAG-NEXT: s_mov_b32 s11, 0x7f800000
; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SI-SDAG-NEXT: s_mov_b32 s11, 0x3284fbcf
-; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s9, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; SI-SDAG-NEXT: v_mul_f32_e32 v1, s9, v1
+; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0
+; SI-SDAG-NEXT: s_and_b64 s[8:9], vcc, exec
+; SI-SDAG-NEXT: s_cselect_b32 s7, 0x4f800000, 1.0
+; SI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; SI-SDAG-NEXT: v_mul_f32_e32 v1, s5, v1
; SI-SDAG-NEXT: v_log_f32_e32 v1, v1
-; SI-SDAG-NEXT: s_mov_b32 s9, 0x3e9a209a
-; SI-SDAG-NEXT: s_mov_b32 s12, 0x7f800000
-; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; SI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3e9a209a, v1
-; SI-SDAG-NEXT: v_fma_f32 v4, v1, s9, -v3
-; SI-SDAG-NEXT: v_fma_f32 v4, v1, s11, v4
-; SI-SDAG-NEXT: v_add_f32_e32 v3, v3, v4
-; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v1|, s12
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1]
-; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s8, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, v2, s[0:1]
-; SI-SDAG-NEXT: v_mul_f32_e32 v3, s8, v3
+; SI-SDAG-NEXT: s_mov_b32 s7, 0x3e9a209a
+; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
+; SI-SDAG-NEXT: s_cselect_b32 s5, 0x411a209b, 0
+; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209a, v1
+; SI-SDAG-NEXT: v_fma_f32 v3, v1, s7, -v2
+; SI-SDAG-NEXT: s_and_b64 s[8:9], vcc, exec
+; SI-SDAG-NEXT: v_fma_f32 v3, v1, s10, v3
+; SI-SDAG-NEXT: s_cselect_b32 s8, 0x4f800000, 1.0
+; SI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3
+; SI-SDAG-NEXT: v_mov_b32_e32 v3, s8
+; SI-SDAG-NEXT: v_mul_f32_e32 v3, s4, v3
; SI-SDAG-NEXT: v_log_f32_e32 v3, v3
-; SI-SDAG-NEXT: v_mov_b32_e32 v4, 0x411a209b
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc
-; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s10, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v2, vcc
-; SI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v5
-; SI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3e9a209a, v3
-; SI-SDAG-NEXT: v_mul_f32_e32 v0, s10, v0
-; SI-SDAG-NEXT: v_fma_f32 v6, v3, s9, -v5
-; SI-SDAG-NEXT: v_log_f32_e32 v2, v0
-; SI-SDAG-NEXT: v_fma_f32 v6, v3, s11, v6
-; SI-SDAG-NEXT: v_add_f32_e32 v5, v5, v6
-; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v3|, s12
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, v3, v5, s[2:3]
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, v4, s[0:1]
-; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v3
-; SI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3e9a209a, v2
-; SI-SDAG-NEXT: v_fma_f32 v5, v2, s9, -v3
-; SI-SDAG-NEXT: v_fma_f32 v5, v2, s11, v5
-; SI-SDAG-NEXT: v_add_f32_e32 v3, v3, v5
-; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v2|, s12
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1]
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc
-; SI-SDAG-NEXT: s_mov_b32 s6, -1
-; SI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v3
-; SI-SDAG-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:8
-; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s11
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0
+; SI-SDAG-NEXT: v_subrev_f32_e32 v1, s5, v1
+; SI-SDAG-NEXT: s_cselect_b32 s8, 0x411a209b, 0
+; SI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec
+; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209a, v3
+; SI-SDAG-NEXT: s_cselect_b32 s4, 0x4f800000, 1.0
+; SI-SDAG-NEXT: v_fma_f32 v4, v3, s7, -v2
+; SI-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; SI-SDAG-NEXT: v_fma_f32 v4, v3, s10, v4
+; SI-SDAG-NEXT: v_mul_f32_e32 v0, s6, v0
+; SI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4
+; SI-SDAG-NEXT: v_log_f32_e32 v4, v0
+; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, s11
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; SI-SDAG-NEXT: s_cselect_b32 s4, 0x411a209b, 0
+; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209a, v4
+; SI-SDAG-NEXT: v_fma_f32 v3, v4, s7, -v2
+; SI-SDAG-NEXT: v_fma_f32 v3, v4, s10, v3
+; SI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3
+; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v4|, s11
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; SI-SDAG-NEXT: s_mov_b32 s2, -1
+; SI-SDAG-NEXT: v_subrev_f32_e32 v2, s4, v2
+; SI-SDAG-NEXT: v_subrev_f32_e32 v0, s8, v0
+; SI-SDAG-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:8
+; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-SDAG-NEXT: s_endpgm
;
; SI-GISEL-LABEL: s_log10_v3f32:
@@ -855,65 +871,69 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; VI-SDAG: ; %bb.0:
; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000
-; VI-SDAG-NEXT: s_mov_b32 s8, 0x7f800000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc
-; VI-SDAG-NEXT: v_mul_f32_e32 v2, s6, v2
-; VI-SDAG-NEXT: v_log_f32_e32 v2, v2
-; VI-SDAG-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x24
-; VI-SDAG-NEXT: v_and_b32_e32 v3, 0xfffff000, v2
-; VI-SDAG-NEXT: v_sub_f32_e32 v4, v2, v3
-; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x369a84fb, v3
-; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3e9a2000, v4
-; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v4
-; VI-SDAG-NEXT: v_add_f32_e32 v4, v5, v4
-; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3e9a2000, v3
-; VI-SDAG-NEXT: v_add_f32_e32 v4, v6, v4
-; VI-SDAG-NEXT: v_add_f32_e32 v3, v3, v4
-; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v2|, s8
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1]
-; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s5, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, v1, s[0:1]
+; VI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec
+; VI-SDAG-NEXT: s_cselect_b32 s0, 0x4f800000, 1.0
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; VI-SDAG-NEXT: v_mul_f32_e32 v1, s6, v1
+; VI-SDAG-NEXT: v_log_f32_e32 v1, v1
+; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0
+; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; VI-SDAG-NEXT: s_cselect_b32 s7, 0x411a209b, 0
+; VI-SDAG-NEXT: v_and_b32_e32 v2, 0xfffff000, v1
+; VI-SDAG-NEXT: v_sub_f32_e32 v3, v1, v2
+; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v2
+; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3e9a2000, v3
+; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v3
+; VI-SDAG-NEXT: v_add_f32_e32 v3, v4, v3
+; VI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a2000, v2
+; VI-SDAG-NEXT: v_add_f32_e32 v3, v5, v3
+; VI-SDAG-NEXT: s_cselect_b32 s2, 0x4f800000, 1.0
+; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s2
+; VI-SDAG-NEXT: s_mov_b32 s6, 0x7f800000
; VI-SDAG-NEXT: v_mul_f32_e32 v3, s5, v3
; VI-SDAG-NEXT: v_log_f32_e32 v3, v3
-; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x411a209b
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc
-; VI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v5
-; VI-SDAG-NEXT: v_and_b32_e32 v5, 0xfffff000, v3
+; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s6
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
-; VI-SDAG-NEXT: v_sub_f32_e32 v6, v3, v5
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc
-; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3e9a2000, v6
-; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x369a84fb, v6
-; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x369a84fb, v5
+; VI-SDAG-NEXT: s_cselect_b32 s5, 0x411a209b, 0
+; VI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; VI-SDAG-NEXT: s_cselect_b32 s2, 0x4f800000, 1.0
+; VI-SDAG-NEXT: v_subrev_f32_e32 v2, s7, v1
+; VI-SDAG-NEXT: v_and_b32_e32 v1, 0xfffff000, v3
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_sub_f32_e32 v4, v3, v1
; VI-SDAG-NEXT: v_mul_f32_e32 v0, s4, v0
-; VI-SDAG-NEXT: v_add_f32_e32 v6, v8, v6
+; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3e9a2000, v4
+; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v4
+; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x369a84fb, v1
; VI-SDAG-NEXT: v_log_f32_e32 v0, v0
-; VI-SDAG-NEXT: v_add_f32_e32 v6, v7, v6
-; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3e9a2000, v5
-; VI-SDAG-NEXT: v_add_f32_e32 v5, v5, v6
-; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v3|, s8
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, v3, v5, s[2:3]
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, v4, s[0:1]
-; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v3
+; VI-SDAG-NEXT: v_add_f32_e32 v4, v6, v4
+; VI-SDAG-NEXT: v_add_f32_e32 v4, v5, v4
+; VI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a2000, v1
+; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v4
+; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, s6
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; VI-SDAG-NEXT: v_and_b32_e32 v3, 0xfffff000, v0
-; VI-SDAG-NEXT: v_sub_f32_e32 v5, v0, v3
-; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3e9a2000, v5
-; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x369a84fb, v5
-; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x369a84fb, v3
-; VI-SDAG-NEXT: v_add_f32_e32 v5, v7, v5
-; VI-SDAG-NEXT: v_add_f32_e32 v5, v6, v5
+; VI-SDAG-NEXT: v_sub_f32_e32 v4, v0, v3
+; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3e9a2000, v4
+; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v4
+; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x369a84fb, v3
+; VI-SDAG-NEXT: v_add_f32_e32 v4, v6, v4
+; VI-SDAG-NEXT: v_add_f32_e32 v4, v5, v4
; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3e9a2000, v3
-; VI-SDAG-NEXT: v_add_f32_e32 v3, v3, v5
-; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, s8
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1]
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc
-; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v3
+; VI-SDAG-NEXT: v_add_f32_e32 v3, v3, v4
+; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s6
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
+; VI-SDAG-NEXT: s_cselect_b32 s2, 0x411a209b, 0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s6
-; VI-SDAG-NEXT: v_mov_b32_e32 v4, s7
+; VI-SDAG-NEXT: v_mov_b32_e32 v4, s1
+; VI-SDAG-NEXT: v_subrev_f32_e32 v1, s5, v1
+; VI-SDAG-NEXT: v_subrev_f32_e32 v0, s2, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s0
; VI-SDAG-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
; VI-SDAG-NEXT: s_endpgm
;
@@ -985,52 +1005,56 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; GFX900-SDAG-LABEL: s_log10_v3f32:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34
-; GFX900-SDAG-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24
+; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
-; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000
; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX900-SDAG-NEXT: s_mov_b32 s7, 0x3284fbcf
+; GFX900-SDAG-NEXT: s_mov_b32 s7, 0x3e9a209a
+; GFX900-SDAG-NEXT: s_mov_b32 s8, 0x7f800000
; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc
-; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s6, v2
-; GFX900-SDAG-NEXT: v_log_f32_e32 v2, v2
-; GFX900-SDAG-NEXT: s_mov_b32 s6, 0x3e9a209a
-; GFX900-SDAG-NEXT: s_mov_b32 s10, 0x7f800000
-; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-SDAG-NEXT: v_mul_f32_e32 v4, 0x3e9a209a, v2
-; GFX900-SDAG-NEXT: v_fma_f32 v5, v2, s6, -v4
-; GFX900-SDAG-NEXT: v_fma_f32 v5, v2, s7, v5
-; GFX900-SDAG-NEXT: v_add_f32_e32 v4, v4, v5
-; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v2|, s10
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1]
-; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s5, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v4, 1.0, v1, s[0:1]
+; GFX900-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; GFX900-SDAG-NEXT: s_cselect_b32 s2, 0x4f800000, 1.0
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, s2
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, s6, v1
+; GFX900-SDAG-NEXT: v_log_f32_e32 v1, v1
+; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0
+; GFX900-SDAG-NEXT: s_mov_b32 s6, 0x3284fbcf
+; GFX900-SDAG-NEXT: s_cselect_b32 s9, 0x411a209b, 0
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209a, v1
+; GFX900-SDAG-NEXT: v_fma_f32 v4, v1, s7, -v2
+; GFX900-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; GFX900-SDAG-NEXT: v_fma_f32 v4, v1, s6, v4
+; GFX900-SDAG-NEXT: s_cselect_b32 s2, 0x4f800000, 1.0
+; GFX900-SDAG-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, s2
+; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s8
; GFX900-SDAG-NEXT: v_mul_f32_e32 v4, s5, v4
-; GFX900-SDAG-NEXT: v_log_f32_e32 v4, v4
-; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0x411a209b
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v5, vcc
+; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc
-; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v2, v6
-; GFX900-SDAG-NEXT: v_mul_f32_e32 v6, 0x3e9a209a, v4
+; GFX900-SDAG-NEXT: v_log_f32_e32 v4, v4
+; GFX900-SDAG-NEXT: s_cselect_b32 s5, 0x411a209b, 0
+; GFX900-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; GFX900-SDAG-NEXT: s_cselect_b32 s2, 0x4f800000, 1.0
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, s2
; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, s4, v0
-; GFX900-SDAG-NEXT: v_fma_f32 v7, v4, s6, -v6
+; GFX900-SDAG-NEXT: v_subrev_f32_e32 v2, s9, v1
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v4
; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0
-; GFX900-SDAG-NEXT: v_fma_f32 v7, v4, s7, v7
-; GFX900-SDAG-NEXT: v_add_f32_e32 v6, v6, v7
-; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v4|, s10
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, v4, v6, s[2:3]
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, v5, s[0:1]
-; GFX900-SDAG-NEXT: v_sub_f32_e32 v1, v1, v4
+; GFX900-SDAG-NEXT: v_fma_f32 v5, v4, s7, -v1
+; GFX900-SDAG-NEXT: v_fma_f32 v5, v4, s6, v5
+; GFX900-SDAG-NEXT: v_add_f32_e32 v1, v1, v5
+; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v4|, s8
+; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
; GFX900-SDAG-NEXT: v_mul_f32_e32 v4, 0x3e9a209a, v0
-; GFX900-SDAG-NEXT: v_fma_f32 v6, v0, s6, -v4
-; GFX900-SDAG-NEXT: v_fma_f32 v6, v0, s7, v6
-; GFX900-SDAG-NEXT: v_add_f32_e32 v4, v4, v6
-; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, s10
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1]
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v5, vcc
-; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v4
-; GFX900-SDAG-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9]
+; GFX900-SDAG-NEXT: v_fma_f32 v5, v0, s7, -v4
+; GFX900-SDAG-NEXT: v_fma_f32 v5, v0, s6, v5
+; GFX900-SDAG-NEXT: v_add_f32_e32 v4, v4, v5
+; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s8
+; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX900-SDAG-NEXT: s_cselect_b32 s2, 0x411a209b, 0
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, 0
+; GFX900-SDAG-NEXT: v_subrev_f32_e32 v1, s5, v1
+; GFX900-SDAG-NEXT: v_subrev_f32_e32 v0, s2, v0
+; GFX900-SDAG-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1]
; GFX900-SDAG-NEXT: s_endpgm
;
; GFX900-GISEL-LABEL: s_log10_v3f32:
@@ -1086,55 +1110,52 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
;
; GFX1100-SDAG-LABEL: s_log10_v3f32:
; GFX1100-SDAG: ; %bb.0:
-; GFX1100-SDAG-NEXT: s_clause 0x1
; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x34
-; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s6
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s5
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s4
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s7
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s8
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s9
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x411a209b, s7
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 0x411a209b, s8
-; GFX1100-SDAG-NEXT: v_dual_mul_f32 v0, s6, v0 :: v_dual_mul_f32 v1, s5, v1
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, s6
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s1, 0x800000, s5
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s4
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX1100-SDAG-NEXT: s_and_b32 s0, s0, exec_lo
+; GFX1100-SDAG-NEXT: s_cselect_b32 s0, 0x4f800000, 1.0
+; GFX1100-SDAG-NEXT: s_cselect_b32 s8, 0x411a209b, 0
+; GFX1100-SDAG-NEXT: s_and_b32 s1, s1, exec_lo
+; GFX1100-SDAG-NEXT: s_cselect_b32 s1, 0x4f800000, 1.0
+; GFX1100-SDAG-NEXT: v_mul_f32_e64 v0, s6, s0
+; GFX1100-SDAG-NEXT: v_mul_f32_e64 v1, s5, s1
+; GFX1100-SDAG-NEXT: s_cselect_b32 s9, 0x411a209b, 0
+; GFX1100-SDAG-NEXT: s_and_b32 s7, s7, exec_lo
+; GFX1100-SDAG-NEXT: s_cselect_b32 s0, 0x4f800000, 1.0
; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
-; GFX1100-SDAG-NEXT: v_mul_f32_e32 v2, s4, v2
; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, v1
-; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
-; GFX1100-SDAG-NEXT: v_mul_f32_e32 v3, 0x3e9a209a, v0
+; GFX1100-SDAG-NEXT: v_mul_f32_e64 v2, s4, s0
+; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1100-SDAG-NEXT: s_cselect_b32 s2, 0x411a209b, 0
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v2, v2
-; GFX1100-SDAG-NEXT: v_mul_f32_e32 v4, 0x3e9a209a, v1
+; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-NEXT: v_dual_mul_f32 v3, 0x3e9a209a, v0 :: v_dual_mul_f32 v4, 0x3e9a209a, v1
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
; GFX1100-SDAG-NEXT: v_fma_f32 v6, 0x3e9a209a, v0, -v3
-; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
-; GFX1100-SDAG-NEXT: v_mul_f32_e32 v5, 0x3e9a209a, v2
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_fma_f32 v7, 0x3e9a209a, v1, -v4
-; GFX1100-SDAG-NEXT: v_fmac_f32_e32 v6, 0x3284fbcf, v0
+; GFX1100-SDAG-NEXT: v_dual_fmac_f32 v6, 0x3284fbcf, v0 :: v_dual_fmac_f32 v7, 0x3284fbcf, v1
+; GFX1100-SDAG-NEXT: v_mul_f32_e32 v5, 0x3e9a209a, v2
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1100-SDAG-NEXT: v_dual_add_f32 v3, v3, v6 :: v_dual_add_f32 v4, v4, v7
; GFX1100-SDAG-NEXT: v_fma_f32 v8, 0x3e9a209a, v2, -v5
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1100-SDAG-NEXT: v_fmac_f32_e32 v7, 0x3284fbcf, v1
-; GFX1100-SDAG-NEXT: v_add_f32_e32 v3, v3, v6
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 0x411a209b, s9
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1100-SDAG-NEXT: v_add_f32_e32 v4, v4, v7
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v1|
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_dual_fmac_f32 v8, 0x3284fbcf, v2 :: v_dual_cndmask_b32 v1, v1, v4
-; GFX1100-SDAG-NEXT: v_dual_add_f32 v5, v5, v8 :: v_dual_mov_b32 v4, 0
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1100-SDAG-NEXT: v_add_f32_e32 v5, v5, v8
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v2|
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1100-SDAG-NEXT: v_sub_f32_e32 v1, v1, v10
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v3, v2, v5, vcc_lo
-; GFX1100-SDAG-NEXT: v_sub_f32_e32 v2, v0, v9
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v3, v6
+; GFX1100-SDAG-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_subrev_f32 v1, s9, v1
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-NEXT: v_dual_cndmask_b32 v3, v2, v5 :: v_dual_subrev_f32 v2, s8, v0
+; GFX1100-SDAG-NEXT: v_subrev_f32_e32 v0, s2, v3
+; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX1100-SDAG-NEXT: global_store_b96 v4, v[0:2], s[0:1]
; GFX1100-SDAG-NEXT: s_nop 0
; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1353,66 +1374,72 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
; SI-SDAG-LABEL: s_log10_v4f32:
; SI-SDAG: ; %bb.0:
-; SI-SDAG-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd
-; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9
+; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd
+; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
-; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000
-; SI-SDAG-NEXT: s_mov_b32 s12, 0x3284fbcf
+; SI-SDAG-NEXT: s_mov_b32 s10, 0x3e9a209a
+; SI-SDAG-NEXT: s_mov_b32 s11, 0x3284fbcf
; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s11, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc
-; SI-SDAG-NEXT: v_mul_f32_e32 v2, s11, v2
-; SI-SDAG-NEXT: v_log_f32_e32 v2, v2
-; SI-SDAG-NEXT: s_mov_b32 s11, 0x3e9a209a
-; SI-SDAG-NEXT: s_mov_b32 s13, 0x7f800000
-; SI-SDAG-NEXT: v_mov_b32_e32 v5, 0x411a209b
-; SI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3e9a209a, v2
-; SI-SDAG-NEXT: v_fma_f32 v4, v2, s11, -v3
-; SI-SDAG-NEXT: v_fma_f32 v4, v2, s12, v4
-; SI-SDAG-NEXT: v_add_f32_e32 v3, v3, v4
-; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v2|, s13
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1]
-; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s10, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, v1, s[0:1]
-; SI-SDAG-NEXT: v_mul_f32_e32 v3, s10, v3
+; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0
+; SI-SDAG-NEXT: s_and_b64 s[8:9], vcc, exec
+; SI-SDAG-NEXT: s_cselect_b32 s8, 0x4f800000, 1.0
+; SI-SDAG-NEXT: v_mov_b32_e32 v1, s8
+; SI-SDAG-NEXT: v_mul_f32_e32 v1, s7, v1
+; SI-SDAG-NEXT: v_log_f32_e32 v1, v1
+; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0
+; SI-SDAG-NEXT: s_cselect_b32 s7, 0x411a209b, 0
+; SI-SDAG-NEXT: s_and_b64 s[8:9], vcc, exec
+; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209a, v1
+; SI-SDAG-NEXT: v_fma_f32 v3, v1, s10, -v2
+; SI-SDAG-NEXT: v_fma_f32 v3, v1, s11, v3
+; SI-SDAG-NEXT: s_cselect_b32 s8, 0x4f800000, 1.0
+; SI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3
+; SI-SDAG-NEXT: v_mov_b32_e32 v3, s8
+; SI-SDAG-NEXT: v_mul_f32_e32 v3, s6, v3
; SI-SDAG-NEXT: v_log_f32_e32 v4, v3
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc
-; SI-SDAG-NEXT: v_sub_f32_e32 v3, v2, v3
-; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s9, v0
-; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209a, v4
-; SI-SDAG-NEXT: v_fma_f32 v6, v4, s11, -v2
-; SI-SDAG-NEXT: v_fma_f32 v6, v4, s12, v6
-; SI-SDAG-NEXT: v_add_f32_e32 v2, v2, v6
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v6, 1.0, v1, vcc
-; SI-SDAG-NEXT: v_mul_f32_e32 v6, s9, v6
-; SI-SDAG-NEXT: v_log_f32_e32 v6, v6
-; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v4|, s13
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[2:3]
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, v5, s[0:1]
-; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s8, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, v1, s[0:1]
-; SI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v4
-; SI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3e9a209a, v6
-; SI-SDAG-NEXT: v_mul_f32_e32 v0, s8, v0
-; SI-SDAG-NEXT: v_fma_f32 v7, v6, s11, -v4
+; SI-SDAG-NEXT: s_mov_b32 s12, 0x7f800000
+; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s12
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; SI-SDAG-NEXT: v_subrev_f32_e32 v3, s7, v1
+; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v4
+; SI-SDAG-NEXT: v_fma_f32 v2, v4, s10, -v1
+; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0
+; SI-SDAG-NEXT: v_fma_f32 v2, v4, s11, v2
+; SI-SDAG-NEXT: s_cselect_b32 s8, 0x411a209b, 0
+; SI-SDAG-NEXT: s_and_b64 s[6:7], vcc, exec
+; SI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2
+; SI-SDAG-NEXT: s_cselect_b32 s6, 0x4f800000, 1.0
+; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v4|, s12
+; SI-SDAG-NEXT: v_mov_b32_e32 v2, s6
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
+; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
+; SI-SDAG-NEXT: v_mul_f32_e32 v2, s5, v2
+; SI-SDAG-NEXT: s_cselect_b32 s5, 0x411a209b, 0
+; SI-SDAG-NEXT: s_and_b64 s[6:7], vcc, exec
+; SI-SDAG-NEXT: v_log_f32_e32 v5, v2
+; SI-SDAG-NEXT: s_cselect_b32 s6, 0x4f800000, 1.0
+; SI-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; SI-SDAG-NEXT: v_mul_f32_e32 v0, s4, v0
; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
-; SI-SDAG-NEXT: v_fma_f32 v7, v6, s12, v7
-; SI-SDAG-NEXT: v_add_f32_e32 v4, v4, v7
-; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v6|, s13
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, v6, v4, s[2:3]
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v5, vcc
-; SI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v4
+; SI-SDAG-NEXT: v_subrev_f32_e32 v2, s8, v1
+; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v5
+; SI-SDAG-NEXT: v_fma_f32 v4, v5, s10, -v1
+; SI-SDAG-NEXT: v_fma_f32 v4, v5, s11, v4
+; SI-SDAG-NEXT: v_add_f32_e32 v1, v1, v4
+; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v5|, s12
; SI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3e9a209a, v0
-; SI-SDAG-NEXT: v_fma_f32 v6, v0, s11, -v4
-; SI-SDAG-NEXT: v_fma_f32 v6, v0, s12, v6
-; SI-SDAG-NEXT: v_add_f32_e32 v4, v4, v6
-; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s13
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; SI-SDAG-NEXT: v_fma_f32 v5, v0, s10, -v4
+; SI-SDAG-NEXT: v_fma_f32 v5, v0, s11, v5
+; SI-SDAG-NEXT: v_add_f32_e32 v4, v4, v5
+; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s12
; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, v5, s[0:1]
-; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; SI-SDAG-NEXT: s_mov_b32 s6, -1
-; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v4
-; SI-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; SI-SDAG-NEXT: s_cselect_b32 s4, 0x411a209b, 0
+; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; SI-SDAG-NEXT: s_mov_b32 s2, -1
+; SI-SDAG-NEXT: v_subrev_f32_e32 v1, s5, v1
+; SI-SDAG-NEXT: v_subrev_f32_e32 v0, s4, v0
+; SI-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-SDAG-NEXT: s_endpgm
;
; SI-GISEL-LABEL: s_log10_v4f32:
@@ -1482,82 +1509,89 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; VI-SDAG-LABEL: s_log10_v4f32:
; VI-SDAG: ; %bb.0:
; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34
-; VI-SDAG-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc
-; VI-SDAG-NEXT: v_mul_f32_e32 v2, s7, v2
-; VI-SDAG-NEXT: v_log_f32_e32 v2, v2
-; VI-SDAG-NEXT: s_mov_b32 s7, 0x7f800000
-; VI-SDAG-NEXT: v_and_b32_e32 v3, 0xfffff000, v2
-; VI-SDAG-NEXT: v_sub_f32_e32 v4, v2, v3
-; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x369a84fb, v3
-; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3e9a2000, v4
-; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v4
-; VI-SDAG-NEXT: v_add_f32_e32 v4, v5, v4
-; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3e9a2000, v3
-; VI-SDAG-NEXT: v_add_f32_e32 v4, v6, v4
-; VI-SDAG-NEXT: v_add_f32_e32 v3, v3, v4
-; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v2|, s7
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1]
-; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s6, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, v1, s[0:1]
+; VI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec
+; VI-SDAG-NEXT: s_cselect_b32 s0, 0x4f800000, 1.0
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; VI-SDAG-NEXT: v_mul_f32_e32 v1, s7, v1
+; VI-SDAG-NEXT: v_log_f32_e32 v1, v1
+; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0
+; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; VI-SDAG-NEXT: s_cselect_b32 s8, 0x411a209b, 0
+; VI-SDAG-NEXT: v_and_b32_e32 v2, 0xfffff000, v1
+; VI-SDAG-NEXT: v_sub_f32_e32 v3, v1, v2
+; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v2
+; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3e9a2000, v3
+; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v3
+; VI-SDAG-NEXT: v_add_f32_e32 v3, v4, v3
+; VI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a2000, v2
+; VI-SDAG-NEXT: v_add_f32_e32 v3, v5, v3
+; VI-SDAG-NEXT: s_cselect_b32 s2, 0x4f800000, 1.0
+; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s2
; VI-SDAG-NEXT: v_mul_f32_e32 v3, s6, v3
; VI-SDAG-NEXT: v_log_f32_e32 v4, v3
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0x411a209b
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc
-; VI-SDAG-NEXT: v_sub_f32_e32 v3, v2, v3
-; VI-SDAG-NEXT: v_and_b32_e32 v2, 0xfffff000, v4
-; VI-SDAG-NEXT: v_sub_f32_e32 v6, v4, v2
-; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3e9a2000, v6
-; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x369a84fb, v6
-; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x369a84fb, v2
-; VI-SDAG-NEXT: v_add_f32_e32 v6, v8, v6
-; VI-SDAG-NEXT: v_add_f32_e32 v6, v7, v6
-; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a2000, v2
+; VI-SDAG-NEXT: s_mov_b32 s7, 0x7f800000
+; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s7
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; VI-SDAG-NEXT: v_subrev_f32_e32 v3, s8, v1
+; VI-SDAG-NEXT: v_and_b32_e32 v1, 0xfffff000, v4
+; VI-SDAG-NEXT: v_sub_f32_e32 v2, v4, v1
+; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3e9a2000, v2
+; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x369a84fb, v2
+; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x369a84fb, v1
; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0
-; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v6
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v6, 1.0, v1, vcc
-; VI-SDAG-NEXT: v_mul_f32_e32 v6, s5, v6
-; VI-SDAG-NEXT: v_log_f32_e32 v6, v6
-; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v4|, s7
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[2:3]
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, v5, s[0:1]
-; VI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v4
-; VI-SDAG-NEXT: v_and_b32_e32 v4, 0xfffff000, v6
-; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s4, v0
-; VI-SDAG-NEXT: v_sub_f32_e32 v7, v6, v4
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, v1, s[0:1]
-; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x3e9a2000, v7
-; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x369a84fb, v7
-; VI-SDAG-NEXT: v_mul_f32_e32 v9, 0x369a84fb, v4
+; VI-SDAG-NEXT: v_add_f32_e32 v2, v6, v2
+; VI-SDAG-NEXT: s_cselect_b32 s6, 0x411a209b, 0
+; VI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; VI-SDAG-NEXT: v_add_f32_e32 v2, v5, v2
+; VI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a2000, v1
+; VI-SDAG-NEXT: s_cselect_b32 s2, 0x4f800000, 1.0
+; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
+; VI-SDAG-NEXT: v_mul_f32_e32 v2, s5, v2
+; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v4|, s7
+; VI-SDAG-NEXT: v_log_f32_e32 v5, v2
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
+; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
+; VI-SDAG-NEXT: s_cselect_b32 s5, 0x411a209b, 0
+; VI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; VI-SDAG-NEXT: s_cselect_b32 s2, 0x4f800000, 1.0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_subrev_f32_e32 v2, s6, v1
+; VI-SDAG-NEXT: v_and_b32_e32 v1, 0xfffff000, v5
; VI-SDAG-NEXT: v_mul_f32_e32 v0, s4, v0
-; VI-SDAG-NEXT: v_add_f32_e32 v7, v9, v7
+; VI-SDAG-NEXT: v_sub_f32_e32 v4, v5, v1
; VI-SDAG-NEXT: v_log_f32_e32 v0, v0
-; VI-SDAG-NEXT: v_add_f32_e32 v7, v8, v7
-; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3e9a2000, v4
-; VI-SDAG-NEXT: v_add_f32_e32 v4, v4, v7
-; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v6|, s7
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, v6, v4, s[2:3]
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v5, vcc
-; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v4
+; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3e9a2000, v4
+; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v4
+; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x369a84fb, v1
+; VI-SDAG-NEXT: v_add_f32_e32 v4, v7, v4
+; VI-SDAG-NEXT: v_add_f32_e32 v4, v6, v4
+; VI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a2000, v1
+; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v4
+; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v5|, s7
; VI-SDAG-NEXT: v_and_b32_e32 v4, 0xfffff000, v0
-; VI-SDAG-NEXT: v_sub_f32_e32 v6, v0, v4
-; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3e9a2000, v6
-; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x369a84fb, v6
-; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x369a84fb, v4
-; VI-SDAG-NEXT: v_add_f32_e32 v6, v8, v6
-; VI-SDAG-NEXT: v_add_f32_e32 v6, v7, v6
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; VI-SDAG-NEXT: v_sub_f32_e32 v5, v0, v4
+; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3e9a2000, v5
+; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x369a84fb, v5
+; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x369a84fb, v4
+; VI-SDAG-NEXT: v_add_f32_e32 v5, v7, v5
+; VI-SDAG-NEXT: v_add_f32_e32 v5, v6, v5
; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3e9a2000, v4
-; VI-SDAG-NEXT: v_add_f32_e32 v4, v4, v6
+; VI-SDAG-NEXT: v_add_f32_e32 v4, v4, v5
; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s7
; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, v5, s[0:1]
-; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v4
-; VI-SDAG-NEXT: v_mov_b32_e32 v4, s8
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s9
+; VI-SDAG-NEXT: s_cselect_b32 s2, 0x411a209b, 0
+; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s1
+; VI-SDAG-NEXT: v_subrev_f32_e32 v1, s5, v1
+; VI-SDAG-NEXT: v_subrev_f32_e32 v0, s2, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v4, s0
; VI-SDAG-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-SDAG-NEXT: s_endpgm
;
@@ -1646,64 +1680,70 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; GFX900-SDAG-LABEL: s_log10_v4f32:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34
-; GFX900-SDAG-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24
+; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
-; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000
-; GFX900-SDAG-NEXT: s_mov_b32 s10, 0x3284fbcf
+; GFX900-SDAG-NEXT: s_mov_b32 s8, 0x3e9a209a
+; GFX900-SDAG-NEXT: s_mov_b32 s9, 0x7f800000
; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc
-; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s7, v2
-; GFX900-SDAG-NEXT: v_log_f32_e32 v2, v2
-; GFX900-SDAG-NEXT: s_mov_b32 s7, 0x3e9a209a
-; GFX900-SDAG-NEXT: s_mov_b32 s11, 0x7f800000
-; GFX900-SDAG-NEXT: v_mov_b32_e32 v6, 0x411a209b
-; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, 0x3e9a209a, v2
-; GFX900-SDAG-NEXT: v_fma_f32 v5, v2, s7, -v3
-; GFX900-SDAG-NEXT: v_fma_f32 v5, v2, s10, v5
-; GFX900-SDAG-NEXT: v_add_f32_e32 v3, v3, v5
-; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v2|, s11
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1]
-; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s6, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, v1, s[0:1]
+; GFX900-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; GFX900-SDAG-NEXT: s_cselect_b32 s2, 0x4f800000, 1.0
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, s2
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, s7, v1
+; GFX900-SDAG-NEXT: v_log_f32_e32 v1, v1
+; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0
+; GFX900-SDAG-NEXT: s_mov_b32 s7, 0x3284fbcf
+; GFX900-SDAG-NEXT: s_cselect_b32 s10, 0x411a209b, 0
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209a, v1
+; GFX900-SDAG-NEXT: v_fma_f32 v3, v1, s8, -v2
+; GFX900-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; GFX900-SDAG-NEXT: v_fma_f32 v3, v1, s7, v3
+; GFX900-SDAG-NEXT: s_cselect_b32 s2, 0x4f800000, 1.0
+; GFX900-SDAG-NEXT: v_add_f32_e32 v2, v2, v3
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, s2
; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, s6, v3
; GFX900-SDAG-NEXT: v_log_f32_e32 v5, v3
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v6, vcc
-; GFX900-SDAG-NEXT: v_sub_f32_e32 v3, v2, v3
+; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s9
+; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX900-SDAG-NEXT: v_subrev_f32_e32 v3, s10, v1
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v5
+; GFX900-SDAG-NEXT: v_fma_f32 v2, v5, s8, -v1
; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0
-; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209a, v5
-; GFX900-SDAG-NEXT: v_fma_f32 v7, v5, s7, -v2
-; GFX900-SDAG-NEXT: v_fma_f32 v7, v5, s10, v7
-; GFX900-SDAG-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v7, 1.0, v1, vcc
-; GFX900-SDAG-NEXT: v_mul_f32_e32 v7, s5, v7
-; GFX900-SDAG-NEXT: v_log_f32_e32 v7, v7
-; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v5|, s11
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[2:3]
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, v6, s[0:1]
-; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s4, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, v1, s[0:1]
-; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v2, v5
-; GFX900-SDAG-NEXT: v_mul_f32_e32 v5, 0x3e9a209a, v7
+; GFX900-SDAG-NEXT: v_fma_f32 v2, v5, s7, v2
+; GFX900-SDAG-NEXT: s_cselect_b32 s6, 0x411a209b, 0
+; GFX900-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; GFX900-SDAG-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX900-SDAG-NEXT: s_cselect_b32 s2, 0x4f800000, 1.0
+; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v5|, s9
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, s2
+; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s5, v2
+; GFX900-SDAG-NEXT: s_cselect_b32 s5, 0x411a209b, 0
+; GFX900-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; GFX900-SDAG-NEXT: v_log_f32_e32 v6, v2
+; GFX900-SDAG-NEXT: s_cselect_b32 s2, 0x4f800000, 1.0
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, s2
; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, s4, v0
-; GFX900-SDAG-NEXT: v_fma_f32 v8, v7, s7, -v5
; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0
-; GFX900-SDAG-NEXT: v_fma_f32 v8, v7, s10, v8
-; GFX900-SDAG-NEXT: v_add_f32_e32 v5, v5, v8
-; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v7|, s11
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, v7, v5, s[2:3]
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v6, vcc
-; GFX900-SDAG-NEXT: v_sub_f32_e32 v1, v1, v5
+; GFX900-SDAG-NEXT: v_subrev_f32_e32 v2, s6, v1
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v6
+; GFX900-SDAG-NEXT: v_fma_f32 v5, v6, s8, -v1
+; GFX900-SDAG-NEXT: v_fma_f32 v5, v6, s7, v5
+; GFX900-SDAG-NEXT: v_add_f32_e32 v1, v1, v5
+; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v6|, s9
; GFX900-SDAG-NEXT: v_mul_f32_e32 v5, 0x3e9a209a, v0
-; GFX900-SDAG-NEXT: v_fma_f32 v7, v0, s7, -v5
-; GFX900-SDAG-NEXT: v_fma_f32 v7, v0, s10, v7
-; GFX900-SDAG-NEXT: v_add_f32_e32 v5, v5, v7
-; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s11
+; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
+; GFX900-SDAG-NEXT: v_fma_f32 v6, v0, s8, -v5
+; GFX900-SDAG-NEXT: v_fma_f32 v6, v0, s7, v6
+; GFX900-SDAG-NEXT: v_add_f32_e32 v5, v5, v6
+; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s9
; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, v6, s[0:1]
+; GFX900-SDAG-NEXT: s_cselect_b32 s2, 0x411a209b, 0
; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, 0
-; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v5
-; GFX900-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9]
+; GFX900-SDAG-NEXT: v_subrev_f32_e32 v1, s5, v1
+; GFX900-SDAG-NEXT: v_subrev_f32_e32 v0, s2, v0
+; GFX900-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX900-SDAG-NEXT: s_endpgm
;
; GFX900-GISEL-LABEL: s_log10_v4f32:
@@ -1771,61 +1811,63 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
;
; GFX1100-SDAG-LABEL: s_log10_v4f32:
; GFX1100-SDAG: ; %bb.0:
-; GFX1100-SDAG-NEXT: s_clause 0x1
; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x34
-; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s7
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s6
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s10, 0x800000, s5
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s11, 0x800000, s4
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s8
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s9
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s10
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s11
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x411a209b, s8
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1100-SDAG-NEXT: v_dual_mul_f32 v0, s7, v0 :: v_dual_mul_f32 v1, s6, v1
-; GFX1100-SDAG-NEXT: v_dual_mul_f32 v2, s5, v2 :: v_dual_mul_f32 v3, s4, v3
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x411a209b, s9
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, s7
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s1, 0x800000, s6
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s5
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s4
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX1100-SDAG-NEXT: s_and_b32 s0, s0, exec_lo
+; GFX1100-SDAG-NEXT: s_cselect_b32 s0, 0x4f800000, 1.0
+; GFX1100-SDAG-NEXT: s_cselect_b32 s10, 0x411a209b, 0
+; GFX1100-SDAG-NEXT: s_and_b32 s1, s1, exec_lo
+; GFX1100-SDAG-NEXT: v_mul_f32_e64 v0, s7, s0
+; GFX1100-SDAG-NEXT: s_cselect_b32 s1, 0x4f800000, 1.0
+; GFX1100-SDAG-NEXT: s_cselect_b32 s11, 0x411a209b, 0
+; GFX1100-SDAG-NEXT: s_and_b32 s8, s8, exec_lo
+; GFX1100-SDAG-NEXT: s_cselect_b32 s0, 0x4f800000, 1.0
+; GFX1100-SDAG-NEXT: s_cselect_b32 s7, 0x411a209b, 0
+; GFX1100-SDAG-NEXT: s_and_b32 s8, s9, exec_lo
+; GFX1100-SDAG-NEXT: v_mul_f32_e64 v1, s6, s1
+; GFX1100-SDAG-NEXT: s_cselect_b32 s1, 0x4f800000, 1.0
+; GFX1100-SDAG-NEXT: v_mul_f32_e64 v2, s5, s0
+; GFX1100-SDAG-NEXT: v_mul_f32_e64 v3, s4, s1
; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, v1
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(TRANS32_DEP_3)
+; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX1100-SDAG-NEXT: v_log_f32_e32 v2, v2
; GFX1100-SDAG-NEXT: v_log_f32_e32 v3, v3
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v14, 0, 0x411a209b, s10
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v15, 0, 0x411a209b, s11
-; GFX1100-SDAG-NEXT: v_dual_mul_f32 v5, 0x3e9a209a, v0 :: v_dual_mul_f32 v6, 0x3e9a209a, v1
-; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
-; GFX1100-SDAG-NEXT: v_dual_mul_f32 v7, 0x3e9a209a, v2 :: v_dual_mul_f32 v8, 0x3e9a209a, v3
+; GFX1100-SDAG-NEXT: s_cselect_b32 s2, 0x411a209b, 0
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(TRANS32_DEP_3)
+; GFX1100-SDAG-NEXT: v_dual_mul_f32 v4, 0x3e9a209a, v0 :: v_dual_mul_f32 v5, 0x3e9a209a, v1
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
-; GFX1100-SDAG-NEXT: v_fma_f32 v10, 0x3e9a209a, v0, -v5
-; GFX1100-SDAG-NEXT: v_fma_f32 v11, 0x3e9a209a, v1, -v6
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1100-SDAG-NEXT: v_fma_f32 v12, 0x3e9a209a, v2, -v7
-; GFX1100-SDAG-NEXT: v_fma_f32 v13, 0x3e9a209a, v3, -v8
-; GFX1100-SDAG-NEXT: v_dual_fmac_f32 v10, 0x3284fbcf, v0 :: v_dual_fmac_f32 v11, 0x3284fbcf, v1
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1100-SDAG-NEXT: v_dual_fmac_f32 v12, 0x3284fbcf, v2 :: v_dual_fmac_f32 v13, 0x3284fbcf, v3
-; GFX1100-SDAG-NEXT: v_dual_add_f32 v5, v5, v10 :: v_dual_add_f32 v6, v6, v11
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1100-SDAG-NEXT: v_dual_add_f32 v7, v7, v12 :: v_dual_add_f32 v8, v8, v13
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo
+; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-NEXT: v_dual_mul_f32 v6, 0x3e9a209a, v2 :: v_dual_mul_f32 v7, 0x3e9a209a, v3
+; GFX1100-SDAG-NEXT: v_fma_f32 v8, 0x3e9a209a, v0, -v4
+; GFX1100-SDAG-NEXT: v_fma_f32 v9, 0x3e9a209a, v1, -v5
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1100-SDAG-NEXT: v_fma_f32 v10, 0x3e9a209a, v2, -v6
+; GFX1100-SDAG-NEXT: v_fma_f32 v11, 0x3e9a209a, v3, -v7
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1100-SDAG-NEXT: v_dual_fmac_f32 v8, 0x3284fbcf, v0 :: v_dual_fmac_f32 v9, 0x3284fbcf, v1
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v12, 0
+; GFX1100-SDAG-NEXT: v_dual_fmac_f32 v10, 0x3284fbcf, v2 :: v_dual_fmac_f32 v11, 0x3284fbcf, v3
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-NEXT: v_dual_add_f32 v4, v4, v8 :: v_dual_add_f32 v5, v5, v9
+; GFX1100-SDAG-NEXT: v_dual_add_f32 v7, v7, v11 :: v_dual_cndmask_b32 v0, v0, v4
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v1|
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1100-SDAG-NEXT: v_dual_add_f32 v6, v6, v10 :: v_dual_cndmask_b32 v1, v1, v5
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v2|
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v5, v2, v7, vcc_lo
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc_lo
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v3|
-; GFX1100-SDAG-NEXT: v_dual_mov_b32 v7, 0 :: v_dual_sub_f32 v2, v1, v9
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v6, v3, v8, vcc_lo
-; GFX1100-SDAG-NEXT: v_sub_f32_e32 v3, v0, v4
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc_lo
+; GFX1100-SDAG-NEXT: v_dual_subrev_f32 v3, s10, v0 :: v_dual_subrev_f32 v2, s11, v1
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1100-SDAG-NEXT: v_dual_sub_f32 v1, v5, v14 :: v_dual_sub_f32 v0, v6, v15
-; GFX1100-SDAG-NEXT: global_store_b128 v7, v[0:3], s[0:1]
+; GFX1100-SDAG-NEXT: v_dual_subrev_f32 v1, s7, v4 :: v_dual_subrev_f32 v0, s2, v5
+; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT: global_store_b128 v12, v[0:3], s[0:1]
; GFX1100-SDAG-NEXT: s_nop 0
; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-SDAG-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
index 50c52037dc4d31..4d014d3126b9a6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
@@ -17,17 +17,17 @@ define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) {
; SI-SDAG-NEXT: s_load_dword s4, s[2:3], 0xb
; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
-; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000
-; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000
; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
-; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; SI-SDAG-NEXT: v_mul_f32_e32 v1, s4, v1
-; SI-SDAG-NEXT: v_log_f32_e32 v1, v1
+; SI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; SI-SDAG-NEXT: s_cselect_b32 s2, 0x4f800000, 1.0
+; SI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; SI-SDAG-NEXT: v_mul_f32_e32 v0, s4, v0
+; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
+; SI-SDAG-NEXT: s_cselect_b32 s4, 0x42000000, 0
+; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000
; SI-SDAG-NEXT: s_mov_b32 s2, -1
-; SI-SDAG-NEXT: v_sub_f32_e32 v0, v1, v0
+; SI-SDAG-NEXT: v_subrev_f32_e32 v0, s4, v0
; SI-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-SDAG-NEXT: s_endpgm
;
@@ -55,15 +55,15 @@ define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) {
; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c
; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; VI-SDAG-NEXT: v_mul_f32_e32 v1, s4, v1
-; VI-SDAG-NEXT: v_log_f32_e32 v1, v1
-; VI-SDAG-NEXT: v_sub_f32_e32 v2, v1, v0
+; VI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; VI-SDAG-NEXT: s_cselect_b32 s2, 0x4f800000, 1.0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mul_f32_e32 v0, s4, v0
+; VI-SDAG-NEXT: v_log_f32_e32 v0, v0
+; VI-SDAG-NEXT: s_cselect_b32 s2, 0x42000000, 0
+; VI-SDAG-NEXT: v_subrev_f32_e32 v2, s2, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -93,17 +93,17 @@ define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) {
; GFX900-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c
; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
-; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000
-; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
-; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, s4, v1
-; GFX900-SDAG-NEXT: v_log_f32_e32 v1, v1
-; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v1, v0
-; GFX900-SDAG-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX900-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; GFX900-SDAG-NEXT: s_cselect_b32 s2, 0x4f800000, 1.0
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, s4, v0
+; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0
+; GFX900-SDAG-NEXT: s_cselect_b32 s2, 0x42000000, 0
+; GFX900-SDAG-NEXT: v_subrev_f32_e32 v0, s2, v0
+; GFX900-SDAG-NEXT: global_store_dword v1, v0, s[0:1]
; GFX900-SDAG-NEXT: s_endpgm
;
; GFX900-GISEL-LABEL: s_log2_f32:
@@ -128,20 +128,19 @@ define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) {
; GFX1100-SDAG-LABEL: s_log2_f32:
; GFX1100-SDAG: ; %bb.0:
; GFX1100-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x2c
-; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0
; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, s4
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s0
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x42000000, s0
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-NEXT: s_and_b32 s0, s0, exec_lo
+; GFX1100-SDAG-NEXT: s_cselect_b32 s5, 0x4f800000, 1.0
; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
-; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, s4, v1
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, v1
+; GFX1100-SDAG-NEXT: v_mul_f32_e64 v0, s4, s5
+; GFX1100-SDAG-NEXT: s_cselect_b32 s2, 0x42000000, 0
+; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
-; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v1, v0
+; GFX1100-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_subrev_f32 v0, s2, v0
; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-SDAG-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX1100-SDAG-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX1100-SDAG-NEXT: s_nop 0
; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-SDAG-NEXT: s_endpgm
@@ -218,25 +217,27 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
-; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000
-; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0x4f800000
; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000
+; SI-SDAG-NEXT: s_mov_b32 s6, -1
; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 1.0, v3, vcc
+; SI-SDAG-NEXT: s_and_b64 s[8:9], vcc, exec
+; SI-SDAG-NEXT: s_cselect_b32 s5, 0x4f800000, 1.0
; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v3, vcc
-; SI-SDAG-NEXT: v_mul_f32_e32 v4, s3, v4
-; SI-SDAG-NEXT: v_mul_f32_e32 v1, s2, v1
-; SI-SDAG-NEXT: v_log_f32_e32 v4, v4
-; SI-SDAG-NEXT: v_log_f32_e32 v3, v1
-; SI-SDAG-NEXT: s_mov_b32 s6, -1
; SI-SDAG-NEXT: s_mov_b32 s4, s0
+; SI-SDAG-NEXT: s_cselect_b32 s0, 0x42000000, 0
+; SI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; SI-SDAG-NEXT: s_and_b64 s[8:9], vcc, exec
+; SI-SDAG-NEXT: v_mul_f32_e32 v1, s3, v1
+; SI-SDAG-NEXT: s_cselect_b32 s3, 0x4f800000, 1.0
+; SI-SDAG-NEXT: v_mov_b32_e32 v0, s3
+; SI-SDAG-NEXT: v_log_f32_e32 v1, v1
+; SI-SDAG-NEXT: v_mul_f32_e32 v0, s2, v0
+; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
; SI-SDAG-NEXT: s_mov_b32 s5, s1
-; SI-SDAG-NEXT: v_sub_f32_e32 v1, v4, v2
-; SI-SDAG-NEXT: v_sub_f32_e32 v0, v3, v0
+; SI-SDAG-NEXT: v_subrev_f32_e32 v1, s0, v1
+; SI-SDAG-NEXT: s_cselect_b32 s0, 0x42000000, 0
+; SI-SDAG-NEXT: v_subrev_f32_e32 v0, s0, v0
; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-SDAG-NEXT: s_endpgm
;
@@ -266,25 +267,27 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
;
; VI-SDAG-LABEL: s_log2_v2f32:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v1, vcc
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 1.0, v2, vcc
-; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; VI-SDAG-NEXT: v_mul_f32_e32 v4, s3, v4
-; VI-SDAG-NEXT: v_mul_f32_e32 v1, s2, v1
-; VI-SDAG-NEXT: v_log_f32_e32 v4, v4
+; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0
+; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s6, v0
+; VI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; VI-SDAG-NEXT: s_cselect_b32 s2, 0x42000000, 0
+; VI-SDAG-NEXT: s_cselect_b32 s3, 0x4f800000, 1.0
+; VI-SDAG-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; VI-SDAG-NEXT: s_cselect_b32 s0, 0x4f800000, 1.0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; VI-SDAG-NEXT: v_mul_f32_e32 v0, s7, v0
+; VI-SDAG-NEXT: v_mul_f32_e32 v1, s6, v1
+; VI-SDAG-NEXT: v_log_f32_e32 v0, v0
; VI-SDAG-NEXT: v_log_f32_e32 v2, v1
-; VI-SDAG-NEXT: v_sub_f32_e32 v1, v4, v3
-; VI-SDAG-NEXT: v_sub_f32_e32 v0, v2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; VI-SDAG-NEXT: s_cselect_b32 s0, 0x42000000, 0
+; VI-SDAG-NEXT: v_subrev_f32_e32 v1, s2, v0
+; VI-SDAG-NEXT: v_subrev_f32_e32 v0, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s4
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
; VI-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-SDAG-NEXT: s_endpgm
;
@@ -316,23 +319,25 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
-; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000
-; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000
-; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, 0
; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v1, vcc
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v4, 1.0, v2, vcc
-; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX900-SDAG-NEXT: v_mul_f32_e32 v4, s7, v4
+; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s6, v0
+; GFX900-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; GFX900-SDAG-NEXT: s_cselect_b32 s2, 0x42000000, 0
+; GFX900-SDAG-NEXT: s_cselect_b32 s3, 0x4f800000, 1.0
+; GFX900-SDAG-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; GFX900-SDAG-NEXT: s_cselect_b32 s0, 0x4f800000, 1.0
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, s3
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, s7, v0
; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, s6, v1
-; GFX900-SDAG-NEXT: v_log_f32_e32 v4, v4
+; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX900-SDAG-NEXT: v_log_f32_e32 v2, v1
-; GFX900-SDAG-NEXT: v_sub_f32_e32 v1, v4, v3
-; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v2, v0
-; GFX900-SDAG-NEXT: global_store_dwordx2 v5, v[0:1], s[4:5]
+; GFX900-SDAG-NEXT: s_cselect_b32 s0, 0x42000000, 0
+; GFX900-SDAG-NEXT: v_subrev_f32_e32 v1, s2, v0
+; GFX900-SDAG-NEXT: v_subrev_f32_e32 v0, s0, v2
+; GFX900-SDAG-NEXT: global_store_dwordx2 v3, v[0:1], s[4:5]
; GFX900-SDAG-NEXT: s_endpgm
;
; GFX900-GISEL-LABEL: s_log2_v2f32:
@@ -361,24 +366,25 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
; GFX1100-SDAG-LABEL: s_log2_v2f32:
; GFX1100-SDAG: ; %bb.0:
; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
-; GFX1100-SDAG-NEXT: v_mov_b32_e32 v4, 0
; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s3
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s5, 0x800000, s2
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s4
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s5
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x42000000, s4
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, s5
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, s3, v1
-; GFX1100-SDAG-NEXT: v_mul_f32_e32 v3, s2, v3
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-NEXT: s_and_b32 s4, s4, exec_lo
+; GFX1100-SDAG-NEXT: s_cselect_b32 s4, 0x42000000, 0
+; GFX1100-SDAG-NEXT: s_cselect_b32 s6, 0x4f800000, 1.0
+; GFX1100-SDAG-NEXT: s_and_b32 s5, s5, exec_lo
+; GFX1100-SDAG-NEXT: s_cselect_b32 s5, 0x4f800000, 1.0
+; GFX1100-SDAG-NEXT: v_mul_f32_e64 v0, s3, s6
+; GFX1100-SDAG-NEXT: v_mul_f32_e64 v1, s2, s5
+; GFX1100-SDAG-NEXT: s_cselect_b32 s2, 0x42000000, 0
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, v1
-; GFX1100-SDAG-NEXT: v_log_f32_e32 v3, v3
+; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
+; GFX1100-SDAG-NEXT: v_log_f32_e32 v2, v1
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v3, 0
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
-; GFX1100-SDAG-NEXT: v_dual_sub_f32 v1, v1, v0 :: v_dual_sub_f32 v0, v3, v2
-; GFX1100-SDAG-NEXT: global_store_b64 v4, v[0:1], s[0:1]
+; GFX1100-SDAG-NEXT: v_dual_subrev_f32 v1, s4, v0 :: v_dual_subrev_f32 v0, s2, v2
+; GFX1100-SDAG-NEXT: global_store_b64 v3, v[0:1], s[0:1]
; GFX1100-SDAG-NEXT: s_nop 0
; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-SDAG-NEXT: s_endpgm
@@ -476,29 +482,33 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd
; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
-; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000
-; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0x4f800000
+; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; SI-SDAG-NEXT: s_mov_b32 s2, -1
; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 1.0, v3, vcc
+; SI-SDAG-NEXT: s_and_b64 s[8:9], vcc, exec
+; SI-SDAG-NEXT: s_cselect_b32 s8, 0x4f800000, 1.0
; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v6, 1.0, v3, vcc
+; SI-SDAG-NEXT: s_cselect_b32 s7, 0x42000000, 0
+; SI-SDAG-NEXT: v_mov_b32_e32 v1, s8
+; SI-SDAG-NEXT: s_and_b64 s[8:9], vcc, exec
+; SI-SDAG-NEXT: v_mul_f32_e32 v1, s5, v1
+; SI-SDAG-NEXT: s_cselect_b32 s5, 0x4f800000, 1.0
+; SI-SDAG-NEXT: v_mov_b32_e32 v2, s5
; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v3, vcc
-; SI-SDAG-NEXT: v_mul_f32_e32 v4, s5, v4
+; SI-SDAG-NEXT: s_cselect_b32 s8, 0x42000000, 0
+; SI-SDAG-NEXT: v_mul_f32_e32 v2, s4, v2
+; SI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec
+; SI-SDAG-NEXT: s_cselect_b32 s4, 0x4f800000, 1.0
+; SI-SDAG-NEXT: v_mov_b32_e32 v0, s4
; SI-SDAG-NEXT: v_mul_f32_e32 v0, s6, v0
-; SI-SDAG-NEXT: v_log_f32_e32 v4, v4
-; SI-SDAG-NEXT: v_mul_f32_e32 v6, s4, v6
+; SI-SDAG-NEXT: v_log_f32_e32 v2, v2
; SI-SDAG-NEXT: v_log_f32_e32 v3, v0
-; SI-SDAG-NEXT: v_log_f32_e32 v6, v6
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v7, 0, v1, vcc
-; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000
-; SI-SDAG-NEXT: s_mov_b32 s2, -1
-; SI-SDAG-NEXT: v_sub_f32_e32 v1, v4, v2
-; SI-SDAG-NEXT: v_sub_f32_e32 v2, v3, v7
-; SI-SDAG-NEXT: v_sub_f32_e32 v0, v6, v5
+; SI-SDAG-NEXT: v_log_f32_e32 v1, v1
+; SI-SDAG-NEXT: s_cselect_b32 s4, 0x42000000, 0
+; SI-SDAG-NEXT: v_subrev_f32_e32 v0, s8, v2
+; SI-SDAG-NEXT: v_subrev_f32_e32 v2, s4, v3
+; SI-SDAG-NEXT: v_subrev_f32_e32 v1, s7, v1
; SI-SDAG-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:8
; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-SDAG-NEXT: s_endpgm
@@ -538,31 +548,35 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; VI-SDAG-LABEL: s_log2_v3f32:
; VI-SDAG: ; %bb.0:
; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x4f800000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 1.0, v3, vcc
-; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v6, 1.0, v3, vcc
+; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s5, v0
+; VI-SDAG-NEXT: s_and_b64 s[8:9], vcc, exec
+; VI-SDAG-NEXT: s_cselect_b32 s7, 0x42000000, 0
+; VI-SDAG-NEXT: s_cselect_b32 s8, 0x4f800000, 1.0
+; VI-SDAG-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s8
+; VI-SDAG-NEXT: s_cselect_b32 s0, 0x4f800000, 1.0
; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v3, vcc
-; VI-SDAG-NEXT: v_mul_f32_e32 v4, s6, v4
-; VI-SDAG-NEXT: v_mul_f32_e32 v1, s4, v1
-; VI-SDAG-NEXT: v_log_f32_e32 v4, v4
-; VI-SDAG-NEXT: v_mul_f32_e32 v6, s5, v6
-; VI-SDAG-NEXT: v_log_f32_e32 v3, v1
-; VI-SDAG-NEXT: v_log_f32_e32 v6, v6
-; VI-SDAG-NEXT: v_sub_f32_e32 v2, v4, v2
-; VI-SDAG-NEXT: v_sub_f32_e32 v0, v3, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v4, s1
-; VI-SDAG-NEXT: v_sub_f32_e32 v1, v6, v5
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s0
+; VI-SDAG-NEXT: v_mul_f32_e32 v1, s6, v1
+; VI-SDAG-NEXT: s_cselect_b32 s6, 0x42000000, 0
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; VI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec
+; VI-SDAG-NEXT: s_cselect_b32 s0, 0x4f800000, 1.0
+; VI-SDAG-NEXT: v_mul_f32_e32 v2, s5, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; VI-SDAG-NEXT: v_log_f32_e32 v1, v1
+; VI-SDAG-NEXT: v_log_f32_e32 v3, v2
+; VI-SDAG-NEXT: v_mul_f32_e32 v0, s4, v0
+; VI-SDAG-NEXT: v_log_f32_e32 v0, v0
+; VI-SDAG-NEXT: v_subrev_f32_e32 v2, s7, v1
+; VI-SDAG-NEXT: v_subrev_f32_e32 v1, s6, v3
+; VI-SDAG-NEXT: s_cselect_b32 s0, 0x42000000, 0
+; VI-SDAG-NEXT: v_mov_b32_e32 v4, s3
+; VI-SDAG-NEXT: v_subrev_f32_e32 v0, s0, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s2
; VI-SDAG-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
; VI-SDAG-NEXT: s_endpgm
;
@@ -602,29 +616,33 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34
; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
-; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000
-; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, 0x4f800000
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, 0
; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v4, 1.0, v3, vcc
+; GFX900-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; GFX900-SDAG-NEXT: s_cselect_b32 s2, 0x4f800000, 1.0
; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v6, 1.0, v3, vcc
+; GFX900-SDAG-NEXT: s_cselect_b32 s7, 0x42000000, 0
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, s2
+; GFX900-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; GFX900-SDAG-NEXT: s_cselect_b32 s2, 0x4f800000, 1.0
; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v3, vcc
-; GFX900-SDAG-NEXT: v_mul_f32_e32 v4, s6, v4
-; GFX900-SDAG-NEXT: v_mul_f32_e32 v6, s5, v6
-; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, s4, v1
-; GFX900-SDAG-NEXT: v_log_f32_e32 v4, v4
-; GFX900-SDAG-NEXT: v_log_f32_e32 v6, v6
-; GFX900-SDAG-NEXT: v_log_f32_e32 v3, v1
-; GFX900-SDAG-NEXT: v_mov_b32_e32 v7, 0
-; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v4, v2
-; GFX900-SDAG-NEXT: v_sub_f32_e32 v1, v6, v5
-; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v3, v0
-; GFX900-SDAG-NEXT: global_store_dwordx3 v7, v[0:2], s[0:1]
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, s6, v1
+; GFX900-SDAG-NEXT: s_cselect_b32 s6, 0x42000000, 0
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, s2
+; GFX900-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; GFX900-SDAG-NEXT: s_cselect_b32 s2, 0x4f800000, 1.0
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s5, v2
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, s4, v0
+; GFX900-SDAG-NEXT: v_log_f32_e32 v1, v1
+; GFX900-SDAG-NEXT: v_log_f32_e32 v4, v2
+; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0
+; GFX900-SDAG-NEXT: s_cselect_b32 s2, 0x42000000, 0
+; GFX900-SDAG-NEXT: v_subrev_f32_e32 v2, s7, v1
+; GFX900-SDAG-NEXT: v_subrev_f32_e32 v1, s6, v4
+; GFX900-SDAG-NEXT: v_subrev_f32_e32 v0, s2, v0
+; GFX900-SDAG-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1]
; GFX900-SDAG-NEXT: s_endpgm
;
; GFX900-GISEL-LABEL: s_log2_v3f32:
@@ -659,33 +677,36 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
;
; GFX1100-SDAG-LABEL: s_log2_v3f32:
; GFX1100-SDAG: ; %bb.0:
+; GFX1100-SDAG-NEXT: s_clause 0x1
; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x34
-; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, s6
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s1, 0x800000, s5
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s4
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s0
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 1.0, 0x4f800000, s1
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v5, 1.0, 0x4f800000, s7
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x42000000, s0
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s1
; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
-; GFX1100-SDAG-NEXT: v_dual_mul_f32 v2, s6, v2 :: v_dual_mul_f32 v5, s4, v5
-; GFX1100-SDAG-NEXT: v_mul_f32_e32 v4, s5, v4
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, 0x42000000, s7
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1100-SDAG-NEXT: v_log_f32_e32 v2, v2
-; GFX1100-SDAG-NEXT: v_log_f32_e32 v5, v5
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_3)
-; GFX1100-SDAG-NEXT: v_log_f32_e32 v4, v4
-; GFX1100-SDAG-NEXT: v_mov_b32_e32 v6, 0
-; GFX1100-SDAG-NEXT: v_sub_f32_e32 v2, v2, v0
-; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
-; GFX1100-SDAG-NEXT: v_dual_sub_f32 v0, v5, v3 :: v_dual_sub_f32 v1, v4, v1
; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-SDAG-NEXT: global_store_b96 v6, v[0:2], s[0:1]
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s6
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s5
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s4
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX1100-SDAG-NEXT: s_and_b32 s7, s7, exec_lo
+; GFX1100-SDAG-NEXT: s_cselect_b32 s7, 0x42000000, 0
+; GFX1100-SDAG-NEXT: s_cselect_b32 s8, 0x4f800000, 1.0
+; GFX1100-SDAG-NEXT: s_and_b32 s2, s2, exec_lo
+; GFX1100-SDAG-NEXT: s_cselect_b32 s2, 0x42000000, 0
+; GFX1100-SDAG-NEXT: s_cselect_b32 s9, 0x4f800000, 1.0
+; GFX1100-SDAG-NEXT: s_and_b32 s3, s3, exec_lo
+; GFX1100-SDAG-NEXT: s_cselect_b32 s3, 0x4f800000, 1.0
+; GFX1100-SDAG-NEXT: v_mul_f32_e64 v0, s6, s8
+; GFX1100-SDAG-NEXT: v_mul_f32_e64 v1, s5, s9
+; GFX1100-SDAG-NEXT: v_mul_f32_e64 v2, s4, s3
+; GFX1100-SDAG-NEXT: s_cselect_b32 s3, 0x42000000, 0
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
+; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, v1
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-SDAG-NEXT: v_log_f32_e32 v3, v2
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-NEXT: v_dual_subrev_f32 v2, s7, v0 :: v_dual_subrev_f32 v1, s2, v1
+; GFX1100-SDAG-NEXT: v_subrev_f32_e32 v0, s3, v3
+; GFX1100-SDAG-NEXT: global_store_b96 v4, v[0:2], s[0:1]
; GFX1100-SDAG-NEXT: s_nop 0
; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-SDAG-NEXT: s_endpgm
@@ -814,38 +835,44 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
; SI-SDAG-LABEL: s_log2_v4f32:
; SI-SDAG: ; %bb.0:
-; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd
+; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
-; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000
-; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0x4f800000
; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; SI-SDAG-NEXT: s_mov_b32 s2, -1
; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 1.0, v3, vcc
+; SI-SDAG-NEXT: s_and_b64 s[8:9], vcc, exec
+; SI-SDAG-NEXT: s_cselect_b32 s8, 0x4f800000, 1.0
+; SI-SDAG-NEXT: v_mov_b32_e32 v1, s8
+; SI-SDAG-NEXT: v_mul_f32_e32 v1, s7, v1
+; SI-SDAG-NEXT: v_log_f32_e32 v1, v1
; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v6, 1.0, v3, vcc
+; SI-SDAG-NEXT: s_cselect_b32 s7, 0x42000000, 0
+; SI-SDAG-NEXT: s_and_b64 s[8:9], vcc, exec
+; SI-SDAG-NEXT: v_subrev_f32_e32 v3, s7, v1
+; SI-SDAG-NEXT: s_cselect_b32 s7, 0x4f800000, 1.0
+; SI-SDAG-NEXT: v_mov_b32_e32 v1, s7
; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v7, 0, v1, vcc
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v8, 1.0, v3, vcc
+; SI-SDAG-NEXT: s_cselect_b32 s8, 0x42000000, 0
+; SI-SDAG-NEXT: v_mul_f32_e32 v1, s6, v1
+; SI-SDAG-NEXT: s_and_b64 s[6:7], vcc, exec
+; SI-SDAG-NEXT: s_cselect_b32 s6, 0x4f800000, 1.0
; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v3, vcc
-; SI-SDAG-NEXT: v_mul_f32_e32 v4, s7, v4
-; SI-SDAG-NEXT: v_mul_f32_e32 v6, s6, v6
-; SI-SDAG-NEXT: v_mul_f32_e32 v8, s5, v8
-; SI-SDAG-NEXT: v_mul_f32_e32 v1, s4, v1
-; SI-SDAG-NEXT: v_log_f32_e32 v4, v4
-; SI-SDAG-NEXT: v_log_f32_e32 v6, v6
-; SI-SDAG-NEXT: v_log_f32_e32 v8, v8
-; SI-SDAG-NEXT: v_log_f32_e32 v9, v1
-; SI-SDAG-NEXT: s_mov_b32 s2, -1
-; SI-SDAG-NEXT: v_sub_f32_e32 v3, v4, v2
-; SI-SDAG-NEXT: v_sub_f32_e32 v2, v6, v5
-; SI-SDAG-NEXT: v_sub_f32_e32 v1, v8, v7
-; SI-SDAG-NEXT: v_sub_f32_e32 v0, v9, v0
+; SI-SDAG-NEXT: s_cselect_b32 s9, 0x42000000, 0
+; SI-SDAG-NEXT: v_mov_b32_e32 v2, s6
+; SI-SDAG-NEXT: s_and_b64 s[6:7], vcc, exec
+; SI-SDAG-NEXT: v_mul_f32_e32 v2, s5, v2
+; SI-SDAG-NEXT: s_cselect_b32 s5, 0x4f800000, 1.0
+; SI-SDAG-NEXT: v_mov_b32_e32 v0, s5
+; SI-SDAG-NEXT: v_mul_f32_e32 v0, s4, v0
+; SI-SDAG-NEXT: v_log_f32_e32 v1, v1
+; SI-SDAG-NEXT: v_log_f32_e32 v4, v2
+; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
+; SI-SDAG-NEXT: s_cselect_b32 s4, 0x42000000, 0
+; SI-SDAG-NEXT: v_subrev_f32_e32 v2, s8, v1
+; SI-SDAG-NEXT: v_subrev_f32_e32 v1, s9, v4
+; SI-SDAG-NEXT: v_subrev_f32_e32 v0, s4, v0
; SI-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-SDAG-NEXT: s_endpgm
;
@@ -889,37 +916,43 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; VI-SDAG-LABEL: s_log2_v4f32:
; VI-SDAG: ; %bb.0:
; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x4f800000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 1.0, v3, vcc
-; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v6, 1.0, v3, vcc
+; VI-SDAG-NEXT: s_and_b64 s[8:9], vcc, exec
+; VI-SDAG-NEXT: s_cselect_b32 s9, 0x4f800000, 1.0
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s9
+; VI-SDAG-NEXT: v_mul_f32_e32 v1, s7, v1
+; VI-SDAG-NEXT: v_log_f32_e32 v1, v1
+; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s6, v0
+; VI-SDAG-NEXT: s_cselect_b32 s8, 0x42000000, 0
+; VI-SDAG-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; VI-SDAG-NEXT: s_cselect_b32 s0, 0x4f800000, 1.0
; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v7, 0, v1, vcc
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v8, 1.0, v3, vcc
+; VI-SDAG-NEXT: s_cselect_b32 s7, 0x42000000, 0
+; VI-SDAG-NEXT: v_subrev_f32_e32 v3, s8, v1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; VI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec
+; VI-SDAG-NEXT: s_cselect_b32 s0, 0x4f800000, 1.0
; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
-; VI-SDAG-NEXT: v_mul_f32_e32 v4, s7, v4
-; VI-SDAG-NEXT: v_mul_f32_e32 v6, s6, v6
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v3, vcc
-; VI-SDAG-NEXT: v_log_f32_e32 v4, v4
-; VI-SDAG-NEXT: v_log_f32_e32 v6, v6
-; VI-SDAG-NEXT: v_mul_f32_e32 v8, s5, v8
-; VI-SDAG-NEXT: v_mul_f32_e32 v1, s4, v1
-; VI-SDAG-NEXT: v_log_f32_e32 v8, v8
-; VI-SDAG-NEXT: v_log_f32_e32 v9, v1
-; VI-SDAG-NEXT: v_sub_f32_e32 v3, v4, v2
-; VI-SDAG-NEXT: v_sub_f32_e32 v2, v6, v5
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s1
-; VI-SDAG-NEXT: v_sub_f32_e32 v1, v8, v7
-; VI-SDAG-NEXT: v_sub_f32_e32 v0, v9, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v4, s0
+; VI-SDAG-NEXT: v_mul_f32_e32 v1, s6, v1
+; VI-SDAG-NEXT: s_cselect_b32 s6, 0x42000000, 0
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; VI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec
+; VI-SDAG-NEXT: s_cselect_b32 s0, 0x4f800000, 1.0
+; VI-SDAG-NEXT: v_mul_f32_e32 v2, s5, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; VI-SDAG-NEXT: v_log_f32_e32 v1, v1
+; VI-SDAG-NEXT: v_log_f32_e32 v4, v2
+; VI-SDAG-NEXT: v_mul_f32_e32 v0, s4, v0
+; VI-SDAG-NEXT: v_log_f32_e32 v0, v0
+; VI-SDAG-NEXT: v_subrev_f32_e32 v2, s7, v1
+; VI-SDAG-NEXT: v_subrev_f32_e32 v1, s6, v4
+; VI-SDAG-NEXT: s_cselect_b32 s0, 0x42000000, 0
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s3
+; VI-SDAG-NEXT: v_subrev_f32_e32 v0, s0, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v4, s2
; VI-SDAG-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-SDAG-NEXT: s_endpgm
;
@@ -963,37 +996,43 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; GFX900-SDAG-LABEL: s_log2_v4f32:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34
-; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX900-SDAG-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24
; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
-; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000
-; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, 0x4f800000
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, 0
; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v5, 1.0, v3, vcc
-; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v1, vcc
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v7, 1.0, v3, vcc
+; GFX900-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; GFX900-SDAG-NEXT: s_cselect_b32 s2, 0x4f800000, 1.0
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, s2
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, s7, v1
+; GFX900-SDAG-NEXT: v_log_f32_e32 v1, v1
+; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s6, v0
+; GFX900-SDAG-NEXT: s_cselect_b32 s2, 0x42000000, 0
+; GFX900-SDAG-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; GFX900-SDAG-NEXT: s_cselect_b32 s0, 0x4f800000, 1.0
; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v8, 0, v1, vcc
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v9, 1.0, v3, vcc
+; GFX900-SDAG-NEXT: s_cselect_b32 s3, 0x42000000, 0
+; GFX900-SDAG-NEXT: v_subrev_f32_e32 v3, s2, v1
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; GFX900-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec
+; GFX900-SDAG-NEXT: s_cselect_b32 s0, 0x4f800000, 1.0
; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v3, vcc
-; GFX900-SDAG-NEXT: v_mul_f32_e32 v5, s7, v5
-; GFX900-SDAG-NEXT: v_mul_f32_e32 v7, s6, v7
-; GFX900-SDAG-NEXT: v_mul_f32_e32 v9, s5, v9
-; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, s4, v1
-; GFX900-SDAG-NEXT: v_log_f32_e32 v5, v5
-; GFX900-SDAG-NEXT: v_log_f32_e32 v7, v7
-; GFX900-SDAG-NEXT: v_log_f32_e32 v9, v9
-; GFX900-SDAG-NEXT: v_log_f32_e32 v10, v1
-; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, 0
-; GFX900-SDAG-NEXT: v_sub_f32_e32 v3, v5, v2
-; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v7, v6
-; GFX900-SDAG-NEXT: v_sub_f32_e32 v1, v9, v8
-; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v10, v0
-; GFX900-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX900-SDAG-NEXT: s_cselect_b32 s2, 0x42000000, 0
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; GFX900-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec
+; GFX900-SDAG-NEXT: s_cselect_b32 s0, 0x4f800000, 1.0
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, s6, v1
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s5, v2
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, s4, v0
+; GFX900-SDAG-NEXT: v_log_f32_e32 v1, v1
+; GFX900-SDAG-NEXT: v_log_f32_e32 v5, v2
+; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0
+; GFX900-SDAG-NEXT: s_cselect_b32 s0, 0x42000000, 0
+; GFX900-SDAG-NEXT: v_subrev_f32_e32 v2, s3, v1
+; GFX900-SDAG-NEXT: v_subrev_f32_e32 v1, s2, v5
+; GFX900-SDAG-NEXT: v_subrev_f32_e32 v0, s0, v0
+; GFX900-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9]
; GFX900-SDAG-NEXT: s_endpgm
;
; GFX900-GISEL-LABEL: s_log2_v4f32:
@@ -1034,37 +1073,41 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in)
;
; GFX1100-SDAG-LABEL: s_log2_v4f32:
; GFX1100-SDAG: ; %bb.0:
+; GFX1100-SDAG-NEXT: s_clause 0x1
; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x34
+; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, s7
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s1, 0x800000, s6
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s5
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s7
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s6
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s5
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s4
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s0
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s1
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v6, 1.0, 0x4f800000, s8
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v7, 1.0, 0x4f800000, s9
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x42000000, s0
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s1
-; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
-; GFX1100-SDAG-NEXT: v_dual_mul_f32 v2, s7, v2 :: v_dual_mul_f32 v3, s6, v3
-; GFX1100-SDAG-NEXT: v_dual_mul_f32 v6, s5, v6 :: v_dual_mul_f32 v7, s4, v7
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x42000000, s8
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1100-SDAG-NEXT: v_log_f32_e32 v2, v2
-; GFX1100-SDAG-NEXT: v_log_f32_e32 v8, v3
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(TRANS32_DEP_3)
-; GFX1100-SDAG-NEXT: v_log_f32_e32 v6, v6
-; GFX1100-SDAG-NEXT: v_log_f32_e32 v7, v7
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 0x42000000, s9
-; GFX1100-SDAG-NEXT: v_mov_b32_e32 v9, 0
-; GFX1100-SDAG-NEXT: v_dual_sub_f32 v3, v2, v0 :: v_dual_sub_f32 v2, v8, v1
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX1100-SDAG-NEXT: s_and_b32 s8, s8, exec_lo
+; GFX1100-SDAG-NEXT: s_cselect_b32 s8, 0x42000000, 0
+; GFX1100-SDAG-NEXT: s_cselect_b32 s10, 0x4f800000, 1.0
+; GFX1100-SDAG-NEXT: s_and_b32 s2, s2, exec_lo
+; GFX1100-SDAG-NEXT: s_cselect_b32 s2, 0x42000000, 0
+; GFX1100-SDAG-NEXT: s_cselect_b32 s11, 0x4f800000, 1.0
+; GFX1100-SDAG-NEXT: s_and_b32 s3, s3, exec_lo
+; GFX1100-SDAG-NEXT: v_mul_f32_e64 v0, s7, s10
+; GFX1100-SDAG-NEXT: s_cselect_b32 s3, 0x42000000, 0
+; GFX1100-SDAG-NEXT: s_cselect_b32 s7, 0x4f800000, 1.0
+; GFX1100-SDAG-NEXT: s_and_b32 s9, s9, exec_lo
+; GFX1100-SDAG-NEXT: v_mul_f32_e64 v1, s6, s11
+; GFX1100-SDAG-NEXT: s_cselect_b32 s6, 0x4f800000, 1.0
+; GFX1100-SDAG-NEXT: v_mul_f32_e64 v2, s5, s7
+; GFX1100-SDAG-NEXT: v_mul_f32_e64 v3, s4, s6
+; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
+; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, v1
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v6, 0
+; GFX1100-SDAG-NEXT: v_log_f32_e32 v4, v2
+; GFX1100-SDAG-NEXT: v_log_f32_e32 v5, v3
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(TRANS32_DEP_3)
+; GFX1100-SDAG-NEXT: v_dual_subrev_f32 v3, s8, v0 :: v_dual_subrev_f32 v2, s2, v1
+; GFX1100-SDAG-NEXT: s_cselect_b32 s2, 0x42000000, 0
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
-; GFX1100-SDAG-NEXT: v_dual_sub_f32 v1, v6, v4 :: v_dual_sub_f32 v0, v7, v5
-; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-SDAG-NEXT: global_store_b128 v9, v[0:3], s[0:1]
+; GFX1100-SDAG-NEXT: v_dual_subrev_f32 v1, s3, v4 :: v_dual_subrev_f32 v0, s2, v5
+; GFX1100-SDAG-NEXT: global_store_b128 v6, v[0:3], s[0:1]
; GFX1100-SDAG-NEXT: s_nop 0
; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-SDAG-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.ll
index e7b17c30cf7535..6c51b543411df1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.round.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.round.ll
@@ -10,16 +10,18 @@ define amdgpu_kernel void @round_f32(ptr addrspace(1) %out, float %x) #0 {
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dword s6, s[2:3], 0xb
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
+; GFX6-NEXT: s_brev_b32 s7, -2
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_trunc_f32_e32 v0, s6
; GFX6-NEXT: v_sub_f32_e32 v1, s6, v0
; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, 0.5
-; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[4:5]
-; GFX6-NEXT: s_brev_b32 s4, -2
+; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec
+; GFX6-NEXT: s_cselect_b32 s4, 1.0, 0
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
; GFX6-NEXT: v_mov_b32_e32 v2, s6
-; GFX6-NEXT: v_bfi_b32 v1, s4, v1, v2
+; GFX6-NEXT: v_bfi_b32 v1, s7, v1, v2
; GFX6-NEXT: v_add_f32_e32 v0, v0, v1
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
@@ -28,16 +30,18 @@ define amdgpu_kernel void @round_f32(ptr addrspace(1) %out, float %x) #0 {
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dword s6, s[2:3], 0x2c
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX8-NEXT: s_brev_b32 s7, -2
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_trunc_f32_e32 v0, s6
; GFX8-NEXT: v_sub_f32_e32 v1, s6, v0
; GFX8-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, 0.5
-; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[4:5]
-; GFX8-NEXT: s_brev_b32 s4, -2
+; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec
+; GFX8-NEXT: s_cselect_b32 s4, 1.0, 0
+; GFX8-NEXT: v_mov_b32_e32 v1, s4
; GFX8-NEXT: v_mov_b32_e32 v2, s6
-; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v2
+; GFX8-NEXT: v_bfi_b32 v1, s7, v1, v2
; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
@@ -46,16 +50,18 @@ define amdgpu_kernel void @round_f32(ptr addrspace(1) %out, float %x) #0 {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9-NEXT: s_brev_b32 s7, -2
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_trunc_f32_e32 v0, s6
; GFX9-NEXT: v_sub_f32_e32 v1, s6, v0
; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, 0.5
-; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[4:5]
-; GFX9-NEXT: s_brev_b32 s4, -2
+; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec
+; GFX9-NEXT: s_cselect_b32 s4, 1.0, 0
+; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: v_mov_b32_e32 v2, s6
-; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v2
+; GFX9-NEXT: v_bfi_b32 v1, s7, v1, v2
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
@@ -71,10 +77,13 @@ define amdgpu_kernel void @round_f32(ptr addrspace(1) %out, float %x) #0 {
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_sub_f32_e32 v1, s4, v0
; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v1|, 0.5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s2
+; GFX11-NEXT: v_mov_b32_e32 v1, s4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: s_and_b32 s2, s2, exec_lo
+; GFX11-NEXT: s_cselect_b32 s2, 1.0, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s2, v1
; GFX11-NEXT: s_mov_b32 s2, -1
-; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_f32_e32 v0, v0, v1
; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
@@ -119,14 +128,18 @@ define amdgpu_kernel void @round_v2f32(ptr addrspace(1) %out, <2 x float> %in) #
; GFX6-NEXT: s_mov_b32 s4, s0
; GFX6-NEXT: s_mov_b32 s5, s1
; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
-; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
+; GFX6-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; GFX6-NEXT: s_cselect_b32 s0, 1.0, 0
+; GFX6-NEXT: v_mov_b32_e32 v1, s0
; GFX6-NEXT: v_mov_b32_e32 v2, s3
; GFX6-NEXT: v_bfi_b32 v1, s8, v1, v2
; GFX6-NEXT: v_add_f32_e32 v1, v0, v1
; GFX6-NEXT: v_trunc_f32_e32 v0, s2
; GFX6-NEXT: v_sub_f32_e32 v2, s2, v0
; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, 0.5
-; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[0:1]
+; GFX6-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; GFX6-NEXT: s_cselect_b32 s0, 1.0, 0
+; GFX6-NEXT: v_mov_b32_e32 v2, s0
; GFX6-NEXT: v_mov_b32_e32 v3, s2
; GFX6-NEXT: v_bfi_b32 v2, s8, v2, v3
; GFX6-NEXT: v_add_f32_e32 v0, v0, v2
@@ -145,14 +158,18 @@ define amdgpu_kernel void @round_v2f32(ptr addrspace(1) %out, <2 x float> %in) #
; GFX8-NEXT: s_mov_b32 s4, s0
; GFX8-NEXT: s_mov_b32 s5, s1
; GFX8-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
-; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
+; GFX8-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; GFX8-NEXT: s_cselect_b32 s0, 1.0, 0
+; GFX8-NEXT: v_mov_b32_e32 v1, s0
; GFX8-NEXT: v_mov_b32_e32 v2, s3
; GFX8-NEXT: v_bfi_b32 v1, s8, v1, v2
; GFX8-NEXT: v_add_f32_e32 v1, v0, v1
; GFX8-NEXT: v_trunc_f32_e32 v0, s2
; GFX8-NEXT: v_sub_f32_e32 v2, s2, v0
; GFX8-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, 0.5
-; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[0:1]
+; GFX8-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; GFX8-NEXT: s_cselect_b32 s0, 1.0, 0
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: v_mov_b32_e32 v3, s2
; GFX8-NEXT: v_bfi_b32 v2, s8, v2, v3
; GFX8-NEXT: v_add_f32_e32 v0, v0, v2
@@ -171,14 +188,18 @@ define amdgpu_kernel void @round_v2f32(ptr addrspace(1) %out, <2 x float> %in) #
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, 0.5
-; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[4:5]
+; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec
+; GFX9-NEXT: s_cselect_b32 s4, 1.0, 0
+; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: v_mov_b32_e32 v2, s7
; GFX9-NEXT: v_bfi_b32 v1, s8, v1, v2
; GFX9-NEXT: v_add_f32_e32 v1, v0, v1
; GFX9-NEXT: v_trunc_f32_e32 v0, s6
; GFX9-NEXT: v_sub_f32_e32 v2, s6, v0
; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
-; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[4:5]
+; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec
+; GFX9-NEXT: s_cselect_b32 s4, 1.0, 0
+; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: v_mov_b32_e32 v3, s6
; GFX9-NEXT: v_bfi_b32 v2, s8, v2, v3
; GFX9-NEXT: v_add_f32_e32 v0, v0, v2
@@ -194,20 +215,22 @@ define amdgpu_kernel void @round_v2f32(ptr addrspace(1) %out, <2 x float> %in) #
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_sub_f32_e32 v1, s3, v0
; GFX11-NEXT: v_sub_f32_e32 v3, s2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_ge_f32_e64 s4, |v1|, 0.5
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cmp_ge_f32_e64 s4, |v3|, 0.5
-; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, s3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, s4
+; GFX11-NEXT: v_cmp_ge_f32_e64 s5, |v3|, 0.5
+; GFX11-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-NEXT: v_mov_b32_e32 v3, s2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: s_and_b32 s2, s4, exec_lo
+; GFX11-NEXT: s_cselect_b32 s2, 1.0, 0
+; GFX11-NEXT: s_and_b32 s3, s5, exec_lo
+; GFX11-NEXT: s_cselect_b32 s3, 1.0, 0
+; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s2, v1
+; GFX11-NEXT: v_bfi_b32 v3, 0x7fffffff, s3, v3
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: v_add_f32_e32 v1, v0, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, s2
; GFX11-NEXT: s_mov_b32 s2, -1
-; GFX11-NEXT: v_add_f32_e32 v0, v2, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_dual_add_f32 v1, v0, v1 :: v_dual_add_f32 v0, v2, v3
; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -251,28 +274,36 @@ define amdgpu_kernel void @round_v4f32(ptr addrspace(1) %out, <4 x float> %in) #
; GFX6-NEXT: v_trunc_f32_e32 v0, s7
; GFX6-NEXT: v_sub_f32_e32 v1, s7, v0
; GFX6-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, 0.5
-; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[8:9]
+; GFX6-NEXT: s_and_b64 s[8:9], s[8:9], exec
+; GFX6-NEXT: s_cselect_b32 s8, 1.0, 0
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
; GFX6-NEXT: v_mov_b32_e32 v2, s7
; GFX6-NEXT: v_bfi_b32 v1, s10, v1, v2
; GFX6-NEXT: v_add_f32_e32 v3, v0, v1
; GFX6-NEXT: v_trunc_f32_e32 v0, s6
; GFX6-NEXT: v_sub_f32_e32 v1, s6, v0
; GFX6-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, 0.5
-; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[8:9]
+; GFX6-NEXT: s_and_b64 s[8:9], s[8:9], exec
+; GFX6-NEXT: s_cselect_b32 s7, 1.0, 0
+; GFX6-NEXT: v_mov_b32_e32 v1, s7
; GFX6-NEXT: v_mov_b32_e32 v2, s6
; GFX6-NEXT: v_bfi_b32 v1, s10, v1, v2
; GFX6-NEXT: v_add_f32_e32 v2, v0, v1
; GFX6-NEXT: v_trunc_f32_e32 v0, s5
; GFX6-NEXT: v_sub_f32_e32 v1, s5, v0
; GFX6-NEXT: v_cmp_ge_f32_e64 s[6:7], |v1|, 0.5
-; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[6:7]
+; GFX6-NEXT: s_and_b64 s[6:7], s[6:7], exec
+; GFX6-NEXT: s_cselect_b32 s6, 1.0, 0
+; GFX6-NEXT: v_mov_b32_e32 v1, s6
; GFX6-NEXT: v_mov_b32_e32 v4, s5
; GFX6-NEXT: v_bfi_b32 v1, s10, v1, v4
; GFX6-NEXT: v_add_f32_e32 v1, v0, v1
; GFX6-NEXT: v_trunc_f32_e32 v0, s4
; GFX6-NEXT: v_sub_f32_e32 v4, s4, v0
; GFX6-NEXT: v_cmp_ge_f32_e64 s[6:7], |v4|, 0.5
-; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[6:7]
+; GFX6-NEXT: s_and_b64 s[6:7], s[6:7], exec
+; GFX6-NEXT: s_cselect_b32 s5, 1.0, 0
+; GFX6-NEXT: v_mov_b32_e32 v4, s5
; GFX6-NEXT: v_mov_b32_e32 v5, s4
; GFX6-NEXT: v_bfi_b32 v4, s10, v4, v5
; GFX6-NEXT: v_add_f32_e32 v0, v0, v4
@@ -290,28 +321,36 @@ define amdgpu_kernel void @round_v4f32(ptr addrspace(1) %out, <4 x float> %in) #
; GFX8-NEXT: v_trunc_f32_e32 v0, s7
; GFX8-NEXT: v_sub_f32_e32 v1, s7, v0
; GFX8-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, 0.5
-; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[8:9]
+; GFX8-NEXT: s_and_b64 s[8:9], s[8:9], exec
+; GFX8-NEXT: s_cselect_b32 s8, 1.0, 0
+; GFX8-NEXT: v_mov_b32_e32 v1, s8
; GFX8-NEXT: v_mov_b32_e32 v2, s7
; GFX8-NEXT: v_bfi_b32 v1, s10, v1, v2
; GFX8-NEXT: v_add_f32_e32 v3, v0, v1
; GFX8-NEXT: v_trunc_f32_e32 v0, s6
; GFX8-NEXT: v_sub_f32_e32 v1, s6, v0
; GFX8-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, 0.5
-; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[8:9]
+; GFX8-NEXT: s_and_b64 s[8:9], s[8:9], exec
+; GFX8-NEXT: s_cselect_b32 s7, 1.0, 0
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: v_mov_b32_e32 v2, s6
; GFX8-NEXT: v_bfi_b32 v1, s10, v1, v2
; GFX8-NEXT: v_add_f32_e32 v2, v0, v1
; GFX8-NEXT: v_trunc_f32_e32 v0, s5
; GFX8-NEXT: v_sub_f32_e32 v1, s5, v0
; GFX8-NEXT: v_cmp_ge_f32_e64 s[6:7], |v1|, 0.5
-; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[6:7]
+; GFX8-NEXT: s_and_b64 s[6:7], s[6:7], exec
+; GFX8-NEXT: s_cselect_b32 s6, 1.0, 0
+; GFX8-NEXT: v_mov_b32_e32 v1, s6
; GFX8-NEXT: v_mov_b32_e32 v4, s5
; GFX8-NEXT: v_bfi_b32 v1, s10, v1, v4
; GFX8-NEXT: v_add_f32_e32 v1, v0, v1
; GFX8-NEXT: v_trunc_f32_e32 v0, s4
; GFX8-NEXT: v_sub_f32_e32 v4, s4, v0
; GFX8-NEXT: v_cmp_ge_f32_e64 s[6:7], |v4|, 0.5
-; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[6:7]
+; GFX8-NEXT: s_and_b64 s[6:7], s[6:7], exec
+; GFX8-NEXT: s_cselect_b32 s5, 1.0, 0
+; GFX8-NEXT: v_mov_b32_e32 v4, s5
; GFX8-NEXT: v_mov_b32_e32 v5, s4
; GFX8-NEXT: v_bfi_b32 v4, s10, v4, v5
; GFX8-NEXT: v_add_f32_e32 v0, v0, v4
@@ -329,28 +368,36 @@ define amdgpu_kernel void @round_v4f32(ptr addrspace(1) %out, <4 x float> %in) #
; GFX9-NEXT: v_trunc_f32_e32 v0, s7
; GFX9-NEXT: v_sub_f32_e32 v1, s7, v0
; GFX9-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, 0.5
-; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[8:9]
+; GFX9-NEXT: s_and_b64 s[8:9], s[8:9], exec
+; GFX9-NEXT: s_cselect_b32 s8, 1.0, 0
+; GFX9-NEXT: v_mov_b32_e32 v1, s8
; GFX9-NEXT: v_mov_b32_e32 v2, s7
; GFX9-NEXT: v_bfi_b32 v1, s10, v1, v2
; GFX9-NEXT: v_add_f32_e32 v3, v0, v1
; GFX9-NEXT: v_trunc_f32_e32 v0, s6
; GFX9-NEXT: v_sub_f32_e32 v1, s6, v0
; GFX9-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, 0.5
-; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[8:9]
+; GFX9-NEXT: s_and_b64 s[8:9], s[8:9], exec
+; GFX9-NEXT: s_cselect_b32 s7, 1.0, 0
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: v_mov_b32_e32 v2, s6
; GFX9-NEXT: v_bfi_b32 v1, s10, v1, v2
; GFX9-NEXT: v_add_f32_e32 v2, v0, v1
; GFX9-NEXT: v_trunc_f32_e32 v0, s5
; GFX9-NEXT: v_sub_f32_e32 v1, s5, v0
; GFX9-NEXT: v_cmp_ge_f32_e64 s[6:7], |v1|, 0.5
-; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[6:7]
+; GFX9-NEXT: s_and_b64 s[6:7], s[6:7], exec
+; GFX9-NEXT: s_cselect_b32 s6, 1.0, 0
+; GFX9-NEXT: v_mov_b32_e32 v1, s6
; GFX9-NEXT: v_mov_b32_e32 v4, s5
; GFX9-NEXT: v_bfi_b32 v1, s10, v1, v4
; GFX9-NEXT: v_add_f32_e32 v1, v0, v1
; GFX9-NEXT: v_trunc_f32_e32 v0, s4
; GFX9-NEXT: v_sub_f32_e32 v4, s4, v0
; GFX9-NEXT: v_cmp_ge_f32_e64 s[6:7], |v4|, 0.5
-; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[6:7]
+; GFX9-NEXT: s_and_b64 s[6:7], s[6:7], exec
+; GFX9-NEXT: s_cselect_b32 s5, 1.0, 0
+; GFX9-NEXT: v_mov_b32_e32 v4, s5
; GFX9-NEXT: v_mov_b32_e32 v5, s4
; GFX9-NEXT: v_bfi_b32 v4, s10, v4, v5
; GFX9-NEXT: v_add_f32_e32 v0, v0, v4
@@ -362,36 +409,40 @@ define amdgpu_kernel void @round_v4f32(ptr addrspace(1) %out, <4 x float> %in) #
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x34
; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_trunc_f32_e32 v0, s7
-; GFX11-NEXT: v_trunc_f32_e32 v1, s6
+; GFX11-NEXT: v_trunc_f32_e32 v2, s6
; GFX11-NEXT: v_trunc_f32_e32 v4, s5
; GFX11-NEXT: v_trunc_f32_e32 v5, s4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_dual_sub_f32 v2, s7, v0 :: v_dual_sub_f32 v3, s6, v1
-; GFX11-NEXT: v_dual_sub_f32 v6, s5, v4 :: v_dual_sub_f32 v7, s4, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v2|, 0.5
-; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v3|, 0.5
-; GFX11-NEXT: v_bfi_b32 v2, 0x7fffffff, v2, s7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, s2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_dual_mov_b32 v1, s7 :: v_dual_sub_f32 v6, s7, v0
+; GFX11-NEXT: v_dual_mov_b32 v3, s6 :: v_dual_sub_f32 v8, s6, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_sub_f32_e32 v9, s5, v4
+; GFX11-NEXT: v_dual_mov_b32 v7, s5 :: v_dual_sub_f32 v10, s4, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v6|, 0.5
-; GFX11-NEXT: v_bfi_b32 v8, 0x7fffffff, v3, s6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, 1.0, s2
-; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v7|, 0.5
-; GFX11-NEXT: v_dual_add_f32 v3, v0, v2 :: v_dual_add_f32 v2, v1, v8
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfi_b32 v6, 0x7fffffff, v6, s5
-; GFX11-NEXT: v_cndmask_b32_e64 v7, 0, 1.0, s2
+; GFX11-NEXT: v_cmp_ge_f32_e64 s3, |v8|, 0.5
+; GFX11-NEXT: v_mov_b32_e32 v6, s4
+; GFX11-NEXT: v_cmp_ge_f32_e64 s4, |v9|, 0.5
+; GFX11-NEXT: v_cmp_ge_f32_e64 s5, |v10|, 0.5
+; GFX11-NEXT: s_and_b32 s2, s2, exec_lo
+; GFX11-NEXT: s_cselect_b32 s2, 1.0, 0
+; GFX11-NEXT: s_and_b32 s3, s3, exec_lo
+; GFX11-NEXT: s_cselect_b32 s3, 1.0, 0
+; GFX11-NEXT: s_and_b32 s4, s4, exec_lo
+; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s2, v1
+; GFX11-NEXT: s_cselect_b32 s2, 1.0, 0
+; GFX11-NEXT: s_and_b32 s4, s5, exec_lo
+; GFX11-NEXT: v_bfi_b32 v8, 0x7fffffff, s3, v3
+; GFX11-NEXT: s_cselect_b32 s3, 1.0, 0
+; GFX11-NEXT: v_bfi_b32 v7, 0x7fffffff, s2, v7
+; GFX11-NEXT: v_bfi_b32 v6, 0x7fffffff, s3, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_dual_add_f32 v3, v0, v1 :: v_dual_add_f32 v2, v2, v8
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: v_dual_add_f32 v1, v4, v7 :: v_dual_add_f32 v0, v5, v6
; GFX11-NEXT: s_mov_b32 s2, -1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_bfi_b32 v7, 0x7fffffff, v7, s4
-; GFX11-NEXT: v_dual_add_f32 v1, v4, v6 :: v_dual_add_f32 v0, v5, v7
; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -439,66 +490,82 @@ define amdgpu_kernel void @round_v8f32(ptr addrspace(1) %out, <8 x float> %in) #
; GFX6-LABEL: round_v8f32:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x11
+; GFX6-NEXT: s_brev_b32 s16, -2
; GFX6-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9
-; GFX6-NEXT: s_brev_b32 s2, -2
; GFX6-NEXT: s_mov_b32 s15, 0xf000
; GFX6-NEXT: s_mov_b32 s14, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_trunc_f32_e32 v0, s7
; GFX6-NEXT: v_sub_f32_e32 v1, s7, v0
; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
-; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
+; GFX6-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; GFX6-NEXT: s_cselect_b32 s0, 1.0, 0
+; GFX6-NEXT: v_mov_b32_e32 v1, s0
; GFX6-NEXT: v_mov_b32_e32 v2, s7
-; GFX6-NEXT: v_bfi_b32 v1, s2, v1, v2
+; GFX6-NEXT: v_bfi_b32 v1, s16, v1, v2
; GFX6-NEXT: v_add_f32_e32 v3, v0, v1
; GFX6-NEXT: v_trunc_f32_e32 v0, s6
; GFX6-NEXT: v_sub_f32_e32 v1, s6, v0
; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
-; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
+; GFX6-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; GFX6-NEXT: s_cselect_b32 s0, 1.0, 0
+; GFX6-NEXT: v_mov_b32_e32 v1, s0
; GFX6-NEXT: v_mov_b32_e32 v2, s6
-; GFX6-NEXT: v_bfi_b32 v1, s2, v1, v2
+; GFX6-NEXT: v_bfi_b32 v1, s16, v1, v2
; GFX6-NEXT: v_add_f32_e32 v2, v0, v1
; GFX6-NEXT: v_trunc_f32_e32 v0, s5
; GFX6-NEXT: v_sub_f32_e32 v1, s5, v0
; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
-; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
+; GFX6-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; GFX6-NEXT: s_cselect_b32 s0, 1.0, 0
+; GFX6-NEXT: v_mov_b32_e32 v1, s0
; GFX6-NEXT: v_mov_b32_e32 v4, s5
-; GFX6-NEXT: v_bfi_b32 v1, s2, v1, v4
+; GFX6-NEXT: v_bfi_b32 v1, s16, v1, v4
; GFX6-NEXT: v_add_f32_e32 v1, v0, v1
; GFX6-NEXT: v_trunc_f32_e32 v0, s4
; GFX6-NEXT: v_sub_f32_e32 v4, s4, v0
; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, 0.5
-; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[0:1]
+; GFX6-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; GFX6-NEXT: s_cselect_b32 s0, 1.0, 0
+; GFX6-NEXT: v_mov_b32_e32 v4, s0
; GFX6-NEXT: v_mov_b32_e32 v5, s4
-; GFX6-NEXT: v_bfi_b32 v4, s2, v4, v5
+; GFX6-NEXT: v_bfi_b32 v4, s16, v4, v5
; GFX6-NEXT: v_add_f32_e32 v0, v0, v4
; GFX6-NEXT: v_trunc_f32_e32 v4, s11
; GFX6-NEXT: v_sub_f32_e32 v5, s11, v4
; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5
-; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[0:1]
+; GFX6-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; GFX6-NEXT: s_cselect_b32 s0, 1.0, 0
+; GFX6-NEXT: v_mov_b32_e32 v5, s0
; GFX6-NEXT: v_mov_b32_e32 v6, s11
-; GFX6-NEXT: v_bfi_b32 v5, s2, v5, v6
+; GFX6-NEXT: v_bfi_b32 v5, s16, v5, v6
; GFX6-NEXT: v_add_f32_e32 v7, v4, v5
; GFX6-NEXT: v_trunc_f32_e32 v4, s10
; GFX6-NEXT: v_sub_f32_e32 v5, s10, v4
; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5
-; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[0:1]
+; GFX6-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; GFX6-NEXT: s_cselect_b32 s0, 1.0, 0
+; GFX6-NEXT: v_mov_b32_e32 v5, s0
; GFX6-NEXT: v_mov_b32_e32 v6, s10
-; GFX6-NEXT: v_bfi_b32 v5, s2, v5, v6
+; GFX6-NEXT: v_bfi_b32 v5, s16, v5, v6
; GFX6-NEXT: v_add_f32_e32 v6, v4, v5
; GFX6-NEXT: v_trunc_f32_e32 v4, s9
; GFX6-NEXT: v_sub_f32_e32 v5, s9, v4
; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5
-; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[0:1]
+; GFX6-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; GFX6-NEXT: s_cselect_b32 s0, 1.0, 0
+; GFX6-NEXT: v_mov_b32_e32 v5, s0
; GFX6-NEXT: v_mov_b32_e32 v8, s9
-; GFX6-NEXT: v_bfi_b32 v5, s2, v5, v8
+; GFX6-NEXT: v_bfi_b32 v5, s16, v5, v8
; GFX6-NEXT: v_add_f32_e32 v5, v4, v5
; GFX6-NEXT: v_trunc_f32_e32 v4, s8
; GFX6-NEXT: v_sub_f32_e32 v8, s8, v4
; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v8|, 0.5
-; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, 1.0, s[0:1]
+; GFX6-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; GFX6-NEXT: s_cselect_b32 s0, 1.0, 0
+; GFX6-NEXT: v_mov_b32_e32 v8, s0
; GFX6-NEXT: v_mov_b32_e32 v9, s8
-; GFX6-NEXT: v_bfi_b32 v8, s2, v8, v9
+; GFX6-NEXT: v_bfi_b32 v8, s16, v8, v9
; GFX6-NEXT: v_add_f32_e32 v4, v4, v8
; GFX6-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0 offset:16
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0
@@ -507,66 +574,82 @@ define amdgpu_kernel void @round_v8f32(ptr addrspace(1) %out, <8 x float> %in) #
; GFX8-LABEL: round_v8f32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44
+; GFX8-NEXT: s_brev_b32 s16, -2
; GFX8-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x24
-; GFX8-NEXT: s_brev_b32 s2, -2
; GFX8-NEXT: s_mov_b32 s15, 0xf000
; GFX8-NEXT: s_mov_b32 s14, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_trunc_f32_e32 v0, s7
; GFX8-NEXT: v_sub_f32_e32 v1, s7, v0
; GFX8-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
-; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
+; GFX8-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; GFX8-NEXT: s_cselect_b32 s0, 1.0, 0
+; GFX8-NEXT: v_mov_b32_e32 v1, s0
; GFX8-NEXT: v_mov_b32_e32 v2, s7
-; GFX8-NEXT: v_bfi_b32 v1, s2, v1, v2
+; GFX8-NEXT: v_bfi_b32 v1, s16, v1, v2
; GFX8-NEXT: v_add_f32_e32 v3, v0, v1
; GFX8-NEXT: v_trunc_f32_e32 v0, s6
; GFX8-NEXT: v_sub_f32_e32 v1, s6, v0
; GFX8-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
-; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
+; GFX8-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; GFX8-NEXT: s_cselect_b32 s0, 1.0, 0
+; GFX8-NEXT: v_mov_b32_e32 v1, s0
; GFX8-NEXT: v_mov_b32_e32 v2, s6
-; GFX8-NEXT: v_bfi_b32 v1, s2, v1, v2
+; GFX8-NEXT: v_bfi_b32 v1, s16, v1, v2
; GFX8-NEXT: v_add_f32_e32 v2, v0, v1
; GFX8-NEXT: v_trunc_f32_e32 v0, s5
; GFX8-NEXT: v_sub_f32_e32 v1, s5, v0
; GFX8-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
-; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
+; GFX8-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; GFX8-NEXT: s_cselect_b32 s0, 1.0, 0
+; GFX8-NEXT: v_mov_b32_e32 v1, s0
; GFX8-NEXT: v_mov_b32_e32 v4, s5
-; GFX8-NEXT: v_bfi_b32 v1, s2, v1, v4
+; GFX8-NEXT: v_bfi_b32 v1, s16, v1, v4
; GFX8-NEXT: v_add_f32_e32 v1, v0, v1
; GFX8-NEXT: v_trunc_f32_e32 v0, s4
; GFX8-NEXT: v_sub_f32_e32 v4, s4, v0
; GFX8-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, 0.5
-; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[0:1]
+; GFX8-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; GFX8-NEXT: s_cselect_b32 s0, 1.0, 0
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: v_mov_b32_e32 v5, s4
-; GFX8-NEXT: v_bfi_b32 v4, s2, v4, v5
+; GFX8-NEXT: v_bfi_b32 v4, s16, v4, v5
; GFX8-NEXT: v_add_f32_e32 v0, v0, v4
; GFX8-NEXT: v_trunc_f32_e32 v4, s11
; GFX8-NEXT: v_sub_f32_e32 v5, s11, v4
; GFX8-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5
-; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[0:1]
+; GFX8-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; GFX8-NEXT: s_cselect_b32 s0, 1.0, 0
+; GFX8-NEXT: v_mov_b32_e32 v5, s0
; GFX8-NEXT: v_mov_b32_e32 v6, s11
-; GFX8-NEXT: v_bfi_b32 v5, s2, v5, v6
+; GFX8-NEXT: v_bfi_b32 v5, s16, v5, v6
; GFX8-NEXT: v_add_f32_e32 v7, v4, v5
; GFX8-NEXT: v_trunc_f32_e32 v4, s10
; GFX8-NEXT: v_sub_f32_e32 v5, s10, v4
; GFX8-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5
-; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[0:1]
+; GFX8-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; GFX8-NEXT: s_cselect_b32 s0, 1.0, 0
+; GFX8-NEXT: v_mov_b32_e32 v5, s0
; GFX8-NEXT: v_mov_b32_e32 v6, s10
-; GFX8-NEXT: v_bfi_b32 v5, s2, v5, v6
+; GFX8-NEXT: v_bfi_b32 v5, s16, v5, v6
; GFX8-NEXT: v_add_f32_e32 v6, v4, v5
; GFX8-NEXT: v_trunc_f32_e32 v4, s9
; GFX8-NEXT: v_sub_f32_e32 v5, s9, v4
; GFX8-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5
-; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[0:1]
+; GFX8-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; GFX8-NEXT: s_cselect_b32 s0, 1.0, 0
+; GFX8-NEXT: v_mov_b32_e32 v5, s0
; GFX8-NEXT: v_mov_b32_e32 v8, s9
-; GFX8-NEXT: v_bfi_b32 v5, s2, v5, v8
+; GFX8-NEXT: v_bfi_b32 v5, s16, v5, v8
; GFX8-NEXT: v_add_f32_e32 v5, v4, v5
; GFX8-NEXT: v_trunc_f32_e32 v4, s8
; GFX8-NEXT: v_sub_f32_e32 v8, s8, v4
; GFX8-NEXT: v_cmp_ge_f32_e64 s[0:1], |v8|, 0.5
-; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1.0, s[0:1]
+; GFX8-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; GFX8-NEXT: s_cselect_b32 s0, 1.0, 0
+; GFX8-NEXT: v_mov_b32_e32 v8, s0
; GFX8-NEXT: v_mov_b32_e32 v9, s8
-; GFX8-NEXT: v_bfi_b32 v8, s2, v8, v9
+; GFX8-NEXT: v_bfi_b32 v8, s16, v8, v9
; GFX8-NEXT: v_add_f32_e32 v4, v4, v8
; GFX8-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0 offset:16
; GFX8-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0
@@ -575,66 +658,82 @@ define amdgpu_kernel void @round_v8f32(ptr addrspace(1) %out, <8 x float> %in) #
; GFX9-LABEL: round_v8f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44
+; GFX9-NEXT: s_brev_b32 s16, -2
; GFX9-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x24
-; GFX9-NEXT: s_brev_b32 s2, -2
; GFX9-NEXT: s_mov_b32 s15, 0xf000
; GFX9-NEXT: s_mov_b32 s14, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_trunc_f32_e32 v0, s7
; GFX9-NEXT: v_sub_f32_e32 v1, s7, v0
; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
-; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
+; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; GFX9-NEXT: s_cselect_b32 s0, 1.0, 0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s7
-; GFX9-NEXT: v_bfi_b32 v1, s2, v1, v2
+; GFX9-NEXT: v_bfi_b32 v1, s16, v1, v2
; GFX9-NEXT: v_add_f32_e32 v3, v0, v1
; GFX9-NEXT: v_trunc_f32_e32 v0, s6
; GFX9-NEXT: v_sub_f32_e32 v1, s6, v0
; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
-; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
+; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; GFX9-NEXT: s_cselect_b32 s0, 1.0, 0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s6
-; GFX9-NEXT: v_bfi_b32 v1, s2, v1, v2
+; GFX9-NEXT: v_bfi_b32 v1, s16, v1, v2
; GFX9-NEXT: v_add_f32_e32 v2, v0, v1
; GFX9-NEXT: v_trunc_f32_e32 v0, s5
; GFX9-NEXT: v_sub_f32_e32 v1, s5, v0
; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
-; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
+; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; GFX9-NEXT: s_cselect_b32 s0, 1.0, 0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v4, s5
-; GFX9-NEXT: v_bfi_b32 v1, s2, v1, v4
+; GFX9-NEXT: v_bfi_b32 v1, s16, v1, v4
; GFX9-NEXT: v_add_f32_e32 v1, v0, v1
; GFX9-NEXT: v_trunc_f32_e32 v0, s4
; GFX9-NEXT: v_sub_f32_e32 v4, s4, v0
; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, 0.5
-; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[0:1]
+; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; GFX9-NEXT: s_cselect_b32 s0, 1.0, 0
+; GFX9-NEXT: v_mov_b32_e32 v4, s0
; GFX9-NEXT: v_mov_b32_e32 v5, s4
-; GFX9-NEXT: v_bfi_b32 v4, s2, v4, v5
+; GFX9-NEXT: v_bfi_b32 v4, s16, v4, v5
; GFX9-NEXT: v_add_f32_e32 v0, v0, v4
; GFX9-NEXT: v_trunc_f32_e32 v4, s11
; GFX9-NEXT: v_sub_f32_e32 v5, s11, v4
; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5
-; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[0:1]
+; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; GFX9-NEXT: s_cselect_b32 s0, 1.0, 0
+; GFX9-NEXT: v_mov_b32_e32 v5, s0
; GFX9-NEXT: v_mov_b32_e32 v6, s11
-; GFX9-NEXT: v_bfi_b32 v5, s2, v5, v6
+; GFX9-NEXT: v_bfi_b32 v5, s16, v5, v6
; GFX9-NEXT: v_add_f32_e32 v7, v4, v5
; GFX9-NEXT: v_trunc_f32_e32 v4, s10
; GFX9-NEXT: v_sub_f32_e32 v5, s10, v4
; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5
-; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[0:1]
+; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; GFX9-NEXT: s_cselect_b32 s0, 1.0, 0
+; GFX9-NEXT: v_mov_b32_e32 v5, s0
; GFX9-NEXT: v_mov_b32_e32 v6, s10
-; GFX9-NEXT: v_bfi_b32 v5, s2, v5, v6
+; GFX9-NEXT: v_bfi_b32 v5, s16, v5, v6
; GFX9-NEXT: v_add_f32_e32 v6, v4, v5
; GFX9-NEXT: v_trunc_f32_e32 v4, s9
; GFX9-NEXT: v_sub_f32_e32 v5, s9, v4
; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5
-; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[0:1]
+; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; GFX9-NEXT: s_cselect_b32 s0, 1.0, 0
+; GFX9-NEXT: v_mov_b32_e32 v5, s0
; GFX9-NEXT: v_mov_b32_e32 v8, s9
-; GFX9-NEXT: v_bfi_b32 v5, s2, v5, v8
+; GFX9-NEXT: v_bfi_b32 v5, s16, v5, v8
; GFX9-NEXT: v_add_f32_e32 v5, v4, v5
; GFX9-NEXT: v_trunc_f32_e32 v4, s8
; GFX9-NEXT: v_sub_f32_e32 v8, s8, v4
; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v8|, 0.5
-; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1.0, s[0:1]
+; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; GFX9-NEXT: s_cselect_b32 s0, 1.0, 0
+; GFX9-NEXT: v_mov_b32_e32 v8, s0
; GFX9-NEXT: v_mov_b32_e32 v9, s8
-; GFX9-NEXT: v_bfi_b32 v8, s2, v8, v9
+; GFX9-NEXT: v_bfi_b32 v8, s16, v8, v9
; GFX9-NEXT: v_add_f32_e32 v4, v4, v8
; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0 offset:16
; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0
@@ -645,67 +744,73 @@ define amdgpu_kernel void @round_v8f32(ptr addrspace(1) %out, <8 x float> %in) #
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x44
; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_trunc_f32_e32 v0, s7
-; GFX11-NEXT: v_trunc_f32_e32 v1, s6
+; GFX11-NEXT: v_trunc_f32_e32 v2, s6
+; GFX11-NEXT: v_mov_b32_e32 v3, s6
+; GFX11-NEXT: v_trunc_f32_e32 v10, s10
; GFX11-NEXT: v_trunc_f32_e32 v4, s5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_dual_sub_f32 v14, s7, v0 :: v_dual_sub_f32 v15, s6, v2
; GFX11-NEXT: v_trunc_f32_e32 v8, s4
-; GFX11-NEXT: v_trunc_f32_e32 v5, s11
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_sub_f32 v2, s7, v0 :: v_dual_sub_f32 v3, s6, v1
-; GFX11-NEXT: v_sub_f32_e32 v7, s5, v4
-; GFX11-NEXT: v_trunc_f32_e32 v9, s9
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_sub_f32_e32 v12, s11, v5
-; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v2|, 0.5
-; GFX11-NEXT: v_sub_f32_e32 v11, s4, v8
-; GFX11-NEXT: v_trunc_f32_e32 v6, s10
-; GFX11-NEXT: v_sub_f32_e32 v14, s9, v9
-; GFX11-NEXT: v_trunc_f32_e32 v10, s8
-; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s2
-; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v3|, 0.5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfi_b32 v2, 0x7fffffff, v2, s7
-; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, s2
-; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v7|, 0.5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfi_b32 v16, 0x7fffffff, v3, s6
-; GFX11-NEXT: v_cndmask_b32_e64 v7, 0, 1.0, s2
-; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v11|, 0.5
-; GFX11-NEXT: v_sub_f32_e32 v13, s10, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_add_f32 v3, v0, v2 :: v_dual_add_f32 v2, v1, v16
-; GFX11-NEXT: v_bfi_b32 v7, 0x7fffffff, v7, s5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e64 v11, 0, 1.0, s2
-; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v12|, 0.5
-; GFX11-NEXT: v_add_f32_e32 v1, v4, v7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfi_b32 v11, 0x7fffffff, v11, s4
-; GFX11-NEXT: v_cndmask_b32_e64 v12, 0, 1.0, s2
-; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v13|, 0.5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfi_b32 v12, 0x7fffffff, v12, s11
-; GFX11-NEXT: v_cndmask_b32_e64 v13, 0, 1.0, s2
+; GFX11-NEXT: v_trunc_f32_e32 v7, s11
+; GFX11-NEXT: v_dual_mov_b32 v5, s5 :: v_dual_sub_f32 v16, s5, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v14|, 0.5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_f32_e32 v7, v5, v12
-; GFX11-NEXT: v_bfi_b32 v13, 0x7fffffff, v13, s10
-; GFX11-NEXT: v_sub_f32_e32 v15, s8, v10
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e64 v14, 0, 1.0, s2
-; GFX11-NEXT: v_add_f32_e32 v6, v6, v13
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v15|, 0.5
-; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, v14, s9
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v15, 0, 1.0, s2
-; GFX11-NEXT: v_dual_add_f32 v5, v9, v0 :: v_dual_add_f32 v0, v8, v11
+; GFX11-NEXT: v_cmp_ge_f32_e64 s3, |v15|, 0.5
+; GFX11-NEXT: v_dual_mov_b32 v9, s11 :: v_dual_sub_f32 v18, s11, v7
+; GFX11-NEXT: v_dual_mov_b32 v1, s7 :: v_dual_mov_b32 v6, s4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: s_and_b32 s2, s2, exec_lo
+; GFX11-NEXT: s_cselect_b32 s2, 1.0, 0
+; GFX11-NEXT: s_and_b32 s3, s3, exec_lo
+; GFX11-NEXT: s_cselect_b32 s3, 1.0, 0
+; GFX11-NEXT: v_trunc_f32_e32 v13, s8
+; GFX11-NEXT: v_bfi_b32 v14, 0x7fffffff, s3, v3
+; GFX11-NEXT: v_sub_f32_e32 v17, s4, v8
+; GFX11-NEXT: v_sub_f32_e32 v19, s10, v10
+; GFX11-NEXT: v_cmp_ge_f32_e64 s4, |v16|, 0.5
+; GFX11-NEXT: v_cmp_ge_f32_e64 s6, |v18|, 0.5
+; GFX11-NEXT: v_add_f32_e32 v2, v2, v14
+; GFX11-NEXT: v_cmp_ge_f32_e64 s5, |v17|, 0.5
+; GFX11-NEXT: v_cmp_ge_f32_e64 s7, |v19|, 0.5
+; GFX11-NEXT: s_and_b32 s4, s4, exec_lo
+; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s2, v1
+; GFX11-NEXT: s_cselect_b32 s2, 1.0, 0
+; GFX11-NEXT: v_trunc_f32_e32 v12, s9
+; GFX11-NEXT: v_mov_b32_e32 v11, s10
+; GFX11-NEXT: s_and_b32 s4, s5, exec_lo
+; GFX11-NEXT: v_bfi_b32 v5, 0x7fffffff, s2, v5
+; GFX11-NEXT: s_cselect_b32 s3, 1.0, 0
+; GFX11-NEXT: v_sub_f32_e32 v20, s9, v12
+; GFX11-NEXT: s_and_b32 s4, s6, exec_lo
+; GFX11-NEXT: s_cselect_b32 s2, 1.0, 0
+; GFX11-NEXT: s_and_b32 s4, s7, exec_lo
+; GFX11-NEXT: s_cselect_b32 s5, 1.0, 0
+; GFX11-NEXT: v_add_f32_e32 v3, v0, v1
+; GFX11-NEXT: v_add_f32_e32 v1, v4, v5
+; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s2, v9
+; GFX11-NEXT: v_bfi_b32 v4, 0x7fffffff, s5, v11
+; GFX11-NEXT: v_bfi_b32 v11, 0x7fffffff, s3, v6
+; GFX11-NEXT: v_sub_f32_e32 v21, s8, v13
+; GFX11-NEXT: v_cmp_ge_f32_e64 s4, |v20|, 0.5
+; GFX11-NEXT: v_add_f32_e32 v7, v7, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_dual_mov_b32 v5, s9 :: v_dual_add_f32 v0, v8, v11
+; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v21|, 0.5
+; GFX11-NEXT: v_mov_b32_e32 v9, s8
+; GFX11-NEXT: s_and_b32 s4, s4, exec_lo
+; GFX11-NEXT: s_cselect_b32 s4, 1.0, 0
+; GFX11-NEXT: v_add_f32_e32 v6, v10, v4
+; GFX11-NEXT: s_and_b32 s2, s2, exec_lo
+; GFX11-NEXT: s_cselect_b32 s2, 1.0, 0
+; GFX11-NEXT: v_bfi_b32 v5, 0x7fffffff, s4, v5
+; GFX11-NEXT: v_bfi_b32 v9, 0x7fffffff, s2, v9
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_mov_b32 s2, -1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_bfi_b32 v4, 0x7fffffff, v15, s8
-; GFX11-NEXT: v_add_f32_e32 v4, v10, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_f32_e32 v5, v12, v5
+; GFX11-NEXT: v_add_f32_e32 v4, v13, v9
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 offset:16
; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
@@ -786,9 +891,11 @@ define amdgpu_kernel void @round_f16(ptr addrspace(1) %out, i32 %x.arg) #0 {
; GFX6-NEXT: v_trunc_f32_e32 v1, v0
; GFX6-NEXT: v_sub_f32_e32 v2, v0, v1
; GFX6-NEXT: v_cmp_ge_f32_e64 s[2:3], |v2|, 0.5
-; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[2:3]
-; GFX6-NEXT: s_brev_b32 s2, -2
-; GFX6-NEXT: v_bfi_b32 v0, s2, v2, v0
+; GFX6-NEXT: s_and_b64 s[2:3], s[2:3], exec
+; GFX6-NEXT: s_cselect_b32 s2, 1.0, 0
+; GFX6-NEXT: s_brev_b32 s3, -2
+; GFX6-NEXT: v_mov_b32_e32 v2, s2
+; GFX6-NEXT: v_bfi_b32 v0, s3, v2, v0
; GFX6-NEXT: v_add_f32_e32 v0, v1, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: s_mov_b32 s3, 0xf000
@@ -894,7 +1001,7 @@ define amdgpu_kernel void @round_v2f16(ptr addrspace(1) %out, i32 %in.arg) #0 {
; GFX6-LABEL: round_v2f16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dword s0, s[2:3], 0xb
-; GFX6-NEXT: s_brev_b32 s4, -2
+; GFX6-NEXT: s_brev_b32 s6, -2
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_lshr_b32 s1, s0, 16
; GFX6-NEXT: v_cvt_f32_f16_e32 v1, s1
@@ -903,14 +1010,18 @@ define amdgpu_kernel void @round_v2f16(ptr addrspace(1) %out, i32 %in.arg) #0 {
; GFX6-NEXT: v_trunc_f32_e32 v3, v1
; GFX6-NEXT: v_sub_f32_e32 v5, v1, v3
; GFX6-NEXT: v_trunc_f32_e32 v2, v0
-; GFX6-NEXT: v_cmp_ge_f32_e64 s[2:3], |v5|, 0.5
+; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, 0.5
; GFX6-NEXT: v_sub_f32_e32 v4, v0, v2
-; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[2:3]
-; GFX6-NEXT: v_bfi_b32 v1, s4, v5, v1
+; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec
; GFX6-NEXT: v_cmp_ge_f32_e64 s[2:3], |v4|, 0.5
+; GFX6-NEXT: s_cselect_b32 s4, 1.0, 0
+; GFX6-NEXT: s_and_b64 s[2:3], s[2:3], exec
+; GFX6-NEXT: v_mov_b32_e32 v4, s4
+; GFX6-NEXT: v_bfi_b32 v1, s6, v4, v1
+; GFX6-NEXT: s_cselect_b32 s2, 1.0, 0
; GFX6-NEXT: v_add_f32_e32 v1, v3, v1
-; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, s[2:3]
-; GFX6-NEXT: v_bfi_b32 v0, s4, v3, v0
+; GFX6-NEXT: v_mov_b32_e32 v3, s2
+; GFX6-NEXT: v_bfi_b32 v0, s6, v3, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-NEXT: v_add_f32_e32 v0, v2, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
diff --git a/llvm/test/CodeGen/AMDGPU/rsq.f32.ll b/llvm/test/CodeGen/AMDGPU/rsq.f32.ll
index 40a8592dba6df0..a72d1702ab0f72 100644
--- a/llvm/test/CodeGen/AMDGPU/rsq.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/rsq.f32.ll
@@ -637,9 +637,10 @@ define amdgpu_kernel void @neg_rsq_f32(ptr addrspace(1) noalias %out, ptr addrsp
; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1]
; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v1
; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v1, -v0
-; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, s2
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, -v0, v1, s[0:1]
+; SI-IEEE-SAFE-NEXT: v_xor_b32_e32 v1, 0x80000000, v0
+; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v2, -v0
+; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s2
+; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1
; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
@@ -762,14 +763,15 @@ define amdgpu_kernel void @neg_rsq_neg_f32(ptr addrspace(1) noalias %out, ptr ad
; GCN-DAZ-SAFE-NEXT: s_mov_b32 s9, s3
; GCN-DAZ-SAFE-NEXT: buffer_load_dword v0, off, s[8:11], 0
; GCN-DAZ-SAFE-NEXT: s_mov_b32 s2, 0x8f800000
-; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v2, 0x260
; GCN-DAZ-SAFE-NEXT: s_mov_b32 s4, s0
; GCN-DAZ-SAFE-NEXT: s_mov_b32 s5, s1
; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0)
-; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, 0xcf800000, v0
+; GCN-DAZ-SAFE-NEXT: v_xor_b32_e32 v1, 0x80000000, v0
+; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, 0xcf800000, v0
; GCN-DAZ-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0
-; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v1, vcc
+; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v1, v0
+; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v2, 0x260
; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, v0, v1
; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, 0.5, v1
; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, -v1, v3, 0.5
@@ -802,9 +804,10 @@ define amdgpu_kernel void @neg_rsq_neg_f32(ptr addrspace(1) noalias %out, ptr ad
; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, s8
; SI-IEEE-SAFE-NEXT: s_mov_b32 s5, s9
; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0)
-; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0xcf800000, v0
+; SI-IEEE-SAFE-NEXT: v_xor_b32_e32 v2, 0x80000000, v0
+; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0xcf800000, v0
; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[0:1], s0, v0
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v2, s[0:1]
+; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, v2, v3, s[0:1]
; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v0
; SI-IEEE-SAFE-NEXT: v_add_i32_e32 v3, vcc, -1, v2
; SI-IEEE-SAFE-NEXT: v_add_i32_e32 v4, vcc, 1, v2
@@ -818,9 +821,10 @@ define amdgpu_kernel void @neg_rsq_neg_f32(ptr addrspace(1) noalias %out, ptr ad
; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1]
; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v1
; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v1, -v0
-; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, s2
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, -v0, v1, s[0:1]
+; SI-IEEE-SAFE-NEXT: v_xor_b32_e32 v1, 0x80000000, v0
+; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v2, -v0
+; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s2
+; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1
; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
@@ -844,9 +848,10 @@ define amdgpu_kernel void @neg_rsq_neg_f32(ptr addrspace(1) noalias %out, ptr ad
; CI-IEEE-SAFE-NEXT: s_mov_b32 s4, s8
; CI-IEEE-SAFE-NEXT: s_mov_b32 s5, s9
; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0)
-; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0xcf800000, v0
+; CI-IEEE-SAFE-NEXT: v_xor_b32_e32 v2, 0x80000000, v0
+; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0xcf800000, v0
; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[0:1], s0, v0
-; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v2, s[0:1]
+; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, v2, v3, s[0:1]
; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v0
; CI-IEEE-SAFE-NEXT: v_add_i32_e32 v3, vcc, -1, v2
; CI-IEEE-SAFE-NEXT: v_add_i32_e32 v4, vcc, 1, v2
diff --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
index 8e0a83671a1837..668ed784f2b099 100644
--- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
+++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
@@ -1143,11 +1143,13 @@ exit:
define amdgpu_ps void @phi_use_def_before_kill(float inreg %x) #0 {
; SI-LABEL: phi_use_def_before_kill:
; SI: ; %bb.0: ; %bb
-; SI-NEXT: v_add_f32_e64 v1, s0, 1.0
-; SI-NEXT: v_cmp_lt_f32_e32 vcc, 0, v1
-; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc
-; SI-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v1
-; SI-NEXT: s_andn2_b64 exec, exec, vcc
+; SI-NEXT: v_add_f32_e64 v0, s0, 1.0
+; SI-NEXT: v_cmp_lt_f32_e32 vcc, 0, v0
+; SI-NEXT: s_mov_b64 s[2:3], exec
+; SI-NEXT: s_and_b64 s[0:1], vcc, exec
+; SI-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0
+; SI-NEXT: s_cselect_b32 s0, -1.0, 0
+; SI-NEXT: s_andn2_b64 s[2:3], s[2:3], vcc
; SI-NEXT: s_cbranch_scc0 .LBB11_6
; SI-NEXT: ; %bb.1: ; %bb
; SI-NEXT: s_andn2_b64 exec, exec, vcc
@@ -1157,14 +1159,16 @@ define amdgpu_ps void @phi_use_def_before_kill(float inreg %x) #0 {
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, 8
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v0, 4.0
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_mov_b32 s0, 4.0
; SI-NEXT: .LBB11_3: ; %phibb
-; SI-NEXT: v_cmp_eq_f32_e32 vcc, 0, v0
+; SI-NEXT: v_cmp_eq_f32_e64 s[0:1], s0, 0
+; SI-NEXT: s_and_b64 vcc, exec, s[0:1]
; SI-NEXT: s_cbranch_vccz .LBB11_5
; SI-NEXT: ; %bb.4: ; %bb10
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, 9
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -1177,22 +1181,25 @@ define amdgpu_ps void @phi_use_def_before_kill(float inreg %x) #0 {
;
; GFX10-WAVE64-LABEL: phi_use_def_before_kill:
; GFX10-WAVE64: ; %bb.0: ; %bb
-; GFX10-WAVE64-NEXT: v_add_f32_e64 v1, s0, 1.0
-; GFX10-WAVE64-NEXT: v_cmp_lt_f32_e32 vcc, 0, v1
-; GFX10-WAVE64-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc
-; GFX10-WAVE64-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v1
-; GFX10-WAVE64-NEXT: s_andn2_b64 exec, exec, vcc
+; GFX10-WAVE64-NEXT: v_add_f32_e64 v0, s0, 1.0
+; GFX10-WAVE64-NEXT: s_mov_b64 s[2:3], exec
+; GFX10-WAVE64-NEXT: v_cmp_lt_f32_e32 vcc, 0, v0
+; GFX10-WAVE64-NEXT: s_and_b64 s[0:1], vcc, exec
+; GFX10-WAVE64-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0
+; GFX10-WAVE64-NEXT: s_cselect_b32 s0, -1.0, 0
+; GFX10-WAVE64-NEXT: s_andn2_b64 s[2:3], s[2:3], vcc
; GFX10-WAVE64-NEXT: s_cbranch_scc0 .LBB11_6
; GFX10-WAVE64-NEXT: ; %bb.1: ; %bb
; GFX10-WAVE64-NEXT: s_andn2_b64 exec, exec, vcc
; GFX10-WAVE64-NEXT: s_cbranch_scc0 .LBB11_3
; GFX10-WAVE64-NEXT: ; %bb.2: ; %bb8
-; GFX10-WAVE64-NEXT: v_mov_b32_e32 v1, 8
-; GFX10-WAVE64-NEXT: v_mov_b32_e32 v0, 4.0
-; GFX10-WAVE64-NEXT: global_store_dword v[0:1], v1, off
+; GFX10-WAVE64-NEXT: v_mov_b32_e32 v0, 8
+; GFX10-WAVE64-NEXT: s_mov_b32 s0, 4.0
+; GFX10-WAVE64-NEXT: global_store_dword v[0:1], v0, off
; GFX10-WAVE64-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WAVE64-NEXT: .LBB11_3: ; %phibb
-; GFX10-WAVE64-NEXT: v_cmp_eq_f32_e32 vcc, 0, v0
+; GFX10-WAVE64-NEXT: v_cmp_eq_f32_e64 s[0:1], s0, 0
+; GFX10-WAVE64-NEXT: s_and_b64 vcc, exec, s[0:1]
; GFX10-WAVE64-NEXT: s_cbranch_vccz .LBB11_5
; GFX10-WAVE64-NEXT: ; %bb.4: ; %bb10
; GFX10-WAVE64-NEXT: v_mov_b32_e32 v0, 9
@@ -1207,22 +1214,25 @@ define amdgpu_ps void @phi_use_def_before_kill(float inreg %x) #0 {
;
; GFX10-WAVE32-LABEL: phi_use_def_before_kill:
; GFX10-WAVE32: ; %bb.0: ; %bb
-; GFX10-WAVE32-NEXT: v_add_f32_e64 v1, s0, 1.0
-; GFX10-WAVE32-NEXT: v_cmp_lt_f32_e32 vcc_lo, 0, v1
-; GFX10-WAVE32-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc_lo
-; GFX10-WAVE32-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 0, v1
-; GFX10-WAVE32-NEXT: s_andn2_b32 exec_lo, exec_lo, vcc_lo
+; GFX10-WAVE32-NEXT: v_add_f32_e64 v0, s0, 1.0
+; GFX10-WAVE32-NEXT: s_mov_b32 s1, exec_lo
+; GFX10-WAVE32-NEXT: v_cmp_lt_f32_e32 vcc_lo, 0, v0
+; GFX10-WAVE32-NEXT: s_and_b32 s0, vcc_lo, exec_lo
+; GFX10-WAVE32-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 0, v0
+; GFX10-WAVE32-NEXT: s_cselect_b32 s0, -1.0, 0
+; GFX10-WAVE32-NEXT: s_andn2_b32 s1, s1, vcc_lo
; GFX10-WAVE32-NEXT: s_cbranch_scc0 .LBB11_6
; GFX10-WAVE32-NEXT: ; %bb.1: ; %bb
; GFX10-WAVE32-NEXT: s_andn2_b32 exec_lo, exec_lo, vcc_lo
; GFX10-WAVE32-NEXT: s_cbranch_scc0 .LBB11_3
; GFX10-WAVE32-NEXT: ; %bb.2: ; %bb8
-; GFX10-WAVE32-NEXT: v_mov_b32_e32 v1, 8
-; GFX10-WAVE32-NEXT: v_mov_b32_e32 v0, 4.0
-; GFX10-WAVE32-NEXT: global_store_dword v[0:1], v1, off
+; GFX10-WAVE32-NEXT: v_mov_b32_e32 v0, 8
+; GFX10-WAVE32-NEXT: s_mov_b32 s0, 4.0
+; GFX10-WAVE32-NEXT: global_store_dword v[0:1], v0, off
; GFX10-WAVE32-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WAVE32-NEXT: .LBB11_3: ; %phibb
-; GFX10-WAVE32-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v0
+; GFX10-WAVE32-NEXT: v_cmp_eq_f32_e64 s0, s0, 0
+; GFX10-WAVE32-NEXT: s_and_b32 vcc_lo, exec_lo, s0
; GFX10-WAVE32-NEXT: s_cbranch_vccz .LBB11_5
; GFX10-WAVE32-NEXT: ; %bb.4: ; %bb10
; GFX10-WAVE32-NEXT: v_mov_b32_e32 v0, 9
@@ -1237,23 +1247,27 @@ define amdgpu_ps void @phi_use_def_before_kill(float inreg %x) #0 {
;
; GFX11-LABEL: phi_use_def_before_kill:
; GFX11: ; %bb.0: ; %bb
-; GFX11-NEXT: v_add_f32_e64 v1, s0, 1.0
+; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0
+; GFX11-NEXT: s_mov_b64 s[2:3], exec
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cmp_lt_f32_e32 vcc, 0, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc
-; GFX11-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v1
-; GFX11-NEXT: s_and_not1_b64 exec, exec, vcc
+; GFX11-NEXT: v_cmp_lt_f32_e32 vcc, 0, v0
+; GFX11-NEXT: s_and_b64 s[0:1], vcc, exec
+; GFX11-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0
+; GFX11-NEXT: s_cselect_b32 s0, -1.0, 0
+; GFX11-NEXT: s_and_not1_b64 s[2:3], s[2:3], vcc
; GFX11-NEXT: s_cbranch_scc0 .LBB11_6
; GFX11-NEXT: ; %bb.1: ; %bb
; GFX11-NEXT: s_and_not1_b64 exec, exec, vcc
; GFX11-NEXT: s_cbranch_scc0 .LBB11_3
; GFX11-NEXT: ; %bb.2: ; %bb8
-; GFX11-NEXT: v_mov_b32_e32 v1, 8
-; GFX11-NEXT: v_mov_b32_e32 v0, 4.0
-; GFX11-NEXT: global_store_b32 v[0:1], v1, off dlc
+; GFX11-NEXT: v_mov_b32_e32 v0, 8
+; GFX11-NEXT: s_mov_b32 s0, 4.0
+; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: .LBB11_3: ; %phibb
-; GFX11-NEXT: v_cmp_eq_f32_e32 vcc, 0, v0
+; GFX11-NEXT: v_cmp_eq_f32_e64 s[0:1], s0, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: s_and_b64 vcc, exec, s[0:1]
; GFX11-NEXT: s_cbranch_vccz .LBB11_5
; GFX11-NEXT: ; %bb.4: ; %bb10
; GFX11-NEXT: v_mov_b32_e32 v0, 9
diff --git a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
index b87439a9d6fae7..7e1b6b23d64192 100644
--- a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
+++ b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
@@ -351,13 +351,13 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: [[V_OR_B32_e64_64:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_63]], [[V_ADD_U32_e64_28]], implicit $exec
; CHECK-NEXT: [[V_ADD_U32_e64_30:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -593, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec
; CHECK-NEXT: [[V_OR_B32_e64_65:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_64]], [[V_ADD_U32_e64_29]], implicit $exec
- ; CHECK-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM undef %542:sreg_64, 0, 0 :: (invariant load (s256) from `ptr addrspace(4) undef`, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM undef %541:sreg_64, 0, 0 :: (invariant load (s256) from `ptr addrspace(4) undef`, addrspace 4)
; CHECK-NEXT: [[V_OR_B32_e64_66:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_65]], [[V_ADD_U32_e64_30]], implicit $exec
; CHECK-NEXT: [[S_ADD_I32_24:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM8]], -594, implicit-def dead $scc
; CHECK-NEXT: [[V_OR_B32_e64_67:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_24]], [[V_OR_B32_e64_66]], implicit $exec
; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 0, [[V_OR_B32_e64_67]], implicit $exec
; CHECK-NEXT: undef [[V_CNDMASK_B32_e64_:%[0-9]+]].sub3:vreg_128 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[V_CMP_EQ_U32_e64_]], implicit $exec
- ; CHECK-NEXT: IMAGE_STORE_V4_V2_gfx10 [[V_CNDMASK_B32_e64_]], undef %556:vreg_64, [[S_LOAD_DWORDX8_IMM]], 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128), addrspace 8)
+ ; CHECK-NEXT: IMAGE_STORE_V4_V2_gfx10 [[V_CNDMASK_B32_e64_]], undef %554:vreg_64, [[S_LOAD_DWORDX8_IMM]], 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128), addrspace 8)
; CHECK-NEXT: S_ENDPGM 0
.expVert:
%0 = extractelement <31 x i32> %userData, i64 2
diff --git a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll
index d1bf5ecb569849..76edf87a36ffd1 100644
--- a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll
@@ -111,14 +111,13 @@ define amdgpu_kernel void @v_cnd_nan(ptr addrspace(1) %out, i32 %c, float %f) #0
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_cmp_eq_u32 s2, 0
+; SI-NEXT: s_cselect_b32 s2, s3, -1
+; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
-; SI-NEXT: s_cmp_eq_u32 s2, 0
-; SI-NEXT: v_mov_b32_e32 v0, s3
-; SI-NEXT: s_cselect_b64 vcc, -1, 0
-; SI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
@@ -127,11 +126,10 @@ define amdgpu_kernel void @v_cnd_nan(ptr addrspace(1) %out, i32 %c, float %f) #0
; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_eq_u32 s2, 0
-; VI-NEXT: v_mov_b32_e32 v0, s3
-; VI-NEXT: s_cselect_b64 vcc, -1, 0
-; VI-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc
+; VI-NEXT: s_cselect_b32 s2, s3, -1
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -141,8 +139,8 @@ define amdgpu_kernel void @v_cnd_nan(ptr addrspace(1) %out, i32 %c, float %f) #0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_cmp_eq_u32 s6, 0
-; GFX10-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GFX10-NEXT: v_cndmask_b32_e64 v1, -1, s7, s[0:1]
+; GFX10-NEXT: s_cselect_b32 s0, s7, -1
+; GFX10-NEXT: v_mov_b32_e32 v1, s0
; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
@@ -152,9 +150,9 @@ define amdgpu_kernel void @v_cnd_nan(ptr addrspace(1) %out, i32 %c, float %f) #0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_cmp_eq_u32 s2, 0
-; GFX11-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GFX11-NEXT: s_cselect_b32 s2, s3, -1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v1, -1, s3, s[4:5]
+; GFX11-NEXT: v_mov_b32_e32 v1, s2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -176,11 +174,12 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %o
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, s5
-; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0
-; SI-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc
+; SI-NEXT: v_cmp_nlg_f32_e64 s[6:7], s4, 0
+; SI-NEXT: s_and_b64 s[6:7], s[6:7], exec
+; SI-NEXT: s_cselect_b32 s4, s5, 1.0
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: v_mov_b32_e32 v2, s4
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
@@ -192,10 +191,11 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %o
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_cmp_nlg_f32_e64 s[0:1], s2, 0
+; VI-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; VI-NEXT: s_cselect_b32 s0, s3, 1.0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v2, s3
-; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0
-; VI-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -207,7 +207,9 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %o
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_cmp_nlg_f32_e64 s[2:3], s0, 0
-; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, s1, s[2:3]
+; GFX10-NEXT: s_and_b64 s[2:3], s[2:3], exec
+; GFX10-NEXT: s_cselect_b32 s0, s1, 1.0
+; GFX10-NEXT: v_mov_b32_e32 v1, s0
; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
@@ -221,7 +223,10 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %o
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_cmp_nlg_f32_e64 s[4:5], s0, 0
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, s1, s[4:5]
+; GFX11-NEXT: s_and_b64 s[4:5], s[4:5], exec
+; GFX11-NEXT: s_cselect_b32 s0, s1, 1.0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v1, s0
; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -239,15 +244,16 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprX_f32(ptr addrspace(1) %o
; SI-LABEL: fcmp_sgprX_k0_select_k1_sgprX_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
-; SI-NEXT: s_load_dword s4, s[2:3], 0xb
+; SI-NEXT: s_load_dword s6, s[2:3], 0xb
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_cmp_nlg_f32_e64 s[4:5], s6, 0
+; SI-NEXT: s_and_b64 s[4:5], s[4:5], exec
+; SI-NEXT: s_cselect_b32 s4, s6, 1.0
+; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: v_mov_b32_e32 v2, s4
-; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0
-; SI-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
@@ -259,10 +265,11 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprX_f32(ptr addrspace(1) %o
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_cmp_nlg_f32_e64 s[0:1], s2, 0
+; VI-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; VI-NEXT: s_cselect_b32 s0, s2, 1.0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0
-; VI-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -274,7 +281,9 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprX_f32(ptr addrspace(1) %o
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_cmp_nlg_f32_e64 s[2:3], s4, 0
-; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, s4, s[2:3]
+; GFX10-NEXT: s_and_b64 s[2:3], s[2:3], exec
+; GFX10-NEXT: s_cselect_b32 s2, s4, 1.0
+; GFX10-NEXT: v_mov_b32_e32 v1, s2
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
@@ -288,7 +297,10 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprX_f32(ptr addrspace(1) %o
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_cmp_nlg_f32_e64 s[2:3], s4, 0
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, s4, s[2:3]
+; GFX11-NEXT: s_and_b64 s[2:3], s[2:3], exec
+; GFX11-NEXT: s_cselect_b32 s2, s4, 1.0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v1, s2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -310,11 +322,12 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprZ_f32(ptr addrspace(1) %o
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, s5
-; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0
-; SI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-NEXT: v_cmp_nlg_f32_e64 s[6:7], s4, 0
+; SI-NEXT: s_and_b64 s[6:7], s[6:7], exec
+; SI-NEXT: s_cselect_b32 s4, s5, 0
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: v_mov_b32_e32 v2, s4
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
@@ -326,10 +339,11 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprZ_f32(ptr addrspace(1) %o
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_cmp_nlg_f32_e64 s[0:1], s2, 0
+; VI-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; VI-NEXT: s_cselect_b32 s0, s3, 0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v2, s3
-; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0
-; VI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -341,7 +355,9 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprZ_f32(ptr addrspace(1) %o
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_cmp_nlg_f32_e64 s[2:3], s0, 0
-; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, s1, s[2:3]
+; GFX10-NEXT: s_and_b64 s[2:3], s[2:3], exec
+; GFX10-NEXT: s_cselect_b32 s0, s1, 0
+; GFX10-NEXT: v_mov_b32_e32 v1, s0
; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
@@ -355,7 +371,10 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprZ_f32(ptr addrspace(1) %o
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_cmp_nlg_f32_e64 s[4:5], s0, 0
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, s1, s[4:5]
+; GFX11-NEXT: s_and_b64 s[4:5], s[4:5], exec
+; GFX11-NEXT: s_cselect_b32 s0, s1, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v1, s0
; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -373,15 +392,16 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprX_f32(ptr addrspace(1) %o
; SI-LABEL: fcmp_sgprX_k0_select_k0_sgprX_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
-; SI-NEXT: s_load_dword s4, s[2:3], 0xb
+; SI-NEXT: s_load_dword s6, s[2:3], 0xb
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_cmp_nlg_f32_e64 s[4:5], s6, 0
+; SI-NEXT: s_and_b64 s[4:5], s[4:5], exec
+; SI-NEXT: s_cselect_b32 s4, s6, 0
+; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: v_mov_b32_e32 v2, s4
-; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0
-; SI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
@@ -393,10 +413,11 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprX_f32(ptr addrspace(1) %o
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_cmp_nlg_f32_e64 s[0:1], s2, 0
+; VI-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; VI-NEXT: s_cselect_b32 s0, s2, 0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0
-; VI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -408,7 +429,9 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprX_f32(ptr addrspace(1) %o
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_cmp_nlg_f32_e64 s[2:3], s4, 0
-; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, s4, s[2:3]
+; GFX10-NEXT: s_and_b64 s[2:3], s[2:3], exec
+; GFX10-NEXT: s_cselect_b32 s2, s4, 0
+; GFX10-NEXT: v_mov_b32_e32 v1, s2
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
@@ -422,7 +445,10 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprX_f32(ptr addrspace(1) %o
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_cmp_nlg_f32_e64 s[2:3], s4, 0
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, s4, s[2:3]
+; GFX11-NEXT: s_and_b64 s[2:3], s[2:3], exec
+; GFX11-NEXT: s_cselect_b32 s2, s4, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v1, s2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
diff --git a/llvm/test/CodeGen/AMDGPU/vselect.ll b/llvm/test/CodeGen/AMDGPU/vselect.ll
index 3df757a426ae96..6ecedc25ca963e 100644
--- a/llvm/test/CodeGen/AMDGPU/vselect.ll
+++ b/llvm/test/CodeGen/AMDGPU/vselect.ll
@@ -73,22 +73,24 @@ entry:
define amdgpu_kernel void @test_select_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
; SI-LABEL: test_select_v2f32:
; SI: ; %bb.0: ; %entry
-; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
+; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0
-; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; SI-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0
; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, s2
+; SI-NEXT: v_cmp_neq_f32_e32 vcc, s8, v0
+; SI-NEXT: v_mov_b32_e32 v0, s3
+; SI-NEXT: v_cmp_neq_f32_e64 s[0:1], s9, v0
+; SI-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; SI-NEXT: s_cselect_b32 s3, s9, s3
+; SI-NEXT: s_and_b64 s[0:1], vcc, exec
+; SI-NEXT: s_cselect_b32 s0, s8, s2
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_mov_b32_e32 v0, s0
-; SI-NEXT: v_mov_b32_e32 v1, s1
-; SI-NEXT: v_mov_b32_e32 v2, s3
-; SI-NEXT: v_cmp_neq_f32_e32 vcc, s3, v1
-; SI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; SI-NEXT: v_mov_b32_e32 v2, s2
-; SI-NEXT: v_cmp_neq_f32_e32 vcc, s2, v0
-; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
@@ -106,12 +108,14 @@ define amdgpu_kernel void @test_select_v2f32(ptr addrspace(1) %out, ptr addrspac
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v0, s8
-; VI-NEXT: v_mov_b32_e32 v2, s3
-; VI-NEXT: v_cmp_neq_f32_e32 vcc, s3, v1
-; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_cmp_neq_f32_e64 s[0:1], s3, v1
; VI-NEXT: v_cmp_neq_f32_e32 vcc, s2, v0
-; VI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; VI-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; VI-NEXT: s_cselect_b32 s3, s3, s9
+; VI-NEXT: s_and_b64 s[0:1], vcc, exec
+; VI-NEXT: s_cselect_b32 s0, s2, s8
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
@@ -237,62 +241,70 @@ entry:
define amdgpu_kernel void @test_select_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
; SI-LABEL: test_select_v4f32:
; SI: ; %bb.0: ; %entry
-; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
-; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd
+; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
+; SI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; SI-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0
-; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x0
+; SI-NEXT: s_load_dwordx4 s[16:19], s[10:11], 0x0
+; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v0, s8
-; SI-NEXT: v_mov_b32_e32 v1, s9
-; SI-NEXT: v_mov_b32_e32 v2, s10
-; SI-NEXT: v_mov_b32_e32 v3, s11
-; SI-NEXT: v_mov_b32_e32 v4, s3
-; SI-NEXT: v_cmp_neq_f32_e32 vcc, s3, v3
-; SI-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
-; SI-NEXT: v_mov_b32_e32 v4, s2
-; SI-NEXT: v_cmp_neq_f32_e32 vcc, s2, v2
-; SI-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
-; SI-NEXT: v_mov_b32_e32 v4, s1
-; SI-NEXT: v_cmp_neq_f32_e32 vcc, s1, v1
-; SI-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
-; SI-NEXT: v_mov_b32_e32 v4, s0
-; SI-NEXT: v_cmp_neq_f32_e32 vcc, s0, v0
-; SI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
-; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; SI-NEXT: v_mov_b32_e32 v0, s12
+; SI-NEXT: v_cmp_neq_f32_e32 vcc, s16, v0
+; SI-NEXT: v_mov_b32_e32 v0, s13
+; SI-NEXT: v_cmp_neq_f32_e64 s[0:1], s17, v0
+; SI-NEXT: v_mov_b32_e32 v0, s14
+; SI-NEXT: v_cmp_neq_f32_e64 s[2:3], s18, v0
+; SI-NEXT: v_mov_b32_e32 v0, s15
+; SI-NEXT: v_cmp_neq_f32_e64 s[4:5], s19, v0
+; SI-NEXT: s_and_b64 s[4:5], s[4:5], exec
+; SI-NEXT: s_cselect_b32 s4, s19, s15
+; SI-NEXT: s_and_b64 s[2:3], s[2:3], exec
+; SI-NEXT: s_cselect_b32 s2, s18, s14
+; SI-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; SI-NEXT: s_cselect_b32 s3, s17, s13
+; SI-NEXT: s_and_b64 s[0:1], vcc, exec
+; SI-NEXT: s_cselect_b32 s0, s16, s12
+; SI-NEXT: s_mov_b32 s10, -1
+; SI-NEXT: v_mov_b32_e32 v3, s4
+; SI-NEXT: v_mov_b32_e32 v2, s2
+; SI-NEXT: v_mov_b32_e32 v1, s3
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_select_v4f32:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34
; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_mov_b32 s11, 0xf000
+; VI-NEXT: s_mov_b32 s10, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
-; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
+; VI-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x0
+; VI-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x0
+; VI-NEXT: s_mov_b32 s8, s0
+; VI-NEXT: s_mov_b32 s9, s1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v3, s11
-; VI-NEXT: v_mov_b32_e32 v2, s10
-; VI-NEXT: v_mov_b32_e32 v1, s9
-; VI-NEXT: v_mov_b32_e32 v4, s3
-; VI-NEXT: v_cmp_neq_f32_e32 vcc, s3, v3
-; VI-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
-; VI-NEXT: v_mov_b32_e32 v4, s2
-; VI-NEXT: v_cmp_neq_f32_e32 vcc, s2, v2
-; VI-NEXT: v_mov_b32_e32 v0, s8
-; VI-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
-; VI-NEXT: v_mov_b32_e32 v4, s1
-; VI-NEXT: v_cmp_neq_f32_e32 vcc, s1, v1
-; VI-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_cmp_neq_f32_e32 vcc, s0, v0
-; VI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
-; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; VI-NEXT: v_mov_b32_e32 v0, s12
+; VI-NEXT: v_cmp_neq_f32_e32 vcc, s16, v0
+; VI-NEXT: v_mov_b32_e32 v0, s14
+; VI-NEXT: v_cmp_neq_f32_e64 s[2:3], s18, v0
+; VI-NEXT: v_mov_b32_e32 v0, s15
+; VI-NEXT: v_cmp_neq_f32_e64 s[4:5], s19, v0
+; VI-NEXT: v_mov_b32_e32 v1, s13
+; VI-NEXT: s_and_b64 s[4:5], s[4:5], exec
+; VI-NEXT: v_cmp_neq_f32_e64 s[0:1], s17, v1
+; VI-NEXT: s_cselect_b32 s4, s19, s15
+; VI-NEXT: s_and_b64 s[2:3], s[2:3], exec
+; VI-NEXT: s_cselect_b32 s2, s18, s14
+; VI-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; VI-NEXT: s_cselect_b32 s3, s17, s13
+; VI-NEXT: s_and_b64 s[0:1], vcc, exec
+; VI-NEXT: s_cselect_b32 s0, s16, s12
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s4
+; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: test_select_v4f32:
diff --git a/llvm/test/CodeGen/AMDGPU/xor3-i1-const.ll b/llvm/test/CodeGen/AMDGPU/xor3-i1-const.ll
index a7650d1286ddc9..09727fbe02385b 100644
--- a/llvm/test/CodeGen/AMDGPU/xor3-i1-const.ll
+++ b/llvm/test/CodeGen/AMDGPU/xor3-i1-const.ll
@@ -9,7 +9,9 @@ define amdgpu_ps float @xor3_i1_const(float inreg %arg1, i32 inreg %arg2) {
; GCN-NEXT: v_cmp_lt_f32_e64 s[2:3], s0, 0
; GCN-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0
; GCN-NEXT: s_and_b64 s[0:1], s[2:3], vcc
-; GCN-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[0:1]
+; GCN-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; GCN-NEXT: s_cselect_b32 s0, 0, 1.0
+; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: ; return to shader part epilog
main_body:
%tmp26 = fcmp nsz olt float %arg1, 0.000000e+00
More information about the llvm-commits mailing list