[llvm] e456579 - [AMDGPU][RegBankCombiner] Add cast_of_cast and constant_fold_cast combines (#131307)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Mar 17 02:26:55 PDT 2025
Author: Pierre van Houtryve
Date: 2025-03-17T10:26:51+01:00
New Revision: e456579e346c0790603544dc7617edfe44953f4c
URL: https://github.com/llvm/llvm-project/commit/e456579e346c0790603544dc7617edfe44953f4c
DIFF: https://github.com/llvm/llvm-project/commit/e456579e346c0790603544dc7617edfe44953f4c.diff
LOG: [AMDGPU][RegBankCombiner] Add cast_of_cast and constant_fold_cast combines (#131307)
We can add a bunch of exts/truncs during RBSelect; we should be able to fold
them away afterwards.
Added:
Modified:
llvm/lib/Target/AMDGPU/AMDGPUCombine.td
llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll
llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index 36653867fbba0..a21505356274b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -180,5 +180,6 @@ def AMDGPURegBankCombiner : GICombiner<
[unmerge_merge, unmerge_cst, unmerge_undef,
zext_trunc_fold, int_minmax_to_med3, ptr_add_immed_chain,
fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp,
- identity_combines, redundant_and]> {
+ identity_combines, redundant_and, constant_fold_cast_op,
+ cast_of_cast_combines]> {
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
index 3a52497bd6e91..07fcb02d98649 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
@@ -41,10 +41,9 @@ define amdgpu_ps i7 @s_fshl_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) {
; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX8-NEXT: s_and_b32 s2, s2, 0x7f
; GFX8-NEXT: s_and_b32 s1, s1, 0x7f
-; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX8-NEXT: s_lshr_b32 s1, s1, 1
; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX8-NEXT: s_lshr_b32 s1, s1, 1
; GFX8-NEXT: v_mul_lo_u32 v1, v0, -7
; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1
@@ -72,10 +71,9 @@ define amdgpu_ps i7 @s_fshl_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) {
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX9-NEXT: s_and_b32 s2, s2, 0x7f
; GFX9-NEXT: s_and_b32 s1, s1, 0x7f
-; GFX9-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX9-NEXT: s_lshr_b32 s1, s1, 1
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT: s_lshr_b32 s1, s1, 1
; GFX9-NEXT: v_mul_lo_u32 v1, v0, -7
; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1
; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
@@ -102,9 +100,8 @@ define amdgpu_ps i7 @s_fshl_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) {
; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, 7
; GFX10-NEXT: s_and_b32 s2, s2, 0x7f
; GFX10-NEXT: s_and_b32 s1, s1, 0x7f
-; GFX10-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX10-NEXT: s_lshr_b32 s1, s1, 1
+; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX10-NEXT: v_mul_lo_u32 v1, v0, -7
@@ -134,9 +131,8 @@ define amdgpu_ps i7 @s_fshl_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) {
; GFX11-NEXT: s_and_b32 s2, s2, 0x7f
; GFX11-NEXT: s_and_b32 s1, s1, 0x7f
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX11-NEXT: s_lshr_b32 s1, s1, 1
+; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -351,11 +347,8 @@ define amdgpu_ps i8 @s_fshl_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) {
; GFX8: ; %bb.0:
; GFX8-NEXT: s_and_b32 s1, s1, 0xff
; GFX8-NEXT: s_and_b32 s3, s2, 7
-; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX8-NEXT: s_andn2_b32 s2, 7, s2
-; GFX8-NEXT: s_and_b32 s3, 0xffff, s3
; GFX8-NEXT: s_lshr_b32 s1, s1, 1
-; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX8-NEXT: s_andn2_b32 s2, 7, s2
; GFX8-NEXT: s_lshl_b32 s0, s0, s3
; GFX8-NEXT: s_lshr_b32 s1, s1, s2
; GFX8-NEXT: s_or_b32 s0, s0, s1
@@ -365,11 +358,8 @@ define amdgpu_ps i8 @s_fshl_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_and_b32 s1, s1, 0xff
; GFX9-NEXT: s_and_b32 s3, s2, 7
-; GFX9-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX9-NEXT: s_andn2_b32 s2, 7, s2
-; GFX9-NEXT: s_and_b32 s3, 0xffff, s3
; GFX9-NEXT: s_lshr_b32 s1, s1, 1
-; GFX9-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX9-NEXT: s_andn2_b32 s2, 7, s2
; GFX9-NEXT: s_lshl_b32 s0, s0, s3
; GFX9-NEXT: s_lshr_b32 s1, s1, s2
; GFX9-NEXT: s_or_b32 s0, s0, s1
@@ -379,11 +369,8 @@ define amdgpu_ps i8 @s_fshl_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_and_b32 s1, s1, 0xff
; GFX10-NEXT: s_and_b32 s3, s2, 7
-; GFX10-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX10-NEXT: s_andn2_b32 s2, 7, s2
-; GFX10-NEXT: s_and_b32 s3, 0xffff, s3
; GFX10-NEXT: s_lshr_b32 s1, s1, 1
-; GFX10-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX10-NEXT: s_andn2_b32 s2, 7, s2
; GFX10-NEXT: s_lshl_b32 s0, s0, s3
; GFX10-NEXT: s_lshr_b32 s1, s1, s2
; GFX10-NEXT: s_or_b32 s0, s0, s1
@@ -393,11 +380,8 @@ define amdgpu_ps i8 @s_fshl_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_and_b32 s1, s1, 0xff
; GFX11-NEXT: s_and_b32 s3, s2, 7
-; GFX11-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX11-NEXT: s_and_not1_b32 s2, 7, s2
-; GFX11-NEXT: s_and_b32 s3, 0xffff, s3
; GFX11-NEXT: s_lshr_b32 s1, s1, 1
-; GFX11-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX11-NEXT: s_and_not1_b32 s2, 7, s2
; GFX11-NEXT: s_lshl_b32 s0, s0, s3
; GFX11-NEXT: s_lshr_b32 s1, s1, s2
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -489,7 +473,6 @@ define amdgpu_ps i8 @s_fshl_i8_4(i8 inreg %lhs, i8 inreg %rhs) {
; GFX8-LABEL: s_fshl_i8_4:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_and_b32 s1, s1, 0xff
-; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
; GFX8-NEXT: s_lshl_b32 s0, s0, 4
; GFX8-NEXT: s_lshr_b32 s1, s1, 4
; GFX8-NEXT: s_or_b32 s0, s0, s1
@@ -498,7 +481,6 @@ define amdgpu_ps i8 @s_fshl_i8_4(i8 inreg %lhs, i8 inreg %rhs) {
; GFX9-LABEL: s_fshl_i8_4:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_and_b32 s1, s1, 0xff
-; GFX9-NEXT: s_and_b32 s1, 0xffff, s1
; GFX9-NEXT: s_lshl_b32 s0, s0, 4
; GFX9-NEXT: s_lshr_b32 s1, s1, 4
; GFX9-NEXT: s_or_b32 s0, s0, s1
@@ -508,7 +490,6 @@ define amdgpu_ps i8 @s_fshl_i8_4(i8 inreg %lhs, i8 inreg %rhs) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_and_b32 s1, s1, 0xff
; GFX10-NEXT: s_lshl_b32 s0, s0, 4
-; GFX10-NEXT: s_and_b32 s1, 0xffff, s1
; GFX10-NEXT: s_lshr_b32 s1, s1, 4
; GFX10-NEXT: s_or_b32 s0, s0, s1
; GFX10-NEXT: ; return to shader part epilog
@@ -517,9 +498,8 @@ define amdgpu_ps i8 @s_fshl_i8_4(i8 inreg %lhs, i8 inreg %rhs) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_and_b32 s1, s1, 0xff
; GFX11-NEXT: s_lshl_b32 s0, s0, 4
-; GFX11-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_lshr_b32 s1, s1, 4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_or_b32 s0, s0, s1
; GFX11-NEXT: ; return to shader part epilog
%result = call i8 @llvm.fshl.i8(i8 %lhs, i8 %rhs, i8 4)
@@ -586,7 +566,6 @@ define amdgpu_ps i8 @s_fshl_i8_5(i8 inreg %lhs, i8 inreg %rhs) {
; GFX8-LABEL: s_fshl_i8_5:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_and_b32 s1, s1, 0xff
-; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
; GFX8-NEXT: s_lshl_b32 s0, s0, 5
; GFX8-NEXT: s_lshr_b32 s1, s1, 3
; GFX8-NEXT: s_or_b32 s0, s0, s1
@@ -595,7 +574,6 @@ define amdgpu_ps i8 @s_fshl_i8_5(i8 inreg %lhs, i8 inreg %rhs) {
; GFX9-LABEL: s_fshl_i8_5:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_and_b32 s1, s1, 0xff
-; GFX9-NEXT: s_and_b32 s1, 0xffff, s1
; GFX9-NEXT: s_lshl_b32 s0, s0, 5
; GFX9-NEXT: s_lshr_b32 s1, s1, 3
; GFX9-NEXT: s_or_b32 s0, s0, s1
@@ -605,7 +583,6 @@ define amdgpu_ps i8 @s_fshl_i8_5(i8 inreg %lhs, i8 inreg %rhs) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_and_b32 s1, s1, 0xff
; GFX10-NEXT: s_lshl_b32 s0, s0, 5
-; GFX10-NEXT: s_and_b32 s1, 0xffff, s1
; GFX10-NEXT: s_lshr_b32 s1, s1, 3
; GFX10-NEXT: s_or_b32 s0, s0, s1
; GFX10-NEXT: ; return to shader part epilog
@@ -614,9 +591,8 @@ define amdgpu_ps i8 @s_fshl_i8_5(i8 inreg %lhs, i8 inreg %rhs) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_and_b32 s1, s1, 0xff
; GFX11-NEXT: s_lshl_b32 s0, s0, 5
-; GFX11-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_lshr_b32 s1, s1, 3
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_or_b32 s0, s0, s1
; GFX11-NEXT: ; return to shader part epilog
%result = call i8 @llvm.fshl.i8(i8 %lhs, i8 %rhs, i8 5)
@@ -702,23 +678,17 @@ define amdgpu_ps i16 @s_fshl_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in
; GFX8-NEXT: s_and_b32 s1, s1, 0xff
; GFX8-NEXT: s_lshr_b32 s5, s2, 8
; GFX8-NEXT: s_and_b32 s6, s2, 7
-; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX8-NEXT: s_andn2_b32 s2, 7, s2
-; GFX8-NEXT: s_and_b32 s6, 0xffff, s6
; GFX8-NEXT: s_lshr_b32 s1, s1, 1
-; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX8-NEXT: s_andn2_b32 s2, 7, s2
; GFX8-NEXT: s_lshr_b32 s3, s0, 8
; GFX8-NEXT: s_lshl_b32 s0, s0, s6
; GFX8-NEXT: s_lshr_b32 s1, s1, s2
; GFX8-NEXT: s_or_b32 s0, s0, s1
; GFX8-NEXT: s_and_b32 s1, s5, 7
-; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
; GFX8-NEXT: s_and_b32 s2, s4, 0xff
; GFX8-NEXT: s_lshl_b32 s1, s3, s1
-; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
-; GFX8-NEXT: s_andn2_b32 s3, 7, s5
; GFX8-NEXT: s_lshr_b32 s2, s2, 1
-; GFX8-NEXT: s_and_b32 s3, 0xffff, s3
+; GFX8-NEXT: s_andn2_b32 s3, 7, s5
; GFX8-NEXT: s_lshr_b32 s2, s2, s3
; GFX8-NEXT: s_or_b32 s1, s1, s2
; GFX8-NEXT: s_and_b32 s1, s1, 0xff
@@ -733,23 +703,17 @@ define amdgpu_ps i16 @s_fshl_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in
; GFX9-NEXT: s_and_b32 s1, s1, 0xff
; GFX9-NEXT: s_lshr_b32 s5, s2, 8
; GFX9-NEXT: s_and_b32 s6, s2, 7
-; GFX9-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX9-NEXT: s_andn2_b32 s2, 7, s2
-; GFX9-NEXT: s_and_b32 s6, 0xffff, s6
; GFX9-NEXT: s_lshr_b32 s1, s1, 1
-; GFX9-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX9-NEXT: s_andn2_b32 s2, 7, s2
; GFX9-NEXT: s_lshr_b32 s3, s0, 8
; GFX9-NEXT: s_lshl_b32 s0, s0, s6
; GFX9-NEXT: s_lshr_b32 s1, s1, s2
; GFX9-NEXT: s_or_b32 s0, s0, s1
; GFX9-NEXT: s_and_b32 s1, s5, 7
-; GFX9-NEXT: s_and_b32 s1, 0xffff, s1
; GFX9-NEXT: s_and_b32 s2, s4, 0xff
; GFX9-NEXT: s_lshl_b32 s1, s3, s1
-; GFX9-NEXT: s_and_b32 s2, 0xffff, s2
-; GFX9-NEXT: s_andn2_b32 s3, 7, s5
; GFX9-NEXT: s_lshr_b32 s2, s2, 1
-; GFX9-NEXT: s_and_b32 s3, 0xffff, s3
+; GFX9-NEXT: s_andn2_b32 s3, 7, s5
; GFX9-NEXT: s_lshr_b32 s2, s2, s3
; GFX9-NEXT: s_or_b32 s1, s1, s2
; GFX9-NEXT: s_and_b32 s1, s1, 0xff
@@ -761,25 +725,19 @@ define amdgpu_ps i16 @s_fshl_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in
; GFX10-LABEL: s_fshl_v2i8:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_lshr_b32 s4, s1, 8
-; GFX10-NEXT: s_and_b32 s5, s2, 7
-; GFX10-NEXT: s_lshr_b32 s6, s2, 8
-; GFX10-NEXT: s_and_b32 s5, 0xffff, s5
+; GFX10-NEXT: s_lshr_b32 s5, s2, 8
+; GFX10-NEXT: s_and_b32 s6, s2, 7
; GFX10-NEXT: s_and_b32 s4, s4, 0xff
; GFX10-NEXT: s_lshr_b32 s3, s0, 8
; GFX10-NEXT: s_and_b32 s1, s1, 0xff
-; GFX10-NEXT: s_lshl_b32 s0, s0, s5
-; GFX10-NEXT: s_and_b32 s5, s6, 7
-; GFX10-NEXT: s_and_b32 s4, 0xffff, s4
-; GFX10-NEXT: s_andn2_b32 s6, 7, s6
-; GFX10-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX10-NEXT: s_andn2_b32 s2, 7, s2
-; GFX10-NEXT: s_and_b32 s5, 0xffff, s5
+; GFX10-NEXT: s_lshl_b32 s0, s0, s6
+; GFX10-NEXT: s_and_b32 s6, s5, 7
; GFX10-NEXT: s_lshr_b32 s4, s4, 1
-; GFX10-NEXT: s_and_b32 s6, 0xffff, s6
+; GFX10-NEXT: s_andn2_b32 s5, 7, s5
; GFX10-NEXT: s_lshr_b32 s1, s1, 1
-; GFX10-NEXT: s_and_b32 s2, 0xffff, s2
-; GFX10-NEXT: s_lshl_b32 s3, s3, s5
-; GFX10-NEXT: s_lshr_b32 s4, s4, s6
+; GFX10-NEXT: s_andn2_b32 s2, 7, s2
+; GFX10-NEXT: s_lshl_b32 s3, s3, s6
+; GFX10-NEXT: s_lshr_b32 s4, s4, s5
; GFX10-NEXT: s_lshr_b32 s1, s1, s2
; GFX10-NEXT: s_or_b32 s2, s3, s4
; GFX10-NEXT: s_or_b32 s0, s0, s1
@@ -792,25 +750,19 @@ define amdgpu_ps i16 @s_fshl_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in
; GFX11-LABEL: s_fshl_v2i8:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_lshr_b32 s4, s1, 8
-; GFX11-NEXT: s_and_b32 s5, s2, 7
-; GFX11-NEXT: s_lshr_b32 s6, s2, 8
-; GFX11-NEXT: s_and_b32 s5, 0xffff, s5
+; GFX11-NEXT: s_lshr_b32 s5, s2, 8
+; GFX11-NEXT: s_and_b32 s6, s2, 7
; GFX11-NEXT: s_and_b32 s4, s4, 0xff
; GFX11-NEXT: s_lshr_b32 s3, s0, 8
; GFX11-NEXT: s_and_b32 s1, s1, 0xff
-; GFX11-NEXT: s_lshl_b32 s0, s0, s5
-; GFX11-NEXT: s_and_b32 s5, s6, 7
-; GFX11-NEXT: s_and_b32 s4, 0xffff, s4
-; GFX11-NEXT: s_and_not1_b32 s6, 7, s6
-; GFX11-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX11-NEXT: s_and_not1_b32 s2, 7, s2
-; GFX11-NEXT: s_and_b32 s5, 0xffff, s5
+; GFX11-NEXT: s_lshl_b32 s0, s0, s6
+; GFX11-NEXT: s_and_b32 s6, s5, 7
; GFX11-NEXT: s_lshr_b32 s4, s4, 1
-; GFX11-NEXT: s_and_b32 s6, 0xffff, s6
+; GFX11-NEXT: s_and_not1_b32 s5, 7, s5
; GFX11-NEXT: s_lshr_b32 s1, s1, 1
-; GFX11-NEXT: s_and_b32 s2, 0xffff, s2
-; GFX11-NEXT: s_lshl_b32 s3, s3, s5
-; GFX11-NEXT: s_lshr_b32 s4, s4, s6
+; GFX11-NEXT: s_and_not1_b32 s2, 7, s2
+; GFX11-NEXT: s_lshl_b32 s3, s3, s6
+; GFX11-NEXT: s_lshr_b32 s4, s4, s5
; GFX11-NEXT: s_lshr_b32 s1, s1, s2
; GFX11-NEXT: s_or_b32 s2, s3, s4
; GFX11-NEXT: s_or_b32 s0, s0, s1
@@ -1030,11 +982,8 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
; GFX8-NEXT: s_lshr_b32 s10, s2, 16
; GFX8-NEXT: s_lshr_b32 s11, s2, 24
; GFX8-NEXT: s_and_b32 s12, s2, 7
-; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX8-NEXT: s_andn2_b32 s2, 7, s2
-; GFX8-NEXT: s_and_b32 s12, 0xffff, s12
; GFX8-NEXT: s_lshr_b32 s1, s1, 1
-; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX8-NEXT: s_andn2_b32 s2, 7, s2
; GFX8-NEXT: s_lshr_b32 s3, s0, 8
; GFX8-NEXT: s_lshr_b32 s4, s0, 16
; GFX8-NEXT: s_lshr_b32 s5, s0, 24
@@ -1042,29 +991,24 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
; GFX8-NEXT: s_lshr_b32 s1, s1, s2
; GFX8-NEXT: s_or_b32 s0, s0, s1
; GFX8-NEXT: s_and_b32 s1, s9, 7
-; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
; GFX8-NEXT: s_and_b32 s2, s6, 0xff
; GFX8-NEXT: s_lshl_b32 s1, s3, s1
-; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
-; GFX8-NEXT: s_andn2_b32 s3, 7, s9
; GFX8-NEXT: s_lshr_b32 s2, s2, 1
-; GFX8-NEXT: s_and_b32 s3, 0xffff, s3
+; GFX8-NEXT: s_andn2_b32 s3, 7, s9
; GFX8-NEXT: s_lshr_b32 s2, s2, s3
; GFX8-NEXT: s_or_b32 s1, s1, s2
; GFX8-NEXT: s_and_b32 s2, s10, 7
-; GFX8-NEXT: s_lshl_b32 s2, s4, s2
; GFX8-NEXT: s_and_b32 s3, s7, 0xff
-; GFX8-NEXT: s_andn2_b32 s4, 7, s10
+; GFX8-NEXT: s_lshl_b32 s2, s4, s2
; GFX8-NEXT: s_lshr_b32 s3, s3, 1
-; GFX8-NEXT: s_and_b32 s4, 0xffff, s4
+; GFX8-NEXT: s_andn2_b32 s4, 7, s10
; GFX8-NEXT: s_lshr_b32 s3, s3, s4
; GFX8-NEXT: s_or_b32 s2, s2, s3
; GFX8-NEXT: s_and_b32 s3, s11, 7
-; GFX8-NEXT: s_lshl_b32 s3, s5, s3
-; GFX8-NEXT: s_andn2_b32 s5, 7, s11
; GFX8-NEXT: s_and_b32 s1, s1, 0xff
+; GFX8-NEXT: s_lshl_b32 s3, s5, s3
; GFX8-NEXT: s_lshr_b32 s4, s8, 1
-; GFX8-NEXT: s_and_b32 s5, 0xffff, s5
+; GFX8-NEXT: s_andn2_b32 s5, 7, s11
; GFX8-NEXT: s_and_b32 s0, s0, 0xff
; GFX8-NEXT: s_lshl_b32 s1, s1, 8
; GFX8-NEXT: s_lshr_b32 s4, s4, s5
@@ -1088,11 +1032,8 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
; GFX9-NEXT: s_lshr_b32 s10, s2, 16
; GFX9-NEXT: s_lshr_b32 s11, s2, 24
; GFX9-NEXT: s_and_b32 s12, s2, 7
-; GFX9-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX9-NEXT: s_andn2_b32 s2, 7, s2
-; GFX9-NEXT: s_and_b32 s12, 0xffff, s12
; GFX9-NEXT: s_lshr_b32 s1, s1, 1
-; GFX9-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX9-NEXT: s_andn2_b32 s2, 7, s2
; GFX9-NEXT: s_lshr_b32 s3, s0, 8
; GFX9-NEXT: s_lshr_b32 s4, s0, 16
; GFX9-NEXT: s_lshr_b32 s5, s0, 24
@@ -1100,29 +1041,24 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
; GFX9-NEXT: s_lshr_b32 s1, s1, s2
; GFX9-NEXT: s_or_b32 s0, s0, s1
; GFX9-NEXT: s_and_b32 s1, s9, 7
-; GFX9-NEXT: s_and_b32 s1, 0xffff, s1
; GFX9-NEXT: s_and_b32 s2, s6, 0xff
; GFX9-NEXT: s_lshl_b32 s1, s3, s1
-; GFX9-NEXT: s_and_b32 s2, 0xffff, s2
-; GFX9-NEXT: s_andn2_b32 s3, 7, s9
; GFX9-NEXT: s_lshr_b32 s2, s2, 1
-; GFX9-NEXT: s_and_b32 s3, 0xffff, s3
+; GFX9-NEXT: s_andn2_b32 s3, 7, s9
; GFX9-NEXT: s_lshr_b32 s2, s2, s3
; GFX9-NEXT: s_or_b32 s1, s1, s2
; GFX9-NEXT: s_and_b32 s2, s10, 7
-; GFX9-NEXT: s_lshl_b32 s2, s4, s2
; GFX9-NEXT: s_and_b32 s3, s7, 0xff
-; GFX9-NEXT: s_andn2_b32 s4, 7, s10
+; GFX9-NEXT: s_lshl_b32 s2, s4, s2
; GFX9-NEXT: s_lshr_b32 s3, s3, 1
-; GFX9-NEXT: s_and_b32 s4, 0xffff, s4
+; GFX9-NEXT: s_andn2_b32 s4, 7, s10
; GFX9-NEXT: s_lshr_b32 s3, s3, s4
; GFX9-NEXT: s_or_b32 s2, s2, s3
; GFX9-NEXT: s_and_b32 s3, s11, 7
-; GFX9-NEXT: s_lshl_b32 s3, s5, s3
-; GFX9-NEXT: s_andn2_b32 s5, 7, s11
; GFX9-NEXT: s_and_b32 s1, s1, 0xff
+; GFX9-NEXT: s_lshl_b32 s3, s5, s3
; GFX9-NEXT: s_lshr_b32 s4, s8, 1
-; GFX9-NEXT: s_and_b32 s5, 0xffff, s5
+; GFX9-NEXT: s_andn2_b32 s5, 7, s11
; GFX9-NEXT: s_and_b32 s0, s0, 0xff
; GFX9-NEXT: s_lshl_b32 s1, s1, 8
; GFX9-NEXT: s_lshr_b32 s4, s4, s5
@@ -1146,41 +1082,33 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
; GFX10-NEXT: s_lshr_b32 s10, s2, 16
; GFX10-NEXT: s_lshr_b32 s11, s2, 24
; GFX10-NEXT: s_and_b32 s12, s2, 7
-; GFX10-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX10-NEXT: s_andn2_b32 s2, 7, s2
; GFX10-NEXT: s_lshr_b32 s1, s1, 1
-; GFX10-NEXT: s_and_b32 s2, 0xffff, s2
-; GFX10-NEXT: s_and_b32 s6, s6, 0xff
+; GFX10-NEXT: s_andn2_b32 s2, 7, s2
+; GFX10-NEXT: s_lshr_b32 s3, s0, 8
; GFX10-NEXT: s_lshr_b32 s1, s1, s2
-; GFX10-NEXT: s_and_b32 s2, s9, 7
-; GFX10-NEXT: s_and_b32 s6, 0xffff, s6
+; GFX10-NEXT: s_and_b32 s2, s6, 0xff
+; GFX10-NEXT: s_and_b32 s6, s9, 7
+; GFX10-NEXT: s_lshr_b32 s2, s2, 1
; GFX10-NEXT: s_andn2_b32 s9, 7, s9
-; GFX10-NEXT: s_lshr_b32 s3, s0, 8
-; GFX10-NEXT: s_and_b32 s12, 0xffff, s12
-; GFX10-NEXT: s_and_b32 s2, 0xffff, s2
-; GFX10-NEXT: s_lshr_b32 s6, s6, 1
-; GFX10-NEXT: s_and_b32 s9, 0xffff, s9
; GFX10-NEXT: s_lshr_b32 s4, s0, 16
; GFX10-NEXT: s_lshr_b32 s5, s0, 24
; GFX10-NEXT: s_lshl_b32 s0, s0, s12
-; GFX10-NEXT: s_lshl_b32 s2, s3, s2
-; GFX10-NEXT: s_lshr_b32 s3, s6, s9
+; GFX10-NEXT: s_lshl_b32 s3, s3, s6
+; GFX10-NEXT: s_lshr_b32 s2, s2, s9
; GFX10-NEXT: s_or_b32 s0, s0, s1
-; GFX10-NEXT: s_or_b32 s1, s2, s3
-; GFX10-NEXT: s_and_b32 s3, s7, 0xff
+; GFX10-NEXT: s_or_b32 s1, s3, s2
+; GFX10-NEXT: s_and_b32 s2, s7, 0xff
+; GFX10-NEXT: s_and_b32 s3, s10, 7
+; GFX10-NEXT: s_lshr_b32 s2, s2, 1
; GFX10-NEXT: s_andn2_b32 s6, 7, s10
-; GFX10-NEXT: s_lshr_b32 s3, s3, 1
-; GFX10-NEXT: s_and_b32 s6, 0xffff, s6
-; GFX10-NEXT: s_and_b32 s2, s10, 7
-; GFX10-NEXT: s_lshr_b32 s3, s3, s6
-; GFX10-NEXT: s_andn2_b32 s6, 7, s11
-; GFX10-NEXT: s_lshl_b32 s2, s4, s2
+; GFX10-NEXT: s_lshl_b32 s3, s4, s3
+; GFX10-NEXT: s_lshr_b32 s2, s2, s6
; GFX10-NEXT: s_and_b32 s4, s11, 7
-; GFX10-NEXT: s_lshr_b32 s7, s8, 1
-; GFX10-NEXT: s_and_b32 s6, 0xffff, s6
+; GFX10-NEXT: s_lshr_b32 s6, s8, 1
+; GFX10-NEXT: s_andn2_b32 s7, 7, s11
; GFX10-NEXT: s_lshl_b32 s4, s5, s4
-; GFX10-NEXT: s_lshr_b32 s5, s7, s6
-; GFX10-NEXT: s_or_b32 s2, s2, s3
+; GFX10-NEXT: s_lshr_b32 s5, s6, s7
+; GFX10-NEXT: s_or_b32 s2, s3, s2
; GFX10-NEXT: s_and_b32 s1, s1, 0xff
; GFX10-NEXT: s_or_b32 s3, s4, s5
; GFX10-NEXT: s_and_b32 s0, s0, 0xff
@@ -1204,41 +1132,33 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
; GFX11-NEXT: s_lshr_b32 s10, s2, 16
; GFX11-NEXT: s_lshr_b32 s11, s2, 24
; GFX11-NEXT: s_and_b32 s12, s2, 7
-; GFX11-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX11-NEXT: s_and_not1_b32 s2, 7, s2
; GFX11-NEXT: s_lshr_b32 s1, s1, 1
-; GFX11-NEXT: s_and_b32 s2, 0xffff, s2
-; GFX11-NEXT: s_and_b32 s6, s6, 0xff
+; GFX11-NEXT: s_and_not1_b32 s2, 7, s2
+; GFX11-NEXT: s_lshr_b32 s3, s0, 8
; GFX11-NEXT: s_lshr_b32 s1, s1, s2
-; GFX11-NEXT: s_and_b32 s2, s9, 7
-; GFX11-NEXT: s_and_b32 s6, 0xffff, s6
+; GFX11-NEXT: s_and_b32 s2, s6, 0xff
+; GFX11-NEXT: s_and_b32 s6, s9, 7
+; GFX11-NEXT: s_lshr_b32 s2, s2, 1
; GFX11-NEXT: s_and_not1_b32 s9, 7, s9
-; GFX11-NEXT: s_lshr_b32 s3, s0, 8
-; GFX11-NEXT: s_and_b32 s12, 0xffff, s12
-; GFX11-NEXT: s_and_b32 s2, 0xffff, s2
-; GFX11-NEXT: s_lshr_b32 s6, s6, 1
-; GFX11-NEXT: s_and_b32 s9, 0xffff, s9
; GFX11-NEXT: s_lshr_b32 s4, s0, 16
; GFX11-NEXT: s_lshr_b32 s5, s0, 24
; GFX11-NEXT: s_lshl_b32 s0, s0, s12
-; GFX11-NEXT: s_lshl_b32 s2, s3, s2
-; GFX11-NEXT: s_lshr_b32 s3, s6, s9
+; GFX11-NEXT: s_lshl_b32 s3, s3, s6
+; GFX11-NEXT: s_lshr_b32 s2, s2, s9
; GFX11-NEXT: s_or_b32 s0, s0, s1
-; GFX11-NEXT: s_or_b32 s1, s2, s3
-; GFX11-NEXT: s_and_b32 s3, s7, 0xff
+; GFX11-NEXT: s_or_b32 s1, s3, s2
+; GFX11-NEXT: s_and_b32 s2, s7, 0xff
+; GFX11-NEXT: s_and_b32 s3, s10, 7
+; GFX11-NEXT: s_lshr_b32 s2, s2, 1
; GFX11-NEXT: s_and_not1_b32 s6, 7, s10
-; GFX11-NEXT: s_lshr_b32 s3, s3, 1
-; GFX11-NEXT: s_and_b32 s6, 0xffff, s6
-; GFX11-NEXT: s_and_b32 s2, s10, 7
-; GFX11-NEXT: s_lshr_b32 s3, s3, s6
-; GFX11-NEXT: s_and_not1_b32 s6, 7, s11
-; GFX11-NEXT: s_lshl_b32 s2, s4, s2
+; GFX11-NEXT: s_lshl_b32 s3, s4, s3
+; GFX11-NEXT: s_lshr_b32 s2, s2, s6
; GFX11-NEXT: s_and_b32 s4, s11, 7
-; GFX11-NEXT: s_lshr_b32 s7, s8, 1
-; GFX11-NEXT: s_and_b32 s6, 0xffff, s6
+; GFX11-NEXT: s_lshr_b32 s6, s8, 1
+; GFX11-NEXT: s_and_not1_b32 s7, 7, s11
; GFX11-NEXT: s_lshl_b32 s4, s5, s4
-; GFX11-NEXT: s_lshr_b32 s5, s7, s6
-; GFX11-NEXT: s_or_b32 s2, s2, s3
+; GFX11-NEXT: s_lshr_b32 s5, s6, s7
+; GFX11-NEXT: s_or_b32 s2, s3, s2
; GFX11-NEXT: s_and_b32 s1, s1, 0xff
; GFX11-NEXT: s_or_b32 s3, s4, s5
; GFX11-NEXT: s_and_b32 s0, s0, 0xff
@@ -1852,20 +1772,19 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2
; GFX6-NEXT: s_bfe_u32 s9, s0, 0x80008
; GFX6-NEXT: s_lshr_b32 s6, s0, 16
-; GFX6-NEXT: s_lshr_b32 s7, s1, 8
+; GFX6-NEXT: s_and_b32 s8, s0, 0xff
; GFX6-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2
-; GFX6-NEXT: s_and_b32 s8, s0, 0xff
; GFX6-NEXT: s_lshl_b32 s9, s9, 8
+; GFX6-NEXT: s_lshr_b32 s7, s1, 8
; GFX6-NEXT: s_or_b32 s8, s8, s9
; GFX6-NEXT: s_and_b32 s6, s6, 0xff
; GFX6-NEXT: s_and_b32 s1, s1, 0xff
; GFX6-NEXT: v_mov_b32_e32 v0, s0
-; GFX6-NEXT: s_and_b32 s0, s7, 0xff
; GFX6-NEXT: s_and_b32 s8, 0xffff, s8
; GFX6-NEXT: s_lshl_b32 s6, s6, 16
; GFX6-NEXT: v_alignbit_b32 v0, s1, v0, 24
-; GFX6-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX6-NEXT: s_and_b32 s0, s7, 0xff
; GFX6-NEXT: v_not_b32_e32 v3, 23
; GFX6-NEXT: s_or_b32 s6, s8, s6
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
@@ -1874,18 +1793,17 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
; GFX6-NEXT: v_mul_lo_u32 v4, v2, v3
; GFX6-NEXT: v_or_b32_e32 v0, s0, v0
; GFX6-NEXT: s_lshr_b32 s0, s2, 16
-; GFX6-NEXT: s_lshr_b32 s1, s3, 8
; GFX6-NEXT: s_and_b32 s7, s2, 0xff
; GFX6-NEXT: s_lshl_b32 s8, s8, 8
+; GFX6-NEXT: s_lshr_b32 s1, s3, 8
; GFX6-NEXT: s_or_b32 s7, s7, s8
; GFX6-NEXT: s_and_b32 s0, s0, 0xff
; GFX6-NEXT: s_and_b32 s3, s3, 0xff
; GFX6-NEXT: v_mov_b32_e32 v1, s2
-; GFX6-NEXT: s_and_b32 s1, s1, 0xff
; GFX6-NEXT: s_and_b32 s7, 0xffff, s7
; GFX6-NEXT: s_lshl_b32 s0, s0, 16
; GFX6-NEXT: v_alignbit_b32 v1, s3, v1, 24
-; GFX6-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX6-NEXT: s_and_b32 s1, s1, 0xff
; GFX6-NEXT: s_or_b32 s0, s7, s0
; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX6-NEXT: s_lshl_b32 s1, s1, 16
@@ -1905,9 +1823,8 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
; GFX6-NEXT: s_lshr_b32 s2, s5, 8
; GFX6-NEXT: s_and_b32 s3, s5, 0xff
; GFX6-NEXT: v_mov_b32_e32 v5, s4
-; GFX6-NEXT: s_and_b32 s2, s2, 0xff
; GFX6-NEXT: v_alignbit_b32 v5, s3, v5, 24
-; GFX6-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX6-NEXT: s_and_b32 s2, s2, 0xff
; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX6-NEXT: v_mul_lo_u32 v4, v4, 24
; GFX6-NEXT: s_lshl_b32 s2, s2, 16
@@ -1964,48 +1881,42 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
; GFX8: ; %bb.0:
; GFX8-NEXT: s_lshr_b32 s6, s0, 8
; GFX8-NEXT: s_and_b32 s6, s6, 0xff
+; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, 24
; GFX8-NEXT: s_lshr_b32 s7, s0, 16
; GFX8-NEXT: s_lshr_b32 s8, s0, 24
; GFX8-NEXT: s_and_b32 s0, s0, 0xff
; GFX8-NEXT: s_lshl_b32 s6, s6, 8
+; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX8-NEXT: s_or_b32 s0, s0, s6
; GFX8-NEXT: s_and_b32 s6, s7, 0xff
-; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, 24
; GFX8-NEXT: s_lshr_b32 s9, s1, 8
-; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
; GFX8-NEXT: s_lshl_b32 s6, s6, 16
; GFX8-NEXT: s_and_b32 s1, s1, 0xff
-; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX8-NEXT: s_or_b32 s0, s0, s6
; GFX8-NEXT: s_lshl_b32 s1, s1, 8
; GFX8-NEXT: s_and_b32 s6, s9, 0xff
; GFX8-NEXT: s_or_b32 s1, s8, s1
-; GFX8-NEXT: s_and_b32 s6, 0xffff, s6
-; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
; GFX8-NEXT: s_lshl_b32 s6, s6, 16
+; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX8-NEXT: s_or_b32 s1, s1, s6
; GFX8-NEXT: s_lshr_b32 s6, s2, 8
-; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
-; GFX8-NEXT: s_and_b32 s6, s6, 0xff
; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0
+; GFX8-NEXT: s_and_b32 s6, s6, 0xff
; GFX8-NEXT: s_lshr_b32 s7, s2, 16
; GFX8-NEXT: s_lshr_b32 s8, s2, 24
; GFX8-NEXT: s_and_b32 s2, s2, 0xff
; GFX8-NEXT: s_lshl_b32 s6, s6, 8
; GFX8-NEXT: s_or_b32 s2, s2, s6
; GFX8-NEXT: s_and_b32 s6, s7, 0xff
+; GFX8-NEXT: v_not_b32_e32 v1, 23
; GFX8-NEXT: s_lshr_b32 s9, s3, 8
-; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
; GFX8-NEXT: s_lshl_b32 s6, s6, 16
; GFX8-NEXT: s_and_b32 s3, s3, 0xff
-; GFX8-NEXT: v_not_b32_e32 v1, 23
+; GFX8-NEXT: v_mul_lo_u32 v2, v0, v1
; GFX8-NEXT: s_or_b32 s2, s2, s6
; GFX8-NEXT: s_lshl_b32 s3, s3, 8
; GFX8-NEXT: s_and_b32 s6, s9, 0xff
-; GFX8-NEXT: v_mul_lo_u32 v2, v0, v1
; GFX8-NEXT: s_or_b32 s3, s8, s3
-; GFX8-NEXT: s_and_b32 s6, 0xffff, s6
-; GFX8-NEXT: s_and_b32 s3, 0xffff, s3
; GFX8-NEXT: s_lshl_b32 s6, s6, 16
; GFX8-NEXT: s_or_b32 s3, s3, s6
; GFX8-NEXT: s_lshr_b32 s6, s4, 8
@@ -2017,7 +1928,6 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
; GFX8-NEXT: s_lshl_b32 s6, s6, 8
; GFX8-NEXT: s_or_b32 s4, s4, s6
; GFX8-NEXT: s_and_b32 s6, s7, 0xff
-; GFX8-NEXT: s_and_b32 s4, 0xffff, s4
; GFX8-NEXT: s_lshl_b32 s6, s6, 16
; GFX8-NEXT: s_or_b32 s4, s4, s6
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
@@ -2028,8 +1938,6 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
; GFX8-NEXT: v_mul_lo_u32 v2, v2, 24
; GFX8-NEXT: s_and_b32 s6, s9, 0xff
; GFX8-NEXT: s_or_b32 s5, s8, s5
-; GFX8-NEXT: s_and_b32 s6, 0xffff, s6
-; GFX8-NEXT: s_and_b32 s5, 0xffff, s5
; GFX8-NEXT: s_lshl_b32 s6, s6, 16
; GFX8-NEXT: s_or_b32 s5, s5, s6
; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s4, v2
@@ -2081,48 +1989,42 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
; GFX9: ; %bb.0:
; GFX9-NEXT: s_lshr_b32 s6, s0, 8
; GFX9-NEXT: s_and_b32 s6, s6, 0xff
+; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, 24
; GFX9-NEXT: s_lshr_b32 s7, s0, 16
; GFX9-NEXT: s_lshr_b32 s8, s0, 24
; GFX9-NEXT: s_and_b32 s0, s0, 0xff
; GFX9-NEXT: s_lshl_b32 s6, s6, 8
+; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX9-NEXT: s_or_b32 s0, s0, s6
; GFX9-NEXT: s_and_b32 s6, s7, 0xff
-; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, 24
; GFX9-NEXT: s_lshr_b32 s9, s1, 8
-; GFX9-NEXT: s_and_b32 s0, 0xffff, s0
; GFX9-NEXT: s_lshl_b32 s6, s6, 16
; GFX9-NEXT: s_and_b32 s1, s1, 0xff
-; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX9-NEXT: s_or_b32 s0, s0, s6
; GFX9-NEXT: s_lshl_b32 s1, s1, 8
; GFX9-NEXT: s_and_b32 s6, s9, 0xff
; GFX9-NEXT: s_or_b32 s1, s8, s1
-; GFX9-NEXT: s_and_b32 s6, 0xffff, s6
-; GFX9-NEXT: s_and_b32 s1, 0xffff, s1
; GFX9-NEXT: s_lshl_b32 s6, s6, 16
+; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT: s_or_b32 s1, s1, s6
; GFX9-NEXT: s_lshr_b32 s6, s2, 8
-; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
-; GFX9-NEXT: s_and_b32 s6, s6, 0xff
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
+; GFX9-NEXT: s_and_b32 s6, s6, 0xff
; GFX9-NEXT: s_lshr_b32 s7, s2, 16
; GFX9-NEXT: s_lshr_b32 s8, s2, 24
; GFX9-NEXT: s_and_b32 s2, s2, 0xff
; GFX9-NEXT: s_lshl_b32 s6, s6, 8
; GFX9-NEXT: s_or_b32 s2, s2, s6
; GFX9-NEXT: s_and_b32 s6, s7, 0xff
+; GFX9-NEXT: v_not_b32_e32 v1, 23
; GFX9-NEXT: s_lshr_b32 s9, s3, 8
-; GFX9-NEXT: s_and_b32 s2, 0xffff, s2
; GFX9-NEXT: s_lshl_b32 s6, s6, 16
; GFX9-NEXT: s_and_b32 s3, s3, 0xff
-; GFX9-NEXT: v_not_b32_e32 v1, 23
+; GFX9-NEXT: v_mul_lo_u32 v1, v0, v1
; GFX9-NEXT: s_or_b32 s2, s2, s6
; GFX9-NEXT: s_lshl_b32 s3, s3, 8
; GFX9-NEXT: s_and_b32 s6, s9, 0xff
-; GFX9-NEXT: v_mul_lo_u32 v1, v0, v1
; GFX9-NEXT: s_or_b32 s3, s8, s3
-; GFX9-NEXT: s_and_b32 s6, 0xffff, s6
-; GFX9-NEXT: s_and_b32 s3, 0xffff, s3
; GFX9-NEXT: s_lshl_b32 s6, s6, 16
; GFX9-NEXT: s_or_b32 s3, s3, s6
; GFX9-NEXT: s_lshr_b32 s6, s4, 8
@@ -2134,7 +2036,6 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
; GFX9-NEXT: s_lshl_b32 s6, s6, 8
; GFX9-NEXT: s_or_b32 s4, s4, s6
; GFX9-NEXT: s_and_b32 s6, s7, 0xff
-; GFX9-NEXT: s_and_b32 s4, 0xffff, s4
; GFX9-NEXT: s_lshl_b32 s6, s6, 16
; GFX9-NEXT: s_or_b32 s4, s4, s6
; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
@@ -2142,11 +2043,9 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
; GFX9-NEXT: s_lshr_b32 s9, s5, 8
; GFX9-NEXT: s_and_b32 s5, s5, 0xff
; GFX9-NEXT: s_lshl_b32 s5, s5, 8
+; GFX9-NEXT: v_mul_lo_u32 v1, v1, 24
; GFX9-NEXT: s_and_b32 s6, s9, 0xff
; GFX9-NEXT: s_or_b32 s5, s8, s5
-; GFX9-NEXT: v_mul_lo_u32 v1, v1, 24
-; GFX9-NEXT: s_and_b32 s6, 0xffff, s6
-; GFX9-NEXT: s_and_b32 s5, 0xffff, s5
; GFX9-NEXT: s_lshl_b32 s6, s6, 16
; GFX9-NEXT: s_or_b32 s5, s5, s6
; GFX9-NEXT: v_mul_hi_u32 v0, s5, v0
@@ -2196,87 +2095,78 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
; GFX10-LABEL: s_fshl_v2i24:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, 24
-; GFX10-NEXT: s_lshr_b32 s14, s4, 8
-; GFX10-NEXT: s_lshr_b32 s15, s4, 16
-; GFX10-NEXT: s_and_b32 s14, s14, 0xff
-; GFX10-NEXT: s_lshr_b32 s16, s4, 24
+; GFX10-NEXT: s_lshr_b32 s10, s4, 8
+; GFX10-NEXT: s_lshr_b32 s11, s4, 16
+; GFX10-NEXT: s_and_b32 s10, s10, 0xff
+; GFX10-NEXT: s_lshr_b32 s12, s4, 24
; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX10-NEXT: s_and_b32 s4, s4, 0xff
-; GFX10-NEXT: s_lshl_b32 s14, s14, 8
-; GFX10-NEXT: s_and_b32 s15, s15, 0xff
-; GFX10-NEXT: s_or_b32 s4, s4, s14
-; GFX10-NEXT: s_lshr_b32 s17, s5, 8
+; GFX10-NEXT: s_and_b32 s11, s11, 0xff
+; GFX10-NEXT: s_lshl_b32 s10, s10, 8
+; GFX10-NEXT: s_lshl_b32 s11, s11, 16
+; GFX10-NEXT: s_or_b32 s4, s4, s10
+; GFX10-NEXT: s_lshr_b32 s13, s5, 8
; GFX10-NEXT: s_and_b32 s5, s5, 0xff
-; GFX10-NEXT: s_lshl_b32 s15, s15, 16
-; GFX10-NEXT: s_and_b32 s4, 0xffff, s4
+; GFX10-NEXT: s_or_b32 s4, s4, s11
; GFX10-NEXT: s_lshl_b32 s5, s5, 8
; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
-; GFX10-NEXT: s_and_b32 s17, s17, 0xff
-; GFX10-NEXT: s_or_b32 s4, s4, s15
-; GFX10-NEXT: s_or_b32 s5, s16, s5
-; GFX10-NEXT: s_and_b32 s16, 0xffff, s17
+; GFX10-NEXT: s_and_b32 s13, s13, 0xff
+; GFX10-NEXT: s_or_b32 s5, s12, s5
+; GFX10-NEXT: s_lshl_b32 s10, s13, 16
+; GFX10-NEXT: s_lshr_b32 s9, s1, 8
; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX10-NEXT: s_and_b32 s5, 0xffff, s5
-; GFX10-NEXT: s_lshl_b32 s14, s16, 16
+; GFX10-NEXT: s_or_b32 s5, s5, s10
+; GFX10-NEXT: s_and_b32 s1, s1, 0xff
; GFX10-NEXT: s_lshr_b32 s10, s2, 8
-; GFX10-NEXT: s_or_b32 s5, s5, s14
+; GFX10-NEXT: s_lshr_b32 s8, s0, 24
; GFX10-NEXT: v_mul_lo_u32 v2, 0xffffffe8, v0
-; GFX10-NEXT: s_lshr_b32 s9, s1, 8
+; GFX10-NEXT: s_lshr_b32 s11, s2, 16
; GFX10-NEXT: s_lshr_b32 s13, s3, 8
; GFX10-NEXT: s_and_b32 s3, s3, 0xff
+; GFX10-NEXT: s_lshl_b32 s1, s1, 8
; GFX10-NEXT: s_and_b32 s10, s10, 0xff
-; GFX10-NEXT: s_and_b32 s1, s1, 0xff
-; GFX10-NEXT: s_lshr_b32 s11, s2, 16
; GFX10-NEXT: s_lshr_b32 s12, s2, 24
-; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2
; GFX10-NEXT: s_and_b32 s2, s2, 0xff
-; GFX10-NEXT: s_and_b32 s9, s9, 0xff
+; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2
; GFX10-NEXT: s_lshl_b32 s3, s3, 8
-; GFX10-NEXT: s_and_b32 s13, s13, 0xff
-; GFX10-NEXT: s_lshr_b32 s8, s0, 24
-; GFX10-NEXT: s_lshl_b32 s1, s1, 8
-; GFX10-NEXT: s_and_b32 s11, s11, 0xff
-; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2
+; GFX10-NEXT: s_or_b32 s1, s8, s1
+; GFX10-NEXT: s_lshl_b32 s8, s10, 8
; GFX10-NEXT: s_or_b32 s3, s12, s3
+; GFX10-NEXT: s_or_b32 s2, s2, s8
; GFX10-NEXT: s_lshr_b32 s6, s0, 8
-; GFX10-NEXT: s_or_b32 s1, s8, s1
-; GFX10-NEXT: s_lshl_b32 s8, s11, 16
-; GFX10-NEXT: v_mul_hi_u32 v2, s4, v0
-; GFX10-NEXT: v_mul_hi_u32 v0, s5, v0
-; GFX10-NEXT: s_and_b32 s3, 0xffff, s3
-; GFX10-NEXT: s_and_b32 s6, s6, 0xff
; GFX10-NEXT: s_lshr_b32 s7, s0, 16
+; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2
+; GFX10-NEXT: s_and_b32 s6, s6, 0xff
; GFX10-NEXT: s_and_b32 s0, s0, 0xff
-; GFX10-NEXT: s_lshl_b32 s6, s6, 8
; GFX10-NEXT: s_and_b32 s7, s7, 0xff
-; GFX10-NEXT: v_mul_lo_u32 v2, v2, 24
-; GFX10-NEXT: v_mul_lo_u32 v0, v0, 24
-; GFX10-NEXT: s_or_b32 s0, s0, s6
+; GFX10-NEXT: s_and_b32 s9, s9, 0xff
+; GFX10-NEXT: v_mul_hi_u32 v2, s4, v0
+; GFX10-NEXT: v_mul_hi_u32 v0, s5, v0
+; GFX10-NEXT: s_lshl_b32 s6, s6, 8
; GFX10-NEXT: s_lshl_b32 s7, s7, 16
-; GFX10-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX10-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX10-NEXT: s_or_b32 s0, s0, s6
; GFX10-NEXT: v_mov_b32_e32 v1, 8
; GFX10-NEXT: s_or_b32 s0, s0, s7
+; GFX10-NEXT: v_mul_lo_u32 v2, v2, 24
+; GFX10-NEXT: v_mul_lo_u32 v0, v0, 24
; GFX10-NEXT: v_sub_nc_u32_e32 v2, s4, v2
; GFX10-NEXT: v_sub_nc_u32_e32 v0, s5, v0
-; GFX10-NEXT: s_lshl_b32 s5, s10, 8
-; GFX10-NEXT: s_and_b32 s4, 0xffff, s9
-; GFX10-NEXT: s_and_b32 s9, 0xffff, s13
+; GFX10-NEXT: s_and_b32 s4, s11, 0xff
+; GFX10-NEXT: s_and_b32 s5, s13, 0xff
+; GFX10-NEXT: s_lshl_b32 s4, s4, 16
; GFX10-NEXT: v_add_nc_u32_e32 v3, 0xffffffe8, v2
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v2
; GFX10-NEXT: v_add_nc_u32_e32 v4, 0xffffffe8, v0
-; GFX10-NEXT: s_or_b32 s2, s2, s5
-; GFX10-NEXT: s_lshl_b32 s5, s9, 16
-; GFX10-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX10-NEXT: s_lshl_b32 s5, s5, 16
+; GFX10-NEXT: s_or_b32 s2, s2, s4
+; GFX10-NEXT: s_or_b32 s3, s3, s5
; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0
-; GFX10-NEXT: s_or_b32 s3, s3, s5
-; GFX10-NEXT: s_or_b32 s2, s2, s8
; GFX10-NEXT: s_lshr_b32 s3, s3, 1
+; GFX10-NEXT: s_lshr_b32 s2, s2, 1
; GFX10-NEXT: v_add_nc_u32_e32 v3, 0xffffffe8, v2
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v2
-; GFX10-NEXT: s_lshr_b32 s2, s2, 1
; GFX10-NEXT: v_add_nc_u32_e32 v4, 0xffffffe8, v0
; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0
@@ -2288,7 +2178,7 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
; GFX10-NEXT: v_and_b32_e32 v0, 0xffffff, v0
; GFX10-NEXT: v_lshrrev_b32_e64 v3, v3, s2
; GFX10-NEXT: v_and_b32_e32 v4, 0xffffff, v4
-; GFX10-NEXT: s_lshl_b32 s2, s4, 16
+; GFX10-NEXT: s_lshl_b32 s2, s9, 16
; GFX10-NEXT: s_or_b32 s1, s1, s2
; GFX10-NEXT: v_lshl_or_b32 v2, s0, v2, v3
; GFX10-NEXT: v_lshrrev_b32_e64 v4, v4, s3
@@ -2310,125 +2200,120 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
; GFX11-LABEL: s_fshl_v2i24:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, 24
-; GFX11-NEXT: s_lshr_b32 s14, s4, 8
-; GFX11-NEXT: s_lshr_b32 s15, s4, 16
-; GFX11-NEXT: s_and_b32 s14, s14, 0xff
-; GFX11-NEXT: s_lshr_b32 s16, s4, 24
+; GFX11-NEXT: s_lshr_b32 s10, s4, 8
+; GFX11-NEXT: s_lshr_b32 s11, s4, 16
+; GFX11-NEXT: s_and_b32 s10, s10, 0xff
+; GFX11-NEXT: s_lshr_b32 s12, s4, 24
; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX11-NEXT: s_and_b32 s4, s4, 0xff
-; GFX11-NEXT: s_lshl_b32 s14, s14, 8
-; GFX11-NEXT: s_and_b32 s15, s15, 0xff
-; GFX11-NEXT: s_or_b32 s4, s4, s14
-; GFX11-NEXT: s_lshr_b32 s17, s5, 8
+; GFX11-NEXT: s_and_b32 s11, s11, 0xff
+; GFX11-NEXT: s_lshl_b32 s10, s10, 8
+; GFX11-NEXT: s_lshl_b32 s11, s11, 16
+; GFX11-NEXT: s_or_b32 s4, s4, s10
+; GFX11-NEXT: s_lshr_b32 s13, s5, 8
; GFX11-NEXT: s_and_b32 s5, s5, 0xff
-; GFX11-NEXT: s_lshl_b32 s15, s15, 16
-; GFX11-NEXT: s_and_b32 s4, 0xffff, s4
+; GFX11-NEXT: s_or_b32 s4, s4, s11
; GFX11-NEXT: s_lshl_b32 s5, s5, 8
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
-; GFX11-NEXT: s_and_b32 s17, s17, 0xff
-; GFX11-NEXT: s_or_b32 s4, s4, s15
-; GFX11-NEXT: s_or_b32 s5, s16, s5
-; GFX11-NEXT: s_and_b32 s16, 0xffff, s17
-; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX11-NEXT: s_and_b32 s5, 0xffff, s5
-; GFX11-NEXT: s_lshl_b32 s14, s16, 16
+; GFX11-NEXT: s_and_b32 s13, s13, 0xff
+; GFX11-NEXT: s_or_b32 s5, s12, s5
+; GFX11-NEXT: s_lshl_b32 s10, s13, 16
; GFX11-NEXT: s_lshr_b32 s9, s1, 8
-; GFX11-NEXT: s_or_b32 s5, s5, s14
-; GFX11-NEXT: v_mul_lo_u32 v1, 0xffffffe8, v0
+; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
+; GFX11-NEXT: s_or_b32 s5, s5, s10
; GFX11-NEXT: s_and_b32 s1, s1, 0xff
; GFX11-NEXT: s_lshr_b32 s10, s2, 8
; GFX11-NEXT: s_lshr_b32 s8, s0, 24
+; GFX11-NEXT: v_mul_lo_u32 v1, 0xffffffe8, v0
+; GFX11-NEXT: s_lshr_b32 s11, s2, 16
; GFX11-NEXT: s_lshl_b32 s1, s1, 8
+; GFX11-NEXT: s_and_b32 s9, s9, 0xff
; GFX11-NEXT: s_and_b32 s10, s10, 0xff
-; GFX11-NEXT: s_lshr_b32 s11, s2, 16
; GFX11-NEXT: s_lshr_b32 s12, s2, 24
-; GFX11-NEXT: v_mul_hi_u32 v1, v0, v1
; GFX11-NEXT: s_and_b32 s2, s2, 0xff
; GFX11-NEXT: s_or_b32 s1, s8, s1
-; GFX11-NEXT: s_lshl_b32 s8, s10, 8
-; GFX11-NEXT: s_and_b32 s11, s11, 0xff
-; GFX11-NEXT: s_or_b32 s2, s2, s8
+; GFX11-NEXT: v_mul_hi_u32 v1, v0, v1
+; GFX11-NEXT: s_lshl_b32 s8, s9, 16
+; GFX11-NEXT: s_lshl_b32 s9, s10, 8
; GFX11-NEXT: s_lshr_b32 s6, s0, 8
+; GFX11-NEXT: s_or_b32 s2, s2, s9
; GFX11-NEXT: s_lshr_b32 s13, s3, 8
-; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v1
; GFX11-NEXT: s_and_b32 s3, s3, 0xff
-; GFX11-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX11-NEXT: s_lshr_b32 s7, s0, 16
+; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v1
; GFX11-NEXT: s_and_b32 s6, s6, 0xff
-; GFX11-NEXT: s_and_b32 s9, s9, 0xff
-; GFX11-NEXT: v_mul_hi_u32 v1, s4, v0
-; GFX11-NEXT: v_mul_hi_u32 v0, s5, v0
; GFX11-NEXT: s_lshl_b32 s3, s3, 8
-; GFX11-NEXT: s_and_b32 s13, s13, 0xff
-; GFX11-NEXT: s_lshr_b32 s7, s0, 16
; GFX11-NEXT: s_and_b32 s0, s0, 0xff
+; GFX11-NEXT: s_and_b32 s7, s7, 0xff
+; GFX11-NEXT: v_mul_hi_u32 v1, s4, v0
+; GFX11-NEXT: v_mul_hi_u32 v0, s5, v0
; GFX11-NEXT: s_lshl_b32 s6, s6, 8
; GFX11-NEXT: s_or_b32 s3, s12, s3
+; GFX11-NEXT: s_lshl_b32 s7, s7, 16
+; GFX11-NEXT: s_or_b32 s0, s0, s6
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_or_b32 s0, s0, s7
; GFX11-NEXT: v_mul_lo_u32 v1, v1, 24
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_mul_lo_u32 v0, v0, 24
-; GFX11-NEXT: s_and_b32 s7, s7, 0xff
-; GFX11-NEXT: s_or_b32 s0, s0, s6
-; GFX11-NEXT: s_and_b32 s3, 0xffff, s3
-; GFX11-NEXT: s_lshl_b32 s7, s7, 16
-; GFX11-NEXT: s_and_b32 s0, 0xffff, s0
-; GFX11-NEXT: s_and_b32 s1, 0xffff, s1
; GFX11-NEXT: v_sub_nc_u32_e32 v1, s4, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_sub_nc_u32_e32 v0, s5, v0
-; GFX11-NEXT: s_lshl_b32 s5, s11, 16
-; GFX11-NEXT: s_and_b32 s4, 0xffff, s9
-; GFX11-NEXT: s_or_b32 s2, s2, s5
+; GFX11-NEXT: s_and_b32 s4, s11, 0xff
+; GFX11-NEXT: s_and_b32 s5, s13, 0xff
+; GFX11-NEXT: s_lshl_b32 s4, s4, 16
; GFX11-NEXT: v_add_nc_u32_e32 v2, 0xffffffe8, v1
; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1
; GFX11-NEXT: v_add_nc_u32_e32 v3, 0xffffffe8, v0
-; GFX11-NEXT: s_and_b32 s9, 0xffff, s13
+; GFX11-NEXT: s_or_b32 s2, s2, s4
+; GFX11-NEXT: s_lshl_b32 s5, s5, 16
; GFX11-NEXT: s_lshr_b32 s2, s2, 1
-; GFX11-NEXT: s_lshl_b32 s6, s9, 16
; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0
-; GFX11-NEXT: s_or_b32 s3, s3, s6
-; GFX11-NEXT: s_or_b32 s0, s0, s7
-; GFX11-NEXT: s_lshl_b32 s4, s4, 16
+; GFX11-NEXT: s_or_b32 s3, s3, s5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_add_nc_u32_e32 v2, 0xffffffe8, v1
; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_sub_nc_u32_e32 v2, 23, v1
; GFX11-NEXT: v_and_b32_e32 v1, 0xffffff, v1
; GFX11-NEXT: v_add_nc_u32_e32 v3, 0xffffffe8, v0
; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_and_b32_e32 v2, 0xffffff, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_lshrrev_b32_e64 v2, v2, s2
; GFX11-NEXT: s_lshr_b32 s2, s3, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_sub_nc_u32_e32 v3, 23, v0
; GFX11-NEXT: v_and_b32_e32 v0, 0xffffff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_lshl_or_b32 v1, s0, v1, v2
-; GFX11-NEXT: s_or_b32 s0, s1, s4
+; GFX11-NEXT: s_or_b32 s0, s1, s8
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_and_b32_e32 v3, 0xffffff, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_bfe_u32 v2, v1, 8, 8
-; GFX11-NEXT: v_lshrrev_b32_e64 v3, v3, s2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_lshrrev_b32_e64 v3, v3, s2
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_lshl_or_b32 v0, s0, v0, v3
; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 8
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_and_or_b32 v1, 0xff, v1, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v0
; GFX11-NEXT: v_bfe_u32 v5, v0, 8, 8
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX11-NEXT: v_bfe_u32 v0, v0, 16, 8
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 24, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_lshl_or_b32 v0, v0, 8, v5
-; GFX11-NEXT: v_or3_b32 v1, v1, v3, v4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or3_b32 v1, v1, v3, v4
; GFX11-NEXT: v_readfirstlane_b32 s1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_readfirstlane_b32 s0, v1
; GFX11-NEXT: ; return to shader part epilog
%lhs = bitcast i48 %lhs.arg to <2 x i24>
@@ -3295,9 +3180,7 @@ define amdgpu_ps i16 @s_fshl_i16(i16 inreg %lhs, i16 inreg %rhs, i16 inreg %amt)
; GFX6: ; %bb.0:
; GFX6-NEXT: s_and_b32 s3, s2, 15
; GFX6-NEXT: s_andn2_b32 s2, 15, s2
-; GFX6-NEXT: s_and_b32 s3, 0xffff, s3
; GFX6-NEXT: s_bfe_u32 s1, s1, 0xf0001
-; GFX6-NEXT: s_and_b32 s2, 0xffff, s2
; GFX6-NEXT: s_lshl_b32 s0, s0, s3
; GFX6-NEXT: s_lshr_b32 s1, s1, s2
; GFX6-NEXT: s_or_b32 s0, s0, s1
@@ -3305,12 +3188,10 @@ define amdgpu_ps i16 @s_fshl_i16(i16 inreg %lhs, i16 inreg %rhs, i16 inreg %amt)
;
; GFX8-LABEL: s_fshl_i16:
; GFX8: ; %bb.0:
+; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
; GFX8-NEXT: s_and_b32 s3, s2, 15
; GFX8-NEXT: s_andn2_b32 s2, 15, s2
-; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX8-NEXT: s_and_b32 s3, 0xffff, s3
; GFX8-NEXT: s_lshr_b32 s1, s1, 1
-; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
; GFX8-NEXT: s_lshl_b32 s0, s0, s3
; GFX8-NEXT: s_lshr_b32 s1, s1, s2
; GFX8-NEXT: s_or_b32 s0, s0, s1
@@ -3318,12 +3199,10 @@ define amdgpu_ps i16 @s_fshl_i16(i16 inreg %lhs, i16 inreg %rhs, i16 inreg %amt)
;
; GFX9-LABEL: s_fshl_i16:
; GFX9: ; %bb.0:
+; GFX9-NEXT: s_and_b32 s1, 0xffff, s1
; GFX9-NEXT: s_and_b32 s3, s2, 15
; GFX9-NEXT: s_andn2_b32 s2, 15, s2
-; GFX9-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX9-NEXT: s_and_b32 s3, 0xffff, s3
; GFX9-NEXT: s_lshr_b32 s1, s1, 1
-; GFX9-NEXT: s_and_b32 s2, 0xffff, s2
; GFX9-NEXT: s_lshl_b32 s0, s0, s3
; GFX9-NEXT: s_lshr_b32 s1, s1, s2
; GFX9-NEXT: s_or_b32 s0, s0, s1
@@ -3331,12 +3210,10 @@ define amdgpu_ps i16 @s_fshl_i16(i16 inreg %lhs, i16 inreg %rhs, i16 inreg %amt)
;
; GFX10-LABEL: s_fshl_i16:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_and_b32 s1, 0xffff, s1
; GFX10-NEXT: s_and_b32 s3, s2, 15
; GFX10-NEXT: s_andn2_b32 s2, 15, s2
-; GFX10-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX10-NEXT: s_and_b32 s3, 0xffff, s3
; GFX10-NEXT: s_lshr_b32 s1, s1, 1
-; GFX10-NEXT: s_and_b32 s2, 0xffff, s2
; GFX10-NEXT: s_lshl_b32 s0, s0, s3
; GFX10-NEXT: s_lshr_b32 s1, s1, s2
; GFX10-NEXT: s_or_b32 s0, s0, s1
@@ -3344,12 +3221,10 @@ define amdgpu_ps i16 @s_fshl_i16(i16 inreg %lhs, i16 inreg %rhs, i16 inreg %amt)
;
; GFX11-LABEL: s_fshl_i16:
; GFX11: ; %bb.0:
+; GFX11-NEXT: s_and_b32 s1, 0xffff, s1
; GFX11-NEXT: s_and_b32 s3, s2, 15
; GFX11-NEXT: s_and_not1_b32 s2, 15, s2
-; GFX11-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX11-NEXT: s_and_b32 s3, 0xffff, s3
; GFX11-NEXT: s_lshr_b32 s1, s1, 1
-; GFX11-NEXT: s_and_b32 s2, 0xffff, s2
; GFX11-NEXT: s_lshl_b32 s0, s0, s3
; GFX11-NEXT: s_lshr_b32 s1, s1, s2
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -3679,9 +3554,7 @@ define amdgpu_ps half @v_fshl_i16_svs(i16 inreg %lhs, i16 %rhs, i16 inreg %amt)
; GFX6: ; %bb.0:
; GFX6-NEXT: s_and_b32 s2, s1, 15
; GFX6-NEXT: s_andn2_b32 s1, 15, s1
-; GFX6-NEXT: s_and_b32 s2, 0xffff, s2
; GFX6-NEXT: v_bfe_u32 v0, v0, 1, 15
-; GFX6-NEXT: s_and_b32 s1, 0xffff, s1
; GFX6-NEXT: s_lshl_b32 s0, s0, s2
; GFX6-NEXT: v_lshrrev_b32_e32 v0, s1, v0
; GFX6-NEXT: v_or_b32_e32 v0, s0, v0
@@ -3691,7 +3564,6 @@ define amdgpu_ps half @v_fshl_i16_svs(i16 inreg %lhs, i16 %rhs, i16 inreg %amt)
; GFX8: ; %bb.0:
; GFX8-NEXT: s_and_b32 s2, s1, 15
; GFX8-NEXT: s_andn2_b32 s1, 15, s1
-; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
; GFX8-NEXT: v_lshrrev_b16_e32 v0, 1, v0
; GFX8-NEXT: s_lshl_b32 s0, s0, s2
; GFX8-NEXT: v_lshrrev_b16_e32 v0, s1, v0
@@ -3702,7 +3574,6 @@ define amdgpu_ps half @v_fshl_i16_svs(i16 inreg %lhs, i16 %rhs, i16 inreg %amt)
; GFX9: ; %bb.0:
; GFX9-NEXT: s_and_b32 s2, s1, 15
; GFX9-NEXT: s_andn2_b32 s1, 15, s1
-; GFX9-NEXT: s_and_b32 s2, 0xffff, s2
; GFX9-NEXT: v_lshrrev_b16_e32 v0, 1, v0
; GFX9-NEXT: s_lshl_b32 s0, s0, s2
; GFX9-NEXT: v_lshrrev_b16_e32 v0, s1, v0
@@ -3714,9 +3585,8 @@ define amdgpu_ps half @v_fshl_i16_svs(i16 inreg %lhs, i16 %rhs, i16 inreg %amt)
; GFX10-NEXT: v_lshrrev_b16 v0, 1, v0
; GFX10-NEXT: s_andn2_b32 s2, 15, s1
; GFX10-NEXT: s_and_b32 s1, s1, 15
-; GFX10-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX10-NEXT: v_lshrrev_b16 v0, s2, v0
; GFX10-NEXT: s_lshl_b32 s0, s0, s1
+; GFX10-NEXT: v_lshrrev_b16 v0, s2, v0
; GFX10-NEXT: v_or_b32_e32 v0, s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
@@ -3726,10 +3596,9 @@ define amdgpu_ps half @v_fshl_i16_svs(i16 inreg %lhs, i16 %rhs, i16 inreg %amt)
; GFX11-NEXT: s_and_not1_b32 s2, 15, s1
; GFX11-NEXT: s_and_b32 s1, s1, 15
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX11-NEXT: v_lshrrev_b16 v0, s2, v0
; GFX11-NEXT: s_lshl_b32 s0, s0, s1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: v_lshrrev_b16 v0, s2, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_or_b32_e32 v0, s0, v0
; GFX11-NEXT: ; return to shader part epilog
%result = call i16 @llvm.fshl.i16(i16 %lhs, i16 %rhs, i16 %amt)
@@ -3742,9 +3611,7 @@ define amdgpu_ps half @v_fshl_i16_vss(i16 %lhs, i16 inreg %rhs, i16 inreg %amt)
; GFX6: ; %bb.0:
; GFX6-NEXT: s_and_b32 s2, s1, 15
; GFX6-NEXT: s_andn2_b32 s1, 15, s1
-; GFX6-NEXT: s_and_b32 s2, 0xffff, s2
; GFX6-NEXT: s_bfe_u32 s0, s0, 0xf0001
-; GFX6-NEXT: s_and_b32 s1, 0xffff, s1
; GFX6-NEXT: v_lshlrev_b32_e32 v0, s2, v0
; GFX6-NEXT: s_lshr_b32 s0, s0, s1
; GFX6-NEXT: v_or_b32_e32 v0, s0, v0
@@ -3752,11 +3619,10 @@ define amdgpu_ps half @v_fshl_i16_vss(i16 %lhs, i16 inreg %rhs, i16 inreg %amt)
;
; GFX8-LABEL: v_fshl_i16_vss:
; GFX8: ; %bb.0:
+; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
; GFX8-NEXT: s_and_b32 s2, s1, 15
; GFX8-NEXT: s_andn2_b32 s1, 15, s1
-; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
; GFX8-NEXT: s_lshr_b32 s0, s0, 1
-; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
; GFX8-NEXT: v_lshlrev_b16_e32 v0, s2, v0
; GFX8-NEXT: s_lshr_b32 s0, s0, s1
; GFX8-NEXT: v_or_b32_e32 v0, s0, v0
@@ -3764,11 +3630,10 @@ define amdgpu_ps half @v_fshl_i16_vss(i16 %lhs, i16 inreg %rhs, i16 inreg %amt)
;
; GFX9-LABEL: v_fshl_i16_vss:
; GFX9: ; %bb.0:
+; GFX9-NEXT: s_and_b32 s0, 0xffff, s0
; GFX9-NEXT: s_and_b32 s2, s1, 15
; GFX9-NEXT: s_andn2_b32 s1, 15, s1
-; GFX9-NEXT: s_and_b32 s0, 0xffff, s0
; GFX9-NEXT: s_lshr_b32 s0, s0, 1
-; GFX9-NEXT: s_and_b32 s1, 0xffff, s1
; GFX9-NEXT: v_lshlrev_b16_e32 v0, s2, v0
; GFX9-NEXT: s_lshr_b32 s0, s0, s1
; GFX9-NEXT: v_or_b32_e32 v0, s0, v0
@@ -3777,11 +3642,10 @@ define amdgpu_ps half @v_fshl_i16_vss(i16 %lhs, i16 inreg %rhs, i16 inreg %amt)
; GFX10-LABEL: v_fshl_i16_vss:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_and_b32 s2, s1, 15
-; GFX10-NEXT: s_andn2_b32 s1, 15, s1
; GFX10-NEXT: s_and_b32 s0, 0xffff, s0
; GFX10-NEXT: v_lshlrev_b16 v0, s2, v0
+; GFX10-NEXT: s_andn2_b32 s1, 15, s1
; GFX10-NEXT: s_lshr_b32 s0, s0, 1
-; GFX10-NEXT: s_and_b32 s1, 0xffff, s1
; GFX10-NEXT: s_lshr_b32 s0, s0, s1
; GFX10-NEXT: v_or_b32_e32 v0, s0, v0
; GFX10-NEXT: ; return to shader part epilog
@@ -3789,11 +3653,10 @@ define amdgpu_ps half @v_fshl_i16_vss(i16 %lhs, i16 inreg %rhs, i16 inreg %amt)
; GFX11-LABEL: v_fshl_i16_vss:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_and_b32 s2, s1, 15
-; GFX11-NEXT: s_and_not1_b32 s1, 15, s1
; GFX11-NEXT: s_and_b32 s0, 0xffff, s0
; GFX11-NEXT: v_lshlrev_b16 v0, s2, v0
+; GFX11-NEXT: s_and_not1_b32 s1, 15, s1
; GFX11-NEXT: s_lshr_b32 s0, s0, 1
-; GFX11-NEXT: s_and_b32 s1, 0xffff, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_lshr_b32 s0, s0, s1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
@@ -3809,19 +3672,15 @@ define amdgpu_ps i32 @s_fshl_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs, <
; GFX6: ; %bb.0:
; GFX6-NEXT: s_and_b32 s6, s4, 15
; GFX6-NEXT: s_andn2_b32 s4, 15, s4
-; GFX6-NEXT: s_and_b32 s6, 0xffff, s6
; GFX6-NEXT: s_bfe_u32 s2, s2, 0xf0001
-; GFX6-NEXT: s_and_b32 s4, 0xffff, s4
; GFX6-NEXT: s_lshl_b32 s0, s0, s6
; GFX6-NEXT: s_lshr_b32 s2, s2, s4
; GFX6-NEXT: s_or_b32 s0, s0, s2
; GFX6-NEXT: s_and_b32 s2, s5, 15
; GFX6-NEXT: s_andn2_b32 s4, 15, s5
-; GFX6-NEXT: s_and_b32 s2, 0xffff, s2
; GFX6-NEXT: s_lshl_b32 s1, s1, s2
; GFX6-NEXT: s_bfe_u32 s2, s3, 0xf0001
-; GFX6-NEXT: s_and_b32 s3, 0xffff, s4
-; GFX6-NEXT: s_lshr_b32 s2, s2, s3
+; GFX6-NEXT: s_lshr_b32 s2, s2, s4
; GFX6-NEXT: s_or_b32 s1, s1, s2
; GFX6-NEXT: s_and_b32 s1, 0xffff, s1
; GFX6-NEXT: s_and_b32 s0, 0xffff, s0
@@ -3832,13 +3691,11 @@ define amdgpu_ps i32 @s_fshl_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs, <
; GFX8-LABEL: s_fshl_v2i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_lshr_b32 s4, s1, 16
+; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
; GFX8-NEXT: s_lshr_b32 s5, s2, 16
; GFX8-NEXT: s_and_b32 s6, s2, 15
; GFX8-NEXT: s_andn2_b32 s2, 15, s2
-; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX8-NEXT: s_and_b32 s6, 0xffff, s6
; GFX8-NEXT: s_lshr_b32 s1, s1, 1
-; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
; GFX8-NEXT: s_lshr_b32 s3, s0, 16
; GFX8-NEXT: s_lshl_b32 s0, s0, s6
; GFX8-NEXT: s_lshr_b32 s1, s1, s2
@@ -3847,7 +3704,6 @@ define amdgpu_ps i32 @s_fshl_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs, <
; GFX8-NEXT: s_andn2_b32 s2, 15, s5
; GFX8-NEXT: s_lshl_b32 s1, s3, s1
; GFX8-NEXT: s_lshr_b32 s3, s4, 1
-; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
; GFX8-NEXT: s_lshr_b32 s2, s3, s2
; GFX8-NEXT: s_or_b32 s1, s1, s2
; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
@@ -4189,19 +4045,15 @@ define amdgpu_ps float @v_fshl_v2i16_svs(<2 x i16> inreg %lhs, <2 x i16> %rhs, <
; GFX6: ; %bb.0:
; GFX6-NEXT: s_and_b32 s4, s2, 15
; GFX6-NEXT: s_andn2_b32 s2, 15, s2
-; GFX6-NEXT: s_and_b32 s4, 0xffff, s4
; GFX6-NEXT: v_bfe_u32 v0, v0, 1, 15
-; GFX6-NEXT: s_and_b32 s2, 0xffff, s2
; GFX6-NEXT: s_lshl_b32 s0, s0, s4
; GFX6-NEXT: v_lshrrev_b32_e32 v0, s2, v0
; GFX6-NEXT: v_or_b32_e32 v0, s0, v0
; GFX6-NEXT: s_and_b32 s0, s3, 15
; GFX6-NEXT: s_andn2_b32 s2, 15, s3
-; GFX6-NEXT: s_and_b32 s0, 0xffff, s0
-; GFX6-NEXT: s_lshl_b32 s0, s1, s0
; GFX6-NEXT: v_bfe_u32 v1, v1, 1, 15
-; GFX6-NEXT: s_and_b32 s1, 0xffff, s2
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, s1, v1
+; GFX6-NEXT: s_lshl_b32 s0, s1, s0
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, s2, v1
; GFX6-NEXT: v_or_b32_e32 v1, s0, v1
; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
@@ -4211,10 +4063,9 @@ define amdgpu_ps float @v_fshl_v2i16_svs(<2 x i16> inreg %lhs, <2 x i16> %rhs, <
;
; GFX8-LABEL: v_fshl_v2i16_svs:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_and_b32 s4, s1, 15
; GFX8-NEXT: s_lshr_b32 s3, s1, 16
+; GFX8-NEXT: s_and_b32 s4, s1, 15
; GFX8-NEXT: s_andn2_b32 s1, 15, s1
-; GFX8-NEXT: s_and_b32 s4, 0xffff, s4
; GFX8-NEXT: v_lshrrev_b16_e32 v1, 1, v0
; GFX8-NEXT: s_lshr_b32 s2, s0, 16
; GFX8-NEXT: s_lshl_b32 s0, s0, s4
@@ -4285,19 +4136,15 @@ define amdgpu_ps float @v_fshl_v2i16_vss(<2 x i16> %lhs, <2 x i16> inreg %rhs, <
; GFX6: ; %bb.0:
; GFX6-NEXT: s_and_b32 s4, s2, 15
; GFX6-NEXT: s_andn2_b32 s2, 15, s2
-; GFX6-NEXT: s_and_b32 s4, 0xffff, s4
; GFX6-NEXT: s_bfe_u32 s0, s0, 0xf0001
-; GFX6-NEXT: s_and_b32 s2, 0xffff, s2
; GFX6-NEXT: v_lshlrev_b32_e32 v0, s4, v0
; GFX6-NEXT: s_lshr_b32 s0, s0, s2
; GFX6-NEXT: v_or_b32_e32 v0, s0, v0
; GFX6-NEXT: s_and_b32 s0, s3, 15
; GFX6-NEXT: s_andn2_b32 s2, 15, s3
-; GFX6-NEXT: s_and_b32 s0, 0xffff, s0
; GFX6-NEXT: v_lshlrev_b32_e32 v1, s0, v1
; GFX6-NEXT: s_bfe_u32 s0, s1, 0xf0001
-; GFX6-NEXT: s_and_b32 s1, 0xffff, s2
-; GFX6-NEXT: s_lshr_b32 s0, s0, s1
+; GFX6-NEXT: s_lshr_b32 s0, s0, s2
; GFX6-NEXT: v_or_b32_e32 v1, s0, v1
; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
@@ -4308,12 +4155,11 @@ define amdgpu_ps float @v_fshl_v2i16_vss(<2 x i16> %lhs, <2 x i16> inreg %rhs, <
; GFX8-LABEL: v_fshl_v2i16_vss:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_lshr_b32 s2, s0, 16
+; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
; GFX8-NEXT: s_lshr_b32 s3, s1, 16
; GFX8-NEXT: s_and_b32 s4, s1, 15
; GFX8-NEXT: s_andn2_b32 s1, 15, s1
-; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
; GFX8-NEXT: s_lshr_b32 s0, s0, 1
-; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
; GFX8-NEXT: v_lshlrev_b16_e32 v1, s4, v0
; GFX8-NEXT: s_lshr_b32 s0, s0, s1
; GFX8-NEXT: v_or_b32_e32 v1, s0, v1
@@ -4321,7 +4167,6 @@ define amdgpu_ps float @v_fshl_v2i16_vss(<2 x i16> %lhs, <2 x i16> inreg %rhs, <
; GFX8-NEXT: s_andn2_b32 s1, 15, s3
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: s_lshr_b32 s0, s2, 1
-; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: s_lshr_b32 s0, s0, s1
; GFX8-NEXT: v_or_b32_e32 v0, s0, v0
@@ -4398,26 +4243,20 @@ define amdgpu_ps i48 @s_fshl_v3i16(<3 x i16> inreg %lhs, <3 x i16> inreg %rhs, <
; GFX6: ; %bb.0:
; GFX6-NEXT: s_and_b32 s9, s6, 15
; GFX6-NEXT: s_andn2_b32 s6, 15, s6
-; GFX6-NEXT: s_and_b32 s9, 0xffff, s9
; GFX6-NEXT: s_bfe_u32 s3, s3, 0xf0001
-; GFX6-NEXT: s_and_b32 s6, 0xffff, s6
; GFX6-NEXT: s_lshl_b32 s0, s0, s9
; GFX6-NEXT: s_lshr_b32 s3, s3, s6
; GFX6-NEXT: s_or_b32 s0, s0, s3
; GFX6-NEXT: s_and_b32 s3, s7, 15
; GFX6-NEXT: s_andn2_b32 s6, 15, s7
-; GFX6-NEXT: s_and_b32 s3, 0xffff, s3
; GFX6-NEXT: s_lshl_b32 s1, s1, s3
; GFX6-NEXT: s_bfe_u32 s3, s4, 0xf0001
-; GFX6-NEXT: s_and_b32 s4, 0xffff, s6
-; GFX6-NEXT: s_lshr_b32 s3, s3, s4
+; GFX6-NEXT: s_lshr_b32 s3, s3, s6
; GFX6-NEXT: s_or_b32 s1, s1, s3
; GFX6-NEXT: s_and_b32 s3, s8, 15
; GFX6-NEXT: s_andn2_b32 s4, 15, s8
-; GFX6-NEXT: s_and_b32 s3, 0xffff, s3
; GFX6-NEXT: s_lshl_b32 s2, s2, s3
; GFX6-NEXT: s_bfe_u32 s3, s5, 0xf0001
-; GFX6-NEXT: s_and_b32 s4, 0xffff, s4
; GFX6-NEXT: s_lshr_b32 s3, s3, s4
; GFX6-NEXT: s_and_b32 s1, 0xffff, s1
; GFX6-NEXT: s_or_b32 s2, s2, s3
@@ -4430,13 +4269,11 @@ define amdgpu_ps i48 @s_fshl_v3i16(<3 x i16> inreg %lhs, <3 x i16> inreg %rhs, <
; GFX8-LABEL: s_fshl_v3i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_lshr_b32 s7, s2, 16
+; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
; GFX8-NEXT: s_lshr_b32 s8, s4, 16
; GFX8-NEXT: s_and_b32 s9, s4, 15
; GFX8-NEXT: s_andn2_b32 s4, 15, s4
-; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
-; GFX8-NEXT: s_and_b32 s9, 0xffff, s9
; GFX8-NEXT: s_lshr_b32 s2, s2, 1
-; GFX8-NEXT: s_and_b32 s4, 0xffff, s4
; GFX8-NEXT: s_lshr_b32 s6, s0, 16
; GFX8-NEXT: s_lshl_b32 s0, s0, s9
; GFX8-NEXT: s_lshr_b32 s2, s2, s4
@@ -4445,17 +4282,14 @@ define amdgpu_ps i48 @s_fshl_v3i16(<3 x i16> inreg %lhs, <3 x i16> inreg %rhs, <
; GFX8-NEXT: s_andn2_b32 s4, 15, s8
; GFX8-NEXT: s_lshl_b32 s2, s6, s2
; GFX8-NEXT: s_lshr_b32 s6, s7, 1
-; GFX8-NEXT: s_and_b32 s4, 0xffff, s4
; GFX8-NEXT: s_lshr_b32 s4, s6, s4
+; GFX8-NEXT: s_and_b32 s3, 0xffff, s3
; GFX8-NEXT: s_or_b32 s2, s2, s4
; GFX8-NEXT: s_and_b32 s4, s5, 15
; GFX8-NEXT: s_andn2_b32 s5, 15, s5
-; GFX8-NEXT: s_and_b32 s4, 0xffff, s4
-; GFX8-NEXT: s_and_b32 s3, 0xffff, s3
-; GFX8-NEXT: s_lshl_b32 s1, s1, s4
; GFX8-NEXT: s_lshr_b32 s3, s3, 1
-; GFX8-NEXT: s_and_b32 s4, 0xffff, s5
-; GFX8-NEXT: s_lshr_b32 s3, s3, s4
+; GFX8-NEXT: s_lshl_b32 s1, s1, s4
+; GFX8-NEXT: s_lshr_b32 s3, s3, s5
; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
; GFX8-NEXT: s_or_b32 s1, s1, s3
; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
@@ -4743,34 +4577,26 @@ define amdgpu_ps <2 x i32> @s_fshl_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %
; GFX6: ; %bb.0:
; GFX6-NEXT: s_and_b32 s12, s8, 15
; GFX6-NEXT: s_andn2_b32 s8, 15, s8
-; GFX6-NEXT: s_and_b32 s12, 0xffff, s12
; GFX6-NEXT: s_bfe_u32 s4, s4, 0xf0001
-; GFX6-NEXT: s_and_b32 s8, 0xffff, s8
; GFX6-NEXT: s_lshl_b32 s0, s0, s12
; GFX6-NEXT: s_lshr_b32 s4, s4, s8
; GFX6-NEXT: s_or_b32 s0, s0, s4
; GFX6-NEXT: s_and_b32 s4, s9, 15
; GFX6-NEXT: s_andn2_b32 s8, 15, s9
-; GFX6-NEXT: s_and_b32 s4, 0xffff, s4
; GFX6-NEXT: s_lshl_b32 s1, s1, s4
; GFX6-NEXT: s_bfe_u32 s4, s5, 0xf0001
-; GFX6-NEXT: s_and_b32 s5, 0xffff, s8
-; GFX6-NEXT: s_lshr_b32 s4, s4, s5
+; GFX6-NEXT: s_lshr_b32 s4, s4, s8
; GFX6-NEXT: s_or_b32 s1, s1, s4
; GFX6-NEXT: s_and_b32 s4, s10, 15
; GFX6-NEXT: s_andn2_b32 s5, 15, s10
-; GFX6-NEXT: s_and_b32 s4, 0xffff, s4
; GFX6-NEXT: s_lshl_b32 s2, s2, s4
; GFX6-NEXT: s_bfe_u32 s4, s6, 0xf0001
-; GFX6-NEXT: s_and_b32 s5, 0xffff, s5
; GFX6-NEXT: s_lshr_b32 s4, s4, s5
; GFX6-NEXT: s_or_b32 s2, s2, s4
; GFX6-NEXT: s_and_b32 s4, s11, 15
; GFX6-NEXT: s_andn2_b32 s5, 15, s11
-; GFX6-NEXT: s_and_b32 s4, 0xffff, s4
; GFX6-NEXT: s_lshl_b32 s3, s3, s4
; GFX6-NEXT: s_bfe_u32 s4, s7, 0xf0001
-; GFX6-NEXT: s_and_b32 s5, 0xffff, s5
; GFX6-NEXT: s_lshr_b32 s4, s4, s5
; GFX6-NEXT: s_and_b32 s1, 0xffff, s1
; GFX6-NEXT: s_or_b32 s3, s3, s4
@@ -4786,13 +4612,11 @@ define amdgpu_ps <2 x i32> @s_fshl_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %
; GFX8-LABEL: s_fshl_v4i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_lshr_b32 s8, s2, 16
+; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
; GFX8-NEXT: s_lshr_b32 s10, s4, 16
; GFX8-NEXT: s_and_b32 s12, s4, 15
; GFX8-NEXT: s_andn2_b32 s4, 15, s4
-; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
-; GFX8-NEXT: s_and_b32 s12, 0xffff, s12
; GFX8-NEXT: s_lshr_b32 s2, s2, 1
-; GFX8-NEXT: s_and_b32 s4, 0xffff, s4
; GFX8-NEXT: s_lshr_b32 s6, s0, 16
; GFX8-NEXT: s_lshl_b32 s0, s0, s12
; GFX8-NEXT: s_lshr_b32 s2, s2, s4
@@ -4801,25 +4625,21 @@ define amdgpu_ps <2 x i32> @s_fshl_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %
; GFX8-NEXT: s_andn2_b32 s4, 15, s10
; GFX8-NEXT: s_lshl_b32 s2, s6, s2
; GFX8-NEXT: s_lshr_b32 s6, s8, 1
-; GFX8-NEXT: s_and_b32 s4, 0xffff, s4
+; GFX8-NEXT: s_lshr_b32 s9, s3, 16
; GFX8-NEXT: s_lshr_b32 s4, s6, s4
+; GFX8-NEXT: s_and_b32 s3, 0xffff, s3
+; GFX8-NEXT: s_lshr_b32 s11, s5, 16
; GFX8-NEXT: s_or_b32 s2, s2, s4
; GFX8-NEXT: s_and_b32 s4, s5, 15
-; GFX8-NEXT: s_lshr_b32 s9, s3, 16
-; GFX8-NEXT: s_lshr_b32 s11, s5, 16
; GFX8-NEXT: s_andn2_b32 s5, 15, s5
-; GFX8-NEXT: s_and_b32 s4, 0xffff, s4
-; GFX8-NEXT: s_and_b32 s3, 0xffff, s3
+; GFX8-NEXT: s_lshr_b32 s3, s3, 1
; GFX8-NEXT: s_lshr_b32 s7, s1, 16
; GFX8-NEXT: s_lshl_b32 s1, s1, s4
-; GFX8-NEXT: s_lshr_b32 s3, s3, 1
-; GFX8-NEXT: s_and_b32 s4, 0xffff, s5
-; GFX8-NEXT: s_lshr_b32 s3, s3, s4
-; GFX8-NEXT: s_andn2_b32 s4, 15, s11
+; GFX8-NEXT: s_lshr_b32 s3, s3, s5
; GFX8-NEXT: s_or_b32 s1, s1, s3
; GFX8-NEXT: s_and_b32 s3, s11, 15
+; GFX8-NEXT: s_andn2_b32 s4, 15, s11
; GFX8-NEXT: s_lshr_b32 s5, s9, 1
-; GFX8-NEXT: s_and_b32 s4, 0xffff, s4
; GFX8-NEXT: s_lshl_b32 s3, s7, s3
; GFX8-NEXT: s_lshr_b32 s4, s5, s4
; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
index 13011ecaceb3e..2e8c918e4c67e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
@@ -347,13 +347,10 @@ define amdgpu_ps i8 @s_fshr_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) {
;
; GFX8-LABEL: s_fshr_i8:
; GFX8: ; %bb.0:
+; GFX8-NEXT: s_lshl_b32 s0, s0, 1
; GFX8-NEXT: s_andn2_b32 s3, 7, s2
; GFX8-NEXT: s_and_b32 s2, s2, 7
; GFX8-NEXT: s_and_b32 s1, s1, 0xff
-; GFX8-NEXT: s_lshl_b32 s0, s0, 1
-; GFX8-NEXT: s_and_b32 s3, 0xffff, s3
-; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
; GFX8-NEXT: s_lshl_b32 s0, s0, s3
; GFX8-NEXT: s_lshr_b32 s1, s1, s2
; GFX8-NEXT: s_or_b32 s0, s0, s1
@@ -361,13 +358,10 @@ define amdgpu_ps i8 @s_fshr_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) {
;
; GFX9-LABEL: s_fshr_i8:
; GFX9: ; %bb.0:
+; GFX9-NEXT: s_lshl_b32 s0, s0, 1
; GFX9-NEXT: s_andn2_b32 s3, 7, s2
; GFX9-NEXT: s_and_b32 s2, s2, 7
; GFX9-NEXT: s_and_b32 s1, s1, 0xff
-; GFX9-NEXT: s_lshl_b32 s0, s0, 1
-; GFX9-NEXT: s_and_b32 s3, 0xffff, s3
-; GFX9-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX9-NEXT: s_and_b32 s2, 0xffff, s2
; GFX9-NEXT: s_lshl_b32 s0, s0, s3
; GFX9-NEXT: s_lshr_b32 s1, s1, s2
; GFX9-NEXT: s_or_b32 s0, s0, s1
@@ -375,13 +369,10 @@ define amdgpu_ps i8 @s_fshr_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) {
;
; GFX10-LABEL: s_fshr_i8:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_lshl_b32 s0, s0, 1
; GFX10-NEXT: s_andn2_b32 s3, 7, s2
-; GFX10-NEXT: s_and_b32 s1, s1, 0xff
; GFX10-NEXT: s_and_b32 s2, s2, 7
-; GFX10-NEXT: s_lshl_b32 s0, s0, 1
-; GFX10-NEXT: s_and_b32 s3, 0xffff, s3
-; GFX10-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX10-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX10-NEXT: s_and_b32 s1, s1, 0xff
; GFX10-NEXT: s_lshl_b32 s0, s0, s3
; GFX10-NEXT: s_lshr_b32 s1, s1, s2
; GFX10-NEXT: s_or_b32 s0, s0, s1
@@ -389,13 +380,10 @@ define amdgpu_ps i8 @s_fshr_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) {
;
; GFX11-LABEL: s_fshr_i8:
; GFX11: ; %bb.0:
+; GFX11-NEXT: s_lshl_b32 s0, s0, 1
; GFX11-NEXT: s_and_not1_b32 s3, 7, s2
-; GFX11-NEXT: s_and_b32 s1, s1, 0xff
; GFX11-NEXT: s_and_b32 s2, s2, 7
-; GFX11-NEXT: s_lshl_b32 s0, s0, 1
-; GFX11-NEXT: s_and_b32 s3, 0xffff, s3
-; GFX11-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX11-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX11-NEXT: s_and_b32 s1, s1, 0xff
; GFX11-NEXT: s_lshl_b32 s0, s0, s3
; GFX11-NEXT: s_lshr_b32 s1, s1, s2
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -485,7 +473,6 @@ define amdgpu_ps i8 @s_fshr_i8_4(i8 inreg %lhs, i8 inreg %rhs) {
; GFX8-LABEL: s_fshr_i8_4:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_and_b32 s1, s1, 0xff
-; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
; GFX8-NEXT: s_lshl_b32 s0, s0, 4
; GFX8-NEXT: s_lshr_b32 s1, s1, 4
; GFX8-NEXT: s_or_b32 s0, s0, s1
@@ -494,7 +481,6 @@ define amdgpu_ps i8 @s_fshr_i8_4(i8 inreg %lhs, i8 inreg %rhs) {
; GFX9-LABEL: s_fshr_i8_4:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_and_b32 s1, s1, 0xff
-; GFX9-NEXT: s_and_b32 s1, 0xffff, s1
; GFX9-NEXT: s_lshl_b32 s0, s0, 4
; GFX9-NEXT: s_lshr_b32 s1, s1, 4
; GFX9-NEXT: s_or_b32 s0, s0, s1
@@ -504,7 +490,6 @@ define amdgpu_ps i8 @s_fshr_i8_4(i8 inreg %lhs, i8 inreg %rhs) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_and_b32 s1, s1, 0xff
; GFX10-NEXT: s_lshl_b32 s0, s0, 4
-; GFX10-NEXT: s_and_b32 s1, 0xffff, s1
; GFX10-NEXT: s_lshr_b32 s1, s1, 4
; GFX10-NEXT: s_or_b32 s0, s0, s1
; GFX10-NEXT: ; return to shader part epilog
@@ -513,9 +498,8 @@ define amdgpu_ps i8 @s_fshr_i8_4(i8 inreg %lhs, i8 inreg %rhs) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_and_b32 s1, s1, 0xff
; GFX11-NEXT: s_lshl_b32 s0, s0, 4
-; GFX11-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_lshr_b32 s1, s1, 4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_or_b32 s0, s0, s1
; GFX11-NEXT: ; return to shader part epilog
%result = call i8 @llvm.fshr.i8(i8 %lhs, i8 %rhs, i8 4)
@@ -582,7 +566,6 @@ define amdgpu_ps i8 @s_fshr_i8_5(i8 inreg %lhs, i8 inreg %rhs) {
; GFX8-LABEL: s_fshr_i8_5:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_and_b32 s1, s1, 0xff
-; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
; GFX8-NEXT: s_lshl_b32 s0, s0, 3
; GFX8-NEXT: s_lshr_b32 s1, s1, 5
; GFX8-NEXT: s_or_b32 s0, s0, s1
@@ -591,7 +574,6 @@ define amdgpu_ps i8 @s_fshr_i8_5(i8 inreg %lhs, i8 inreg %rhs) {
; GFX9-LABEL: s_fshr_i8_5:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_and_b32 s1, s1, 0xff
-; GFX9-NEXT: s_and_b32 s1, 0xffff, s1
; GFX9-NEXT: s_lshl_b32 s0, s0, 3
; GFX9-NEXT: s_lshr_b32 s1, s1, 5
; GFX9-NEXT: s_or_b32 s0, s0, s1
@@ -601,7 +583,6 @@ define amdgpu_ps i8 @s_fshr_i8_5(i8 inreg %lhs, i8 inreg %rhs) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_and_b32 s1, s1, 0xff
; GFX10-NEXT: s_lshl_b32 s0, s0, 3
-; GFX10-NEXT: s_and_b32 s1, 0xffff, s1
; GFX10-NEXT: s_lshr_b32 s1, s1, 5
; GFX10-NEXT: s_or_b32 s0, s0, s1
; GFX10-NEXT: ; return to shader part epilog
@@ -610,9 +591,8 @@ define amdgpu_ps i8 @s_fshr_i8_5(i8 inreg %lhs, i8 inreg %rhs) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_and_b32 s1, s1, 0xff
; GFX11-NEXT: s_lshl_b32 s0, s0, 3
-; GFX11-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_lshr_b32 s1, s1, 5
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_or_b32 s0, s0, s1
; GFX11-NEXT: ; return to shader part epilog
%result = call i8 @llvm.fshr.i8(i8 %lhs, i8 %rhs, i8 5)
@@ -695,27 +675,21 @@ define amdgpu_ps i16 @s_fshr_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in
;
; GFX8-LABEL: s_fshr_v2i8:
; GFX8: ; %bb.0:
+; GFX8-NEXT: s_lshr_b32 s3, s0, 8
; GFX8-NEXT: s_lshr_b32 s4, s1, 8
; GFX8-NEXT: s_lshr_b32 s5, s2, 8
+; GFX8-NEXT: s_lshl_b32 s0, s0, 1
; GFX8-NEXT: s_andn2_b32 s6, 7, s2
; GFX8-NEXT: s_and_b32 s2, s2, 7
; GFX8-NEXT: s_and_b32 s1, s1, 0xff
-; GFX8-NEXT: s_lshr_b32 s3, s0, 8
-; GFX8-NEXT: s_lshl_b32 s0, s0, 1
-; GFX8-NEXT: s_and_b32 s6, 0xffff, s6
-; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
; GFX8-NEXT: s_lshl_b32 s0, s0, s6
; GFX8-NEXT: s_lshr_b32 s1, s1, s2
-; GFX8-NEXT: s_andn2_b32 s2, 7, s5
; GFX8-NEXT: s_or_b32 s0, s0, s1
; GFX8-NEXT: s_lshl_b32 s1, s3, 1
-; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX8-NEXT: s_andn2_b32 s2, 7, s5
; GFX8-NEXT: s_lshl_b32 s1, s1, s2
; GFX8-NEXT: s_and_b32 s2, s5, 7
; GFX8-NEXT: s_and_b32 s3, s4, 0xff
-; GFX8-NEXT: s_and_b32 s3, 0xffff, s3
-; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
; GFX8-NEXT: s_lshr_b32 s2, s3, s2
; GFX8-NEXT: s_or_b32 s1, s1, s2
; GFX8-NEXT: s_and_b32 s1, s1, 0xff
@@ -726,27 +700,21 @@ define amdgpu_ps i16 @s_fshr_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in
;
; GFX9-LABEL: s_fshr_v2i8:
; GFX9: ; %bb.0:
+; GFX9-NEXT: s_lshr_b32 s3, s0, 8
; GFX9-NEXT: s_lshr_b32 s4, s1, 8
; GFX9-NEXT: s_lshr_b32 s5, s2, 8
+; GFX9-NEXT: s_lshl_b32 s0, s0, 1
; GFX9-NEXT: s_andn2_b32 s6, 7, s2
; GFX9-NEXT: s_and_b32 s2, s2, 7
; GFX9-NEXT: s_and_b32 s1, s1, 0xff
-; GFX9-NEXT: s_lshr_b32 s3, s0, 8
-; GFX9-NEXT: s_lshl_b32 s0, s0, 1
-; GFX9-NEXT: s_and_b32 s6, 0xffff, s6
-; GFX9-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX9-NEXT: s_and_b32 s2, 0xffff, s2
; GFX9-NEXT: s_lshl_b32 s0, s0, s6
; GFX9-NEXT: s_lshr_b32 s1, s1, s2
-; GFX9-NEXT: s_andn2_b32 s2, 7, s5
; GFX9-NEXT: s_or_b32 s0, s0, s1
; GFX9-NEXT: s_lshl_b32 s1, s3, 1
-; GFX9-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX9-NEXT: s_andn2_b32 s2, 7, s5
; GFX9-NEXT: s_lshl_b32 s1, s1, s2
; GFX9-NEXT: s_and_b32 s2, s5, 7
; GFX9-NEXT: s_and_b32 s3, s4, 0xff
-; GFX9-NEXT: s_and_b32 s3, 0xffff, s3
-; GFX9-NEXT: s_and_b32 s2, 0xffff, s2
; GFX9-NEXT: s_lshr_b32 s2, s3, s2
; GFX9-NEXT: s_or_b32 s1, s1, s2
; GFX9-NEXT: s_and_b32 s1, s1, 0xff
@@ -757,24 +725,18 @@ define amdgpu_ps i16 @s_fshr_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in
;
; GFX10-LABEL: s_fshr_v2i8:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_andn2_b32 s5, 7, s2
; GFX10-NEXT: s_lshr_b32 s3, s0, 8
; GFX10-NEXT: s_lshr_b32 s4, s1, 8
; GFX10-NEXT: s_lshl_b32 s0, s0, 1
-; GFX10-NEXT: s_and_b32 s5, 0xffff, s5
+; GFX10-NEXT: s_andn2_b32 s5, 7, s2
; GFX10-NEXT: s_lshr_b32 s6, s2, 8
; GFX10-NEXT: s_lshl_b32 s0, s0, s5
+; GFX10-NEXT: s_lshl_b32 s3, s3, 1
; GFX10-NEXT: s_andn2_b32 s5, 7, s6
-; GFX10-NEXT: s_and_b32 s4, s4, 0xff
; GFX10-NEXT: s_and_b32 s6, s6, 7
-; GFX10-NEXT: s_and_b32 s1, s1, 0xff
+; GFX10-NEXT: s_and_b32 s4, s4, 0xff
; GFX10-NEXT: s_and_b32 s2, s2, 7
-; GFX10-NEXT: s_lshl_b32 s3, s3, 1
-; GFX10-NEXT: s_and_b32 s5, 0xffff, s5
-; GFX10-NEXT: s_and_b32 s4, 0xffff, s4
-; GFX10-NEXT: s_and_b32 s6, 0xffff, s6
-; GFX10-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX10-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX10-NEXT: s_and_b32 s1, s1, 0xff
; GFX10-NEXT: s_lshl_b32 s3, s3, s5
; GFX10-NEXT: s_lshr_b32 s4, s4, s6
; GFX10-NEXT: s_lshr_b32 s1, s1, s2
@@ -788,24 +750,18 @@ define amdgpu_ps i16 @s_fshr_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in
;
; GFX11-LABEL: s_fshr_v2i8:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_and_not1_b32 s5, 7, s2
; GFX11-NEXT: s_lshr_b32 s3, s0, 8
; GFX11-NEXT: s_lshr_b32 s4, s1, 8
; GFX11-NEXT: s_lshl_b32 s0, s0, 1
-; GFX11-NEXT: s_and_b32 s5, 0xffff, s5
+; GFX11-NEXT: s_and_not1_b32 s5, 7, s2
; GFX11-NEXT: s_lshr_b32 s6, s2, 8
; GFX11-NEXT: s_lshl_b32 s0, s0, s5
+; GFX11-NEXT: s_lshl_b32 s3, s3, 1
; GFX11-NEXT: s_and_not1_b32 s5, 7, s6
-; GFX11-NEXT: s_and_b32 s4, s4, 0xff
; GFX11-NEXT: s_and_b32 s6, s6, 7
-; GFX11-NEXT: s_and_b32 s1, s1, 0xff
+; GFX11-NEXT: s_and_b32 s4, s4, 0xff
; GFX11-NEXT: s_and_b32 s2, s2, 7
-; GFX11-NEXT: s_lshl_b32 s3, s3, 1
-; GFX11-NEXT: s_and_b32 s5, 0xffff, s5
-; GFX11-NEXT: s_and_b32 s4, 0xffff, s4
-; GFX11-NEXT: s_and_b32 s6, 0xffff, s6
-; GFX11-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX11-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX11-NEXT: s_and_b32 s1, s1, 0xff
; GFX11-NEXT: s_lshl_b32 s3, s3, s5
; GFX11-NEXT: s_lshr_b32 s4, s4, s6
; GFX11-NEXT: s_lshr_b32 s1, s1, s2
@@ -1021,46 +977,38 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
;
; GFX8-LABEL: s_fshr_v4i8:
; GFX8: ; %bb.0:
+; GFX8-NEXT: s_lshr_b32 s3, s0, 8
+; GFX8-NEXT: s_lshr_b32 s4, s0, 16
+; GFX8-NEXT: s_lshr_b32 s5, s0, 24
; GFX8-NEXT: s_lshr_b32 s6, s1, 8
; GFX8-NEXT: s_lshr_b32 s7, s1, 16
; GFX8-NEXT: s_lshr_b32 s8, s1, 24
; GFX8-NEXT: s_lshr_b32 s9, s2, 8
; GFX8-NEXT: s_lshr_b32 s10, s2, 16
; GFX8-NEXT: s_lshr_b32 s11, s2, 24
+; GFX8-NEXT: s_lshl_b32 s0, s0, 1
; GFX8-NEXT: s_andn2_b32 s12, 7, s2
; GFX8-NEXT: s_and_b32 s2, s2, 7
; GFX8-NEXT: s_and_b32 s1, s1, 0xff
-; GFX8-NEXT: s_lshr_b32 s3, s0, 8
-; GFX8-NEXT: s_lshr_b32 s4, s0, 16
-; GFX8-NEXT: s_lshr_b32 s5, s0, 24
-; GFX8-NEXT: s_lshl_b32 s0, s0, 1
-; GFX8-NEXT: s_and_b32 s12, 0xffff, s12
-; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
; GFX8-NEXT: s_lshl_b32 s0, s0, s12
; GFX8-NEXT: s_lshr_b32 s1, s1, s2
-; GFX8-NEXT: s_andn2_b32 s2, 7, s9
; GFX8-NEXT: s_or_b32 s0, s0, s1
; GFX8-NEXT: s_lshl_b32 s1, s3, 1
-; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX8-NEXT: s_andn2_b32 s2, 7, s9
; GFX8-NEXT: s_lshl_b32 s1, s1, s2
; GFX8-NEXT: s_and_b32 s2, s9, 7
; GFX8-NEXT: s_and_b32 s3, s6, 0xff
-; GFX8-NEXT: s_and_b32 s3, 0xffff, s3
-; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
; GFX8-NEXT: s_lshr_b32 s2, s3, s2
-; GFX8-NEXT: s_andn2_b32 s3, 7, s10
; GFX8-NEXT: s_or_b32 s1, s1, s2
; GFX8-NEXT: s_lshl_b32 s2, s4, 1
-; GFX8-NEXT: s_and_b32 s3, 0xffff, s3
+; GFX8-NEXT: s_andn2_b32 s3, 7, s10
; GFX8-NEXT: s_lshl_b32 s2, s2, s3
; GFX8-NEXT: s_and_b32 s3, s10, 7
; GFX8-NEXT: s_and_b32 s4, s7, 0xff
; GFX8-NEXT: s_lshr_b32 s3, s4, s3
-; GFX8-NEXT: s_andn2_b32 s4, 7, s11
; GFX8-NEXT: s_or_b32 s2, s2, s3
; GFX8-NEXT: s_lshl_b32 s3, s5, 1
-; GFX8-NEXT: s_and_b32 s4, 0xffff, s4
+; GFX8-NEXT: s_andn2_b32 s4, 7, s11
; GFX8-NEXT: s_and_b32 s1, s1, 0xff
; GFX8-NEXT: s_lshl_b32 s3, s3, s4
; GFX8-NEXT: s_and_b32 s4, s11, 7
@@ -1079,46 +1027,38 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
;
; GFX9-LABEL: s_fshr_v4i8:
; GFX9: ; %bb.0:
+; GFX9-NEXT: s_lshr_b32 s3, s0, 8
+; GFX9-NEXT: s_lshr_b32 s4, s0, 16
+; GFX9-NEXT: s_lshr_b32 s5, s0, 24
; GFX9-NEXT: s_lshr_b32 s6, s1, 8
; GFX9-NEXT: s_lshr_b32 s7, s1, 16
; GFX9-NEXT: s_lshr_b32 s8, s1, 24
; GFX9-NEXT: s_lshr_b32 s9, s2, 8
; GFX9-NEXT: s_lshr_b32 s10, s2, 16
; GFX9-NEXT: s_lshr_b32 s11, s2, 24
+; GFX9-NEXT: s_lshl_b32 s0, s0, 1
; GFX9-NEXT: s_andn2_b32 s12, 7, s2
; GFX9-NEXT: s_and_b32 s2, s2, 7
; GFX9-NEXT: s_and_b32 s1, s1, 0xff
-; GFX9-NEXT: s_lshr_b32 s3, s0, 8
-; GFX9-NEXT: s_lshr_b32 s4, s0, 16
-; GFX9-NEXT: s_lshr_b32 s5, s0, 24
-; GFX9-NEXT: s_lshl_b32 s0, s0, 1
-; GFX9-NEXT: s_and_b32 s12, 0xffff, s12
-; GFX9-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX9-NEXT: s_and_b32 s2, 0xffff, s2
; GFX9-NEXT: s_lshl_b32 s0, s0, s12
; GFX9-NEXT: s_lshr_b32 s1, s1, s2
-; GFX9-NEXT: s_andn2_b32 s2, 7, s9
; GFX9-NEXT: s_or_b32 s0, s0, s1
; GFX9-NEXT: s_lshl_b32 s1, s3, 1
-; GFX9-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX9-NEXT: s_andn2_b32 s2, 7, s9
; GFX9-NEXT: s_lshl_b32 s1, s1, s2
; GFX9-NEXT: s_and_b32 s2, s9, 7
; GFX9-NEXT: s_and_b32 s3, s6, 0xff
-; GFX9-NEXT: s_and_b32 s3, 0xffff, s3
-; GFX9-NEXT: s_and_b32 s2, 0xffff, s2
; GFX9-NEXT: s_lshr_b32 s2, s3, s2
-; GFX9-NEXT: s_andn2_b32 s3, 7, s10
; GFX9-NEXT: s_or_b32 s1, s1, s2
; GFX9-NEXT: s_lshl_b32 s2, s4, 1
-; GFX9-NEXT: s_and_b32 s3, 0xffff, s3
+; GFX9-NEXT: s_andn2_b32 s3, 7, s10
; GFX9-NEXT: s_lshl_b32 s2, s2, s3
; GFX9-NEXT: s_and_b32 s3, s10, 7
; GFX9-NEXT: s_and_b32 s4, s7, 0xff
; GFX9-NEXT: s_lshr_b32 s3, s4, s3
-; GFX9-NEXT: s_andn2_b32 s4, 7, s11
; GFX9-NEXT: s_or_b32 s2, s2, s3
; GFX9-NEXT: s_lshl_b32 s3, s5, 1
-; GFX9-NEXT: s_and_b32 s4, 0xffff, s4
+; GFX9-NEXT: s_andn2_b32 s4, 7, s11
; GFX9-NEXT: s_and_b32 s1, s1, 0xff
; GFX9-NEXT: s_lshl_b32 s3, s3, s4
; GFX9-NEXT: s_and_b32 s4, s11, 7
@@ -1137,6 +1077,7 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
;
; GFX10-LABEL: s_fshr_v4i8:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_lshr_b32 s3, s0, 8
; GFX10-NEXT: s_lshr_b32 s6, s1, 8
; GFX10-NEXT: s_lshr_b32 s7, s1, 16
; GFX10-NEXT: s_lshr_b32 s8, s1, 24
@@ -1144,40 +1085,31 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
; GFX10-NEXT: s_lshr_b32 s10, s2, 16
; GFX10-NEXT: s_lshr_b32 s11, s2, 24
; GFX10-NEXT: s_andn2_b32 s12, 7, s2
-; GFX10-NEXT: s_and_b32 s1, s1, 0xff
; GFX10-NEXT: s_and_b32 s2, s2, 7
-; GFX10-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX10-NEXT: s_and_b32 s2, 0xffff, s2
-; GFX10-NEXT: s_lshr_b32 s3, s0, 8
-; GFX10-NEXT: s_lshr_b32 s1, s1, s2
-; GFX10-NEXT: s_andn2_b32 s2, 7, s9
-; GFX10-NEXT: s_and_b32 s6, s6, 0xff
-; GFX10-NEXT: s_and_b32 s9, s9, 7
+; GFX10-NEXT: s_and_b32 s1, s1, 0xff
; GFX10-NEXT: s_lshr_b32 s4, s0, 16
; GFX10-NEXT: s_lshr_b32 s5, s0, 24
; GFX10-NEXT: s_lshl_b32 s0, s0, 1
-; GFX10-NEXT: s_and_b32 s12, 0xffff, s12
-; GFX10-NEXT: s_lshl_b32 s3, s3, 1
-; GFX10-NEXT: s_and_b32 s2, 0xffff, s2
-; GFX10-NEXT: s_and_b32 s6, 0xffff, s6
-; GFX10-NEXT: s_and_b32 s9, 0xffff, s9
+; GFX10-NEXT: s_lshr_b32 s1, s1, s2
+; GFX10-NEXT: s_lshl_b32 s2, s3, 1
+; GFX10-NEXT: s_andn2_b32 s3, 7, s9
+; GFX10-NEXT: s_and_b32 s9, s9, 7
+; GFX10-NEXT: s_and_b32 s6, s6, 0xff
; GFX10-NEXT: s_lshl_b32 s0, s0, s12
-; GFX10-NEXT: s_lshl_b32 s2, s3, s2
+; GFX10-NEXT: s_lshl_b32 s2, s2, s3
; GFX10-NEXT: s_lshr_b32 s3, s6, s9
; GFX10-NEXT: s_or_b32 s0, s0, s1
; GFX10-NEXT: s_or_b32 s1, s2, s3
-; GFX10-NEXT: s_andn2_b32 s2, 7, s10
-; GFX10-NEXT: s_lshl_b32 s3, s4, 1
-; GFX10-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX10-NEXT: s_lshl_b32 s2, s4, 1
+; GFX10-NEXT: s_andn2_b32 s3, 7, s10
; GFX10-NEXT: s_and_b32 s4, s10, 7
; GFX10-NEXT: s_and_b32 s6, s7, 0xff
-; GFX10-NEXT: s_lshl_b32 s2, s3, s2
+; GFX10-NEXT: s_lshl_b32 s2, s2, s3
; GFX10-NEXT: s_lshr_b32 s3, s6, s4
-; GFX10-NEXT: s_andn2_b32 s4, 7, s11
-; GFX10-NEXT: s_lshl_b32 s5, s5, 1
-; GFX10-NEXT: s_and_b32 s4, 0xffff, s4
+; GFX10-NEXT: s_lshl_b32 s4, s5, 1
+; GFX10-NEXT: s_andn2_b32 s5, 7, s11
; GFX10-NEXT: s_and_b32 s6, s11, 7
-; GFX10-NEXT: s_lshl_b32 s4, s5, s4
+; GFX10-NEXT: s_lshl_b32 s4, s4, s5
; GFX10-NEXT: s_lshr_b32 s5, s8, s6
; GFX10-NEXT: s_or_b32 s2, s2, s3
; GFX10-NEXT: s_and_b32 s1, s1, 0xff
@@ -1195,6 +1127,7 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
;
; GFX11-LABEL: s_fshr_v4i8:
; GFX11: ; %bb.0:
+; GFX11-NEXT: s_lshr_b32 s3, s0, 8
; GFX11-NEXT: s_lshr_b32 s6, s1, 8
; GFX11-NEXT: s_lshr_b32 s7, s1, 16
; GFX11-NEXT: s_lshr_b32 s8, s1, 24
@@ -1202,40 +1135,31 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
; GFX11-NEXT: s_lshr_b32 s10, s2, 16
; GFX11-NEXT: s_lshr_b32 s11, s2, 24
; GFX11-NEXT: s_and_not1_b32 s12, 7, s2
-; GFX11-NEXT: s_and_b32 s1, s1, 0xff
; GFX11-NEXT: s_and_b32 s2, s2, 7
-; GFX11-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX11-NEXT: s_and_b32 s2, 0xffff, s2
-; GFX11-NEXT: s_lshr_b32 s3, s0, 8
-; GFX11-NEXT: s_lshr_b32 s1, s1, s2
-; GFX11-NEXT: s_and_not1_b32 s2, 7, s9
-; GFX11-NEXT: s_and_b32 s6, s6, 0xff
-; GFX11-NEXT: s_and_b32 s9, s9, 7
+; GFX11-NEXT: s_and_b32 s1, s1, 0xff
; GFX11-NEXT: s_lshr_b32 s4, s0, 16
; GFX11-NEXT: s_lshr_b32 s5, s0, 24
; GFX11-NEXT: s_lshl_b32 s0, s0, 1
-; GFX11-NEXT: s_and_b32 s12, 0xffff, s12
-; GFX11-NEXT: s_lshl_b32 s3, s3, 1
-; GFX11-NEXT: s_and_b32 s2, 0xffff, s2
-; GFX11-NEXT: s_and_b32 s6, 0xffff, s6
-; GFX11-NEXT: s_and_b32 s9, 0xffff, s9
+; GFX11-NEXT: s_lshr_b32 s1, s1, s2
+; GFX11-NEXT: s_lshl_b32 s2, s3, 1
+; GFX11-NEXT: s_and_not1_b32 s3, 7, s9
+; GFX11-NEXT: s_and_b32 s9, s9, 7
+; GFX11-NEXT: s_and_b32 s6, s6, 0xff
; GFX11-NEXT: s_lshl_b32 s0, s0, s12
-; GFX11-NEXT: s_lshl_b32 s2, s3, s2
+; GFX11-NEXT: s_lshl_b32 s2, s2, s3
; GFX11-NEXT: s_lshr_b32 s3, s6, s9
; GFX11-NEXT: s_or_b32 s0, s0, s1
; GFX11-NEXT: s_or_b32 s1, s2, s3
-; GFX11-NEXT: s_and_not1_b32 s2, 7, s10
-; GFX11-NEXT: s_lshl_b32 s3, s4, 1
-; GFX11-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX11-NEXT: s_lshl_b32 s2, s4, 1
+; GFX11-NEXT: s_and_not1_b32 s3, 7, s10
; GFX11-NEXT: s_and_b32 s4, s10, 7
; GFX11-NEXT: s_and_b32 s6, s7, 0xff
-; GFX11-NEXT: s_lshl_b32 s2, s3, s2
+; GFX11-NEXT: s_lshl_b32 s2, s2, s3
; GFX11-NEXT: s_lshr_b32 s3, s6, s4
-; GFX11-NEXT: s_and_not1_b32 s4, 7, s11
-; GFX11-NEXT: s_lshl_b32 s5, s5, 1
-; GFX11-NEXT: s_and_b32 s4, 0xffff, s4
+; GFX11-NEXT: s_lshl_b32 s4, s5, 1
+; GFX11-NEXT: s_and_not1_b32 s5, 7, s11
; GFX11-NEXT: s_and_b32 s6, s11, 7
-; GFX11-NEXT: s_lshl_b32 s4, s5, s4
+; GFX11-NEXT: s_lshl_b32 s4, s4, s5
; GFX11-NEXT: s_lshr_b32 s5, s8, s6
; GFX11-NEXT: s_or_b32 s2, s2, s3
; GFX11-NEXT: s_and_b32 s1, s1, 0xff
@@ -1872,23 +1796,22 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: s_bfe_u32 s10, s2, 0x80008
; GFX6-NEXT: v_mul_lo_u32 v4, v2, v3
-; GFX6-NEXT: s_lshr_b32 s6, s0, 16
; GFX6-NEXT: s_or_b32 s8, s8, s9
; GFX6-NEXT: v_alignbit_b32 v0, s1, v0, 24
-; GFX6-NEXT: s_and_b32 s0, s7, 0xff
; GFX6-NEXT: s_lshr_b32 s1, s2, 16
-; GFX6-NEXT: s_lshr_b32 s7, s3, 8
; GFX6-NEXT: s_and_b32 s9, s2, 0xff
; GFX6-NEXT: s_lshl_b32 s10, s10, 8
+; GFX6-NEXT: s_lshr_b32 s6, s0, 16
+; GFX6-NEXT: s_and_b32 s0, s7, 0xff
+; GFX6-NEXT: s_lshr_b32 s7, s3, 8
; GFX6-NEXT: s_or_b32 s9, s9, s10
; GFX6-NEXT: s_and_b32 s1, s1, 0xff
; GFX6-NEXT: s_and_b32 s3, s3, 0xff
; GFX6-NEXT: v_mov_b32_e32 v1, s2
-; GFX6-NEXT: s_and_b32 s2, s7, 0xff
; GFX6-NEXT: s_and_b32 s9, 0xffff, s9
; GFX6-NEXT: s_lshl_b32 s1, s1, 16
; GFX6-NEXT: v_alignbit_b32 v1, s3, v1, 24
-; GFX6-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX6-NEXT: s_and_b32 s2, s7, 0xff
; GFX6-NEXT: s_or_b32 s1, s9, s1
; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX6-NEXT: s_lshl_b32 s2, s2, 16
@@ -1908,9 +1831,8 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
; GFX6-NEXT: s_lshr_b32 s3, s5, 8
; GFX6-NEXT: s_and_b32 s5, s5, 0xff
; GFX6-NEXT: v_mov_b32_e32 v5, s4
-; GFX6-NEXT: s_and_b32 s3, s3, 0xff
; GFX6-NEXT: v_alignbit_b32 v5, s5, v5, 24
-; GFX6-NEXT: s_and_b32 s3, 0xffff, s3
+; GFX6-NEXT: s_and_b32 s3, s3, 0xff
; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX6-NEXT: v_mul_lo_u32 v4, v4, 24
; GFX6-NEXT: s_lshl_b32 s3, s3, 16
@@ -1936,7 +1858,6 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
; GFX6-NEXT: s_lshl_b32 s3, s8, 1
; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v2
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX6-NEXT: s_and_b32 s0, 0xffff, s0
; GFX6-NEXT: s_or_b32 s2, s2, s3
; GFX6-NEXT: v_and_b32_e32 v6, 0xffffff, v6
; GFX6-NEXT: v_and_b32_e32 v4, 0xffffff, v4
@@ -1980,15 +1901,15 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
; GFX8-NEXT: s_lshr_b32 s6, s0, 8
; GFX8-NEXT: s_lshr_b32 s8, s0, 24
; GFX8-NEXT: s_lshl_b32 s1, s1, 8
+; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX8-NEXT: s_and_b32 s6, s6, 0xff
; GFX8-NEXT: s_or_b32 s1, s8, s1
; GFX8-NEXT: s_lshr_b32 s8, s2, 8
-; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX8-NEXT: s_lshr_b32 s7, s0, 16
; GFX8-NEXT: s_and_b32 s0, s0, 0xff
; GFX8-NEXT: s_lshl_b32 s6, s6, 8
; GFX8-NEXT: s_and_b32 s8, s8, 0xff
-; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX8-NEXT: s_or_b32 s0, s0, s6
; GFX8-NEXT: s_and_b32 s6, s7, 0xff
; GFX8-NEXT: s_and_b32 s7, s9, 0xff
@@ -1998,18 +1919,15 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
; GFX8-NEXT: s_lshl_b32 s8, s8, 8
; GFX8-NEXT: s_or_b32 s2, s2, s8
; GFX8-NEXT: s_and_b32 s8, s9, 0xff
+; GFX8-NEXT: v_not_b32_e32 v1, 23
; GFX8-NEXT: s_lshr_b32 s11, s3, 8
-; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
; GFX8-NEXT: s_lshl_b32 s8, s8, 16
; GFX8-NEXT: s_and_b32 s3, s3, 0xff
-; GFX8-NEXT: v_not_b32_e32 v1, 23
+; GFX8-NEXT: v_mul_lo_u32 v2, v0, v1
; GFX8-NEXT: s_or_b32 s2, s2, s8
; GFX8-NEXT: s_lshl_b32 s3, s3, 8
; GFX8-NEXT: s_and_b32 s8, s11, 0xff
-; GFX8-NEXT: v_mul_lo_u32 v2, v0, v1
; GFX8-NEXT: s_or_b32 s3, s10, s3
-; GFX8-NEXT: s_and_b32 s8, 0xffff, s8
-; GFX8-NEXT: s_and_b32 s3, 0xffff, s3
; GFX8-NEXT: s_lshl_b32 s8, s8, 16
; GFX8-NEXT: s_or_b32 s3, s3, s8
; GFX8-NEXT: s_lshr_b32 s8, s4, 8
@@ -2021,7 +1939,6 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
; GFX8-NEXT: s_lshl_b32 s8, s8, 8
; GFX8-NEXT: s_or_b32 s4, s4, s8
; GFX8-NEXT: s_and_b32 s8, s9, 0xff
-; GFX8-NEXT: s_and_b32 s4, 0xffff, s4
; GFX8-NEXT: s_lshl_b32 s8, s8, 16
; GFX8-NEXT: s_or_b32 s4, s4, s8
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
@@ -2032,8 +1949,6 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
; GFX8-NEXT: v_mul_lo_u32 v2, v2, 24
; GFX8-NEXT: s_and_b32 s8, s11, 0xff
; GFX8-NEXT: s_or_b32 s5, s10, s5
-; GFX8-NEXT: s_and_b32 s8, 0xffff, s8
-; GFX8-NEXT: s_and_b32 s5, 0xffff, s5
; GFX8-NEXT: s_lshl_b32 s8, s8, 16
; GFX8-NEXT: s_or_b32 s5, s5, s8
; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s4, v2
@@ -2044,7 +1959,6 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v2, v1
; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v2
; GFX8-NEXT: v_mul_lo_u32 v0, v0, 24
-; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 23, v2
; GFX8-NEXT: s_lshl_b32 s4, s6, 17
@@ -2061,8 +1975,6 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v0, v1
; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v0
-; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX8-NEXT: s_and_b32 s7, 0xffff, s7
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 23, v0
; GFX8-NEXT: s_lshl_b32 s0, s7, 17
@@ -2097,15 +2009,15 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
; GFX9-NEXT: s_lshr_b32 s6, s0, 8
; GFX9-NEXT: s_lshr_b32 s8, s0, 24
; GFX9-NEXT: s_lshl_b32 s1, s1, 8
+; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT: s_and_b32 s6, s6, 0xff
; GFX9-NEXT: s_or_b32 s1, s8, s1
; GFX9-NEXT: s_lshr_b32 s8, s2, 8
-; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX9-NEXT: s_lshr_b32 s7, s0, 16
; GFX9-NEXT: s_and_b32 s0, s0, 0xff
; GFX9-NEXT: s_lshl_b32 s6, s6, 8
; GFX9-NEXT: s_and_b32 s8, s8, 0xff
-; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX9-NEXT: s_or_b32 s0, s0, s6
; GFX9-NEXT: s_and_b32 s6, s7, 0xff
; GFX9-NEXT: s_and_b32 s7, s9, 0xff
@@ -2115,18 +2027,15 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
; GFX9-NEXT: s_lshl_b32 s8, s8, 8
; GFX9-NEXT: s_or_b32 s2, s2, s8
; GFX9-NEXT: s_and_b32 s8, s9, 0xff
+; GFX9-NEXT: v_not_b32_e32 v1, 23
; GFX9-NEXT: s_lshr_b32 s11, s3, 8
-; GFX9-NEXT: s_and_b32 s2, 0xffff, s2
; GFX9-NEXT: s_lshl_b32 s8, s8, 16
; GFX9-NEXT: s_and_b32 s3, s3, 0xff
-; GFX9-NEXT: v_not_b32_e32 v1, 23
+; GFX9-NEXT: v_mul_lo_u32 v1, v0, v1
; GFX9-NEXT: s_or_b32 s2, s2, s8
; GFX9-NEXT: s_lshl_b32 s3, s3, 8
; GFX9-NEXT: s_and_b32 s8, s11, 0xff
-; GFX9-NEXT: v_mul_lo_u32 v1, v0, v1
; GFX9-NEXT: s_or_b32 s3, s10, s3
-; GFX9-NEXT: s_and_b32 s8, 0xffff, s8
-; GFX9-NEXT: s_and_b32 s3, 0xffff, s3
; GFX9-NEXT: s_lshl_b32 s8, s8, 16
; GFX9-NEXT: s_or_b32 s3, s3, s8
; GFX9-NEXT: s_lshr_b32 s8, s4, 8
@@ -2138,7 +2047,6 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
; GFX9-NEXT: s_lshl_b32 s8, s8, 8
; GFX9-NEXT: s_or_b32 s4, s4, s8
; GFX9-NEXT: s_and_b32 s8, s9, 0xff
-; GFX9-NEXT: s_and_b32 s4, 0xffff, s4
; GFX9-NEXT: s_lshl_b32 s8, s8, 16
; GFX9-NEXT: s_or_b32 s4, s4, s8
; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
@@ -2148,8 +2056,6 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
; GFX9-NEXT: s_lshl_b32 s5, s5, 8
; GFX9-NEXT: s_and_b32 s8, s11, 0xff
; GFX9-NEXT: s_or_b32 s5, s10, s5
-; GFX9-NEXT: s_and_b32 s8, 0xffff, s8
-; GFX9-NEXT: s_and_b32 s5, 0xffff, s5
; GFX9-NEXT: v_mul_lo_u32 v1, v1, 24
; GFX9-NEXT: s_lshl_b32 s8, s8, 16
; GFX9-NEXT: s_or_b32 s5, s5, s8
@@ -2161,7 +2067,6 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
; GFX9-NEXT: v_mul_lo_u32 v0, v0, 24
; GFX9-NEXT: v_add_u32_e32 v2, 0xffffffe8, v1
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v1
-; GFX9-NEXT: s_and_b32 s0, 0xffff, s0
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX9-NEXT: v_sub_u32_e32 v2, 23, v1
; GFX9-NEXT: s_lshl_b32 s4, s6, 17
@@ -2177,8 +2082,6 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX9-NEXT: v_add_u32_e32 v2, 0xffffffe8, v0
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v0
-; GFX9-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX9-NEXT: s_and_b32 s7, 0xffff, s7
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX9-NEXT: v_sub_u32_e32 v2, 23, v0
; GFX9-NEXT: s_lshl_b32 s0, s7, 17
@@ -2207,115 +2110,106 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
; GFX10-LABEL: s_fshr_v2i24:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, 24
-; GFX10-NEXT: s_lshr_b32 s14, s4, 8
-; GFX10-NEXT: s_lshr_b32 s15, s4, 16
-; GFX10-NEXT: s_and_b32 s14, s14, 0xff
-; GFX10-NEXT: s_lshr_b32 s16, s4, 24
+; GFX10-NEXT: s_lshr_b32 s13, s4, 8
+; GFX10-NEXT: s_lshr_b32 s14, s4, 16
+; GFX10-NEXT: s_and_b32 s13, s13, 0xff
+; GFX10-NEXT: s_lshr_b32 s15, s4, 24
; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX10-NEXT: s_and_b32 s4, s4, 0xff
-; GFX10-NEXT: s_lshl_b32 s14, s14, 8
-; GFX10-NEXT: s_and_b32 s15, s15, 0xff
-; GFX10-NEXT: s_or_b32 s4, s4, s14
-; GFX10-NEXT: s_lshr_b32 s17, s5, 8
+; GFX10-NEXT: s_and_b32 s14, s14, 0xff
+; GFX10-NEXT: s_lshl_b32 s13, s13, 8
+; GFX10-NEXT: s_lshl_b32 s14, s14, 16
+; GFX10-NEXT: s_or_b32 s4, s4, s13
+; GFX10-NEXT: s_lshr_b32 s16, s5, 8
; GFX10-NEXT: s_and_b32 s5, s5, 0xff
-; GFX10-NEXT: s_lshl_b32 s15, s15, 16
-; GFX10-NEXT: s_and_b32 s4, 0xffff, s4
+; GFX10-NEXT: s_or_b32 s4, s4, s14
; GFX10-NEXT: s_lshl_b32 s5, s5, 8
; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
-; GFX10-NEXT: s_and_b32 s14, s17, 0xff
-; GFX10-NEXT: s_or_b32 s4, s4, s15
-; GFX10-NEXT: s_or_b32 s5, s16, s5
-; GFX10-NEXT: s_and_b32 s14, 0xffff, s14
+; GFX10-NEXT: s_and_b32 s16, s16, 0xff
+; GFX10-NEXT: s_or_b32 s5, s15, s5
+; GFX10-NEXT: s_lshl_b32 s13, s16, 16
+; GFX10-NEXT: s_lshr_b32 s10, s2, 8
; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX10-NEXT: s_and_b32 s5, 0xffff, s5
-; GFX10-NEXT: s_lshl_b32 s14, s14, 16
+; GFX10-NEXT: s_or_b32 s5, s5, s13
; GFX10-NEXT: s_lshr_b32 s9, s1, 8
-; GFX10-NEXT: s_or_b32 s5, s5, s14
-; GFX10-NEXT: v_mul_lo_u32 v1, 0xffffffe8, v0
; GFX10-NEXT: s_and_b32 s1, s1, 0xff
-; GFX10-NEXT: s_lshr_b32 s10, s2, 8
-; GFX10-NEXT: s_lshr_b32 s8, s0, 24
-; GFX10-NEXT: s_lshl_b32 s1, s1, 8
-; GFX10-NEXT: s_and_b32 s9, s9, 0xff
+; GFX10-NEXT: s_lshr_b32 s11, s2, 16
+; GFX10-NEXT: v_mul_lo_u32 v2, 0xffffffe8, v0
+; GFX10-NEXT: s_lshr_b32 s13, s3, 8
+; GFX10-NEXT: s_and_b32 s3, s3, 0xff
; GFX10-NEXT: s_and_b32 s10, s10, 0xff
; GFX10-NEXT: s_lshr_b32 s6, s0, 8
-; GFX10-NEXT: v_mul_hi_u32 v1, v0, v1
-; GFX10-NEXT: s_lshr_b32 s11, s2, 16
+; GFX10-NEXT: s_lshr_b32 s8, s0, 24
; GFX10-NEXT: s_lshr_b32 s12, s2, 24
-; GFX10-NEXT: s_lshr_b32 s13, s3, 8
; GFX10-NEXT: s_and_b32 s2, s2, 0xff
-; GFX10-NEXT: s_and_b32 s3, s3, 0xff
-; GFX10-NEXT: s_or_b32 s1, s8, s1
-; GFX10-NEXT: s_and_b32 s8, 0xffff, s9
-; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1
-; GFX10-NEXT: s_lshl_b32 s9, s10, 8
-; GFX10-NEXT: s_and_b32 s6, s6, 0xff
+; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2
+; GFX10-NEXT: s_lshl_b32 s1, s1, 8
; GFX10-NEXT: s_and_b32 s11, s11, 0xff
; GFX10-NEXT: s_lshl_b32 s3, s3, 8
-; GFX10-NEXT: v_mul_hi_u32 v1, s4, v0
-; GFX10-NEXT: v_mul_hi_u32 v0, s5, v0
; GFX10-NEXT: s_and_b32 s13, s13, 0xff
-; GFX10-NEXT: s_or_b32 s2, s2, s9
+; GFX10-NEXT: s_and_b32 s6, s6, 0xff
+; GFX10-NEXT: s_or_b32 s1, s8, s1
+; GFX10-NEXT: s_or_b32 s3, s12, s3
+; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2
+; GFX10-NEXT: s_lshl_b32 s8, s13, 16
; GFX10-NEXT: s_lshr_b32 s7, s0, 16
; GFX10-NEXT: s_and_b32 s0, s0, 0xff
; GFX10-NEXT: s_lshl_b32 s6, s6, 8
-; GFX10-NEXT: s_or_b32 s3, s12, s3
-; GFX10-NEXT: v_mul_lo_u32 v1, v1, 24
-; GFX10-NEXT: v_mul_lo_u32 v0, v0, 24
-; GFX10-NEXT: s_and_b32 s10, 0xffff, s13
-; GFX10-NEXT: s_and_b32 s2, 0xffff, s2
-; GFX10-NEXT: s_or_b32 s0, s0, s6
-; GFX10-NEXT: s_and_b32 s3, 0xffff, s3
+; GFX10-NEXT: v_mul_hi_u32 v2, s4, v0
+; GFX10-NEXT: v_mul_hi_u32 v0, s5, v0
+; GFX10-NEXT: s_or_b32 s3, s3, s8
; GFX10-NEXT: s_and_b32 s7, s7, 0xff
-; GFX10-NEXT: s_and_b32 s0, 0xffff, s0
-; GFX10-NEXT: v_sub_nc_u32_e32 v1, s4, v1
-; GFX10-NEXT: v_sub_nc_u32_e32 v0, s5, v0
-; GFX10-NEXT: s_lshl_b32 s4, s11, 16
-; GFX10-NEXT: s_lshl_b32 s5, s10, 16
-; GFX10-NEXT: s_or_b32 s2, s2, s4
-; GFX10-NEXT: v_add_nc_u32_e32 v2, 0xffffffe8, v1
-; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1
-; GFX10-NEXT: s_or_b32 s3, s3, s5
+; GFX10-NEXT: s_and_b32 s9, s9, 0xff
+; GFX10-NEXT: s_or_b32 s0, s0, s6
; GFX10-NEXT: s_lshl_b32 s7, s7, 17
-; GFX10-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX10-NEXT: s_lshl_b32 s9, s9, 17
+; GFX10-NEXT: v_mul_lo_u32 v2, v2, 24
+; GFX10-NEXT: v_mul_lo_u32 v0, v0, 24
; GFX10-NEXT: s_lshl_b32 s0, s0, 1
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
-; GFX10-NEXT: v_add_nc_u32_e32 v2, 0xffffffe8, v0
-; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0
-; GFX10-NEXT: s_or_b32 s0, s7, s0
; GFX10-NEXT: s_lshl_b32 s1, s1, 1
-; GFX10-NEXT: v_add_nc_u32_e32 v3, 0xffffffe8, v1
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1
-; GFX10-NEXT: v_add_nc_u32_e32 v2, 0xffffffe8, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX10-NEXT: s_or_b32 s0, s7, s0
+; GFX10-NEXT: s_or_b32 s1, s9, s1
+; GFX10-NEXT: v_mov_b32_e32 v1, 8
+; GFX10-NEXT: v_sub_nc_u32_e32 v2, s4, v2
+; GFX10-NEXT: v_sub_nc_u32_e32 v0, s5, v0
+; GFX10-NEXT: s_lshl_b32 s4, s10, 8
+; GFX10-NEXT: s_lshl_b32 s5, s11, 16
+; GFX10-NEXT: s_or_b32 s2, s2, s4
+; GFX10-NEXT: v_add_nc_u32_e32 v3, 0xffffffe8, v2
+; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v2
+; GFX10-NEXT: v_add_nc_u32_e32 v4, 0xffffffe8, v0
+; GFX10-NEXT: s_or_b32 s2, s2, s5
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0
-; GFX10-NEXT: v_sub_nc_u32_e32 v3, 23, v1
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffffff, v1
-; GFX10-NEXT: v_and_b32_e32 v2, 0xffffff, v3
-; GFX10-NEXT: v_sub_nc_u32_e32 v3, 23, v0
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffffff, v0
-; GFX10-NEXT: v_lshrrev_b32_e64 v1, v1, s2
-; GFX10-NEXT: s_lshl_b32 s2, s8, 17
+; GFX10-NEXT: v_add_nc_u32_e32 v3, 0xffffffe8, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
+; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v2
+; GFX10-NEXT: v_add_nc_u32_e32 v4, 0xffffffe8, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0
+; GFX10-NEXT: v_sub_nc_u32_e32 v3, 23, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffffff, v2
; GFX10-NEXT: v_and_b32_e32 v3, 0xffffff, v3
+; GFX10-NEXT: v_sub_nc_u32_e32 v4, 23, v0
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffffff, v0
+; GFX10-NEXT: v_lshrrev_b32_e64 v2, v2, s2
+; GFX10-NEXT: v_and_b32_e32 v4, 0xffffff, v4
; GFX10-NEXT: v_lshrrev_b32_e64 v0, v0, s3
-; GFX10-NEXT: v_lshl_or_b32 v1, s0, v2, v1
-; GFX10-NEXT: s_or_b32 s0, s2, s1
-; GFX10-NEXT: v_mov_b32_e32 v2, 8
-; GFX10-NEXT: v_lshl_or_b32 v0, s0, v3, v0
+; GFX10-NEXT: v_lshl_or_b32 v2, s0, v3, v2
; GFX10-NEXT: v_mov_b32_e32 v3, 16
-; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX10-NEXT: v_lshl_or_b32 v0, s1, v4, v0
+; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
; GFX10-NEXT: v_and_b32_e32 v4, 0xff, v0
-; GFX10-NEXT: v_and_or_b32 v2, 0xff, v1, v2
-; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v4
-; GFX10-NEXT: v_bfe_u32 v4, v0, 8, 8
+; GFX10-NEXT: v_bfe_u32 v5, v0, 8, 8
+; GFX10-NEXT: v_and_or_b32 v1, 0xff, v2, v1
; GFX10-NEXT: v_bfe_u32 v0, v0, 16, 8
-; GFX10-NEXT: v_or3_b32 v1, v2, v1, v3
-; GFX10-NEXT: v_lshl_or_b32 v0, v0, 8, v4
-; GFX10-NEXT: v_readfirstlane_b32 s0, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v4
+; GFX10-NEXT: v_lshl_or_b32 v0, v0, 8, v5
+; GFX10-NEXT: v_or3_b32 v1, v1, v3, v2
; GFX10-NEXT: v_readfirstlane_b32 s1, v0
+; GFX10-NEXT: v_readfirstlane_b32 s0, v1
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_fshr_v2i24:
@@ -2327,116 +2221,112 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
; GFX11-NEXT: s_lshr_b32 s16, s4, 24
; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX11-NEXT: s_and_b32 s4, s4, 0xff
-; GFX11-NEXT: s_lshl_b32 s14, s14, 8
; GFX11-NEXT: s_and_b32 s15, s15, 0xff
+; GFX11-NEXT: s_lshl_b32 s14, s14, 8
+; GFX11-NEXT: s_lshl_b32 s15, s15, 16
; GFX11-NEXT: s_or_b32 s4, s4, s14
; GFX11-NEXT: s_lshr_b32 s17, s5, 8
; GFX11-NEXT: s_and_b32 s5, s5, 0xff
-; GFX11-NEXT: s_lshl_b32 s15, s15, 16
-; GFX11-NEXT: s_and_b32 s4, 0xffff, s4
+; GFX11-NEXT: s_or_b32 s4, s4, s15
; GFX11-NEXT: s_lshl_b32 s5, s5, 8
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
-; GFX11-NEXT: s_and_b32 s14, s17, 0xff
-; GFX11-NEXT: s_or_b32 s4, s4, s15
+; GFX11-NEXT: s_and_b32 s17, s17, 0xff
; GFX11-NEXT: s_or_b32 s5, s16, s5
-; GFX11-NEXT: s_and_b32 s14, 0xffff, s14
-; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX11-NEXT: s_and_b32 s5, 0xffff, s5
-; GFX11-NEXT: s_lshl_b32 s14, s14, 16
+; GFX11-NEXT: s_lshl_b32 s14, s17, 16
; GFX11-NEXT: s_lshr_b32 s10, s2, 8
+; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX11-NEXT: s_or_b32 s5, s5, s14
-; GFX11-NEXT: v_mul_lo_u32 v1, 0xffffffe8, v0
+; GFX11-NEXT: s_lshr_b32 s11, s2, 16
; GFX11-NEXT: s_and_b32 s10, s10, 0xff
; GFX11-NEXT: s_lshr_b32 s6, s0, 8
-; GFX11-NEXT: s_lshr_b32 s9, s1, 8
-; GFX11-NEXT: s_and_b32 s1, s1, 0xff
-; GFX11-NEXT: s_lshr_b32 s11, s2, 16
+; GFX11-NEXT: v_mul_lo_u32 v1, 0xffffffe8, v0
; GFX11-NEXT: s_lshr_b32 s12, s2, 24
; GFX11-NEXT: s_and_b32 s2, s2, 0xff
-; GFX11-NEXT: v_mul_hi_u32 v1, v0, v1
-; GFX11-NEXT: s_and_b32 s6, s6, 0xff
-; GFX11-NEXT: s_lshr_b32 s8, s0, 24
-; GFX11-NEXT: s_lshl_b32 s1, s1, 8
-; GFX11-NEXT: s_and_b32 s9, s9, 0xff
; GFX11-NEXT: s_and_b32 s11, s11, 0xff
+; GFX11-NEXT: s_and_b32 s6, s6, 0xff
; GFX11-NEXT: s_lshr_b32 s7, s0, 16
+; GFX11-NEXT: s_lshr_b32 s8, s0, 24
+; GFX11-NEXT: s_lshr_b32 s9, s1, 8
+; GFX11-NEXT: v_mul_hi_u32 v1, v0, v1
; GFX11-NEXT: s_and_b32 s0, s0, 0xff
-; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v1
+; GFX11-NEXT: s_and_b32 s1, s1, 0xff
; GFX11-NEXT: s_lshr_b32 s13, s3, 8
; GFX11-NEXT: s_and_b32 s3, s3, 0xff
; GFX11-NEXT: s_lshl_b32 s6, s6, 8
-; GFX11-NEXT: s_or_b32 s1, s8, s1
-; GFX11-NEXT: v_mul_hi_u32 v1, s4, v0
-; GFX11-NEXT: v_mul_hi_u32 v0, s5, v0
-; GFX11-NEXT: s_and_b32 s8, 0xffff, s9
-; GFX11-NEXT: s_lshl_b32 s9, s11, 16
+; GFX11-NEXT: s_and_b32 s7, s7, 0xff
+; GFX11-NEXT: s_lshl_b32 s1, s1, 8
+; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v1
; GFX11-NEXT: s_lshl_b32 s3, s3, 8
; GFX11-NEXT: s_and_b32 s13, s13, 0xff
; GFX11-NEXT: s_or_b32 s0, s0, s6
-; GFX11-NEXT: s_and_b32 s7, s7, 0xff
-; GFX11-NEXT: v_mul_lo_u32 v1, v1, 24
-; GFX11-NEXT: v_mul_lo_u32 v0, v0, 24
+; GFX11-NEXT: s_or_b32 s1, s8, s1
+; GFX11-NEXT: v_mul_hi_u32 v1, s4, v0
+; GFX11-NEXT: v_mul_hi_u32 v0, s5, v0
; GFX11-NEXT: s_or_b32 s3, s12, s3
-; GFX11-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX11-NEXT: s_lshl_b32 s8, s13, 16
; GFX11-NEXT: s_lshl_b32 s7, s7, 17
-; GFX11-NEXT: s_and_b32 s3, 0xffff, s3
; GFX11-NEXT: s_lshl_b32 s0, s0, 1
-; GFX11-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX11-NEXT: s_or_b32 s3, s3, s8
+; GFX11-NEXT: s_or_b32 s0, s7, s0
+; GFX11-NEXT: v_mul_lo_u32 v1, v1, 24
+; GFX11-NEXT: v_mul_lo_u32 v0, v0, 24
+; GFX11-NEXT: s_and_b32 s9, s9, 0xff
+; GFX11-NEXT: s_lshl_b32 s1, s1, 1
+; GFX11-NEXT: s_lshl_b32 s9, s9, 17
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_sub_nc_u32_e32 v1, s4, v1
; GFX11-NEXT: v_sub_nc_u32_e32 v0, s5, v0
; GFX11-NEXT: s_lshl_b32 s4, s10, 8
-; GFX11-NEXT: s_and_b32 s10, 0xffff, s13
+; GFX11-NEXT: s_lshl_b32 s5, s11, 16
; GFX11-NEXT: s_or_b32 s2, s2, s4
; GFX11-NEXT: v_add_nc_u32_e32 v2, 0xffffffe8, v1
; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1
-; GFX11-NEXT: s_and_b32 s2, 0xffff, s2
-; GFX11-NEXT: s_lshl_b32 s4, s10, 16
-; GFX11-NEXT: s_or_b32 s2, s2, s9
-; GFX11-NEXT: s_or_b32 s0, s7, s0
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v1, v2 :: v_dual_add_nc_u32 v2, 0xffffffe8, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 0xffffffe8, v0
+; GFX11-NEXT: s_or_b32 s2, s2, s5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0
-; GFX11-NEXT: s_lshl_b32 s5, s8, 17
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v2 :: v_dual_add_nc_u32 v3, 0xffffffe8, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 0xffffffe8, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v1, v3 :: v_dual_add_nc_u32 v2, 0xffffffe8, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
+; GFX11-NEXT: v_sub_nc_u32_e32 v2, 23, v1
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffffff, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 0xffffffe8, v0
; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_sub_nc_u32_e32 v3, 23, v1
-; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffffff, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffffff, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffffff, v2
; GFX11-NEXT: v_lshrrev_b32_e64 v1, v1, s2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
+; GFX11-NEXT: v_lshl_or_b32 v1, s0, v2, v1
+; GFX11-NEXT: s_or_b32 s0, s9, s1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_sub_nc_u32_e32 v3, 23, v0
; GFX11-NEXT: v_and_b32_e32 v0, 0xffffff, v0
-; GFX11-NEXT: s_or_b32 s2, s3, s4
-; GFX11-NEXT: v_lshl_or_b32 v1, s0, v2, v1
+; GFX11-NEXT: v_bfe_u32 v2, v1, 8, 8
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffffff, v3
-; GFX11-NEXT: v_lshrrev_b32_e64 v0, v0, s2
-; GFX11-NEXT: s_lshl_b32 s0, s1, 1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: s_or_b32 s0, s5, s0
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffffff, v3
+; GFX11-NEXT: v_lshrrev_b32_e64 v0, v0, s3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GFX11-NEXT: v_lshl_or_b32 v0, s0, v3, v0
; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 8
-; GFX11-NEXT: v_lshl_or_b32 v0, s0, v2, v0
-; GFX11-NEXT: v_bfe_u32 v2, v1, 8, 8
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_and_or_b32 v1, 0xff, v1, v2
; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; GFX11-NEXT: v_bfe_u32 v5, v0, 8, 8
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX11-NEXT: v_bfe_u32 v0, v0, 16, 8
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 24, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_and_or_b32 v1, 0xff, v1, v2
-; GFX11-NEXT: v_lshl_or_b32 v0, v0, 8, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_lshl_or_b32 v0, v0, 8, v5
; GFX11-NEXT: v_or3_b32 v1, v1, v3, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_readfirstlane_b32 s1, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_readfirstlane_b32 s0, v1
; GFX11-NEXT: ; return to shader part epilog
%lhs = bitcast i48 %lhs.arg to <2 x i24>
@@ -3037,11 +2927,9 @@ define amdgpu_ps i16 @s_fshr_i16(i16 inreg %lhs, i16 inreg %rhs, i16 inreg %amt)
; GFX6-NEXT: s_and_b32 s3, s2, 15
; GFX6-NEXT: s_andn2_b32 s2, 15, s2
; GFX6-NEXT: s_lshl_b32 s0, s0, 1
-; GFX6-NEXT: s_and_b32 s2, 0xffff, s2
-; GFX6-NEXT: s_lshl_b32 s0, s0, s2
-; GFX6-NEXT: s_and_b32 s2, 0xffff, s3
; GFX6-NEXT: s_and_b32 s1, s1, 0xffff
-; GFX6-NEXT: s_lshr_b32 s1, s1, s2
+; GFX6-NEXT: s_lshl_b32 s0, s0, s2
+; GFX6-NEXT: s_lshr_b32 s1, s1, s3
; GFX6-NEXT: s_or_b32 s0, s0, s1
; GFX6-NEXT: ; return to shader part epilog
;
@@ -3050,11 +2938,9 @@ define amdgpu_ps i16 @s_fshr_i16(i16 inreg %lhs, i16 inreg %rhs, i16 inreg %amt)
; GFX8-NEXT: s_and_b32 s3, s2, 15
; GFX8-NEXT: s_andn2_b32 s2, 15, s2
; GFX8-NEXT: s_lshl_b32 s0, s0, 1
-; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
-; GFX8-NEXT: s_lshl_b32 s0, s0, s2
; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX8-NEXT: s_and_b32 s2, 0xffff, s3
-; GFX8-NEXT: s_lshr_b32 s1, s1, s2
+; GFX8-NEXT: s_lshl_b32 s0, s0, s2
+; GFX8-NEXT: s_lshr_b32 s1, s1, s3
; GFX8-NEXT: s_or_b32 s0, s0, s1
; GFX8-NEXT: ; return to shader part epilog
;
@@ -3063,11 +2949,9 @@ define amdgpu_ps i16 @s_fshr_i16(i16 inreg %lhs, i16 inreg %rhs, i16 inreg %amt)
; GFX9-NEXT: s_and_b32 s3, s2, 15
; GFX9-NEXT: s_andn2_b32 s2, 15, s2
; GFX9-NEXT: s_lshl_b32 s0, s0, 1
-; GFX9-NEXT: s_and_b32 s2, 0xffff, s2
-; GFX9-NEXT: s_lshl_b32 s0, s0, s2
; GFX9-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX9-NEXT: s_and_b32 s2, 0xffff, s3
-; GFX9-NEXT: s_lshr_b32 s1, s1, s2
+; GFX9-NEXT: s_lshl_b32 s0, s0, s2
+; GFX9-NEXT: s_lshr_b32 s1, s1, s3
; GFX9-NEXT: s_or_b32 s0, s0, s1
; GFX9-NEXT: ; return to shader part epilog
;
@@ -3076,9 +2960,7 @@ define amdgpu_ps i16 @s_fshr_i16(i16 inreg %lhs, i16 inreg %rhs, i16 inreg %amt)
; GFX10-NEXT: s_and_b32 s3, s2, 15
; GFX10-NEXT: s_andn2_b32 s2, 15, s2
; GFX10-NEXT: s_lshl_b32 s0, s0, 1
-; GFX10-NEXT: s_and_b32 s2, 0xffff, s2
; GFX10-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX10-NEXT: s_and_b32 s3, 0xffff, s3
; GFX10-NEXT: s_lshl_b32 s0, s0, s2
; GFX10-NEXT: s_lshr_b32 s1, s1, s3
; GFX10-NEXT: s_or_b32 s0, s0, s1
@@ -3089,9 +2971,7 @@ define amdgpu_ps i16 @s_fshr_i16(i16 inreg %lhs, i16 inreg %rhs, i16 inreg %amt)
; GFX11-NEXT: s_and_b32 s3, s2, 15
; GFX11-NEXT: s_and_not1_b32 s2, 15, s2
; GFX11-NEXT: s_lshl_b32 s0, s0, 1
-; GFX11-NEXT: s_and_b32 s2, 0xffff, s2
; GFX11-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX11-NEXT: s_and_b32 s3, 0xffff, s3
; GFX11-NEXT: s_lshl_b32 s0, s0, s2
; GFX11-NEXT: s_lshr_b32 s1, s1, s3
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -3419,11 +3299,9 @@ define amdgpu_ps half @v_fshr_i16_svs(i16 inreg %lhs, i16 %rhs, i16 inreg %amt)
; GFX6-NEXT: s_and_b32 s2, s1, 15
; GFX6-NEXT: s_andn2_b32 s1, 15, s1
; GFX6-NEXT: s_lshl_b32 s0, s0, 1
-; GFX6-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX6-NEXT: s_lshl_b32 s0, s0, s1
-; GFX6-NEXT: s_and_b32 s1, 0xffff, s2
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, s1, v0
+; GFX6-NEXT: s_lshl_b32 s0, s0, s1
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, s2, v0
; GFX6-NEXT: v_or_b32_e32 v0, s0, v0
; GFX6-NEXT: ; return to shader part epilog
;
@@ -3432,7 +3310,6 @@ define amdgpu_ps half @v_fshr_i16_svs(i16 inreg %lhs, i16 %rhs, i16 inreg %amt)
; GFX8-NEXT: s_and_b32 s2, s1, 15
; GFX8-NEXT: s_andn2_b32 s1, 15, s1
; GFX8-NEXT: s_lshl_b32 s0, s0, 1
-; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
; GFX8-NEXT: s_lshl_b32 s0, s0, s1
; GFX8-NEXT: v_lshrrev_b16_e32 v0, s2, v0
; GFX8-NEXT: v_or_b32_e32 v0, s0, v0
@@ -3443,7 +3320,6 @@ define amdgpu_ps half @v_fshr_i16_svs(i16 inreg %lhs, i16 %rhs, i16 inreg %amt)
; GFX9-NEXT: s_and_b32 s2, s1, 15
; GFX9-NEXT: s_andn2_b32 s1, 15, s1
; GFX9-NEXT: s_lshl_b32 s0, s0, 1
-; GFX9-NEXT: s_and_b32 s1, 0xffff, s1
; GFX9-NEXT: s_lshl_b32 s0, s0, s1
; GFX9-NEXT: v_lshrrev_b16_e32 v0, s2, v0
; GFX9-NEXT: v_or_b32_e32 v0, s0, v0
@@ -3455,7 +3331,6 @@ define amdgpu_ps half @v_fshr_i16_svs(i16 inreg %lhs, i16 %rhs, i16 inreg %amt)
; GFX10-NEXT: s_andn2_b32 s1, 15, s1
; GFX10-NEXT: v_lshrrev_b16 v0, s2, v0
; GFX10-NEXT: s_lshl_b32 s0, s0, 1
-; GFX10-NEXT: s_and_b32 s1, 0xffff, s1
; GFX10-NEXT: s_lshl_b32 s0, s0, s1
; GFX10-NEXT: v_or_b32_e32 v0, s0, v0
; GFX10-NEXT: ; return to shader part epilog
@@ -3466,7 +3341,6 @@ define amdgpu_ps half @v_fshr_i16_svs(i16 inreg %lhs, i16 %rhs, i16 inreg %amt)
; GFX11-NEXT: s_and_not1_b32 s1, 15, s1
; GFX11-NEXT: v_lshrrev_b16 v0, s2, v0
; GFX11-NEXT: s_lshl_b32 s0, s0, 1
-; GFX11-NEXT: s_and_b32 s1, 0xffff, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_lshl_b32 s0, s0, s1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
@@ -3483,11 +3357,9 @@ define amdgpu_ps half @v_fshr_i16_vss(i16 %lhs, i16 inreg %rhs, i16 inreg %amt)
; GFX6-NEXT: s_and_b32 s2, s1, 15
; GFX6-NEXT: s_andn2_b32 s1, 15, s1
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; GFX6-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, s1, v0
-; GFX6-NEXT: s_and_b32 s1, 0xffff, s2
; GFX6-NEXT: s_and_b32 s0, s0, 0xffff
-; GFX6-NEXT: s_lshr_b32 s0, s0, s1
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, s1, v0
+; GFX6-NEXT: s_lshr_b32 s0, s0, s2
; GFX6-NEXT: v_or_b32_e32 v0, s0, v0
; GFX6-NEXT: ; return to shader part epilog
;
@@ -3496,10 +3368,9 @@ define amdgpu_ps half @v_fshr_i16_vss(i16 %lhs, i16 inreg %rhs, i16 inreg %amt)
; GFX8-NEXT: s_and_b32 s2, s1, 15
; GFX8-NEXT: s_andn2_b32 s1, 15, s1
; GFX8-NEXT: v_lshlrev_b16_e32 v0, 1, v0
-; GFX8-NEXT: v_lshlrev_b16_e32 v0, s1, v0
; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
-; GFX8-NEXT: s_and_b32 s1, 0xffff, s2
-; GFX8-NEXT: s_lshr_b32 s0, s0, s1
+; GFX8-NEXT: v_lshlrev_b16_e32 v0, s1, v0
+; GFX8-NEXT: s_lshr_b32 s0, s0, s2
; GFX8-NEXT: v_or_b32_e32 v0, s0, v0
; GFX8-NEXT: ; return to shader part epilog
;
@@ -3508,10 +3379,9 @@ define amdgpu_ps half @v_fshr_i16_vss(i16 %lhs, i16 inreg %rhs, i16 inreg %amt)
; GFX9-NEXT: s_and_b32 s2, s1, 15
; GFX9-NEXT: s_andn2_b32 s1, 15, s1
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 1, v0
-; GFX9-NEXT: v_lshlrev_b16_e32 v0, s1, v0
; GFX9-NEXT: s_and_b32 s0, 0xffff, s0
-; GFX9-NEXT: s_and_b32 s1, 0xffff, s2
-; GFX9-NEXT: s_lshr_b32 s0, s0, s1
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, s1, v0
+; GFX9-NEXT: s_lshr_b32 s0, s0, s2
; GFX9-NEXT: v_or_b32_e32 v0, s0, v0
; GFX9-NEXT: ; return to shader part epilog
;
@@ -3521,9 +3391,8 @@ define amdgpu_ps half @v_fshr_i16_vss(i16 %lhs, i16 inreg %rhs, i16 inreg %amt)
; GFX10-NEXT: s_andn2_b32 s2, 15, s1
; GFX10-NEXT: s_and_b32 s1, s1, 15
; GFX10-NEXT: s_and_b32 s0, 0xffff, s0
-; GFX10-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX10-NEXT: v_lshlrev_b16 v0, s2, v0
; GFX10-NEXT: s_lshr_b32 s0, s0, s1
+; GFX10-NEXT: v_lshlrev_b16 v0, s2, v0
; GFX10-NEXT: v_or_b32_e32 v0, s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
@@ -3533,10 +3402,9 @@ define amdgpu_ps half @v_fshr_i16_vss(i16 %lhs, i16 inreg %rhs, i16 inreg %amt)
; GFX11-NEXT: s_and_not1_b32 s2, 15, s1
; GFX11-NEXT: s_and_b32 s1, s1, 15
; GFX11-NEXT: s_and_b32 s0, 0xffff, s0
-; GFX11-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX11-NEXT: v_lshlrev_b16 v0, s2, v0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: s_lshr_b32 s0, s0, s1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: v_lshlrev_b16 v0, s2, v0
; GFX11-NEXT: v_or_b32_e32 v0, s0, v0
; GFX11-NEXT: ; return to shader part epilog
%result = call i16 @llvm.fshr.i16(i16 %lhs, i16 %rhs, i16 %amt)
@@ -3557,15 +3425,13 @@ define amdgpu_ps i32 @s_fshr_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs, <
; GFX6-NEXT: s_bfe_u32 s5, s3, 0xf0001
; GFX6-NEXT: s_lshl_b32 s1, s1, 1
; GFX6-NEXT: s_lshr_b32 s5, s5, 14
+; GFX6-NEXT: s_lshl_b32 s2, s2, 1
; GFX6-NEXT: s_xor_b32 s4, s4, -1
; GFX6-NEXT: s_or_b32 s1, s1, s5
-; GFX6-NEXT: s_lshl_b32 s2, s2, 1
; GFX6-NEXT: s_lshr_b32 s5, s4, 16
; GFX6-NEXT: s_and_b32 s6, s4, 15
; GFX6-NEXT: s_andn2_b32 s4, 15, s4
-; GFX6-NEXT: s_and_b32 s6, 0xffff, s6
; GFX6-NEXT: s_bfe_u32 s2, s2, 0xf0001
-; GFX6-NEXT: s_and_b32 s4, 0xffff, s4
; GFX6-NEXT: s_lshl_b32 s0, s0, s6
; GFX6-NEXT: s_lshr_b32 s2, s2, s4
; GFX6-NEXT: s_lshl_b32 s3, s3, 1
@@ -3574,8 +3440,7 @@ define amdgpu_ps i32 @s_fshr_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs, <
; GFX6-NEXT: s_andn2_b32 s4, 15, s5
; GFX6-NEXT: s_lshl_b32 s1, s1, s2
; GFX6-NEXT: s_bfe_u32 s2, s3, 0xf0001
-; GFX6-NEXT: s_and_b32 s3, 0xffff, s4
-; GFX6-NEXT: s_lshr_b32 s2, s2, s3
+; GFX6-NEXT: s_lshr_b32 s2, s2, s4
; GFX6-NEXT: s_or_b32 s1, s1, s2
; GFX6-NEXT: s_and_b32 s1, 0xffff, s1
; GFX6-NEXT: s_and_b32 s0, 0xffff, s0
@@ -3590,29 +3455,26 @@ define amdgpu_ps i32 @s_fshr_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs, <
; GFX8-NEXT: s_lshr_b32 s4, s1, 16
; GFX8-NEXT: s_lshl_b32 s0, s0, 1
; GFX8-NEXT: s_lshr_b32 s5, s5, 15
+; GFX8-NEXT: s_lshl_b32 s1, s1, 1
; GFX8-NEXT: s_or_b32 s0, s0, s5
; GFX8-NEXT: s_lshl_b32 s3, s3, 1
; GFX8-NEXT: s_lshr_b32 s5, s4, 15
-; GFX8-NEXT: s_lshl_b32 s1, s1, 1
; GFX8-NEXT: s_xor_b32 s2, s2, -1
+; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
; GFX8-NEXT: s_or_b32 s3, s3, s5
; GFX8-NEXT: s_lshr_b32 s5, s2, 16
; GFX8-NEXT: s_and_b32 s6, s2, 15
; GFX8-NEXT: s_andn2_b32 s2, 15, s2
-; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX8-NEXT: s_and_b32 s6, 0xffff, s6
; GFX8-NEXT: s_lshr_b32 s1, s1, 1
-; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
; GFX8-NEXT: s_lshl_b32 s0, s0, s6
; GFX8-NEXT: s_lshr_b32 s1, s1, s2
; GFX8-NEXT: s_lshl_b32 s4, s4, 1
; GFX8-NEXT: s_or_b32 s0, s0, s1
; GFX8-NEXT: s_and_b32 s1, s5, 15
-; GFX8-NEXT: s_andn2_b32 s2, 15, s5
; GFX8-NEXT: s_lshl_b32 s1, s3, s1
; GFX8-NEXT: s_and_b32 s3, 0xffff, s4
+; GFX8-NEXT: s_andn2_b32 s2, 15, s5
; GFX8-NEXT: s_lshr_b32 s3, s3, 1
-; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
; GFX8-NEXT: s_lshr_b32 s2, s3, s2
; GFX8-NEXT: s_or_b32 s1, s1, s2
; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
@@ -4006,24 +3868,21 @@ define amdgpu_ps float @v_fshr_v2i16_svs(<2 x i16> inreg %lhs, <2 x i16> %rhs, <
; GFX6-NEXT: s_lshl_b32 s0, s1, 1
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 14, v3
; GFX6-NEXT: v_or_b32_e32 v3, s0, v3
-; GFX6-NEXT: s_xor_b32 s0, s2, -1
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX6-NEXT: s_xor_b32 s0, s2, -1
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 1, v1
; GFX6-NEXT: s_lshr_b32 s1, s0, 16
; GFX6-NEXT: s_and_b32 s2, s0, 15
; GFX6-NEXT: s_andn2_b32 s0, 15, s0
-; GFX6-NEXT: s_and_b32 s2, 0xffff, s2
; GFX6-NEXT: v_bfe_u32 v0, v0, 1, 15
-; GFX6-NEXT: s_and_b32 s0, 0xffff, s0
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 1, v1
; GFX6-NEXT: v_lshlrev_b32_e32 v2, s2, v2
; GFX6-NEXT: v_lshrrev_b32_e32 v0, s0, v0
; GFX6-NEXT: s_and_b32 s0, s1, 15
; GFX6-NEXT: s_andn2_b32 s1, 15, s1
+; GFX6-NEXT: v_bfe_u32 v1, v1, 1, 15
; GFX6-NEXT: v_or_b32_e32 v0, v2, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v2, s0, v3
-; GFX6-NEXT: v_bfe_u32 v1, v1, 1, 15
-; GFX6-NEXT: s_and_b32 s0, 0xffff, s1
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, s0, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, s1, v1
; GFX6-NEXT: v_or_b32_e32 v1, v2, v1
; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
@@ -4133,15 +3992,13 @@ define amdgpu_ps float @v_fshr_v2i16_vss(<2 x i16> %lhs, <2 x i16> inreg %rhs, <
; GFX6-NEXT: s_bfe_u32 s3, s1, 0xf0001
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 1, v1
; GFX6-NEXT: s_lshr_b32 s3, s3, 14
+; GFX6-NEXT: s_lshl_b32 s0, s0, 1
; GFX6-NEXT: s_xor_b32 s2, s2, -1
; GFX6-NEXT: v_or_b32_e32 v1, s3, v1
-; GFX6-NEXT: s_lshl_b32 s0, s0, 1
; GFX6-NEXT: s_lshr_b32 s3, s2, 16
; GFX6-NEXT: s_and_b32 s4, s2, 15
; GFX6-NEXT: s_andn2_b32 s2, 15, s2
-; GFX6-NEXT: s_and_b32 s4, 0xffff, s4
; GFX6-NEXT: s_bfe_u32 s0, s0, 0xf0001
-; GFX6-NEXT: s_and_b32 s2, 0xffff, s2
; GFX6-NEXT: v_lshlrev_b32_e32 v0, s4, v0
; GFX6-NEXT: s_lshr_b32 s0, s0, s2
; GFX6-NEXT: s_lshl_b32 s1, s1, 1
@@ -4150,8 +4007,7 @@ define amdgpu_ps float @v_fshr_v2i16_vss(<2 x i16> %lhs, <2 x i16> inreg %rhs, <
; GFX6-NEXT: s_andn2_b32 s2, 15, s3
; GFX6-NEXT: v_lshlrev_b32_e32 v1, s0, v1
; GFX6-NEXT: s_bfe_u32 s0, s1, 0xf0001
-; GFX6-NEXT: s_and_b32 s1, 0xffff, s2
-; GFX6-NEXT: s_lshr_b32 s0, s0, s1
+; GFX6-NEXT: s_lshr_b32 s0, s0, s2
; GFX6-NEXT: v_or_b32_e32 v1, s0, v1
; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
@@ -4166,28 +4022,26 @@ define amdgpu_ps float @v_fshr_v2i16_vss(<2 x i16> %lhs, <2 x i16> inreg %rhs, <
; GFX8-NEXT: v_lshlrev_b16_e32 v1, 1, v0
; GFX8-NEXT: s_lshr_b32 s3, s3, 15
; GFX8-NEXT: v_mov_b32_e32 v2, 1
+; GFX8-NEXT: s_lshl_b32 s0, s0, 1
; GFX8-NEXT: v_or_b32_e32 v1, s3, v1
; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: s_lshr_b32 s3, s2, 15
-; GFX8-NEXT: s_lshl_b32 s0, s0, 1
; GFX8-NEXT: s_xor_b32 s1, s1, -1
+; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
; GFX8-NEXT: v_or_b32_e32 v0, s3, v0
; GFX8-NEXT: s_lshr_b32 s3, s1, 16
; GFX8-NEXT: s_and_b32 s4, s1, 15
; GFX8-NEXT: s_andn2_b32 s1, 15, s1
-; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
; GFX8-NEXT: s_lshr_b32 s0, s0, 1
-; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
; GFX8-NEXT: v_lshlrev_b16_e32 v1, s4, v1
; GFX8-NEXT: s_lshr_b32 s0, s0, s1
; GFX8-NEXT: s_lshl_b32 s2, s2, 1
; GFX8-NEXT: v_or_b32_e32 v1, s0, v1
; GFX8-NEXT: s_and_b32 s0, s3, 15
-; GFX8-NEXT: s_andn2_b32 s1, 15, s3
; GFX8-NEXT: v_lshlrev_b16_e32 v0, s0, v0
; GFX8-NEXT: s_and_b32 s0, 0xffff, s2
+; GFX8-NEXT: s_andn2_b32 s1, 15, s3
; GFX8-NEXT: s_lshr_b32 s0, s0, 1
-; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
; GFX8-NEXT: s_lshr_b32 s0, s0, s1
; GFX8-NEXT: v_or_b32_e32 v0, s0, v0
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0
@@ -4261,15 +4115,13 @@ define amdgpu_ps i48 @s_fshr_v3i16(<3 x i16> inreg %lhs, <3 x i16> inreg %rhs, <
; GFX6-NEXT: s_bfe_u32 s8, s4, 0xf0001
; GFX6-NEXT: s_lshl_b32 s1, s1, 1
; GFX6-NEXT: s_lshr_b32 s8, s8, 14
+; GFX6-NEXT: s_lshl_b32 s3, s3, 1
; GFX6-NEXT: s_xor_b32 s6, s6, -1
; GFX6-NEXT: s_or_b32 s1, s1, s8
-; GFX6-NEXT: s_lshl_b32 s3, s3, 1
; GFX6-NEXT: s_lshr_b32 s8, s6, 16
; GFX6-NEXT: s_and_b32 s9, s6, 15
; GFX6-NEXT: s_andn2_b32 s6, 15, s6
-; GFX6-NEXT: s_and_b32 s9, 0xffff, s9
; GFX6-NEXT: s_bfe_u32 s3, s3, 0xf0001
-; GFX6-NEXT: s_and_b32 s6, 0xffff, s6
; GFX6-NEXT: s_lshl_b32 s0, s0, s9
; GFX6-NEXT: s_lshr_b32 s3, s3, s6
; GFX6-NEXT: s_lshl_b32 s4, s4, 1
@@ -4278,20 +4130,17 @@ define amdgpu_ps i48 @s_fshr_v3i16(<3 x i16> inreg %lhs, <3 x i16> inreg %rhs, <
; GFX6-NEXT: s_andn2_b32 s6, 15, s8
; GFX6-NEXT: s_lshl_b32 s1, s1, s3
; GFX6-NEXT: s_bfe_u32 s3, s4, 0xf0001
-; GFX6-NEXT: s_and_b32 s4, 0xffff, s6
-; GFX6-NEXT: s_lshr_b32 s3, s3, s4
+; GFX6-NEXT: s_lshr_b32 s3, s3, s6
; GFX6-NEXT: s_or_b32 s1, s1, s3
; GFX6-NEXT: s_bfe_u32 s3, s5, 0xf0001
; GFX6-NEXT: s_lshl_b32 s2, s2, 1
; GFX6-NEXT: s_lshr_b32 s3, s3, 14
-; GFX6-NEXT: s_xor_b32 s4, s7, -1
; GFX6-NEXT: s_or_b32 s2, s2, s3
; GFX6-NEXT: s_lshl_b32 s3, s5, 1
+; GFX6-NEXT: s_xor_b32 s4, s7, -1
; GFX6-NEXT: s_and_b32 s5, s4, 15
; GFX6-NEXT: s_andn2_b32 s4, 15, s4
-; GFX6-NEXT: s_and_b32 s5, 0xffff, s5
; GFX6-NEXT: s_bfe_u32 s3, s3, 0xf0001
-; GFX6-NEXT: s_and_b32 s4, 0xffff, s4
; GFX6-NEXT: s_lshl_b32 s2, s2, s5
; GFX6-NEXT: s_lshr_b32 s3, s3, s4
; GFX6-NEXT: s_and_b32 s1, 0xffff, s1
@@ -4309,43 +4158,38 @@ define amdgpu_ps i48 @s_fshr_v3i16(<3 x i16> inreg %lhs, <3 x i16> inreg %rhs, <
; GFX8-NEXT: s_lshr_b32 s7, s2, 16
; GFX8-NEXT: s_lshl_b32 s0, s0, 1
; GFX8-NEXT: s_lshr_b32 s8, s8, 15
+; GFX8-NEXT: s_lshl_b32 s2, s2, 1
; GFX8-NEXT: s_or_b32 s0, s0, s8
; GFX8-NEXT: s_lshl_b32 s6, s6, 1
; GFX8-NEXT: s_lshr_b32 s8, s7, 15
-; GFX8-NEXT: s_lshl_b32 s2, s2, 1
; GFX8-NEXT: s_xor_b32 s4, s4, -1
+; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
; GFX8-NEXT: s_or_b32 s6, s6, s8
; GFX8-NEXT: s_lshr_b32 s8, s4, 16
; GFX8-NEXT: s_and_b32 s9, s4, 15
; GFX8-NEXT: s_andn2_b32 s4, 15, s4
-; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
-; GFX8-NEXT: s_and_b32 s9, 0xffff, s9
; GFX8-NEXT: s_lshr_b32 s2, s2, 1
-; GFX8-NEXT: s_and_b32 s4, 0xffff, s4
; GFX8-NEXT: s_lshl_b32 s0, s0, s9
; GFX8-NEXT: s_lshr_b32 s2, s2, s4
; GFX8-NEXT: s_lshl_b32 s7, s7, 1
; GFX8-NEXT: s_or_b32 s0, s0, s2
; GFX8-NEXT: s_and_b32 s2, s8, 15
-; GFX8-NEXT: s_andn2_b32 s4, 15, s8
; GFX8-NEXT: s_lshl_b32 s2, s6, s2
; GFX8-NEXT: s_and_b32 s6, 0xffff, s7
+; GFX8-NEXT: s_andn2_b32 s4, 15, s8
; GFX8-NEXT: s_lshr_b32 s6, s6, 1
-; GFX8-NEXT: s_and_b32 s4, 0xffff, s4
; GFX8-NEXT: s_lshr_b32 s4, s6, s4
; GFX8-NEXT: s_or_b32 s2, s2, s4
; GFX8-NEXT: s_and_b32 s4, 0xffff, s3
; GFX8-NEXT: s_lshl_b32 s1, s1, 1
; GFX8-NEXT: s_lshr_b32 s4, s4, 15
-; GFX8-NEXT: s_or_b32 s1, s1, s4
; GFX8-NEXT: s_lshl_b32 s3, s3, 1
+; GFX8-NEXT: s_or_b32 s1, s1, s4
; GFX8-NEXT: s_xor_b32 s4, s5, -1
+; GFX8-NEXT: s_and_b32 s3, 0xffff, s3
; GFX8-NEXT: s_and_b32 s5, s4, 15
; GFX8-NEXT: s_andn2_b32 s4, 15, s4
-; GFX8-NEXT: s_and_b32 s3, 0xffff, s3
-; GFX8-NEXT: s_and_b32 s5, 0xffff, s5
; GFX8-NEXT: s_lshr_b32 s3, s3, 1
-; GFX8-NEXT: s_and_b32 s4, 0xffff, s4
; GFX8-NEXT: s_lshl_b32 s1, s1, s5
; GFX8-NEXT: s_lshr_b32 s3, s3, s4
; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
@@ -4678,15 +4522,13 @@ define amdgpu_ps <2 x i32> @s_fshr_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %
; GFX6-NEXT: s_bfe_u32 s10, s5, 0xf0001
; GFX6-NEXT: s_lshl_b32 s1, s1, 1
; GFX6-NEXT: s_lshr_b32 s10, s10, 14
+; GFX6-NEXT: s_lshl_b32 s4, s4, 1
; GFX6-NEXT: s_xor_b32 s8, s8, -1
; GFX6-NEXT: s_or_b32 s1, s1, s10
-; GFX6-NEXT: s_lshl_b32 s4, s4, 1
; GFX6-NEXT: s_lshr_b32 s10, s8, 16
; GFX6-NEXT: s_and_b32 s11, s8, 15
; GFX6-NEXT: s_andn2_b32 s8, 15, s8
-; GFX6-NEXT: s_and_b32 s11, 0xffff, s11
; GFX6-NEXT: s_bfe_u32 s4, s4, 0xf0001
-; GFX6-NEXT: s_and_b32 s8, 0xffff, s8
; GFX6-NEXT: s_lshl_b32 s0, s0, s11
; GFX6-NEXT: s_lshr_b32 s4, s4, s8
; GFX6-NEXT: s_lshl_b32 s5, s5, 1
@@ -4695,8 +4537,7 @@ define amdgpu_ps <2 x i32> @s_fshr_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %
; GFX6-NEXT: s_andn2_b32 s8, 15, s10
; GFX6-NEXT: s_lshl_b32 s1, s1, s4
; GFX6-NEXT: s_bfe_u32 s4, s5, 0xf0001
-; GFX6-NEXT: s_and_b32 s5, 0xffff, s8
-; GFX6-NEXT: s_lshr_b32 s4, s4, s5
+; GFX6-NEXT: s_lshr_b32 s4, s4, s8
; GFX6-NEXT: s_or_b32 s1, s1, s4
; GFX6-NEXT: s_and_b32 s1, 0xffff, s1
; GFX6-NEXT: s_and_b32 s0, 0xffff, s0
@@ -4709,16 +4550,14 @@ define amdgpu_ps <2 x i32> @s_fshr_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %
; GFX6-NEXT: s_lshl_b32 s2, s3, 1
; GFX6-NEXT: s_bfe_u32 s3, s7, 0xf0001
; GFX6-NEXT: s_lshr_b32 s3, s3, 14
-; GFX6-NEXT: s_xor_b32 s5, s9, -1
; GFX6-NEXT: s_or_b32 s2, s2, s3
; GFX6-NEXT: s_lshl_b32 s3, s6, 1
+; GFX6-NEXT: s_xor_b32 s5, s9, -1
; GFX6-NEXT: s_lshl_b32 s4, s7, 1
; GFX6-NEXT: s_lshr_b32 s6, s5, 16
; GFX6-NEXT: s_and_b32 s7, s5, 15
; GFX6-NEXT: s_andn2_b32 s5, 15, s5
-; GFX6-NEXT: s_and_b32 s7, 0xffff, s7
; GFX6-NEXT: s_bfe_u32 s3, s3, 0xf0001
-; GFX6-NEXT: s_and_b32 s5, 0xffff, s5
; GFX6-NEXT: s_lshl_b32 s1, s1, s7
; GFX6-NEXT: s_lshr_b32 s3, s3, s5
; GFX6-NEXT: s_or_b32 s1, s1, s3
@@ -4726,8 +4565,7 @@ define amdgpu_ps <2 x i32> @s_fshr_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %
; GFX6-NEXT: s_andn2_b32 s5, 15, s6
; GFX6-NEXT: s_lshl_b32 s2, s2, s3
; GFX6-NEXT: s_bfe_u32 s3, s4, 0xf0001
-; GFX6-NEXT: s_and_b32 s4, 0xffff, s5
-; GFX6-NEXT: s_lshr_b32 s3, s3, s4
+; GFX6-NEXT: s_lshr_b32 s3, s3, s5
; GFX6-NEXT: s_or_b32 s2, s2, s3
; GFX6-NEXT: s_and_b32 s2, 0xffff, s2
; GFX6-NEXT: s_and_b32 s1, 0xffff, s1
@@ -4742,29 +4580,26 @@ define amdgpu_ps <2 x i32> @s_fshr_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %
; GFX8-NEXT: s_lshr_b32 s7, s2, 16
; GFX8-NEXT: s_lshl_b32 s0, s0, 1
; GFX8-NEXT: s_lshr_b32 s8, s8, 15
+; GFX8-NEXT: s_lshl_b32 s2, s2, 1
; GFX8-NEXT: s_or_b32 s0, s0, s8
; GFX8-NEXT: s_lshl_b32 s6, s6, 1
; GFX8-NEXT: s_lshr_b32 s8, s7, 15
-; GFX8-NEXT: s_lshl_b32 s2, s2, 1
; GFX8-NEXT: s_xor_b32 s4, s4, -1
+; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
; GFX8-NEXT: s_or_b32 s6, s6, s8
; GFX8-NEXT: s_lshr_b32 s8, s4, 16
; GFX8-NEXT: s_and_b32 s9, s4, 15
; GFX8-NEXT: s_andn2_b32 s4, 15, s4
-; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
-; GFX8-NEXT: s_and_b32 s9, 0xffff, s9
; GFX8-NEXT: s_lshr_b32 s2, s2, 1
-; GFX8-NEXT: s_and_b32 s4, 0xffff, s4
; GFX8-NEXT: s_lshl_b32 s0, s0, s9
; GFX8-NEXT: s_lshr_b32 s2, s2, s4
; GFX8-NEXT: s_lshl_b32 s7, s7, 1
; GFX8-NEXT: s_or_b32 s0, s0, s2
; GFX8-NEXT: s_and_b32 s2, s8, 15
-; GFX8-NEXT: s_andn2_b32 s4, 15, s8
; GFX8-NEXT: s_lshl_b32 s2, s6, s2
; GFX8-NEXT: s_and_b32 s6, 0xffff, s7
+; GFX8-NEXT: s_andn2_b32 s4, 15, s8
; GFX8-NEXT: s_lshr_b32 s6, s6, 1
-; GFX8-NEXT: s_and_b32 s4, 0xffff, s4
; GFX8-NEXT: s_lshr_b32 s4, s6, s4
; GFX8-NEXT: s_or_b32 s2, s2, s4
; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
@@ -4776,30 +4611,27 @@ define amdgpu_ps <2 x i32> @s_fshr_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %
; GFX8-NEXT: s_lshr_b32 s4, s3, 16
; GFX8-NEXT: s_lshl_b32 s1, s1, 1
; GFX8-NEXT: s_lshr_b32 s6, s6, 15
+; GFX8-NEXT: s_lshl_b32 s3, s3, 1
; GFX8-NEXT: s_or_b32 s1, s1, s6
; GFX8-NEXT: s_lshl_b32 s2, s2, 1
; GFX8-NEXT: s_lshr_b32 s6, s4, 15
-; GFX8-NEXT: s_lshl_b32 s3, s3, 1
; GFX8-NEXT: s_xor_b32 s5, s5, -1
+; GFX8-NEXT: s_and_b32 s3, 0xffff, s3
; GFX8-NEXT: s_or_b32 s2, s2, s6
; GFX8-NEXT: s_lshr_b32 s6, s5, 16
; GFX8-NEXT: s_and_b32 s7, s5, 15
; GFX8-NEXT: s_andn2_b32 s5, 15, s5
-; GFX8-NEXT: s_and_b32 s3, 0xffff, s3
-; GFX8-NEXT: s_and_b32 s7, 0xffff, s7
; GFX8-NEXT: s_lshr_b32 s3, s3, 1
-; GFX8-NEXT: s_and_b32 s5, 0xffff, s5
; GFX8-NEXT: s_lshl_b32 s1, s1, s7
; GFX8-NEXT: s_lshr_b32 s3, s3, s5
; GFX8-NEXT: s_lshl_b32 s4, s4, 1
; GFX8-NEXT: s_or_b32 s1, s1, s3
; GFX8-NEXT: s_and_b32 s3, s6, 15
-; GFX8-NEXT: s_andn2_b32 s5, 15, s6
; GFX8-NEXT: s_lshl_b32 s2, s2, s3
; GFX8-NEXT: s_and_b32 s3, 0xffff, s4
+; GFX8-NEXT: s_andn2_b32 s5, 15, s6
; GFX8-NEXT: s_lshr_b32 s3, s3, 1
-; GFX8-NEXT: s_and_b32 s4, 0xffff, s5
-; GFX8-NEXT: s_lshr_b32 s3, s3, s4
+; GFX8-NEXT: s_lshr_b32 s3, s3, s5
; GFX8-NEXT: s_or_b32 s2, s2, s3
; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
index 694a81a9668f3..5ede348e51f54 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
@@ -76,17 +76,16 @@ define amdgpu_ps i7 @s_saddsat_i7(i7 inreg %lhs, i7 inreg %rhs) {
; GFX8: ; %bb.0:
; GFX8-NEXT: s_lshl_b32 s0, s0, 9
; GFX8-NEXT: s_sext_i32_i16 s2, s0
-; GFX8-NEXT: s_sext_i32_i16 s3, 0
-; GFX8-NEXT: s_max_i32 s4, s2, s3
-; GFX8-NEXT: s_min_i32 s2, s2, s3
+; GFX8-NEXT: s_max_i32 s3, s2, 0
+; GFX8-NEXT: s_min_i32 s2, s2, 0
; GFX8-NEXT: s_lshl_b32 s1, s1, 9
-; GFX8-NEXT: s_sub_i32 s2, 0xffff8000, s2
+; GFX8-NEXT: s_sub_i32 s2, 0x8000, s2
; GFX8-NEXT: s_sext_i32_i16 s2, s2
; GFX8-NEXT: s_sext_i32_i16 s1, s1
-; GFX8-NEXT: s_sub_i32 s4, 0x7fff, s4
+; GFX8-NEXT: s_sub_i32 s3, 0x7fff, s3
; GFX8-NEXT: s_max_i32 s1, s2, s1
; GFX8-NEXT: s_sext_i32_i16 s1, s1
-; GFX8-NEXT: s_sext_i32_i16 s2, s4
+; GFX8-NEXT: s_sext_i32_i16 s2, s3
; GFX8-NEXT: s_min_i32 s1, s1, s2
; GFX8-NEXT: s_add_i32 s0, s0, s1
; GFX8-NEXT: s_sext_i32_i16 s0, s0
@@ -186,17 +185,16 @@ define amdgpu_ps i8 @s_saddsat_i8(i8 inreg %lhs, i8 inreg %rhs) {
; GFX8: ; %bb.0:
; GFX8-NEXT: s_lshl_b32 s0, s0, 8
; GFX8-NEXT: s_sext_i32_i16 s2, s0
-; GFX8-NEXT: s_sext_i32_i16 s3, 0
-; GFX8-NEXT: s_max_i32 s4, s2, s3
-; GFX8-NEXT: s_min_i32 s2, s2, s3
+; GFX8-NEXT: s_max_i32 s3, s2, 0
+; GFX8-NEXT: s_min_i32 s2, s2, 0
; GFX8-NEXT: s_lshl_b32 s1, s1, 8
-; GFX8-NEXT: s_sub_i32 s2, 0xffff8000, s2
+; GFX8-NEXT: s_sub_i32 s2, 0x8000, s2
; GFX8-NEXT: s_sext_i32_i16 s2, s2
; GFX8-NEXT: s_sext_i32_i16 s1, s1
-; GFX8-NEXT: s_sub_i32 s4, 0x7fff, s4
+; GFX8-NEXT: s_sub_i32 s3, 0x7fff, s3
; GFX8-NEXT: s_max_i32 s1, s2, s1
; GFX8-NEXT: s_sext_i32_i16 s1, s1
-; GFX8-NEXT: s_sext_i32_i16 s2, s4
+; GFX8-NEXT: s_sext_i32_i16 s2, s3
; GFX8-NEXT: s_min_i32 s1, s1, s2
; GFX8-NEXT: s_add_i32 s0, s0, s1
; GFX8-NEXT: s_sext_i32_i16 s0, s0
@@ -383,26 +381,25 @@ define amdgpu_ps i16 @s_saddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
; GFX8-NEXT: s_lshr_b32 s2, s0, 8
; GFX8-NEXT: s_lshl_b32 s0, s0, 8
; GFX8-NEXT: s_sext_i32_i16 s4, s0
-; GFX8-NEXT: s_sext_i32_i16 s5, 0
-; GFX8-NEXT: s_max_i32 s6, s4, s5
-; GFX8-NEXT: s_min_i32 s4, s4, s5
+; GFX8-NEXT: s_max_i32 s5, s4, 0
+; GFX8-NEXT: s_min_i32 s4, s4, 0
; GFX8-NEXT: s_lshr_b32 s3, s1, 8
; GFX8-NEXT: s_lshl_b32 s1, s1, 8
-; GFX8-NEXT: s_sub_i32 s4, 0xffff8000, s4
+; GFX8-NEXT: s_sub_i32 s4, 0x8000, s4
; GFX8-NEXT: s_sext_i32_i16 s4, s4
; GFX8-NEXT: s_sext_i32_i16 s1, s1
-; GFX8-NEXT: s_sub_i32 s6, 0x7fff, s6
+; GFX8-NEXT: s_sub_i32 s5, 0x7fff, s5
; GFX8-NEXT: s_max_i32 s1, s4, s1
; GFX8-NEXT: s_sext_i32_i16 s1, s1
-; GFX8-NEXT: s_sext_i32_i16 s4, s6
+; GFX8-NEXT: s_sext_i32_i16 s4, s5
; GFX8-NEXT: s_min_i32 s1, s1, s4
; GFX8-NEXT: s_add_i32 s0, s0, s1
; GFX8-NEXT: s_lshl_b32 s1, s2, 8
; GFX8-NEXT: s_lshl_b32 s2, s3, 8
; GFX8-NEXT: s_sext_i32_i16 s3, s1
-; GFX8-NEXT: s_max_i32 s4, s3, s5
-; GFX8-NEXT: s_min_i32 s3, s3, s5
-; GFX8-NEXT: s_sub_i32 s3, 0xffff8000, s3
+; GFX8-NEXT: s_max_i32 s4, s3, 0
+; GFX8-NEXT: s_min_i32 s3, s3, 0
+; GFX8-NEXT: s_sub_i32 s3, 0x8000, s3
; GFX8-NEXT: s_sext_i32_i16 s3, s3
; GFX8-NEXT: s_sext_i32_i16 s2, s2
; GFX8-NEXT: s_sub_i32 s4, 0x7fff, s4
@@ -783,28 +780,27 @@ define amdgpu_ps i32 @s_saddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
; GFX8-NEXT: s_lshr_b32 s4, s0, 24
; GFX8-NEXT: s_lshl_b32 s0, s0, 8
; GFX8-NEXT: s_sext_i32_i16 s8, s0
-; GFX8-NEXT: s_sext_i32_i16 s9, 0
-; GFX8-NEXT: s_max_i32 s10, s8, s9
-; GFX8-NEXT: s_min_i32 s8, s8, s9
+; GFX8-NEXT: s_max_i32 s9, s8, 0
+; GFX8-NEXT: s_min_i32 s8, s8, 0
; GFX8-NEXT: s_lshr_b32 s5, s1, 8
; GFX8-NEXT: s_lshr_b32 s6, s1, 16
; GFX8-NEXT: s_lshr_b32 s7, s1, 24
; GFX8-NEXT: s_lshl_b32 s1, s1, 8
-; GFX8-NEXT: s_sub_i32 s8, 0xffff8000, s8
+; GFX8-NEXT: s_sub_i32 s8, 0x8000, s8
; GFX8-NEXT: s_sext_i32_i16 s8, s8
; GFX8-NEXT: s_sext_i32_i16 s1, s1
-; GFX8-NEXT: s_sub_i32 s10, 0x7fff, s10
+; GFX8-NEXT: s_sub_i32 s9, 0x7fff, s9
; GFX8-NEXT: s_max_i32 s1, s8, s1
; GFX8-NEXT: s_sext_i32_i16 s1, s1
-; GFX8-NEXT: s_sext_i32_i16 s8, s10
+; GFX8-NEXT: s_sext_i32_i16 s8, s9
; GFX8-NEXT: s_min_i32 s1, s1, s8
; GFX8-NEXT: s_add_i32 s0, s0, s1
; GFX8-NEXT: s_lshl_b32 s1, s2, 8
; GFX8-NEXT: s_lshl_b32 s2, s5, 8
; GFX8-NEXT: s_sext_i32_i16 s5, s1
-; GFX8-NEXT: s_max_i32 s8, s5, s9
-; GFX8-NEXT: s_min_i32 s5, s5, s9
-; GFX8-NEXT: s_sub_i32 s5, 0xffff8000, s5
+; GFX8-NEXT: s_max_i32 s8, s5, 0
+; GFX8-NEXT: s_min_i32 s5, s5, 0
+; GFX8-NEXT: s_sub_i32 s5, 0x8000, s5
; GFX8-NEXT: s_sext_i32_i16 s5, s5
; GFX8-NEXT: s_sext_i32_i16 s2, s2
; GFX8-NEXT: s_sub_i32 s8, 0x7fff, s8
@@ -816,9 +812,9 @@ define amdgpu_ps i32 @s_saddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
; GFX8-NEXT: s_lshl_b32 s2, s3, 8
; GFX8-NEXT: s_sext_i32_i16 s5, s2
; GFX8-NEXT: s_lshl_b32 s3, s6, 8
-; GFX8-NEXT: s_max_i32 s6, s5, s9
-; GFX8-NEXT: s_min_i32 s5, s5, s9
-; GFX8-NEXT: s_sub_i32 s5, 0xffff8000, s5
+; GFX8-NEXT: s_max_i32 s6, s5, 0
+; GFX8-NEXT: s_min_i32 s5, s5, 0
+; GFX8-NEXT: s_sub_i32 s5, 0x8000, s5
; GFX8-NEXT: s_sext_i32_i16 s5, s5
; GFX8-NEXT: s_sext_i32_i16 s3, s3
; GFX8-NEXT: s_sub_i32 s6, 0x7fff, s6
@@ -829,10 +825,10 @@ define amdgpu_ps i32 @s_saddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
; GFX8-NEXT: s_add_i32 s2, s2, s3
; GFX8-NEXT: s_lshl_b32 s3, s4, 8
; GFX8-NEXT: s_sext_i32_i16 s5, s3
-; GFX8-NEXT: s_max_i32 s6, s5, s9
-; GFX8-NEXT: s_min_i32 s5, s5, s9
+; GFX8-NEXT: s_max_i32 s6, s5, 0
+; GFX8-NEXT: s_min_i32 s5, s5, 0
; GFX8-NEXT: s_lshl_b32 s4, s7, 8
-; GFX8-NEXT: s_sub_i32 s5, 0xffff8000, s5
+; GFX8-NEXT: s_sub_i32 s5, 0x8000, s5
; GFX8-NEXT: s_sext_i32_i16 s5, s5
; GFX8-NEXT: s_sext_i32_i16 s4, s4
; GFX8-NEXT: s_sext_i32_i16 s1, s1
@@ -2632,16 +2628,15 @@ define amdgpu_ps i16 @s_saddsat_i16(i16 inreg %lhs, i16 inreg %rhs) {
; GFX8-LABEL: s_saddsat_i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_sext_i32_i16 s2, s0
-; GFX8-NEXT: s_sext_i32_i16 s3, 0
-; GFX8-NEXT: s_max_i32 s4, s2, s3
-; GFX8-NEXT: s_min_i32 s2, s2, s3
-; GFX8-NEXT: s_sub_i32 s2, 0xffff8000, s2
+; GFX8-NEXT: s_max_i32 s3, s2, 0
+; GFX8-NEXT: s_min_i32 s2, s2, 0
+; GFX8-NEXT: s_sub_i32 s2, 0x8000, s2
; GFX8-NEXT: s_sext_i32_i16 s2, s2
; GFX8-NEXT: s_sext_i32_i16 s1, s1
-; GFX8-NEXT: s_sub_i32 s4, 0x7fff, s4
+; GFX8-NEXT: s_sub_i32 s3, 0x7fff, s3
; GFX8-NEXT: s_max_i32 s1, s2, s1
; GFX8-NEXT: s_sext_i32_i16 s1, s1
-; GFX8-NEXT: s_sext_i32_i16 s2, s4
+; GFX8-NEXT: s_sext_i32_i16 s2, s3
; GFX8-NEXT: s_min_i32 s1, s1, s2
; GFX8-NEXT: s_add_i32 s0, s0, s1
; GFX8-NEXT: ; return to shader part epilog
@@ -2680,13 +2675,12 @@ define amdgpu_ps half @saddsat_i16_sv(i16 inreg %lhs, i16 %rhs) {
; GFX8-LABEL: saddsat_i16_sv:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_sext_i32_i16 s1, s0
-; GFX8-NEXT: s_sext_i32_i16 s2, 0
-; GFX8-NEXT: s_max_i32 s3, s1, s2
-; GFX8-NEXT: s_min_i32 s1, s1, s2
-; GFX8-NEXT: s_sub_i32 s1, 0xffff8000, s1
-; GFX8-NEXT: s_sub_i32 s3, 0x7fff, s3
+; GFX8-NEXT: s_max_i32 s2, s1, 0
+; GFX8-NEXT: s_min_i32 s1, s1, 0
+; GFX8-NEXT: s_sub_i32 s1, 0x8000, s1
+; GFX8-NEXT: s_sub_i32 s2, 0x7fff, s2
; GFX8-NEXT: v_max_i16_e32 v0, s1, v0
-; GFX8-NEXT: v_min_i16_e32 v0, s3, v0
+; GFX8-NEXT: v_min_i16_e32 v0, s2, v0
; GFX8-NEXT: v_add_u16_e32 v0, s0, v0
; GFX8-NEXT: ; return to shader part epilog
;
@@ -2838,24 +2832,23 @@ define amdgpu_ps i32 @s_saddsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs
; GFX8-LABEL: s_saddsat_v2i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_sext_i32_i16 s4, s0
-; GFX8-NEXT: s_sext_i32_i16 s5, 0
-; GFX8-NEXT: s_max_i32 s6, s4, s5
-; GFX8-NEXT: s_min_i32 s4, s4, s5
-; GFX8-NEXT: s_sub_i32 s4, 0xffff8000, s4
+; GFX8-NEXT: s_max_i32 s5, s4, 0
+; GFX8-NEXT: s_min_i32 s4, s4, 0
+; GFX8-NEXT: s_sub_i32 s4, 0x8000, s4
; GFX8-NEXT: s_lshr_b32 s3, s1, 16
; GFX8-NEXT: s_sext_i32_i16 s4, s4
; GFX8-NEXT: s_sext_i32_i16 s1, s1
-; GFX8-NEXT: s_sub_i32 s6, 0x7fff, s6
+; GFX8-NEXT: s_sub_i32 s5, 0x7fff, s5
; GFX8-NEXT: s_max_i32 s1, s4, s1
; GFX8-NEXT: s_sext_i32_i16 s1, s1
-; GFX8-NEXT: s_sext_i32_i16 s4, s6
+; GFX8-NEXT: s_sext_i32_i16 s4, s5
; GFX8-NEXT: s_lshr_b32 s2, s0, 16
; GFX8-NEXT: s_min_i32 s1, s1, s4
; GFX8-NEXT: s_add_i32 s0, s0, s1
; GFX8-NEXT: s_sext_i32_i16 s1, s2
-; GFX8-NEXT: s_max_i32 s4, s1, s5
-; GFX8-NEXT: s_min_i32 s1, s1, s5
-; GFX8-NEXT: s_sub_i32 s1, 0xffff8000, s1
+; GFX8-NEXT: s_max_i32 s4, s1, 0
+; GFX8-NEXT: s_min_i32 s1, s1, 0
+; GFX8-NEXT: s_sub_i32 s1, 0x8000, s1
; GFX8-NEXT: s_sext_i32_i16 s1, s1
; GFX8-NEXT: s_sext_i32_i16 s3, s3
; GFX8-NEXT: s_sub_i32 s4, 0x7fff, s4
@@ -2919,22 +2912,21 @@ define amdgpu_ps float @saddsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) {
; GFX8-LABEL: saddsat_v2i16_sv:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_sext_i32_i16 s2, s0
-; GFX8-NEXT: s_sext_i32_i16 s3, 0
-; GFX8-NEXT: s_max_i32 s4, s2, s3
-; GFX8-NEXT: s_min_i32 s2, s2, s3
+; GFX8-NEXT: s_max_i32 s3, s2, 0
+; GFX8-NEXT: s_min_i32 s2, s2, 0
; GFX8-NEXT: s_lshr_b32 s1, s0, 16
-; GFX8-NEXT: s_sub_i32 s2, 0xffff8000, s2
-; GFX8-NEXT: s_sub_i32 s4, 0x7fff, s4
+; GFX8-NEXT: s_sub_i32 s2, 0x8000, s2
+; GFX8-NEXT: s_sub_i32 s3, 0x7fff, s3
; GFX8-NEXT: v_max_i16_e32 v1, s2, v0
; GFX8-NEXT: s_sext_i32_i16 s2, s1
-; GFX8-NEXT: v_min_i16_e32 v1, s4, v1
-; GFX8-NEXT: s_max_i32 s4, s2, s3
-; GFX8-NEXT: s_min_i32 s2, s2, s3
-; GFX8-NEXT: s_sub_i32 s2, 0xffff8000, s2
+; GFX8-NEXT: v_min_i16_e32 v1, s3, v1
+; GFX8-NEXT: s_max_i32 s3, s2, 0
+; GFX8-NEXT: s_min_i32 s2, s2, 0
+; GFX8-NEXT: s_sub_i32 s2, 0x8000, s2
; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: s_sub_i32 s4, 0x7fff, s4
+; GFX8-NEXT: s_sub_i32 s3, 0x7fff, s3
; GFX8-NEXT: v_max_i16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_min_i16_e32 v0, s4, v0
+; GFX8-NEXT: v_min_i16_e32 v0, s3, v0
; GFX8-NEXT: v_mov_b32_e32 v2, s1
; GFX8-NEXT: v_add_u16_e32 v1, s0, v1
; GFX8-NEXT: v_add_u16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -3196,24 +3188,23 @@ define amdgpu_ps <2 x i32> @s_saddsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre
; GFX8-LABEL: s_saddsat_v4i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_sext_i32_i16 s8, s0
-; GFX8-NEXT: s_sext_i32_i16 s9, 0
-; GFX8-NEXT: s_max_i32 s10, s8, s9
-; GFX8-NEXT: s_min_i32 s8, s8, s9
-; GFX8-NEXT: s_sub_i32 s8, 0xffff8000, s8
+; GFX8-NEXT: s_max_i32 s9, s8, 0
+; GFX8-NEXT: s_min_i32 s8, s8, 0
+; GFX8-NEXT: s_sub_i32 s8, 0x8000, s8
; GFX8-NEXT: s_lshr_b32 s6, s2, 16
; GFX8-NEXT: s_sext_i32_i16 s8, s8
; GFX8-NEXT: s_sext_i32_i16 s2, s2
-; GFX8-NEXT: s_sub_i32 s10, 0x7fff, s10
+; GFX8-NEXT: s_sub_i32 s9, 0x7fff, s9
; GFX8-NEXT: s_max_i32 s2, s8, s2
; GFX8-NEXT: s_sext_i32_i16 s2, s2
-; GFX8-NEXT: s_sext_i32_i16 s8, s10
+; GFX8-NEXT: s_sext_i32_i16 s8, s9
; GFX8-NEXT: s_lshr_b32 s4, s0, 16
; GFX8-NEXT: s_min_i32 s2, s2, s8
; GFX8-NEXT: s_add_i32 s0, s0, s2
; GFX8-NEXT: s_sext_i32_i16 s2, s4
-; GFX8-NEXT: s_max_i32 s8, s2, s9
-; GFX8-NEXT: s_min_i32 s2, s2, s9
-; GFX8-NEXT: s_sub_i32 s2, 0xffff8000, s2
+; GFX8-NEXT: s_max_i32 s8, s2, 0
+; GFX8-NEXT: s_min_i32 s2, s2, 0
+; GFX8-NEXT: s_sub_i32 s2, 0x8000, s2
; GFX8-NEXT: s_sext_i32_i16 s2, s2
; GFX8-NEXT: s_sext_i32_i16 s6, s6
; GFX8-NEXT: s_sub_i32 s8, 0x7fff, s8
@@ -3223,9 +3214,9 @@ define amdgpu_ps <2 x i32> @s_saddsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre
; GFX8-NEXT: s_min_i32 s2, s2, s6
; GFX8-NEXT: s_add_i32 s4, s4, s2
; GFX8-NEXT: s_sext_i32_i16 s2, s1
-; GFX8-NEXT: s_max_i32 s6, s2, s9
-; GFX8-NEXT: s_min_i32 s2, s2, s9
-; GFX8-NEXT: s_sub_i32 s2, 0xffff8000, s2
+; GFX8-NEXT: s_max_i32 s6, s2, 0
+; GFX8-NEXT: s_min_i32 s2, s2, 0
+; GFX8-NEXT: s_sub_i32 s2, 0x8000, s2
; GFX8-NEXT: s_lshr_b32 s7, s3, 16
; GFX8-NEXT: s_sext_i32_i16 s2, s2
; GFX8-NEXT: s_sext_i32_i16 s3, s3
@@ -3237,9 +3228,9 @@ define amdgpu_ps <2 x i32> @s_saddsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre
; GFX8-NEXT: s_min_i32 s2, s2, s3
; GFX8-NEXT: s_add_i32 s1, s1, s2
; GFX8-NEXT: s_sext_i32_i16 s2, s5
-; GFX8-NEXT: s_max_i32 s3, s2, s9
-; GFX8-NEXT: s_min_i32 s2, s2, s9
-; GFX8-NEXT: s_sub_i32 s2, 0xffff8000, s2
+; GFX8-NEXT: s_max_i32 s3, s2, 0
+; GFX8-NEXT: s_min_i32 s2, s2, 0
+; GFX8-NEXT: s_sub_i32 s2, 0x8000, s2
; GFX8-NEXT: s_sext_i32_i16 s2, s2
; GFX8-NEXT: s_sext_i32_i16 s6, s7
; GFX8-NEXT: s_sub_i32 s3, 0x7fff, s3
@@ -3522,24 +3513,23 @@ define amdgpu_ps <3 x i32> @s_saddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
; GFX8-LABEL: s_saddsat_v6i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_sext_i32_i16 s12, s0
-; GFX8-NEXT: s_sext_i32_i16 s13, 0
-; GFX8-NEXT: s_max_i32 s14, s12, s13
-; GFX8-NEXT: s_min_i32 s12, s12, s13
-; GFX8-NEXT: s_sub_i32 s12, 0xffff8000, s12
+; GFX8-NEXT: s_max_i32 s13, s12, 0
+; GFX8-NEXT: s_min_i32 s12, s12, 0
+; GFX8-NEXT: s_sub_i32 s12, 0x8000, s12
; GFX8-NEXT: s_lshr_b32 s9, s3, 16
; GFX8-NEXT: s_sext_i32_i16 s12, s12
; GFX8-NEXT: s_sext_i32_i16 s3, s3
-; GFX8-NEXT: s_sub_i32 s14, 0x7fff, s14
+; GFX8-NEXT: s_sub_i32 s13, 0x7fff, s13
; GFX8-NEXT: s_max_i32 s3, s12, s3
; GFX8-NEXT: s_sext_i32_i16 s3, s3
-; GFX8-NEXT: s_sext_i32_i16 s12, s14
+; GFX8-NEXT: s_sext_i32_i16 s12, s13
; GFX8-NEXT: s_lshr_b32 s6, s0, 16
; GFX8-NEXT: s_min_i32 s3, s3, s12
; GFX8-NEXT: s_add_i32 s0, s0, s3
; GFX8-NEXT: s_sext_i32_i16 s3, s6
-; GFX8-NEXT: s_max_i32 s12, s3, s13
-; GFX8-NEXT: s_min_i32 s3, s3, s13
-; GFX8-NEXT: s_sub_i32 s3, 0xffff8000, s3
+; GFX8-NEXT: s_max_i32 s12, s3, 0
+; GFX8-NEXT: s_min_i32 s3, s3, 0
+; GFX8-NEXT: s_sub_i32 s3, 0x8000, s3
; GFX8-NEXT: s_sext_i32_i16 s3, s3
; GFX8-NEXT: s_sext_i32_i16 s9, s9
; GFX8-NEXT: s_sub_i32 s12, 0x7fff, s12
@@ -3549,9 +3539,9 @@ define amdgpu_ps <3 x i32> @s_saddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
; GFX8-NEXT: s_min_i32 s3, s3, s9
; GFX8-NEXT: s_add_i32 s6, s6, s3
; GFX8-NEXT: s_sext_i32_i16 s3, s1
-; GFX8-NEXT: s_max_i32 s9, s3, s13
-; GFX8-NEXT: s_min_i32 s3, s3, s13
-; GFX8-NEXT: s_sub_i32 s3, 0xffff8000, s3
+; GFX8-NEXT: s_max_i32 s9, s3, 0
+; GFX8-NEXT: s_min_i32 s3, s3, 0
+; GFX8-NEXT: s_sub_i32 s3, 0x8000, s3
; GFX8-NEXT: s_lshr_b32 s10, s4, 16
; GFX8-NEXT: s_sext_i32_i16 s3, s3
; GFX8-NEXT: s_sext_i32_i16 s4, s4
@@ -3563,9 +3553,9 @@ define amdgpu_ps <3 x i32> @s_saddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
; GFX8-NEXT: s_min_i32 s3, s3, s4
; GFX8-NEXT: s_add_i32 s1, s1, s3
; GFX8-NEXT: s_sext_i32_i16 s3, s7
-; GFX8-NEXT: s_max_i32 s4, s3, s13
-; GFX8-NEXT: s_min_i32 s3, s3, s13
-; GFX8-NEXT: s_sub_i32 s3, 0xffff8000, s3
+; GFX8-NEXT: s_max_i32 s4, s3, 0
+; GFX8-NEXT: s_min_i32 s3, s3, 0
+; GFX8-NEXT: s_sub_i32 s3, 0x8000, s3
; GFX8-NEXT: s_sext_i32_i16 s3, s3
; GFX8-NEXT: s_sext_i32_i16 s9, s10
; GFX8-NEXT: s_sub_i32 s4, 0x7fff, s4
@@ -3575,9 +3565,9 @@ define amdgpu_ps <3 x i32> @s_saddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
; GFX8-NEXT: s_min_i32 s3, s3, s4
; GFX8-NEXT: s_add_i32 s7, s7, s3
; GFX8-NEXT: s_sext_i32_i16 s3, s2
-; GFX8-NEXT: s_max_i32 s4, s3, s13
-; GFX8-NEXT: s_min_i32 s3, s3, s13
-; GFX8-NEXT: s_sub_i32 s3, 0xffff8000, s3
+; GFX8-NEXT: s_max_i32 s4, s3, 0
+; GFX8-NEXT: s_min_i32 s3, s3, 0
+; GFX8-NEXT: s_sub_i32 s3, 0x8000, s3
; GFX8-NEXT: s_lshr_b32 s11, s5, 16
; GFX8-NEXT: s_sext_i32_i16 s3, s3
; GFX8-NEXT: s_sext_i32_i16 s5, s5
@@ -3589,9 +3579,9 @@ define amdgpu_ps <3 x i32> @s_saddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
; GFX8-NEXT: s_min_i32 s3, s3, s4
; GFX8-NEXT: s_add_i32 s2, s2, s3
; GFX8-NEXT: s_sext_i32_i16 s3, s8
-; GFX8-NEXT: s_max_i32 s4, s3, s13
-; GFX8-NEXT: s_min_i32 s3, s3, s13
-; GFX8-NEXT: s_sub_i32 s3, 0xffff8000, s3
+; GFX8-NEXT: s_max_i32 s4, s3, 0
+; GFX8-NEXT: s_min_i32 s3, s3, 0
+; GFX8-NEXT: s_sub_i32 s3, 0x8000, s3
; GFX8-NEXT: s_sext_i32_i16 s3, s3
; GFX8-NEXT: s_sext_i32_i16 s5, s11
; GFX8-NEXT: s_sub_i32 s4, 0x7fff, s4
@@ -3937,24 +3927,23 @@ define amdgpu_ps <4 x i32> @s_saddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
; GFX8-LABEL: s_saddsat_v8i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_sext_i32_i16 s16, s0
-; GFX8-NEXT: s_sext_i32_i16 s17, 0
-; GFX8-NEXT: s_max_i32 s18, s16, s17
-; GFX8-NEXT: s_min_i32 s16, s16, s17
-; GFX8-NEXT: s_sub_i32 s16, 0xffff8000, s16
+; GFX8-NEXT: s_max_i32 s17, s16, 0
+; GFX8-NEXT: s_min_i32 s16, s16, 0
+; GFX8-NEXT: s_sub_i32 s16, 0x8000, s16
; GFX8-NEXT: s_lshr_b32 s12, s4, 16
; GFX8-NEXT: s_sext_i32_i16 s16, s16
; GFX8-NEXT: s_sext_i32_i16 s4, s4
-; GFX8-NEXT: s_sub_i32 s18, 0x7fff, s18
+; GFX8-NEXT: s_sub_i32 s17, 0x7fff, s17
; GFX8-NEXT: s_max_i32 s4, s16, s4
; GFX8-NEXT: s_sext_i32_i16 s4, s4
-; GFX8-NEXT: s_sext_i32_i16 s16, s18
+; GFX8-NEXT: s_sext_i32_i16 s16, s17
; GFX8-NEXT: s_lshr_b32 s8, s0, 16
; GFX8-NEXT: s_min_i32 s4, s4, s16
; GFX8-NEXT: s_add_i32 s0, s0, s4
; GFX8-NEXT: s_sext_i32_i16 s4, s8
-; GFX8-NEXT: s_max_i32 s16, s4, s17
-; GFX8-NEXT: s_min_i32 s4, s4, s17
-; GFX8-NEXT: s_sub_i32 s4, 0xffff8000, s4
+; GFX8-NEXT: s_max_i32 s16, s4, 0
+; GFX8-NEXT: s_min_i32 s4, s4, 0
+; GFX8-NEXT: s_sub_i32 s4, 0x8000, s4
; GFX8-NEXT: s_sext_i32_i16 s4, s4
; GFX8-NEXT: s_sext_i32_i16 s12, s12
; GFX8-NEXT: s_sub_i32 s16, 0x7fff, s16
@@ -3964,9 +3953,9 @@ define amdgpu_ps <4 x i32> @s_saddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
; GFX8-NEXT: s_min_i32 s4, s4, s12
; GFX8-NEXT: s_add_i32 s8, s8, s4
; GFX8-NEXT: s_sext_i32_i16 s4, s1
-; GFX8-NEXT: s_max_i32 s12, s4, s17
-; GFX8-NEXT: s_min_i32 s4, s4, s17
-; GFX8-NEXT: s_sub_i32 s4, 0xffff8000, s4
+; GFX8-NEXT: s_max_i32 s12, s4, 0
+; GFX8-NEXT: s_min_i32 s4, s4, 0
+; GFX8-NEXT: s_sub_i32 s4, 0x8000, s4
; GFX8-NEXT: s_lshr_b32 s13, s5, 16
; GFX8-NEXT: s_sext_i32_i16 s4, s4
; GFX8-NEXT: s_sext_i32_i16 s5, s5
@@ -3978,9 +3967,9 @@ define amdgpu_ps <4 x i32> @s_saddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
; GFX8-NEXT: s_min_i32 s4, s4, s5
; GFX8-NEXT: s_add_i32 s1, s1, s4
; GFX8-NEXT: s_sext_i32_i16 s4, s9
-; GFX8-NEXT: s_max_i32 s5, s4, s17
-; GFX8-NEXT: s_min_i32 s4, s4, s17
-; GFX8-NEXT: s_sub_i32 s4, 0xffff8000, s4
+; GFX8-NEXT: s_max_i32 s5, s4, 0
+; GFX8-NEXT: s_min_i32 s4, s4, 0
+; GFX8-NEXT: s_sub_i32 s4, 0x8000, s4
; GFX8-NEXT: s_sext_i32_i16 s4, s4
; GFX8-NEXT: s_sext_i32_i16 s12, s13
; GFX8-NEXT: s_sub_i32 s5, 0x7fff, s5
@@ -3990,9 +3979,9 @@ define amdgpu_ps <4 x i32> @s_saddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
; GFX8-NEXT: s_min_i32 s4, s4, s5
; GFX8-NEXT: s_add_i32 s9, s9, s4
; GFX8-NEXT: s_sext_i32_i16 s4, s2
-; GFX8-NEXT: s_max_i32 s5, s4, s17
-; GFX8-NEXT: s_min_i32 s4, s4, s17
-; GFX8-NEXT: s_sub_i32 s4, 0xffff8000, s4
+; GFX8-NEXT: s_max_i32 s5, s4, 0
+; GFX8-NEXT: s_min_i32 s4, s4, 0
+; GFX8-NEXT: s_sub_i32 s4, 0x8000, s4
; GFX8-NEXT: s_lshr_b32 s14, s6, 16
; GFX8-NEXT: s_sext_i32_i16 s4, s4
; GFX8-NEXT: s_sext_i32_i16 s6, s6
@@ -4004,9 +3993,9 @@ define amdgpu_ps <4 x i32> @s_saddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
; GFX8-NEXT: s_min_i32 s4, s4, s5
; GFX8-NEXT: s_add_i32 s2, s2, s4
; GFX8-NEXT: s_sext_i32_i16 s4, s10
-; GFX8-NEXT: s_max_i32 s5, s4, s17
-; GFX8-NEXT: s_min_i32 s4, s4, s17
-; GFX8-NEXT: s_sub_i32 s4, 0xffff8000, s4
+; GFX8-NEXT: s_max_i32 s5, s4, 0
+; GFX8-NEXT: s_min_i32 s4, s4, 0
+; GFX8-NEXT: s_sub_i32 s4, 0x8000, s4
; GFX8-NEXT: s_sext_i32_i16 s4, s4
; GFX8-NEXT: s_sext_i32_i16 s6, s14
; GFX8-NEXT: s_sub_i32 s5, 0x7fff, s5
@@ -4016,9 +4005,9 @@ define amdgpu_ps <4 x i32> @s_saddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
; GFX8-NEXT: s_min_i32 s4, s4, s5
; GFX8-NEXT: s_add_i32 s10, s10, s4
; GFX8-NEXT: s_sext_i32_i16 s4, s3
-; GFX8-NEXT: s_max_i32 s5, s4, s17
-; GFX8-NEXT: s_min_i32 s4, s4, s17
-; GFX8-NEXT: s_sub_i32 s4, 0xffff8000, s4
+; GFX8-NEXT: s_max_i32 s5, s4, 0
+; GFX8-NEXT: s_min_i32 s4, s4, 0
+; GFX8-NEXT: s_sub_i32 s4, 0x8000, s4
; GFX8-NEXT: s_sext_i32_i16 s4, s4
; GFX8-NEXT: s_sext_i32_i16 s6, s7
; GFX8-NEXT: s_sub_i32 s5, 0x7fff, s5
@@ -4029,10 +4018,10 @@ define amdgpu_ps <4 x i32> @s_saddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
; GFX8-NEXT: s_min_i32 s4, s4, s5
; GFX8-NEXT: s_add_i32 s3, s3, s4
; GFX8-NEXT: s_sext_i32_i16 s4, s11
-; GFX8-NEXT: s_max_i32 s5, s4, s17
-; GFX8-NEXT: s_min_i32 s4, s4, s17
+; GFX8-NEXT: s_max_i32 s5, s4, 0
+; GFX8-NEXT: s_min_i32 s4, s4, 0
; GFX8-NEXT: s_lshr_b32 s15, s7, 16
-; GFX8-NEXT: s_sub_i32 s4, 0xffff8000, s4
+; GFX8-NEXT: s_sub_i32 s4, 0x8000, s4
; GFX8-NEXT: s_sext_i32_i16 s4, s4
; GFX8-NEXT: s_sext_i32_i16 s6, s15
; GFX8-NEXT: s_sub_i32 s5, 0x7fff, s5
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll
index be8cb23293176..6928c5a025f18 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll
@@ -548,32 +548,16 @@ define <2 x i64> @v_shl_v2i64_sext_v2i32(<2 x i32> %x) {
}
define amdgpu_ps i32 @s_shl_i32_zext_i16(i16 inreg %x) {
-; GFX7-LABEL: s_shl_i32_zext_i16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_and_b32 s0, s0, 0x3fff
-; GFX7-NEXT: s_lshl_b32 s0, s0, 2
-; GFX7-NEXT: s_and_b32 s0, s0, 0xffff
-; GFX7-NEXT: ; return to shader part epilog
-;
-; GFX8-LABEL: s_shl_i32_zext_i16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_and_b32 s0, s0, 0x3fff
-; GFX8-NEXT: s_lshl_b32 s0, s0, 2
-; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
-; GFX8-NEXT: ; return to shader part epilog
-;
-; GFX9-LABEL: s_shl_i32_zext_i16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_and_b32 s0, s0, 0x3fff
-; GFX9-NEXT: s_lshl_b32 s0, s0, 2
-; GFX9-NEXT: s_and_b32 s0, 0xffff, s0
-; GFX9-NEXT: ; return to shader part epilog
+; GCN-LABEL: s_shl_i32_zext_i16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_and_b32 s0, s0, 0x3fff
+; GCN-NEXT: s_lshl_b32 s0, s0, 2
+; GCN-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_shl_i32_zext_i16:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0x3fff
; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 2
-; GFX10PLUS-NEXT: s_and_b32 s0, 0xffff, s0
; GFX10PLUS-NEXT: ; return to shader part epilog
%and = and i16 %x, 16383
%ext = zext i16 %and to i32
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
index 9fac482cb01ba..9ebf89519d6c9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
@@ -76,14 +76,13 @@ define amdgpu_ps i7 @s_ssubsat_i7(i7 inreg %lhs, i7 inreg %rhs) {
; GFX8: ; %bb.0:
; GFX8-NEXT: s_lshl_b32 s0, s0, 9
; GFX8-NEXT: s_sext_i32_i16 s2, s0
-; GFX8-NEXT: s_sext_i32_i16 s3, -1
-; GFX8-NEXT: s_max_i32 s4, s2, s3
+; GFX8-NEXT: s_max_i32 s3, s2, -1
; GFX8-NEXT: s_lshl_b32 s1, s1, 9
-; GFX8-NEXT: s_addk_i32 s4, 0x8001
-; GFX8-NEXT: s_min_i32 s2, s2, s3
-; GFX8-NEXT: s_sext_i32_i16 s3, s4
+; GFX8-NEXT: s_add_i32 s3, s3, 0x8001
+; GFX8-NEXT: s_min_i32 s2, s2, -1
+; GFX8-NEXT: s_sext_i32_i16 s3, s3
; GFX8-NEXT: s_sext_i32_i16 s1, s1
-; GFX8-NEXT: s_addk_i32 s2, 0x8000
+; GFX8-NEXT: s_add_i32 s2, s2, 0x8000
; GFX8-NEXT: s_max_i32 s1, s3, s1
; GFX8-NEXT: s_sext_i32_i16 s1, s1
; GFX8-NEXT: s_sext_i32_i16 s2, s2
@@ -186,14 +185,13 @@ define amdgpu_ps i8 @s_ssubsat_i8(i8 inreg %lhs, i8 inreg %rhs) {
; GFX8: ; %bb.0:
; GFX8-NEXT: s_lshl_b32 s0, s0, 8
; GFX8-NEXT: s_sext_i32_i16 s2, s0
-; GFX8-NEXT: s_sext_i32_i16 s3, -1
-; GFX8-NEXT: s_max_i32 s4, s2, s3
+; GFX8-NEXT: s_max_i32 s3, s2, -1
; GFX8-NEXT: s_lshl_b32 s1, s1, 8
-; GFX8-NEXT: s_addk_i32 s4, 0x8001
-; GFX8-NEXT: s_min_i32 s2, s2, s3
-; GFX8-NEXT: s_sext_i32_i16 s3, s4
+; GFX8-NEXT: s_add_i32 s3, s3, 0x8001
+; GFX8-NEXT: s_min_i32 s2, s2, -1
+; GFX8-NEXT: s_sext_i32_i16 s3, s3
; GFX8-NEXT: s_sext_i32_i16 s1, s1
-; GFX8-NEXT: s_addk_i32 s2, 0x8000
+; GFX8-NEXT: s_add_i32 s2, s2, 0x8000
; GFX8-NEXT: s_max_i32 s1, s3, s1
; GFX8-NEXT: s_sext_i32_i16 s1, s1
; GFX8-NEXT: s_sext_i32_i16 s2, s2
@@ -384,16 +382,15 @@ define amdgpu_ps i16 @s_ssubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
; GFX8-NEXT: s_lshr_b32 s2, s0, 8
; GFX8-NEXT: s_lshl_b32 s0, s0, 8
; GFX8-NEXT: s_sext_i32_i16 s4, s0
-; GFX8-NEXT: s_sext_i32_i16 s5, -1
-; GFX8-NEXT: s_max_i32 s6, s4, s5
+; GFX8-NEXT: s_max_i32 s5, s4, -1
; GFX8-NEXT: s_lshr_b32 s3, s1, 8
; GFX8-NEXT: s_lshl_b32 s1, s1, 8
-; GFX8-NEXT: s_addk_i32 s6, 0x8001
-; GFX8-NEXT: s_min_i32 s4, s4, s5
-; GFX8-NEXT: s_sext_i32_i16 s6, s6
+; GFX8-NEXT: s_add_i32 s5, s5, 0x8001
+; GFX8-NEXT: s_min_i32 s4, s4, -1
+; GFX8-NEXT: s_sext_i32_i16 s5, s5
; GFX8-NEXT: s_sext_i32_i16 s1, s1
-; GFX8-NEXT: s_addk_i32 s4, 0x8000
-; GFX8-NEXT: s_max_i32 s1, s6, s1
+; GFX8-NEXT: s_add_i32 s4, s4, 0x8000
+; GFX8-NEXT: s_max_i32 s1, s5, s1
; GFX8-NEXT: s_sext_i32_i16 s1, s1
; GFX8-NEXT: s_sext_i32_i16 s4, s4
; GFX8-NEXT: s_min_i32 s1, s1, s4
@@ -401,12 +398,12 @@ define amdgpu_ps i16 @s_ssubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
; GFX8-NEXT: s_lshl_b32 s1, s2, 8
; GFX8-NEXT: s_lshl_b32 s2, s3, 8
; GFX8-NEXT: s_sext_i32_i16 s3, s1
-; GFX8-NEXT: s_max_i32 s4, s3, s5
-; GFX8-NEXT: s_addk_i32 s4, 0x8001
-; GFX8-NEXT: s_min_i32 s3, s3, s5
+; GFX8-NEXT: s_max_i32 s4, s3, -1
+; GFX8-NEXT: s_add_i32 s4, s4, 0x8001
+; GFX8-NEXT: s_min_i32 s3, s3, -1
; GFX8-NEXT: s_sext_i32_i16 s4, s4
; GFX8-NEXT: s_sext_i32_i16 s2, s2
-; GFX8-NEXT: s_addk_i32 s3, 0x8000
+; GFX8-NEXT: s_add_i32 s3, s3, 0x8000
; GFX8-NEXT: s_max_i32 s2, s4, s2
; GFX8-NEXT: s_sext_i32_i16 s2, s2
; GFX8-NEXT: s_sext_i32_i16 s3, s3
@@ -784,18 +781,17 @@ define amdgpu_ps i32 @s_ssubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
; GFX8-NEXT: s_lshr_b32 s4, s0, 24
; GFX8-NEXT: s_lshl_b32 s0, s0, 8
; GFX8-NEXT: s_sext_i32_i16 s8, s0
-; GFX8-NEXT: s_sext_i32_i16 s9, -1
-; GFX8-NEXT: s_max_i32 s10, s8, s9
+; GFX8-NEXT: s_max_i32 s9, s8, -1
; GFX8-NEXT: s_lshr_b32 s5, s1, 8
; GFX8-NEXT: s_lshr_b32 s6, s1, 16
; GFX8-NEXT: s_lshr_b32 s7, s1, 24
; GFX8-NEXT: s_lshl_b32 s1, s1, 8
-; GFX8-NEXT: s_addk_i32 s10, 0x8001
-; GFX8-NEXT: s_min_i32 s8, s8, s9
-; GFX8-NEXT: s_sext_i32_i16 s10, s10
+; GFX8-NEXT: s_add_i32 s9, s9, 0x8001
+; GFX8-NEXT: s_min_i32 s8, s8, -1
+; GFX8-NEXT: s_sext_i32_i16 s9, s9
; GFX8-NEXT: s_sext_i32_i16 s1, s1
-; GFX8-NEXT: s_addk_i32 s8, 0x8000
-; GFX8-NEXT: s_max_i32 s1, s10, s1
+; GFX8-NEXT: s_add_i32 s8, s8, 0x8000
+; GFX8-NEXT: s_max_i32 s1, s9, s1
; GFX8-NEXT: s_sext_i32_i16 s1, s1
; GFX8-NEXT: s_sext_i32_i16 s8, s8
; GFX8-NEXT: s_min_i32 s1, s1, s8
@@ -803,12 +799,12 @@ define amdgpu_ps i32 @s_ssubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
; GFX8-NEXT: s_lshl_b32 s1, s2, 8
; GFX8-NEXT: s_lshl_b32 s2, s5, 8
; GFX8-NEXT: s_sext_i32_i16 s5, s1
-; GFX8-NEXT: s_max_i32 s8, s5, s9
-; GFX8-NEXT: s_addk_i32 s8, 0x8001
-; GFX8-NEXT: s_min_i32 s5, s5, s9
+; GFX8-NEXT: s_max_i32 s8, s5, -1
+; GFX8-NEXT: s_add_i32 s8, s8, 0x8001
+; GFX8-NEXT: s_min_i32 s5, s5, -1
; GFX8-NEXT: s_sext_i32_i16 s8, s8
; GFX8-NEXT: s_sext_i32_i16 s2, s2
-; GFX8-NEXT: s_addk_i32 s5, 0x8000
+; GFX8-NEXT: s_add_i32 s5, s5, 0x8000
; GFX8-NEXT: s_max_i32 s2, s8, s2
; GFX8-NEXT: s_sext_i32_i16 s2, s2
; GFX8-NEXT: s_sext_i32_i16 s5, s5
@@ -817,12 +813,12 @@ define amdgpu_ps i32 @s_ssubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
; GFX8-NEXT: s_lshl_b32 s2, s3, 8
; GFX8-NEXT: s_sext_i32_i16 s5, s2
; GFX8-NEXT: s_lshl_b32 s3, s6, 8
-; GFX8-NEXT: s_max_i32 s6, s5, s9
-; GFX8-NEXT: s_addk_i32 s6, 0x8001
-; GFX8-NEXT: s_min_i32 s5, s5, s9
+; GFX8-NEXT: s_max_i32 s6, s5, -1
+; GFX8-NEXT: s_add_i32 s6, s6, 0x8001
+; GFX8-NEXT: s_min_i32 s5, s5, -1
; GFX8-NEXT: s_sext_i32_i16 s6, s6
; GFX8-NEXT: s_sext_i32_i16 s3, s3
-; GFX8-NEXT: s_addk_i32 s5, 0x8000
+; GFX8-NEXT: s_add_i32 s5, s5, 0x8000
; GFX8-NEXT: s_max_i32 s3, s6, s3
; GFX8-NEXT: s_sext_i32_i16 s3, s3
; GFX8-NEXT: s_sext_i32_i16 s5, s5
@@ -830,14 +826,14 @@ define amdgpu_ps i32 @s_ssubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
; GFX8-NEXT: s_sub_i32 s2, s2, s3
; GFX8-NEXT: s_lshl_b32 s3, s4, 8
; GFX8-NEXT: s_sext_i32_i16 s5, s3
-; GFX8-NEXT: s_max_i32 s6, s5, s9
+; GFX8-NEXT: s_max_i32 s6, s5, -1
; GFX8-NEXT: s_lshl_b32 s4, s7, 8
-; GFX8-NEXT: s_addk_i32 s6, 0x8001
-; GFX8-NEXT: s_min_i32 s5, s5, s9
+; GFX8-NEXT: s_add_i32 s6, s6, 0x8001
+; GFX8-NEXT: s_min_i32 s5, s5, -1
; GFX8-NEXT: s_sext_i32_i16 s6, s6
; GFX8-NEXT: s_sext_i32_i16 s4, s4
; GFX8-NEXT: s_sext_i32_i16 s1, s1
-; GFX8-NEXT: s_addk_i32 s5, 0x8000
+; GFX8-NEXT: s_add_i32 s5, s5, 0x8000
; GFX8-NEXT: s_max_i32 s4, s6, s4
; GFX8-NEXT: s_sext_i32_i16 s0, s0
; GFX8-NEXT: s_ashr_i32 s1, s1, 8
@@ -2635,13 +2631,12 @@ define amdgpu_ps i16 @s_ssubsat_i16(i16 inreg %lhs, i16 inreg %rhs) {
; GFX8-LABEL: s_ssubsat_i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_sext_i32_i16 s2, s0
-; GFX8-NEXT: s_sext_i32_i16 s3, -1
-; GFX8-NEXT: s_max_i32 s4, s2, s3
-; GFX8-NEXT: s_addk_i32 s4, 0x8001
-; GFX8-NEXT: s_min_i32 s2, s2, s3
-; GFX8-NEXT: s_sext_i32_i16 s3, s4
+; GFX8-NEXT: s_max_i32 s3, s2, -1
+; GFX8-NEXT: s_add_i32 s3, s3, 0x8001
+; GFX8-NEXT: s_min_i32 s2, s2, -1
+; GFX8-NEXT: s_sext_i32_i16 s3, s3
; GFX8-NEXT: s_sext_i32_i16 s1, s1
-; GFX8-NEXT: s_addk_i32 s2, 0x8000
+; GFX8-NEXT: s_add_i32 s2, s2, 0x8000
; GFX8-NEXT: s_max_i32 s1, s3, s1
; GFX8-NEXT: s_sext_i32_i16 s1, s1
; GFX8-NEXT: s_sext_i32_i16 s2, s2
@@ -2683,12 +2678,11 @@ define amdgpu_ps half @ssubsat_i16_sv(i16 inreg %lhs, i16 %rhs) {
; GFX8-LABEL: ssubsat_i16_sv:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_sext_i32_i16 s1, s0
-; GFX8-NEXT: s_sext_i32_i16 s2, -1
-; GFX8-NEXT: s_max_i32 s3, s1, s2
-; GFX8-NEXT: s_addk_i32 s3, 0x8001
-; GFX8-NEXT: s_min_i32 s1, s1, s2
-; GFX8-NEXT: s_addk_i32 s1, 0x8000
-; GFX8-NEXT: v_max_i16_e32 v0, s3, v0
+; GFX8-NEXT: s_max_i32 s2, s1, -1
+; GFX8-NEXT: s_add_i32 s2, s2, 0x8001
+; GFX8-NEXT: s_min_i32 s1, s1, -1
+; GFX8-NEXT: s_add_i32 s1, s1, 0x8000
+; GFX8-NEXT: v_max_i16_e32 v0, s2, v0
; GFX8-NEXT: v_min_i16_e32 v0, s1, v0
; GFX8-NEXT: v_sub_u16_e32 v0, s0, v0
; GFX8-NEXT: ; return to shader part epilog
@@ -2842,27 +2836,26 @@ define amdgpu_ps i32 @s_ssubsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs
; GFX8-LABEL: s_ssubsat_v2i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_sext_i32_i16 s4, s0
-; GFX8-NEXT: s_sext_i32_i16 s5, -1
-; GFX8-NEXT: s_max_i32 s6, s4, s5
-; GFX8-NEXT: s_addk_i32 s6, 0x8001
+; GFX8-NEXT: s_max_i32 s5, s4, -1
+; GFX8-NEXT: s_add_i32 s5, s5, 0x8001
; GFX8-NEXT: s_lshr_b32 s3, s1, 16
-; GFX8-NEXT: s_min_i32 s4, s4, s5
-; GFX8-NEXT: s_sext_i32_i16 s6, s6
+; GFX8-NEXT: s_min_i32 s4, s4, -1
+; GFX8-NEXT: s_sext_i32_i16 s5, s5
; GFX8-NEXT: s_sext_i32_i16 s1, s1
-; GFX8-NEXT: s_addk_i32 s4, 0x8000
-; GFX8-NEXT: s_max_i32 s1, s6, s1
+; GFX8-NEXT: s_add_i32 s4, s4, 0x8000
+; GFX8-NEXT: s_max_i32 s1, s5, s1
; GFX8-NEXT: s_sext_i32_i16 s1, s1
; GFX8-NEXT: s_sext_i32_i16 s4, s4
; GFX8-NEXT: s_lshr_b32 s2, s0, 16
; GFX8-NEXT: s_min_i32 s1, s1, s4
; GFX8-NEXT: s_sub_i32 s0, s0, s1
; GFX8-NEXT: s_sext_i32_i16 s1, s2
-; GFX8-NEXT: s_max_i32 s4, s1, s5
-; GFX8-NEXT: s_addk_i32 s4, 0x8001
-; GFX8-NEXT: s_min_i32 s1, s1, s5
+; GFX8-NEXT: s_max_i32 s4, s1, -1
+; GFX8-NEXT: s_add_i32 s4, s4, 0x8001
+; GFX8-NEXT: s_min_i32 s1, s1, -1
; GFX8-NEXT: s_sext_i32_i16 s4, s4
; GFX8-NEXT: s_sext_i32_i16 s3, s3
-; GFX8-NEXT: s_addk_i32 s1, 0x8000
+; GFX8-NEXT: s_add_i32 s1, s1, 0x8000
; GFX8-NEXT: s_max_i32 s3, s4, s3
; GFX8-NEXT: s_sext_i32_i16 s3, s3
; GFX8-NEXT: s_sext_i32_i16 s1, s1
@@ -2923,20 +2916,19 @@ define amdgpu_ps float @ssubsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) {
; GFX8-LABEL: ssubsat_v2i16_sv:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_sext_i32_i16 s2, s0
-; GFX8-NEXT: s_sext_i32_i16 s3, -1
-; GFX8-NEXT: s_max_i32 s4, s2, s3
-; GFX8-NEXT: s_addk_i32 s4, 0x8001
-; GFX8-NEXT: s_min_i32 s2, s2, s3
+; GFX8-NEXT: s_max_i32 s3, s2, -1
+; GFX8-NEXT: s_add_i32 s3, s3, 0x8001
+; GFX8-NEXT: s_min_i32 s2, s2, -1
; GFX8-NEXT: s_lshr_b32 s1, s0, 16
-; GFX8-NEXT: s_addk_i32 s2, 0x8000
-; GFX8-NEXT: v_max_i16_e32 v1, s4, v0
+; GFX8-NEXT: s_add_i32 s2, s2, 0x8000
+; GFX8-NEXT: v_max_i16_e32 v1, s3, v0
; GFX8-NEXT: v_min_i16_e32 v1, s2, v1
; GFX8-NEXT: s_sext_i32_i16 s2, s1
-; GFX8-NEXT: s_max_i32 s4, s2, s3
-; GFX8-NEXT: s_addk_i32 s4, 0x8001
-; GFX8-NEXT: s_min_i32 s2, s2, s3
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
-; GFX8-NEXT: s_addk_i32 s2, 0x8000
+; GFX8-NEXT: s_max_i32 s3, s2, -1
+; GFX8-NEXT: s_add_i32 s3, s3, 0x8001
+; GFX8-NEXT: s_min_i32 s2, s2, -1
+; GFX8-NEXT: v_mov_b32_e32 v2, s3
+; GFX8-NEXT: s_add_i32 s2, s2, 0x8000
; GFX8-NEXT: v_max_i16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_min_i16_e32 v0, s2, v0
; GFX8-NEXT: v_mov_b32_e32 v2, s1
@@ -3202,40 +3194,39 @@ define amdgpu_ps <2 x i32> @s_ssubsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre
; GFX8-LABEL: s_ssubsat_v4i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_sext_i32_i16 s8, s0
-; GFX8-NEXT: s_sext_i32_i16 s9, -1
-; GFX8-NEXT: s_max_i32 s10, s8, s9
-; GFX8-NEXT: s_addk_i32 s10, 0x8001
+; GFX8-NEXT: s_max_i32 s9, s8, -1
+; GFX8-NEXT: s_add_i32 s9, s9, 0x8001
; GFX8-NEXT: s_lshr_b32 s6, s2, 16
-; GFX8-NEXT: s_min_i32 s8, s8, s9
-; GFX8-NEXT: s_sext_i32_i16 s10, s10
+; GFX8-NEXT: s_min_i32 s8, s8, -1
+; GFX8-NEXT: s_sext_i32_i16 s9, s9
; GFX8-NEXT: s_sext_i32_i16 s2, s2
-; GFX8-NEXT: s_addk_i32 s8, 0x8000
-; GFX8-NEXT: s_max_i32 s2, s10, s2
+; GFX8-NEXT: s_add_i32 s8, s8, 0x8000
+; GFX8-NEXT: s_max_i32 s2, s9, s2
; GFX8-NEXT: s_sext_i32_i16 s2, s2
; GFX8-NEXT: s_sext_i32_i16 s8, s8
; GFX8-NEXT: s_lshr_b32 s4, s0, 16
; GFX8-NEXT: s_min_i32 s2, s2, s8
; GFX8-NEXT: s_sub_i32 s0, s0, s2
; GFX8-NEXT: s_sext_i32_i16 s2, s4
-; GFX8-NEXT: s_max_i32 s8, s2, s9
-; GFX8-NEXT: s_addk_i32 s8, 0x8001
-; GFX8-NEXT: s_min_i32 s2, s2, s9
+; GFX8-NEXT: s_max_i32 s8, s2, -1
+; GFX8-NEXT: s_add_i32 s8, s8, 0x8001
+; GFX8-NEXT: s_min_i32 s2, s2, -1
; GFX8-NEXT: s_sext_i32_i16 s8, s8
; GFX8-NEXT: s_sext_i32_i16 s6, s6
-; GFX8-NEXT: s_addk_i32 s2, 0x8000
+; GFX8-NEXT: s_add_i32 s2, s2, 0x8000
; GFX8-NEXT: s_max_i32 s6, s8, s6
; GFX8-NEXT: s_sext_i32_i16 s6, s6
; GFX8-NEXT: s_sext_i32_i16 s2, s2
; GFX8-NEXT: s_min_i32 s2, s6, s2
; GFX8-NEXT: s_sub_i32 s2, s4, s2
; GFX8-NEXT: s_sext_i32_i16 s4, s1
-; GFX8-NEXT: s_max_i32 s6, s4, s9
-; GFX8-NEXT: s_addk_i32 s6, 0x8001
+; GFX8-NEXT: s_max_i32 s6, s4, -1
+; GFX8-NEXT: s_add_i32 s6, s6, 0x8001
; GFX8-NEXT: s_lshr_b32 s7, s3, 16
-; GFX8-NEXT: s_min_i32 s4, s4, s9
+; GFX8-NEXT: s_min_i32 s4, s4, -1
; GFX8-NEXT: s_sext_i32_i16 s6, s6
; GFX8-NEXT: s_sext_i32_i16 s3, s3
-; GFX8-NEXT: s_addk_i32 s4, 0x8000
+; GFX8-NEXT: s_add_i32 s4, s4, 0x8000
; GFX8-NEXT: s_max_i32 s3, s6, s3
; GFX8-NEXT: s_sext_i32_i16 s3, s3
; GFX8-NEXT: s_sext_i32_i16 s4, s4
@@ -3243,12 +3234,12 @@ define amdgpu_ps <2 x i32> @s_ssubsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre
; GFX8-NEXT: s_min_i32 s3, s3, s4
; GFX8-NEXT: s_sub_i32 s1, s1, s3
; GFX8-NEXT: s_sext_i32_i16 s3, s5
-; GFX8-NEXT: s_max_i32 s4, s3, s9
-; GFX8-NEXT: s_addk_i32 s4, 0x8001
-; GFX8-NEXT: s_min_i32 s3, s3, s9
+; GFX8-NEXT: s_max_i32 s4, s3, -1
+; GFX8-NEXT: s_add_i32 s4, s4, 0x8001
+; GFX8-NEXT: s_min_i32 s3, s3, -1
; GFX8-NEXT: s_sext_i32_i16 s4, s4
; GFX8-NEXT: s_sext_i32_i16 s6, s7
-; GFX8-NEXT: s_addk_i32 s3, 0x8000
+; GFX8-NEXT: s_add_i32 s3, s3, 0x8000
; GFX8-NEXT: s_max_i32 s4, s4, s6
; GFX8-NEXT: s_sext_i32_i16 s4, s4
; GFX8-NEXT: s_sext_i32_i16 s3, s3
@@ -3528,40 +3519,39 @@ define amdgpu_ps <3 x i32> @s_ssubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
; GFX8-LABEL: s_ssubsat_v6i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_sext_i32_i16 s12, s0
-; GFX8-NEXT: s_sext_i32_i16 s13, -1
-; GFX8-NEXT: s_max_i32 s14, s12, s13
-; GFX8-NEXT: s_addk_i32 s14, 0x8001
+; GFX8-NEXT: s_max_i32 s13, s12, -1
+; GFX8-NEXT: s_add_i32 s13, s13, 0x8001
; GFX8-NEXT: s_lshr_b32 s9, s3, 16
-; GFX8-NEXT: s_min_i32 s12, s12, s13
-; GFX8-NEXT: s_sext_i32_i16 s14, s14
+; GFX8-NEXT: s_min_i32 s12, s12, -1
+; GFX8-NEXT: s_sext_i32_i16 s13, s13
; GFX8-NEXT: s_sext_i32_i16 s3, s3
-; GFX8-NEXT: s_addk_i32 s12, 0x8000
-; GFX8-NEXT: s_max_i32 s3, s14, s3
+; GFX8-NEXT: s_add_i32 s12, s12, 0x8000
+; GFX8-NEXT: s_max_i32 s3, s13, s3
; GFX8-NEXT: s_sext_i32_i16 s3, s3
; GFX8-NEXT: s_sext_i32_i16 s12, s12
; GFX8-NEXT: s_lshr_b32 s6, s0, 16
; GFX8-NEXT: s_min_i32 s3, s3, s12
; GFX8-NEXT: s_sub_i32 s0, s0, s3
; GFX8-NEXT: s_sext_i32_i16 s3, s6
-; GFX8-NEXT: s_max_i32 s12, s3, s13
-; GFX8-NEXT: s_addk_i32 s12, 0x8001
-; GFX8-NEXT: s_min_i32 s3, s3, s13
+; GFX8-NEXT: s_max_i32 s12, s3, -1
+; GFX8-NEXT: s_add_i32 s12, s12, 0x8001
+; GFX8-NEXT: s_min_i32 s3, s3, -1
; GFX8-NEXT: s_sext_i32_i16 s12, s12
; GFX8-NEXT: s_sext_i32_i16 s9, s9
-; GFX8-NEXT: s_addk_i32 s3, 0x8000
+; GFX8-NEXT: s_add_i32 s3, s3, 0x8000
; GFX8-NEXT: s_max_i32 s9, s12, s9
; GFX8-NEXT: s_sext_i32_i16 s9, s9
; GFX8-NEXT: s_sext_i32_i16 s3, s3
; GFX8-NEXT: s_min_i32 s3, s9, s3
; GFX8-NEXT: s_sub_i32 s3, s6, s3
; GFX8-NEXT: s_sext_i32_i16 s6, s1
-; GFX8-NEXT: s_max_i32 s9, s6, s13
-; GFX8-NEXT: s_addk_i32 s9, 0x8001
+; GFX8-NEXT: s_max_i32 s9, s6, -1
+; GFX8-NEXT: s_add_i32 s9, s9, 0x8001
; GFX8-NEXT: s_lshr_b32 s10, s4, 16
-; GFX8-NEXT: s_min_i32 s6, s6, s13
+; GFX8-NEXT: s_min_i32 s6, s6, -1
; GFX8-NEXT: s_sext_i32_i16 s9, s9
; GFX8-NEXT: s_sext_i32_i16 s4, s4
-; GFX8-NEXT: s_addk_i32 s6, 0x8000
+; GFX8-NEXT: s_add_i32 s6, s6, 0x8000
; GFX8-NEXT: s_max_i32 s4, s9, s4
; GFX8-NEXT: s_sext_i32_i16 s4, s4
; GFX8-NEXT: s_sext_i32_i16 s6, s6
@@ -3569,25 +3559,25 @@ define amdgpu_ps <3 x i32> @s_ssubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
; GFX8-NEXT: s_min_i32 s4, s4, s6
; GFX8-NEXT: s_sub_i32 s1, s1, s4
; GFX8-NEXT: s_sext_i32_i16 s4, s7
-; GFX8-NEXT: s_max_i32 s6, s4, s13
-; GFX8-NEXT: s_addk_i32 s6, 0x8001
-; GFX8-NEXT: s_min_i32 s4, s4, s13
+; GFX8-NEXT: s_max_i32 s6, s4, -1
+; GFX8-NEXT: s_add_i32 s6, s6, 0x8001
+; GFX8-NEXT: s_min_i32 s4, s4, -1
; GFX8-NEXT: s_sext_i32_i16 s6, s6
; GFX8-NEXT: s_sext_i32_i16 s9, s10
-; GFX8-NEXT: s_addk_i32 s4, 0x8000
+; GFX8-NEXT: s_add_i32 s4, s4, 0x8000
; GFX8-NEXT: s_max_i32 s6, s6, s9
; GFX8-NEXT: s_sext_i32_i16 s6, s6
; GFX8-NEXT: s_sext_i32_i16 s4, s4
; GFX8-NEXT: s_min_i32 s4, s6, s4
; GFX8-NEXT: s_sext_i32_i16 s6, s2
; GFX8-NEXT: s_sub_i32 s4, s7, s4
-; GFX8-NEXT: s_max_i32 s7, s6, s13
-; GFX8-NEXT: s_addk_i32 s7, 0x8001
+; GFX8-NEXT: s_max_i32 s7, s6, -1
+; GFX8-NEXT: s_add_i32 s7, s7, 0x8001
; GFX8-NEXT: s_lshr_b32 s11, s5, 16
-; GFX8-NEXT: s_min_i32 s6, s6, s13
+; GFX8-NEXT: s_min_i32 s6, s6, -1
; GFX8-NEXT: s_sext_i32_i16 s7, s7
; GFX8-NEXT: s_sext_i32_i16 s5, s5
-; GFX8-NEXT: s_addk_i32 s6, 0x8000
+; GFX8-NEXT: s_add_i32 s6, s6, 0x8000
; GFX8-NEXT: s_max_i32 s5, s7, s5
; GFX8-NEXT: s_sext_i32_i16 s5, s5
; GFX8-NEXT: s_sext_i32_i16 s6, s6
@@ -3595,12 +3585,12 @@ define amdgpu_ps <3 x i32> @s_ssubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
; GFX8-NEXT: s_min_i32 s5, s5, s6
; GFX8-NEXT: s_sub_i32 s2, s2, s5
; GFX8-NEXT: s_sext_i32_i16 s5, s8
-; GFX8-NEXT: s_max_i32 s6, s5, s13
-; GFX8-NEXT: s_addk_i32 s6, 0x8001
-; GFX8-NEXT: s_min_i32 s5, s5, s13
+; GFX8-NEXT: s_max_i32 s6, s5, -1
+; GFX8-NEXT: s_add_i32 s6, s6, 0x8001
+; GFX8-NEXT: s_min_i32 s5, s5, -1
; GFX8-NEXT: s_sext_i32_i16 s6, s6
; GFX8-NEXT: s_sext_i32_i16 s7, s11
-; GFX8-NEXT: s_addk_i32 s5, 0x8000
+; GFX8-NEXT: s_add_i32 s5, s5, 0x8000
; GFX8-NEXT: s_max_i32 s6, s6, s7
; GFX8-NEXT: s_and_b32 s3, 0xffff, s3
; GFX8-NEXT: s_sext_i32_i16 s6, s6
@@ -3943,40 +3933,39 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
; GFX8-LABEL: s_ssubsat_v8i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_sext_i32_i16 s16, s0
-; GFX8-NEXT: s_sext_i32_i16 s17, -1
-; GFX8-NEXT: s_max_i32 s18, s16, s17
-; GFX8-NEXT: s_addk_i32 s18, 0x8001
+; GFX8-NEXT: s_max_i32 s17, s16, -1
+; GFX8-NEXT: s_add_i32 s17, s17, 0x8001
; GFX8-NEXT: s_lshr_b32 s12, s4, 16
-; GFX8-NEXT: s_min_i32 s16, s16, s17
-; GFX8-NEXT: s_sext_i32_i16 s18, s18
+; GFX8-NEXT: s_min_i32 s16, s16, -1
+; GFX8-NEXT: s_sext_i32_i16 s17, s17
; GFX8-NEXT: s_sext_i32_i16 s4, s4
-; GFX8-NEXT: s_addk_i32 s16, 0x8000
-; GFX8-NEXT: s_max_i32 s4, s18, s4
+; GFX8-NEXT: s_add_i32 s16, s16, 0x8000
+; GFX8-NEXT: s_max_i32 s4, s17, s4
; GFX8-NEXT: s_sext_i32_i16 s4, s4
; GFX8-NEXT: s_sext_i32_i16 s16, s16
; GFX8-NEXT: s_lshr_b32 s8, s0, 16
; GFX8-NEXT: s_min_i32 s4, s4, s16
; GFX8-NEXT: s_sub_i32 s0, s0, s4
; GFX8-NEXT: s_sext_i32_i16 s4, s8
-; GFX8-NEXT: s_max_i32 s16, s4, s17
-; GFX8-NEXT: s_addk_i32 s16, 0x8001
-; GFX8-NEXT: s_min_i32 s4, s4, s17
+; GFX8-NEXT: s_max_i32 s16, s4, -1
+; GFX8-NEXT: s_add_i32 s16, s16, 0x8001
+; GFX8-NEXT: s_min_i32 s4, s4, -1
; GFX8-NEXT: s_sext_i32_i16 s16, s16
; GFX8-NEXT: s_sext_i32_i16 s12, s12
-; GFX8-NEXT: s_addk_i32 s4, 0x8000
+; GFX8-NEXT: s_add_i32 s4, s4, 0x8000
; GFX8-NEXT: s_max_i32 s12, s16, s12
; GFX8-NEXT: s_sext_i32_i16 s12, s12
; GFX8-NEXT: s_sext_i32_i16 s4, s4
; GFX8-NEXT: s_min_i32 s4, s12, s4
; GFX8-NEXT: s_sub_i32 s4, s8, s4
; GFX8-NEXT: s_sext_i32_i16 s8, s1
-; GFX8-NEXT: s_max_i32 s12, s8, s17
-; GFX8-NEXT: s_addk_i32 s12, 0x8001
+; GFX8-NEXT: s_max_i32 s12, s8, -1
+; GFX8-NEXT: s_add_i32 s12, s12, 0x8001
; GFX8-NEXT: s_lshr_b32 s13, s5, 16
-; GFX8-NEXT: s_min_i32 s8, s8, s17
+; GFX8-NEXT: s_min_i32 s8, s8, -1
; GFX8-NEXT: s_sext_i32_i16 s12, s12
; GFX8-NEXT: s_sext_i32_i16 s5, s5
-; GFX8-NEXT: s_addk_i32 s8, 0x8000
+; GFX8-NEXT: s_add_i32 s8, s8, 0x8000
; GFX8-NEXT: s_max_i32 s5, s12, s5
; GFX8-NEXT: s_sext_i32_i16 s5, s5
; GFX8-NEXT: s_sext_i32_i16 s8, s8
@@ -3984,25 +3973,25 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
; GFX8-NEXT: s_min_i32 s5, s5, s8
; GFX8-NEXT: s_sub_i32 s1, s1, s5
; GFX8-NEXT: s_sext_i32_i16 s5, s9
-; GFX8-NEXT: s_max_i32 s8, s5, s17
-; GFX8-NEXT: s_addk_i32 s8, 0x8001
-; GFX8-NEXT: s_min_i32 s5, s5, s17
+; GFX8-NEXT: s_max_i32 s8, s5, -1
+; GFX8-NEXT: s_add_i32 s8, s8, 0x8001
+; GFX8-NEXT: s_min_i32 s5, s5, -1
; GFX8-NEXT: s_sext_i32_i16 s8, s8
; GFX8-NEXT: s_sext_i32_i16 s12, s13
-; GFX8-NEXT: s_addk_i32 s5, 0x8000
+; GFX8-NEXT: s_add_i32 s5, s5, 0x8000
; GFX8-NEXT: s_max_i32 s8, s8, s12
; GFX8-NEXT: s_sext_i32_i16 s8, s8
; GFX8-NEXT: s_sext_i32_i16 s5, s5
; GFX8-NEXT: s_min_i32 s5, s8, s5
; GFX8-NEXT: s_sext_i32_i16 s8, s2
; GFX8-NEXT: s_sub_i32 s5, s9, s5
-; GFX8-NEXT: s_max_i32 s9, s8, s17
-; GFX8-NEXT: s_addk_i32 s9, 0x8001
+; GFX8-NEXT: s_max_i32 s9, s8, -1
+; GFX8-NEXT: s_add_i32 s9, s9, 0x8001
; GFX8-NEXT: s_lshr_b32 s14, s6, 16
-; GFX8-NEXT: s_min_i32 s8, s8, s17
+; GFX8-NEXT: s_min_i32 s8, s8, -1
; GFX8-NEXT: s_sext_i32_i16 s9, s9
; GFX8-NEXT: s_sext_i32_i16 s6, s6
-; GFX8-NEXT: s_addk_i32 s8, 0x8000
+; GFX8-NEXT: s_add_i32 s8, s8, 0x8000
; GFX8-NEXT: s_max_i32 s6, s9, s6
; GFX8-NEXT: s_sext_i32_i16 s6, s6
; GFX8-NEXT: s_sext_i32_i16 s8, s8
@@ -4010,24 +3999,24 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
; GFX8-NEXT: s_min_i32 s6, s6, s8
; GFX8-NEXT: s_sub_i32 s2, s2, s6
; GFX8-NEXT: s_sext_i32_i16 s6, s10
-; GFX8-NEXT: s_max_i32 s8, s6, s17
-; GFX8-NEXT: s_addk_i32 s8, 0x8001
-; GFX8-NEXT: s_min_i32 s6, s6, s17
+; GFX8-NEXT: s_max_i32 s8, s6, -1
+; GFX8-NEXT: s_add_i32 s8, s8, 0x8001
+; GFX8-NEXT: s_min_i32 s6, s6, -1
; GFX8-NEXT: s_sext_i32_i16 s8, s8
; GFX8-NEXT: s_sext_i32_i16 s9, s14
-; GFX8-NEXT: s_addk_i32 s6, 0x8000
+; GFX8-NEXT: s_add_i32 s6, s6, 0x8000
; GFX8-NEXT: s_max_i32 s8, s8, s9
; GFX8-NEXT: s_sext_i32_i16 s8, s8
; GFX8-NEXT: s_sext_i32_i16 s6, s6
; GFX8-NEXT: s_min_i32 s6, s8, s6
; GFX8-NEXT: s_sext_i32_i16 s8, s3
-; GFX8-NEXT: s_max_i32 s9, s8, s17
-; GFX8-NEXT: s_addk_i32 s9, 0x8001
+; GFX8-NEXT: s_max_i32 s9, s8, -1
+; GFX8-NEXT: s_add_i32 s9, s9, 0x8001
; GFX8-NEXT: s_lshr_b32 s15, s7, 16
-; GFX8-NEXT: s_min_i32 s8, s8, s17
+; GFX8-NEXT: s_min_i32 s8, s8, -1
; GFX8-NEXT: s_sext_i32_i16 s9, s9
; GFX8-NEXT: s_sext_i32_i16 s7, s7
-; GFX8-NEXT: s_addk_i32 s8, 0x8000
+; GFX8-NEXT: s_add_i32 s8, s8, 0x8000
; GFX8-NEXT: s_max_i32 s7, s9, s7
; GFX8-NEXT: s_sext_i32_i16 s7, s7
; GFX8-NEXT: s_sext_i32_i16 s8, s8
@@ -4035,15 +4024,15 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
; GFX8-NEXT: s_min_i32 s7, s7, s8
; GFX8-NEXT: s_sub_i32 s3, s3, s7
; GFX8-NEXT: s_sext_i32_i16 s7, s11
-; GFX8-NEXT: s_max_i32 s8, s7, s17
-; GFX8-NEXT: s_addk_i32 s8, 0x8001
+; GFX8-NEXT: s_max_i32 s8, s7, -1
+; GFX8-NEXT: s_add_i32 s8, s8, 0x8001
; GFX8-NEXT: s_and_b32 s4, 0xffff, s4
-; GFX8-NEXT: s_min_i32 s7, s7, s17
+; GFX8-NEXT: s_min_i32 s7, s7, -1
; GFX8-NEXT: s_sext_i32_i16 s8, s8
; GFX8-NEXT: s_sext_i32_i16 s9, s15
; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
; GFX8-NEXT: s_lshl_b32 s4, s4, 16
-; GFX8-NEXT: s_addk_i32 s7, 0x8000
+; GFX8-NEXT: s_add_i32 s7, s7, 0x8000
; GFX8-NEXT: s_max_i32 s8, s8, s9
; GFX8-NEXT: s_or_b32 s0, s0, s4
; GFX8-NEXT: s_and_b32 s4, 0xffff, s5
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll
index 8c0393b627110..2378284a521f6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll
@@ -34,7 +34,6 @@ define amdgpu_kernel void @sgpr_isnan_f16(ptr addrspace(1) %out, half %x) {
; GFX7GLISEL-NEXT: s_mov_b32 s2, -1
; GFX7GLISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX7GLISEL-NEXT: s_and_b32 s3, s3, 0x7fff
-; GFX7GLISEL-NEXT: s_and_b32 s3, 0xffff, s3
; GFX7GLISEL-NEXT: s_cmpk_gt_u32 s3, 0x7c00
; GFX7GLISEL-NEXT: s_cselect_b32 s3, 1, 0
; GFX7GLISEL-NEXT: s_bfe_i32 s3, s3, 0x10000
diff --git a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
index 556c553cfd7d5..a0cfcf671ed0b 100644
--- a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
@@ -214,17 +214,15 @@ define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg
; GISEL-VI-LABEL: basic_smax_smin_sgpr:
; GISEL-VI: ; %bb.0:
; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GISEL-VI-NEXT: s_sext_i32_i16 s4, 0
-; GISEL-VI-NEXT: s_sext_i32_i16 s5, 0xff
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-VI-NEXT: s_sext_i32_i16 s3, s3
; GISEL-VI-NEXT: s_sext_i32_i16 s2, s2
-; GISEL-VI-NEXT: s_max_i32 s3, s3, s4
-; GISEL-VI-NEXT: s_max_i32 s2, s2, s4
+; GISEL-VI-NEXT: s_max_i32 s3, s3, 0
+; GISEL-VI-NEXT: s_max_i32 s2, s2, 0
; GISEL-VI-NEXT: s_sext_i32_i16 s3, s3
; GISEL-VI-NEXT: s_sext_i32_i16 s2, s2
-; GISEL-VI-NEXT: s_min_i32 s3, s3, s5
-; GISEL-VI-NEXT: s_min_i32 s2, s2, s5
+; GISEL-VI-NEXT: s_min_i32 s3, s3, 0xff
+; GISEL-VI-NEXT: s_min_i32 s2, s2, 0xff
; GISEL-VI-NEXT: s_and_b32 s3, 0xffff, s3
; GISEL-VI-NEXT: s_and_b32 s2, 0xffff, s2
; GISEL-VI-NEXT: s_lshl_b32 s3, s3, 16
@@ -238,18 +236,16 @@ define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg
; GISEL-GFX9-LABEL: basic_smax_smin_sgpr:
; GISEL-GFX9: ; %bb.0:
; GISEL-GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GISEL-GFX9-NEXT: s_sext_i32_i16 s4, 0
-; GISEL-GFX9-NEXT: s_sext_i32_i16 s5, 0xff
; GISEL-GFX9-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-GFX9-NEXT: s_sext_i32_i16 s2, s2
; GISEL-GFX9-NEXT: s_sext_i32_i16 s3, s3
-; GISEL-GFX9-NEXT: s_max_i32 s2, s2, s4
-; GISEL-GFX9-NEXT: s_max_i32 s3, s3, s4
+; GISEL-GFX9-NEXT: s_max_i32 s2, s2, 0
+; GISEL-GFX9-NEXT: s_max_i32 s3, s3, 0
; GISEL-GFX9-NEXT: s_sext_i32_i16 s2, s2
; GISEL-GFX9-NEXT: s_sext_i32_i16 s3, s3
-; GISEL-GFX9-NEXT: s_min_i32 s2, s2, s5
-; GISEL-GFX9-NEXT: s_min_i32 s3, s3, s5
+; GISEL-GFX9-NEXT: s_min_i32 s2, s2, 0xff
+; GISEL-GFX9-NEXT: s_min_i32 s3, s3, 0xff
; GISEL-GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s3
; GISEL-GFX9-NEXT: v_mov_b32_e32 v0, s2
; GISEL-GFX9-NEXT: global_store_dword v1, v0, s[0:1]
@@ -258,18 +254,16 @@ define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg
; GISEL-GFX11-LABEL: basic_smax_smin_sgpr:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GISEL-GFX11-NEXT: s_sext_i32_i16 s4, 0
-; GISEL-GFX11-NEXT: s_sext_i32_i16 s5, 0xff
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-GFX11-NEXT: s_sext_i32_i16 s2, s2
; GISEL-GFX11-NEXT: s_sext_i32_i16 s3, s3
-; GISEL-GFX11-NEXT: s_max_i32 s2, s2, s4
-; GISEL-GFX11-NEXT: s_max_i32 s3, s3, s4
+; GISEL-GFX11-NEXT: s_max_i32 s2, s2, 0
+; GISEL-GFX11-NEXT: s_max_i32 s3, s3, 0
; GISEL-GFX11-NEXT: s_sext_i32_i16 s2, s2
; GISEL-GFX11-NEXT: s_sext_i32_i16 s3, s3
-; GISEL-GFX11-NEXT: s_min_i32 s2, s2, s5
-; GISEL-GFX11-NEXT: s_min_i32 s3, s3, s5
+; GISEL-GFX11-NEXT: s_min_i32 s2, s2, 0xff
+; GISEL-GFX11-NEXT: s_min_i32 s3, s3, 0xff
; GISEL-GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GISEL-GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, s3
; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
@@ -279,18 +273,16 @@ define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg
; GISEL-GFX12-LABEL: basic_smax_smin_sgpr:
; GISEL-GFX12: ; %bb.0:
; GISEL-GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GISEL-GFX12-NEXT: s_sext_i32_i16 s4, 0
-; GISEL-GFX12-NEXT: s_sext_i32_i16 s5, 0xff
; GISEL-GFX12-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX12-NEXT: s_wait_kmcnt 0x0
; GISEL-GFX12-NEXT: s_sext_i32_i16 s2, s2
; GISEL-GFX12-NEXT: s_sext_i32_i16 s3, s3
-; GISEL-GFX12-NEXT: s_max_i32 s2, s2, s4
-; GISEL-GFX12-NEXT: s_max_i32 s3, s3, s4
+; GISEL-GFX12-NEXT: s_max_i32 s2, s2, 0
+; GISEL-GFX12-NEXT: s_max_i32 s3, s3, 0
; GISEL-GFX12-NEXT: s_sext_i32_i16 s2, s2
; GISEL-GFX12-NEXT: s_sext_i32_i16 s3, s3
-; GISEL-GFX12-NEXT: s_min_i32 s2, s2, s5
-; GISEL-GFX12-NEXT: s_min_i32 s3, s3, s5
+; GISEL-GFX12-NEXT: s_min_i32 s2, s2, 0xff
+; GISEL-GFX12-NEXT: s_min_i32 s3, s3, 0xff
; GISEL-GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GISEL-GFX12-NEXT: s_pack_ll_b32_b16 s2, s2, s3
; GISEL-GFX12-NEXT: v_mov_b32_e32 v0, s2
@@ -694,18 +686,16 @@ define amdgpu_kernel void @vec_smax_smin_sgpr(ptr addrspace(1) %out, <2 x i16> i
; GISEL-VI: ; %bb.0:
; GISEL-VI-NEXT: s_load_dword s2, s[4:5], 0x2c
; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GISEL-VI-NEXT: s_sext_i32_i16 s3, 0
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: s_lshr_b32 s4, s2, 16
+; GISEL-VI-NEXT: s_lshr_b32 s3, s2, 16
+; GISEL-VI-NEXT: s_sext_i32_i16 s3, s3
; GISEL-VI-NEXT: s_sext_i32_i16 s2, s2
-; GISEL-VI-NEXT: s_sext_i32_i16 s4, s4
-; GISEL-VI-NEXT: s_max_i32 s2, s2, s3
-; GISEL-VI-NEXT: s_max_i32 s3, s4, s3
-; GISEL-VI-NEXT: s_sext_i32_i16 s4, 0xff
+; GISEL-VI-NEXT: s_max_i32 s3, s3, 0
+; GISEL-VI-NEXT: s_max_i32 s2, s2, 0
; GISEL-VI-NEXT: s_sext_i32_i16 s3, s3
; GISEL-VI-NEXT: s_sext_i32_i16 s2, s2
-; GISEL-VI-NEXT: s_min_i32 s3, s3, s4
-; GISEL-VI-NEXT: s_min_i32 s2, s2, s4
+; GISEL-VI-NEXT: s_min_i32 s3, s3, 0xff
+; GISEL-VI-NEXT: s_min_i32 s2, s2, 0xff
; GISEL-VI-NEXT: s_and_b32 s3, 0xffff, s3
; GISEL-VI-NEXT: s_and_b32 s2, 0xffff, s2
; GISEL-VI-NEXT: s_lshl_b32 s3, s3, 16
More information about the llvm-commits mailing list