[llvm] [AMDGPU] Disable -amdgpu-codegenprepare-widen-16-bit-ops by default (PR #111710)

Jay Foad via llvm-commits llvm-commits at lists.llvm.org
Wed Oct 9 09:43:49 PDT 2024


https://github.com/jayfoad created https://github.com/llvm/llvm-project/pull/111710

Fixes #64591


>From 340ddba33a7f02ed7735143a730db2d6064e734a Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Wed, 9 Oct 2024 17:35:30 +0100
Subject: [PATCH] [AMDGPU] Disable -amdgpu-codegenprepare-widen-16-bit-ops by
 default

Fixes #64591
---
 .../Target/AMDGPU/AMDGPUCodeGenPrepare.cpp    |   2 +-
 .../CodeGen/AMDGPU/GlobalISel/add.v2i16.ll    |  70 ++-
 llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll   | 163 ++++---
 llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll   | 149 +++++--
 llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll    |  77 +---
 .../CodeGen/AMDGPU/GlobalISel/sext_inreg.ll   | 169 +++----
 .../AMDGPU/GlobalISel/shl-ext-reduce.ll       |   7 +-
 llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll    |  88 ++--
 .../CodeGen/AMDGPU/GlobalISel/sub.v2i16.ll    |  54 ++-
 llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll   |  26 +-
 llvm/test/CodeGen/AMDGPU/add.v2i16.ll         |  33 +-
 llvm/test/CodeGen/AMDGPU/anyext.ll            |   8 +-
 llvm/test/CodeGen/AMDGPU/bitreverse.ll        |   7 +-
 .../CodeGen/AMDGPU/calling-conventions.ll     | 413 +++++++++---------
 llvm/test/CodeGen/AMDGPU/ctlz.ll              |  28 +-
 llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll   |   6 +-
 llvm/test/CodeGen/AMDGPU/cttz.ll              |  15 +-
 llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll   |  34 +-
 llvm/test/CodeGen/AMDGPU/fneg.ll              |  13 +-
 llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll        |  14 +-
 llvm/test/CodeGen/AMDGPU/min.ll               | 260 ++++++-----
 llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll     |  19 +-
 llvm/test/CodeGen/AMDGPU/shl.v2i16.ll         |   6 +-
 llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll     | 290 ++++++------
 llvm/test/CodeGen/AMDGPU/sra.ll               |  79 ++--
 llvm/test/CodeGen/AMDGPU/sub.v2i16.ll         |  34 +-
 llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll  |   2 +-
 27 files changed, 977 insertions(+), 1089 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 052e1140533f3f..d1654f8daea9dc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -49,7 +49,7 @@ static cl::opt<bool> Widen16BitOps(
   "amdgpu-codegenprepare-widen-16-bit-ops",
   cl::desc("Widen uniform 16-bit instructions to 32-bit in AMDGPUCodeGenPrepare"),
   cl::ReallyHidden,
-  cl::init(true));
+  cl::init(false));
 
 static cl::opt<bool>
     BreakLargePHIs("amdgpu-codegenprepare-break-large-phis",
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll
index c8b82716a9fe13..9c25a07bc8dc3d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll
@@ -281,12 +281,12 @@ define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_splat(<2 x i16> inreg %a) {
 ; GFX8-LABEL: s_add_v2i16_neg_inline_imm_splat:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX8-NEXT:    s_add_i32 s0, s0, 0xffc0
-; GFX8-NEXT:    s_add_i32 s1, s1, 0xffc0
+; GFX8-NEXT:    s_addk_i32 s1, 0xffc0
+; GFX8-NEXT:    s_addk_i32 s0, 0xffc0
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX8-NEXT:    s_or_b32 s0, s1, s0
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_add_v2i16_neg_inline_imm_splat:
@@ -323,12 +323,12 @@ define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_lo(<2 x i16> inreg %a) {
 ; GFX8-LABEL: s_add_v2i16_neg_inline_imm_lo:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX8-NEXT:    s_add_i32 s0, s0, 0xffc0
 ; GFX8-NEXT:    s_add_i32 s1, s1, 4
+; GFX8-NEXT:    s_addk_i32 s0, 0xffc0
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX8-NEXT:    s_or_b32 s0, s1, s0
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_add_v2i16_neg_inline_imm_lo:
@@ -365,12 +365,12 @@ define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_hi(<2 x i16> inreg %a) {
 ; GFX8-LABEL: s_add_v2i16_neg_inline_imm_hi:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX8-NEXT:    s_addk_i32 s1, 0xffc0
 ; GFX8-NEXT:    s_add_i32 s0, s0, 4
-; GFX8-NEXT:    s_add_i32 s1, s1, 0xffc0
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX8-NEXT:    s_or_b32 s0, s1, s0
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_add_v2i16_neg_inline_imm_hi:
@@ -408,14 +408,13 @@ define amdgpu_ps i32 @s_add_v2i16(<2 x i16> inreg %a, <2 x i16> inreg %b) {
 ; GFX8-LABEL: s_add_v2i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
-; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff
-; GFX8-NEXT:    s_add_i32 s0, s0, s1
 ; GFX8-NEXT:    s_add_i32 s2, s2, s3
-; GFX8-NEXT:    s_lshl_b32 s1, s2, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX8-NEXT:    s_or_b32 s0, s1, s0
+; GFX8-NEXT:    s_add_i32 s0, s0, s1
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s2
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_add_v2i16:
@@ -461,14 +460,13 @@ define amdgpu_ps i32 @s_add_v2i16_fneg_lhs(<2 x half> inreg %a, <2 x i16> inreg
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_xor_b32 s0, s0, 0x80008000
 ; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
-; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff
-; GFX8-NEXT:    s_add_i32 s0, s0, s1
 ; GFX8-NEXT:    s_add_i32 s2, s2, s3
-; GFX8-NEXT:    s_lshl_b32 s1, s2, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX8-NEXT:    s_or_b32 s0, s1, s0
+; GFX8-NEXT:    s_add_i32 s0, s0, s1
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s2
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_add_v2i16_fneg_lhs:
@@ -517,14 +515,13 @@ define amdgpu_ps i32 @s_add_v2i16_fneg_rhs(<2 x i16> inreg %a, <2 x half> inreg
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_xor_b32 s1, s1, 0x80008000
 ; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
-; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff
-; GFX8-NEXT:    s_add_i32 s0, s0, s1
 ; GFX8-NEXT:    s_add_i32 s2, s2, s3
-; GFX8-NEXT:    s_lshl_b32 s1, s2, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX8-NEXT:    s_or_b32 s0, s1, s0
+; GFX8-NEXT:    s_add_i32 s0, s0, s1
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s2
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_add_v2i16_fneg_rhs:
@@ -580,14 +577,13 @@ define amdgpu_ps i32 @s_add_v2i16_fneg_lhs_fneg_rhs(<2 x half> inreg %a, <2 x ha
 ; GFX8-NEXT:    s_xor_b32 s0, s0, 0x80008000
 ; GFX8-NEXT:    s_xor_b32 s1, s1, 0x80008000
 ; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
-; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff
-; GFX8-NEXT:    s_add_i32 s0, s0, s1
 ; GFX8-NEXT:    s_add_i32 s2, s2, s3
-; GFX8-NEXT:    s_lshl_b32 s1, s2, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX8-NEXT:    s_or_b32 s0, s1, s0
+; GFX8-NEXT:    s_add_i32 s0, s0, s1
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s2
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_add_v2i16_fneg_lhs_fneg_rhs:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
index 63f5464371cc62..58ae28bc48f4aa 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
@@ -79,22 +79,30 @@ define amdgpu_ps i8 @s_ashr_i8(i8 inreg %value, i8 inreg %amount) {
 ;
 ; GFX8-LABEL: s_ashr_i8:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_sext_i32_i8 s0, s0
-; GFX8-NEXT:    s_sext_i32_i8 s1, s1
+; GFX8-NEXT:    s_lshl_b32 s0, s0, 8
+; GFX8-NEXT:    s_sext_i32_i16 s0, s0
+; GFX8-NEXT:    s_and_b32 s1, s1, 0xff
+; GFX8-NEXT:    s_ashr_i32 s0, s0, 8
+; GFX8-NEXT:    s_sext_i32_i16 s0, s0
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX8-NEXT:    s_ashr_i32 s0, s0, s1
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_ashr_i8:
 ; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_and_b32 s1, s1, 0xff
 ; GFX9-NEXT:    s_sext_i32_i8 s0, s0
-; GFX9-NEXT:    s_sext_i32_i8 s1, s1
+; GFX9-NEXT:    s_sext_i32_i16 s0, s0
+; GFX9-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX9-NEXT:    s_ashr_i32 s0, s0, s1
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10PLUS-LABEL: s_ashr_i8:
 ; GFX10PLUS:       ; %bb.0:
 ; GFX10PLUS-NEXT:    s_sext_i32_i8 s0, s0
-; GFX10PLUS-NEXT:    s_sext_i32_i8 s1, s1
+; GFX10PLUS-NEXT:    s_and_b32 s1, s1, 0xff
+; GFX10PLUS-NEXT:    s_sext_i32_i16 s0, s0
+; GFX10PLUS-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX10PLUS-NEXT:    s_ashr_i32 s0, s0, s1
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
   %result = ashr i8 %value, %amount
@@ -102,15 +110,30 @@ define amdgpu_ps i8 @s_ashr_i8(i8 inreg %value, i8 inreg %amount) {
 }
 
 define amdgpu_ps i8 @s_ashr_i8_7(i8 inreg %value) {
-; GCN-LABEL: s_ashr_i8_7:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_sext_i32_i8 s0, s0
-; GCN-NEXT:    s_ashr_i32 s0, s0, 7
-; GCN-NEXT:    ; return to shader part epilog
+; GFX6-LABEL: s_ashr_i8_7:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_sext_i32_i8 s0, s0
+; GFX6-NEXT:    s_ashr_i32 s0, s0, 7
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_ashr_i8_7:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_lshl_b32 s0, s0, 8
+; GFX8-NEXT:    s_sext_i32_i16 s0, s0
+; GFX8-NEXT:    s_ashr_i32 s0, s0, 15
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_ashr_i8_7:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_sext_i32_i8 s0, s0
+; GFX9-NEXT:    s_sext_i32_i16 s0, s0
+; GFX9-NEXT:    s_ashr_i32 s0, s0, 7
+; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10PLUS-LABEL: s_ashr_i8_7:
 ; GFX10PLUS:       ; %bb.0:
 ; GFX10PLUS-NEXT:    s_sext_i32_i8 s0, s0
+; GFX10PLUS-NEXT:    s_sext_i32_i16 s0, s0
 ; GFX10PLUS-NEXT:    s_ashr_i32 s0, s0, 7
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
   %result = ashr i8 %value, 7
@@ -652,21 +675,21 @@ define amdgpu_ps i16 @s_ashr_i16(i16 inreg %value, i16 inreg %amount) {
 ; GFX8-LABEL: s_ashr_i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_sext_i32_i16 s0, s0
-; GFX8-NEXT:    s_sext_i32_i16 s1, s1
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX8-NEXT:    s_ashr_i32 s0, s0, s1
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_ashr_i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_sext_i32_i16 s0, s0
-; GFX9-NEXT:    s_sext_i32_i16 s1, s1
+; GFX9-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX9-NEXT:    s_ashr_i32 s0, s0, s1
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10PLUS-LABEL: s_ashr_i16:
 ; GFX10PLUS:       ; %bb.0:
 ; GFX10PLUS-NEXT:    s_sext_i32_i16 s0, s0
-; GFX10PLUS-NEXT:    s_sext_i32_i16 s1, s1
+; GFX10PLUS-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX10PLUS-NEXT:    s_ashr_i32 s0, s0, s1
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
   %result = ashr i16 %value, %amount
@@ -827,14 +850,16 @@ define amdgpu_ps i32 @s_ashr_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amou
 ;
 ; GFX8-LABEL: s_ashr_v2i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_sext_i32_i16 s2, s0
-; GFX8-NEXT:    s_bfe_i32 s0, s0, 0x100010
-; GFX8-NEXT:    s_sext_i32_i16 s3, s1
-; GFX8-NEXT:    s_bfe_i32 s1, s1, 0x100010
-; GFX8-NEXT:    s_ashr_i32 s2, s2, s3
+; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
+; GFX8-NEXT:    s_sext_i32_i16 s0, s0
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX8-NEXT:    s_ashr_i32 s0, s0, s1
-; GFX8-NEXT:    s_lshl_b32 s0, s0, 16
-; GFX8-NEXT:    s_and_b32 s1, s2, 0xffff
+; GFX8-NEXT:    s_sext_i32_i16 s1, s2
+; GFX8-NEXT:    s_ashr_i32 s1, s1, s3
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
@@ -1029,23 +1054,27 @@ define amdgpu_ps <2 x i32> @s_ashr_v4i16(<4 x i16> inreg %value, <4 x i16> inreg
 ;
 ; GFX8-LABEL: s_ashr_v4i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_sext_i32_i16 s4, s0
-; GFX8-NEXT:    s_bfe_i32 s0, s0, 0x100010
-; GFX8-NEXT:    s_sext_i32_i16 s5, s1
-; GFX8-NEXT:    s_bfe_i32 s1, s1, 0x100010
-; GFX8-NEXT:    s_sext_i32_i16 s6, s2
-; GFX8-NEXT:    s_bfe_i32 s2, s2, 0x100010
-; GFX8-NEXT:    s_sext_i32_i16 s7, s3
-; GFX8-NEXT:    s_bfe_i32 s3, s3, 0x100010
-; GFX8-NEXT:    s_ashr_i32 s4, s4, s6
+; GFX8-NEXT:    s_lshr_b32 s4, s0, 16
+; GFX8-NEXT:    s_lshr_b32 s6, s2, 16
+; GFX8-NEXT:    s_sext_i32_i16 s0, s0
+; GFX8-NEXT:    s_and_b32 s2, 0xffff, s2
 ; GFX8-NEXT:    s_ashr_i32 s0, s0, s2
-; GFX8-NEXT:    s_ashr_i32 s2, s5, s7
+; GFX8-NEXT:    s_sext_i32_i16 s2, s4
+; GFX8-NEXT:    s_lshr_b32 s5, s1, 16
+; GFX8-NEXT:    s_lshr_b32 s7, s3, 16
+; GFX8-NEXT:    s_ashr_i32 s2, s2, s6
+; GFX8-NEXT:    s_sext_i32_i16 s1, s1
+; GFX8-NEXT:    s_and_b32 s3, 0xffff, s3
 ; GFX8-NEXT:    s_ashr_i32 s1, s1, s3
-; GFX8-NEXT:    s_lshl_b32 s0, s0, 16
-; GFX8-NEXT:    s_and_b32 s3, s4, 0xffff
-; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX8-NEXT:    s_and_b32 s2, s2, 0xffff
-; GFX8-NEXT:    s_or_b32 s0, s0, s3
+; GFX8-NEXT:    s_sext_i32_i16 s3, s5
+; GFX8-NEXT:    s_and_b32 s2, 0xffff, s2
+; GFX8-NEXT:    s_ashr_i32 s3, s3, s7
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX8-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX8-NEXT:    s_or_b32 s0, s0, s2
+; GFX8-NEXT:    s_and_b32 s2, 0xffff, s3
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX8-NEXT:    s_lshl_b32 s2, s2, 16
 ; GFX8-NEXT:    s_or_b32 s1, s1, s2
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
@@ -1236,41 +1265,49 @@ define amdgpu_ps <4 x i32> @s_ashr_v8i16(<8 x i16> inreg %value, <8 x i16> inreg
 ;
 ; GFX8-LABEL: s_ashr_v8i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_sext_i32_i16 s8, s0
-; GFX8-NEXT:    s_bfe_i32 s0, s0, 0x100010
-; GFX8-NEXT:    s_sext_i32_i16 s9, s1
-; GFX8-NEXT:    s_bfe_i32 s1, s1, 0x100010
-; GFX8-NEXT:    s_sext_i32_i16 s12, s4
-; GFX8-NEXT:    s_bfe_i32 s4, s4, 0x100010
-; GFX8-NEXT:    s_sext_i32_i16 s13, s5
-; GFX8-NEXT:    s_bfe_i32 s5, s5, 0x100010
-; GFX8-NEXT:    s_sext_i32_i16 s10, s2
-; GFX8-NEXT:    s_bfe_i32 s2, s2, 0x100010
-; GFX8-NEXT:    s_sext_i32_i16 s14, s6
-; GFX8-NEXT:    s_bfe_i32 s6, s6, 0x100010
+; GFX8-NEXT:    s_lshr_b32 s8, s0, 16
+; GFX8-NEXT:    s_lshr_b32 s12, s4, 16
+; GFX8-NEXT:    s_sext_i32_i16 s0, s0
+; GFX8-NEXT:    s_and_b32 s4, 0xffff, s4
 ; GFX8-NEXT:    s_ashr_i32 s0, s0, s4
-; GFX8-NEXT:    s_ashr_i32 s4, s9, s13
+; GFX8-NEXT:    s_sext_i32_i16 s4, s8
+; GFX8-NEXT:    s_lshr_b32 s9, s1, 16
+; GFX8-NEXT:    s_lshr_b32 s13, s5, 16
+; GFX8-NEXT:    s_ashr_i32 s4, s4, s12
+; GFX8-NEXT:    s_sext_i32_i16 s1, s1
+; GFX8-NEXT:    s_and_b32 s5, 0xffff, s5
 ; GFX8-NEXT:    s_ashr_i32 s1, s1, s5
-; GFX8-NEXT:    s_sext_i32_i16 s11, s3
-; GFX8-NEXT:    s_bfe_i32 s3, s3, 0x100010
-; GFX8-NEXT:    s_sext_i32_i16 s15, s7
-; GFX8-NEXT:    s_bfe_i32 s7, s7, 0x100010
-; GFX8-NEXT:    s_ashr_i32 s5, s10, s14
+; GFX8-NEXT:    s_sext_i32_i16 s5, s9
+; GFX8-NEXT:    s_and_b32 s4, 0xffff, s4
+; GFX8-NEXT:    s_lshr_b32 s10, s2, 16
+; GFX8-NEXT:    s_lshr_b32 s14, s6, 16
+; GFX8-NEXT:    s_ashr_i32 s5, s5, s13
+; GFX8-NEXT:    s_sext_i32_i16 s2, s2
+; GFX8-NEXT:    s_and_b32 s6, 0xffff, s6
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX8-NEXT:    s_lshl_b32 s4, s4, 16
 ; GFX8-NEXT:    s_ashr_i32 s2, s2, s6
-; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX8-NEXT:    s_and_b32 s4, s4, 0xffff
-; GFX8-NEXT:    s_ashr_i32 s8, s8, s12
-; GFX8-NEXT:    s_ashr_i32 s6, s11, s15
+; GFX8-NEXT:    s_sext_i32_i16 s6, s10
+; GFX8-NEXT:    s_or_b32 s0, s0, s4
+; GFX8-NEXT:    s_and_b32 s4, 0xffff, s5
+; GFX8-NEXT:    s_lshr_b32 s11, s3, 16
+; GFX8-NEXT:    s_lshr_b32 s15, s7, 16
+; GFX8-NEXT:    s_ashr_i32 s6, s6, s14
+; GFX8-NEXT:    s_sext_i32_i16 s3, s3
+; GFX8-NEXT:    s_and_b32 s7, 0xffff, s7
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX8-NEXT:    s_lshl_b32 s4, s4, 16
 ; GFX8-NEXT:    s_ashr_i32 s3, s3, s7
+; GFX8-NEXT:    s_sext_i32_i16 s7, s11
 ; GFX8-NEXT:    s_or_b32 s1, s1, s4
-; GFX8-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX8-NEXT:    s_and_b32 s4, s5, 0xffff
-; GFX8-NEXT:    s_lshl_b32 s0, s0, 16
-; GFX8-NEXT:    s_and_b32 s7, s8, 0xffff
+; GFX8-NEXT:    s_and_b32 s4, 0xffff, s6
+; GFX8-NEXT:    s_ashr_i32 s7, s7, s15
+; GFX8-NEXT:    s_and_b32 s2, 0xffff, s2
+; GFX8-NEXT:    s_lshl_b32 s4, s4, 16
 ; GFX8-NEXT:    s_or_b32 s2, s2, s4
-; GFX8-NEXT:    s_lshl_b32 s3, s3, 16
-; GFX8-NEXT:    s_and_b32 s4, s6, 0xffff
-; GFX8-NEXT:    s_or_b32 s0, s0, s7
+; GFX8-NEXT:    s_and_b32 s4, 0xffff, s7
+; GFX8-NEXT:    s_and_b32 s3, 0xffff, s3
+; GFX8-NEXT:    s_lshl_b32 s4, s4, 16
 ; GFX8-NEXT:    s_or_b32 s3, s3, s4
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
index 5dd4fa0809131f..6ebd8c6146095b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
@@ -69,15 +69,36 @@ define i8 @v_lshr_i8_7(i8 %value) {
 }
 
 define amdgpu_ps i8 @s_lshr_i8(i8 inreg %value, i8 inreg %amount) {
-; GCN-LABEL: s_lshr_i8:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_and_b32 s0, s0, 0xff
-; GCN-NEXT:    s_lshr_b32 s0, s0, s1
-; GCN-NEXT:    ; return to shader part epilog
+; GFX6-LABEL: s_lshr_i8:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_and_b32 s0, s0, 0xff
+; GFX6-NEXT:    s_lshr_b32 s0, s0, s1
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_lshr_i8:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_and_b32 s1, s1, 0xff
+; GFX8-NEXT:    s_and_b32 s0, s0, 0xff
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX8-NEXT:    s_lshr_b32 s0, s0, s1
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_lshr_i8:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_and_b32 s1, s1, 0xff
+; GFX9-NEXT:    s_and_b32 s0, s0, 0xff
+; GFX9-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX9-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX9-NEXT:    s_lshr_b32 s0, s0, s1
+; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10PLUS-LABEL: s_lshr_i8:
 ; GFX10PLUS:       ; %bb.0:
 ; GFX10PLUS-NEXT:    s_and_b32 s0, s0, 0xff
+; GFX10PLUS-NEXT:    s_and_b32 s1, s1, 0xff
+; GFX10PLUS-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX10PLUS-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX10PLUS-NEXT:    s_lshr_b32 s0, s0, s1
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
   %result = lshr i8 %value, %amount
@@ -85,14 +106,30 @@ define amdgpu_ps i8 @s_lshr_i8(i8 inreg %value, i8 inreg %amount) {
 }
 
 define amdgpu_ps i8 @s_lshr_i8_7(i8 inreg %value) {
-; GCN-LABEL: s_lshr_i8_7:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_bfe_u32 s0, s0, 0x10007
-; GCN-NEXT:    ; return to shader part epilog
+; GFX6-LABEL: s_lshr_i8_7:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_bfe_u32 s0, s0, 0x10007
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_lshr_i8_7:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_and_b32 s0, s0, 0xff
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX8-NEXT:    s_lshr_b32 s0, s0, 7
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_lshr_i8_7:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_and_b32 s0, s0, 0xff
+; GFX9-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX9-NEXT:    s_lshr_b32 s0, s0, 7
+; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10PLUS-LABEL: s_lshr_i8_7:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_bfe_u32 s0, s0, 0x10007
+; GFX10PLUS-NEXT:    s_and_b32 s0, s0, 0xff
+; GFX10PLUS-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX10PLUS-NEXT:    s_lshr_b32 s0, s0, 7
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
   %result = lshr i8 %value, 7
   ret i8 %result
@@ -619,15 +656,30 @@ define i16 @v_lshr_i16_15(i16 %value) {
 }
 
 define amdgpu_ps i16 @s_lshr_i16(i16 inreg %value, i16 inreg %amount) {
-; GCN-LABEL: s_lshr_i16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_and_b32 s0, s0, 0xffff
-; GCN-NEXT:    s_lshr_b32 s0, s0, s1
-; GCN-NEXT:    ; return to shader part epilog
+; GFX6-LABEL: s_lshr_i16:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX6-NEXT:    s_lshr_b32 s0, s0, s1
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_lshr_i16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX8-NEXT:    s_lshr_b32 s0, s0, s1
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_lshr_i16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX9-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX9-NEXT:    s_lshr_b32 s0, s0, s1
+; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10PLUS-LABEL: s_lshr_i16:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX10PLUS-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX10PLUS-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX10PLUS-NEXT:    s_lshr_b32 s0, s0, s1
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
   %result = lshr i16 %value, %amount
@@ -635,14 +687,27 @@ define amdgpu_ps i16 @s_lshr_i16(i16 inreg %value, i16 inreg %amount) {
 }
 
 define amdgpu_ps i16 @s_lshr_i16_15(i16 inreg %value) {
-; GCN-LABEL: s_lshr_i16_15:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_bfe_u32 s0, s0, 0x1000f
-; GCN-NEXT:    ; return to shader part epilog
+; GFX6-LABEL: s_lshr_i16_15:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_bfe_u32 s0, s0, 0x1000f
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_lshr_i16_15:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX8-NEXT:    s_lshr_b32 s0, s0, 15
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_lshr_i16_15:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX9-NEXT:    s_lshr_b32 s0, s0, 15
+; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10PLUS-LABEL: s_lshr_i16_15:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_bfe_u32 s0, s0, 0x1000f
+; GFX10PLUS-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX10PLUS-NEXT:    s_lshr_b32 s0, s0, 15
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
   %result = lshr i16 %value, 15
   ret i16 %result
@@ -783,13 +848,13 @@ define amdgpu_ps i32 @s_lshr_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amou
 ; GFX8-LABEL: s_lshr_v2i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX8-NEXT:    s_lshr_b32 s0, s0, s1
 ; GFX8-NEXT:    s_lshr_b32 s1, s2, s3
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX8-NEXT:    s_or_b32 s0, s1, s0
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_lshr_v2i16:
@@ -970,21 +1035,21 @@ define amdgpu_ps <2 x i32> @s_lshr_v4i16(<4 x i16> inreg %value, <4 x i16> inreg
 ; GFX8-LABEL: s_lshr_v4i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_lshr_b32 s4, s0, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX8-NEXT:    s_lshr_b32 s6, s2, 16
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX8-NEXT:    s_and_b32 s2, 0xffff, s2
 ; GFX8-NEXT:    s_lshr_b32 s5, s1, 16
-; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GFX8-NEXT:    s_lshr_b32 s7, s3, 16
 ; GFX8-NEXT:    s_lshr_b32 s0, s0, s2
 ; GFX8-NEXT:    s_lshr_b32 s2, s4, s6
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX8-NEXT:    s_and_b32 s3, 0xffff, s3
 ; GFX8-NEXT:    s_lshr_b32 s1, s1, s3
 ; GFX8-NEXT:    s_lshr_b32 s3, s5, s7
 ; GFX8-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX8-NEXT:    s_or_b32 s0, s2, s0
+; GFX8-NEXT:    s_or_b32 s0, s0, s2
 ; GFX8-NEXT:    s_lshl_b32 s2, s3, 16
-; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff
-; GFX8-NEXT:    s_or_b32 s1, s2, s1
+; GFX8-NEXT:    s_or_b32 s1, s1, s2
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_lshr_v4i16:
@@ -1155,37 +1220,37 @@ define amdgpu_ps <4 x i32> @s_lshr_v8i16(<8 x i16> inreg %value, <8 x i16> inreg
 ; GFX8-LABEL: s_lshr_v8i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_lshr_b32 s8, s0, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX8-NEXT:    s_lshr_b32 s12, s4, 16
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX8-NEXT:    s_and_b32 s4, 0xffff, s4
 ; GFX8-NEXT:    s_lshr_b32 s9, s1, 16
-; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GFX8-NEXT:    s_lshr_b32 s13, s5, 16
 ; GFX8-NEXT:    s_lshr_b32 s0, s0, s4
 ; GFX8-NEXT:    s_lshr_b32 s4, s8, s12
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX8-NEXT:    s_and_b32 s5, 0xffff, s5
 ; GFX8-NEXT:    s_lshr_b32 s10, s2, 16
-; GFX8-NEXT:    s_and_b32 s2, s2, 0xffff
 ; GFX8-NEXT:    s_lshr_b32 s14, s6, 16
 ; GFX8-NEXT:    s_lshr_b32 s1, s1, s5
 ; GFX8-NEXT:    s_lshr_b32 s5, s9, s13
+; GFX8-NEXT:    s_and_b32 s2, 0xffff, s2
+; GFX8-NEXT:    s_and_b32 s6, 0xffff, s6
 ; GFX8-NEXT:    s_lshl_b32 s4, s4, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX8-NEXT:    s_lshr_b32 s11, s3, 16
-; GFX8-NEXT:    s_and_b32 s3, s3, 0xffff
 ; GFX8-NEXT:    s_lshr_b32 s15, s7, 16
 ; GFX8-NEXT:    s_lshr_b32 s2, s2, s6
 ; GFX8-NEXT:    s_lshr_b32 s6, s10, s14
-; GFX8-NEXT:    s_or_b32 s0, s4, s0
+; GFX8-NEXT:    s_and_b32 s3, 0xffff, s3
+; GFX8-NEXT:    s_and_b32 s7, 0xffff, s7
+; GFX8-NEXT:    s_or_b32 s0, s0, s4
 ; GFX8-NEXT:    s_lshl_b32 s4, s5, 16
-; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GFX8-NEXT:    s_lshr_b32 s3, s3, s7
 ; GFX8-NEXT:    s_lshr_b32 s7, s11, s15
-; GFX8-NEXT:    s_or_b32 s1, s4, s1
+; GFX8-NEXT:    s_or_b32 s1, s1, s4
 ; GFX8-NEXT:    s_lshl_b32 s4, s6, 16
-; GFX8-NEXT:    s_and_b32 s2, s2, 0xffff
-; GFX8-NEXT:    s_or_b32 s2, s4, s2
+; GFX8-NEXT:    s_or_b32 s2, s2, s4
 ; GFX8-NEXT:    s_lshl_b32 s4, s7, 16
-; GFX8-NEXT:    s_and_b32 s3, s3, 0xffff
-; GFX8-NEXT:    s_or_b32 s3, s4, s3
+; GFX8-NEXT:    s_or_b32 s3, s3, s4
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_lshr_v8i16:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
index 42f1bf84c04207..306b5579bebbcb 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
@@ -7,37 +7,18 @@
 ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s
 
 define amdgpu_ps i16 @s_mul_i16(i16 inreg %num, i16 inreg %den) {
-; GFX7-LABEL: s_mul_i16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_mul_i32 s0, s0, s1
-; GFX7-NEXT:    ; return to shader part epilog
-;
-; GFX8-LABEL: s_mul_i16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff
-; GFX8-NEXT:    s_mul_i32 s0, s0, s1
-; GFX8-NEXT:    ; return to shader part epilog
-;
-; GFX9-LABEL: s_mul_i16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX9-NEXT:    s_and_b32 s1, s1, 0xffff
-; GFX9-NEXT:    s_mul_i32 s0, s0, s1
-; GFX9-NEXT:    ; return to shader part epilog
+; GCN-LABEL: s_mul_i16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_mul_i32 s0, s0, s1
+; GCN-NEXT:    ; return to shader part epilog
 ;
 ; GFX10PLUS-LABEL: s_mul_i16:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX10PLUS-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GFX10PLUS-NEXT:    s_mul_i32 s0, s0, s1
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-LABEL: s_mul_i16:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX12-NEXT:    s_and_b32 s1, s1, 0xffff
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_mul_i32 s0, s0, s1
 ; GFX12-NEXT:    ; return to shader part epilog
   %result = mul i16 %num, %den
@@ -93,35 +74,27 @@ define amdgpu_ps zeroext i16 @s_mul_i16_zeroext(i16 inreg zeroext %num, i16 inre
 ;
 ; GFX8-LABEL: s_mul_i16_zeroext:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GFX8-NEXT:    s_mul_i32 s0, s0, s1
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_mul_i16_zeroext:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX9-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GFX9-NEXT:    s_mul_i32 s0, s0, s1
-; GFX9-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX9-NEXT:    s_and_b32 s0, 0xffff, s0
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10PLUS-LABEL: s_mul_i16_zeroext:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX10PLUS-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GFX10PLUS-NEXT:    s_mul_i32 s0, s0, s1
-; GFX10PLUS-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX10PLUS-NEXT:    s_and_b32 s0, 0xffff, s0
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-LABEL: s_mul_i16_zeroext:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX12-NEXT:    s_and_b32 s1, s1, 0xffff
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_mul_i32 s0, s0, s1
-; GFX12-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_b32 s0, 0xffff, s0
 ; GFX12-NEXT:    ; return to shader part epilog
   %result = mul i16 %num, %den
   ret i16 %result
@@ -170,42 +143,22 @@ define zeroext i16 @v_mul_i16_zeroext(i16 zeroext %num, i16 zeroext %den) {
 }
 
 define amdgpu_ps signext i16 @s_mul_i16_signext(i16 inreg signext %num, i16 inreg signext %den) {
-; GFX7-LABEL: s_mul_i16_signext:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_mul_i32 s0, s0, s1
-; GFX7-NEXT:    s_sext_i32_i16 s0, s0
-; GFX7-NEXT:    ; return to shader part epilog
-;
-; GFX8-LABEL: s_mul_i16_signext:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff
-; GFX8-NEXT:    s_mul_i32 s0, s0, s1
-; GFX8-NEXT:    s_sext_i32_i16 s0, s0
-; GFX8-NEXT:    ; return to shader part epilog
-;
-; GFX9-LABEL: s_mul_i16_signext:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX9-NEXT:    s_and_b32 s1, s1, 0xffff
-; GFX9-NEXT:    s_mul_i32 s0, s0, s1
-; GFX9-NEXT:    s_sext_i32_i16 s0, s0
-; GFX9-NEXT:    ; return to shader part epilog
+; GCN-LABEL: s_mul_i16_signext:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_mul_i32 s0, s0, s1
+; GCN-NEXT:    s_sext_i32_i16 s0, s0
+; GCN-NEXT:    ; return to shader part epilog
 ;
 ; GFX10PLUS-LABEL: s_mul_i16_signext:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX10PLUS-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GFX10PLUS-NEXT:    s_mul_i32 s0, s0, s1
 ; GFX10PLUS-NEXT:    s_sext_i32_i16 s0, s0
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-LABEL: s_mul_i16_signext:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX12-NEXT:    s_and_b32 s1, s1, 0xffff
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_mul_i32 s0, s0, s1
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_sext_i32_i16 s0, s0
 ; GFX12-NEXT:    ; return to shader part epilog
   %result = mul i16 %num, %den
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll
index bac80f0777c024..fc852aa416cd7d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll
@@ -40,30 +40,14 @@ define i8 @v_sext_inreg_i8_7(i8 %value) {
 }
 
 define amdgpu_ps i8 @s_sext_inreg_i8(i8 inreg %value) {
-; GFX6-LABEL: s_sext_inreg_i8:
-; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_bfe_i32 s0, s0, 0x50000
-; GFX6-NEXT:    ; return to shader part epilog
-;
-; GFX8-LABEL: s_sext_inreg_i8:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_lshl_b32 s0, s0, 3
-; GFX8-NEXT:    s_sext_i32_i8 s0, s0
-; GFX8-NEXT:    s_ashr_i32 s0, s0, 3
-; GFX8-NEXT:    ; return to shader part epilog
-;
-; GFX9-LABEL: s_sext_inreg_i8:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_lshl_b32 s0, s0, 3
-; GFX9-NEXT:    s_sext_i32_i8 s0, s0
-; GFX9-NEXT:    s_ashr_i32 s0, s0, 3
-; GFX9-NEXT:    ; return to shader part epilog
+; GCN-LABEL: s_sext_inreg_i8:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_bfe_i32 s0, s0, 0x50000
+; GCN-NEXT:    ; return to shader part epilog
 ;
 ; GFX10PLUS-LABEL: s_sext_inreg_i8:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_lshl_b32 s0, s0, 3
-; GFX10PLUS-NEXT:    s_sext_i32_i8 s0, s0
-; GFX10PLUS-NEXT:    s_ashr_i32 s0, s0, 3
+; GFX10PLUS-NEXT:    s_bfe_i32 s0, s0, 0x50000
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
   %shl = shl i8 %value, 3
   %ashr = ashr i8 %shl, 3
@@ -71,30 +55,14 @@ define amdgpu_ps i8 @s_sext_inreg_i8(i8 inreg %value) {
 }
 
 define amdgpu_ps i8 @s_sext_inreg_i8_6(i8 inreg %value) {
-; GFX6-LABEL: s_sext_inreg_i8_6:
-; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_bfe_i32 s0, s0, 0x20000
-; GFX6-NEXT:    ; return to shader part epilog
-;
-; GFX8-LABEL: s_sext_inreg_i8_6:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_lshl_b32 s0, s0, 6
-; GFX8-NEXT:    s_sext_i32_i8 s0, s0
-; GFX8-NEXT:    s_ashr_i32 s0, s0, 6
-; GFX8-NEXT:    ; return to shader part epilog
-;
-; GFX9-LABEL: s_sext_inreg_i8_6:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_lshl_b32 s0, s0, 6
-; GFX9-NEXT:    s_sext_i32_i8 s0, s0
-; GFX9-NEXT:    s_ashr_i32 s0, s0, 6
-; GFX9-NEXT:    ; return to shader part epilog
+; GCN-LABEL: s_sext_inreg_i8_6:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_bfe_i32 s0, s0, 0x20000
+; GCN-NEXT:    ; return to shader part epilog
 ;
 ; GFX10PLUS-LABEL: s_sext_inreg_i8_6:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_lshl_b32 s0, s0, 6
-; GFX10PLUS-NEXT:    s_sext_i32_i8 s0, s0
-; GFX10PLUS-NEXT:    s_ashr_i32 s0, s0, 6
+; GFX10PLUS-NEXT:    s_bfe_i32 s0, s0, 0x20000
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
   %shl = shl i8 %value, 6
   %ashr = ashr i8 %shl, 6
@@ -585,16 +553,12 @@ define amdgpu_ps i16 @s_sext_inreg_i16_9(i16 inreg %value) {
 ;
 ; GFX9-LABEL: s_sext_inreg_i16_9:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_lshl_b32 s0, s0, 9
-; GFX9-NEXT:    s_sext_i32_i16 s0, s0
-; GFX9-NEXT:    s_ashr_i32 s0, s0, 9
+; GFX9-NEXT:    s_bfe_i32 s0, s0, 0x70000
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10PLUS-LABEL: s_sext_inreg_i16_9:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_lshl_b32 s0, s0, 9
-; GFX10PLUS-NEXT:    s_sext_i32_i16 s0, s0
-; GFX10PLUS-NEXT:    s_ashr_i32 s0, s0, 9
+; GFX10PLUS-NEXT:    s_bfe_i32 s0, s0, 0x70000
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
   %shl = shl i16 %value, 9
   %ashr = ashr i16 %shl, 9
@@ -616,16 +580,12 @@ define amdgpu_ps i16 @s_sext_inreg_i16_15(i16 inreg %value) {
 ;
 ; GFX9-LABEL: s_sext_inreg_i16_15:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_lshl_b32 s0, s0, 15
-; GFX9-NEXT:    s_sext_i32_i16 s0, s0
-; GFX9-NEXT:    s_ashr_i32 s0, s0, 15
+; GFX9-NEXT:    s_bfe_i32 s0, s0, 0x10000
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10PLUS-LABEL: s_sext_inreg_i16_15:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_lshl_b32 s0, s0, 15
-; GFX10PLUS-NEXT:    s_sext_i32_i16 s0, s0
-; GFX10PLUS-NEXT:    s_ashr_i32 s0, s0, 15
+; GFX10PLUS-NEXT:    s_bfe_i32 s0, s0, 0x10000
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
   %shl = shl i16 %value, 15
   %ashr = ashr i16 %shl, 15
@@ -720,15 +680,16 @@ define amdgpu_ps i32 @s_sext_inreg_v2i16_11(<2 x i16> inreg %value) {
 ; GFX8-LABEL: s_sext_inreg_v2i16_11:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
-; GFX8-NEXT:    s_lshl_b32 s0, s0, 11
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 11
-; GFX8-NEXT:    s_sext_i32_i16 s0, s0
+; GFX8-NEXT:    s_lshl_b32 s0, s0, 11
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s1
-; GFX8-NEXT:    s_ashr_i32 s0, s0, 11
+; GFX8-NEXT:    s_sext_i32_i16 s0, s0
 ; GFX8-NEXT:    s_ashr_i32 s1, s1, 11
+; GFX8-NEXT:    s_ashr_i32 s0, s0, 11
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX8-NEXT:    s_or_b32 s0, s1, s0
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_sext_inreg_v2i16_11:
@@ -854,25 +815,27 @@ define amdgpu_ps <2 x i32> @s_sext_inreg_v4i16_14(<4 x i16> inreg %value) {
 ; GFX8-LABEL: s_sext_inreg_v4i16_14:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX8-NEXT:    s_lshl_b32 s2, s2, 14
 ; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, 14
-; GFX8-NEXT:    s_lshl_b32 s2, s2, 14
-; GFX8-NEXT:    s_lshl_b32 s1, s1, 14
-; GFX8-NEXT:    s_lshl_b32 s3, s3, 14
-; GFX8-NEXT:    s_sext_i32_i16 s0, s0
 ; GFX8-NEXT:    s_sext_i32_i16 s2, s2
-; GFX8-NEXT:    s_sext_i32_i16 s1, s1
-; GFX8-NEXT:    s_sext_i32_i16 s3, s3
-; GFX8-NEXT:    s_ashr_i32 s0, s0, 14
+; GFX8-NEXT:    s_sext_i32_i16 s0, s0
 ; GFX8-NEXT:    s_ashr_i32 s2, s2, 14
-; GFX8-NEXT:    s_ashr_i32 s1, s1, 14
+; GFX8-NEXT:    s_lshl_b32 s3, s3, 14
+; GFX8-NEXT:    s_ashr_i32 s0, s0, 14
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 14
+; GFX8-NEXT:    s_sext_i32_i16 s3, s3
+; GFX8-NEXT:    s_and_b32 s2, 0xffff, s2
+; GFX8-NEXT:    s_sext_i32_i16 s1, s1
 ; GFX8-NEXT:    s_ashr_i32 s3, s3, 14
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX8-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX8-NEXT:    s_ashr_i32 s1, s1, 14
+; GFX8-NEXT:    s_or_b32 s0, s0, s2
+; GFX8-NEXT:    s_and_b32 s2, 0xffff, s3
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX8-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX8-NEXT:    s_or_b32 s0, s2, s0
-; GFX8-NEXT:    s_lshl_b32 s2, s3, 16
-; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff
-; GFX8-NEXT:    s_or_b32 s1, s2, s1
+; GFX8-NEXT:    s_or_b32 s1, s1, s2
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_sext_inreg_v4i16_14:
@@ -1068,45 +1031,49 @@ define amdgpu_ps <4 x i32> @s_sext_inreg_v8i16_5(<8 x i16> inreg %value) {
 ; GFX8-LABEL: s_sext_inreg_v8i16_5:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_lshr_b32 s4, s0, 16
+; GFX8-NEXT:    s_lshl_b32 s4, s4, 5
 ; GFX8-NEXT:    s_lshr_b32 s5, s1, 16
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, 5
-; GFX8-NEXT:    s_lshl_b32 s4, s4, 5
+; GFX8-NEXT:    s_sext_i32_i16 s4, s4
+; GFX8-NEXT:    s_sext_i32_i16 s0, s0
+; GFX8-NEXT:    s_ashr_i32 s4, s4, 5
+; GFX8-NEXT:    s_lshl_b32 s5, s5, 5
 ; GFX8-NEXT:    s_lshr_b32 s6, s2, 16
+; GFX8-NEXT:    s_ashr_i32 s0, s0, 5
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 5
-; GFX8-NEXT:    s_lshl_b32 s5, s5, 5
-; GFX8-NEXT:    s_sext_i32_i16 s0, s0
-; GFX8-NEXT:    s_sext_i32_i16 s4, s4
+; GFX8-NEXT:    s_sext_i32_i16 s5, s5
+; GFX8-NEXT:    s_and_b32 s4, 0xffff, s4
+; GFX8-NEXT:    s_sext_i32_i16 s1, s1
+; GFX8-NEXT:    s_ashr_i32 s5, s5, 5
+; GFX8-NEXT:    s_lshl_b32 s6, s6, 5
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX8-NEXT:    s_lshl_b32 s4, s4, 16
 ; GFX8-NEXT:    s_lshr_b32 s7, s3, 16
+; GFX8-NEXT:    s_ashr_i32 s1, s1, 5
 ; GFX8-NEXT:    s_lshl_b32 s2, s2, 5
-; GFX8-NEXT:    s_lshl_b32 s6, s6, 5
-; GFX8-NEXT:    s_sext_i32_i16 s1, s1
-; GFX8-NEXT:    s_sext_i32_i16 s5, s5
-; GFX8-NEXT:    s_ashr_i32 s0, s0, 5
-; GFX8-NEXT:    s_ashr_i32 s4, s4, 5
-; GFX8-NEXT:    s_lshl_b32 s3, s3, 5
-; GFX8-NEXT:    s_lshl_b32 s7, s7, 5
-; GFX8-NEXT:    s_sext_i32_i16 s2, s2
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s6
-; GFX8-NEXT:    s_ashr_i32 s1, s1, 5
-; GFX8-NEXT:    s_ashr_i32 s5, s5, 5
+; GFX8-NEXT:    s_or_b32 s0, s0, s4
+; GFX8-NEXT:    s_and_b32 s4, 0xffff, s5
+; GFX8-NEXT:    s_sext_i32_i16 s2, s2
+; GFX8-NEXT:    s_ashr_i32 s6, s6, 5
+; GFX8-NEXT:    s_lshl_b32 s7, s7, 5
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX8-NEXT:    s_lshl_b32 s4, s4, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX8-NEXT:    s_sext_i32_i16 s3, s3
-; GFX8-NEXT:    s_sext_i32_i16 s7, s7
 ; GFX8-NEXT:    s_ashr_i32 s2, s2, 5
-; GFX8-NEXT:    s_ashr_i32 s6, s6, 5
-; GFX8-NEXT:    s_or_b32 s0, s4, s0
-; GFX8-NEXT:    s_lshl_b32 s4, s5, 16
-; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff
-; GFX8-NEXT:    s_ashr_i32 s3, s3, 5
+; GFX8-NEXT:    s_lshl_b32 s3, s3, 5
+; GFX8-NEXT:    s_sext_i32_i16 s7, s7
+; GFX8-NEXT:    s_or_b32 s1, s1, s4
+; GFX8-NEXT:    s_and_b32 s4, 0xffff, s6
+; GFX8-NEXT:    s_sext_i32_i16 s3, s3
 ; GFX8-NEXT:    s_ashr_i32 s7, s7, 5
-; GFX8-NEXT:    s_or_b32 s1, s4, s1
-; GFX8-NEXT:    s_lshl_b32 s4, s6, 16
-; GFX8-NEXT:    s_and_b32 s2, s2, 0xffff
-; GFX8-NEXT:    s_or_b32 s2, s4, s2
-; GFX8-NEXT:    s_lshl_b32 s4, s7, 16
-; GFX8-NEXT:    s_and_b32 s3, s3, 0xffff
-; GFX8-NEXT:    s_or_b32 s3, s4, s3
+; GFX8-NEXT:    s_and_b32 s2, 0xffff, s2
+; GFX8-NEXT:    s_lshl_b32 s4, s4, 16
+; GFX8-NEXT:    s_ashr_i32 s3, s3, 5
+; GFX8-NEXT:    s_or_b32 s2, s2, s4
+; GFX8-NEXT:    s_and_b32 s4, 0xffff, s7
+; GFX8-NEXT:    s_and_b32 s3, 0xffff, s3
+; GFX8-NEXT:    s_lshl_b32 s4, s4, 16
+; GFX8-NEXT:    s_or_b32 s3, s3, s4
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_sext_inreg_v8i16_5:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll
index 183f2edbf9035b..218d487aee4137 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll
@@ -637,13 +637,12 @@ define amdgpu_ps <2 x i32> @s_shl_v2i32_zext_v2i16(<2 x i16> inreg %x) {
 ;
 ; GFX8-LABEL: s_shl_v2i32_zext_v2i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_movk_i32 s2, 0x3fff
-; GFX8-NEXT:    s_mov_b32 s3, s2
+; GFX8-NEXT:    s_and_b32 s0, s0, 0x3fff3fff
 ; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX8-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, 2
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 2
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_shl_v2i32_zext_v2i16:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
index 4cf1c92539c36f..c7603b7cec04a4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
@@ -71,19 +71,22 @@ define amdgpu_ps i8 @s_shl_i8(i8 inreg %value, i8 inreg %amount) {
 ;
 ; GFX8-LABEL: s_shl_i8:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xff
+; GFX8-NEXT:    s_and_b32 s1, s1, 0xff
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, s1
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_shl_i8:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_and_b32 s0, s0, 0xff
+; GFX9-NEXT:    s_and_b32 s1, s1, 0xff
+; GFX9-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, s1
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10PLUS-LABEL: s_shl_i8:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_and_b32 s0, s0, 0xff
+; GFX10PLUS-NEXT:    s_and_b32 s1, s1, 0xff
+; GFX10PLUS-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX10PLUS-NEXT:    s_lshl_b32 s0, s0, s1
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
   %result = shl i8 %value, %amount
@@ -627,19 +630,19 @@ define amdgpu_ps i16 @s_shl_i16(i16 inreg %value, i16 inreg %amount) {
 ;
 ; GFX8-LABEL: s_shl_i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, s1
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_shl_i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX9-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, s1
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10PLUS-LABEL: s_shl_i16:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX10PLUS-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX10PLUS-NEXT:    s_lshl_b32 s0, s0, s1
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
   %result = shl i16 %value, %amount
@@ -791,13 +794,14 @@ define amdgpu_ps i32 @s_shl_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amoun
 ; GFX8-LABEL: s_shl_v2i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, s1
 ; GFX8-NEXT:    s_lshl_b32 s1, s2, s3
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX8-NEXT:    s_or_b32 s0, s1, s0
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_shl_v2i16:
@@ -976,21 +980,23 @@ define amdgpu_ps <2 x i32> @s_shl_v4i16(<4 x i16> inreg %value, <4 x i16> inreg
 ; GFX8-LABEL: s_shl_v4i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_lshr_b32 s4, s0, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX8-NEXT:    s_lshr_b32 s6, s2, 16
-; GFX8-NEXT:    s_lshr_b32 s5, s1, 16
-; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff
-; GFX8-NEXT:    s_lshr_b32 s7, s3, 16
+; GFX8-NEXT:    s_and_b32 s2, 0xffff, s2
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, s2
 ; GFX8-NEXT:    s_lshl_b32 s2, s4, s6
+; GFX8-NEXT:    s_lshr_b32 s5, s1, 16
+; GFX8-NEXT:    s_lshr_b32 s7, s3, 16
+; GFX8-NEXT:    s_and_b32 s3, 0xffff, s3
+; GFX8-NEXT:    s_and_b32 s2, 0xffff, s2
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, s3
 ; GFX8-NEXT:    s_lshl_b32 s3, s5, s7
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX8-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX8-NEXT:    s_or_b32 s0, s0, s2
+; GFX8-NEXT:    s_and_b32 s2, 0xffff, s3
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX8-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX8-NEXT:    s_or_b32 s0, s2, s0
-; GFX8-NEXT:    s_lshl_b32 s2, s3, 16
-; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff
-; GFX8-NEXT:    s_or_b32 s1, s2, s1
+; GFX8-NEXT:    s_or_b32 s1, s1, s2
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_shl_v4i16:
@@ -1157,37 +1163,41 @@ define amdgpu_ps <4 x i32> @s_shl_v8i16(<8 x i16> inreg %value, <8 x i16> inreg
 ; GFX8-LABEL: s_shl_v8i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_lshr_b32 s8, s0, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX8-NEXT:    s_lshr_b32 s12, s4, 16
-; GFX8-NEXT:    s_lshr_b32 s9, s1, 16
-; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff
-; GFX8-NEXT:    s_lshr_b32 s13, s5, 16
+; GFX8-NEXT:    s_and_b32 s4, 0xffff, s4
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, s4
 ; GFX8-NEXT:    s_lshl_b32 s4, s8, s12
-; GFX8-NEXT:    s_lshr_b32 s10, s2, 16
-; GFX8-NEXT:    s_and_b32 s2, s2, 0xffff
-; GFX8-NEXT:    s_lshr_b32 s14, s6, 16
+; GFX8-NEXT:    s_lshr_b32 s9, s1, 16
+; GFX8-NEXT:    s_lshr_b32 s13, s5, 16
+; GFX8-NEXT:    s_and_b32 s5, 0xffff, s5
+; GFX8-NEXT:    s_and_b32 s4, 0xffff, s4
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, s5
 ; GFX8-NEXT:    s_lshl_b32 s5, s9, s13
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
 ; GFX8-NEXT:    s_lshl_b32 s4, s4, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX8-NEXT:    s_lshr_b32 s11, s3, 16
-; GFX8-NEXT:    s_and_b32 s3, s3, 0xffff
-; GFX8-NEXT:    s_lshr_b32 s15, s7, 16
+; GFX8-NEXT:    s_lshr_b32 s10, s2, 16
+; GFX8-NEXT:    s_lshr_b32 s14, s6, 16
+; GFX8-NEXT:    s_and_b32 s6, 0xffff, s6
+; GFX8-NEXT:    s_or_b32 s0, s0, s4
+; GFX8-NEXT:    s_and_b32 s4, 0xffff, s5
 ; GFX8-NEXT:    s_lshl_b32 s2, s2, s6
 ; GFX8-NEXT:    s_lshl_b32 s6, s10, s14
-; GFX8-NEXT:    s_or_b32 s0, s4, s0
-; GFX8-NEXT:    s_lshl_b32 s4, s5, 16
-; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX8-NEXT:    s_lshl_b32 s4, s4, 16
+; GFX8-NEXT:    s_lshr_b32 s11, s3, 16
+; GFX8-NEXT:    s_lshr_b32 s15, s7, 16
+; GFX8-NEXT:    s_and_b32 s7, 0xffff, s7
+; GFX8-NEXT:    s_or_b32 s1, s1, s4
+; GFX8-NEXT:    s_and_b32 s4, 0xffff, s6
 ; GFX8-NEXT:    s_lshl_b32 s3, s3, s7
 ; GFX8-NEXT:    s_lshl_b32 s7, s11, s15
-; GFX8-NEXT:    s_or_b32 s1, s4, s1
-; GFX8-NEXT:    s_lshl_b32 s4, s6, 16
-; GFX8-NEXT:    s_and_b32 s2, s2, 0xffff
-; GFX8-NEXT:    s_or_b32 s2, s4, s2
-; GFX8-NEXT:    s_lshl_b32 s4, s7, 16
-; GFX8-NEXT:    s_and_b32 s3, s3, 0xffff
-; GFX8-NEXT:    s_or_b32 s3, s4, s3
+; GFX8-NEXT:    s_and_b32 s2, 0xffff, s2
+; GFX8-NEXT:    s_lshl_b32 s4, s4, 16
+; GFX8-NEXT:    s_or_b32 s2, s2, s4
+; GFX8-NEXT:    s_and_b32 s4, 0xffff, s7
+; GFX8-NEXT:    s_and_b32 s3, 0xffff, s3
+; GFX8-NEXT:    s_lshl_b32 s4, s4, 16
+; GFX8-NEXT:    s_or_b32 s3, s3, s4
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_shl_v8i16:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sub.v2i16.ll
index 855687281ce9ab..49ba01aaf9e4fb 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sub.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sub.v2i16.ll
@@ -244,12 +244,12 @@ define amdgpu_ps i32 @s_sub_v2i16_neg_inline_imm_splat(<2 x i16> inreg %a) {
 ; GFX8-LABEL: s_sub_v2i16_neg_inline_imm_splat:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX8-NEXT:    s_sub_i32 s0, s0, 0xffc0
-; GFX8-NEXT:    s_sub_i32 s1, s1, 0xffc0
+; GFX8-NEXT:    s_sub_i32 s1, s1, 0xffffffc0
+; GFX8-NEXT:    s_sub_i32 s0, s0, 0xffffffc0
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX8-NEXT:    s_or_b32 s0, s1, s0
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_sub_v2i16_neg_inline_imm_splat:
@@ -284,12 +284,12 @@ define amdgpu_ps i32 @s_sub_v2i16_neg_inline_imm_lo(<2 x i16> inreg %a) {
 ; GFX8-LABEL: s_sub_v2i16_neg_inline_imm_lo:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX8-NEXT:    s_sub_i32 s0, s0, 0xffc0
 ; GFX8-NEXT:    s_sub_i32 s1, s1, 4
+; GFX8-NEXT:    s_sub_i32 s0, s0, 0xffffffc0
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX8-NEXT:    s_or_b32 s0, s1, s0
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_sub_v2i16_neg_inline_imm_lo:
@@ -324,12 +324,12 @@ define amdgpu_ps i32 @s_sub_v2i16_neg_inline_imm_hi(<2 x i16> inreg %a) {
 ; GFX8-LABEL: s_sub_v2i16_neg_inline_imm_hi:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX8-NEXT:    s_sub_i32 s1, s1, 0xffffffc0
 ; GFX8-NEXT:    s_sub_i32 s0, s0, 4
-; GFX8-NEXT:    s_sub_i32 s1, s1, 0xffc0
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX8-NEXT:    s_or_b32 s0, s1, s0
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_sub_v2i16_neg_inline_imm_hi:
@@ -365,14 +365,13 @@ define amdgpu_ps i32 @s_sub_v2i16(<2 x i16> inreg %a, <2 x i16> inreg %b) {
 ; GFX8-LABEL: s_sub_v2i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
-; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GFX8-NEXT:    s_sub_i32 s0, s0, s1
 ; GFX8-NEXT:    s_sub_i32 s1, s2, s3
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX8-NEXT:    s_or_b32 s0, s1, s0
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_sub_v2i16:
@@ -412,14 +411,13 @@ define amdgpu_ps i32 @s_sub_v2i16_fneg_lhs(<2 x half> inreg %a, <2 x i16> inreg
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_xor_b32 s0, s0, 0x80008000
 ; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
-; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GFX8-NEXT:    s_sub_i32 s0, s0, s1
 ; GFX8-NEXT:    s_sub_i32 s1, s2, s3
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX8-NEXT:    s_or_b32 s0, s1, s0
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_sub_v2i16_fneg_lhs:
@@ -463,14 +461,13 @@ define amdgpu_ps i32 @s_sub_v2i16_fneg_rhs(<2 x i16> inreg %a, <2 x half> inreg
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_xor_b32 s1, s1, 0x80008000
 ; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
-; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GFX8-NEXT:    s_sub_i32 s0, s0, s1
 ; GFX8-NEXT:    s_sub_i32 s1, s2, s3
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX8-NEXT:    s_or_b32 s0, s1, s0
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_sub_v2i16_fneg_rhs:
@@ -516,14 +513,13 @@ define amdgpu_ps i32 @s_sub_v2i16_fneg_lhs_fneg_rhs(<2 x half> inreg %a, <2 x ha
 ; GFX8-NEXT:    s_xor_b32 s0, s0, 0x80008000
 ; GFX8-NEXT:    s_xor_b32 s1, s1, 0x80008000
 ; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
-; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GFX8-NEXT:    s_sub_i32 s0, s0, s1
 ; GFX8-NEXT:    s_sub_i32 s1, s2, s3
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX8-NEXT:    s_or_b32 s0, s1, s0
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_sub_v2i16_fneg_lhs_fneg_rhs:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll
index 6bb4e2d3dbe26e..7c9d8cba0fbb27 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll
@@ -35,15 +35,8 @@ define amdgpu_ps i32 @scalar_xnor_v2i16_one_use(<2 x i16> inreg %a, <2 x i16> in
 ;
 ; GFX8-LABEL: scalar_xnor_v2i16_one_use:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_mov_b32 s2, 0xffff
 ; GFX8-NEXT:    s_xor_b32 s0, s0, s1
-; GFX8-NEXT:    s_mov_b32 s3, s2
-; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX8-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
-; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX8-NEXT:    s_or_b32 s0, s1, s0
+; GFX8-NEXT:    s_xor_b32 s0, s0, -1
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX900-LABEL: scalar_xnor_v2i16_one_use:
@@ -129,21 +122,10 @@ define amdgpu_ps i64 @scalar_xnor_v4i16_one_use(<4 x i16> inreg %a, <4 x i16> in
 ;
 ; GFX8-LABEL: scalar_xnor_v4i16_one_use:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_mov_b32 s4, 0xffff
-; GFX8-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
+; GFX8-NEXT:    s_mov_b32 s4, -1
 ; GFX8-NEXT:    s_mov_b32 s5, s4
-; GFX8-NEXT:    s_lshr_b32 s3, s0, 16
-; GFX8-NEXT:    s_and_b32 s2, s0, 0xffff
-; GFX8-NEXT:    s_lshr_b32 s7, s1, 16
-; GFX8-NEXT:    s_and_b32 s6, s1, 0xffff
-; GFX8-NEXT:    s_xor_b64 s[0:1], s[2:3], s[4:5]
-; GFX8-NEXT:    s_xor_b64 s[2:3], s[6:7], s[4:5]
-; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX8-NEXT:    s_or_b32 s0, s1, s0
-; GFX8-NEXT:    s_lshl_b32 s1, s3, 16
-; GFX8-NEXT:    s_and_b32 s2, s2, 0xffff
-; GFX8-NEXT:    s_or_b32 s1, s1, s2
+; GFX8-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
+; GFX8-NEXT:    s_xor_b64 s[0:1], s[0:1], s[4:5]
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX900-LABEL: scalar_xnor_v4i16_one_use:
diff --git a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
index 4cc384e9d27188..bcd75255acef44 100644
--- a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
@@ -103,13 +103,13 @@ define amdgpu_kernel void @s_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshr_b32 s1, s2, 16
-; VI-NEXT:    s_lshr_b32 s3, s0, 16
+; VI-NEXT:    s_add_i32 s1, s2, s0
+; VI-NEXT:    s_lshr_b32 s0, s0, 16
+; VI-NEXT:    s_lshr_b32 s2, s2, 16
 ; VI-NEXT:    s_add_i32 s2, s2, s0
-; VI-NEXT:    s_add_i32 s1, s1, s3
-; VI-NEXT:    s_and_b32 s0, s2, 0xffff
-; VI-NEXT:    s_lshl_b32 s1, s1, 16
-; VI-NEXT:    s_or_b32 s0, s0, s1
+; VI-NEXT:    s_and_b32 s1, s1, 0xffff
+; VI-NEXT:    s_lshl_b32 s0, s2, 16
+; VI-NEXT:    s_or_b32 s0, s1, s0
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
@@ -170,16 +170,15 @@ define amdgpu_kernel void @s_test_add_self_v2i16(ptr addrspace(1) %out, ptr addr
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
-; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshr_b32 s0, s2, 16
-; VI-NEXT:    s_and_b32 s1, s2, 0xffff
+; VI-NEXT:    s_lshr_b32 s1, s2, 16
+; VI-NEXT:    s_add_i32 s0, s2, s2
 ; VI-NEXT:    s_add_i32 s1, s1, s1
-; VI-NEXT:    s_add_i32 s0, s0, s0
-; VI-NEXT:    s_lshl_b32 s0, s0, 16
-; VI-NEXT:    s_and_b32 s1, s1, 0xffff
-; VI-NEXT:    s_or_b32 s0, s1, s0
+; VI-NEXT:    s_and_b32 s0, s0, 0xffff
+; VI-NEXT:    s_lshl_b32 s1, s1, 16
+; VI-NEXT:    s_or_b32 s0, s0, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
@@ -230,12 +229,12 @@ define amdgpu_kernel void @s_test_add_v2i16_kernarg(ptr addrspace(1) %out, <2 x
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshr_b32 s4, s2, 16
-; VI-NEXT:    s_lshr_b32 s5, s3, 16
+; VI-NEXT:    s_lshr_b32 s4, s3, 16
+; VI-NEXT:    s_lshr_b32 s5, s2, 16
 ; VI-NEXT:    s_add_i32 s2, s2, s3
-; VI-NEXT:    s_add_i32 s4, s4, s5
+; VI-NEXT:    s_add_i32 s5, s5, s4
 ; VI-NEXT:    s_and_b32 s2, s2, 0xffff
-; VI-NEXT:    s_lshl_b32 s3, s4, 16
+; VI-NEXT:    s_lshl_b32 s3, s5, 16
 ; VI-NEXT:    s_or_b32 s2, s2, s3
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
diff --git a/llvm/test/CodeGen/AMDGPU/anyext.ll b/llvm/test/CodeGen/AMDGPU/anyext.ll
index 8b6c8be9f37882..115cb40676da8c 100644
--- a/llvm/test/CodeGen/AMDGPU/anyext.ll
+++ b/llvm/test/CodeGen/AMDGPU/anyext.ll
@@ -27,11 +27,9 @@ define amdgpu_kernel void @anyext_i1_i32(ptr addrspace(1) %out, i32 %cond) #0 {
 ; GFX8-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX8-NEXT:    s_mov_b32 s2, -1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_cmp_eq_u32 s4, 0
+; GFX8-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX8-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; GFX8-NEXT:    v_not_b32_e32 v0, v0
-; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -42,11 +40,9 @@ define amdgpu_kernel void @anyext_i1_i32(ptr addrspace(1) %out, i32 %cond) #0 {
 ; GFX9-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s2, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_cmp_eq_u32 s4, 0
+; GFX9-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX9-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; GFX9-NEXT:    v_not_b32_e32 v0, v0
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/bitreverse.ll b/llvm/test/CodeGen/AMDGPU/bitreverse.ll
index 6f52da2631b8a6..89735592cfa8a2 100644
--- a/llvm/test/CodeGen/AMDGPU/bitreverse.ll
+++ b/llvm/test/CodeGen/AMDGPU/bitreverse.ll
@@ -50,8 +50,7 @@ define amdgpu_kernel void @s_brev_i16(ptr addrspace(1) noalias %out, i16 %val) #
 ; GISEL-NEXT:    s_load_dword s4, s[2:3], 0x2c
 ; GISEL-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
 ; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-NEXT:    s_and_b32 s2, s4, 0xffff
-; GISEL-NEXT:    s_brev_b32 s2, s2
+; GISEL-NEXT:    s_brev_b32 s2, s4
 ; GISEL-NEXT:    s_lshr_b32 s2, s2, 16
 ; GISEL-NEXT:    v_mov_b32_e32 v0, s0
 ; GISEL-NEXT:    v_mov_b32_e32 v2, s2
@@ -80,11 +79,9 @@ define amdgpu_kernel void @s_brev_i16(ptr addrspace(1) noalias %out, i16 %val) #
 ; GFX11-GISEL-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_and_b32 s2, s4, 0xffff
+; GFX11-GISEL-NEXT:    s_brev_b32 s2, s4
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT:    s_brev_b32 s2, s2
 ; GFX11-GISEL-NEXT:    s_lshr_b32 s2, s2, 16
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-GISEL-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX11-GISEL-NEXT:    s_nop 0
diff --git a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
index 29770738f83d57..e9ddc801b050c5 100644
--- a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
+++ b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
@@ -980,7 +980,6 @@ define amdgpu_ps void @ps_mesa_inreg_i16(i16 inreg %arg0) {
 ;
 ; VI-LABEL: ps_mesa_inreg_i16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_and_b32 s0, 0xffff, s0
 ; VI-NEXT:    s_add_i32 s0, s0, s0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    flat_store_short v[0:1], v0
@@ -988,9 +987,8 @@ define amdgpu_ps void @ps_mesa_inreg_i16(i16 inreg %arg0) {
 ;
 ; GFX11-LABEL: ps_mesa_inreg_i16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_and_b32 s0, 0xffff, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_add_i32 s0, s0, s0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-NEXT:    global_store_b16 v[0:1], v0, off
 ; GFX11-NEXT:    s_nop 0
@@ -1140,20 +1138,20 @@ define amdgpu_kernel void @amd_kernel_v4i8(<4 x i8> %arg0) {
 ; VI-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshr_b32 s2, s0, 16
-; VI-NEXT:    s_lshr_b32 s1, s0, 24
-; VI-NEXT:    s_add_i32 s2, s2, s2
+; VI-NEXT:    s_lshr_b32 s1, s0, 16
+; VI-NEXT:    s_lshr_b32 s2, s0, 24
 ; VI-NEXT:    s_bfe_u32 s3, s0, 0x80008
+; VI-NEXT:    s_add_i32 s2, s2, s2
 ; VI-NEXT:    s_add_i32 s1, s1, s1
-; VI-NEXT:    s_and_b32 s2, s2, 0xff
-; VI-NEXT:    s_add_i32 s3, s3, s3
+; VI-NEXT:    s_lshl_b32 s2, s2, 8
+; VI-NEXT:    s_and_b32 s1, s1, 0xff
 ; VI-NEXT:    s_add_i32 s0, s0, s0
-; VI-NEXT:    s_lshl_b32 s1, s1, 24
-; VI-NEXT:    s_lshl_b32 s2, s2, 16
+; VI-NEXT:    s_add_i32 s3, s3, s3
 ; VI-NEXT:    s_or_b32 s1, s1, s2
 ; VI-NEXT:    s_and_b32 s0, s0, 0xff
 ; VI-NEXT:    s_lshl_b32 s2, s3, 8
 ; VI-NEXT:    s_or_b32 s0, s0, s2
+; VI-NEXT:    s_lshl_b32 s1, s1, 16
 ; VI-NEXT:    s_and_b32 s0, s0, 0xffff
 ; VI-NEXT:    s_or_b32 s0, s0, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
@@ -1227,8 +1225,8 @@ define amdgpu_kernel void @amd_kernel_v3i8(<3 x i8> %arg0) {
 ; VI-NEXT:    s_lshr_b32 s1, s0, 16
 ; VI-NEXT:    s_bfe_u32 s2, s0, 0x80008
 ; VI-NEXT:    s_add_i32 s0, s0, s0
-; VI-NEXT:    s_add_i32 s1, s1, s1
 ; VI-NEXT:    s_add_i32 s2, s2, s2
+; VI-NEXT:    s_add_i32 s1, s1, s1
 ; VI-NEXT:    s_and_b32 s0, s0, 0xff
 ; VI-NEXT:    s_lshl_b32 s2, s2, 8
 ; VI-NEXT:    v_mov_b32_e32 v2, s1
@@ -1308,22 +1306,21 @@ define amdgpu_kernel void @amd_kernel_v5i8(<5 x i8> %arg0) {
 ; VI-NEXT:    v_mov_b32_e32 v0, 4
 ; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshr_b32 s3, s0, 16
-; VI-NEXT:    s_lshr_b32 s2, s0, 24
-; VI-NEXT:    s_add_i32 s3, s3, s3
+; VI-NEXT:    s_lshr_b32 s2, s0, 16
+; VI-NEXT:    s_lshr_b32 s3, s0, 24
 ; VI-NEXT:    s_bfe_u32 s4, s0, 0x80008
+; VI-NEXT:    s_add_i32 s3, s3, s3
 ; VI-NEXT:    s_add_i32 s2, s2, s2
-; VI-NEXT:    s_and_b32 s3, s3, 0xff
-; VI-NEXT:    s_add_i32 s4, s4, s4
+; VI-NEXT:    s_lshl_b32 s3, s3, 8
+; VI-NEXT:    s_and_b32 s2, s2, 0xff
 ; VI-NEXT:    s_add_i32 s0, s0, s0
-; VI-NEXT:    s_lshl_b32 s2, s2, 24
-; VI-NEXT:    s_lshl_b32 s3, s3, 16
-; VI-NEXT:    s_and_b32 s1, s1, 0xff
+; VI-NEXT:    s_add_i32 s4, s4, s4
 ; VI-NEXT:    s_or_b32 s2, s2, s3
 ; VI-NEXT:    s_and_b32 s0, s0, 0xff
 ; VI-NEXT:    s_lshl_b32 s3, s4, 8
-; VI-NEXT:    s_add_i32 s1, s1, s1
 ; VI-NEXT:    s_or_b32 s0, s0, s3
+; VI-NEXT:    s_add_i32 s1, s1, s1
+; VI-NEXT:    s_lshl_b32 s2, s2, 16
 ; VI-NEXT:    s_and_b32 s0, s0, 0xffff
 ; VI-NEXT:    v_mov_b32_e32 v2, s1
 ; VI-NEXT:    s_or_b32 s0, s0, s2
@@ -1423,37 +1420,37 @@ define amdgpu_kernel void @amd_kernel_v8i8(<8 x i8> %arg0) {
 ; VI-NEXT:    v_mov_b32_e32 v2, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshr_b32 s3, s1, 16
-; VI-NEXT:    s_lshr_b32 s2, s1, 24
+; VI-NEXT:    s_lshr_b32 s2, s0, 16
+; VI-NEXT:    s_lshr_b32 s3, s0, 24
+; VI-NEXT:    s_lshr_b32 s4, s1, 16
+; VI-NEXT:    s_lshr_b32 s5, s1, 24
+; VI-NEXT:    s_bfe_u32 s6, s0, 0x80008
+; VI-NEXT:    s_bfe_u32 s7, s1, 0x80008
+; VI-NEXT:    s_add_i32 s5, s5, s5
+; VI-NEXT:    s_add_i32 s4, s4, s4
 ; VI-NEXT:    s_add_i32 s3, s3, s3
-; VI-NEXT:    s_bfe_u32 s6, s1, 0x80008
 ; VI-NEXT:    s_add_i32 s2, s2, s2
-; VI-NEXT:    s_and_b32 s3, s3, 0xff
-; VI-NEXT:    s_add_i32 s6, s6, s6
+; VI-NEXT:    s_lshl_b32 s5, s5, 8
+; VI-NEXT:    s_and_b32 s4, s4, 0xff
 ; VI-NEXT:    s_add_i32 s1, s1, s1
-; VI-NEXT:    s_lshl_b32 s2, s2, 24
-; VI-NEXT:    s_lshl_b32 s3, s3, 16
-; VI-NEXT:    s_lshr_b32 s5, s0, 16
-; VI-NEXT:    s_or_b32 s2, s2, s3
-; VI-NEXT:    s_and_b32 s1, s1, 0xff
-; VI-NEXT:    s_lshl_b32 s3, s6, 8
-; VI-NEXT:    s_lshr_b32 s4, s0, 24
-; VI-NEXT:    s_add_i32 s5, s5, s5
-; VI-NEXT:    s_or_b32 s1, s1, s3
-; VI-NEXT:    s_bfe_u32 s7, s0, 0x80008
-; VI-NEXT:    s_add_i32 s4, s4, s4
-; VI-NEXT:    s_and_b32 s1, s1, 0xffff
-; VI-NEXT:    s_and_b32 s3, s5, 0xff
 ; VI-NEXT:    s_add_i32 s7, s7, s7
+; VI-NEXT:    s_lshl_b32 s3, s3, 8
+; VI-NEXT:    s_and_b32 s2, s2, 0xff
 ; VI-NEXT:    s_add_i32 s0, s0, s0
-; VI-NEXT:    s_or_b32 s1, s1, s2
-; VI-NEXT:    s_lshl_b32 s2, s4, 24
-; VI-NEXT:    s_lshl_b32 s3, s3, 16
+; VI-NEXT:    s_add_i32 s6, s6, s6
+; VI-NEXT:    s_or_b32 s4, s4, s5
+; VI-NEXT:    s_and_b32 s1, s1, 0xff
+; VI-NEXT:    s_lshl_b32 s5, s7, 8
 ; VI-NEXT:    s_or_b32 s2, s2, s3
 ; VI-NEXT:    s_and_b32 s0, s0, 0xff
-; VI-NEXT:    s_lshl_b32 s3, s7, 8
+; VI-NEXT:    s_lshl_b32 s3, s6, 8
+; VI-NEXT:    s_or_b32 s1, s1, s5
 ; VI-NEXT:    s_or_b32 s0, s0, s3
+; VI-NEXT:    s_lshl_b32 s4, s4, 16
+; VI-NEXT:    s_and_b32 s1, s1, 0xffff
+; VI-NEXT:    s_lshl_b32 s2, s2, 16
 ; VI-NEXT:    s_and_b32 s0, s0, 0xffff
+; VI-NEXT:    s_or_b32 s1, s1, s4
 ; VI-NEXT:    s_or_b32 s0, s0, s2
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
@@ -1595,69 +1592,69 @@ define amdgpu_kernel void @amd_kernel_v16i8(<16 x i8> %arg0) {
 ; VI-NEXT:    v_mov_b32_e32 v4, 0
 ; VI-NEXT:    v_mov_b32_e32 v5, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshr_b32 s5, s3, 16
-; VI-NEXT:    s_lshr_b32 s4, s3, 24
+; VI-NEXT:    s_lshr_b32 s4, s0, 16
+; VI-NEXT:    s_lshr_b32 s5, s0, 24
+; VI-NEXT:    s_lshr_b32 s6, s1, 16
+; VI-NEXT:    s_lshr_b32 s7, s1, 24
+; VI-NEXT:    s_lshr_b32 s8, s2, 16
+; VI-NEXT:    s_lshr_b32 s9, s2, 24
+; VI-NEXT:    s_lshr_b32 s10, s3, 16
+; VI-NEXT:    s_lshr_b32 s11, s3, 24
+; VI-NEXT:    s_bfe_u32 s12, s0, 0x80008
+; VI-NEXT:    s_bfe_u32 s13, s1, 0x80008
+; VI-NEXT:    s_bfe_u32 s14, s2, 0x80008
+; VI-NEXT:    s_bfe_u32 s15, s3, 0x80008
+; VI-NEXT:    s_add_i32 s11, s11, s11
+; VI-NEXT:    s_add_i32 s10, s10, s10
+; VI-NEXT:    s_add_i32 s9, s9, s9
+; VI-NEXT:    s_add_i32 s8, s8, s8
+; VI-NEXT:    s_add_i32 s7, s7, s7
+; VI-NEXT:    s_add_i32 s6, s6, s6
 ; VI-NEXT:    s_add_i32 s5, s5, s5
-; VI-NEXT:    s_bfe_u32 s12, s3, 0x80008
 ; VI-NEXT:    s_add_i32 s4, s4, s4
-; VI-NEXT:    s_and_b32 s5, s5, 0xff
-; VI-NEXT:    s_add_i32 s12, s12, s12
+; VI-NEXT:    s_lshl_b32 s11, s11, 8
+; VI-NEXT:    s_and_b32 s10, s10, 0xff
 ; VI-NEXT:    s_add_i32 s3, s3, s3
-; VI-NEXT:    s_lshl_b32 s4, s4, 24
-; VI-NEXT:    s_lshl_b32 s5, s5, 16
-; VI-NEXT:    s_lshr_b32 s7, s2, 16
-; VI-NEXT:    s_or_b32 s4, s4, s5
-; VI-NEXT:    s_and_b32 s3, s3, 0xff
-; VI-NEXT:    s_lshl_b32 s5, s12, 8
-; VI-NEXT:    s_lshr_b32 s6, s2, 24
-; VI-NEXT:    s_add_i32 s7, s7, s7
-; VI-NEXT:    s_or_b32 s3, s3, s5
-; VI-NEXT:    s_bfe_u32 s13, s2, 0x80008
-; VI-NEXT:    s_add_i32 s6, s6, s6
-; VI-NEXT:    s_and_b32 s3, s3, 0xffff
-; VI-NEXT:    s_and_b32 s5, s7, 0xff
-; VI-NEXT:    s_add_i32 s13, s13, s13
+; VI-NEXT:    s_add_i32 s15, s15, s15
+; VI-NEXT:    s_lshl_b32 s9, s9, 8
+; VI-NEXT:    s_and_b32 s8, s8, 0xff
 ; VI-NEXT:    s_add_i32 s2, s2, s2
-; VI-NEXT:    s_or_b32 s3, s3, s4
-; VI-NEXT:    s_lshl_b32 s4, s6, 24
-; VI-NEXT:    s_lshl_b32 s5, s5, 16
-; VI-NEXT:    s_lshr_b32 s9, s1, 16
-; VI-NEXT:    s_or_b32 s4, s4, s5
-; VI-NEXT:    s_and_b32 s2, s2, 0xff
-; VI-NEXT:    s_lshl_b32 s5, s13, 8
-; VI-NEXT:    s_lshr_b32 s8, s1, 24
-; VI-NEXT:    s_add_i32 s9, s9, s9
-; VI-NEXT:    s_or_b32 s2, s2, s5
-; VI-NEXT:    s_bfe_u32 s14, s1, 0x80008
-; VI-NEXT:    s_add_i32 s8, s8, s8
-; VI-NEXT:    s_and_b32 s2, s2, 0xffff
-; VI-NEXT:    s_and_b32 s5, s9, 0xff
 ; VI-NEXT:    s_add_i32 s14, s14, s14
+; VI-NEXT:    s_lshl_b32 s7, s7, 8
+; VI-NEXT:    s_and_b32 s6, s6, 0xff
 ; VI-NEXT:    s_add_i32 s1, s1, s1
-; VI-NEXT:    s_or_b32 s2, s2, s4
-; VI-NEXT:    s_lshl_b32 s4, s8, 24
-; VI-NEXT:    s_lshl_b32 s5, s5, 16
-; VI-NEXT:    s_lshr_b32 s11, s0, 16
-; VI-NEXT:    s_or_b32 s4, s4, s5
-; VI-NEXT:    s_and_b32 s1, s1, 0xff
-; VI-NEXT:    s_lshl_b32 s5, s14, 8
-; VI-NEXT:    s_lshr_b32 s10, s0, 24
-; VI-NEXT:    s_add_i32 s11, s11, s11
-; VI-NEXT:    s_or_b32 s1, s1, s5
-; VI-NEXT:    s_bfe_u32 s15, s0, 0x80008
-; VI-NEXT:    s_add_i32 s10, s10, s10
-; VI-NEXT:    s_and_b32 s1, s1, 0xffff
-; VI-NEXT:    s_and_b32 s5, s11, 0xff
-; VI-NEXT:    s_add_i32 s15, s15, s15
+; VI-NEXT:    s_add_i32 s13, s13, s13
+; VI-NEXT:    s_lshl_b32 s5, s5, 8
+; VI-NEXT:    s_and_b32 s4, s4, 0xff
 ; VI-NEXT:    s_add_i32 s0, s0, s0
-; VI-NEXT:    s_or_b32 s1, s1, s4
-; VI-NEXT:    s_lshl_b32 s4, s10, 24
-; VI-NEXT:    s_lshl_b32 s5, s5, 16
+; VI-NEXT:    s_add_i32 s12, s12, s12
+; VI-NEXT:    s_or_b32 s10, s10, s11
+; VI-NEXT:    s_and_b32 s3, s3, 0xff
+; VI-NEXT:    s_lshl_b32 s11, s15, 8
+; VI-NEXT:    s_or_b32 s8, s8, s9
+; VI-NEXT:    s_and_b32 s2, s2, 0xff
+; VI-NEXT:    s_lshl_b32 s9, s14, 8
+; VI-NEXT:    s_or_b32 s6, s6, s7
+; VI-NEXT:    s_and_b32 s1, s1, 0xff
+; VI-NEXT:    s_lshl_b32 s7, s13, 8
 ; VI-NEXT:    s_or_b32 s4, s4, s5
 ; VI-NEXT:    s_and_b32 s0, s0, 0xff
-; VI-NEXT:    s_lshl_b32 s5, s15, 8
+; VI-NEXT:    s_lshl_b32 s5, s12, 8
+; VI-NEXT:    s_or_b32 s3, s3, s11
+; VI-NEXT:    s_or_b32 s2, s2, s9
+; VI-NEXT:    s_or_b32 s1, s1, s7
 ; VI-NEXT:    s_or_b32 s0, s0, s5
+; VI-NEXT:    s_lshl_b32 s10, s10, 16
+; VI-NEXT:    s_and_b32 s3, s3, 0xffff
+; VI-NEXT:    s_lshl_b32 s8, s8, 16
+; VI-NEXT:    s_and_b32 s2, s2, 0xffff
+; VI-NEXT:    s_lshl_b32 s6, s6, 16
+; VI-NEXT:    s_and_b32 s1, s1, 0xffff
+; VI-NEXT:    s_lshl_b32 s4, s4, 16
 ; VI-NEXT:    s_and_b32 s0, s0, 0xffff
+; VI-NEXT:    s_or_b32 s3, s3, s10
+; VI-NEXT:    s_or_b32 s2, s2, s8
+; VI-NEXT:    s_or_b32 s1, s1, s6
 ; VI-NEXT:    s_or_b32 s0, s0, s4
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
@@ -1904,138 +1901,138 @@ define amdgpu_kernel void @amd_kernel_v32i8(<32 x i8> %arg0) {
 ; VI-NEXT:    v_mov_b32_e32 v4, 16
 ; VI-NEXT:    v_mov_b32_e32 v5, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshr_b32 s9, s3, 16
-; VI-NEXT:    s_lshr_b32 s8, s3, 24
+; VI-NEXT:    s_lshr_b32 s8, s4, 16
+; VI-NEXT:    s_lshr_b32 s9, s4, 24
+; VI-NEXT:    s_lshr_b32 s10, s5, 16
+; VI-NEXT:    s_lshr_b32 s11, s5, 24
+; VI-NEXT:    s_lshr_b32 s12, s6, 16
+; VI-NEXT:    s_lshr_b32 s13, s6, 24
+; VI-NEXT:    s_lshr_b32 s14, s7, 16
+; VI-NEXT:    s_lshr_b32 s15, s7, 24
+; VI-NEXT:    s_bfe_u32 s24, s4, 0x80008
+; VI-NEXT:    s_bfe_u32 s25, s5, 0x80008
+; VI-NEXT:    s_bfe_u32 s26, s6, 0x80008
+; VI-NEXT:    s_bfe_u32 s27, s7, 0x80008
+; VI-NEXT:    s_add_i32 s15, s15, s15
+; VI-NEXT:    s_add_i32 s14, s14, s14
+; VI-NEXT:    s_add_i32 s13, s13, s13
+; VI-NEXT:    s_add_i32 s12, s12, s12
+; VI-NEXT:    s_add_i32 s11, s11, s11
+; VI-NEXT:    s_add_i32 s10, s10, s10
 ; VI-NEXT:    s_add_i32 s9, s9, s9
-; VI-NEXT:    s_bfe_u32 s24, s3, 0x80008
 ; VI-NEXT:    s_add_i32 s8, s8, s8
-; VI-NEXT:    s_and_b32 s9, s9, 0xff
+; VI-NEXT:    s_lshr_b32 s16, s0, 16
+; VI-NEXT:    s_lshr_b32 s17, s0, 24
+; VI-NEXT:    s_lshr_b32 s18, s1, 16
+; VI-NEXT:    s_lshr_b32 s19, s1, 24
+; VI-NEXT:    s_lshr_b32 s20, s2, 16
+; VI-NEXT:    s_lshr_b32 s21, s2, 24
+; VI-NEXT:    s_lshr_b32 s22, s3, 16
+; VI-NEXT:    s_lshr_b32 s23, s3, 24
+; VI-NEXT:    s_lshl_b32 s15, s15, 8
+; VI-NEXT:    s_and_b32 s14, s14, 0xff
+; VI-NEXT:    s_add_i32 s7, s7, s7
+; VI-NEXT:    s_add_i32 s27, s27, s27
+; VI-NEXT:    s_lshl_b32 s13, s13, 8
+; VI-NEXT:    s_and_b32 s12, s12, 0xff
+; VI-NEXT:    s_add_i32 s6, s6, s6
+; VI-NEXT:    s_add_i32 s26, s26, s26
+; VI-NEXT:    s_lshl_b32 s11, s11, 8
+; VI-NEXT:    s_and_b32 s10, s10, 0xff
+; VI-NEXT:    s_add_i32 s5, s5, s5
+; VI-NEXT:    s_add_i32 s25, s25, s25
+; VI-NEXT:    s_lshl_b32 s9, s9, 8
+; VI-NEXT:    s_and_b32 s8, s8, 0xff
+; VI-NEXT:    s_add_i32 s4, s4, s4
 ; VI-NEXT:    s_add_i32 s24, s24, s24
-; VI-NEXT:    s_add_i32 s3, s3, s3
-; VI-NEXT:    s_lshl_b32 s8, s8, 24
-; VI-NEXT:    s_lshl_b32 s9, s9, 16
-; VI-NEXT:    s_lshr_b32 s11, s2, 16
+; VI-NEXT:    s_bfe_u32 s28, s0, 0x80008
+; VI-NEXT:    s_bfe_u32 s29, s1, 0x80008
+; VI-NEXT:    s_bfe_u32 s30, s2, 0x80008
+; VI-NEXT:    s_bfe_u32 s31, s3, 0x80008
+; VI-NEXT:    s_add_i32 s23, s23, s23
+; VI-NEXT:    s_add_i32 s22, s22, s22
+; VI-NEXT:    s_add_i32 s21, s21, s21
+; VI-NEXT:    s_add_i32 s20, s20, s20
+; VI-NEXT:    s_add_i32 s19, s19, s19
+; VI-NEXT:    s_add_i32 s18, s18, s18
+; VI-NEXT:    s_add_i32 s17, s17, s17
+; VI-NEXT:    s_add_i32 s16, s16, s16
+; VI-NEXT:    s_or_b32 s14, s14, s15
+; VI-NEXT:    s_and_b32 s7, s7, 0xff
+; VI-NEXT:    s_lshl_b32 s15, s27, 8
+; VI-NEXT:    s_or_b32 s12, s12, s13
+; VI-NEXT:    s_and_b32 s6, s6, 0xff
+; VI-NEXT:    s_lshl_b32 s13, s26, 8
+; VI-NEXT:    s_or_b32 s10, s10, s11
+; VI-NEXT:    s_and_b32 s5, s5, 0xff
+; VI-NEXT:    s_lshl_b32 s11, s25, 8
 ; VI-NEXT:    s_or_b32 s8, s8, s9
-; VI-NEXT:    s_and_b32 s3, s3, 0xff
+; VI-NEXT:    s_and_b32 s4, s4, 0xff
 ; VI-NEXT:    s_lshl_b32 s9, s24, 8
-; VI-NEXT:    s_lshr_b32 s10, s2, 24
-; VI-NEXT:    s_add_i32 s11, s11, s11
-; VI-NEXT:    s_or_b32 s3, s3, s9
-; VI-NEXT:    s_bfe_u32 s25, s2, 0x80008
-; VI-NEXT:    s_add_i32 s10, s10, s10
-; VI-NEXT:    s_and_b32 s3, s3, 0xffff
-; VI-NEXT:    s_and_b32 s9, s11, 0xff
-; VI-NEXT:    s_add_i32 s25, s25, s25
+; VI-NEXT:    s_lshl_b32 s23, s23, 8
+; VI-NEXT:    s_and_b32 s22, s22, 0xff
+; VI-NEXT:    s_add_i32 s3, s3, s3
+; VI-NEXT:    s_add_i32 s31, s31, s31
+; VI-NEXT:    s_lshl_b32 s21, s21, 8
+; VI-NEXT:    s_and_b32 s20, s20, 0xff
 ; VI-NEXT:    s_add_i32 s2, s2, s2
-; VI-NEXT:    s_or_b32 s3, s3, s8
-; VI-NEXT:    s_lshl_b32 s8, s10, 24
-; VI-NEXT:    s_lshl_b32 s9, s9, 16
-; VI-NEXT:    s_lshr_b32 s13, s1, 16
-; VI-NEXT:    s_or_b32 s8, s8, s9
-; VI-NEXT:    s_and_b32 s2, s2, 0xff
-; VI-NEXT:    s_lshl_b32 s9, s25, 8
-; VI-NEXT:    s_lshr_b32 s12, s1, 24
-; VI-NEXT:    s_add_i32 s13, s13, s13
-; VI-NEXT:    s_or_b32 s2, s2, s9
-; VI-NEXT:    s_bfe_u32 s26, s1, 0x80008
-; VI-NEXT:    s_add_i32 s12, s12, s12
-; VI-NEXT:    s_and_b32 s2, s2, 0xffff
-; VI-NEXT:    s_and_b32 s9, s13, 0xff
-; VI-NEXT:    s_add_i32 s26, s26, s26
+; VI-NEXT:    s_add_i32 s30, s30, s30
+; VI-NEXT:    s_lshl_b32 s19, s19, 8
+; VI-NEXT:    s_and_b32 s18, s18, 0xff
 ; VI-NEXT:    s_add_i32 s1, s1, s1
-; VI-NEXT:    s_or_b32 s2, s2, s8
-; VI-NEXT:    s_lshl_b32 s8, s12, 24
-; VI-NEXT:    s_lshl_b32 s9, s9, 16
-; VI-NEXT:    s_lshr_b32 s15, s0, 16
-; VI-NEXT:    s_or_b32 s8, s8, s9
-; VI-NEXT:    s_and_b32 s1, s1, 0xff
-; VI-NEXT:    s_lshl_b32 s9, s26, 8
-; VI-NEXT:    s_lshr_b32 s14, s0, 24
-; VI-NEXT:    s_add_i32 s15, s15, s15
-; VI-NEXT:    s_or_b32 s1, s1, s9
-; VI-NEXT:    s_bfe_u32 s27, s0, 0x80008
-; VI-NEXT:    s_add_i32 s14, s14, s14
-; VI-NEXT:    s_and_b32 s1, s1, 0xffff
-; VI-NEXT:    s_and_b32 s9, s15, 0xff
-; VI-NEXT:    s_add_i32 s27, s27, s27
+; VI-NEXT:    s_add_i32 s29, s29, s29
+; VI-NEXT:    s_lshl_b32 s17, s17, 8
+; VI-NEXT:    s_and_b32 s16, s16, 0xff
 ; VI-NEXT:    s_add_i32 s0, s0, s0
-; VI-NEXT:    s_or_b32 s1, s1, s8
-; VI-NEXT:    s_lshl_b32 s8, s14, 24
-; VI-NEXT:    s_lshl_b32 s9, s9, 16
-; VI-NEXT:    s_lshr_b32 s17, s7, 16
-; VI-NEXT:    s_or_b32 s8, s8, s9
-; VI-NEXT:    s_and_b32 s0, s0, 0xff
-; VI-NEXT:    s_lshl_b32 s9, s27, 8
-; VI-NEXT:    s_lshr_b32 s16, s7, 24
-; VI-NEXT:    s_add_i32 s17, s17, s17
-; VI-NEXT:    s_or_b32 s0, s0, s9
-; VI-NEXT:    s_bfe_u32 s28, s7, 0x80008
-; VI-NEXT:    s_add_i32 s16, s16, s16
-; VI-NEXT:    s_and_b32 s0, s0, 0xffff
-; VI-NEXT:    s_and_b32 s9, s17, 0xff
 ; VI-NEXT:    s_add_i32 s28, s28, s28
-; VI-NEXT:    s_add_i32 s7, s7, s7
-; VI-NEXT:    s_or_b32 s0, s0, s8
-; VI-NEXT:    s_lshl_b32 s8, s16, 24
-; VI-NEXT:    s_lshl_b32 s9, s9, 16
-; VI-NEXT:    s_lshr_b32 s19, s6, 16
-; VI-NEXT:    s_or_b32 s8, s8, s9
-; VI-NEXT:    s_and_b32 s7, s7, 0xff
-; VI-NEXT:    s_lshl_b32 s9, s28, 8
-; VI-NEXT:    s_lshr_b32 s18, s6, 24
-; VI-NEXT:    s_add_i32 s19, s19, s19
-; VI-NEXT:    s_or_b32 s7, s7, s9
-; VI-NEXT:    s_bfe_u32 s29, s6, 0x80008
-; VI-NEXT:    s_add_i32 s18, s18, s18
+; VI-NEXT:    s_or_b32 s7, s7, s15
+; VI-NEXT:    s_or_b32 s6, s6, s13
+; VI-NEXT:    s_or_b32 s5, s5, s11
+; VI-NEXT:    s_or_b32 s4, s4, s9
+; VI-NEXT:    s_or_b32 s22, s22, s23
+; VI-NEXT:    s_and_b32 s3, s3, 0xff
+; VI-NEXT:    s_lshl_b32 s23, s31, 8
+; VI-NEXT:    s_or_b32 s20, s20, s21
+; VI-NEXT:    s_and_b32 s2, s2, 0xff
+; VI-NEXT:    s_lshl_b32 s21, s30, 8
+; VI-NEXT:    s_or_b32 s18, s18, s19
+; VI-NEXT:    s_and_b32 s1, s1, 0xff
+; VI-NEXT:    s_lshl_b32 s19, s29, 8
+; VI-NEXT:    s_or_b32 s16, s16, s17
+; VI-NEXT:    s_and_b32 s0, s0, 0xff
+; VI-NEXT:    s_lshl_b32 s17, s28, 8
+; VI-NEXT:    s_lshl_b32 s14, s14, 16
 ; VI-NEXT:    s_and_b32 s7, s7, 0xffff
-; VI-NEXT:    s_and_b32 s9, s19, 0xff
-; VI-NEXT:    s_add_i32 s29, s29, s29
-; VI-NEXT:    s_add_i32 s6, s6, s6
-; VI-NEXT:    s_or_b32 s7, s7, s8
-; VI-NEXT:    s_lshl_b32 s8, s18, 24
-; VI-NEXT:    s_lshl_b32 s9, s9, 16
-; VI-NEXT:    s_lshr_b32 s21, s5, 16
-; VI-NEXT:    s_or_b32 s8, s8, s9
-; VI-NEXT:    s_and_b32 s6, s6, 0xff
-; VI-NEXT:    s_lshl_b32 s9, s29, 8
-; VI-NEXT:    s_lshr_b32 s20, s5, 24
-; VI-NEXT:    s_add_i32 s21, s21, s21
-; VI-NEXT:    s_or_b32 s6, s6, s9
-; VI-NEXT:    s_bfe_u32 s30, s5, 0x80008
-; VI-NEXT:    s_add_i32 s20, s20, s20
+; VI-NEXT:    s_lshl_b32 s12, s12, 16
 ; VI-NEXT:    s_and_b32 s6, s6, 0xffff
-; VI-NEXT:    s_and_b32 s9, s21, 0xff
-; VI-NEXT:    s_add_i32 s30, s30, s30
-; VI-NEXT:    s_add_i32 s5, s5, s5
-; VI-NEXT:    s_or_b32 s6, s6, s8
-; VI-NEXT:    s_lshl_b32 s8, s20, 24
-; VI-NEXT:    s_lshl_b32 s9, s9, 16
-; VI-NEXT:    s_lshr_b32 s23, s4, 16
-; VI-NEXT:    s_or_b32 s8, s8, s9
-; VI-NEXT:    s_and_b32 s5, s5, 0xff
-; VI-NEXT:    s_lshl_b32 s9, s30, 8
-; VI-NEXT:    s_lshr_b32 s22, s4, 24
-; VI-NEXT:    s_add_i32 s23, s23, s23
-; VI-NEXT:    s_or_b32 s5, s5, s9
-; VI-NEXT:    s_bfe_u32 s31, s4, 0x80008
-; VI-NEXT:    s_add_i32 s22, s22, s22
+; VI-NEXT:    s_lshl_b32 s10, s10, 16
 ; VI-NEXT:    s_and_b32 s5, s5, 0xffff
-; VI-NEXT:    s_and_b32 s9, s23, 0xff
-; VI-NEXT:    s_add_i32 s31, s31, s31
-; VI-NEXT:    s_add_i32 s4, s4, s4
-; VI-NEXT:    s_or_b32 s5, s5, s8
-; VI-NEXT:    s_lshl_b32 s8, s22, 24
-; VI-NEXT:    s_lshl_b32 s9, s9, 16
-; VI-NEXT:    s_or_b32 s8, s8, s9
-; VI-NEXT:    s_and_b32 s4, s4, 0xff
-; VI-NEXT:    s_lshl_b32 s9, s31, 8
-; VI-NEXT:    s_or_b32 s4, s4, s9
+; VI-NEXT:    s_lshl_b32 s8, s8, 16
 ; VI-NEXT:    s_and_b32 s4, s4, 0xffff
+; VI-NEXT:    s_or_b32 s3, s3, s23
+; VI-NEXT:    s_or_b32 s2, s2, s21
+; VI-NEXT:    s_or_b32 s1, s1, s19
+; VI-NEXT:    s_or_b32 s0, s0, s17
+; VI-NEXT:    s_or_b32 s7, s7, s14
+; VI-NEXT:    s_or_b32 s6, s6, s12
+; VI-NEXT:    s_or_b32 s5, s5, s10
 ; VI-NEXT:    s_or_b32 s4, s4, s8
+; VI-NEXT:    s_lshl_b32 s22, s22, 16
+; VI-NEXT:    s_and_b32 s3, s3, 0xffff
+; VI-NEXT:    s_lshl_b32 s20, s20, 16
+; VI-NEXT:    s_and_b32 s2, s2, 0xffff
+; VI-NEXT:    s_lshl_b32 s18, s18, 16
+; VI-NEXT:    s_and_b32 s1, s1, 0xffff
+; VI-NEXT:    s_lshl_b32 s16, s16, 16
+; VI-NEXT:    s_and_b32 s0, s0, 0xffff
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    v_mov_b32_e32 v2, s6
 ; VI-NEXT:    v_mov_b32_e32 v3, s7
+; VI-NEXT:    s_or_b32 s3, s3, s22
+; VI-NEXT:    s_or_b32 s2, s2, s20
+; VI-NEXT:    s_or_b32 s1, s1, s18
+; VI-NEXT:    s_or_b32 s0, s0, s16
 ; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; VI-NEXT:    v_mov_b32_e32 v4, 0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll
index 93e14a205f05d4..389d00c7e95ff8 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll
@@ -1650,15 +1650,10 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ; VI-NEXT:    s_mov_b32 s8, s2
 ; VI-NEXT:    s_mov_b32 s9, s3
 ; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
-; VI-NEXT:    v_mov_b32_e32 v1, 0xffff
 ; VI-NEXT:    s_mov_b32 s4, s0
 ; VI-NEXT:    s_mov_b32 s5, s1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_ffbh_u32_e32 v2, v0
-; VI-NEXT:    v_min_u32_e32 v2, 32, v2
-; VI-NEXT:    v_add_u32_e32 v2, vcc, -16, v2
-; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; VI-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; VI-NEXT:    v_ffbh_u32_e32 v0, v0
 ; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
@@ -1696,11 +1691,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    global_load_ushort v1, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_ffbh_u32_e32 v2, v1
-; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
-; GFX10-NEXT:    v_min_u32_e32 v2, 32, v2
-; GFX10-NEXT:    v_add_nc_u32_e32 v2, -16, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo
+; GFX10-NEXT:    v_ffbh_u32_e32 v1, v1
 ; GFX10-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
@@ -1711,11 +1702,10 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    global_load_ushort v1, v0, s[6:7]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v2, v1
-; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX10-GISEL-NEXT:    v_ffbh_u32_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX10-GISEL-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v2, 32, v2
-; GFX10-GISEL-NEXT:    v_subrev_nc_u32_e32 v2, 16, v2
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX10-GISEL-NEXT:    v_sub_nc_u16 v2, v2, 16
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v2, 0xffff, vcc_lo
 ; GFX10-GISEL-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
@@ -1727,13 +1717,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_load_u16 v1, v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_clz_i32_u32_e32 v2, v1
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_min_u32_e32 v2, 32, v2
-; GFX11-NEXT:    v_add_nc_u32_e32 v2, -16, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo
+; GFX11-NEXT:    v_clz_i32_u32_e32 v1, v1
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
index 4588bee49f037f..1d3b308f346fcf 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
@@ -657,8 +657,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v1
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v2, v2
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
+; GFX9-GISEL-NEXT:    v_cmp_ne_u32_sdwa vcc, v1, v0 src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, 32, v2, vcc
 ; GFX9-GISEL-NEXT:    global_store_byte v0, v1, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
@@ -767,8 +766,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v2, v2
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
+; GFX9-GISEL-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v1
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, 32, v2, vcc
 ; GFX9-GISEL-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/cttz.ll b/llvm/test/CodeGen/AMDGPU/cttz.ll
index ccd23a91c35733..2ae4bf0a6ceecd 100644
--- a/llvm/test/CodeGen/AMDGPU/cttz.ll
+++ b/llvm/test/CodeGen/AMDGPU/cttz.ll
@@ -1402,15 +1402,10 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ; VI-NEXT:    s_mov_b32 s8, s2
 ; VI-NEXT:    s_mov_b32 s9, s3
 ; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
-; VI-NEXT:    v_mov_b32_e32 v1, 0xffff
 ; VI-NEXT:    s_mov_b32 s4, s0
 ; VI-NEXT:    s_mov_b32 s5, s1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_e32 v2, 0x10000, v0
-; VI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; VI-NEXT:    v_ffbl_b32_e32 v2, v2
-; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; VI-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; VI-NEXT:    v_ffbl_b32_e32 v0, v0
 ; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
@@ -1448,10 +1443,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    global_load_ushort v1, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_or_b32_e32 v2, 0x10000, v1
-; GFX10-NEXT:    v_cmp_ne_u32_sdwa vcc_lo, v1, v0 src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    v_ffbl_b32_e32 v2, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo
+; GFX10-NEXT:    v_ffbl_b32_e32 v1, v1
 ; GFX10-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
@@ -1463,9 +1455,8 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ; GFX10-GISEL-NEXT:    global_load_ushort v1, v0, s[6:7]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_or_b32_e32 v2, 0x10000, v1
-; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX10-GISEL-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v2, v2
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v2, 0xffff, vcc_lo
 ; GFX10-GISEL-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
index 4c7c8bc1c027d7..966619a090d288 100644
--- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
@@ -629,8 +629,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i8_with_select(ptr addrspace(1) noa
 ; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[6:7]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v2, v1
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
+; GFX9-GISEL-NEXT:    v_cmp_ne_u32_sdwa vcc, v1, v0 src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, 32, v2, vcc
 ; GFX9-GISEL-NEXT:    global_store_byte v0, v1, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
@@ -731,8 +730,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i16_with_select(ptr addrspace(1) no
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
 ; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v2, v1
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
+; GFX9-GISEL-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v1
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, 32, v2, vcc
 ; GFX9-GISEL-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
@@ -1460,13 +1458,8 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    flat_load_ubyte v0, v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v1, 0xff
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_e32 v2, 0x100, v0
-; VI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; VI-NEXT:    v_ffbl_b32_e32 v2, v2
-; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; VI-NEXT:    v_cndmask_b32_e32 v2, v1, v2, vcc
+; VI-NEXT:    v_ffbl_b32_e32 v2, v0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_byte v[0:1], v2
@@ -1503,14 +1496,13 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0xff
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0xffff
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[6:7]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_or_b32_e32 v3, 0x100, v1
 ; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v3, v3
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_sdwa vcc, v1, v0 src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
 ; GFX9-GISEL-NEXT:    global_store_byte v0, v1, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
@@ -1558,19 +1550,12 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ; VI-NEXT:    flat_load_ubyte v2, v[2:3]
 ; VI-NEXT:    flat_load_ubyte v0, v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_readfirstlane_b32 s2, v2
+; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v2
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_readfirstlane_b32 s3, v0
-; VI-NEXT:    s_lshl_b32 s2, s2, 8
-; VI-NEXT:    s_or_b32 s2, s2, s3
-; VI-NEXT:    s_or_b32 s3, s2, 0x10000
-; VI-NEXT:    s_and_b32 s2, s2, 0xffff
-; VI-NEXT:    s_ff1_i32_b32 s3, s3
-; VI-NEXT:    s_cmp_lg_u32 s2, 0
-; VI-NEXT:    s_cselect_b32 s2, s3, 0xffff
+; VI-NEXT:    v_or_b32_e32 v0, v1, v0
+; VI-NEXT:    v_ffbl_b32_e32 v2, v0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    flat_store_short v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -1613,8 +1598,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
 ; GFX9-GISEL-NEXT:    v_or_b32_e32 v2, 0x10000, v1
 ; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v2, v2
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX9-GISEL-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
 ; GFX9-GISEL-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fneg.ll b/llvm/test/CodeGen/AMDGPU/fneg.ll
index e447429539e6ff..9c3f5f1cd672d8 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg.ll
@@ -635,12 +635,7 @@ define amdgpu_kernel void @s_fneg_v2i16(ptr addrspace(1) %out, i32 %arg) {
 ; VI-NEXT:    s_load_dword s4, s[2:3], 0x2c
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshr_b32 s2, s4, 16
-; VI-NEXT:    s_xor_b32 s3, s4, 0x8000
-; VI-NEXT:    s_xor_b32 s2, s2, 0x8000
-; VI-NEXT:    s_and_b32 s3, s3, 0xffff
-; VI-NEXT:    s_lshl_b32 s2, s2, 16
-; VI-NEXT:    s_or_b32 s2, s3, s2
+; VI-NEXT:    s_xor_b32 s2, s4, 0x80008000
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
@@ -721,11 +716,9 @@ define amdgpu_kernel void @s_fneg_v2i16_fp_use(ptr addrspace(1) %out, i32 %arg)
 ; VI-NEXT:    v_mov_b32_e32 v0, 0x4000
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshr_b32 s2, s4, 16
-; VI-NEXT:    s_xor_b32 s2, s2, 0x8000
-; VI-NEXT:    s_xor_b32 s3, s4, 0x8000
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    v_add_f16_e64 v1, s3, 2.0
-; VI-NEXT:    v_add_f16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_sub_f16_e64 v1, 2.0, s4
+; VI-NEXT:    v_sub_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_e32 v2, v1, v0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
diff --git a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
index a4bde5c9d82153..c06a3dab329822 100644
--- a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
@@ -20,13 +20,13 @@ define amdgpu_kernel void @s_lshr_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_and_b32 s4, s2, 0xffff
-; VI-NEXT:    s_lshr_b32 s2, s2, 16
-; VI-NEXT:    s_lshr_b32 s5, s3, 16
-; VI-NEXT:    s_lshr_b32 s2, s2, s5
-; VI-NEXT:    s_lshr_b32 s3, s4, s3
-; VI-NEXT:    s_lshl_b32 s2, s2, 16
-; VI-NEXT:    s_or_b32 s2, s3, s2
+; VI-NEXT:    s_lshr_b32 s4, s3, 16
+; VI-NEXT:    s_lshr_b32 s5, s2, 16
+; VI-NEXT:    s_and_b32 s2, s2, 0xffff
+; VI-NEXT:    s_lshr_b32 s4, s5, s4
+; VI-NEXT:    s_lshr_b32 s2, s2, s3
+; VI-NEXT:    s_lshl_b32 s3, s4, 16
+; VI-NEXT:    s_or_b32 s2, s2, s3
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll
index 05ef2698c1f774..7754a23ed80076 100644
--- a/llvm/test/CodeGen/AMDGPU/min.ll
+++ b/llvm/test/CodeGen/AMDGPU/min.ll
@@ -438,12 +438,12 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32],
 ; VI-NEXT:    s_load_dword s3, s[6:7], 0x4c
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_sext_i32_i8 s2, s2
-; VI-NEXT:    s_sext_i32_i8 s3, s3
-; VI-NEXT:    s_min_i32 s2, s2, s3
+; VI-NEXT:    s_bfe_i32 s2, s2, 0x80000
+; VI-NEXT:    s_bfe_i32 s3, s3, 0x80000
+; VI-NEXT:    v_mov_b32_e32 v0, s3
+; VI-NEXT:    v_min_i16_e32 v2, s2, v0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    flat_store_byte v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -454,10 +454,10 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32],
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_sext_i32_i8 s2, s2
-; GFX9-NEXT:    s_sext_i32_i8 s3, s3
-; GFX9-NEXT:    s_min_i32 s2, s2, s3
-; GFX9-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-NEXT:    s_bfe_i32 s2, s2, 0x80000
+; GFX9-NEXT:    s_bfe_i32 s3, s3, 0x80000
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    v_min_i16_e32 v1, s2, v1
 ; GFX9-NEXT:    global_store_byte v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -469,10 +469,9 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32],
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_sext_i32_i8 s2, s2
-; GFX10-NEXT:    s_sext_i32_i8 s3, s3
-; GFX10-NEXT:    s_min_i32 s2, s2, s3
-; GFX10-NEXT:    v_mov_b32_e32 v1, s2
+; GFX10-NEXT:    s_bfe_i32 s2, s2, 0x80000
+; GFX10-NEXT:    s_bfe_i32 s3, s3, 0x80000
+; GFX10-NEXT:    v_min_i16 v1, s2, s3
 ; GFX10-NEXT:    global_store_byte v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
@@ -484,11 +483,10 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32],
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_sext_i32_i8 s2, s4
-; GFX11-NEXT:    s_sext_i32_i8 s3, s5
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_min_i32 s2, s2, s3
-; GFX11-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-NEXT:    s_bfe_i32 s2, s4, 0x80000
+; GFX11-NEXT:    s_bfe_i32 s3, s5, 0x80000
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_min_i16 v1, s2, s3
 ; GFX11-NEXT:    global_store_b8 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -593,30 +591,33 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32]
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s3, s[6:7], 0x4c
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_ashr_i32 s4, s2, 24
-; VI-NEXT:    s_bfe_i32 s5, s2, 0x80010
-; VI-NEXT:    s_bfe_i32 s6, s2, 0x80008
-; VI-NEXT:    s_sext_i32_i8 s2, s2
-; VI-NEXT:    s_ashr_i32 s7, s3, 24
-; VI-NEXT:    s_bfe_i32 s8, s3, 0x80010
-; VI-NEXT:    s_bfe_i32 s9, s3, 0x80008
-; VI-NEXT:    s_sext_i32_i8 s3, s3
-; VI-NEXT:    s_min_i32 s2, s2, s3
-; VI-NEXT:    s_min_i32 s3, s6, s9
-; VI-NEXT:    s_min_i32 s5, s5, s8
-; VI-NEXT:    s_min_i32 s4, s4, s7
-; VI-NEXT:    s_and_b32 s5, s5, 0xff
-; VI-NEXT:    s_lshl_b32 s3, s3, 8
-; VI-NEXT:    s_and_b32 s2, s2, 0xff
-; VI-NEXT:    s_lshl_b32 s4, s4, 24
-; VI-NEXT:    s_lshl_b32 s5, s5, 16
-; VI-NEXT:    s_or_b32 s2, s2, s3
-; VI-NEXT:    s_or_b32 s4, s4, s5
-; VI-NEXT:    s_and_b32 s2, s2, 0xffff
-; VI-NEXT:    s_or_b32 s2, s2, s4
+; VI-NEXT:    s_lshr_b32 s5, s2, 16
+; VI-NEXT:    s_ashr_i32 s6, s2, 24
+; VI-NEXT:    s_lshr_b32 s8, s3, 16
+; VI-NEXT:    s_ashr_i32 s9, s3, 24
+; VI-NEXT:    s_bfe_i32 s8, s8, 0x80000
+; VI-NEXT:    v_mov_b32_e32 v0, s9
+; VI-NEXT:    s_bfe_i32 s5, s5, 0x80000
+; VI-NEXT:    s_sext_i32_i16 s7, s3
+; VI-NEXT:    v_min_i16_e32 v0, s6, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s8
+; VI-NEXT:    s_sext_i32_i16 s4, s2
+; VI-NEXT:    s_lshr_b32 s7, s7, 8
+; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
+; VI-NEXT:    v_min_i16_e32 v1, s5, v1
+; VI-NEXT:    s_lshr_b32 s4, s4, 8
+; VI-NEXT:    s_bfe_i32 s3, s3, 0x80000
+; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    s_bfe_i32 s2, s2, 0x80000
+; VI-NEXT:    v_min_i16_e32 v1, s4, v1
+; VI-NEXT:    v_mov_b32_e32 v2, s3
+; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
+; VI-NEXT:    v_min_i16_e32 v2, s2, v2
+; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -789,18 +790,16 @@ define amdgpu_kernel void @s_test_imin_sle_v2i16(ptr addrspace(1) %out, <2 x i16
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_ashr_i32 s4, s2, 16
-; VI-NEXT:    s_sext_i32_i16 s2, s2
-; VI-NEXT:    s_ashr_i32 s5, s3, 16
-; VI-NEXT:    s_sext_i32_i16 s3, s3
-; VI-NEXT:    s_min_i32 s4, s4, s5
-; VI-NEXT:    s_min_i32 s2, s2, s3
-; VI-NEXT:    s_lshl_b32 s3, s4, 16
-; VI-NEXT:    s_and_b32 s2, s2, 0xffff
-; VI-NEXT:    s_or_b32 s2, s2, s3
+; VI-NEXT:    s_lshr_b32 s4, s3, 16
+; VI-NEXT:    s_lshr_b32 s5, s2, 16
+; VI-NEXT:    v_mov_b32_e32 v0, s3
+; VI-NEXT:    v_mov_b32_e32 v1, s4
+; VI-NEXT:    v_mov_b32_e32 v2, s5
+; VI-NEXT:    v_min_i16_e32 v0, s2, v0
+; VI-NEXT:    v_min_i16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_or_b32_e32 v2, v0, v1
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -953,27 +952,23 @@ define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_ashr_i32 s6, s1, 16
-; VI-NEXT:    s_sext_i32_i16 s1, s1
-; VI-NEXT:    s_ashr_i32 s8, s3, 16
-; VI-NEXT:    s_sext_i32_i16 s3, s3
-; VI-NEXT:    s_ashr_i32 s7, s0, 16
-; VI-NEXT:    s_sext_i32_i16 s0, s0
-; VI-NEXT:    s_ashr_i32 s9, s2, 16
-; VI-NEXT:    s_sext_i32_i16 s2, s2
-; VI-NEXT:    s_min_i32 s6, s6, s8
-; VI-NEXT:    s_min_i32 s1, s1, s3
-; VI-NEXT:    s_min_i32 s7, s7, s9
-; VI-NEXT:    s_min_i32 s0, s0, s2
-; VI-NEXT:    s_lshl_b32 s2, s6, 16
-; VI-NEXT:    s_and_b32 s1, s1, 0xffff
-; VI-NEXT:    s_or_b32 s1, s1, s2
-; VI-NEXT:    s_lshl_b32 s2, s7, 16
-; VI-NEXT:    s_and_b32 s0, s0, 0xffff
-; VI-NEXT:    s_or_b32 s0, s0, s2
+; VI-NEXT:    s_lshr_b32 s6, s3, 16
+; VI-NEXT:    s_lshr_b32 s7, s1, 16
+; VI-NEXT:    v_mov_b32_e32 v0, s3
+; VI-NEXT:    v_mov_b32_e32 v1, s6
+; VI-NEXT:    v_mov_b32_e32 v2, s7
+; VI-NEXT:    v_min_i16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_min_i16_e32 v0, s1, v0
+; VI-NEXT:    s_lshr_b32 s1, s2, 16
+; VI-NEXT:    s_lshr_b32 s3, s0, 16
+; VI-NEXT:    v_or_b32_e32 v1, v0, v1
+; VI-NEXT:    v_mov_b32_e32 v0, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s3
+; VI-NEXT:    v_min_i16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    v_min_i16_e32 v2, s0, v2
+; VI-NEXT:    v_or_b32_e32 v0, v2, v0
 ; VI-NEXT:    v_mov_b32_e32 v2, s4
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_mov_b32_e32 v3, s5
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
@@ -2661,19 +2656,22 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0,
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx8 s[0:7], s[6:7], 0x0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    v_mov_b32_e32 v2, s6
 ; VI-NEXT:    v_mov_b32_e32 v3, s7
-; VI-NEXT:    flat_load_ushort v4, v[0:1]
-; VI-NEXT:    flat_load_ushort v5, v[2:3]
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    flat_load_ushort v4, v[2:3]
+; VI-NEXT:    flat_load_ushort v5, v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
+; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_and_b32_e32 v6, 0xffff, v4
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_cmp_lt_u32_e32 vcc, v4, v5
-; VI-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
+; VI-NEXT:    v_and_b32_e32 v7, 0xffff, v5
+; VI-NEXT:    v_cmp_lt_u32_e32 vcc, v7, v6
+; VI-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
 ; VI-NEXT:    flat_store_short v[0:1], v4
 ; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; VI-NEXT:    flat_store_byte v[2:3], v0
@@ -2687,7 +2685,7 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0,
 ; GFX9-NEXT:    global_load_ushort v1, v0, s[12:13]
 ; GFX9-NEXT:    global_load_ushort v2, v0, s[14:15]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, v1, v2
+; GFX9-NEXT:    v_cmp_lt_u32_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:WORD_0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; GFX9-NEXT:    global_store_short v0, v1, s[8:9]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
@@ -2703,7 +2701,7 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0,
 ; GFX10-NEXT:    global_load_ushort v1, v0, s[12:13]
 ; GFX10-NEXT:    global_load_ushort v2, v0, s[14:15]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_cmp_lt_u32_e32 vcc_lo, v1, v2
+; GFX10-NEXT:    v_cmp_lt_u32_sdwa vcc_lo, v1, v2 src0_sel:WORD_0 src1_sel:WORD_0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
 ; GFX10-NEXT:    global_store_short v0, v1, s[8:9]
@@ -2716,11 +2714,15 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0,
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_load_u16 v1, v0, s[4:5]
-; GFX11-NEXT:    global_load_u16 v2, v0, s[6:7]
+; GFX11-NEXT:    global_load_u16 v1, v0, s[6:7]
+; GFX11-NEXT:    global_load_u16 v2, v0, s[4:5]
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_cmp_lt_u32_e32 vcc_lo, v1, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc_lo
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_lt_u32_e32 vcc_lo, v4, v3
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
@@ -3174,43 +3176,39 @@ define amdgpu_kernel void @s_test_umin_ult_v8i16(ptr addrspace(1) %out, <8 x i16
 ; VI-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x10
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshr_b32 s2, s11, 16
-; VI-NEXT:    s_lshr_b32 s4, s10, 16
-; VI-NEXT:    s_and_b32 s5, s10, 0xffff
-; VI-NEXT:    s_lshr_b32 s10, s15, 16
-; VI-NEXT:    s_and_b32 s3, s11, 0xffff
-; VI-NEXT:    s_and_b32 s11, s15, 0xffff
-; VI-NEXT:    s_lshr_b32 s15, s14, 16
-; VI-NEXT:    s_min_u32 s2, s2, s10
-; VI-NEXT:    s_lshr_b32 s6, s9, 16
-; VI-NEXT:    s_and_b32 s7, s9, 0xffff
-; VI-NEXT:    s_lshr_b32 s9, s8, 16
-; VI-NEXT:    s_and_b32 s14, s14, 0xffff
-; VI-NEXT:    s_lshr_b32 s16, s13, 16
-; VI-NEXT:    s_lshr_b32 s17, s12, 16
-; VI-NEXT:    s_min_u32 s4, s4, s15
-; VI-NEXT:    s_min_u32 s3, s3, s11
-; VI-NEXT:    s_lshl_b32 s2, s2, 16
-; VI-NEXT:    s_and_b32 s8, s8, 0xffff
-; VI-NEXT:    s_and_b32 s13, s13, 0xffff
-; VI-NEXT:    s_and_b32 s12, s12, 0xffff
-; VI-NEXT:    s_min_u32 s9, s9, s17
-; VI-NEXT:    s_min_u32 s6, s6, s16
-; VI-NEXT:    s_min_u32 s5, s5, s14
-; VI-NEXT:    s_or_b32 s2, s3, s2
-; VI-NEXT:    s_lshl_b32 s3, s4, 16
-; VI-NEXT:    s_min_u32 s8, s8, s12
-; VI-NEXT:    s_min_u32 s7, s7, s13
-; VI-NEXT:    s_or_b32 s3, s5, s3
-; VI-NEXT:    s_lshl_b32 s4, s6, 16
-; VI-NEXT:    s_lshl_b32 s5, s9, 16
-; VI-NEXT:    s_or_b32 s4, s7, s4
-; VI-NEXT:    s_or_b32 s5, s8, s5
-; VI-NEXT:    v_mov_b32_e32 v5, s1
-; VI-NEXT:    v_mov_b32_e32 v0, s5
-; VI-NEXT:    v_mov_b32_e32 v1, s4
+; VI-NEXT:    s_lshr_b32 s2, s15, 16
+; VI-NEXT:    s_lshr_b32 s3, s11, 16
+; VI-NEXT:    v_mov_b32_e32 v0, s15
+; VI-NEXT:    v_mov_b32_e32 v1, s2
 ; VI-NEXT:    v_mov_b32_e32 v2, s3
-; VI-NEXT:    v_mov_b32_e32 v3, s2
+; VI-NEXT:    v_min_u16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_min_u16_e32 v0, s11, v0
+; VI-NEXT:    s_lshr_b32 s2, s14, 16
+; VI-NEXT:    s_lshr_b32 s3, s10, 16
+; VI-NEXT:    v_or_b32_e32 v3, v0, v1
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_min_u16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_mov_b32_e32 v1, s14
+; VI-NEXT:    v_min_u16_e32 v1, s10, v1
+; VI-NEXT:    s_lshr_b32 s2, s13, 16
+; VI-NEXT:    s_lshr_b32 s3, s9, 16
+; VI-NEXT:    v_or_b32_e32 v2, v1, v0
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_min_u16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_mov_b32_e32 v1, s13
+; VI-NEXT:    v_min_u16_e32 v1, s9, v1
+; VI-NEXT:    s_lshr_b32 s2, s12, 16
+; VI-NEXT:    s_lshr_b32 s3, s8, 16
+; VI-NEXT:    v_or_b32_e32 v1, v1, v0
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_mov_b32_e32 v4, s3
+; VI-NEXT:    v_min_u16_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_mov_b32_e32 v4, s12
+; VI-NEXT:    v_min_u16_e32 v4, s8, v4
+; VI-NEXT:    v_or_b32_e32 v0, v4, v0
+; VI-NEXT:    v_mov_b32_e32 v5, s1
 ; VI-NEXT:    v_mov_b32_e32 v4, s0
 ; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; VI-NEXT:    s_endpgm
@@ -3536,12 +3534,11 @@ define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_sext_i32_i16 s3, s2
-; VI-NEXT:    s_ashr_i32 s2, s2, 16
-; VI-NEXT:    s_min_i32 s2, s3, s2
+; VI-NEXT:    s_lshr_b32 s3, s2, 16
+; VI-NEXT:    v_mov_b32_e32 v0, s3
+; VI-NEXT:    v_min_i16_e32 v2, s2, v0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    flat_store_short v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -3551,10 +3548,9 @@ define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_sext_i32_i16 s3, s2
-; GFX9-NEXT:    s_ashr_i32 s2, s2, 16
-; GFX9-NEXT:    s_min_i32 s2, s3, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-NEXT:    s_lshr_b32 s3, s2, 16
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    v_min_i16_e32 v1, s2, v1
 ; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -3565,10 +3561,8 @@ define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_sext_i32_i16 s3, s2
-; GFX10-NEXT:    s_ashr_i32 s2, s2, 16
-; GFX10-NEXT:    s_min_i32 s2, s3, s2
-; GFX10-NEXT:    v_mov_b32_e32 v1, s2
+; GFX10-NEXT:    s_lshr_b32 s3, s2, 16
+; GFX10-NEXT:    v_min_i16 v1, s2, s3
 ; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
@@ -3579,11 +3573,9 @@ define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_sext_i32_i16 s2, s4
-; GFX11-NEXT:    s_ashr_i32 s3, s4, 16
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_min_i32 s2, s2, s3
-; GFX11-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-NEXT:    s_lshr_b32 s2, s4, 16
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_min_i16 v1, s4, s2
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
index 5a1cc72644d47d..b1066e0f8f26ad 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
@@ -1787,15 +1787,14 @@ define amdgpu_kernel void @add_bb_v2i16(ptr addrspace(1) %out, ptr addrspace(1)
 ; NOSDWA-NEXT:    flat_load_dword v1, v[0:1]
 ; NOSDWA-NEXT:    flat_load_dword v2, v[2:3]
 ; NOSDWA-NEXT:    v_mov_b32_e32 v0, s4
-; NOSDWA-NEXT:    s_waitcnt vmcnt(1)
-; NOSDWA-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
 ; NOSDWA-NEXT:    s_waitcnt vmcnt(0)
-; NOSDWA-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; NOSDWA-NEXT:    v_add_u32_e32 v3, vcc, v1, v2
+; NOSDWA-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; NOSDWA-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; NOSDWA-NEXT:    v_add_u32_e32 v1, vcc, v1, v2
-; NOSDWA-NEXT:    v_add_u32_e32 v2, vcc, v3, v4
-; NOSDWA-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; NOSDWA-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; NOSDWA-NEXT:    v_or_b32_e32 v2, v1, v2
+; NOSDWA-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; NOSDWA-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; NOSDWA-NEXT:    v_or_b32_e32 v2, v3, v1
 ; NOSDWA-NEXT:    v_mov_b32_e32 v1, s5
 ; NOSDWA-NEXT:    flat_store_dword v[0:1], v2
 ; NOSDWA-NEXT:    s_endpgm
@@ -1813,9 +1812,9 @@ define amdgpu_kernel void @add_bb_v2i16(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX89-NEXT:    flat_load_dword v2, v[2:3]
 ; GFX89-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX89-NEXT:    s_waitcnt vmcnt(0)
-; GFX89-NEXT:    v_add_u32_sdwa v3, vcc, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX89-NEXT:    v_add_u32_e32 v1, vcc, v1, v2
-; GFX89-NEXT:    v_or_b32_sdwa v2, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX89-NEXT:    v_add_u32_e32 v3, vcc, v1, v2
+; GFX89-NEXT:    v_add_u32_sdwa v1, vcc, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX89-NEXT:    v_or_b32_sdwa v2, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX89-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX89-NEXT:    flat_store_dword v[0:1], v2
 ; GFX89-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
index 9b9f03ff74aa3f..44dd0b6e27e740 100644
--- a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
@@ -27,9 +27,9 @@ define amdgpu_kernel void @s_shl_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_mov_b32 s4, s0
 ; VI-NEXT:    s_mov_b32 s5, s1
-; VI-NEXT:    s_lshr_b32 s0, s2, 16
-; VI-NEXT:    s_lshr_b32 s1, s3, 16
-; VI-NEXT:    s_lshl_b32 s0, s0, s1
+; VI-NEXT:    s_lshr_b32 s0, s3, 16
+; VI-NEXT:    s_lshr_b32 s1, s2, 16
+; VI-NEXT:    s_lshl_b32 s0, s1, s0
 ; VI-NEXT:    s_lshl_b32 s1, s2, s3
 ; VI-NEXT:    s_lshl_b32 s0, s0, 16
 ; VI-NEXT:    s_and_b32 s1, s1, 0xffff
diff --git a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll
index 3446e0384cc545..3c7569c4601e8f 100644
--- a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll
@@ -22,23 +22,19 @@ define amdgpu_kernel void @s_abs_v2i16(ptr addrspace(1) %out, <2 x i16> %val) #0
 ; VI-NEXT:    s_load_dword s4, s[2:3], 0x2c
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshr_b32 s2, s4, 16
-; VI-NEXT:    s_sub_i32 s3, 0, s4
-; VI-NEXT:    s_ashr_i32 s5, s4, 16
-; VI-NEXT:    s_sext_i32_i16 s4, s4
-; VI-NEXT:    s_sub_i32 s2, 0, s2
-; VI-NEXT:    s_sext_i32_i16 s3, s3
-; VI-NEXT:    s_sext_i32_i16 s2, s2
-; VI-NEXT:    s_max_i32 s3, s4, s3
-; VI-NEXT:    s_max_i32 s2, s5, s2
-; VI-NEXT:    s_add_i32 s3, s3, 2
-; VI-NEXT:    s_lshl_b32 s2, s2, 16
-; VI-NEXT:    s_and_b32 s3, s3, 0xffff
-; VI-NEXT:    s_or_b32 s2, s2, s3
-; VI-NEXT:    s_add_i32 s2, s2, 0x20000
+; VI-NEXT:    s_sub_i32 s2, 0, s4
+; VI-NEXT:    s_lshr_b32 s3, s4, 16
+; VI-NEXT:    s_sub_i32 s5, 0, s3
+; VI-NEXT:    v_mov_b32_e32 v1, s2
+; VI-NEXT:    v_mov_b32_e32 v0, s3
+; VI-NEXT:    v_mov_b32_e32 v2, s5
+; VI-NEXT:    v_max_i16_e32 v1, s4, v1
+; VI-NEXT:    v_max_i16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_add_u32_e32 v1, vcc, 2, v1
+; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT:    v_add_u32_e32 v2, vcc, 0x20000, v0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -171,23 +167,19 @@ define amdgpu_kernel void @s_abs_v2i16_2(ptr addrspace(1) %out, <2 x i16> %val)
 ; VI-NEXT:    s_load_dword s4, s[2:3], 0x2c
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshr_b32 s2, s4, 16
-; VI-NEXT:    s_sub_i32 s3, 0, s4
-; VI-NEXT:    s_ashr_i32 s5, s4, 16
-; VI-NEXT:    s_sext_i32_i16 s4, s4
-; VI-NEXT:    s_sub_i32 s2, 0, s2
-; VI-NEXT:    s_sext_i32_i16 s3, s3
-; VI-NEXT:    s_sext_i32_i16 s2, s2
-; VI-NEXT:    s_max_i32 s3, s4, s3
-; VI-NEXT:    s_max_i32 s2, s5, s2
-; VI-NEXT:    s_add_i32 s3, s3, 2
-; VI-NEXT:    s_lshl_b32 s2, s2, 16
-; VI-NEXT:    s_and_b32 s3, s3, 0xffff
-; VI-NEXT:    s_or_b32 s2, s2, s3
-; VI-NEXT:    s_add_i32 s2, s2, 0x20000
+; VI-NEXT:    s_sub_i32 s2, 0, s4
+; VI-NEXT:    s_lshr_b32 s3, s4, 16
+; VI-NEXT:    s_sub_i32 s5, 0, s3
+; VI-NEXT:    v_mov_b32_e32 v1, s2
+; VI-NEXT:    v_mov_b32_e32 v0, s3
+; VI-NEXT:    v_mov_b32_e32 v2, s5
+; VI-NEXT:    v_max_i16_e32 v1, s4, v1
+; VI-NEXT:    v_max_i16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_add_u32_e32 v1, vcc, 2, v1
+; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT:    v_add_u32_e32 v2, vcc, 0x20000, v0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -331,37 +323,29 @@ define amdgpu_kernel void @s_abs_v4i16(ptr addrspace(1) %out, <4 x i16> %val) #0
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshr_b32 s4, s2, 16
-; VI-NEXT:    s_lshr_b32 s5, s3, 16
-; VI-NEXT:    s_sub_i32 s6, 0, s3
-; VI-NEXT:    s_sub_i32 s7, 0, s2
-; VI-NEXT:    s_sub_i32 s5, 0, s5
-; VI-NEXT:    s_sub_i32 s4, 0, s4
-; VI-NEXT:    s_ashr_i32 s8, s2, 16
-; VI-NEXT:    s_ashr_i32 s9, s3, 16
-; VI-NEXT:    s_sext_i32_i16 s2, s2
-; VI-NEXT:    s_sext_i32_i16 s3, s3
-; VI-NEXT:    s_sext_i32_i16 s7, s7
-; VI-NEXT:    s_sext_i32_i16 s6, s6
-; VI-NEXT:    s_sext_i32_i16 s4, s4
-; VI-NEXT:    s_sext_i32_i16 s5, s5
-; VI-NEXT:    s_max_i32 s3, s3, s6
-; VI-NEXT:    s_max_i32 s2, s2, s7
-; VI-NEXT:    s_max_i32 s5, s9, s5
-; VI-NEXT:    s_max_i32 s4, s8, s4
-; VI-NEXT:    s_add_i32 s2, s2, 2
-; VI-NEXT:    s_add_i32 s3, s3, 2
-; VI-NEXT:    s_lshl_b32 s4, s4, 16
-; VI-NEXT:    s_lshl_b32 s5, s5, 16
-; VI-NEXT:    s_and_b32 s3, s3, 0xffff
-; VI-NEXT:    s_and_b32 s2, s2, 0xffff
-; VI-NEXT:    s_or_b32 s3, s5, s3
-; VI-NEXT:    s_or_b32 s2, s4, s2
-; VI-NEXT:    s_add_i32 s3, s3, 0x20000
-; VI-NEXT:    s_add_i32 s2, s2, 0x20000
+; VI-NEXT:    s_lshr_b32 s7, s2, 16
+; VI-NEXT:    s_lshr_b32 s6, s3, 16
+; VI-NEXT:    s_sub_i32 s9, 0, s7
+; VI-NEXT:    s_sub_i32 s8, 0, s6
+; VI-NEXT:    v_mov_b32_e32 v0, s9
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    s_sub_i32 s4, 0, s3
+; VI-NEXT:    s_sub_i32 s5, 0, s2
+; VI-NEXT:    v_max_i16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_mov_b32_e32 v1, s8
+; VI-NEXT:    v_mov_b32_e32 v2, s6
+; VI-NEXT:    v_max_i16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_mov_b32_e32 v2, s5
+; VI-NEXT:    v_mov_b32_e32 v3, s4
+; VI-NEXT:    v_max_i16_e32 v2, s2, v2
+; VI-NEXT:    v_max_i16_e32 v3, s3, v3
+; VI-NEXT:    v_add_u32_e32 v3, vcc, 2, v3
+; VI-NEXT:    v_add_u32_e32 v2, vcc, 2, v2
+; VI-NEXT:    v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v1, vcc, 0x20000, v1
+; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x20000, v0
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
@@ -559,28 +543,23 @@ define amdgpu_kernel void @s_min_max_v2i16(ptr addrspace(1) %out0, ptr addrspace
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    s_ashr_i32 s2, s0, 16
-; VI-NEXT:    s_sext_i32_i16 s0, s0
-; VI-NEXT:    s_ashr_i32 s3, s1, 16
-; VI-NEXT:    s_sext_i32_i16 s1, s1
+; VI-NEXT:    s_lshr_b32 s2, s1, 16
+; VI-NEXT:    s_lshr_b32 s3, s0, 16
+; VI-NEXT:    v_mov_b32_e32 v4, s2
+; VI-NEXT:    v_mov_b32_e32 v5, s3
+; VI-NEXT:    v_mov_b32_e32 v7, s1
+; VI-NEXT:    v_max_i16_sdwa v6, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_max_i16_e32 v8, s0, v7
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    s_max_i32 s4, s2, s3
-; VI-NEXT:    s_max_i32 s5, s0, s1
-; VI-NEXT:    s_lshl_b32 s4, s4, 16
-; VI-NEXT:    s_and_b32 s5, s5, 0xffff
-; VI-NEXT:    s_min_i32 s2, s2, s3
-; VI-NEXT:    s_min_i32 s0, s0, s1
-; VI-NEXT:    s_or_b32 s4, s5, s4
-; VI-NEXT:    s_lshl_b32 s1, s2, 16
-; VI-NEXT:    s_and_b32 s0, s0, 0xffff
-; VI-NEXT:    s_or_b32 s0, s0, s1
-; VI-NEXT:    v_mov_b32_e32 v4, s4
+; VI-NEXT:    v_or_b32_e32 v6, v8, v6
+; VI-NEXT:    v_min_i16_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_min_i16_e32 v5, s0, v7
 ; VI-NEXT:    v_mov_b32_e32 v2, s6
 ; VI-NEXT:    v_mov_b32_e32 v3, s7
-; VI-NEXT:    flat_store_dword v[0:1], v4
+; VI-NEXT:    v_or_b32_e32 v4, v5, v4
+; VI-NEXT:    flat_store_dword v[0:1], v6
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    flat_store_dword v[2:3], v0
+; VI-NEXT:    flat_store_dword v[2:3], v4
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    s_endpgm
 ;
@@ -661,12 +640,12 @@ define amdgpu_kernel void @v_min_max_v2i16(ptr addrspace(1) %out0, ptr addrspace
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
-; VI-NEXT:    v_max_i32_sdwa v6, sext(v4), sext(v5) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; VI-NEXT:    v_max_i32_sdwa v7, sext(v4), sext(v5) dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT:    v_min_i32_sdwa v8, sext(v4), sext(v5) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; VI-NEXT:    v_min_i32_sdwa v4, sext(v4), sext(v5) dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT:    v_or_b32_sdwa v5, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    v_max_i16_e32 v6, v4, v5
+; VI-NEXT:    v_max_i16_sdwa v7, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT:    v_min_i16_e32 v8, v4, v5
+; VI-NEXT:    v_min_i16_sdwa v4, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT:    v_or_b32_e32 v5, v6, v7
+; VI-NEXT:    v_or_b32_e32 v4, v8, v4
 ; VI-NEXT:    flat_store_dword v[0:1], v5
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    flat_store_dword v[2:3], v4
@@ -748,40 +727,30 @@ define amdgpu_kernel void @s_min_max_v4i16(ptr addrspace(1) %out0, ptr addrspace
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    s_lshr_b32 s0, s7, 16
+; VI-NEXT:    s_lshr_b32 s1, s5, 16
+; VI-NEXT:    v_mov_b32_e32 v6, s0
+; VI-NEXT:    v_mov_b32_e32 v7, s1
+; VI-NEXT:    v_mov_b32_e32 v8, s7
+; VI-NEXT:    s_lshr_b32 s0, s6, 16
+; VI-NEXT:    s_lshr_b32 s1, s4, 16
+; VI-NEXT:    v_max_i16_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_max_i16_e32 v5, s5, v8
+; VI-NEXT:    v_mov_b32_e32 v9, s0
+; VI-NEXT:    v_mov_b32_e32 v10, s1
+; VI-NEXT:    v_mov_b32_e32 v11, s6
+; VI-NEXT:    v_or_b32_e32 v5, v5, v4
+; VI-NEXT:    v_max_i16_sdwa v4, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_max_i16_e32 v12, s4, v11
+; VI-NEXT:    v_min_i16_sdwa v6, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_min_i16_e32 v7, s5, v8
+; VI-NEXT:    v_or_b32_e32 v4, v12, v4
+; VI-NEXT:    v_or_b32_e32 v7, v7, v6
+; VI-NEXT:    v_min_i16_sdwa v6, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_min_i16_e32 v8, s4, v11
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
-; VI-NEXT:    s_ashr_i32 s0, s5, 16
-; VI-NEXT:    s_ashr_i32 s1, s4, 16
-; VI-NEXT:    s_sext_i32_i16 s2, s5
-; VI-NEXT:    s_sext_i32_i16 s3, s4
-; VI-NEXT:    s_ashr_i32 s4, s7, 16
-; VI-NEXT:    s_ashr_i32 s5, s6, 16
-; VI-NEXT:    s_sext_i32_i16 s7, s7
-; VI-NEXT:    s_sext_i32_i16 s6, s6
-; VI-NEXT:    s_max_i32 s8, s1, s5
-; VI-NEXT:    s_max_i32 s9, s0, s4
-; VI-NEXT:    s_max_i32 s10, s3, s6
-; VI-NEXT:    s_max_i32 s11, s2, s7
-; VI-NEXT:    s_min_i32 s0, s0, s4
-; VI-NEXT:    s_min_i32 s2, s2, s7
-; VI-NEXT:    s_lshl_b32 s9, s9, 16
-; VI-NEXT:    s_and_b32 s11, s11, 0xffff
-; VI-NEXT:    s_lshl_b32 s8, s8, 16
-; VI-NEXT:    s_and_b32 s10, s10, 0xffff
-; VI-NEXT:    s_min_i32 s1, s1, s5
-; VI-NEXT:    s_min_i32 s3, s3, s6
-; VI-NEXT:    s_lshl_b32 s0, s0, 16
-; VI-NEXT:    s_and_b32 s2, s2, 0xffff
-; VI-NEXT:    s_or_b32 s9, s11, s9
-; VI-NEXT:    s_or_b32 s8, s10, s8
-; VI-NEXT:    s_or_b32 s0, s2, s0
-; VI-NEXT:    s_lshl_b32 s1, s1, 16
-; VI-NEXT:    s_and_b32 s2, s3, 0xffff
-; VI-NEXT:    v_mov_b32_e32 v4, s8
-; VI-NEXT:    v_mov_b32_e32 v5, s9
-; VI-NEXT:    s_or_b32 s1, s2, s1
-; VI-NEXT:    v_mov_b32_e32 v6, s1
-; VI-NEXT:    v_mov_b32_e32 v7, s0
+; VI-NEXT:    v_or_b32_e32 v6, v8, v6
 ; VI-NEXT:    flat_store_dwordx2 v[0:1], v[4:5]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[6:7]
@@ -899,42 +868,34 @@ define amdgpu_kernel void @v_min_max_v2i16_user(ptr addrspace(1) %out0, ptr addr
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    v_readfirstlane_b32 s0, v4
-; VI-NEXT:    v_readfirstlane_b32 s1, v5
-; VI-NEXT:    s_ashr_i32 s3, s0, 16
-; VI-NEXT:    s_ashr_i32 s5, s1, 16
-; VI-NEXT:    s_cmp_gt_i32 s3, s5
-; VI-NEXT:    s_sext_i32_i16 s2, s0
-; VI-NEXT:    s_sext_i32_i16 s4, s1
-; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; VI-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[0:1]
-; VI-NEXT:    s_and_b64 s[0:1], s[0:1], exec
-; VI-NEXT:    s_cselect_b32 s0, s3, s5
-; VI-NEXT:    s_cselect_b32 s3, s5, s3
-; VI-NEXT:    s_lshl_b32 s5, s0, 16
-; VI-NEXT:    s_cmp_gt_i32 s2, s4
-; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; VI-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[0:1]
-; VI-NEXT:    s_and_b64 s[0:1], s[0:1], exec
-; VI-NEXT:    s_cselect_b32 s0, s2, s4
-; VI-NEXT:    s_cselect_b32 s1, s4, s2
-; VI-NEXT:    s_and_b32 s0, s0, 0xffff
-; VI-NEXT:    v_lshlrev_b32_e32 v4, 1, v4
-; VI-NEXT:    s_lshl_b32 s2, s3, 16
-; VI-NEXT:    s_and_b32 s1, s1, 0xffff
-; VI-NEXT:    s_or_b32 s0, s0, s5
-; VI-NEXT:    v_or_b32_e32 v4, v5, v4
-; VI-NEXT:    s_or_b32 s1, s1, s2
-; VI-NEXT:    v_mov_b32_e32 v5, s0
-; VI-NEXT:    v_and_b32_e32 v4, 3, v4
-; VI-NEXT:    v_mov_b32_e32 v6, s1
-; VI-NEXT:    flat_store_dword v[0:1], v5
+; VI-NEXT:    v_mov_b32_e32 v3, s3
+; VI-NEXT:    v_ashrrev_i32_e32 v10, 16, v4
+; VI-NEXT:    v_ashrrev_i32_e32 v11, 16, v5
+; VI-NEXT:    v_bfe_i32 v6, v4, 0, 16
+; VI-NEXT:    v_bfe_i32 v7, v5, 0, 16
+; VI-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
+; VI-NEXT:    v_lshrrev_b32_e32 v9, 16, v5
+; VI-NEXT:    v_cmp_gt_i32_e32 vcc, v10, v11
+; VI-NEXT:    v_cndmask_b32_e32 v10, v9, v8, vcc
+; VI-NEXT:    v_cmp_gt_i32_e64 s[0:1], v6, v7
+; VI-NEXT:    v_cndmask_b32_e64 v6, v5, v4, s[0:1]
+; VI-NEXT:    v_cndmask_b32_e32 v7, v8, v9, vcc
+; VI-NEXT:    v_lshlrev_b32_e32 v8, 16, v10
+; VI-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[0:1]
+; VI-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; VI-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[0:1]
+; VI-NEXT:    v_or_b32_sdwa v6, v6, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    v_lshlrev_b32_e32 v5, 1, v5
+; VI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; VI-NEXT:    flat_store_dword v[0:1], v6
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    flat_store_dword v[2:3], v6
+; VI-NEXT:    v_or_b32_e32 v0, v9, v5
+; VI-NEXT:    v_or_b32_sdwa v4, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    v_and_b32_e32 v0, 3, v0
+; VI-NEXT:    flat_store_dword v[2:3], v4
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    flat_store_byte v[0:1], v4
+; VI-NEXT:    flat_store_byte v[0:1], v0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    s_endpgm
 ;
@@ -1020,27 +981,24 @@ define amdgpu_kernel void @u_min_max_v2i16(ptr addrspace(1) %out0, ptr addrspace
 ; VI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    s_lshr_b32 s2, s0, 16
-; VI-NEXT:    s_lshr_b32 s3, s1, 16
-; VI-NEXT:    s_and_b32 s0, s0, 0xffff
-; VI-NEXT:    s_and_b32 s1, s1, 0xffff
-; VI-NEXT:    s_max_u32 s5, s2, s3
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    s_max_u32 s4, s0, s1
-; VI-NEXT:    s_lshl_b32 s5, s5, 16
-; VI-NEXT:    s_min_u32 s0, s0, s1
-; VI-NEXT:    s_min_u32 s1, s2, s3
-; VI-NEXT:    s_or_b32 s4, s4, s5
-; VI-NEXT:    s_lshl_b32 s1, s1, 16
-; VI-NEXT:    s_or_b32 s0, s0, s1
-; VI-NEXT:    v_mov_b32_e32 v4, s4
+; VI-NEXT:    s_lshr_b32 s2, s1, 16
+; VI-NEXT:    s_lshr_b32 s3, s0, 16
+; VI-NEXT:    v_mov_b32_e32 v4, s2
+; VI-NEXT:    v_mov_b32_e32 v5, s3
+; VI-NEXT:    v_mov_b32_e32 v7, s1
+; VI-NEXT:    v_max_u16_sdwa v6, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_max_u16_e32 v8, s0, v7
+; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_or_b32_e32 v6, v8, v6
+; VI-NEXT:    v_min_u16_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_min_u16_e32 v5, s0, v7
 ; VI-NEXT:    v_mov_b32_e32 v2, s6
 ; VI-NEXT:    v_mov_b32_e32 v3, s7
-; VI-NEXT:    flat_store_dword v[0:1], v4
+; VI-NEXT:    v_or_b32_e32 v4, v5, v4
+; VI-NEXT:    flat_store_dword v[0:1], v6
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    flat_store_dword v[2:3], v0
+; VI-NEXT:    flat_store_dword v[2:3], v4
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/sra.ll b/llvm/test/CodeGen/AMDGPU/sra.ll
index cd06a060a50cd8..1540c3e5c403fd 100644
--- a/llvm/test/CodeGen/AMDGPU/sra.ll
+++ b/llvm/test/CodeGen/AMDGPU/sra.ll
@@ -187,15 +187,14 @@ define amdgpu_kernel void @ashr_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; VI-NEXT:    s_mov_b32 s4, s0
 ; VI-NEXT:    s_mov_b32 s5, s1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_readfirstlane_b32 s0, v1
-; VI-NEXT:    v_readfirstlane_b32 s1, v0
-; VI-NEXT:    s_ashr_i32 s2, s1, 16
-; VI-NEXT:    s_sext_i32_i16 s1, s1
+; VI-NEXT:    v_readfirstlane_b32 s0, v0
+; VI-NEXT:    v_readfirstlane_b32 s1, v1
+; VI-NEXT:    s_lshr_b32 s2, s1, 16
 ; VI-NEXT:    s_ashr_i32 s3, s0, 16
 ; VI-NEXT:    s_sext_i32_i16 s0, s0
-; VI-NEXT:    s_ashr_i32 s0, s1, s0
-; VI-NEXT:    s_ashr_i32 s1, s2, s3
-; VI-NEXT:    s_lshl_b32 s1, s1, 16
+; VI-NEXT:    s_ashr_i32 s2, s3, s2
+; VI-NEXT:    s_ashr_i32 s0, s0, s1
+; VI-NEXT:    s_lshl_b32 s1, s2, 16
 ; VI-NEXT:    s_and_b32 s0, s0, 0xffff
 ; VI-NEXT:    s_or_b32 s0, s0, s1
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -282,43 +281,41 @@ define amdgpu_kernel void @ashr_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ;
 ; VI-LABEL: ashr_v4i16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
-; VI-NEXT:    s_mov_b32 s10, s6
-; VI-NEXT:    s_mov_b32 s11, s7
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_mov_b32 s10, s2
+; VI-NEXT:    s_mov_b32 s11, s3
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_mov_b32 s8, s2
-; VI-NEXT:    s_mov_b32 s9, s3
+; VI-NEXT:    s_mov_b32 s8, s6
+; VI-NEXT:    s_mov_b32 s9, s7
 ; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; VI-NEXT:    s_mov_b32 s4, s0
-; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s1, s5
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_readfirstlane_b32 s0, v2
-; VI-NEXT:    v_readfirstlane_b32 s1, v3
-; VI-NEXT:    v_readfirstlane_b32 s2, v0
-; VI-NEXT:    v_readfirstlane_b32 s3, v1
-; VI-NEXT:    s_ashr_i32 s8, s3, 16
-; VI-NEXT:    s_sext_i32_i16 s3, s3
-; VI-NEXT:    s_ashr_i32 s9, s2, 16
-; VI-NEXT:    s_sext_i32_i16 s2, s2
-; VI-NEXT:    s_ashr_i32 s10, s1, 16
-; VI-NEXT:    s_sext_i32_i16 s1, s1
-; VI-NEXT:    s_ashr_i32 s11, s0, 16
-; VI-NEXT:    s_sext_i32_i16 s0, s0
-; VI-NEXT:    s_ashr_i32 s0, s2, s0
-; VI-NEXT:    s_ashr_i32 s2, s9, s11
-; VI-NEXT:    s_ashr_i32 s1, s3, s1
-; VI-NEXT:    s_ashr_i32 s3, s8, s10
-; VI-NEXT:    s_lshl_b32 s3, s3, 16
-; VI-NEXT:    s_and_b32 s1, s1, 0xffff
-; VI-NEXT:    s_lshl_b32 s2, s2, 16
-; VI-NEXT:    s_and_b32 s0, s0, 0xffff
-; VI-NEXT:    s_or_b32 s1, s1, s3
-; VI-NEXT:    s_or_b32 s0, s0, s2
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT:    v_readfirstlane_b32 s4, v0
+; VI-NEXT:    v_readfirstlane_b32 s5, v2
+; VI-NEXT:    v_readfirstlane_b32 s6, v1
+; VI-NEXT:    v_readfirstlane_b32 s7, v3
+; VI-NEXT:    s_lshr_b32 s8, s7, 16
+; VI-NEXT:    s_ashr_i32 s9, s6, 16
+; VI-NEXT:    s_sext_i32_i16 s6, s6
+; VI-NEXT:    s_lshr_b32 s10, s5, 16
+; VI-NEXT:    s_ashr_i32 s11, s4, 16
+; VI-NEXT:    s_sext_i32_i16 s4, s4
+; VI-NEXT:    s_ashr_i32 s8, s9, s8
+; VI-NEXT:    s_ashr_i32 s6, s6, s7
+; VI-NEXT:    s_ashr_i32 s7, s11, s10
+; VI-NEXT:    s_ashr_i32 s4, s4, s5
+; VI-NEXT:    s_lshl_b32 s5, s8, 16
+; VI-NEXT:    s_and_b32 s6, s6, 0xffff
+; VI-NEXT:    s_lshl_b32 s7, s7, 16
+; VI-NEXT:    s_and_b32 s4, s4, 0xffff
+; VI-NEXT:    s_or_b32 s5, s6, s5
+; VI-NEXT:    s_or_b32 s4, s4, s7
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: ashr_v4i16:
diff --git a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
index 5a821db6ff0408..327a85e80da9dc 100644
--- a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
@@ -117,23 +117,21 @@ define amdgpu_kernel void @s_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_load_dword s6, s[6:7], 0x0
-; VI-NEXT:    s_load_dword s7, s[0:1], 0x0
-; VI-NEXT:    s_mov_b32 s0, s4
-; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    s_load_dword s2, s[6:7], 0x0
+; VI-NEXT:    s_load_dword s0, s[0:1], 0x0
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshr_b32 s4, s6, 16
-; VI-NEXT:    s_lshr_b32 s5, s7, 16
-; VI-NEXT:    s_sub_i32 s6, s6, s7
-; VI-NEXT:    s_sub_i32 s4, s4, s5
-; VI-NEXT:    s_and_b32 s5, s6, 0xffff
-; VI-NEXT:    s_lshl_b32 s4, s4, 16
-; VI-NEXT:    s_or_b32 s4, s5, s4
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT:    s_sub_i32 s1, s2, s0
+; VI-NEXT:    s_lshr_b32 s0, s0, 16
+; VI-NEXT:    s_lshr_b32 s2, s2, 16
+; VI-NEXT:    s_sub_i32 s0, s2, s0
+; VI-NEXT:    s_and_b32 s1, s1, 0xffff
+; VI-NEXT:    s_lshl_b32 s0, s0, 16
+; VI-NEXT:    s_or_b32 s0, s1, s0
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: s_test_sub_v2i16:
@@ -235,9 +233,9 @@ define amdgpu_kernel void @s_test_sub_v2i16_kernarg(ptr addrspace(1) %out, <2 x
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_mov_b32 s4, s0
 ; VI-NEXT:    s_mov_b32 s5, s1
-; VI-NEXT:    s_lshr_b32 s0, s2, 16
-; VI-NEXT:    s_lshr_b32 s1, s3, 16
-; VI-NEXT:    s_sub_i32 s0, s0, s1
+; VI-NEXT:    s_lshr_b32 s0, s3, 16
+; VI-NEXT:    s_lshr_b32 s1, s2, 16
+; VI-NEXT:    s_sub_i32 s0, s1, s0
 ; VI-NEXT:    s_sub_i32 s1, s2, s3
 ; VI-NEXT:    s_lshl_b32 s0, s0, 16
 ; VI-NEXT:    s_and_b32 s1, s1, 0xffff
diff --git a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
index 7a1f05f56a7517..f074f7bf67f770 100644
--- a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
+++ b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
@@ -323,7 +323,7 @@ define amdgpu_kernel void @widen_v2i8_constant_load(ptr addrspace(4) %arg) {
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_and_b32 s1, s0, 0xffffff00
+; VI-NEXT:    s_and_b32 s1, s0, 0xff00
 ; VI-NEXT:    s_add_i32 s0, s0, 12
 ; VI-NEXT:    s_or_b32 s0, s0, 4
 ; VI-NEXT:    s_and_b32 s0, s0, 0xff



More information about the llvm-commits mailing list