[llvm-branch-commits] [llvm] [AMDGPU] Add BFX Formation Combines to RegBankCombiner (PR #141590)
Pierre van Houtryve via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Thu Dec 4 01:49:20 PST 2025
https://github.com/Pierre-vh updated https://github.com/llvm/llvm-project/pull/141590
From a4a84924c96ba0af48010b488e646832329d9a8d Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Tue, 27 May 2025 12:10:19 +0200
Subject: [PATCH 1/2] [AMDGPU] Add BFX Formation Combines to RegBankCombiner
I believe these combines are relatively safe to use there. The only new
registers they may create are the constants for the BFX; for those,
borrow the register bank from the source register.
Fixes #140040
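
To make the register-bank handling concrete, here is a minimal sketch of the
shape all four matchers take after this patch (identifiers abbreviated, e.g.
"Src" stands in for ShiftSrc/AndSrc/ShlSrc; see the hunks below for the exact
code). The only registers the apply step creates are the two BFX immediates,
and they simply inherit whatever bank the extracted source already has:

    // Hedged sketch, not the verbatim upstream code.
    const RegisterBank *RB = getRegBank(Src); // bank of the value we extract from
    MatchInfo = [=](MachineIRBuilder &B) {
      auto PosCst = B.buildConstant(ExtractTy, Pos);     // new vreg
      auto WidthCst = B.buildConstant(ExtractTy, Width); // new vreg
      B.buildInstr(TargetOpcode::G_UBFX, {Dst}, {Src, PosCst, WidthCst});
      if (RB) { // in the RegBankCombiner the source already has a bank assigned
        MRI.setRegBank(PosCst.getReg(0), *RB);
        MRI.setRegBank(WidthCst.getReg(0), *RB);
      }
    };
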
---
.../lib/CodeGen/GlobalISel/CombinerHelper.cpp | 29 +
llvm/lib/Target/AMDGPU/AMDGPUCombine.td | 2 +-
llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll | 119 ++-
llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll | 747 +++++++-------
llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll | 946 +++++++++---------
.../CodeGen/AMDGPU/GlobalISel/llvm.abs.ll | 13 +-
.../test/CodeGen/AMDGPU/GlobalISel/saddsat.ll | 232 ++---
.../test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll | 354 ++++---
llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll | 17 +-
9 files changed, 1176 insertions(+), 1283 deletions(-)
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index f0fbe0135353f..eb52154df4bb5 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -4760,10 +4760,17 @@ bool CombinerHelper::matchBitfieldExtractFromSExtInReg(
if (ShiftImm < 0 || ShiftImm + Width > Ty.getScalarSizeInBits())
return false;
+ const RegisterBank *RB = getRegBank(ShiftSrc);
+
MatchInfo = [=](MachineIRBuilder &B) {
auto Cst1 = B.buildConstant(ExtractTy, ShiftImm);
auto Cst2 = B.buildConstant(ExtractTy, Width);
B.buildSbfx(Dst, ShiftSrc, Cst1, Cst2);
+
+ if (RB) {
+ MRI.setRegBank(Cst1.getReg(0), *RB);
+ MRI.setRegBank(Cst2.getReg(0), *RB);
+ }
};
return true;
}
@@ -4798,10 +4805,18 @@ bool CombinerHelper::matchBitfieldExtractFromAnd(MachineInstr &MI,
return false;
uint64_t Width = APInt(Size, AndImm).countr_one();
+
+ const RegisterBank *RB = getRegBank(ShiftSrc);
+
MatchInfo = [=](MachineIRBuilder &B) {
auto WidthCst = B.buildConstant(ExtractTy, Width);
auto LSBCst = B.buildConstant(ExtractTy, LSBImm);
B.buildInstr(TargetOpcode::G_UBFX, {Dst}, {ShiftSrc, LSBCst, WidthCst});
+
+ if (RB) {
+ MRI.setRegBank(WidthCst.getReg(0), *RB);
+ MRI.setRegBank(LSBCst.getReg(0), *RB);
+ }
};
return true;
}
@@ -4848,10 +4863,17 @@ bool CombinerHelper::matchBitfieldExtractFromShr(
const int64_t Pos = ShrAmt - ShlAmt;
const int64_t Width = Size - ShrAmt;
+ const RegisterBank *RB = getRegBank(ShlSrc);
+
MatchInfo = [=](MachineIRBuilder &B) {
auto WidthCst = B.buildConstant(ExtractTy, Width);
auto PosCst = B.buildConstant(ExtractTy, Pos);
B.buildInstr(ExtrOpcode, {Dst}, {ShlSrc, PosCst, WidthCst});
+
+ if (RB) {
+ MRI.setRegBank(WidthCst.getReg(0), *RB);
+ MRI.setRegBank(PosCst.getReg(0), *RB);
+ }
};
return true;
}
@@ -4906,10 +4928,17 @@ bool CombinerHelper::matchBitfieldExtractFromShrAnd(
if (Opcode == TargetOpcode::G_ASHR && Width + ShrAmt == Size)
return false;
+ const RegisterBank *RB = getRegBank(AndSrc);
+
MatchInfo = [=](MachineIRBuilder &B) {
auto WidthCst = B.buildConstant(ExtractTy, Width);
auto PosCst = B.buildConstant(ExtractTy, Pos);
B.buildInstr(TargetOpcode::G_UBFX, {Dst}, {AndSrc, PosCst, WidthCst});
+
+ if (RB) {
+ MRI.setRegBank(WidthCst.getReg(0), *RB);
+ MRI.setRegBank(PosCst.getReg(0), *RB);
+ }
};
return true;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index c6afae0378ff0..3639e2b960e0a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -250,5 +250,5 @@ def AMDGPURegBankCombiner : GICombiner<
fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp,
identity_combines, redundant_and, constant_fold_cast_op,
cast_of_cast_combines, sext_trunc, zext_of_shift_amount_combines,
- d16_load, lower_uniform_sbfx, lower_uniform_ubfx]> {
+ d16_load, lower_uniform_sbfx, lower_uniform_ubfx, form_bitfield_extract]> {
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
index ecd7cc24fd920..c09c6400286e1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
@@ -811,16 +811,15 @@ define amdgpu_ps i32 @s_ashr_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amou
;
; GFX8-LABEL: s_ashr_v2i16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_lshr_b32 s2, s0, 16
-; GFX8-NEXT: s_sext_i32_i16 s0, s0
-; GFX8-NEXT: s_lshr_b32 s3, s1, 16
-; GFX8-NEXT: s_ashr_i32 s0, s0, s1
-; GFX8-NEXT: s_sext_i32_i16 s1, s2
-; GFX8-NEXT: s_ashr_i32 s1, s1, s3
-; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX8-NEXT: s_lshr_b32 s2, s1, 16
+; GFX8-NEXT: s_sext_i32_i16 s3, s0
+; GFX8-NEXT: s_bfe_i32 s0, s0, 0x100010
+; GFX8-NEXT: s_ashr_i32 s0, s0, s2
+; GFX8-NEXT: s_ashr_i32 s1, s3, s1
; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
-; GFX8-NEXT: s_lshl_b32 s1, s1, 16
-; GFX8-NEXT: s_or_b32 s0, s0, s1
+; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX8-NEXT: s_lshl_b32 s0, s0, 16
+; GFX8-NEXT: s_or_b32 s0, s1, s0
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_ashr_v2i16:
@@ -1014,26 +1013,24 @@ define amdgpu_ps <2 x i32> @s_ashr_v4i16(<4 x i16> inreg %value, <4 x i16> inreg
;
; GFX8-LABEL: s_ashr_v4i16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_lshr_b32 s4, s0, 16
-; GFX8-NEXT: s_sext_i32_i16 s0, s0
-; GFX8-NEXT: s_lshr_b32 s6, s2, 16
-; GFX8-NEXT: s_ashr_i32 s0, s0, s2
-; GFX8-NEXT: s_sext_i32_i16 s2, s4
-; GFX8-NEXT: s_lshr_b32 s5, s1, 16
-; GFX8-NEXT: s_ashr_i32 s2, s2, s6
-; GFX8-NEXT: s_sext_i32_i16 s1, s1
-; GFX8-NEXT: s_lshr_b32 s7, s3, 16
-; GFX8-NEXT: s_ashr_i32 s1, s1, s3
-; GFX8-NEXT: s_sext_i32_i16 s3, s5
-; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
-; GFX8-NEXT: s_ashr_i32 s3, s3, s7
+; GFX8-NEXT: s_lshr_b32 s4, s2, 16
+; GFX8-NEXT: s_sext_i32_i16 s6, s0
+; GFX8-NEXT: s_bfe_i32 s0, s0, 0x100010
+; GFX8-NEXT: s_lshr_b32 s5, s3, 16
+; GFX8-NEXT: s_ashr_i32 s0, s0, s4
+; GFX8-NEXT: s_sext_i32_i16 s4, s1
+; GFX8-NEXT: s_bfe_i32 s1, s1, 0x100010
+; GFX8-NEXT: s_ashr_i32 s2, s6, s2
+; GFX8-NEXT: s_ashr_i32 s1, s1, s5
; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
-; GFX8-NEXT: s_lshl_b32 s2, s2, 16
-; GFX8-NEXT: s_or_b32 s0, s0, s2
-; GFX8-NEXT: s_and_b32 s2, 0xffff, s3
+; GFX8-NEXT: s_ashr_i32 s3, s4, s3
+; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX8-NEXT: s_lshl_b32 s0, s0, 16
; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX8-NEXT: s_lshl_b32 s2, s2, 16
-; GFX8-NEXT: s_or_b32 s1, s1, s2
+; GFX8-NEXT: s_or_b32 s0, s2, s0
+; GFX8-NEXT: s_and_b32 s2, 0xffff, s3
+; GFX8-NEXT: s_lshl_b32 s1, s1, 16
+; GFX8-NEXT: s_or_b32 s1, s2, s1
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_ashr_v4i16:
@@ -1223,46 +1220,42 @@ define amdgpu_ps <4 x i32> @s_ashr_v8i16(<8 x i16> inreg %value, <8 x i16> inreg
;
; GFX8-LABEL: s_ashr_v8i16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_lshr_b32 s8, s0, 16
-; GFX8-NEXT: s_sext_i32_i16 s0, s0
-; GFX8-NEXT: s_lshr_b32 s12, s4, 16
-; GFX8-NEXT: s_ashr_i32 s0, s0, s4
-; GFX8-NEXT: s_sext_i32_i16 s4, s8
-; GFX8-NEXT: s_lshr_b32 s9, s1, 16
-; GFX8-NEXT: s_ashr_i32 s4, s4, s12
-; GFX8-NEXT: s_sext_i32_i16 s1, s1
-; GFX8-NEXT: s_lshr_b32 s13, s5, 16
-; GFX8-NEXT: s_ashr_i32 s1, s1, s5
-; GFX8-NEXT: s_sext_i32_i16 s5, s9
-; GFX8-NEXT: s_and_b32 s4, 0xffff, s4
-; GFX8-NEXT: s_lshr_b32 s10, s2, 16
-; GFX8-NEXT: s_ashr_i32 s5, s5, s13
-; GFX8-NEXT: s_sext_i32_i16 s2, s2
+; GFX8-NEXT: s_lshr_b32 s8, s4, 16
+; GFX8-NEXT: s_sext_i32_i16 s12, s0
+; GFX8-NEXT: s_bfe_i32 s0, s0, 0x100010
+; GFX8-NEXT: s_lshr_b32 s9, s5, 16
+; GFX8-NEXT: s_ashr_i32 s0, s0, s8
+; GFX8-NEXT: s_sext_i32_i16 s8, s1
+; GFX8-NEXT: s_bfe_i32 s1, s1, 0x100010
+; GFX8-NEXT: s_lshr_b32 s10, s6, 16
+; GFX8-NEXT: s_ashr_i32 s4, s12, s4
+; GFX8-NEXT: s_ashr_i32 s5, s8, s5
+; GFX8-NEXT: s_ashr_i32 s1, s1, s9
+; GFX8-NEXT: s_sext_i32_i16 s8, s2
+; GFX8-NEXT: s_bfe_i32 s2, s2, 0x100010
; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
-; GFX8-NEXT: s_lshl_b32 s4, s4, 16
-; GFX8-NEXT: s_lshr_b32 s14, s6, 16
-; GFX8-NEXT: s_ashr_i32 s2, s2, s6
-; GFX8-NEXT: s_sext_i32_i16 s6, s10
-; GFX8-NEXT: s_or_b32 s0, s0, s4
-; GFX8-NEXT: s_and_b32 s4, 0xffff, s5
-; GFX8-NEXT: s_lshr_b32 s11, s3, 16
-; GFX8-NEXT: s_ashr_i32 s6, s6, s14
-; GFX8-NEXT: s_sext_i32_i16 s3, s3
+; GFX8-NEXT: s_lshr_b32 s11, s7, 16
+; GFX8-NEXT: s_ashr_i32 s6, s8, s6
+; GFX8-NEXT: s_ashr_i32 s2, s2, s10
+; GFX8-NEXT: s_sext_i32_i16 s8, s3
+; GFX8-NEXT: s_bfe_i32 s3, s3, 0x100010
+; GFX8-NEXT: s_and_b32 s4, 0xffff, s4
+; GFX8-NEXT: s_lshl_b32 s0, s0, 16
; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX8-NEXT: s_lshl_b32 s4, s4, 16
-; GFX8-NEXT: s_lshr_b32 s15, s7, 16
-; GFX8-NEXT: s_ashr_i32 s3, s3, s7
-; GFX8-NEXT: s_sext_i32_i16 s7, s11
-; GFX8-NEXT: s_or_b32 s1, s1, s4
-; GFX8-NEXT: s_and_b32 s4, 0xffff, s6
-; GFX8-NEXT: s_ashr_i32 s7, s7, s15
+; GFX8-NEXT: s_ashr_i32 s3, s3, s11
+; GFX8-NEXT: s_or_b32 s0, s4, s0
+; GFX8-NEXT: s_and_b32 s4, 0xffff, s5
+; GFX8-NEXT: s_lshl_b32 s1, s1, 16
; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
-; GFX8-NEXT: s_lshl_b32 s4, s4, 16
-; GFX8-NEXT: s_or_b32 s2, s2, s4
-; GFX8-NEXT: s_and_b32 s4, 0xffff, s7
+; GFX8-NEXT: s_ashr_i32 s7, s8, s7
+; GFX8-NEXT: s_or_b32 s1, s4, s1
+; GFX8-NEXT: s_and_b32 s4, 0xffff, s6
+; GFX8-NEXT: s_lshl_b32 s2, s2, 16
; GFX8-NEXT: s_and_b32 s3, 0xffff, s3
-; GFX8-NEXT: s_lshl_b32 s4, s4, 16
-; GFX8-NEXT: s_or_b32 s3, s3, s4
+; GFX8-NEXT: s_or_b32 s2, s4, s2
+; GFX8-NEXT: s_and_b32 s4, 0xffff, s7
+; GFX8-NEXT: s_lshl_b32 s3, s3, 16
+; GFX8-NEXT: s_or_b32 s3, s4, s3
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_ashr_v8i16:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
index 54efb26ae1e01..1b96f2f840938 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
@@ -40,8 +40,7 @@ define amdgpu_ps i7 @s_fshl_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) {
; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, 7
; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX8-NEXT: s_and_b32 s2, s2, 0x7f
-; GFX8-NEXT: s_and_b32 s1, s1, 0x7f
-; GFX8-NEXT: s_lshr_b32 s1, s1, 1
+; GFX8-NEXT: s_bfe_u32 s1, s1, 0x60001
; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX8-NEXT: v_mul_lo_u32 v1, v0, -7
@@ -70,8 +69,7 @@ define amdgpu_ps i7 @s_fshl_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) {
; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, 7
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX9-NEXT: s_and_b32 s2, s2, 0x7f
-; GFX9-NEXT: s_and_b32 s1, s1, 0x7f
-; GFX9-NEXT: s_lshr_b32 s1, s1, 1
+; GFX9-NEXT: s_bfe_u32 s1, s1, 0x60001
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX9-NEXT: v_mul_lo_u32 v1, v0, -7
@@ -99,8 +97,7 @@ define amdgpu_ps i7 @s_fshl_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) {
; GFX10: ; %bb.0:
; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, 7
; GFX10-NEXT: s_and_b32 s2, s2, 0x7f
-; GFX10-NEXT: s_and_b32 s1, s1, 0x7f
-; GFX10-NEXT: s_lshr_b32 s1, s1, 1
+; GFX10-NEXT: s_bfe_u32 s1, s1, 0x60001
; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
@@ -129,40 +126,38 @@ define amdgpu_ps i7 @s_fshl_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) {
; GFX11: ; %bb.0:
; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, 7
; GFX11-NEXT: s_and_b32 s2, s2, 0x7f
-; GFX11-NEXT: s_and_b32 s1, s1, 0x7f
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: s_lshr_b32 s1, s1, 1
+; GFX11-NEXT: s_bfe_u32 s1, s1, 0x60001
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX11-NEXT: v_mul_lo_u32 v1, v0, -7
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_lo_u32 v1, v0, -7
; GFX11-NEXT: v_mul_hi_u32 v1, v0, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v1
; GFX11-NEXT: v_mul_hi_u32 v0, s2, v0
-; GFX11-NEXT: v_mul_lo_u32 v0, v0, 7
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_lo_u32 v0, v0, 7
; GFX11-NEXT: v_sub_nc_u32_e32 v0, s2, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_nc_u32_e32 v1, -7, v0
; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_nc_u32_e32 v1, -7, v0
; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_sub_nc_u16 v1, 6, v0
; GFX11-NEXT: v_and_b32_e32 v0, 0x7f, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_and_b32_e32 v1, 0x7f, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_lshlrev_b16 v0, v0, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshrrev_b16 v1, v1, s1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
%result = call i7 @llvm.fshl.i7(i7 %lhs, i7 %rhs, i7 %amt)
@@ -345,10 +340,10 @@ define amdgpu_ps i8 @s_fshl_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) {
;
; GFX8-LABEL: s_fshl_i8:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_and_b32 s1, s1, 0xff
+; GFX8-NEXT: s_bfe_u32 s1, s1, 0x70001
; GFX8-NEXT: s_and_b32 s3, s2, 7
-; GFX8-NEXT: s_lshr_b32 s1, s1, 1
; GFX8-NEXT: s_andn2_b32 s2, 7, s2
+; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
; GFX8-NEXT: s_lshl_b32 s0, s0, s3
; GFX8-NEXT: s_lshr_b32 s1, s1, s2
; GFX8-NEXT: s_or_b32 s0, s0, s1
@@ -356,10 +351,10 @@ define amdgpu_ps i8 @s_fshl_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) {
;
; GFX9-LABEL: s_fshl_i8:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_and_b32 s1, s1, 0xff
+; GFX9-NEXT: s_bfe_u32 s1, s1, 0x70001
; GFX9-NEXT: s_and_b32 s3, s2, 7
-; GFX9-NEXT: s_lshr_b32 s1, s1, 1
; GFX9-NEXT: s_andn2_b32 s2, 7, s2
+; GFX9-NEXT: s_and_b32 s1, 0xffff, s1
; GFX9-NEXT: s_lshl_b32 s0, s0, s3
; GFX9-NEXT: s_lshr_b32 s1, s1, s2
; GFX9-NEXT: s_or_b32 s0, s0, s1
@@ -367,10 +362,10 @@ define amdgpu_ps i8 @s_fshl_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) {
;
; GFX10-LABEL: s_fshl_i8:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_and_b32 s1, s1, 0xff
+; GFX10-NEXT: s_bfe_u32 s1, s1, 0x70001
; GFX10-NEXT: s_and_b32 s3, s2, 7
-; GFX10-NEXT: s_lshr_b32 s1, s1, 1
; GFX10-NEXT: s_andn2_b32 s2, 7, s2
+; GFX10-NEXT: s_and_b32 s1, 0xffff, s1
; GFX10-NEXT: s_lshl_b32 s0, s0, s3
; GFX10-NEXT: s_lshr_b32 s1, s1, s2
; GFX10-NEXT: s_or_b32 s0, s0, s1
@@ -378,10 +373,10 @@ define amdgpu_ps i8 @s_fshl_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) {
;
; GFX11-LABEL: s_fshl_i8:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_and_b32 s1, s1, 0xff
+; GFX11-NEXT: s_bfe_u32 s1, s1, 0x70001
; GFX11-NEXT: s_and_b32 s3, s2, 7
-; GFX11-NEXT: s_lshr_b32 s1, s1, 1
; GFX11-NEXT: s_and_not1_b32 s2, 7, s2
+; GFX11-NEXT: s_and_b32 s1, 0xffff, s1
; GFX11-NEXT: s_lshl_b32 s0, s0, s3
; GFX11-NEXT: s_lshr_b32 s1, s1, s2
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -462,42 +457,17 @@ define i8 @v_fshl_i8(i8 %lhs, i8 %rhs, i8 %amt) {
}
define amdgpu_ps i8 @s_fshl_i8_4(i8 inreg %lhs, i8 inreg %rhs) {
-; GFX6-LABEL: s_fshl_i8_4:
-; GFX6: ; %bb.0:
-; GFX6-NEXT: s_lshl_b32 s0, s0, 4
-; GFX6-NEXT: s_bfe_u32 s1, s1, 0x40004
-; GFX6-NEXT: s_or_b32 s0, s0, s1
-; GFX6-NEXT: ; return to shader part epilog
-;
-; GFX8-LABEL: s_fshl_i8_4:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_and_b32 s1, s1, 0xff
-; GFX8-NEXT: s_lshl_b32 s0, s0, 4
-; GFX8-NEXT: s_lshr_b32 s1, s1, 4
-; GFX8-NEXT: s_or_b32 s0, s0, s1
-; GFX8-NEXT: ; return to shader part epilog
-;
-; GFX9-LABEL: s_fshl_i8_4:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_and_b32 s1, s1, 0xff
-; GFX9-NEXT: s_lshl_b32 s0, s0, 4
-; GFX9-NEXT: s_lshr_b32 s1, s1, 4
-; GFX9-NEXT: s_or_b32 s0, s0, s1
-; GFX9-NEXT: ; return to shader part epilog
-;
-; GFX10-LABEL: s_fshl_i8_4:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_and_b32 s1, s1, 0xff
-; GFX10-NEXT: s_lshl_b32 s0, s0, 4
-; GFX10-NEXT: s_lshr_b32 s1, s1, 4
-; GFX10-NEXT: s_or_b32 s0, s0, s1
-; GFX10-NEXT: ; return to shader part epilog
+; GCN-LABEL: s_fshl_i8_4:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_lshl_b32 s0, s0, 4
+; GCN-NEXT: s_bfe_u32 s1, s1, 0x40004
+; GCN-NEXT: s_or_b32 s0, s0, s1
+; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_fshl_i8_4:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_and_b32 s1, s1, 0xff
; GFX11-NEXT: s_lshl_b32 s0, s0, 4
-; GFX11-NEXT: s_lshr_b32 s1, s1, 4
+; GFX11-NEXT: s_bfe_u32 s1, s1, 0x40004
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_or_b32 s0, s0, s1
; GFX11-NEXT: ; return to shader part epilog
@@ -555,42 +525,17 @@ define i8 @v_fshl_i8_4(i8 %lhs, i8 %rhs) {
}
define amdgpu_ps i8 @s_fshl_i8_5(i8 inreg %lhs, i8 inreg %rhs) {
-; GFX6-LABEL: s_fshl_i8_5:
-; GFX6: ; %bb.0:
-; GFX6-NEXT: s_lshl_b32 s0, s0, 5
-; GFX6-NEXT: s_bfe_u32 s1, s1, 0x50003
-; GFX6-NEXT: s_or_b32 s0, s0, s1
-; GFX6-NEXT: ; return to shader part epilog
-;
-; GFX8-LABEL: s_fshl_i8_5:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_and_b32 s1, s1, 0xff
-; GFX8-NEXT: s_lshl_b32 s0, s0, 5
-; GFX8-NEXT: s_lshr_b32 s1, s1, 3
-; GFX8-NEXT: s_or_b32 s0, s0, s1
-; GFX8-NEXT: ; return to shader part epilog
-;
-; GFX9-LABEL: s_fshl_i8_5:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_and_b32 s1, s1, 0xff
-; GFX9-NEXT: s_lshl_b32 s0, s0, 5
-; GFX9-NEXT: s_lshr_b32 s1, s1, 3
-; GFX9-NEXT: s_or_b32 s0, s0, s1
-; GFX9-NEXT: ; return to shader part epilog
-;
-; GFX10-LABEL: s_fshl_i8_5:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_and_b32 s1, s1, 0xff
-; GFX10-NEXT: s_lshl_b32 s0, s0, 5
-; GFX10-NEXT: s_lshr_b32 s1, s1, 3
-; GFX10-NEXT: s_or_b32 s0, s0, s1
-; GFX10-NEXT: ; return to shader part epilog
+; GCN-LABEL: s_fshl_i8_5:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_lshl_b32 s0, s0, 5
+; GCN-NEXT: s_bfe_u32 s1, s1, 0x50003
+; GCN-NEXT: s_or_b32 s0, s0, s1
+; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_fshl_i8_5:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_and_b32 s1, s1, 0xff
; GFX11-NEXT: s_lshl_b32 s0, s0, 5
-; GFX11-NEXT: s_lshr_b32 s1, s1, 3
+; GFX11-NEXT: s_bfe_u32 s1, s1, 0x50003
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_or_b32 s0, s0, s1
; GFX11-NEXT: ; return to shader part epilog
@@ -673,23 +618,23 @@ define amdgpu_ps i16 @s_fshl_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in
;
; GFX8-LABEL: s_fshl_v2i8:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_lshr_b32 s4, s1, 8
-; GFX8-NEXT: s_and_b32 s1, s1, 0xff
-; GFX8-NEXT: s_lshr_b32 s5, s2, 8
-; GFX8-NEXT: s_and_b32 s6, s2, 7
-; GFX8-NEXT: s_lshr_b32 s1, s1, 1
-; GFX8-NEXT: s_andn2_b32 s2, 7, s2
+; GFX8-NEXT: s_and_b32 s5, s2, 7
; GFX8-NEXT: s_lshr_b32 s3, s0, 8
-; GFX8-NEXT: s_lshl_b32 s0, s0, s6
-; GFX8-NEXT: s_lshr_b32 s1, s1, s2
-; GFX8-NEXT: s_or_b32 s0, s0, s1
-; GFX8-NEXT: s_and_b32 s1, s5, 7
-; GFX8-NEXT: s_and_b32 s2, s4, 0xff
-; GFX8-NEXT: s_lshl_b32 s1, s3, s1
-; GFX8-NEXT: s_lshr_b32 s2, s2, 1
-; GFX8-NEXT: s_andn2_b32 s3, 7, s5
-; GFX8-NEXT: s_lshr_b32 s2, s2, s3
-; GFX8-NEXT: s_or_b32 s1, s1, s2
+; GFX8-NEXT: s_lshl_b32 s0, s0, s5
+; GFX8-NEXT: s_bfe_u32 s5, s1, 0x70001
+; GFX8-NEXT: s_lshr_b32 s4, s2, 8
+; GFX8-NEXT: s_andn2_b32 s2, 7, s2
+; GFX8-NEXT: s_and_b32 s5, 0xffff, s5
+; GFX8-NEXT: s_lshr_b32 s2, s5, s2
+; GFX8-NEXT: s_bfe_u32 s1, s1, 0x80008
+; GFX8-NEXT: s_or_b32 s0, s0, s2
+; GFX8-NEXT: s_and_b32 s2, s4, 7
+; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX8-NEXT: s_lshl_b32 s2, s3, s2
+; GFX8-NEXT: s_lshr_b32 s1, s1, 1
+; GFX8-NEXT: s_andn2_b32 s3, 7, s4
+; GFX8-NEXT: s_lshr_b32 s1, s1, s3
+; GFX8-NEXT: s_or_b32 s1, s2, s1
; GFX8-NEXT: s_and_b32 s1, s1, 0xff
; GFX8-NEXT: s_and_b32 s0, s0, 0xff
; GFX8-NEXT: s_lshl_b32 s1, s1, 8
@@ -698,23 +643,23 @@ define amdgpu_ps i16 @s_fshl_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in
;
; GFX9-LABEL: s_fshl_v2i8:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_lshr_b32 s4, s1, 8
-; GFX9-NEXT: s_and_b32 s1, s1, 0xff
-; GFX9-NEXT: s_lshr_b32 s5, s2, 8
-; GFX9-NEXT: s_and_b32 s6, s2, 7
-; GFX9-NEXT: s_lshr_b32 s1, s1, 1
-; GFX9-NEXT: s_andn2_b32 s2, 7, s2
+; GFX9-NEXT: s_and_b32 s5, s2, 7
; GFX9-NEXT: s_lshr_b32 s3, s0, 8
-; GFX9-NEXT: s_lshl_b32 s0, s0, s6
-; GFX9-NEXT: s_lshr_b32 s1, s1, s2
-; GFX9-NEXT: s_or_b32 s0, s0, s1
-; GFX9-NEXT: s_and_b32 s1, s5, 7
-; GFX9-NEXT: s_and_b32 s2, s4, 0xff
-; GFX9-NEXT: s_lshl_b32 s1, s3, s1
-; GFX9-NEXT: s_lshr_b32 s2, s2, 1
-; GFX9-NEXT: s_andn2_b32 s3, 7, s5
-; GFX9-NEXT: s_lshr_b32 s2, s2, s3
-; GFX9-NEXT: s_or_b32 s1, s1, s2
+; GFX9-NEXT: s_lshl_b32 s0, s0, s5
+; GFX9-NEXT: s_bfe_u32 s5, s1, 0x70001
+; GFX9-NEXT: s_lshr_b32 s4, s2, 8
+; GFX9-NEXT: s_andn2_b32 s2, 7, s2
+; GFX9-NEXT: s_and_b32 s5, 0xffff, s5
+; GFX9-NEXT: s_lshr_b32 s2, s5, s2
+; GFX9-NEXT: s_bfe_u32 s1, s1, 0x80008
+; GFX9-NEXT: s_or_b32 s0, s0, s2
+; GFX9-NEXT: s_and_b32 s2, s4, 7
+; GFX9-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX9-NEXT: s_lshl_b32 s2, s3, s2
+; GFX9-NEXT: s_lshr_b32 s1, s1, 1
+; GFX9-NEXT: s_andn2_b32 s3, 7, s4
+; GFX9-NEXT: s_lshr_b32 s1, s1, s3
+; GFX9-NEXT: s_or_b32 s1, s2, s1
; GFX9-NEXT: s_and_b32 s1, s1, 0xff
; GFX9-NEXT: s_and_b32 s0, s0, 0xff
; GFX9-NEXT: s_lshl_b32 s1, s1, 8
@@ -723,24 +668,24 @@ define amdgpu_ps i16 @s_fshl_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in
;
; GFX10-LABEL: s_fshl_v2i8:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_lshr_b32 s4, s1, 8
-; GFX10-NEXT: s_lshr_b32 s5, s2, 8
-; GFX10-NEXT: s_and_b32 s6, s2, 7
-; GFX10-NEXT: s_and_b32 s4, s4, 0xff
+; GFX10-NEXT: s_and_b32 s4, s2, 7
; GFX10-NEXT: s_lshr_b32 s3, s0, 8
-; GFX10-NEXT: s_and_b32 s1, s1, 0xff
-; GFX10-NEXT: s_lshl_b32 s0, s0, s6
+; GFX10-NEXT: s_lshl_b32 s0, s0, s4
+; GFX10-NEXT: s_bfe_u32 s4, s1, 0x70001
+; GFX10-NEXT: s_bfe_u32 s1, s1, 0x80008
+; GFX10-NEXT: s_lshr_b32 s5, s2, 8
+; GFX10-NEXT: s_and_b32 s1, 0xffff, s1
; GFX10-NEXT: s_and_b32 s6, s5, 7
-; GFX10-NEXT: s_lshr_b32 s4, s4, 1
-; GFX10-NEXT: s_andn2_b32 s5, 7, s5
; GFX10-NEXT: s_lshr_b32 s1, s1, 1
+; GFX10-NEXT: s_andn2_b32 s5, 7, s5
; GFX10-NEXT: s_andn2_b32 s2, 7, s2
+; GFX10-NEXT: s_and_b32 s4, 0xffff, s4
; GFX10-NEXT: s_lshl_b32 s3, s3, s6
-; GFX10-NEXT: s_lshr_b32 s4, s4, s5
-; GFX10-NEXT: s_lshr_b32 s1, s1, s2
-; GFX10-NEXT: s_or_b32 s2, s3, s4
-; GFX10-NEXT: s_or_b32 s0, s0, s1
-; GFX10-NEXT: s_and_b32 s1, s2, 0xff
+; GFX10-NEXT: s_lshr_b32 s1, s1, s5
+; GFX10-NEXT: s_lshr_b32 s2, s4, s2
+; GFX10-NEXT: s_or_b32 s1, s3, s1
+; GFX10-NEXT: s_or_b32 s0, s0, s2
+; GFX10-NEXT: s_and_b32 s1, s1, 0xff
; GFX10-NEXT: s_and_b32 s0, s0, 0xff
; GFX10-NEXT: s_lshl_b32 s1, s1, 8
; GFX10-NEXT: s_or_b32 s0, s0, s1
@@ -748,24 +693,24 @@ define amdgpu_ps i16 @s_fshl_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in
;
; GFX11-LABEL: s_fshl_v2i8:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_lshr_b32 s4, s1, 8
-; GFX11-NEXT: s_lshr_b32 s5, s2, 8
-; GFX11-NEXT: s_and_b32 s6, s2, 7
-; GFX11-NEXT: s_and_b32 s4, s4, 0xff
+; GFX11-NEXT: s_and_b32 s4, s2, 7
; GFX11-NEXT: s_lshr_b32 s3, s0, 8
-; GFX11-NEXT: s_and_b32 s1, s1, 0xff
-; GFX11-NEXT: s_lshl_b32 s0, s0, s6
+; GFX11-NEXT: s_lshl_b32 s0, s0, s4
+; GFX11-NEXT: s_bfe_u32 s4, s1, 0x70001
+; GFX11-NEXT: s_bfe_u32 s1, s1, 0x80008
+; GFX11-NEXT: s_lshr_b32 s5, s2, 8
+; GFX11-NEXT: s_and_b32 s1, 0xffff, s1
; GFX11-NEXT: s_and_b32 s6, s5, 7
-; GFX11-NEXT: s_lshr_b32 s4, s4, 1
-; GFX11-NEXT: s_and_not1_b32 s5, 7, s5
; GFX11-NEXT: s_lshr_b32 s1, s1, 1
+; GFX11-NEXT: s_and_not1_b32 s5, 7, s5
; GFX11-NEXT: s_and_not1_b32 s2, 7, s2
+; GFX11-NEXT: s_and_b32 s4, 0xffff, s4
; GFX11-NEXT: s_lshl_b32 s3, s3, s6
-; GFX11-NEXT: s_lshr_b32 s4, s4, s5
-; GFX11-NEXT: s_lshr_b32 s1, s1, s2
-; GFX11-NEXT: s_or_b32 s2, s3, s4
-; GFX11-NEXT: s_or_b32 s0, s0, s1
-; GFX11-NEXT: s_and_b32 s1, s2, 0xff
+; GFX11-NEXT: s_lshr_b32 s1, s1, s5
+; GFX11-NEXT: s_lshr_b32 s2, s4, s2
+; GFX11-NEXT: s_or_b32 s1, s3, s1
+; GFX11-NEXT: s_or_b32 s0, s0, s2
+; GFX11-NEXT: s_and_b32 s1, s1, 0xff
; GFX11-NEXT: s_and_b32 s0, s0, 0xff
; GFX11-NEXT: s_lshl_b32 s1, s1, 8
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -1755,63 +1700,63 @@ define i24 @v_fshl_i24(i24 %lhs, i24 %rhs, i24 %amt) {
define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 inreg %amt.arg) {
; GFX6-LABEL: s_fshl_v2i24:
; GFX6: ; %bb.0:
+; GFX6-NEXT: s_bfe_u32 s7, s0, 0x80008
+; GFX6-NEXT: s_and_b32 s6, s0, 0xff
+; GFX6-NEXT: s_lshl_b32 s7, s7, 8
+; GFX6-NEXT: s_or_b32 s6, s6, s7
+; GFX6-NEXT: s_bfe_u32 s7, s0, 0x80010
; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, 24
+; GFX6-NEXT: s_and_b32 s7, 0xffff, s7
; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2
-; GFX6-NEXT: s_bfe_u32 s9, s0, 0x80008
-; GFX6-NEXT: s_lshr_b32 s6, s0, 16
-; GFX6-NEXT: s_and_b32 s8, s0, 0xff
-; GFX6-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
-; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2
-; GFX6-NEXT: s_lshl_b32 s9, s9, 8
-; GFX6-NEXT: s_lshr_b32 s7, s1, 8
-; GFX6-NEXT: s_or_b32 s8, s8, s9
-; GFX6-NEXT: s_and_b32 s6, s6, 0xff
-; GFX6-NEXT: s_and_b32 s1, s1, 0xff
+; GFX6-NEXT: s_and_b32 s6, 0xffff, s6
+; GFX6-NEXT: s_lshl_b32 s7, s7, 16
+; GFX6-NEXT: s_or_b32 s6, s6, s7
+; GFX6-NEXT: s_and_b32 s7, s1, 0xff
; GFX6-NEXT: v_mov_b32_e32 v0, s0
-; GFX6-NEXT: s_and_b32 s8, 0xffff, s8
-; GFX6-NEXT: s_lshl_b32 s6, s6, 16
-; GFX6-NEXT: v_alignbit_b32 v0, s1, v0, 24
-; GFX6-NEXT: s_and_b32 s0, s7, 0xff
-; GFX6-NEXT: v_not_b32_e32 v3, 23
-; GFX6-NEXT: s_or_b32 s6, s8, s6
+; GFX6-NEXT: s_bfe_u32 s0, s1, 0x80008
+; GFX6-NEXT: v_alignbit_b32 v0, s7, v0, 24
+; GFX6-NEXT: s_and_b32 s0, 0xffff, s0
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX6-NEXT: s_lshl_b32 s0, s0, 16
-; GFX6-NEXT: s_bfe_u32 s8, s2, 0x80008
-; GFX6-NEXT: v_mul_lo_u32 v4, v2, v3
+; GFX6-NEXT: s_bfe_u32 s1, s2, 0x80008
+; GFX6-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
; GFX6-NEXT: v_or_b32_e32 v0, s0, v0
-; GFX6-NEXT: s_lshr_b32 s0, s2, 16
-; GFX6-NEXT: s_and_b32 s7, s2, 0xff
-; GFX6-NEXT: s_lshl_b32 s8, s8, 8
-; GFX6-NEXT: s_lshr_b32 s1, s3, 8
-; GFX6-NEXT: s_or_b32 s7, s7, s8
-; GFX6-NEXT: s_and_b32 s0, s0, 0xff
-; GFX6-NEXT: s_and_b32 s3, s3, 0xff
+; GFX6-NEXT: s_and_b32 s0, s2, 0xff
+; GFX6-NEXT: s_lshl_b32 s1, s1, 8
+; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2
+; GFX6-NEXT: s_or_b32 s0, s0, s1
+; GFX6-NEXT: s_bfe_u32 s1, s2, 0x80010
+; GFX6-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX6-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX6-NEXT: s_lshl_b32 s1, s1, 16
+; GFX6-NEXT: v_not_b32_e32 v3, 23
+; GFX6-NEXT: s_or_b32 s0, s0, s1
+; GFX6-NEXT: s_and_b32 s1, s3, 0xff
; GFX6-NEXT: v_mov_b32_e32 v1, s2
-; GFX6-NEXT: s_and_b32 s7, 0xffff, s7
-; GFX6-NEXT: s_lshl_b32 s0, s0, 16
-; GFX6-NEXT: v_alignbit_b32 v1, s3, v1, 24
-; GFX6-NEXT: s_and_b32 s1, s1, 0xff
-; GFX6-NEXT: s_or_b32 s0, s7, s0
+; GFX6-NEXT: v_mul_lo_u32 v4, v2, v3
+; GFX6-NEXT: v_alignbit_b32 v1, s1, v1, 24
+; GFX6-NEXT: s_bfe_u32 s1, s3, 0x80008
+; GFX6-NEXT: s_and_b32 s1, 0xffff, s1
; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX6-NEXT: s_lshl_b32 s1, s1, 16
-; GFX6-NEXT: s_bfe_u32 s7, s4, 0x80008
-; GFX6-NEXT: v_mul_hi_u32 v4, v2, v4
+; GFX6-NEXT: s_bfe_u32 s2, s4, 0x80008
; GFX6-NEXT: v_or_b32_e32 v1, s1, v1
-; GFX6-NEXT: s_lshr_b32 s1, s4, 16
-; GFX6-NEXT: s_and_b32 s3, s4, 0xff
-; GFX6-NEXT: s_lshl_b32 s7, s7, 8
-; GFX6-NEXT: s_or_b32 s3, s3, s7
-; GFX6-NEXT: s_and_b32 s1, s1, 0xff
-; GFX6-NEXT: s_and_b32 s3, 0xffff, s3
-; GFX6-NEXT: s_lshl_b32 s1, s1, 16
-; GFX6-NEXT: s_or_b32 s1, s3, s1
+; GFX6-NEXT: s_and_b32 s1, s4, 0xff
+; GFX6-NEXT: s_lshl_b32 s2, s2, 8
+; GFX6-NEXT: v_mul_hi_u32 v4, v2, v4
+; GFX6-NEXT: s_or_b32 s1, s1, s2
+; GFX6-NEXT: s_bfe_u32 s2, s4, 0x80010
+; GFX6-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX6-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX6-NEXT: s_lshl_b32 s2, s2, 16
+; GFX6-NEXT: s_or_b32 s1, s1, s2
; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4
; GFX6-NEXT: v_mul_hi_u32 v4, s1, v2
-; GFX6-NEXT: s_lshr_b32 s2, s5, 8
-; GFX6-NEXT: s_and_b32 s3, s5, 0xff
+; GFX6-NEXT: s_and_b32 s2, s5, 0xff
; GFX6-NEXT: v_mov_b32_e32 v5, s4
-; GFX6-NEXT: v_alignbit_b32 v5, s3, v5, 24
-; GFX6-NEXT: s_and_b32 s2, s2, 0xff
+; GFX6-NEXT: v_alignbit_b32 v5, s2, v5, 24
+; GFX6-NEXT: s_bfe_u32 s2, s5, 0x80008
+; GFX6-NEXT: s_and_b32 s2, 0xffff, s2
; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX6-NEXT: v_mul_lo_u32 v4, v4, 24
; GFX6-NEXT: s_lshl_b32 s2, s2, 16
@@ -1866,67 +1811,67 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
;
; GFX8-LABEL: s_fshl_v2i24:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_lshr_b32 s6, s0, 8
-; GFX8-NEXT: s_and_b32 s6, s6, 0xff
; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, 24
-; GFX8-NEXT: s_lshr_b32 s7, s0, 16
-; GFX8-NEXT: s_lshr_b32 s8, s0, 24
-; GFX8-NEXT: s_and_b32 s0, s0, 0xff
-; GFX8-NEXT: s_lshl_b32 s6, s6, 8
; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GFX8-NEXT: s_or_b32 s0, s0, s6
-; GFX8-NEXT: s_and_b32 s6, s7, 0xff
-; GFX8-NEXT: s_lshr_b32 s9, s1, 8
-; GFX8-NEXT: s_lshl_b32 s6, s6, 16
-; GFX8-NEXT: s_and_b32 s1, s1, 0xff
-; GFX8-NEXT: s_or_b32 s0, s0, s6
-; GFX8-NEXT: s_lshl_b32 s1, s1, 8
-; GFX8-NEXT: s_and_b32 s6, s9, 0xff
-; GFX8-NEXT: s_or_b32 s1, s8, s1
-; GFX8-NEXT: s_lshl_b32 s6, s6, 16
+; GFX8-NEXT: s_bfe_u32 s8, s0, 0x80008
+; GFX8-NEXT: s_lshr_b32 s6, s0, 24
+; GFX8-NEXT: s_and_b32 s7, s0, 0xff
+; GFX8-NEXT: s_lshl_b32 s8, s8, 8
+; GFX8-NEXT: s_bfe_u32 s0, s0, 0x80010
+; GFX8-NEXT: s_or_b32 s7, s7, s8
+; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX8-NEXT: s_and_b32 s7, 0xffff, s7
+; GFX8-NEXT: s_lshl_b32 s0, s0, 16
; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
-; GFX8-NEXT: s_or_b32 s1, s1, s6
-; GFX8-NEXT: s_lshr_b32 s6, s2, 8
+; GFX8-NEXT: s_or_b32 s0, s7, s0
+; GFX8-NEXT: s_and_b32 s7, s1, 0xff
+; GFX8-NEXT: s_bfe_u32 s1, s1, 0x80008
; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX8-NEXT: s_and_b32 s6, s6, 0xff
-; GFX8-NEXT: s_lshr_b32 s7, s2, 16
-; GFX8-NEXT: s_lshr_b32 s8, s2, 24
-; GFX8-NEXT: s_and_b32 s2, s2, 0xff
-; GFX8-NEXT: s_lshl_b32 s6, s6, 8
-; GFX8-NEXT: s_or_b32 s2, s2, s6
-; GFX8-NEXT: s_and_b32 s6, s7, 0xff
+; GFX8-NEXT: s_lshl_b32 s7, s7, 8
+; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX8-NEXT: s_or_b32 s6, s6, s7
+; GFX8-NEXT: s_lshl_b32 s1, s1, 16
+; GFX8-NEXT: s_bfe_u32 s8, s2, 0x80008
+; GFX8-NEXT: s_or_b32 s1, s6, s1
+; GFX8-NEXT: s_lshr_b32 s6, s2, 24
+; GFX8-NEXT: s_and_b32 s7, s2, 0xff
+; GFX8-NEXT: s_lshl_b32 s8, s8, 8
+; GFX8-NEXT: s_bfe_u32 s2, s2, 0x80010
; GFX8-NEXT: v_not_b32_e32 v1, 23
-; GFX8-NEXT: s_lshr_b32 s9, s3, 8
-; GFX8-NEXT: s_lshl_b32 s6, s6, 16
-; GFX8-NEXT: s_and_b32 s3, s3, 0xff
+; GFX8-NEXT: s_or_b32 s7, s7, s8
+; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
; GFX8-NEXT: v_mul_lo_u32 v2, v0, v1
-; GFX8-NEXT: s_or_b32 s2, s2, s6
-; GFX8-NEXT: s_lshl_b32 s3, s3, 8
-; GFX8-NEXT: s_and_b32 s6, s9, 0xff
-; GFX8-NEXT: s_or_b32 s3, s8, s3
-; GFX8-NEXT: s_lshl_b32 s6, s6, 16
-; GFX8-NEXT: s_or_b32 s3, s3, s6
-; GFX8-NEXT: s_lshr_b32 s6, s4, 8
-; GFX8-NEXT: s_and_b32 s6, s6, 0xff
+; GFX8-NEXT: s_and_b32 s7, 0xffff, s7
+; GFX8-NEXT: s_lshl_b32 s2, s2, 16
+; GFX8-NEXT: s_or_b32 s2, s7, s2
+; GFX8-NEXT: s_and_b32 s7, s3, 0xff
+; GFX8-NEXT: s_bfe_u32 s3, s3, 0x80008
+; GFX8-NEXT: s_lshl_b32 s7, s7, 8
+; GFX8-NEXT: s_and_b32 s3, 0xffff, s3
+; GFX8-NEXT: s_or_b32 s6, s6, s7
+; GFX8-NEXT: s_lshl_b32 s3, s3, 16
+; GFX8-NEXT: s_bfe_u32 s8, s4, 0x80008
; GFX8-NEXT: v_mul_hi_u32 v2, v0, v2
-; GFX8-NEXT: s_lshr_b32 s7, s4, 16
-; GFX8-NEXT: s_lshr_b32 s8, s4, 24
-; GFX8-NEXT: s_and_b32 s4, s4, 0xff
-; GFX8-NEXT: s_lshl_b32 s6, s6, 8
-; GFX8-NEXT: s_or_b32 s4, s4, s6
-; GFX8-NEXT: s_and_b32 s6, s7, 0xff
-; GFX8-NEXT: s_lshl_b32 s6, s6, 16
-; GFX8-NEXT: s_or_b32 s4, s4, s6
+; GFX8-NEXT: s_or_b32 s3, s6, s3
+; GFX8-NEXT: s_lshr_b32 s6, s4, 24
+; GFX8-NEXT: s_and_b32 s7, s4, 0xff
+; GFX8-NEXT: s_lshl_b32 s8, s8, 8
+; GFX8-NEXT: s_bfe_u32 s4, s4, 0x80010
+; GFX8-NEXT: s_or_b32 s7, s7, s8
+; GFX8-NEXT: s_and_b32 s4, 0xffff, s4
+; GFX8-NEXT: s_and_b32 s7, 0xffff, s7
+; GFX8-NEXT: s_lshl_b32 s4, s4, 16
+; GFX8-NEXT: s_or_b32 s4, s7, s4
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT: v_mul_hi_u32 v2, s4, v0
-; GFX8-NEXT: s_lshr_b32 s9, s5, 8
-; GFX8-NEXT: s_and_b32 s5, s5, 0xff
-; GFX8-NEXT: s_lshl_b32 s5, s5, 8
+; GFX8-NEXT: s_and_b32 s7, s5, 0xff
+; GFX8-NEXT: s_bfe_u32 s5, s5, 0x80008
+; GFX8-NEXT: s_lshl_b32 s7, s7, 8
; GFX8-NEXT: v_mul_lo_u32 v2, v2, 24
-; GFX8-NEXT: s_and_b32 s6, s9, 0xff
-; GFX8-NEXT: s_or_b32 s5, s8, s5
-; GFX8-NEXT: s_lshl_b32 s6, s6, 16
-; GFX8-NEXT: s_or_b32 s5, s5, s6
+; GFX8-NEXT: s_and_b32 s5, 0xffff, s5
+; GFX8-NEXT: s_or_b32 s6, s6, s7
+; GFX8-NEXT: s_lshl_b32 s5, s5, 16
+; GFX8-NEXT: s_or_b32 s5, s6, s5
; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s4, v2
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v2, v1
; GFX8-NEXT: v_mul_hi_u32 v0, s5, v0
@@ -1974,67 +1919,67 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
;
; GFX9-LABEL: s_fshl_v2i24:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_lshr_b32 s6, s0, 8
-; GFX9-NEXT: s_and_b32 s6, s6, 0xff
; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, 24
-; GFX9-NEXT: s_lshr_b32 s7, s0, 16
-; GFX9-NEXT: s_lshr_b32 s8, s0, 24
-; GFX9-NEXT: s_and_b32 s0, s0, 0xff
-; GFX9-NEXT: s_lshl_b32 s6, s6, 8
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT: s_or_b32 s0, s0, s6
-; GFX9-NEXT: s_and_b32 s6, s7, 0xff
-; GFX9-NEXT: s_lshr_b32 s9, s1, 8
-; GFX9-NEXT: s_lshl_b32 s6, s6, 16
-; GFX9-NEXT: s_and_b32 s1, s1, 0xff
-; GFX9-NEXT: s_or_b32 s0, s0, s6
-; GFX9-NEXT: s_lshl_b32 s1, s1, 8
-; GFX9-NEXT: s_and_b32 s6, s9, 0xff
-; GFX9-NEXT: s_or_b32 s1, s8, s1
-; GFX9-NEXT: s_lshl_b32 s6, s6, 16
+; GFX9-NEXT: s_bfe_u32 s8, s0, 0x80008
+; GFX9-NEXT: s_lshr_b32 s6, s0, 24
+; GFX9-NEXT: s_and_b32 s7, s0, 0xff
+; GFX9-NEXT: s_lshl_b32 s8, s8, 8
+; GFX9-NEXT: s_bfe_u32 s0, s0, 0x80010
+; GFX9-NEXT: s_or_b32 s7, s7, s8
+; GFX9-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX9-NEXT: s_and_b32 s7, 0xffff, s7
+; GFX9-NEXT: s_lshl_b32 s0, s0, 16
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
-; GFX9-NEXT: s_or_b32 s1, s1, s6
-; GFX9-NEXT: s_lshr_b32 s6, s2, 8
+; GFX9-NEXT: s_or_b32 s0, s7, s0
+; GFX9-NEXT: s_and_b32 s7, s1, 0xff
+; GFX9-NEXT: s_bfe_u32 s1, s1, 0x80008
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT: s_and_b32 s6, s6, 0xff
-; GFX9-NEXT: s_lshr_b32 s7, s2, 16
-; GFX9-NEXT: s_lshr_b32 s8, s2, 24
-; GFX9-NEXT: s_and_b32 s2, s2, 0xff
-; GFX9-NEXT: s_lshl_b32 s6, s6, 8
-; GFX9-NEXT: s_or_b32 s2, s2, s6
-; GFX9-NEXT: s_and_b32 s6, s7, 0xff
+; GFX9-NEXT: s_lshl_b32 s7, s7, 8
+; GFX9-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX9-NEXT: s_or_b32 s6, s6, s7
+; GFX9-NEXT: s_lshl_b32 s1, s1, 16
+; GFX9-NEXT: s_bfe_u32 s8, s2, 0x80008
+; GFX9-NEXT: s_or_b32 s1, s6, s1
+; GFX9-NEXT: s_lshr_b32 s6, s2, 24
+; GFX9-NEXT: s_and_b32 s7, s2, 0xff
+; GFX9-NEXT: s_lshl_b32 s8, s8, 8
+; GFX9-NEXT: s_bfe_u32 s2, s2, 0x80010
; GFX9-NEXT: v_not_b32_e32 v1, 23
-; GFX9-NEXT: s_lshr_b32 s9, s3, 8
-; GFX9-NEXT: s_lshl_b32 s6, s6, 16
-; GFX9-NEXT: s_and_b32 s3, s3, 0xff
+; GFX9-NEXT: s_or_b32 s7, s7, s8
+; GFX9-NEXT: s_and_b32 s2, 0xffff, s2
; GFX9-NEXT: v_mul_lo_u32 v1, v0, v1
-; GFX9-NEXT: s_or_b32 s2, s2, s6
-; GFX9-NEXT: s_lshl_b32 s3, s3, 8
-; GFX9-NEXT: s_and_b32 s6, s9, 0xff
-; GFX9-NEXT: s_or_b32 s3, s8, s3
-; GFX9-NEXT: s_lshl_b32 s6, s6, 16
-; GFX9-NEXT: s_or_b32 s3, s3, s6
-; GFX9-NEXT: s_lshr_b32 s6, s4, 8
-; GFX9-NEXT: s_and_b32 s6, s6, 0xff
+; GFX9-NEXT: s_and_b32 s7, 0xffff, s7
+; GFX9-NEXT: s_lshl_b32 s2, s2, 16
+; GFX9-NEXT: s_or_b32 s2, s7, s2
+; GFX9-NEXT: s_and_b32 s7, s3, 0xff
+; GFX9-NEXT: s_bfe_u32 s3, s3, 0x80008
+; GFX9-NEXT: s_lshl_b32 s7, s7, 8
+; GFX9-NEXT: s_and_b32 s3, 0xffff, s3
+; GFX9-NEXT: s_or_b32 s6, s6, s7
+; GFX9-NEXT: s_lshl_b32 s3, s3, 16
+; GFX9-NEXT: s_bfe_u32 s8, s4, 0x80008
; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1
-; GFX9-NEXT: s_lshr_b32 s7, s4, 16
-; GFX9-NEXT: s_lshr_b32 s8, s4, 24
-; GFX9-NEXT: s_and_b32 s4, s4, 0xff
-; GFX9-NEXT: s_lshl_b32 s6, s6, 8
-; GFX9-NEXT: s_or_b32 s4, s4, s6
-; GFX9-NEXT: s_and_b32 s6, s7, 0xff
-; GFX9-NEXT: s_lshl_b32 s6, s6, 16
-; GFX9-NEXT: s_or_b32 s4, s4, s6
+; GFX9-NEXT: s_or_b32 s3, s6, s3
+; GFX9-NEXT: s_lshr_b32 s6, s4, 24
+; GFX9-NEXT: s_and_b32 s7, s4, 0xff
+; GFX9-NEXT: s_lshl_b32 s8, s8, 8
+; GFX9-NEXT: s_bfe_u32 s4, s4, 0x80010
+; GFX9-NEXT: s_or_b32 s7, s7, s8
+; GFX9-NEXT: s_and_b32 s4, 0xffff, s4
+; GFX9-NEXT: s_and_b32 s7, 0xffff, s7
+; GFX9-NEXT: s_lshl_b32 s4, s4, 16
+; GFX9-NEXT: s_or_b32 s4, s7, s4
; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
; GFX9-NEXT: v_mul_hi_u32 v1, s4, v0
-; GFX9-NEXT: s_lshr_b32 s9, s5, 8
-; GFX9-NEXT: s_and_b32 s5, s5, 0xff
-; GFX9-NEXT: s_lshl_b32 s5, s5, 8
+; GFX9-NEXT: s_and_b32 s7, s5, 0xff
+; GFX9-NEXT: s_bfe_u32 s5, s5, 0x80008
+; GFX9-NEXT: s_lshl_b32 s7, s7, 8
; GFX9-NEXT: v_mul_lo_u32 v1, v1, 24
-; GFX9-NEXT: s_and_b32 s6, s9, 0xff
-; GFX9-NEXT: s_or_b32 s5, s8, s5
-; GFX9-NEXT: s_lshl_b32 s6, s6, 16
-; GFX9-NEXT: s_or_b32 s5, s5, s6
+; GFX9-NEXT: s_and_b32 s5, 0xffff, s5
+; GFX9-NEXT: s_or_b32 s6, s6, s7
+; GFX9-NEXT: s_lshl_b32 s5, s5, 16
+; GFX9-NEXT: s_or_b32 s5, s6, s5
; GFX9-NEXT: v_mul_hi_u32 v0, s5, v0
; GFX9-NEXT: v_sub_u32_e32 v1, s4, v1
; GFX9-NEXT: v_add_u32_e32 v2, 0xffffffe8, v1
@@ -2082,78 +2027,80 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
; GFX10-LABEL: s_fshl_v2i24:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, 24
-; GFX10-NEXT: s_lshr_b32 s10, s4, 8
-; GFX10-NEXT: s_lshr_b32 s11, s4, 16
-; GFX10-NEXT: s_and_b32 s10, s10, 0xff
-; GFX10-NEXT: s_lshr_b32 s12, s4, 24
+; GFX10-NEXT: s_bfe_u32 s12, s4, 0x80008
+; GFX10-NEXT: s_lshr_b32 s10, s4, 24
+; GFX10-NEXT: s_and_b32 s11, s4, 0xff
+; GFX10-NEXT: s_bfe_u32 s4, s4, 0x80010
; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GFX10-NEXT: s_and_b32 s4, s4, 0xff
-; GFX10-NEXT: s_and_b32 s11, s11, 0xff
-; GFX10-NEXT: s_lshl_b32 s10, s10, 8
-; GFX10-NEXT: s_lshl_b32 s11, s11, 16
-; GFX10-NEXT: s_or_b32 s4, s4, s10
-; GFX10-NEXT: s_lshr_b32 s13, s5, 8
-; GFX10-NEXT: s_and_b32 s5, s5, 0xff
-; GFX10-NEXT: s_or_b32 s4, s4, s11
-; GFX10-NEXT: s_lshl_b32 s5, s5, 8
+; GFX10-NEXT: s_lshl_b32 s12, s12, 8
+; GFX10-NEXT: s_and_b32 s4, 0xffff, s4
+; GFX10-NEXT: s_or_b32 s11, s11, s12
+; GFX10-NEXT: s_lshl_b32 s4, s4, 16
+; GFX10-NEXT: s_and_b32 s11, 0xffff, s11
+; GFX10-NEXT: s_and_b32 s13, s5, 0xff
+; GFX10-NEXT: s_bfe_u32 s5, s5, 0x80008
+; GFX10-NEXT: s_or_b32 s4, s11, s4
+; GFX10-NEXT: s_lshl_b32 s13, s13, 8
; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
-; GFX10-NEXT: s_and_b32 s13, s13, 0xff
-; GFX10-NEXT: s_or_b32 s5, s12, s5
-; GFX10-NEXT: s_lshl_b32 s10, s13, 16
-; GFX10-NEXT: s_lshr_b32 s9, s1, 8
+; GFX10-NEXT: s_and_b32 s5, 0xffff, s5
+; GFX10-NEXT: s_or_b32 s10, s10, s13
+; GFX10-NEXT: s_lshl_b32 s5, s5, 16
+; GFX10-NEXT: s_bfe_u32 s8, s0, 0x80008
; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX10-NEXT: s_or_b32 s5, s5, s10
-; GFX10-NEXT: s_and_b32 s1, s1, 0xff
-; GFX10-NEXT: s_lshr_b32 s10, s2, 8
-; GFX10-NEXT: s_lshr_b32 s8, s0, 24
+; GFX10-NEXT: s_or_b32 s5, s10, s5
+; GFX10-NEXT: s_bfe_u32 s12, s2, 0x80008
+; GFX10-NEXT: s_and_b32 s7, s0, 0xff
+; GFX10-NEXT: s_lshr_b32 s10, s2, 24
; GFX10-NEXT: v_mul_lo_u32 v2, 0xffffffe8, v0
-; GFX10-NEXT: s_lshr_b32 s11, s2, 16
-; GFX10-NEXT: s_lshr_b32 s13, s3, 8
-; GFX10-NEXT: s_and_b32 s3, s3, 0xff
-; GFX10-NEXT: s_lshl_b32 s1, s1, 8
-; GFX10-NEXT: s_and_b32 s10, s10, 0xff
-; GFX10-NEXT: s_lshr_b32 s12, s2, 24
-; GFX10-NEXT: s_and_b32 s2, s2, 0xff
+; GFX10-NEXT: s_and_b32 s11, s2, 0xff
+; GFX10-NEXT: s_bfe_u32 s2, s2, 0x80010
+; GFX10-NEXT: s_and_b32 s13, s3, 0xff
+; GFX10-NEXT: s_bfe_u32 s3, s3, 0x80008
+; GFX10-NEXT: s_lshl_b32 s8, s8, 8
+; GFX10-NEXT: s_lshl_b32 s12, s12, 8
+; GFX10-NEXT: s_and_b32 s2, 0xffff, s2
; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2
-; GFX10-NEXT: s_lshl_b32 s3, s3, 8
-; GFX10-NEXT: s_or_b32 s1, s8, s1
-; GFX10-NEXT: s_lshl_b32 s8, s10, 8
-; GFX10-NEXT: s_or_b32 s3, s12, s3
-; GFX10-NEXT: s_or_b32 s2, s2, s8
-; GFX10-NEXT: s_lshr_b32 s6, s0, 8
-; GFX10-NEXT: s_lshr_b32 s7, s0, 16
+; GFX10-NEXT: s_and_b32 s3, 0xffff, s3
+; GFX10-NEXT: s_lshl_b32 s2, s2, 16
+; GFX10-NEXT: s_lshl_b32 s3, s3, 16
+; GFX10-NEXT: s_lshr_b32 s6, s0, 24
+; GFX10-NEXT: s_bfe_u32 s0, s0, 0x80010
+; GFX10-NEXT: s_and_b32 s9, s1, 0xff
+; GFX10-NEXT: s_bfe_u32 s1, s1, 0x80008
; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2
-; GFX10-NEXT: s_and_b32 s6, s6, 0xff
-; GFX10-NEXT: s_and_b32 s0, s0, 0xff
-; GFX10-NEXT: s_and_b32 s7, s7, 0xff
-; GFX10-NEXT: s_and_b32 s9, s9, 0xff
+; GFX10-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX10-NEXT: s_lshl_b32 s9, s9, 8
+; GFX10-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX10-NEXT: s_lshl_b32 s0, s0, 16
; GFX10-NEXT: v_mul_hi_u32 v2, s4, v0
; GFX10-NEXT: v_mul_hi_u32 v0, s5, v0
-; GFX10-NEXT: s_lshl_b32 s6, s6, 8
-; GFX10-NEXT: s_lshl_b32 s7, s7, 16
-; GFX10-NEXT: s_or_b32 s0, s0, s6
+; GFX10-NEXT: s_or_b32 s6, s6, s9
+; GFX10-NEXT: s_lshl_b32 s1, s1, 16
; GFX10-NEXT: v_mov_b32_e32 v1, 8
-; GFX10-NEXT: s_or_b32 s0, s0, s7
+; GFX10-NEXT: s_or_b32 s1, s6, s1
; GFX10-NEXT: v_mul_lo_u32 v2, v2, 24
; GFX10-NEXT: v_mul_lo_u32 v0, v0, 24
; GFX10-NEXT: v_sub_nc_u32_e32 v2, s4, v2
; GFX10-NEXT: v_sub_nc_u32_e32 v0, s5, v0
-; GFX10-NEXT: s_and_b32 s4, s11, 0xff
-; GFX10-NEXT: s_and_b32 s5, s13, 0xff
-; GFX10-NEXT: s_lshl_b32 s4, s4, 16
+; GFX10-NEXT: s_lshl_b32 s4, s13, 8
+; GFX10-NEXT: s_or_b32 s5, s7, s8
+; GFX10-NEXT: s_or_b32 s7, s11, s12
; GFX10-NEXT: v_add_nc_u32_e32 v3, 0xffffffe8, v2
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v2
; GFX10-NEXT: v_add_nc_u32_e32 v4, 0xffffffe8, v0
-; GFX10-NEXT: s_lshl_b32 s5, s5, 16
-; GFX10-NEXT: s_or_b32 s2, s2, s4
-; GFX10-NEXT: s_or_b32 s3, s3, s5
+; GFX10-NEXT: s_or_b32 s4, s10, s4
+; GFX10-NEXT: s_and_b32 s7, 0xffff, s7
+; GFX10-NEXT: s_or_b32 s3, s4, s3
; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0
+; GFX10-NEXT: s_or_b32 s2, s7, s2
; GFX10-NEXT: s_lshr_b32 s3, s3, 1
; GFX10-NEXT: s_lshr_b32 s2, s2, 1
; GFX10-NEXT: v_add_nc_u32_e32 v3, 0xffffffe8, v2
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v2
+; GFX10-NEXT: s_and_b32 s5, 0xffff, s5
+; GFX10-NEXT: s_or_b32 s0, s5, s0
; GFX10-NEXT: v_add_nc_u32_e32 v4, 0xffffffe8, v0
; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0
@@ -2165,8 +2112,6 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
; GFX10-NEXT: v_and_b32_e32 v0, 0xffffff, v0
; GFX10-NEXT: v_lshrrev_b32_e64 v3, v3, s2
; GFX10-NEXT: v_and_b32_e32 v4, 0xffffff, v4
-; GFX10-NEXT: s_lshl_b32 s2, s9, 16
-; GFX10-NEXT: s_or_b32 s1, s1, s2
; GFX10-NEXT: v_lshl_or_b32 v2, s0, v2, v3
; GFX10-NEXT: v_lshrrev_b32_e64 v4, v4, s3
; GFX10-NEXT: v_mov_b32_e32 v3, 16
@@ -2187,10 +2132,10 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
; GFX11-LABEL: s_fshl_v2i24:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, 24
-; GFX11-NEXT: s_lshr_b32 s10, s4, 8
-; GFX11-NEXT: s_lshr_b32 s11, s4, 16
-; GFX11-NEXT: s_and_b32 s10, s10, 0xff
-; GFX11-NEXT: s_lshr_b32 s12, s4, 24
+; GFX11-NEXT: s_bfe_u32 s12, s4, 0x80008
+; GFX11-NEXT: s_lshr_b32 s10, s4, 24
+; GFX11-NEXT: s_and_b32 s11, s4, 0xff
+; GFX11-NEXT: s_bfe_u32 s4, s4, 0x80010
; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX11-NEXT: s_and_b32 s4, s4, 0xff
; GFX11-NEXT: s_and_b32 s11, s11, 0xff
@@ -2203,104 +2148,102 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
; GFX11-NEXT: s_lshl_b32 s5, s5, 8
; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
-; GFX11-NEXT: s_and_b32 s13, s13, 0xff
-; GFX11-NEXT: s_or_b32 s5, s12, s5
-; GFX11-NEXT: s_lshl_b32 s10, s13, 16
-; GFX11-NEXT: s_lshr_b32 s9, s1, 8
+; GFX11-NEXT: s_and_b32 s5, 0xffff, s5
+; GFX11-NEXT: s_or_b32 s10, s10, s13
+; GFX11-NEXT: s_lshl_b32 s5, s5, 16
+; GFX11-NEXT: s_bfe_u32 s8, s0, 0x80008
; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX11-NEXT: s_or_b32 s5, s5, s10
-; GFX11-NEXT: s_and_b32 s1, s1, 0xff
-; GFX11-NEXT: s_lshr_b32 s10, s2, 8
-; GFX11-NEXT: s_lshr_b32 s8, s0, 24
+; GFX11-NEXT: s_or_b32 s5, s10, s5
+; GFX11-NEXT: s_bfe_u32 s12, s2, 0x80008
+; GFX11-NEXT: s_and_b32 s7, s0, 0xff
+; GFX11-NEXT: s_lshr_b32 s10, s2, 24
; GFX11-NEXT: v_mul_lo_u32 v1, 0xffffffe8, v0
-; GFX11-NEXT: s_lshr_b32 s11, s2, 16
-; GFX11-NEXT: s_lshl_b32 s1, s1, 8
-; GFX11-NEXT: s_and_b32 s9, s9, 0xff
-; GFX11-NEXT: s_and_b32 s10, s10, 0xff
-; GFX11-NEXT: s_lshr_b32 s12, s2, 24
-; GFX11-NEXT: s_and_b32 s2, s2, 0xff
-; GFX11-NEXT: s_or_b32 s1, s8, s1
+; GFX11-NEXT: s_and_b32 s11, s2, 0xff
+; GFX11-NEXT: s_bfe_u32 s2, s2, 0x80010
+; GFX11-NEXT: s_lshl_b32 s8, s8, 8
+; GFX11-NEXT: s_lshl_b32 s12, s12, 8
+; GFX11-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX11-NEXT: s_and_b32 s13, s3, 0xff
+; GFX11-NEXT: s_lshl_b32 s2, s2, 16
; GFX11-NEXT: v_mul_hi_u32 v1, v0, v1
-; GFX11-NEXT: s_lshl_b32 s8, s9, 16
-; GFX11-NEXT: s_lshl_b32 s9, s10, 8
-; GFX11-NEXT: s_lshr_b32 s6, s0, 8
-; GFX11-NEXT: s_or_b32 s2, s2, s9
-; GFX11-NEXT: s_lshr_b32 s13, s3, 8
-; GFX11-NEXT: s_and_b32 s3, s3, 0xff
-; GFX11-NEXT: s_lshr_b32 s7, s0, 16
+; GFX11-NEXT: s_bfe_u32 s3, s3, 0x80008
+; GFX11-NEXT: s_lshr_b32 s6, s0, 24
+; GFX11-NEXT: s_bfe_u32 s0, s0, 0x80010
+; GFX11-NEXT: s_and_b32 s3, 0xffff, s3
+; GFX11-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX11-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-NEXT: s_lshl_b32 s0, s0, 16
; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v1
-; GFX11-NEXT: s_and_b32 s6, s6, 0xff
-; GFX11-NEXT: s_lshl_b32 s3, s3, 8
-; GFX11-NEXT: s_and_b32 s0, s0, 0xff
-; GFX11-NEXT: s_and_b32 s7, s7, 0xff
+; GFX11-NEXT: s_and_b32 s9, s1, 0xff
+; GFX11-NEXT: s_bfe_u32 s1, s1, 0x80008
+; GFX11-NEXT: s_lshl_b32 s9, s9, 8
+; GFX11-NEXT: s_and_b32 s1, 0xffff, s1
; GFX11-NEXT: v_mul_hi_u32 v1, s4, v0
; GFX11-NEXT: v_mul_hi_u32 v0, s5, v0
-; GFX11-NEXT: s_lshl_b32 s6, s6, 8
-; GFX11-NEXT: s_or_b32 s3, s12, s3
-; GFX11-NEXT: s_lshl_b32 s7, s7, 16
-; GFX11-NEXT: s_or_b32 s0, s0, s6
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: s_or_b32 s0, s0, s7
-; GFX11-NEXT: v_mul_lo_u32 v1, v1, 24
+; GFX11-NEXT: s_or_b32 s6, s6, s9
+; GFX11-NEXT: s_lshl_b32 s1, s1, 16
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_mul_lo_u32 v1, v1, 24
; GFX11-NEXT: v_mul_lo_u32 v0, v0, 24
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_sub_nc_u32_e32 v1, s4, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_sub_nc_u32_e32 v0, s5, v0
-; GFX11-NEXT: s_and_b32 s4, s11, 0xff
-; GFX11-NEXT: s_and_b32 s5, s13, 0xff
-; GFX11-NEXT: s_lshl_b32 s4, s4, 16
+; GFX11-NEXT: s_or_b32 s5, s7, s8
+; GFX11-NEXT: s_or_b32 s7, s11, s12
+; GFX11-NEXT: s_lshl_b32 s4, s13, 8
; GFX11-NEXT: v_add_nc_u32_e32 v2, 0xffffffe8, v1
; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1
; GFX11-NEXT: v_add_nc_u32_e32 v3, 0xffffffe8, v0
-; GFX11-NEXT: s_or_b32 s2, s2, s4
-; GFX11-NEXT: s_lshl_b32 s5, s5, 16
-; GFX11-NEXT: s_lshr_b32 s2, s2, 1
+; GFX11-NEXT: s_and_b32 s7, 0xffff, s7
+; GFX11-NEXT: s_or_b32 s4, s10, s4
+; GFX11-NEXT: s_or_b32 s2, s7, s2
; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0
-; GFX11-NEXT: s_or_b32 s3, s3, s5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: s_lshr_b32 s2, s2, 1
+; GFX11-NEXT: s_and_b32 s5, 0xffff, s5
+; GFX11-NEXT: s_or_b32 s3, s4, s3
; GFX11-NEXT: v_add_nc_u32_e32 v2, 0xffffffe8, v1
; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1
+; GFX11-NEXT: s_or_b32 s0, s5, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_sub_nc_u32_e32 v2, 23, v1
; GFX11-NEXT: v_and_b32_e32 v1, 0xffffff, v1
; GFX11-NEXT: v_add_nc_u32_e32 v3, 0xffffffe8, v0
; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_and_b32_e32 v2, 0xffffff, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_lshrrev_b32_e64 v2, v2, s2
; GFX11-NEXT: s_lshr_b32 s2, s3, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_sub_nc_u32_e32 v3, 23, v0
; GFX11-NEXT: v_and_b32_e32 v0, 0xffffff, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_lshl_or_b32 v1, s0, v1, v2
-; GFX11-NEXT: s_or_b32 s0, s1, s8
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_or_b32 s0, s6, s1
; GFX11-NEXT: v_and_b32_e32 v3, 0xffffff, v3
-; GFX11-NEXT: v_bfe_u32 v2, v1, 8, 8
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v2, v1, 8, 8
; GFX11-NEXT: v_lshrrev_b32_e64 v3, v3, s2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_lshl_or_b32 v0, s0, v0, v3
; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 8
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_and_or_b32 v1, 0xff, v1, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v0
; GFX11-NEXT: v_bfe_u32 v5, v0, 8, 8
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX11-NEXT: v_bfe_u32 v0, v0, 16, 8
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 24, v4
-; GFX11-NEXT: v_lshl_or_b32 v0, v0, 8, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_lshl_or_b32 v0, v0, 8, v5
; GFX11-NEXT: v_or3_b32 v1, v1, v3, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_readfirstlane_b32 s1, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_readfirstlane_b32 s0, v1
; GFX11-NEXT: ; return to shader part epilog
%lhs = bitcast i48 %lhs.arg to <2 x i24>
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
index 1e762f9a927f8..a68916135013a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
@@ -462,42 +462,17 @@ define i8 @v_fshr_i8(i8 %lhs, i8 %rhs, i8 %amt) {
}
define amdgpu_ps i8 @s_fshr_i8_4(i8 inreg %lhs, i8 inreg %rhs) {
-; GFX6-LABEL: s_fshr_i8_4:
-; GFX6: ; %bb.0:
-; GFX6-NEXT: s_lshl_b32 s0, s0, 4
-; GFX6-NEXT: s_bfe_u32 s1, s1, 0x40004
-; GFX6-NEXT: s_or_b32 s0, s0, s1
-; GFX6-NEXT: ; return to shader part epilog
-;
-; GFX8-LABEL: s_fshr_i8_4:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_and_b32 s1, s1, 0xff
-; GFX8-NEXT: s_lshl_b32 s0, s0, 4
-; GFX8-NEXT: s_lshr_b32 s1, s1, 4
-; GFX8-NEXT: s_or_b32 s0, s0, s1
-; GFX8-NEXT: ; return to shader part epilog
-;
-; GFX9-LABEL: s_fshr_i8_4:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_and_b32 s1, s1, 0xff
-; GFX9-NEXT: s_lshl_b32 s0, s0, 4
-; GFX9-NEXT: s_lshr_b32 s1, s1, 4
-; GFX9-NEXT: s_or_b32 s0, s0, s1
-; GFX9-NEXT: ; return to shader part epilog
-;
-; GFX10-LABEL: s_fshr_i8_4:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_and_b32 s1, s1, 0xff
-; GFX10-NEXT: s_lshl_b32 s0, s0, 4
-; GFX10-NEXT: s_lshr_b32 s1, s1, 4
-; GFX10-NEXT: s_or_b32 s0, s0, s1
-; GFX10-NEXT: ; return to shader part epilog
+; GCN-LABEL: s_fshr_i8_4:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_lshl_b32 s0, s0, 4
+; GCN-NEXT: s_bfe_u32 s1, s1, 0x40004
+; GCN-NEXT: s_or_b32 s0, s0, s1
+; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_fshr_i8_4:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_and_b32 s1, s1, 0xff
; GFX11-NEXT: s_lshl_b32 s0, s0, 4
-; GFX11-NEXT: s_lshr_b32 s1, s1, 4
+; GFX11-NEXT: s_bfe_u32 s1, s1, 0x40004
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_or_b32 s0, s0, s1
; GFX11-NEXT: ; return to shader part epilog
@@ -555,42 +530,17 @@ define i8 @v_fshr_i8_4(i8 %lhs, i8 %rhs) {
}
define amdgpu_ps i8 @s_fshr_i8_5(i8 inreg %lhs, i8 inreg %rhs) {
-; GFX6-LABEL: s_fshr_i8_5:
-; GFX6: ; %bb.0:
-; GFX6-NEXT: s_lshl_b32 s0, s0, 3
-; GFX6-NEXT: s_bfe_u32 s1, s1, 0x30005
-; GFX6-NEXT: s_or_b32 s0, s0, s1
-; GFX6-NEXT: ; return to shader part epilog
-;
-; GFX8-LABEL: s_fshr_i8_5:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_and_b32 s1, s1, 0xff
-; GFX8-NEXT: s_lshl_b32 s0, s0, 3
-; GFX8-NEXT: s_lshr_b32 s1, s1, 5
-; GFX8-NEXT: s_or_b32 s0, s0, s1
-; GFX8-NEXT: ; return to shader part epilog
-;
-; GFX9-LABEL: s_fshr_i8_5:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_and_b32 s1, s1, 0xff
-; GFX9-NEXT: s_lshl_b32 s0, s0, 3
-; GFX9-NEXT: s_lshr_b32 s1, s1, 5
-; GFX9-NEXT: s_or_b32 s0, s0, s1
-; GFX9-NEXT: ; return to shader part epilog
-;
-; GFX10-LABEL: s_fshr_i8_5:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_and_b32 s1, s1, 0xff
-; GFX10-NEXT: s_lshl_b32 s0, s0, 3
-; GFX10-NEXT: s_lshr_b32 s1, s1, 5
-; GFX10-NEXT: s_or_b32 s0, s0, s1
-; GFX10-NEXT: ; return to shader part epilog
+; GCN-LABEL: s_fshr_i8_5:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_lshl_b32 s0, s0, 3
+; GCN-NEXT: s_bfe_u32 s1, s1, 0x30005
+; GCN-NEXT: s_or_b32 s0, s0, s1
+; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_fshr_i8_5:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_and_b32 s1, s1, 0xff
; GFX11-NEXT: s_lshl_b32 s0, s0, 3
-; GFX11-NEXT: s_lshr_b32 s1, s1, 5
+; GFX11-NEXT: s_bfe_u32 s1, s1, 0x30005
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_or_b32 s0, s0, s1
; GFX11-NEXT: ; return to shader part epilog
@@ -675,22 +625,22 @@ define amdgpu_ps i16 @s_fshr_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in
; GFX8-LABEL: s_fshr_v2i8:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_lshr_b32 s3, s0, 8
-; GFX8-NEXT: s_lshr_b32 s4, s1, 8
-; GFX8-NEXT: s_lshr_b32 s5, s2, 8
; GFX8-NEXT: s_lshl_b32 s0, s0, 1
-; GFX8-NEXT: s_andn2_b32 s6, 7, s2
+; GFX8-NEXT: s_andn2_b32 s5, 7, s2
+; GFX8-NEXT: s_lshr_b32 s4, s2, 8
+; GFX8-NEXT: s_lshl_b32 s0, s0, s5
; GFX8-NEXT: s_and_b32 s2, s2, 7
-; GFX8-NEXT: s_and_b32 s1, s1, 0xff
-; GFX8-NEXT: s_lshl_b32 s0, s0, s6
-; GFX8-NEXT: s_lshr_b32 s1, s1, s2
-; GFX8-NEXT: s_or_b32 s0, s0, s1
-; GFX8-NEXT: s_lshl_b32 s1, s3, 1
-; GFX8-NEXT: s_andn2_b32 s2, 7, s5
-; GFX8-NEXT: s_lshl_b32 s1, s1, s2
-; GFX8-NEXT: s_and_b32 s2, s5, 7
-; GFX8-NEXT: s_and_b32 s3, s4, 0xff
-; GFX8-NEXT: s_lshr_b32 s2, s3, s2
-; GFX8-NEXT: s_or_b32 s1, s1, s2
+; GFX8-NEXT: s_and_b32 s5, s1, 0xff
+; GFX8-NEXT: s_lshr_b32 s2, s5, s2
+; GFX8-NEXT: s_or_b32 s0, s0, s2
+; GFX8-NEXT: s_lshl_b32 s2, s3, 1
+; GFX8-NEXT: s_andn2_b32 s3, 7, s4
+; GFX8-NEXT: s_bfe_u32 s1, s1, 0x80008
+; GFX8-NEXT: s_lshl_b32 s2, s2, s3
+; GFX8-NEXT: s_and_b32 s3, s4, 7
+; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX8-NEXT: s_lshr_b32 s1, s1, s3
+; GFX8-NEXT: s_or_b32 s1, s2, s1
; GFX8-NEXT: s_and_b32 s1, s1, 0xff
; GFX8-NEXT: s_and_b32 s0, s0, 0xff
; GFX8-NEXT: s_lshl_b32 s1, s1, 8
@@ -700,22 +650,22 @@ define amdgpu_ps i16 @s_fshr_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in
; GFX9-LABEL: s_fshr_v2i8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_lshr_b32 s3, s0, 8
-; GFX9-NEXT: s_lshr_b32 s4, s1, 8
-; GFX9-NEXT: s_lshr_b32 s5, s2, 8
; GFX9-NEXT: s_lshl_b32 s0, s0, 1
-; GFX9-NEXT: s_andn2_b32 s6, 7, s2
+; GFX9-NEXT: s_andn2_b32 s5, 7, s2
+; GFX9-NEXT: s_lshr_b32 s4, s2, 8
+; GFX9-NEXT: s_lshl_b32 s0, s0, s5
; GFX9-NEXT: s_and_b32 s2, s2, 7
-; GFX9-NEXT: s_and_b32 s1, s1, 0xff
-; GFX9-NEXT: s_lshl_b32 s0, s0, s6
-; GFX9-NEXT: s_lshr_b32 s1, s1, s2
-; GFX9-NEXT: s_or_b32 s0, s0, s1
-; GFX9-NEXT: s_lshl_b32 s1, s3, 1
-; GFX9-NEXT: s_andn2_b32 s2, 7, s5
-; GFX9-NEXT: s_lshl_b32 s1, s1, s2
-; GFX9-NEXT: s_and_b32 s2, s5, 7
-; GFX9-NEXT: s_and_b32 s3, s4, 0xff
-; GFX9-NEXT: s_lshr_b32 s2, s3, s2
-; GFX9-NEXT: s_or_b32 s1, s1, s2
+; GFX9-NEXT: s_and_b32 s5, s1, 0xff
+; GFX9-NEXT: s_lshr_b32 s2, s5, s2
+; GFX9-NEXT: s_or_b32 s0, s0, s2
+; GFX9-NEXT: s_lshl_b32 s2, s3, 1
+; GFX9-NEXT: s_andn2_b32 s3, 7, s4
+; GFX9-NEXT: s_bfe_u32 s1, s1, 0x80008
+; GFX9-NEXT: s_lshl_b32 s2, s2, s3
+; GFX9-NEXT: s_and_b32 s3, s4, 7
+; GFX9-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX9-NEXT: s_lshr_b32 s1, s1, s3
+; GFX9-NEXT: s_or_b32 s1, s2, s1
; GFX9-NEXT: s_and_b32 s1, s1, 0xff
; GFX9-NEXT: s_and_b32 s0, s0, 0xff
; GFX9-NEXT: s_lshl_b32 s1, s1, 8
@@ -725,23 +675,23 @@ define amdgpu_ps i16 @s_fshr_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in
; GFX10-LABEL: s_fshr_v2i8:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_lshr_b32 s3, s0, 8
-; GFX10-NEXT: s_lshr_b32 s4, s1, 8
; GFX10-NEXT: s_lshl_b32 s0, s0, 1
-; GFX10-NEXT: s_andn2_b32 s5, 7, s2
-; GFX10-NEXT: s_lshr_b32 s6, s2, 8
-; GFX10-NEXT: s_lshl_b32 s0, s0, s5
+; GFX10-NEXT: s_andn2_b32 s4, 7, s2
+; GFX10-NEXT: s_lshr_b32 s5, s2, 8
+; GFX10-NEXT: s_lshl_b32 s0, s0, s4
+; GFX10-NEXT: s_and_b32 s4, s1, 0xff
+; GFX10-NEXT: s_bfe_u32 s1, s1, 0x80008
; GFX10-NEXT: s_lshl_b32 s3, s3, 1
-; GFX10-NEXT: s_andn2_b32 s5, 7, s6
-; GFX10-NEXT: s_and_b32 s6, s6, 7
-; GFX10-NEXT: s_and_b32 s4, s4, 0xff
+; GFX10-NEXT: s_andn2_b32 s6, 7, s5
+; GFX10-NEXT: s_and_b32 s5, s5, 7
+; GFX10-NEXT: s_and_b32 s1, 0xffff, s1
; GFX10-NEXT: s_and_b32 s2, s2, 7
+; GFX10-NEXT: s_lshl_b32 s3, s3, s6
+; GFX10-NEXT: s_lshr_b32 s1, s1, s5
+; GFX10-NEXT: s_lshr_b32 s2, s4, s2
+; GFX10-NEXT: s_or_b32 s1, s3, s1
+; GFX10-NEXT: s_or_b32 s0, s0, s2
; GFX10-NEXT: s_and_b32 s1, s1, 0xff
-; GFX10-NEXT: s_lshl_b32 s3, s3, s5
-; GFX10-NEXT: s_lshr_b32 s4, s4, s6
-; GFX10-NEXT: s_lshr_b32 s1, s1, s2
-; GFX10-NEXT: s_or_b32 s2, s3, s4
-; GFX10-NEXT: s_or_b32 s0, s0, s1
-; GFX10-NEXT: s_and_b32 s1, s2, 0xff
; GFX10-NEXT: s_and_b32 s0, s0, 0xff
; GFX10-NEXT: s_lshl_b32 s1, s1, 8
; GFX10-NEXT: s_or_b32 s0, s0, s1
@@ -750,23 +700,23 @@ define amdgpu_ps i16 @s_fshr_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in
; GFX11-LABEL: s_fshr_v2i8:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_lshr_b32 s3, s0, 8
-; GFX11-NEXT: s_lshr_b32 s4, s1, 8
; GFX11-NEXT: s_lshl_b32 s0, s0, 1
-; GFX11-NEXT: s_and_not1_b32 s5, 7, s2
-; GFX11-NEXT: s_lshr_b32 s6, s2, 8
-; GFX11-NEXT: s_lshl_b32 s0, s0, s5
+; GFX11-NEXT: s_and_not1_b32 s4, 7, s2
+; GFX11-NEXT: s_lshr_b32 s5, s2, 8
+; GFX11-NEXT: s_lshl_b32 s0, s0, s4
+; GFX11-NEXT: s_and_b32 s4, s1, 0xff
+; GFX11-NEXT: s_bfe_u32 s1, s1, 0x80008
; GFX11-NEXT: s_lshl_b32 s3, s3, 1
-; GFX11-NEXT: s_and_not1_b32 s5, 7, s6
-; GFX11-NEXT: s_and_b32 s6, s6, 7
-; GFX11-NEXT: s_and_b32 s4, s4, 0xff
+; GFX11-NEXT: s_and_not1_b32 s6, 7, s5
+; GFX11-NEXT: s_and_b32 s5, s5, 7
+; GFX11-NEXT: s_and_b32 s1, 0xffff, s1
; GFX11-NEXT: s_and_b32 s2, s2, 7
+; GFX11-NEXT: s_lshl_b32 s3, s3, s6
+; GFX11-NEXT: s_lshr_b32 s1, s1, s5
+; GFX11-NEXT: s_lshr_b32 s2, s4, s2
+; GFX11-NEXT: s_or_b32 s1, s3, s1
+; GFX11-NEXT: s_or_b32 s0, s0, s2
; GFX11-NEXT: s_and_b32 s1, s1, 0xff
-; GFX11-NEXT: s_lshl_b32 s3, s3, s5
-; GFX11-NEXT: s_lshr_b32 s4, s4, s6
-; GFX11-NEXT: s_lshr_b32 s1, s1, s2
-; GFX11-NEXT: s_or_b32 s2, s3, s4
-; GFX11-NEXT: s_or_b32 s0, s0, s1
-; GFX11-NEXT: s_and_b32 s1, s2, 0xff
; GFX11-NEXT: s_and_b32 s0, s0, 0xff
; GFX11-NEXT: s_lshl_b32 s1, s1, 8
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -977,44 +927,44 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
; GFX8-NEXT: s_lshr_b32 s3, s0, 8
; GFX8-NEXT: s_lshr_b32 s4, s0, 16
; GFX8-NEXT: s_lshr_b32 s5, s0, 24
-; GFX8-NEXT: s_lshr_b32 s6, s1, 8
-; GFX8-NEXT: s_lshr_b32 s7, s1, 16
-; GFX8-NEXT: s_lshr_b32 s8, s1, 24
-; GFX8-NEXT: s_lshr_b32 s9, s2, 8
-; GFX8-NEXT: s_lshr_b32 s10, s2, 16
-; GFX8-NEXT: s_lshr_b32 s11, s2, 24
; GFX8-NEXT: s_lshl_b32 s0, s0, 1
-; GFX8-NEXT: s_andn2_b32 s12, 7, s2
+; GFX8-NEXT: s_andn2_b32 s10, 7, s2
+; GFX8-NEXT: s_lshr_b32 s7, s2, 8
+; GFX8-NEXT: s_lshr_b32 s8, s2, 16
+; GFX8-NEXT: s_lshr_b32 s9, s2, 24
+; GFX8-NEXT: s_lshl_b32 s0, s0, s10
; GFX8-NEXT: s_and_b32 s2, s2, 7
-; GFX8-NEXT: s_and_b32 s1, s1, 0xff
-; GFX8-NEXT: s_lshl_b32 s0, s0, s12
-; GFX8-NEXT: s_lshr_b32 s1, s1, s2
-; GFX8-NEXT: s_or_b32 s0, s0, s1
-; GFX8-NEXT: s_lshl_b32 s1, s3, 1
-; GFX8-NEXT: s_andn2_b32 s2, 7, s9
-; GFX8-NEXT: s_lshl_b32 s1, s1, s2
-; GFX8-NEXT: s_and_b32 s2, s9, 7
-; GFX8-NEXT: s_and_b32 s3, s6, 0xff
-; GFX8-NEXT: s_lshr_b32 s2, s3, s2
-; GFX8-NEXT: s_or_b32 s1, s1, s2
-; GFX8-NEXT: s_lshl_b32 s2, s4, 1
-; GFX8-NEXT: s_andn2_b32 s3, 7, s10
+; GFX8-NEXT: s_and_b32 s10, s1, 0xff
+; GFX8-NEXT: s_lshr_b32 s2, s10, s2
+; GFX8-NEXT: s_or_b32 s0, s0, s2
+; GFX8-NEXT: s_lshl_b32 s2, s3, 1
+; GFX8-NEXT: s_andn2_b32 s3, 7, s7
; GFX8-NEXT: s_lshl_b32 s2, s2, s3
-; GFX8-NEXT: s_and_b32 s3, s10, 7
-; GFX8-NEXT: s_and_b32 s4, s7, 0xff
-; GFX8-NEXT: s_lshr_b32 s3, s4, s3
+; GFX8-NEXT: s_and_b32 s3, s7, 7
+; GFX8-NEXT: s_bfe_u32 s7, s1, 0x80008
+; GFX8-NEXT: s_and_b32 s7, 0xffff, s7
+; GFX8-NEXT: s_lshr_b32 s3, s7, s3
+; GFX8-NEXT: s_lshr_b32 s6, s1, 24
; GFX8-NEXT: s_or_b32 s2, s2, s3
+; GFX8-NEXT: s_lshl_b32 s3, s4, 1
+; GFX8-NEXT: s_andn2_b32 s4, 7, s8
+; GFX8-NEXT: s_bfe_u32 s1, s1, 0x80010
+; GFX8-NEXT: s_lshl_b32 s3, s3, s4
+; GFX8-NEXT: s_and_b32 s4, s8, 7
+; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX8-NEXT: s_lshr_b32 s1, s1, s4
+; GFX8-NEXT: s_or_b32 s1, s3, s1
; GFX8-NEXT: s_lshl_b32 s3, s5, 1
-; GFX8-NEXT: s_andn2_b32 s4, 7, s11
-; GFX8-NEXT: s_and_b32 s1, s1, 0xff
+; GFX8-NEXT: s_andn2_b32 s4, 7, s9
; GFX8-NEXT: s_lshl_b32 s3, s3, s4
-; GFX8-NEXT: s_and_b32 s4, s11, 7
+; GFX8-NEXT: s_and_b32 s4, s9, 7
+; GFX8-NEXT: s_and_b32 s2, s2, 0xff
+; GFX8-NEXT: s_lshr_b32 s4, s6, s4
; GFX8-NEXT: s_and_b32 s0, s0, 0xff
-; GFX8-NEXT: s_lshl_b32 s1, s1, 8
-; GFX8-NEXT: s_lshr_b32 s4, s8, s4
-; GFX8-NEXT: s_or_b32 s0, s0, s1
-; GFX8-NEXT: s_and_b32 s1, s2, 0xff
+; GFX8-NEXT: s_lshl_b32 s2, s2, 8
+; GFX8-NEXT: s_and_b32 s1, s1, 0xff
; GFX8-NEXT: s_or_b32 s3, s3, s4
+; GFX8-NEXT: s_or_b32 s0, s0, s2
; GFX8-NEXT: s_lshl_b32 s1, s1, 16
; GFX8-NEXT: s_or_b32 s0, s0, s1
; GFX8-NEXT: s_and_b32 s1, s3, 0xff
@@ -1027,44 +977,44 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
; GFX9-NEXT: s_lshr_b32 s3, s0, 8
; GFX9-NEXT: s_lshr_b32 s4, s0, 16
; GFX9-NEXT: s_lshr_b32 s5, s0, 24
-; GFX9-NEXT: s_lshr_b32 s6, s1, 8
-; GFX9-NEXT: s_lshr_b32 s7, s1, 16
-; GFX9-NEXT: s_lshr_b32 s8, s1, 24
-; GFX9-NEXT: s_lshr_b32 s9, s2, 8
-; GFX9-NEXT: s_lshr_b32 s10, s2, 16
-; GFX9-NEXT: s_lshr_b32 s11, s2, 24
; GFX9-NEXT: s_lshl_b32 s0, s0, 1
-; GFX9-NEXT: s_andn2_b32 s12, 7, s2
+; GFX9-NEXT: s_andn2_b32 s10, 7, s2
+; GFX9-NEXT: s_lshr_b32 s7, s2, 8
+; GFX9-NEXT: s_lshr_b32 s8, s2, 16
+; GFX9-NEXT: s_lshr_b32 s9, s2, 24
+; GFX9-NEXT: s_lshl_b32 s0, s0, s10
; GFX9-NEXT: s_and_b32 s2, s2, 7
-; GFX9-NEXT: s_and_b32 s1, s1, 0xff
-; GFX9-NEXT: s_lshl_b32 s0, s0, s12
-; GFX9-NEXT: s_lshr_b32 s1, s1, s2
-; GFX9-NEXT: s_or_b32 s0, s0, s1
-; GFX9-NEXT: s_lshl_b32 s1, s3, 1
-; GFX9-NEXT: s_andn2_b32 s2, 7, s9
-; GFX9-NEXT: s_lshl_b32 s1, s1, s2
-; GFX9-NEXT: s_and_b32 s2, s9, 7
-; GFX9-NEXT: s_and_b32 s3, s6, 0xff
-; GFX9-NEXT: s_lshr_b32 s2, s3, s2
-; GFX9-NEXT: s_or_b32 s1, s1, s2
-; GFX9-NEXT: s_lshl_b32 s2, s4, 1
-; GFX9-NEXT: s_andn2_b32 s3, 7, s10
+; GFX9-NEXT: s_and_b32 s10, s1, 0xff
+; GFX9-NEXT: s_lshr_b32 s2, s10, s2
+; GFX9-NEXT: s_or_b32 s0, s0, s2
+; GFX9-NEXT: s_lshl_b32 s2, s3, 1
+; GFX9-NEXT: s_andn2_b32 s3, 7, s7
; GFX9-NEXT: s_lshl_b32 s2, s2, s3
-; GFX9-NEXT: s_and_b32 s3, s10, 7
-; GFX9-NEXT: s_and_b32 s4, s7, 0xff
-; GFX9-NEXT: s_lshr_b32 s3, s4, s3
+; GFX9-NEXT: s_and_b32 s3, s7, 7
+; GFX9-NEXT: s_bfe_u32 s7, s1, 0x80008
+; GFX9-NEXT: s_and_b32 s7, 0xffff, s7
+; GFX9-NEXT: s_lshr_b32 s3, s7, s3
+; GFX9-NEXT: s_lshr_b32 s6, s1, 24
; GFX9-NEXT: s_or_b32 s2, s2, s3
+; GFX9-NEXT: s_lshl_b32 s3, s4, 1
+; GFX9-NEXT: s_andn2_b32 s4, 7, s8
+; GFX9-NEXT: s_bfe_u32 s1, s1, 0x80010
+; GFX9-NEXT: s_lshl_b32 s3, s3, s4
+; GFX9-NEXT: s_and_b32 s4, s8, 7
+; GFX9-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX9-NEXT: s_lshr_b32 s1, s1, s4
+; GFX9-NEXT: s_or_b32 s1, s3, s1
; GFX9-NEXT: s_lshl_b32 s3, s5, 1
-; GFX9-NEXT: s_andn2_b32 s4, 7, s11
-; GFX9-NEXT: s_and_b32 s1, s1, 0xff
+; GFX9-NEXT: s_andn2_b32 s4, 7, s9
; GFX9-NEXT: s_lshl_b32 s3, s3, s4
-; GFX9-NEXT: s_and_b32 s4, s11, 7
+; GFX9-NEXT: s_and_b32 s4, s9, 7
+; GFX9-NEXT: s_and_b32 s2, s2, 0xff
+; GFX9-NEXT: s_lshr_b32 s4, s6, s4
; GFX9-NEXT: s_and_b32 s0, s0, 0xff
-; GFX9-NEXT: s_lshl_b32 s1, s1, 8
-; GFX9-NEXT: s_lshr_b32 s4, s8, s4
-; GFX9-NEXT: s_or_b32 s0, s0, s1
-; GFX9-NEXT: s_and_b32 s1, s2, 0xff
+; GFX9-NEXT: s_lshl_b32 s2, s2, 8
+; GFX9-NEXT: s_and_b32 s1, s1, 0xff
; GFX9-NEXT: s_or_b32 s3, s3, s4
+; GFX9-NEXT: s_or_b32 s0, s0, s2
; GFX9-NEXT: s_lshl_b32 s1, s1, 16
; GFX9-NEXT: s_or_b32 s0, s0, s1
; GFX9-NEXT: s_and_b32 s1, s3, 0xff
@@ -1074,48 +1024,48 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
;
; GFX10-LABEL: s_fshr_v4i8:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_lshr_b32 s3, s0, 8
-; GFX10-NEXT: s_lshr_b32 s6, s1, 8
-; GFX10-NEXT: s_lshr_b32 s7, s1, 16
-; GFX10-NEXT: s_lshr_b32 s8, s1, 24
-; GFX10-NEXT: s_lshr_b32 s9, s2, 8
-; GFX10-NEXT: s_lshr_b32 s10, s2, 16
-; GFX10-NEXT: s_lshr_b32 s11, s2, 24
-; GFX10-NEXT: s_andn2_b32 s12, 7, s2
+; GFX10-NEXT: s_lshr_b32 s7, s2, 8
+; GFX10-NEXT: s_lshr_b32 s8, s2, 16
+; GFX10-NEXT: s_lshr_b32 s9, s2, 24
+; GFX10-NEXT: s_andn2_b32 s10, 7, s2
; GFX10-NEXT: s_and_b32 s2, s2, 7
-; GFX10-NEXT: s_and_b32 s1, s1, 0xff
+; GFX10-NEXT: s_and_b32 s11, s1, 0xff
+; GFX10-NEXT: s_lshr_b32 s3, s0, 8
; GFX10-NEXT: s_lshr_b32 s4, s0, 16
; GFX10-NEXT: s_lshr_b32 s5, s0, 24
; GFX10-NEXT: s_lshl_b32 s0, s0, 1
-; GFX10-NEXT: s_lshr_b32 s1, s1, s2
-; GFX10-NEXT: s_lshl_b32 s2, s3, 1
-; GFX10-NEXT: s_andn2_b32 s3, 7, s9
-; GFX10-NEXT: s_and_b32 s9, s9, 7
-; GFX10-NEXT: s_and_b32 s6, s6, 0xff
-; GFX10-NEXT: s_lshl_b32 s0, s0, s12
-; GFX10-NEXT: s_lshl_b32 s2, s2, s3
-; GFX10-NEXT: s_lshr_b32 s3, s6, s9
-; GFX10-NEXT: s_or_b32 s0, s0, s1
-; GFX10-NEXT: s_or_b32 s1, s2, s3
-; GFX10-NEXT: s_lshl_b32 s2, s4, 1
-; GFX10-NEXT: s_andn2_b32 s3, 7, s10
-; GFX10-NEXT: s_and_b32 s4, s10, 7
-; GFX10-NEXT: s_and_b32 s6, s7, 0xff
-; GFX10-NEXT: s_lshl_b32 s2, s2, s3
-; GFX10-NEXT: s_lshr_b32 s3, s6, s4
+; GFX10-NEXT: s_lshr_b32 s2, s11, s2
+; GFX10-NEXT: s_bfe_u32 s11, s1, 0x80008
+; GFX10-NEXT: s_lshl_b32 s0, s0, s10
+; GFX10-NEXT: s_lshl_b32 s3, s3, 1
+; GFX10-NEXT: s_andn2_b32 s10, 7, s7
+; GFX10-NEXT: s_and_b32 s7, s7, 7
+; GFX10-NEXT: s_and_b32 s11, 0xffff, s11
+; GFX10-NEXT: s_lshr_b32 s6, s1, 24
+; GFX10-NEXT: s_lshl_b32 s3, s3, s10
+; GFX10-NEXT: s_lshr_b32 s7, s11, s7
+; GFX10-NEXT: s_bfe_u32 s1, s1, 0x80010
+; GFX10-NEXT: s_or_b32 s0, s0, s2
+; GFX10-NEXT: s_or_b32 s2, s3, s7
+; GFX10-NEXT: s_lshl_b32 s3, s4, 1
+; GFX10-NEXT: s_andn2_b32 s4, 7, s8
+; GFX10-NEXT: s_and_b32 s7, s8, 7
+; GFX10-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX10-NEXT: s_lshl_b32 s3, s3, s4
+; GFX10-NEXT: s_lshr_b32 s1, s1, s7
; GFX10-NEXT: s_lshl_b32 s4, s5, 1
-; GFX10-NEXT: s_andn2_b32 s5, 7, s11
-; GFX10-NEXT: s_and_b32 s6, s11, 7
+; GFX10-NEXT: s_andn2_b32 s5, 7, s9
+; GFX10-NEXT: s_and_b32 s7, s9, 7
; GFX10-NEXT: s_lshl_b32 s4, s4, s5
-; GFX10-NEXT: s_lshr_b32 s5, s8, s6
-; GFX10-NEXT: s_or_b32 s2, s2, s3
-; GFX10-NEXT: s_and_b32 s1, s1, 0xff
+; GFX10-NEXT: s_lshr_b32 s5, s6, s7
+; GFX10-NEXT: s_or_b32 s1, s3, s1
+; GFX10-NEXT: s_and_b32 s2, s2, 0xff
; GFX10-NEXT: s_or_b32 s3, s4, s5
; GFX10-NEXT: s_and_b32 s0, s0, 0xff
-; GFX10-NEXT: s_lshl_b32 s1, s1, 8
-; GFX10-NEXT: s_and_b32 s2, s2, 0xff
-; GFX10-NEXT: s_or_b32 s0, s0, s1
-; GFX10-NEXT: s_lshl_b32 s1, s2, 16
+; GFX10-NEXT: s_lshl_b32 s2, s2, 8
+; GFX10-NEXT: s_and_b32 s1, s1, 0xff
+; GFX10-NEXT: s_or_b32 s0, s0, s2
+; GFX10-NEXT: s_lshl_b32 s1, s1, 16
; GFX10-NEXT: s_and_b32 s2, s3, 0xff
; GFX10-NEXT: s_or_b32 s0, s0, s1
; GFX10-NEXT: s_lshl_b32 s1, s2, 24
@@ -1124,48 +1074,48 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
;
; GFX11-LABEL: s_fshr_v4i8:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_lshr_b32 s3, s0, 8
-; GFX11-NEXT: s_lshr_b32 s6, s1, 8
-; GFX11-NEXT: s_lshr_b32 s7, s1, 16
-; GFX11-NEXT: s_lshr_b32 s8, s1, 24
-; GFX11-NEXT: s_lshr_b32 s9, s2, 8
-; GFX11-NEXT: s_lshr_b32 s10, s2, 16
-; GFX11-NEXT: s_lshr_b32 s11, s2, 24
-; GFX11-NEXT: s_and_not1_b32 s12, 7, s2
+; GFX11-NEXT: s_lshr_b32 s7, s2, 8
+; GFX11-NEXT: s_lshr_b32 s8, s2, 16
+; GFX11-NEXT: s_lshr_b32 s9, s2, 24
+; GFX11-NEXT: s_and_not1_b32 s10, 7, s2
; GFX11-NEXT: s_and_b32 s2, s2, 7
-; GFX11-NEXT: s_and_b32 s1, s1, 0xff
+; GFX11-NEXT: s_and_b32 s11, s1, 0xff
+; GFX11-NEXT: s_lshr_b32 s3, s0, 8
; GFX11-NEXT: s_lshr_b32 s4, s0, 16
; GFX11-NEXT: s_lshr_b32 s5, s0, 24
; GFX11-NEXT: s_lshl_b32 s0, s0, 1
-; GFX11-NEXT: s_lshr_b32 s1, s1, s2
-; GFX11-NEXT: s_lshl_b32 s2, s3, 1
-; GFX11-NEXT: s_and_not1_b32 s3, 7, s9
-; GFX11-NEXT: s_and_b32 s9, s9, 7
-; GFX11-NEXT: s_and_b32 s6, s6, 0xff
-; GFX11-NEXT: s_lshl_b32 s0, s0, s12
-; GFX11-NEXT: s_lshl_b32 s2, s2, s3
-; GFX11-NEXT: s_lshr_b32 s3, s6, s9
-; GFX11-NEXT: s_or_b32 s0, s0, s1
-; GFX11-NEXT: s_or_b32 s1, s2, s3
-; GFX11-NEXT: s_lshl_b32 s2, s4, 1
-; GFX11-NEXT: s_and_not1_b32 s3, 7, s10
-; GFX11-NEXT: s_and_b32 s4, s10, 7
-; GFX11-NEXT: s_and_b32 s6, s7, 0xff
-; GFX11-NEXT: s_lshl_b32 s2, s2, s3
-; GFX11-NEXT: s_lshr_b32 s3, s6, s4
+; GFX11-NEXT: s_lshr_b32 s2, s11, s2
+; GFX11-NEXT: s_bfe_u32 s11, s1, 0x80008
+; GFX11-NEXT: s_lshl_b32 s0, s0, s10
+; GFX11-NEXT: s_lshl_b32 s3, s3, 1
+; GFX11-NEXT: s_and_not1_b32 s10, 7, s7
+; GFX11-NEXT: s_and_b32 s7, s7, 7
+; GFX11-NEXT: s_and_b32 s11, 0xffff, s11
+; GFX11-NEXT: s_lshr_b32 s6, s1, 24
+; GFX11-NEXT: s_lshl_b32 s3, s3, s10
+; GFX11-NEXT: s_lshr_b32 s7, s11, s7
+; GFX11-NEXT: s_bfe_u32 s1, s1, 0x80010
+; GFX11-NEXT: s_or_b32 s0, s0, s2
+; GFX11-NEXT: s_or_b32 s2, s3, s7
+; GFX11-NEXT: s_lshl_b32 s3, s4, 1
+; GFX11-NEXT: s_and_not1_b32 s4, 7, s8
+; GFX11-NEXT: s_and_b32 s7, s8, 7
+; GFX11-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX11-NEXT: s_lshl_b32 s3, s3, s4
+; GFX11-NEXT: s_lshr_b32 s1, s1, s7
; GFX11-NEXT: s_lshl_b32 s4, s5, 1
-; GFX11-NEXT: s_and_not1_b32 s5, 7, s11
-; GFX11-NEXT: s_and_b32 s6, s11, 7
+; GFX11-NEXT: s_and_not1_b32 s5, 7, s9
+; GFX11-NEXT: s_and_b32 s7, s9, 7
; GFX11-NEXT: s_lshl_b32 s4, s4, s5
-; GFX11-NEXT: s_lshr_b32 s5, s8, s6
-; GFX11-NEXT: s_or_b32 s2, s2, s3
-; GFX11-NEXT: s_and_b32 s1, s1, 0xff
+; GFX11-NEXT: s_lshr_b32 s5, s6, s7
+; GFX11-NEXT: s_or_b32 s1, s3, s1
+; GFX11-NEXT: s_and_b32 s2, s2, 0xff
; GFX11-NEXT: s_or_b32 s3, s4, s5
; GFX11-NEXT: s_and_b32 s0, s0, 0xff
-; GFX11-NEXT: s_lshl_b32 s1, s1, 8
-; GFX11-NEXT: s_and_b32 s2, s2, 0xff
-; GFX11-NEXT: s_or_b32 s0, s0, s1
-; GFX11-NEXT: s_lshl_b32 s1, s2, 16
+; GFX11-NEXT: s_lshl_b32 s2, s2, 8
+; GFX11-NEXT: s_and_b32 s1, s1, 0xff
+; GFX11-NEXT: s_or_b32 s0, s0, s2
+; GFX11-NEXT: s_lshl_b32 s1, s1, 16
; GFX11-NEXT: s_and_b32 s2, s3, 0xff
; GFX11-NEXT: s_or_b32 s0, s0, s1
; GFX11-NEXT: s_lshl_b32 s1, s2, 24
@@ -1778,54 +1728,53 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
; GFX6: ; %bb.0:
; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, 24
; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2
-; GFX6-NEXT: s_bfe_u32 s9, s0, 0x80008
-; GFX6-NEXT: v_not_b32_e32 v3, 23
-; GFX6-NEXT: s_lshr_b32 s7, s1, 8
+; GFX6-NEXT: s_bfe_u32 s7, s0, 0x80008
+; GFX6-NEXT: s_and_b32 s8, s1, 0xff
+; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
+; GFX6-NEXT: s_and_b32 s6, s0, 0xff
+; GFX6-NEXT: s_lshl_b32 s7, s7, 8
+; GFX6-NEXT: v_alignbit_b32 v0, s8, v0, 24
+; GFX6-NEXT: s_bfe_u32 s8, s2, 0x80008
; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2
-; GFX6-NEXT: s_and_b32 s8, s0, 0xff
-; GFX6-NEXT: s_lshl_b32 s9, s9, 8
-; GFX6-NEXT: s_and_b32 s1, s1, 0xff
-; GFX6-NEXT: v_mov_b32_e32 v0, s0
-; GFX6-NEXT: s_bfe_u32 s10, s2, 0x80008
+; GFX6-NEXT: s_or_b32 s6, s6, s7
+; GFX6-NEXT: s_bfe_u32 s7, s0, 0x80010
+; GFX6-NEXT: s_bfe_u32 s0, s1, 0x80008
+; GFX6-NEXT: s_and_b32 s1, s2, 0xff
+; GFX6-NEXT: s_lshl_b32 s8, s8, 8
+; GFX6-NEXT: s_or_b32 s1, s1, s8
+; GFX6-NEXT: s_bfe_u32 s8, s2, 0x80010
+; GFX6-NEXT: s_and_b32 s8, 0xffff, s8
+; GFX6-NEXT: v_not_b32_e32 v3, 23
+; GFX6-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX6-NEXT: s_lshl_b32 s8, s8, 16
; GFX6-NEXT: v_mul_lo_u32 v4, v2, v3
-; GFX6-NEXT: s_or_b32 s8, s8, s9
-; GFX6-NEXT: v_alignbit_b32 v0, s1, v0, 24
-; GFX6-NEXT: s_lshr_b32 s1, s2, 16
-; GFX6-NEXT: s_and_b32 s9, s2, 0xff
-; GFX6-NEXT: s_lshl_b32 s10, s10, 8
-; GFX6-NEXT: s_lshr_b32 s6, s0, 16
-; GFX6-NEXT: s_and_b32 s0, s7, 0xff
-; GFX6-NEXT: s_lshr_b32 s7, s3, 8
-; GFX6-NEXT: s_or_b32 s9, s9, s10
-; GFX6-NEXT: s_and_b32 s1, s1, 0xff
-; GFX6-NEXT: s_and_b32 s3, s3, 0xff
+; GFX6-NEXT: s_or_b32 s1, s1, s8
+; GFX6-NEXT: s_and_b32 s8, s3, 0xff
; GFX6-NEXT: v_mov_b32_e32 v1, s2
-; GFX6-NEXT: s_and_b32 s9, 0xffff, s9
-; GFX6-NEXT: s_lshl_b32 s1, s1, 16
-; GFX6-NEXT: v_alignbit_b32 v1, s3, v1, 24
-; GFX6-NEXT: s_and_b32 s2, s7, 0xff
-; GFX6-NEXT: s_or_b32 s1, s9, s1
+; GFX6-NEXT: s_bfe_u32 s2, s3, 0x80008
+; GFX6-NEXT: v_alignbit_b32 v1, s8, v1, 24
+; GFX6-NEXT: s_and_b32 s2, 0xffff, s2
; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX6-NEXT: s_lshl_b32 s2, s2, 16
-; GFX6-NEXT: s_bfe_u32 s9, s4, 0x80008
-; GFX6-NEXT: v_mul_hi_u32 v4, v2, v4
+; GFX6-NEXT: s_bfe_u32 s3, s4, 0x80008
; GFX6-NEXT: v_or_b32_e32 v1, s2, v1
-; GFX6-NEXT: s_lshr_b32 s2, s4, 16
-; GFX6-NEXT: s_and_b32 s7, s4, 0xff
-; GFX6-NEXT: s_lshl_b32 s9, s9, 8
-; GFX6-NEXT: s_or_b32 s7, s7, s9
-; GFX6-NEXT: s_and_b32 s2, s2, 0xff
-; GFX6-NEXT: s_and_b32 s7, 0xffff, s7
-; GFX6-NEXT: s_lshl_b32 s2, s2, 16
-; GFX6-NEXT: s_or_b32 s2, s7, s2
+; GFX6-NEXT: s_and_b32 s2, s4, 0xff
+; GFX6-NEXT: s_lshl_b32 s3, s3, 8
+; GFX6-NEXT: v_mul_hi_u32 v4, v2, v4
+; GFX6-NEXT: s_or_b32 s2, s2, s3
+; GFX6-NEXT: s_bfe_u32 s3, s4, 0x80010
+; GFX6-NEXT: s_and_b32 s3, 0xffff, s3
+; GFX6-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX6-NEXT: s_lshl_b32 s3, s3, 16
+; GFX6-NEXT: s_or_b32 s2, s2, s3
; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4
; GFX6-NEXT: v_mul_hi_u32 v4, s2, v2
-; GFX6-NEXT: s_lshr_b32 s3, s5, 8
-; GFX6-NEXT: s_and_b32 s5, s5, 0xff
+; GFX6-NEXT: s_and_b32 s3, s5, 0xff
; GFX6-NEXT: v_mov_b32_e32 v5, s4
-; GFX6-NEXT: v_alignbit_b32 v5, s5, v5, 24
-; GFX6-NEXT: s_and_b32 s3, s3, 0xff
+; GFX6-NEXT: v_alignbit_b32 v5, s3, v5, 24
+; GFX6-NEXT: s_bfe_u32 s3, s5, 0x80008
+; GFX6-NEXT: s_and_b32 s3, 0xffff, s3
; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX6-NEXT: v_mul_lo_u32 v4, v4, 24
; GFX6-NEXT: s_lshl_b32 s3, s3, 16
@@ -1844,13 +1793,14 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
; GFX6-NEXT: v_add_i32_e32 v5, vcc, v2, v3
; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v2
; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
-; GFX6-NEXT: s_and_b32 s6, s6, 0xff
-; GFX6-NEXT: s_and_b32 s8, 0xffff, s8
+; GFX6-NEXT: s_and_b32 s6, 0xffff, s6
+; GFX6-NEXT: s_and_b32 s7, 0xffff, s7
; GFX6-NEXT: v_add_i32_e32 v3, vcc, v2, v3
-; GFX6-NEXT: s_lshl_b32 s2, s6, 17
-; GFX6-NEXT: s_lshl_b32 s3, s8, 1
+; GFX6-NEXT: s_lshl_b32 s2, s7, 17
+; GFX6-NEXT: s_lshl_b32 s3, s6, 1
; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v2
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX6-NEXT: s_and_b32 s0, 0xffff, s0
; GFX6-NEXT: s_or_b32 s2, s2, s3
; GFX6-NEXT: v_and_b32_e32 v6, 0xffffff, v6
; GFX6-NEXT: v_and_b32_e32 v4, 0xffffff, v4
@@ -1889,74 +1839,72 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
; GFX8: ; %bb.0:
; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, 24
; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GFX8-NEXT: s_lshr_b32 s9, s1, 8
-; GFX8-NEXT: s_and_b32 s1, s1, 0xff
-; GFX8-NEXT: s_lshr_b32 s6, s0, 8
-; GFX8-NEXT: s_lshr_b32 s8, s0, 24
-; GFX8-NEXT: s_lshl_b32 s1, s1, 8
+; GFX8-NEXT: s_bfe_u32 s8, s0, 0x80008
+; GFX8-NEXT: s_and_b32 s7, s0, 0xff
+; GFX8-NEXT: s_lshl_b32 s8, s8, 8
; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
-; GFX8-NEXT: s_and_b32 s6, s6, 0xff
-; GFX8-NEXT: s_or_b32 s1, s8, s1
-; GFX8-NEXT: s_lshr_b32 s8, s2, 8
; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX8-NEXT: s_lshr_b32 s7, s0, 16
-; GFX8-NEXT: s_and_b32 s0, s0, 0xff
-; GFX8-NEXT: s_lshl_b32 s6, s6, 8
-; GFX8-NEXT: s_and_b32 s8, s8, 0xff
-; GFX8-NEXT: s_or_b32 s0, s0, s6
-; GFX8-NEXT: s_and_b32 s6, s7, 0xff
-; GFX8-NEXT: s_and_b32 s7, s9, 0xff
-; GFX8-NEXT: s_lshr_b32 s9, s2, 16
-; GFX8-NEXT: s_lshr_b32 s10, s2, 24
-; GFX8-NEXT: s_and_b32 s2, s2, 0xff
+; GFX8-NEXT: s_or_b32 s7, s7, s8
+; GFX8-NEXT: s_and_b32 s8, s1, 0xff
+; GFX8-NEXT: s_lshr_b32 s6, s0, 24
; GFX8-NEXT: s_lshl_b32 s8, s8, 8
-; GFX8-NEXT: s_or_b32 s2, s2, s8
-; GFX8-NEXT: s_and_b32 s8, s9, 0xff
+; GFX8-NEXT: s_bfe_u32 s10, s2, 0x80008
+; GFX8-NEXT: s_or_b32 s6, s6, s8
+; GFX8-NEXT: s_lshr_b32 s8, s2, 24
+; GFX8-NEXT: s_and_b32 s9, s2, 0xff
+; GFX8-NEXT: s_lshl_b32 s10, s10, 8
+; GFX8-NEXT: s_bfe_u32 s2, s2, 0x80010
; GFX8-NEXT: v_not_b32_e32 v1, 23
-; GFX8-NEXT: s_lshr_b32 s11, s3, 8
-; GFX8-NEXT: s_lshl_b32 s8, s8, 16
-; GFX8-NEXT: s_and_b32 s3, s3, 0xff
+; GFX8-NEXT: s_or_b32 s9, s9, s10
+; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
; GFX8-NEXT: v_mul_lo_u32 v2, v0, v1
-; GFX8-NEXT: s_or_b32 s2, s2, s8
-; GFX8-NEXT: s_lshl_b32 s3, s3, 8
-; GFX8-NEXT: s_and_b32 s8, s11, 0xff
-; GFX8-NEXT: s_or_b32 s3, s10, s3
-; GFX8-NEXT: s_lshl_b32 s8, s8, 16
-; GFX8-NEXT: s_or_b32 s3, s3, s8
-; GFX8-NEXT: s_lshr_b32 s8, s4, 8
-; GFX8-NEXT: s_and_b32 s8, s8, 0xff
+; GFX8-NEXT: s_and_b32 s9, 0xffff, s9
+; GFX8-NEXT: s_lshl_b32 s2, s2, 16
+; GFX8-NEXT: s_or_b32 s2, s9, s2
+; GFX8-NEXT: s_and_b32 s9, s3, 0xff
+; GFX8-NEXT: s_bfe_u32 s3, s3, 0x80008
+; GFX8-NEXT: s_lshl_b32 s9, s9, 8
+; GFX8-NEXT: s_and_b32 s3, 0xffff, s3
+; GFX8-NEXT: s_or_b32 s8, s8, s9
+; GFX8-NEXT: s_lshl_b32 s3, s3, 16
+; GFX8-NEXT: s_bfe_u32 s10, s4, 0x80008
; GFX8-NEXT: v_mul_hi_u32 v2, v0, v2
-; GFX8-NEXT: s_lshr_b32 s9, s4, 16
-; GFX8-NEXT: s_lshr_b32 s10, s4, 24
-; GFX8-NEXT: s_and_b32 s4, s4, 0xff
-; GFX8-NEXT: s_lshl_b32 s8, s8, 8
-; GFX8-NEXT: s_or_b32 s4, s4, s8
-; GFX8-NEXT: s_and_b32 s8, s9, 0xff
-; GFX8-NEXT: s_lshl_b32 s8, s8, 16
-; GFX8-NEXT: s_or_b32 s4, s4, s8
+; GFX8-NEXT: s_or_b32 s3, s8, s3
+; GFX8-NEXT: s_lshr_b32 s8, s4, 24
+; GFX8-NEXT: s_and_b32 s9, s4, 0xff
+; GFX8-NEXT: s_lshl_b32 s10, s10, 8
+; GFX8-NEXT: s_bfe_u32 s4, s4, 0x80010
+; GFX8-NEXT: s_or_b32 s9, s9, s10
+; GFX8-NEXT: s_and_b32 s4, 0xffff, s4
+; GFX8-NEXT: s_and_b32 s9, 0xffff, s9
+; GFX8-NEXT: s_lshl_b32 s4, s4, 16
+; GFX8-NEXT: s_or_b32 s4, s9, s4
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT: v_mul_hi_u32 v2, s4, v0
-; GFX8-NEXT: s_lshr_b32 s11, s5, 8
-; GFX8-NEXT: s_and_b32 s5, s5, 0xff
-; GFX8-NEXT: s_lshl_b32 s5, s5, 8
+; GFX8-NEXT: s_and_b32 s9, s5, 0xff
+; GFX8-NEXT: s_bfe_u32 s5, s5, 0x80008
+; GFX8-NEXT: s_lshl_b32 s9, s9, 8
; GFX8-NEXT: v_mul_lo_u32 v2, v2, 24
-; GFX8-NEXT: s_and_b32 s8, s11, 0xff
-; GFX8-NEXT: s_or_b32 s5, s10, s5
-; GFX8-NEXT: s_lshl_b32 s8, s8, 16
-; GFX8-NEXT: s_or_b32 s5, s5, s8
+; GFX8-NEXT: s_and_b32 s5, 0xffff, s5
+; GFX8-NEXT: s_or_b32 s8, s8, s9
+; GFX8-NEXT: s_lshl_b32 s5, s5, 16
+; GFX8-NEXT: s_or_b32 s5, s8, s5
; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s4, v2
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v2, v1
; GFX8-NEXT: v_mul_hi_u32 v0, s5, v0
; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v2
; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v2, v1
+; GFX8-NEXT: s_bfe_u32 s0, s0, 0x80010
; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v2
; GFX8-NEXT: v_mul_lo_u32 v0, v0, 24
+; GFX8-NEXT: s_and_b32 s7, 0xffff, s7
+; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 23, v2
-; GFX8-NEXT: s_lshl_b32 s4, s6, 17
-; GFX8-NEXT: s_lshl_b32 s0, s0, 1
-; GFX8-NEXT: s_or_b32 s0, s4, s0
+; GFX8-NEXT: s_lshl_b32 s0, s0, 17
+; GFX8-NEXT: s_lshl_b32 s4, s7, 1
+; GFX8-NEXT: s_or_b32 s0, s0, s4
; GFX8-NEXT: v_and_b32_e32 v3, 0xffffff, v3
; GFX8-NEXT: v_and_b32_e32 v2, 0xffffff, v2
; GFX8-NEXT: v_lshlrev_b32_e64 v3, v3, s0
@@ -1967,11 +1915,13 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v0, v1
+; GFX8-NEXT: s_bfe_u32 s1, s1, 0x80008
; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v0
+; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 23, v0
-; GFX8-NEXT: s_lshl_b32 s0, s7, 17
-; GFX8-NEXT: s_lshl_b32 s1, s1, 1
+; GFX8-NEXT: s_lshl_b32 s0, s1, 17
+; GFX8-NEXT: s_lshl_b32 s1, s6, 1
; GFX8-NEXT: s_or_b32 s0, s0, s1
; GFX8-NEXT: v_and_b32_e32 v1, 0xffffff, v1
; GFX8-NEXT: v_and_b32_e32 v0, 0xffffff, v0
@@ -1997,75 +1947,73 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
; GFX9: ; %bb.0:
; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, 24
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT: s_lshr_b32 s9, s1, 8
-; GFX9-NEXT: s_and_b32 s1, s1, 0xff
-; GFX9-NEXT: s_lshr_b32 s6, s0, 8
-; GFX9-NEXT: s_lshr_b32 s8, s0, 24
-; GFX9-NEXT: s_lshl_b32 s1, s1, 8
+; GFX9-NEXT: s_bfe_u32 s8, s0, 0x80008
+; GFX9-NEXT: s_and_b32 s7, s0, 0xff
+; GFX9-NEXT: s_lshl_b32 s8, s8, 8
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
-; GFX9-NEXT: s_and_b32 s6, s6, 0xff
-; GFX9-NEXT: s_or_b32 s1, s8, s1
-; GFX9-NEXT: s_lshr_b32 s8, s2, 8
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT: s_lshr_b32 s7, s0, 16
-; GFX9-NEXT: s_and_b32 s0, s0, 0xff
-; GFX9-NEXT: s_lshl_b32 s6, s6, 8
-; GFX9-NEXT: s_and_b32 s8, s8, 0xff
-; GFX9-NEXT: s_or_b32 s0, s0, s6
-; GFX9-NEXT: s_and_b32 s6, s7, 0xff
-; GFX9-NEXT: s_and_b32 s7, s9, 0xff
-; GFX9-NEXT: s_lshr_b32 s9, s2, 16
-; GFX9-NEXT: s_lshr_b32 s10, s2, 24
-; GFX9-NEXT: s_and_b32 s2, s2, 0xff
+; GFX9-NEXT: s_or_b32 s7, s7, s8
+; GFX9-NEXT: s_and_b32 s8, s1, 0xff
+; GFX9-NEXT: s_lshr_b32 s6, s0, 24
; GFX9-NEXT: s_lshl_b32 s8, s8, 8
-; GFX9-NEXT: s_or_b32 s2, s2, s8
-; GFX9-NEXT: s_and_b32 s8, s9, 0xff
+; GFX9-NEXT: s_bfe_u32 s10, s2, 0x80008
+; GFX9-NEXT: s_or_b32 s6, s6, s8
+; GFX9-NEXT: s_lshr_b32 s8, s2, 24
+; GFX9-NEXT: s_and_b32 s9, s2, 0xff
+; GFX9-NEXT: s_lshl_b32 s10, s10, 8
+; GFX9-NEXT: s_bfe_u32 s2, s2, 0x80010
; GFX9-NEXT: v_not_b32_e32 v1, 23
-; GFX9-NEXT: s_lshr_b32 s11, s3, 8
-; GFX9-NEXT: s_lshl_b32 s8, s8, 16
-; GFX9-NEXT: s_and_b32 s3, s3, 0xff
+; GFX9-NEXT: s_or_b32 s9, s9, s10
+; GFX9-NEXT: s_and_b32 s2, 0xffff, s2
; GFX9-NEXT: v_mul_lo_u32 v1, v0, v1
-; GFX9-NEXT: s_or_b32 s2, s2, s8
-; GFX9-NEXT: s_lshl_b32 s3, s3, 8
-; GFX9-NEXT: s_and_b32 s8, s11, 0xff
-; GFX9-NEXT: s_or_b32 s3, s10, s3
-; GFX9-NEXT: s_lshl_b32 s8, s8, 16
-; GFX9-NEXT: s_or_b32 s3, s3, s8
-; GFX9-NEXT: s_lshr_b32 s8, s4, 8
-; GFX9-NEXT: s_and_b32 s8, s8, 0xff
+; GFX9-NEXT: s_and_b32 s9, 0xffff, s9
+; GFX9-NEXT: s_lshl_b32 s2, s2, 16
+; GFX9-NEXT: s_or_b32 s2, s9, s2
+; GFX9-NEXT: s_and_b32 s9, s3, 0xff
+; GFX9-NEXT: s_bfe_u32 s3, s3, 0x80008
+; GFX9-NEXT: s_lshl_b32 s9, s9, 8
+; GFX9-NEXT: s_and_b32 s3, 0xffff, s3
+; GFX9-NEXT: s_or_b32 s8, s8, s9
+; GFX9-NEXT: s_lshl_b32 s3, s3, 16
+; GFX9-NEXT: s_bfe_u32 s10, s4, 0x80008
; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1
-; GFX9-NEXT: s_lshr_b32 s9, s4, 16
-; GFX9-NEXT: s_lshr_b32 s10, s4, 24
-; GFX9-NEXT: s_and_b32 s4, s4, 0xff
-; GFX9-NEXT: s_lshl_b32 s8, s8, 8
-; GFX9-NEXT: s_or_b32 s4, s4, s8
-; GFX9-NEXT: s_and_b32 s8, s9, 0xff
-; GFX9-NEXT: s_lshl_b32 s8, s8, 16
-; GFX9-NEXT: s_or_b32 s4, s4, s8
+; GFX9-NEXT: s_or_b32 s3, s8, s3
+; GFX9-NEXT: s_lshr_b32 s8, s4, 24
+; GFX9-NEXT: s_and_b32 s9, s4, 0xff
+; GFX9-NEXT: s_lshl_b32 s10, s10, 8
+; GFX9-NEXT: s_bfe_u32 s4, s4, 0x80010
+; GFX9-NEXT: s_or_b32 s9, s9, s10
+; GFX9-NEXT: s_and_b32 s4, 0xffff, s4
+; GFX9-NEXT: s_and_b32 s9, 0xffff, s9
+; GFX9-NEXT: s_lshl_b32 s4, s4, 16
+; GFX9-NEXT: s_or_b32 s4, s9, s4
; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
; GFX9-NEXT: v_mul_hi_u32 v1, s4, v0
-; GFX9-NEXT: s_lshr_b32 s11, s5, 8
-; GFX9-NEXT: s_and_b32 s5, s5, 0xff
-; GFX9-NEXT: s_lshl_b32 s5, s5, 8
-; GFX9-NEXT: s_and_b32 s8, s11, 0xff
-; GFX9-NEXT: s_or_b32 s5, s10, s5
+; GFX9-NEXT: s_and_b32 s9, s5, 0xff
+; GFX9-NEXT: s_bfe_u32 s5, s5, 0x80008
+; GFX9-NEXT: s_lshl_b32 s9, s9, 8
+; GFX9-NEXT: s_and_b32 s5, 0xffff, s5
+; GFX9-NEXT: s_or_b32 s8, s8, s9
; GFX9-NEXT: v_mul_lo_u32 v1, v1, 24
-; GFX9-NEXT: s_lshl_b32 s8, s8, 16
-; GFX9-NEXT: s_or_b32 s5, s5, s8
+; GFX9-NEXT: s_lshl_b32 s5, s5, 16
+; GFX9-NEXT: s_or_b32 s5, s8, s5
; GFX9-NEXT: v_mul_hi_u32 v0, s5, v0
; GFX9-NEXT: v_sub_u32_e32 v1, s4, v1
; GFX9-NEXT: v_add_u32_e32 v2, 0xffffffe8, v1
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX9-NEXT: v_mul_lo_u32 v0, v0, 24
+; GFX9-NEXT: s_bfe_u32 s0, s0, 0x80010
; GFX9-NEXT: v_add_u32_e32 v2, 0xffffffe8, v1
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v1
+; GFX9-NEXT: s_and_b32 s7, 0xffff, s7
+; GFX9-NEXT: s_and_b32 s0, 0xffff, s0
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX9-NEXT: v_sub_u32_e32 v2, 23, v1
-; GFX9-NEXT: s_lshl_b32 s4, s6, 17
-; GFX9-NEXT: s_lshl_b32 s0, s0, 1
+; GFX9-NEXT: s_lshl_b32 s0, s0, 17
+; GFX9-NEXT: s_lshl_b32 s4, s7, 1
; GFX9-NEXT: v_and_b32_e32 v1, 0xffffff, v1
-; GFX9-NEXT: s_or_b32 s0, s4, s0
+; GFX9-NEXT: s_or_b32 s0, s0, s4
; GFX9-NEXT: v_and_b32_e32 v2, 0xffffff, v2
; GFX9-NEXT: v_lshrrev_b32_e64 v1, v1, s2
; GFX9-NEXT: v_sub_u32_e32 v0, s5, v0
@@ -2073,12 +2021,14 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
; GFX9-NEXT: v_add_u32_e32 v2, 0xffffffe8, v0
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT: s_bfe_u32 s1, s1, 0x80008
; GFX9-NEXT: v_add_u32_e32 v2, 0xffffffe8, v0
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v0
+; GFX9-NEXT: s_and_b32 s1, 0xffff, s1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX9-NEXT: v_sub_u32_e32 v2, 23, v0
-; GFX9-NEXT: s_lshl_b32 s0, s7, 17
-; GFX9-NEXT: s_lshl_b32 s1, s1, 1
+; GFX9-NEXT: s_lshl_b32 s0, s1, 17
+; GFX9-NEXT: s_lshl_b32 s1, s6, 1
; GFX9-NEXT: v_and_b32_e32 v0, 0xffffff, v0
; GFX9-NEXT: s_or_b32 s0, s0, s1
; GFX9-NEXT: v_and_b32_e32 v2, 0xffffff, v2
@@ -2103,77 +2053,75 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
; GFX10-LABEL: s_fshr_v2i24:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, 24
-; GFX10-NEXT: s_lshr_b32 s13, s4, 8
-; GFX10-NEXT: s_lshr_b32 s14, s4, 16
-; GFX10-NEXT: s_and_b32 s13, s13, 0xff
-; GFX10-NEXT: s_lshr_b32 s15, s4, 24
+; GFX10-NEXT: s_bfe_u32 s15, s4, 0x80008
+; GFX10-NEXT: s_lshr_b32 s13, s4, 24
+; GFX10-NEXT: s_and_b32 s14, s4, 0xff
+; GFX10-NEXT: s_bfe_u32 s4, s4, 0x80010
; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GFX10-NEXT: s_and_b32 s4, s4, 0xff
-; GFX10-NEXT: s_and_b32 s14, s14, 0xff
-; GFX10-NEXT: s_lshl_b32 s13, s13, 8
-; GFX10-NEXT: s_lshl_b32 s14, s14, 16
-; GFX10-NEXT: s_or_b32 s4, s4, s13
-; GFX10-NEXT: s_lshr_b32 s16, s5, 8
-; GFX10-NEXT: s_and_b32 s5, s5, 0xff
-; GFX10-NEXT: s_or_b32 s4, s4, s14
-; GFX10-NEXT: s_lshl_b32 s5, s5, 8
+; GFX10-NEXT: s_lshl_b32 s15, s15, 8
+; GFX10-NEXT: s_and_b32 s4, 0xffff, s4
+; GFX10-NEXT: s_or_b32 s14, s14, s15
+; GFX10-NEXT: s_lshl_b32 s4, s4, 16
+; GFX10-NEXT: s_and_b32 s14, 0xffff, s14
+; GFX10-NEXT: s_and_b32 s16, s5, 0xff
+; GFX10-NEXT: s_bfe_u32 s5, s5, 0x80008
+; GFX10-NEXT: s_or_b32 s4, s14, s4
+; GFX10-NEXT: s_lshl_b32 s16, s16, 8
; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
-; GFX10-NEXT: s_and_b32 s16, s16, 0xff
-; GFX10-NEXT: s_or_b32 s5, s15, s5
-; GFX10-NEXT: s_lshl_b32 s13, s16, 16
-; GFX10-NEXT: s_lshr_b32 s10, s2, 8
+; GFX10-NEXT: s_and_b32 s5, 0xffff, s5
+; GFX10-NEXT: s_or_b32 s13, s13, s16
+; GFX10-NEXT: s_lshl_b32 s5, s5, 16
+; GFX10-NEXT: s_bfe_u32 s12, s2, 0x80008
; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX10-NEXT: s_or_b32 s5, s5, s13
-; GFX10-NEXT: s_lshr_b32 s9, s1, 8
-; GFX10-NEXT: s_and_b32 s1, s1, 0xff
-; GFX10-NEXT: s_lshr_b32 s11, s2, 16
+; GFX10-NEXT: s_or_b32 s5, s13, s5
+; GFX10-NEXT: s_and_b32 s9, s1, 0xff
+; GFX10-NEXT: s_lshr_b32 s10, s2, 24
+; GFX10-NEXT: s_and_b32 s11, s2, 0xff
; GFX10-NEXT: v_mul_lo_u32 v2, 0xffffffe8, v0
-; GFX10-NEXT: s_lshr_b32 s13, s3, 8
-; GFX10-NEXT: s_and_b32 s3, s3, 0xff
-; GFX10-NEXT: s_and_b32 s10, s10, 0xff
-; GFX10-NEXT: s_lshr_b32 s6, s0, 8
-; GFX10-NEXT: s_lshr_b32 s8, s0, 24
-; GFX10-NEXT: s_lshr_b32 s12, s2, 24
-; GFX10-NEXT: s_and_b32 s2, s2, 0xff
+; GFX10-NEXT: s_bfe_u32 s2, s2, 0x80010
+; GFX10-NEXT: s_and_b32 s13, s3, 0xff
+; GFX10-NEXT: s_bfe_u32 s3, s3, 0x80008
+; GFX10-NEXT: s_lshl_b32 s12, s12, 8
+; GFX10-NEXT: s_bfe_u32 s8, s0, 0x80008
+; GFX10-NEXT: s_lshr_b32 s6, s0, 24
+; GFX10-NEXT: s_lshl_b32 s9, s9, 8
; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2
-; GFX10-NEXT: s_lshl_b32 s1, s1, 8
-; GFX10-NEXT: s_and_b32 s11, s11, 0xff
-; GFX10-NEXT: s_lshl_b32 s3, s3, 8
-; GFX10-NEXT: s_and_b32 s13, s13, 0xff
-; GFX10-NEXT: s_and_b32 s6, s6, 0xff
-; GFX10-NEXT: s_or_b32 s1, s8, s1
-; GFX10-NEXT: s_or_b32 s3, s12, s3
+; GFX10-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX10-NEXT: s_lshl_b32 s13, s13, 8
+; GFX10-NEXT: s_and_b32 s3, 0xffff, s3
+; GFX10-NEXT: s_and_b32 s7, s0, 0xff
+; GFX10-NEXT: s_lshl_b32 s8, s8, 8
+; GFX10-NEXT: s_lshl_b32 s2, s2, 16
+; GFX10-NEXT: s_lshl_b32 s3, s3, 16
; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2
-; GFX10-NEXT: s_lshl_b32 s8, s13, 16
-; GFX10-NEXT: s_lshr_b32 s7, s0, 16
-; GFX10-NEXT: s_and_b32 s0, s0, 0xff
-; GFX10-NEXT: s_lshl_b32 s6, s6, 8
+; GFX10-NEXT: s_bfe_u32 s0, s0, 0x80010
+; GFX10-NEXT: s_bfe_u32 s1, s1, 0x80008
+; GFX10-NEXT: s_or_b32 s7, s7, s8
+; GFX10-NEXT: s_and_b32 s0, 0xffff, s0
; GFX10-NEXT: v_mul_hi_u32 v2, s4, v0
; GFX10-NEXT: v_mul_hi_u32 v0, s5, v0
-; GFX10-NEXT: s_or_b32 s3, s3, s8
-; GFX10-NEXT: s_and_b32 s7, s7, 0xff
-; GFX10-NEXT: s_and_b32 s9, s9, 0xff
-; GFX10-NEXT: s_or_b32 s0, s0, s6
-; GFX10-NEXT: s_lshl_b32 s7, s7, 17
-; GFX10-NEXT: s_lshl_b32 s9, s9, 17
+; GFX10-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX10-NEXT: s_and_b32 s7, 0xffff, s7
+; GFX10-NEXT: s_lshl_b32 s0, s0, 17
+; GFX10-NEXT: s_lshl_b32 s1, s1, 17
+; GFX10-NEXT: v_mov_b32_e32 v1, 8
; GFX10-NEXT: v_mul_lo_u32 v2, v2, 24
; GFX10-NEXT: v_mul_lo_u32 v0, v0, 24
-; GFX10-NEXT: s_lshl_b32 s0, s0, 1
-; GFX10-NEXT: s_lshl_b32 s1, s1, 1
-; GFX10-NEXT: s_or_b32 s0, s7, s0
-; GFX10-NEXT: s_or_b32 s1, s9, s1
-; GFX10-NEXT: v_mov_b32_e32 v1, 8
; GFX10-NEXT: v_sub_nc_u32_e32 v2, s4, v2
; GFX10-NEXT: v_sub_nc_u32_e32 v0, s5, v0
-; GFX10-NEXT: s_lshl_b32 s4, s10, 8
-; GFX10-NEXT: s_lshl_b32 s5, s11, 16
-; GFX10-NEXT: s_or_b32 s2, s2, s4
+; GFX10-NEXT: s_or_b32 s5, s11, s12
+; GFX10-NEXT: s_or_b32 s4, s6, s9
+; GFX10-NEXT: s_or_b32 s6, s10, s13
; GFX10-NEXT: v_add_nc_u32_e32 v3, 0xffffffe8, v2
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v2
; GFX10-NEXT: v_add_nc_u32_e32 v4, 0xffffffe8, v0
-; GFX10-NEXT: s_or_b32 s2, s2, s5
+; GFX10-NEXT: s_and_b32 s5, 0xffff, s5
+; GFX10-NEXT: s_or_b32 s3, s6, s3
+; GFX10-NEXT: s_or_b32 s2, s5, s2
; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0
+; GFX10-NEXT: s_lshl_b32 s5, s7, 1
+; GFX10-NEXT: s_or_b32 s0, s0, s5
; GFX10-NEXT: v_add_nc_u32_e32 v3, 0xffffffe8, v2
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v2
@@ -2187,8 +2135,10 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
; GFX10-NEXT: v_sub_nc_u32_e32 v4, 23, v0
; GFX10-NEXT: v_and_b32_e32 v0, 0xffffff, v0
; GFX10-NEXT: v_lshrrev_b32_e64 v2, v2, s2
+; GFX10-NEXT: s_lshl_b32 s2, s4, 1
; GFX10-NEXT: v_and_b32_e32 v4, 0xffffff, v4
; GFX10-NEXT: v_lshrrev_b32_e64 v0, v0, s3
+; GFX10-NEXT: s_or_b32 s1, s1, s2
; GFX10-NEXT: v_lshl_or_b32 v2, s0, v3, v2
; GFX10-NEXT: v_mov_b32_e32 v3, 16
; GFX10-NEXT: v_lshl_or_b32 v0, s1, v4, v0
@@ -2208,10 +2158,10 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
; GFX11-LABEL: s_fshr_v2i24:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, 24
-; GFX11-NEXT: s_lshr_b32 s14, s4, 8
-; GFX11-NEXT: s_lshr_b32 s15, s4, 16
-; GFX11-NEXT: s_and_b32 s14, s14, 0xff
-; GFX11-NEXT: s_lshr_b32 s16, s4, 24
+; GFX11-NEXT: s_bfe_u32 s15, s4, 0x80008
+; GFX11-NEXT: s_lshr_b32 s13, s4, 24
+; GFX11-NEXT: s_and_b32 s14, s4, 0xff
+; GFX11-NEXT: s_bfe_u32 s4, s4, 0x80010
; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX11-NEXT: s_and_b32 s4, s4, 0xff
; GFX11-NEXT: s_and_b32 s15, s15, 0xff
@@ -2224,62 +2174,63 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
; GFX11-NEXT: s_lshl_b32 s5, s5, 8
; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
-; GFX11-NEXT: s_and_b32 s17, s17, 0xff
-; GFX11-NEXT: s_or_b32 s5, s16, s5
-; GFX11-NEXT: s_lshl_b32 s14, s17, 16
-; GFX11-NEXT: s_lshr_b32 s10, s2, 8
+; GFX11-NEXT: s_and_b32 s5, 0xffff, s5
+; GFX11-NEXT: s_or_b32 s13, s13, s16
+; GFX11-NEXT: s_lshl_b32 s5, s5, 16
+; GFX11-NEXT: s_and_b32 s9, s1, 0xff
; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX11-NEXT: s_or_b32 s5, s5, s14
-; GFX11-NEXT: s_lshr_b32 s11, s2, 16
-; GFX11-NEXT: s_and_b32 s10, s10, 0xff
-; GFX11-NEXT: s_lshr_b32 s6, s0, 8
+; GFX11-NEXT: s_or_b32 s5, s13, s5
+; GFX11-NEXT: s_bfe_u32 s12, s2, 0x80008
+; GFX11-NEXT: s_lshr_b32 s6, s0, 24
+; GFX11-NEXT: s_lshr_b32 s10, s2, 24
; GFX11-NEXT: v_mul_lo_u32 v1, 0xffffffe8, v0
-; GFX11-NEXT: s_lshr_b32 s12, s2, 24
-; GFX11-NEXT: s_and_b32 s2, s2, 0xff
-; GFX11-NEXT: s_and_b32 s11, s11, 0xff
-; GFX11-NEXT: s_and_b32 s6, s6, 0xff
-; GFX11-NEXT: s_lshr_b32 s7, s0, 16
-; GFX11-NEXT: s_lshr_b32 s8, s0, 24
-; GFX11-NEXT: s_lshr_b32 s9, s1, 8
+; GFX11-NEXT: s_and_b32 s11, s2, 0xff
+; GFX11-NEXT: s_bfe_u32 s2, s2, 0x80010
+; GFX11-NEXT: s_lshl_b32 s9, s9, 8
+; GFX11-NEXT: s_lshl_b32 s12, s12, 8
+; GFX11-NEXT: s_bfe_u32 s8, s0, 0x80008
+; GFX11-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX11-NEXT: s_and_b32 s7, s0, 0xff
; GFX11-NEXT: v_mul_hi_u32 v1, v0, v1
-; GFX11-NEXT: s_and_b32 s0, s0, 0xff
-; GFX11-NEXT: s_and_b32 s1, s1, 0xff
-; GFX11-NEXT: s_lshr_b32 s13, s3, 8
-; GFX11-NEXT: s_and_b32 s3, s3, 0xff
-; GFX11-NEXT: s_lshl_b32 s6, s6, 8
-; GFX11-NEXT: s_and_b32 s7, s7, 0xff
-; GFX11-NEXT: s_lshl_b32 s1, s1, 8
+; GFX11-NEXT: s_and_b32 s13, s3, 0xff
+; GFX11-NEXT: s_bfe_u32 s3, s3, 0x80008
+; GFX11-NEXT: s_lshl_b32 s8, s8, 8
+; GFX11-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-NEXT: s_bfe_u32 s0, s0, 0x80010
+; GFX11-NEXT: s_lshl_b32 s13, s13, 8
+; GFX11-NEXT: s_and_b32 s3, 0xffff, s3
; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v1
-; GFX11-NEXT: s_lshl_b32 s3, s3, 8
-; GFX11-NEXT: s_and_b32 s13, s13, 0xff
-; GFX11-NEXT: s_or_b32 s0, s0, s6
-; GFX11-NEXT: s_or_b32 s1, s8, s1
+; GFX11-NEXT: s_or_b32 s7, s7, s8
+; GFX11-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX11-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-NEXT: s_and_b32 s7, 0xffff, s7
; GFX11-NEXT: v_mul_hi_u32 v1, s4, v0
; GFX11-NEXT: v_mul_hi_u32 v0, s5, v0
-; GFX11-NEXT: s_or_b32 s3, s12, s3
-; GFX11-NEXT: s_lshl_b32 s8, s13, 16
-; GFX11-NEXT: s_lshl_b32 s7, s7, 17
-; GFX11-NEXT: s_lshl_b32 s0, s0, 1
-; GFX11-NEXT: s_or_b32 s3, s3, s8
-; GFX11-NEXT: s_or_b32 s0, s7, s0
+; GFX11-NEXT: s_lshl_b32 s0, s0, 17
+; GFX11-NEXT: s_bfe_u32 s1, s1, 0x80008
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX11-NEXT: s_lshl_b32 s1, s1, 17
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_mul_lo_u32 v1, v1, 24
; GFX11-NEXT: v_mul_lo_u32 v0, v0, 24
-; GFX11-NEXT: s_and_b32 s9, s9, 0xff
-; GFX11-NEXT: s_lshl_b32 s1, s1, 1
-; GFX11-NEXT: s_lshl_b32 s9, s9, 17
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_sub_nc_u32_e32 v1, s4, v1
; GFX11-NEXT: v_sub_nc_u32_e32 v0, s5, v0
-; GFX11-NEXT: s_lshl_b32 s4, s10, 8
-; GFX11-NEXT: s_lshl_b32 s5, s11, 16
-; GFX11-NEXT: s_or_b32 s2, s2, s4
+; GFX11-NEXT: s_or_b32 s4, s6, s9
+; GFX11-NEXT: s_or_b32 s6, s11, s12
+; GFX11-NEXT: s_or_b32 s5, s10, s13
; GFX11-NEXT: v_add_nc_u32_e32 v2, 0xffffffe8, v1
; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1
; GFX11-NEXT: v_add_nc_u32_e32 v3, 0xffffffe8, v0
-; GFX11-NEXT: s_or_b32 s2, s2, s5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_and_b32 s6, 0xffff, s6
+; GFX11-NEXT: s_or_b32 s3, s5, s3
+; GFX11-NEXT: s_or_b32 s2, s6, s2
; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0
+; GFX11-NEXT: s_lshl_b32 s5, s7, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_or_b32 s0, s0, s5
; GFX11-NEXT: v_add_nc_u32_e32 v2, 0xffffffe8, v1
; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1
@@ -2292,34 +2243,35 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_and_b32_e32 v2, 0xffffff, v2
; GFX11-NEXT: v_lshrrev_b32_e64 v1, v1, s2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_lshl_b32 s2, s4, 1
; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_lshl_or_b32 v1, s0, v2, v1
-; GFX11-NEXT: s_or_b32 s0, s9, s1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: s_or_b32 s0, s1, s2
; GFX11-NEXT: v_sub_nc_u32_e32 v3, 23, v0
; GFX11-NEXT: v_and_b32_e32 v0, 0xffffff, v0
-; GFX11-NEXT: v_bfe_u32 v2, v1, 8, 8
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v2, v1, 8, 8
; GFX11-NEXT: v_and_b32_e32 v3, 0xffffff, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_lshrrev_b32_e64 v0, v0, s3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_lshl_or_b32 v0, s0, v3, v0
; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 8
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_and_or_b32 v1, 0xff, v1, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v0
; GFX11-NEXT: v_bfe_u32 v5, v0, 8, 8
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX11-NEXT: v_bfe_u32 v0, v0, 16, 8
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 24, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_lshl_or_b32 v0, v0, 8, v5
-; GFX11-NEXT: v_or3_b32 v1, v1, v3, v4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or3_b32 v1, v1, v3, v4
; GFX11-NEXT: v_readfirstlane_b32 s1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_readfirstlane_b32 s0, v1
; GFX11-NEXT: ; return to shader part epilog
%lhs = bitcast i48 %lhs.arg to <2 x i24>
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll
index d16dc348209e2..7bb9ba1f184da 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll
@@ -525,10 +525,8 @@ define amdgpu_cs <2 x i16> @abs_sgpr_v2i16(<2 x i16> inreg %arg) {
;
; GFX8-LABEL: abs_sgpr_v2i16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_lshr_b32 s1, s0, 16
-; GFX8-NEXT: s_sext_i32_i16 s1, s1
-; GFX8-NEXT: s_sext_i32_i16 s0, s0
-; GFX8-NEXT: s_abs_i32 s1, s1
+; GFX8-NEXT: s_sext_i32_i16 s1, s0
+; GFX8-NEXT: s_bfe_i32 s0, s0, 0x100010
; GFX8-NEXT: s_abs_i32 s0, s0
; GFX8-NEXT: s_lshl_b32 s1, s1, 16
; GFX8-NEXT: s_or_b32 s0, s0, s1
@@ -612,11 +610,10 @@ define amdgpu_cs <3 x i16> @abs_sgpr_v3i16(<3 x i16> inreg %arg) {
;
; GFX8-LABEL: abs_sgpr_v3i16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_lshr_b32 s2, s0, 16
-; GFX8-NEXT: s_sext_i32_i16 s2, s2
-; GFX8-NEXT: s_sext_i32_i16 s0, s0
-; GFX8-NEXT: s_abs_i32 s2, s2
+; GFX8-NEXT: s_sext_i32_i16 s2, s0
+; GFX8-NEXT: s_bfe_i32 s0, s0, 0x100010
; GFX8-NEXT: s_abs_i32 s0, s0
+; GFX8-NEXT: s_abs_i32 s2, s2
; GFX8-NEXT: s_sext_i32_i16 s1, s1
; GFX8-NEXT: s_lshl_b32 s2, s2, 16
; GFX8-NEXT: s_abs_i32 s1, s1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
index 2f956d7a0a534..14332dfeaabd8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
@@ -2833,28 +2833,27 @@ define amdgpu_ps i32 @s_saddsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs
;
; GFX8-LABEL: s_saddsat_v2i16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_sext_i32_i16 s4, s0
-; GFX8-NEXT: s_max_i32 s5, s4, 0
-; GFX8-NEXT: s_min_i32 s4, s4, 0
-; GFX8-NEXT: s_sub_i32 s4, 0x8000, s4
-; GFX8-NEXT: s_lshr_b32 s3, s1, 16
+; GFX8-NEXT: s_sext_i32_i16 s3, s0
+; GFX8-NEXT: s_max_i32 s4, s3, 0
+; GFX8-NEXT: s_min_i32 s3, s3, 0
+; GFX8-NEXT: s_sub_i32 s3, 0x8000, s3
+; GFX8-NEXT: s_sext_i32_i16 s3, s3
+; GFX8-NEXT: s_sext_i32_i16 s5, s1
+; GFX8-NEXT: s_sub_i32 s4, 0x7fff, s4
+; GFX8-NEXT: s_max_i32 s3, s3, s5
+; GFX8-NEXT: s_sext_i32_i16 s3, s3
; GFX8-NEXT: s_sext_i32_i16 s4, s4
-; GFX8-NEXT: s_sext_i32_i16 s1, s1
-; GFX8-NEXT: s_sub_i32 s5, 0x7fff, s5
-; GFX8-NEXT: s_max_i32 s1, s4, s1
-; GFX8-NEXT: s_sext_i32_i16 s1, s1
-; GFX8-NEXT: s_sext_i32_i16 s4, s5
; GFX8-NEXT: s_lshr_b32 s2, s0, 16
-; GFX8-NEXT: s_min_i32 s1, s1, s4
-; GFX8-NEXT: s_add_i32 s0, s0, s1
-; GFX8-NEXT: s_sext_i32_i16 s1, s2
-; GFX8-NEXT: s_max_i32 s4, s1, 0
-; GFX8-NEXT: s_min_i32 s1, s1, 0
-; GFX8-NEXT: s_sub_i32 s1, 0x8000, s1
-; GFX8-NEXT: s_sext_i32_i16 s1, s1
+; GFX8-NEXT: s_min_i32 s3, s3, s4
+; GFX8-NEXT: s_add_i32 s0, s0, s3
+; GFX8-NEXT: s_sext_i32_i16 s3, s2
+; GFX8-NEXT: s_max_i32 s4, s3, 0
+; GFX8-NEXT: s_min_i32 s3, s3, 0
+; GFX8-NEXT: s_sub_i32 s3, 0x8000, s3
; GFX8-NEXT: s_sext_i32_i16 s3, s3
+; GFX8-NEXT: s_bfe_i32 s1, s1, 0x100010
; GFX8-NEXT: s_sub_i32 s4, 0x7fff, s4
-; GFX8-NEXT: s_max_i32 s1, s1, s3
+; GFX8-NEXT: s_max_i32 s1, s3, s1
; GFX8-NEXT: s_sext_i32_i16 s1, s1
; GFX8-NEXT: s_sext_i32_i16 s3, s4
; GFX8-NEXT: s_min_i32 s1, s1, s3
@@ -3191,56 +3190,54 @@ define amdgpu_ps <2 x i32> @s_saddsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre
;
; GFX8-LABEL: s_saddsat_v4i16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_sext_i32_i16 s8, s0
-; GFX8-NEXT: s_max_i32 s9, s8, 0
-; GFX8-NEXT: s_min_i32 s8, s8, 0
-; GFX8-NEXT: s_sub_i32 s8, 0x8000, s8
-; GFX8-NEXT: s_lshr_b32 s6, s2, 16
-; GFX8-NEXT: s_sext_i32_i16 s8, s8
-; GFX8-NEXT: s_sext_i32_i16 s2, s2
-; GFX8-NEXT: s_sub_i32 s9, 0x7fff, s9
-; GFX8-NEXT: s_max_i32 s2, s8, s2
-; GFX8-NEXT: s_sext_i32_i16 s2, s2
-; GFX8-NEXT: s_sext_i32_i16 s8, s9
+; GFX8-NEXT: s_sext_i32_i16 s6, s0
+; GFX8-NEXT: s_max_i32 s7, s6, 0
+; GFX8-NEXT: s_min_i32 s6, s6, 0
+; GFX8-NEXT: s_sub_i32 s6, 0x8000, s6
+; GFX8-NEXT: s_sext_i32_i16 s6, s6
+; GFX8-NEXT: s_sext_i32_i16 s8, s2
+; GFX8-NEXT: s_sub_i32 s7, 0x7fff, s7
+; GFX8-NEXT: s_max_i32 s6, s6, s8
+; GFX8-NEXT: s_sext_i32_i16 s6, s6
+; GFX8-NEXT: s_sext_i32_i16 s7, s7
; GFX8-NEXT: s_lshr_b32 s4, s0, 16
-; GFX8-NEXT: s_min_i32 s2, s2, s8
-; GFX8-NEXT: s_add_i32 s0, s0, s2
-; GFX8-NEXT: s_sext_i32_i16 s2, s4
-; GFX8-NEXT: s_max_i32 s8, s2, 0
-; GFX8-NEXT: s_min_i32 s2, s2, 0
-; GFX8-NEXT: s_sub_i32 s2, 0x8000, s2
-; GFX8-NEXT: s_sext_i32_i16 s2, s2
+; GFX8-NEXT: s_min_i32 s6, s6, s7
+; GFX8-NEXT: s_add_i32 s0, s0, s6
+; GFX8-NEXT: s_sext_i32_i16 s6, s4
+; GFX8-NEXT: s_max_i32 s7, s6, 0
+; GFX8-NEXT: s_min_i32 s6, s6, 0
+; GFX8-NEXT: s_sub_i32 s6, 0x8000, s6
; GFX8-NEXT: s_sext_i32_i16 s6, s6
-; GFX8-NEXT: s_sub_i32 s8, 0x7fff, s8
-; GFX8-NEXT: s_max_i32 s2, s2, s6
+; GFX8-NEXT: s_bfe_i32 s2, s2, 0x100010
+; GFX8-NEXT: s_sub_i32 s7, 0x7fff, s7
+; GFX8-NEXT: s_max_i32 s2, s6, s2
; GFX8-NEXT: s_sext_i32_i16 s2, s2
-; GFX8-NEXT: s_sext_i32_i16 s6, s8
+; GFX8-NEXT: s_sext_i32_i16 s6, s7
; GFX8-NEXT: s_min_i32 s2, s2, s6
; GFX8-NEXT: s_add_i32 s4, s4, s2
; GFX8-NEXT: s_sext_i32_i16 s2, s1
; GFX8-NEXT: s_max_i32 s6, s2, 0
; GFX8-NEXT: s_min_i32 s2, s2, 0
; GFX8-NEXT: s_sub_i32 s2, 0x8000, s2
-; GFX8-NEXT: s_lshr_b32 s7, s3, 16
; GFX8-NEXT: s_sext_i32_i16 s2, s2
-; GFX8-NEXT: s_sext_i32_i16 s3, s3
+; GFX8-NEXT: s_sext_i32_i16 s7, s3
; GFX8-NEXT: s_sub_i32 s6, 0x7fff, s6
-; GFX8-NEXT: s_max_i32 s2, s2, s3
+; GFX8-NEXT: s_max_i32 s2, s2, s7
; GFX8-NEXT: s_sext_i32_i16 s2, s2
-; GFX8-NEXT: s_sext_i32_i16 s3, s6
+; GFX8-NEXT: s_sext_i32_i16 s6, s6
; GFX8-NEXT: s_lshr_b32 s5, s1, 16
-; GFX8-NEXT: s_min_i32 s2, s2, s3
+; GFX8-NEXT: s_min_i32 s2, s2, s6
; GFX8-NEXT: s_add_i32 s1, s1, s2
; GFX8-NEXT: s_sext_i32_i16 s2, s5
-; GFX8-NEXT: s_max_i32 s3, s2, 0
+; GFX8-NEXT: s_max_i32 s6, s2, 0
; GFX8-NEXT: s_min_i32 s2, s2, 0
; GFX8-NEXT: s_sub_i32 s2, 0x8000, s2
; GFX8-NEXT: s_sext_i32_i16 s2, s2
-; GFX8-NEXT: s_sext_i32_i16 s6, s7
-; GFX8-NEXT: s_sub_i32 s3, 0x7fff, s3
-; GFX8-NEXT: s_max_i32 s2, s2, s6
+; GFX8-NEXT: s_bfe_i32 s3, s3, 0x100010
+; GFX8-NEXT: s_sub_i32 s6, 0x7fff, s6
+; GFX8-NEXT: s_max_i32 s2, s2, s3
; GFX8-NEXT: s_sext_i32_i16 s2, s2
-; GFX8-NEXT: s_sext_i32_i16 s3, s3
+; GFX8-NEXT: s_sext_i32_i16 s3, s6
; GFX8-NEXT: s_min_i32 s2, s2, s3
; GFX8-NEXT: s_add_i32 s5, s5, s2
; GFX8-NEXT: s_and_b32 s2, 0xffff, s4
@@ -3516,67 +3513,64 @@ define amdgpu_ps <3 x i32> @s_saddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
;
; GFX8-LABEL: s_saddsat_v6i16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_sext_i32_i16 s12, s0
-; GFX8-NEXT: s_max_i32 s13, s12, 0
-; GFX8-NEXT: s_min_i32 s12, s12, 0
-; GFX8-NEXT: s_sub_i32 s12, 0x8000, s12
-; GFX8-NEXT: s_lshr_b32 s9, s3, 16
-; GFX8-NEXT: s_sext_i32_i16 s12, s12
-; GFX8-NEXT: s_sext_i32_i16 s3, s3
-; GFX8-NEXT: s_sub_i32 s13, 0x7fff, s13
-; GFX8-NEXT: s_max_i32 s3, s12, s3
-; GFX8-NEXT: s_sext_i32_i16 s3, s3
-; GFX8-NEXT: s_sext_i32_i16 s12, s13
+; GFX8-NEXT: s_sext_i32_i16 s9, s0
+; GFX8-NEXT: s_max_i32 s10, s9, 0
+; GFX8-NEXT: s_min_i32 s9, s9, 0
+; GFX8-NEXT: s_sub_i32 s9, 0x8000, s9
+; GFX8-NEXT: s_sext_i32_i16 s9, s9
+; GFX8-NEXT: s_sext_i32_i16 s11, s3
+; GFX8-NEXT: s_sub_i32 s10, 0x7fff, s10
+; GFX8-NEXT: s_max_i32 s9, s9, s11
+; GFX8-NEXT: s_sext_i32_i16 s9, s9
+; GFX8-NEXT: s_sext_i32_i16 s10, s10
; GFX8-NEXT: s_lshr_b32 s6, s0, 16
-; GFX8-NEXT: s_min_i32 s3, s3, s12
-; GFX8-NEXT: s_add_i32 s0, s0, s3
-; GFX8-NEXT: s_sext_i32_i16 s3, s6
-; GFX8-NEXT: s_max_i32 s12, s3, 0
-; GFX8-NEXT: s_min_i32 s3, s3, 0
-; GFX8-NEXT: s_sub_i32 s3, 0x8000, s3
-; GFX8-NEXT: s_sext_i32_i16 s3, s3
+; GFX8-NEXT: s_min_i32 s9, s9, s10
+; GFX8-NEXT: s_add_i32 s0, s0, s9
+; GFX8-NEXT: s_sext_i32_i16 s9, s6
+; GFX8-NEXT: s_max_i32 s10, s9, 0
+; GFX8-NEXT: s_min_i32 s9, s9, 0
+; GFX8-NEXT: s_sub_i32 s9, 0x8000, s9
; GFX8-NEXT: s_sext_i32_i16 s9, s9
-; GFX8-NEXT: s_sub_i32 s12, 0x7fff, s12
-; GFX8-NEXT: s_max_i32 s3, s3, s9
+; GFX8-NEXT: s_bfe_i32 s3, s3, 0x100010
+; GFX8-NEXT: s_sub_i32 s10, 0x7fff, s10
+; GFX8-NEXT: s_max_i32 s3, s9, s3
; GFX8-NEXT: s_sext_i32_i16 s3, s3
-; GFX8-NEXT: s_sext_i32_i16 s9, s12
+; GFX8-NEXT: s_sext_i32_i16 s9, s10
; GFX8-NEXT: s_min_i32 s3, s3, s9
; GFX8-NEXT: s_add_i32 s6, s6, s3
; GFX8-NEXT: s_sext_i32_i16 s3, s1
; GFX8-NEXT: s_max_i32 s9, s3, 0
; GFX8-NEXT: s_min_i32 s3, s3, 0
; GFX8-NEXT: s_sub_i32 s3, 0x8000, s3
-; GFX8-NEXT: s_lshr_b32 s10, s4, 16
; GFX8-NEXT: s_sext_i32_i16 s3, s3
-; GFX8-NEXT: s_sext_i32_i16 s4, s4
+; GFX8-NEXT: s_sext_i32_i16 s10, s4
; GFX8-NEXT: s_sub_i32 s9, 0x7fff, s9
-; GFX8-NEXT: s_max_i32 s3, s3, s4
+; GFX8-NEXT: s_max_i32 s3, s3, s10
; GFX8-NEXT: s_sext_i32_i16 s3, s3
-; GFX8-NEXT: s_sext_i32_i16 s4, s9
+; GFX8-NEXT: s_sext_i32_i16 s9, s9
; GFX8-NEXT: s_lshr_b32 s7, s1, 16
-; GFX8-NEXT: s_min_i32 s3, s3, s4
+; GFX8-NEXT: s_min_i32 s3, s3, s9
; GFX8-NEXT: s_add_i32 s1, s1, s3
; GFX8-NEXT: s_sext_i32_i16 s3, s7
-; GFX8-NEXT: s_max_i32 s4, s3, 0
+; GFX8-NEXT: s_max_i32 s9, s3, 0
; GFX8-NEXT: s_min_i32 s3, s3, 0
; GFX8-NEXT: s_sub_i32 s3, 0x8000, s3
; GFX8-NEXT: s_sext_i32_i16 s3, s3
-; GFX8-NEXT: s_sext_i32_i16 s9, s10
-; GFX8-NEXT: s_sub_i32 s4, 0x7fff, s4
-; GFX8-NEXT: s_max_i32 s3, s3, s9
+; GFX8-NEXT: s_bfe_i32 s4, s4, 0x100010
+; GFX8-NEXT: s_sub_i32 s9, 0x7fff, s9
+; GFX8-NEXT: s_max_i32 s3, s3, s4
; GFX8-NEXT: s_sext_i32_i16 s3, s3
-; GFX8-NEXT: s_sext_i32_i16 s4, s4
+; GFX8-NEXT: s_sext_i32_i16 s4, s9
; GFX8-NEXT: s_min_i32 s3, s3, s4
; GFX8-NEXT: s_add_i32 s7, s7, s3
; GFX8-NEXT: s_sext_i32_i16 s3, s2
; GFX8-NEXT: s_max_i32 s4, s3, 0
; GFX8-NEXT: s_min_i32 s3, s3, 0
; GFX8-NEXT: s_sub_i32 s3, 0x8000, s3
-; GFX8-NEXT: s_lshr_b32 s11, s5, 16
; GFX8-NEXT: s_sext_i32_i16 s3, s3
-; GFX8-NEXT: s_sext_i32_i16 s5, s5
+; GFX8-NEXT: s_sext_i32_i16 s9, s5
; GFX8-NEXT: s_sub_i32 s4, 0x7fff, s4
-; GFX8-NEXT: s_max_i32 s3, s3, s5
+; GFX8-NEXT: s_max_i32 s3, s3, s9
; GFX8-NEXT: s_sext_i32_i16 s3, s3
; GFX8-NEXT: s_sext_i32_i16 s4, s4
; GFX8-NEXT: s_lshr_b32 s8, s2, 16
@@ -3587,7 +3581,7 @@ define amdgpu_ps <3 x i32> @s_saddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
; GFX8-NEXT: s_min_i32 s3, s3, 0
; GFX8-NEXT: s_sub_i32 s3, 0x8000, s3
; GFX8-NEXT: s_sext_i32_i16 s3, s3
-; GFX8-NEXT: s_sext_i32_i16 s5, s11
+; GFX8-NEXT: s_bfe_i32 s5, s5, 0x100010
; GFX8-NEXT: s_sub_i32 s4, 0x7fff, s4
; GFX8-NEXT: s_max_i32 s3, s3, s5
; GFX8-NEXT: s_sext_i32_i16 s3, s3
@@ -3930,67 +3924,64 @@ define amdgpu_ps <4 x i32> @s_saddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
;
; GFX8-LABEL: s_saddsat_v8i16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_sext_i32_i16 s16, s0
-; GFX8-NEXT: s_max_i32 s17, s16, 0
-; GFX8-NEXT: s_min_i32 s16, s16, 0
-; GFX8-NEXT: s_sub_i32 s16, 0x8000, s16
-; GFX8-NEXT: s_lshr_b32 s12, s4, 16
-; GFX8-NEXT: s_sext_i32_i16 s16, s16
-; GFX8-NEXT: s_sext_i32_i16 s4, s4
-; GFX8-NEXT: s_sub_i32 s17, 0x7fff, s17
-; GFX8-NEXT: s_max_i32 s4, s16, s4
-; GFX8-NEXT: s_sext_i32_i16 s4, s4
-; GFX8-NEXT: s_sext_i32_i16 s16, s17
+; GFX8-NEXT: s_sext_i32_i16 s12, s0
+; GFX8-NEXT: s_max_i32 s13, s12, 0
+; GFX8-NEXT: s_min_i32 s12, s12, 0
+; GFX8-NEXT: s_sub_i32 s12, 0x8000, s12
+; GFX8-NEXT: s_sext_i32_i16 s12, s12
+; GFX8-NEXT: s_sext_i32_i16 s14, s4
+; GFX8-NEXT: s_sub_i32 s13, 0x7fff, s13
+; GFX8-NEXT: s_max_i32 s12, s12, s14
+; GFX8-NEXT: s_sext_i32_i16 s12, s12
+; GFX8-NEXT: s_sext_i32_i16 s13, s13
; GFX8-NEXT: s_lshr_b32 s8, s0, 16
-; GFX8-NEXT: s_min_i32 s4, s4, s16
-; GFX8-NEXT: s_add_i32 s0, s0, s4
-; GFX8-NEXT: s_sext_i32_i16 s4, s8
-; GFX8-NEXT: s_max_i32 s16, s4, 0
-; GFX8-NEXT: s_min_i32 s4, s4, 0
-; GFX8-NEXT: s_sub_i32 s4, 0x8000, s4
-; GFX8-NEXT: s_sext_i32_i16 s4, s4
+; GFX8-NEXT: s_min_i32 s12, s12, s13
+; GFX8-NEXT: s_add_i32 s0, s0, s12
+; GFX8-NEXT: s_sext_i32_i16 s12, s8
+; GFX8-NEXT: s_max_i32 s13, s12, 0
+; GFX8-NEXT: s_min_i32 s12, s12, 0
+; GFX8-NEXT: s_sub_i32 s12, 0x8000, s12
; GFX8-NEXT: s_sext_i32_i16 s12, s12
-; GFX8-NEXT: s_sub_i32 s16, 0x7fff, s16
-; GFX8-NEXT: s_max_i32 s4, s4, s12
+; GFX8-NEXT: s_bfe_i32 s4, s4, 0x100010
+; GFX8-NEXT: s_sub_i32 s13, 0x7fff, s13
+; GFX8-NEXT: s_max_i32 s4, s12, s4
; GFX8-NEXT: s_sext_i32_i16 s4, s4
-; GFX8-NEXT: s_sext_i32_i16 s12, s16
+; GFX8-NEXT: s_sext_i32_i16 s12, s13
; GFX8-NEXT: s_min_i32 s4, s4, s12
; GFX8-NEXT: s_add_i32 s8, s8, s4
; GFX8-NEXT: s_sext_i32_i16 s4, s1
; GFX8-NEXT: s_max_i32 s12, s4, 0
; GFX8-NEXT: s_min_i32 s4, s4, 0
; GFX8-NEXT: s_sub_i32 s4, 0x8000, s4
-; GFX8-NEXT: s_lshr_b32 s13, s5, 16
; GFX8-NEXT: s_sext_i32_i16 s4, s4
-; GFX8-NEXT: s_sext_i32_i16 s5, s5
+; GFX8-NEXT: s_sext_i32_i16 s13, s5
; GFX8-NEXT: s_sub_i32 s12, 0x7fff, s12
-; GFX8-NEXT: s_max_i32 s4, s4, s5
+; GFX8-NEXT: s_max_i32 s4, s4, s13
; GFX8-NEXT: s_sext_i32_i16 s4, s4
-; GFX8-NEXT: s_sext_i32_i16 s5, s12
+; GFX8-NEXT: s_sext_i32_i16 s12, s12
; GFX8-NEXT: s_lshr_b32 s9, s1, 16
-; GFX8-NEXT: s_min_i32 s4, s4, s5
+; GFX8-NEXT: s_min_i32 s4, s4, s12
; GFX8-NEXT: s_add_i32 s1, s1, s4
; GFX8-NEXT: s_sext_i32_i16 s4, s9
-; GFX8-NEXT: s_max_i32 s5, s4, 0
+; GFX8-NEXT: s_max_i32 s12, s4, 0
; GFX8-NEXT: s_min_i32 s4, s4, 0
; GFX8-NEXT: s_sub_i32 s4, 0x8000, s4
; GFX8-NEXT: s_sext_i32_i16 s4, s4
-; GFX8-NEXT: s_sext_i32_i16 s12, s13
-; GFX8-NEXT: s_sub_i32 s5, 0x7fff, s5
-; GFX8-NEXT: s_max_i32 s4, s4, s12
+; GFX8-NEXT: s_bfe_i32 s5, s5, 0x100010
+; GFX8-NEXT: s_sub_i32 s12, 0x7fff, s12
+; GFX8-NEXT: s_max_i32 s4, s4, s5
; GFX8-NEXT: s_sext_i32_i16 s4, s4
-; GFX8-NEXT: s_sext_i32_i16 s5, s5
+; GFX8-NEXT: s_sext_i32_i16 s5, s12
; GFX8-NEXT: s_min_i32 s4, s4, s5
; GFX8-NEXT: s_add_i32 s9, s9, s4
; GFX8-NEXT: s_sext_i32_i16 s4, s2
; GFX8-NEXT: s_max_i32 s5, s4, 0
; GFX8-NEXT: s_min_i32 s4, s4, 0
; GFX8-NEXT: s_sub_i32 s4, 0x8000, s4
-; GFX8-NEXT: s_lshr_b32 s14, s6, 16
; GFX8-NEXT: s_sext_i32_i16 s4, s4
-; GFX8-NEXT: s_sext_i32_i16 s6, s6
+; GFX8-NEXT: s_sext_i32_i16 s12, s6
; GFX8-NEXT: s_sub_i32 s5, 0x7fff, s5
-; GFX8-NEXT: s_max_i32 s4, s4, s6
+; GFX8-NEXT: s_max_i32 s4, s4, s12
; GFX8-NEXT: s_sext_i32_i16 s4, s4
; GFX8-NEXT: s_sext_i32_i16 s5, s5
; GFX8-NEXT: s_lshr_b32 s10, s2, 16
@@ -4001,7 +3992,7 @@ define amdgpu_ps <4 x i32> @s_saddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
; GFX8-NEXT: s_min_i32 s4, s4, 0
; GFX8-NEXT: s_sub_i32 s4, 0x8000, s4
; GFX8-NEXT: s_sext_i32_i16 s4, s4
-; GFX8-NEXT: s_sext_i32_i16 s6, s14
+; GFX8-NEXT: s_bfe_i32 s6, s6, 0x100010
; GFX8-NEXT: s_sub_i32 s5, 0x7fff, s5
; GFX8-NEXT: s_max_i32 s4, s4, s6
; GFX8-NEXT: s_sext_i32_i16 s4, s4
@@ -4024,10 +4015,9 @@ define amdgpu_ps <4 x i32> @s_saddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
; GFX8-NEXT: s_sext_i32_i16 s4, s11
; GFX8-NEXT: s_max_i32 s5, s4, 0
; GFX8-NEXT: s_min_i32 s4, s4, 0
-; GFX8-NEXT: s_lshr_b32 s15, s7, 16
; GFX8-NEXT: s_sub_i32 s4, 0x8000, s4
; GFX8-NEXT: s_sext_i32_i16 s4, s4
-; GFX8-NEXT: s_sext_i32_i16 s6, s15
+; GFX8-NEXT: s_bfe_i32 s6, s7, 0x100010
; GFX8-NEXT: s_sub_i32 s5, 0x7fff, s5
; GFX8-NEXT: s_max_i32 s4, s4, s6
; GFX8-NEXT: s_sext_i32_i16 s4, s4
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
index c1b225562b77b..3b9ff6c2f1741 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
@@ -2838,31 +2838,30 @@ define amdgpu_ps i32 @s_ssubsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs
;
; GFX8-LABEL: s_ssubsat_v2i16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_sext_i32_i16 s4, s0
-; GFX8-NEXT: s_max_i32 s5, s4, -1
-; GFX8-NEXT: s_add_i32 s5, s5, 0x8001
-; GFX8-NEXT: s_lshr_b32 s3, s1, 16
-; GFX8-NEXT: s_min_i32 s4, s4, -1
-; GFX8-NEXT: s_sext_i32_i16 s5, s5
-; GFX8-NEXT: s_sext_i32_i16 s1, s1
-; GFX8-NEXT: s_add_i32 s4, s4, 0x8000
-; GFX8-NEXT: s_max_i32 s1, s5, s1
-; GFX8-NEXT: s_sext_i32_i16 s1, s1
+; GFX8-NEXT: s_sext_i32_i16 s3, s0
+; GFX8-NEXT: s_max_i32 s4, s3, -1
+; GFX8-NEXT: s_add_i32 s4, s4, 0x8001
+; GFX8-NEXT: s_min_i32 s3, s3, -1
+; GFX8-NEXT: s_sext_i32_i16 s4, s4
+; GFX8-NEXT: s_sext_i32_i16 s5, s1
+; GFX8-NEXT: s_add_i32 s3, s3, 0x8000
+; GFX8-NEXT: s_max_i32 s4, s4, s5
; GFX8-NEXT: s_sext_i32_i16 s4, s4
+; GFX8-NEXT: s_sext_i32_i16 s3, s3
; GFX8-NEXT: s_lshr_b32 s2, s0, 16
-; GFX8-NEXT: s_min_i32 s1, s1, s4
-; GFX8-NEXT: s_sub_i32 s0, s0, s1
-; GFX8-NEXT: s_sext_i32_i16 s1, s2
-; GFX8-NEXT: s_max_i32 s4, s1, -1
+; GFX8-NEXT: s_min_i32 s3, s4, s3
+; GFX8-NEXT: s_sub_i32 s0, s0, s3
+; GFX8-NEXT: s_sext_i32_i16 s3, s2
+; GFX8-NEXT: s_max_i32 s4, s3, -1
; GFX8-NEXT: s_add_i32 s4, s4, 0x8001
-; GFX8-NEXT: s_min_i32 s1, s1, -1
+; GFX8-NEXT: s_min_i32 s3, s3, -1
; GFX8-NEXT: s_sext_i32_i16 s4, s4
-; GFX8-NEXT: s_sext_i32_i16 s3, s3
-; GFX8-NEXT: s_add_i32 s1, s1, 0x8000
-; GFX8-NEXT: s_max_i32 s3, s4, s3
-; GFX8-NEXT: s_sext_i32_i16 s3, s3
+; GFX8-NEXT: s_bfe_i32 s1, s1, 0x100010
+; GFX8-NEXT: s_add_i32 s3, s3, 0x8000
+; GFX8-NEXT: s_max_i32 s1, s4, s1
; GFX8-NEXT: s_sext_i32_i16 s1, s1
-; GFX8-NEXT: s_min_i32 s1, s3, s1
+; GFX8-NEXT: s_sext_i32_i16 s3, s3
+; GFX8-NEXT: s_min_i32 s1, s1, s3
; GFX8-NEXT: s_sub_i32 s1, s2, s1
; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
@@ -3196,57 +3195,55 @@ define amdgpu_ps <2 x i32> @s_ssubsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre
;
; GFX8-LABEL: s_ssubsat_v4i16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_sext_i32_i16 s8, s0
-; GFX8-NEXT: s_max_i32 s9, s8, -1
-; GFX8-NEXT: s_add_i32 s9, s9, 0x8001
-; GFX8-NEXT: s_lshr_b32 s6, s2, 16
-; GFX8-NEXT: s_min_i32 s8, s8, -1
-; GFX8-NEXT: s_sext_i32_i16 s9, s9
-; GFX8-NEXT: s_sext_i32_i16 s2, s2
-; GFX8-NEXT: s_add_i32 s8, s8, 0x8000
-; GFX8-NEXT: s_max_i32 s2, s9, s2
-; GFX8-NEXT: s_sext_i32_i16 s2, s2
-; GFX8-NEXT: s_sext_i32_i16 s8, s8
-; GFX8-NEXT: s_lshr_b32 s4, s0, 16
-; GFX8-NEXT: s_min_i32 s2, s2, s8
-; GFX8-NEXT: s_sub_i32 s0, s0, s2
-; GFX8-NEXT: s_sext_i32_i16 s2, s4
-; GFX8-NEXT: s_max_i32 s8, s2, -1
-; GFX8-NEXT: s_add_i32 s8, s8, 0x8001
-; GFX8-NEXT: s_min_i32 s2, s2, -1
-; GFX8-NEXT: s_sext_i32_i16 s8, s8
-; GFX8-NEXT: s_sext_i32_i16 s6, s6
-; GFX8-NEXT: s_add_i32 s2, s2, 0x8000
-; GFX8-NEXT: s_max_i32 s6, s8, s6
+; GFX8-NEXT: s_sext_i32_i16 s6, s0
+; GFX8-NEXT: s_max_i32 s7, s6, -1
+; GFX8-NEXT: s_add_i32 s7, s7, 0x8001
+; GFX8-NEXT: s_min_i32 s6, s6, -1
+; GFX8-NEXT: s_sext_i32_i16 s7, s7
+; GFX8-NEXT: s_sext_i32_i16 s8, s2
+; GFX8-NEXT: s_add_i32 s6, s6, 0x8000
+; GFX8-NEXT: s_max_i32 s7, s7, s8
+; GFX8-NEXT: s_sext_i32_i16 s7, s7
; GFX8-NEXT: s_sext_i32_i16 s6, s6
+; GFX8-NEXT: s_lshr_b32 s4, s0, 16
+; GFX8-NEXT: s_min_i32 s6, s7, s6
+; GFX8-NEXT: s_sub_i32 s0, s0, s6
+; GFX8-NEXT: s_sext_i32_i16 s6, s4
+; GFX8-NEXT: s_max_i32 s7, s6, -1
+; GFX8-NEXT: s_add_i32 s7, s7, 0x8001
+; GFX8-NEXT: s_min_i32 s6, s6, -1
+; GFX8-NEXT: s_sext_i32_i16 s7, s7
+; GFX8-NEXT: s_bfe_i32 s2, s2, 0x100010
+; GFX8-NEXT: s_add_i32 s6, s6, 0x8000
+; GFX8-NEXT: s_max_i32 s2, s7, s2
; GFX8-NEXT: s_sext_i32_i16 s2, s2
-; GFX8-NEXT: s_min_i32 s2, s6, s2
+; GFX8-NEXT: s_sext_i32_i16 s6, s6
+; GFX8-NEXT: s_min_i32 s2, s2, s6
; GFX8-NEXT: s_sub_i32 s2, s4, s2
; GFX8-NEXT: s_sext_i32_i16 s4, s1
; GFX8-NEXT: s_max_i32 s6, s4, -1
; GFX8-NEXT: s_add_i32 s6, s6, 0x8001
-; GFX8-NEXT: s_lshr_b32 s7, s3, 16
; GFX8-NEXT: s_min_i32 s4, s4, -1
; GFX8-NEXT: s_sext_i32_i16 s6, s6
-; GFX8-NEXT: s_sext_i32_i16 s3, s3
+; GFX8-NEXT: s_sext_i32_i16 s7, s3
+; GFX8-NEXT: s_add_i32 s4, s4, 0x8000
+; GFX8-NEXT: s_max_i32 s6, s6, s7
+; GFX8-NEXT: s_sext_i32_i16 s6, s6
+; GFX8-NEXT: s_sext_i32_i16 s4, s4
+; GFX8-NEXT: s_lshr_b32 s5, s1, 16
+; GFX8-NEXT: s_min_i32 s4, s6, s4
+; GFX8-NEXT: s_sub_i32 s1, s1, s4
+; GFX8-NEXT: s_sext_i32_i16 s4, s5
+; GFX8-NEXT: s_max_i32 s6, s4, -1
+; GFX8-NEXT: s_add_i32 s6, s6, 0x8001
+; GFX8-NEXT: s_min_i32 s4, s4, -1
+; GFX8-NEXT: s_sext_i32_i16 s6, s6
+; GFX8-NEXT: s_bfe_i32 s3, s3, 0x100010
; GFX8-NEXT: s_add_i32 s4, s4, 0x8000
; GFX8-NEXT: s_max_i32 s3, s6, s3
; GFX8-NEXT: s_sext_i32_i16 s3, s3
; GFX8-NEXT: s_sext_i32_i16 s4, s4
-; GFX8-NEXT: s_lshr_b32 s5, s1, 16
; GFX8-NEXT: s_min_i32 s3, s3, s4
-; GFX8-NEXT: s_sub_i32 s1, s1, s3
-; GFX8-NEXT: s_sext_i32_i16 s3, s5
-; GFX8-NEXT: s_max_i32 s4, s3, -1
-; GFX8-NEXT: s_add_i32 s4, s4, 0x8001
-; GFX8-NEXT: s_min_i32 s3, s3, -1
-; GFX8-NEXT: s_sext_i32_i16 s4, s4
-; GFX8-NEXT: s_sext_i32_i16 s6, s7
-; GFX8-NEXT: s_add_i32 s3, s3, 0x8000
-; GFX8-NEXT: s_max_i32 s4, s4, s6
-; GFX8-NEXT: s_sext_i32_i16 s4, s4
-; GFX8-NEXT: s_sext_i32_i16 s3, s3
-; GFX8-NEXT: s_min_i32 s3, s4, s3
; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
; GFX8-NEXT: s_sub_i32 s3, s5, s3
; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
@@ -3521,86 +3518,83 @@ define amdgpu_ps <3 x i32> @s_ssubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
;
; GFX8-LABEL: s_ssubsat_v6i16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_sext_i32_i16 s12, s0
-; GFX8-NEXT: s_max_i32 s13, s12, -1
-; GFX8-NEXT: s_add_i32 s13, s13, 0x8001
-; GFX8-NEXT: s_lshr_b32 s9, s3, 16
-; GFX8-NEXT: s_min_i32 s12, s12, -1
-; GFX8-NEXT: s_sext_i32_i16 s13, s13
-; GFX8-NEXT: s_sext_i32_i16 s3, s3
-; GFX8-NEXT: s_add_i32 s12, s12, 0x8000
-; GFX8-NEXT: s_max_i32 s3, s13, s3
-; GFX8-NEXT: s_sext_i32_i16 s3, s3
-; GFX8-NEXT: s_sext_i32_i16 s12, s12
-; GFX8-NEXT: s_lshr_b32 s6, s0, 16
-; GFX8-NEXT: s_min_i32 s3, s3, s12
-; GFX8-NEXT: s_sub_i32 s0, s0, s3
-; GFX8-NEXT: s_sext_i32_i16 s3, s6
-; GFX8-NEXT: s_max_i32 s12, s3, -1
-; GFX8-NEXT: s_add_i32 s12, s12, 0x8001
-; GFX8-NEXT: s_min_i32 s3, s3, -1
-; GFX8-NEXT: s_sext_i32_i16 s12, s12
-; GFX8-NEXT: s_sext_i32_i16 s9, s9
-; GFX8-NEXT: s_add_i32 s3, s3, 0x8000
-; GFX8-NEXT: s_max_i32 s9, s12, s9
+; GFX8-NEXT: s_sext_i32_i16 s9, s0
+; GFX8-NEXT: s_max_i32 s10, s9, -1
+; GFX8-NEXT: s_add_i32 s10, s10, 0x8001
+; GFX8-NEXT: s_min_i32 s9, s9, -1
+; GFX8-NEXT: s_sext_i32_i16 s10, s10
+; GFX8-NEXT: s_sext_i32_i16 s11, s3
+; GFX8-NEXT: s_add_i32 s9, s9, 0x8000
+; GFX8-NEXT: s_max_i32 s10, s10, s11
+; GFX8-NEXT: s_sext_i32_i16 s10, s10
; GFX8-NEXT: s_sext_i32_i16 s9, s9
+; GFX8-NEXT: s_lshr_b32 s6, s0, 16
+; GFX8-NEXT: s_min_i32 s9, s10, s9
+; GFX8-NEXT: s_sub_i32 s0, s0, s9
+; GFX8-NEXT: s_sext_i32_i16 s9, s6
+; GFX8-NEXT: s_max_i32 s10, s9, -1
+; GFX8-NEXT: s_add_i32 s10, s10, 0x8001
+; GFX8-NEXT: s_min_i32 s9, s9, -1
+; GFX8-NEXT: s_sext_i32_i16 s10, s10
+; GFX8-NEXT: s_bfe_i32 s3, s3, 0x100010
+; GFX8-NEXT: s_add_i32 s9, s9, 0x8000
+; GFX8-NEXT: s_max_i32 s3, s10, s3
; GFX8-NEXT: s_sext_i32_i16 s3, s3
-; GFX8-NEXT: s_min_i32 s3, s9, s3
+; GFX8-NEXT: s_sext_i32_i16 s9, s9
+; GFX8-NEXT: s_min_i32 s3, s3, s9
; GFX8-NEXT: s_sub_i32 s3, s6, s3
; GFX8-NEXT: s_sext_i32_i16 s6, s1
; GFX8-NEXT: s_max_i32 s9, s6, -1
; GFX8-NEXT: s_add_i32 s9, s9, 0x8001
-; GFX8-NEXT: s_lshr_b32 s10, s4, 16
; GFX8-NEXT: s_min_i32 s6, s6, -1
; GFX8-NEXT: s_sext_i32_i16 s9, s9
-; GFX8-NEXT: s_sext_i32_i16 s4, s4
+; GFX8-NEXT: s_sext_i32_i16 s10, s4
+; GFX8-NEXT: s_add_i32 s6, s6, 0x8000
+; GFX8-NEXT: s_max_i32 s9, s9, s10
+; GFX8-NEXT: s_sext_i32_i16 s9, s9
+; GFX8-NEXT: s_sext_i32_i16 s6, s6
+; GFX8-NEXT: s_lshr_b32 s7, s1, 16
+; GFX8-NEXT: s_min_i32 s6, s9, s6
+; GFX8-NEXT: s_sub_i32 s1, s1, s6
+; GFX8-NEXT: s_sext_i32_i16 s6, s7
+; GFX8-NEXT: s_max_i32 s9, s6, -1
+; GFX8-NEXT: s_add_i32 s9, s9, 0x8001
+; GFX8-NEXT: s_min_i32 s6, s6, -1
+; GFX8-NEXT: s_sext_i32_i16 s9, s9
+; GFX8-NEXT: s_bfe_i32 s4, s4, 0x100010
; GFX8-NEXT: s_add_i32 s6, s6, 0x8000
; GFX8-NEXT: s_max_i32 s4, s9, s4
; GFX8-NEXT: s_sext_i32_i16 s4, s4
; GFX8-NEXT: s_sext_i32_i16 s6, s6
-; GFX8-NEXT: s_lshr_b32 s7, s1, 16
; GFX8-NEXT: s_min_i32 s4, s4, s6
-; GFX8-NEXT: s_sub_i32 s1, s1, s4
-; GFX8-NEXT: s_sext_i32_i16 s4, s7
-; GFX8-NEXT: s_max_i32 s6, s4, -1
-; GFX8-NEXT: s_add_i32 s6, s6, 0x8001
-; GFX8-NEXT: s_min_i32 s4, s4, -1
-; GFX8-NEXT: s_sext_i32_i16 s6, s6
-; GFX8-NEXT: s_sext_i32_i16 s9, s10
-; GFX8-NEXT: s_add_i32 s4, s4, 0x8000
-; GFX8-NEXT: s_max_i32 s6, s6, s9
-; GFX8-NEXT: s_sext_i32_i16 s6, s6
-; GFX8-NEXT: s_sext_i32_i16 s4, s4
-; GFX8-NEXT: s_min_i32 s4, s6, s4
; GFX8-NEXT: s_sext_i32_i16 s6, s2
; GFX8-NEXT: s_sub_i32 s4, s7, s4
; GFX8-NEXT: s_max_i32 s7, s6, -1
; GFX8-NEXT: s_add_i32 s7, s7, 0x8001
-; GFX8-NEXT: s_lshr_b32 s11, s5, 16
; GFX8-NEXT: s_min_i32 s6, s6, -1
; GFX8-NEXT: s_sext_i32_i16 s7, s7
-; GFX8-NEXT: s_sext_i32_i16 s5, s5
+; GFX8-NEXT: s_sext_i32_i16 s9, s5
; GFX8-NEXT: s_add_i32 s6, s6, 0x8000
-; GFX8-NEXT: s_max_i32 s5, s7, s5
-; GFX8-NEXT: s_sext_i32_i16 s5, s5
+; GFX8-NEXT: s_max_i32 s7, s7, s9
+; GFX8-NEXT: s_sext_i32_i16 s7, s7
; GFX8-NEXT: s_sext_i32_i16 s6, s6
; GFX8-NEXT: s_lshr_b32 s8, s2, 16
-; GFX8-NEXT: s_min_i32 s5, s5, s6
-; GFX8-NEXT: s_sub_i32 s2, s2, s5
-; GFX8-NEXT: s_sext_i32_i16 s5, s8
-; GFX8-NEXT: s_max_i32 s6, s5, -1
-; GFX8-NEXT: s_add_i32 s6, s6, 0x8001
-; GFX8-NEXT: s_min_i32 s5, s5, -1
-; GFX8-NEXT: s_sext_i32_i16 s6, s6
-; GFX8-NEXT: s_sext_i32_i16 s7, s11
-; GFX8-NEXT: s_add_i32 s5, s5, 0x8000
-; GFX8-NEXT: s_max_i32 s6, s6, s7
+; GFX8-NEXT: s_min_i32 s6, s7, s6
+; GFX8-NEXT: s_sub_i32 s2, s2, s6
+; GFX8-NEXT: s_sext_i32_i16 s6, s8
+; GFX8-NEXT: s_max_i32 s7, s6, -1
+; GFX8-NEXT: s_add_i32 s7, s7, 0x8001
+; GFX8-NEXT: s_min_i32 s6, s6, -1
+; GFX8-NEXT: s_sext_i32_i16 s7, s7
+; GFX8-NEXT: s_bfe_i32 s5, s5, 0x100010
+; GFX8-NEXT: s_add_i32 s6, s6, 0x8000
+; GFX8-NEXT: s_max_i32 s5, s7, s5
; GFX8-NEXT: s_and_b32 s3, 0xffff, s3
-; GFX8-NEXT: s_sext_i32_i16 s6, s6
; GFX8-NEXT: s_sext_i32_i16 s5, s5
+; GFX8-NEXT: s_sext_i32_i16 s6, s6
; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
; GFX8-NEXT: s_lshl_b32 s3, s3, 16
-; GFX8-NEXT: s_min_i32 s5, s6, s5
+; GFX8-NEXT: s_min_i32 s5, s5, s6
; GFX8-NEXT: s_or_b32 s0, s0, s3
; GFX8-NEXT: s_and_b32 s3, 0xffff, s4
; GFX8-NEXT: s_sub_i32 s5, s8, s5
@@ -3935,116 +3929,112 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
;
; GFX8-LABEL: s_ssubsat_v8i16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_sext_i32_i16 s16, s0
-; GFX8-NEXT: s_max_i32 s17, s16, -1
-; GFX8-NEXT: s_add_i32 s17, s17, 0x8001
-; GFX8-NEXT: s_lshr_b32 s12, s4, 16
-; GFX8-NEXT: s_min_i32 s16, s16, -1
-; GFX8-NEXT: s_sext_i32_i16 s17, s17
-; GFX8-NEXT: s_sext_i32_i16 s4, s4
-; GFX8-NEXT: s_add_i32 s16, s16, 0x8000
-; GFX8-NEXT: s_max_i32 s4, s17, s4
-; GFX8-NEXT: s_sext_i32_i16 s4, s4
-; GFX8-NEXT: s_sext_i32_i16 s16, s16
-; GFX8-NEXT: s_lshr_b32 s8, s0, 16
-; GFX8-NEXT: s_min_i32 s4, s4, s16
-; GFX8-NEXT: s_sub_i32 s0, s0, s4
-; GFX8-NEXT: s_sext_i32_i16 s4, s8
-; GFX8-NEXT: s_max_i32 s16, s4, -1
-; GFX8-NEXT: s_add_i32 s16, s16, 0x8001
-; GFX8-NEXT: s_min_i32 s4, s4, -1
-; GFX8-NEXT: s_sext_i32_i16 s16, s16
-; GFX8-NEXT: s_sext_i32_i16 s12, s12
-; GFX8-NEXT: s_add_i32 s4, s4, 0x8000
-; GFX8-NEXT: s_max_i32 s12, s16, s12
+; GFX8-NEXT: s_sext_i32_i16 s12, s0
+; GFX8-NEXT: s_max_i32 s13, s12, -1
+; GFX8-NEXT: s_add_i32 s13, s13, 0x8001
+; GFX8-NEXT: s_min_i32 s12, s12, -1
+; GFX8-NEXT: s_sext_i32_i16 s13, s13
+; GFX8-NEXT: s_sext_i32_i16 s14, s4
+; GFX8-NEXT: s_add_i32 s12, s12, 0x8000
+; GFX8-NEXT: s_max_i32 s13, s13, s14
+; GFX8-NEXT: s_sext_i32_i16 s13, s13
; GFX8-NEXT: s_sext_i32_i16 s12, s12
+; GFX8-NEXT: s_lshr_b32 s8, s0, 16
+; GFX8-NEXT: s_min_i32 s12, s13, s12
+; GFX8-NEXT: s_sub_i32 s0, s0, s12
+; GFX8-NEXT: s_sext_i32_i16 s12, s8
+; GFX8-NEXT: s_max_i32 s13, s12, -1
+; GFX8-NEXT: s_add_i32 s13, s13, 0x8001
+; GFX8-NEXT: s_min_i32 s12, s12, -1
+; GFX8-NEXT: s_sext_i32_i16 s13, s13
+; GFX8-NEXT: s_bfe_i32 s4, s4, 0x100010
+; GFX8-NEXT: s_add_i32 s12, s12, 0x8000
+; GFX8-NEXT: s_max_i32 s4, s13, s4
; GFX8-NEXT: s_sext_i32_i16 s4, s4
-; GFX8-NEXT: s_min_i32 s4, s12, s4
+; GFX8-NEXT: s_sext_i32_i16 s12, s12
+; GFX8-NEXT: s_min_i32 s4, s4, s12
; GFX8-NEXT: s_sub_i32 s4, s8, s4
; GFX8-NEXT: s_sext_i32_i16 s8, s1
; GFX8-NEXT: s_max_i32 s12, s8, -1
; GFX8-NEXT: s_add_i32 s12, s12, 0x8001
-; GFX8-NEXT: s_lshr_b32 s13, s5, 16
; GFX8-NEXT: s_min_i32 s8, s8, -1
; GFX8-NEXT: s_sext_i32_i16 s12, s12
-; GFX8-NEXT: s_sext_i32_i16 s5, s5
+; GFX8-NEXT: s_sext_i32_i16 s13, s5
+; GFX8-NEXT: s_add_i32 s8, s8, 0x8000
+; GFX8-NEXT: s_max_i32 s12, s12, s13
+; GFX8-NEXT: s_sext_i32_i16 s12, s12
+; GFX8-NEXT: s_sext_i32_i16 s8, s8
+; GFX8-NEXT: s_lshr_b32 s9, s1, 16
+; GFX8-NEXT: s_min_i32 s8, s12, s8
+; GFX8-NEXT: s_sub_i32 s1, s1, s8
+; GFX8-NEXT: s_sext_i32_i16 s8, s9
+; GFX8-NEXT: s_max_i32 s12, s8, -1
+; GFX8-NEXT: s_add_i32 s12, s12, 0x8001
+; GFX8-NEXT: s_min_i32 s8, s8, -1
+; GFX8-NEXT: s_sext_i32_i16 s12, s12
+; GFX8-NEXT: s_bfe_i32 s5, s5, 0x100010
; GFX8-NEXT: s_add_i32 s8, s8, 0x8000
; GFX8-NEXT: s_max_i32 s5, s12, s5
; GFX8-NEXT: s_sext_i32_i16 s5, s5
; GFX8-NEXT: s_sext_i32_i16 s8, s8
-; GFX8-NEXT: s_lshr_b32 s9, s1, 16
; GFX8-NEXT: s_min_i32 s5, s5, s8
-; GFX8-NEXT: s_sub_i32 s1, s1, s5
-; GFX8-NEXT: s_sext_i32_i16 s5, s9
-; GFX8-NEXT: s_max_i32 s8, s5, -1
-; GFX8-NEXT: s_add_i32 s8, s8, 0x8001
-; GFX8-NEXT: s_min_i32 s5, s5, -1
-; GFX8-NEXT: s_sext_i32_i16 s8, s8
-; GFX8-NEXT: s_sext_i32_i16 s12, s13
-; GFX8-NEXT: s_add_i32 s5, s5, 0x8000
-; GFX8-NEXT: s_max_i32 s8, s8, s12
-; GFX8-NEXT: s_sext_i32_i16 s8, s8
-; GFX8-NEXT: s_sext_i32_i16 s5, s5
-; GFX8-NEXT: s_min_i32 s5, s8, s5
; GFX8-NEXT: s_sext_i32_i16 s8, s2
; GFX8-NEXT: s_sub_i32 s5, s9, s5
; GFX8-NEXT: s_max_i32 s9, s8, -1
; GFX8-NEXT: s_add_i32 s9, s9, 0x8001
-; GFX8-NEXT: s_lshr_b32 s14, s6, 16
; GFX8-NEXT: s_min_i32 s8, s8, -1
; GFX8-NEXT: s_sext_i32_i16 s9, s9
-; GFX8-NEXT: s_sext_i32_i16 s6, s6
+; GFX8-NEXT: s_sext_i32_i16 s12, s6
+; GFX8-NEXT: s_add_i32 s8, s8, 0x8000
+; GFX8-NEXT: s_max_i32 s9, s9, s12
+; GFX8-NEXT: s_sext_i32_i16 s9, s9
+; GFX8-NEXT: s_sext_i32_i16 s8, s8
+; GFX8-NEXT: s_lshr_b32 s10, s2, 16
+; GFX8-NEXT: s_min_i32 s8, s9, s8
+; GFX8-NEXT: s_sub_i32 s2, s2, s8
+; GFX8-NEXT: s_sext_i32_i16 s8, s10
+; GFX8-NEXT: s_max_i32 s9, s8, -1
+; GFX8-NEXT: s_add_i32 s9, s9, 0x8001
+; GFX8-NEXT: s_min_i32 s8, s8, -1
+; GFX8-NEXT: s_sext_i32_i16 s9, s9
+; GFX8-NEXT: s_bfe_i32 s6, s6, 0x100010
; GFX8-NEXT: s_add_i32 s8, s8, 0x8000
; GFX8-NEXT: s_max_i32 s6, s9, s6
; GFX8-NEXT: s_sext_i32_i16 s6, s6
; GFX8-NEXT: s_sext_i32_i16 s8, s8
-; GFX8-NEXT: s_lshr_b32 s10, s2, 16
; GFX8-NEXT: s_min_i32 s6, s6, s8
-; GFX8-NEXT: s_sub_i32 s2, s2, s6
-; GFX8-NEXT: s_sext_i32_i16 s6, s10
-; GFX8-NEXT: s_max_i32 s8, s6, -1
-; GFX8-NEXT: s_add_i32 s8, s8, 0x8001
-; GFX8-NEXT: s_min_i32 s6, s6, -1
-; GFX8-NEXT: s_sext_i32_i16 s8, s8
-; GFX8-NEXT: s_sext_i32_i16 s9, s14
-; GFX8-NEXT: s_add_i32 s6, s6, 0x8000
-; GFX8-NEXT: s_max_i32 s8, s8, s9
-; GFX8-NEXT: s_sext_i32_i16 s8, s8
-; GFX8-NEXT: s_sext_i32_i16 s6, s6
-; GFX8-NEXT: s_min_i32 s6, s8, s6
; GFX8-NEXT: s_sext_i32_i16 s8, s3
; GFX8-NEXT: s_max_i32 s9, s8, -1
; GFX8-NEXT: s_add_i32 s9, s9, 0x8001
-; GFX8-NEXT: s_lshr_b32 s15, s7, 16
+; GFX8-NEXT: s_sub_i32 s6, s10, s6
; GFX8-NEXT: s_min_i32 s8, s8, -1
; GFX8-NEXT: s_sext_i32_i16 s9, s9
-; GFX8-NEXT: s_sext_i32_i16 s7, s7
+; GFX8-NEXT: s_sext_i32_i16 s10, s7
; GFX8-NEXT: s_add_i32 s8, s8, 0x8000
-; GFX8-NEXT: s_max_i32 s7, s9, s7
-; GFX8-NEXT: s_sext_i32_i16 s7, s7
+; GFX8-NEXT: s_max_i32 s9, s9, s10
+; GFX8-NEXT: s_sext_i32_i16 s9, s9
; GFX8-NEXT: s_sext_i32_i16 s8, s8
; GFX8-NEXT: s_lshr_b32 s11, s3, 16
-; GFX8-NEXT: s_min_i32 s7, s7, s8
-; GFX8-NEXT: s_sub_i32 s3, s3, s7
-; GFX8-NEXT: s_sext_i32_i16 s7, s11
-; GFX8-NEXT: s_max_i32 s8, s7, -1
-; GFX8-NEXT: s_add_i32 s8, s8, 0x8001
+; GFX8-NEXT: s_min_i32 s8, s9, s8
+; GFX8-NEXT: s_sub_i32 s3, s3, s8
+; GFX8-NEXT: s_sext_i32_i16 s8, s11
+; GFX8-NEXT: s_max_i32 s9, s8, -1
+; GFX8-NEXT: s_add_i32 s9, s9, 0x8001
; GFX8-NEXT: s_and_b32 s4, 0xffff, s4
-; GFX8-NEXT: s_min_i32 s7, s7, -1
-; GFX8-NEXT: s_sext_i32_i16 s8, s8
-; GFX8-NEXT: s_sext_i32_i16 s9, s15
+; GFX8-NEXT: s_min_i32 s8, s8, -1
+; GFX8-NEXT: s_sext_i32_i16 s9, s9
+; GFX8-NEXT: s_bfe_i32 s7, s7, 0x100010
; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
; GFX8-NEXT: s_lshl_b32 s4, s4, 16
-; GFX8-NEXT: s_add_i32 s7, s7, 0x8000
-; GFX8-NEXT: s_max_i32 s8, s8, s9
+; GFX8-NEXT: s_add_i32 s8, s8, 0x8000
+; GFX8-NEXT: s_max_i32 s7, s9, s7
; GFX8-NEXT: s_or_b32 s0, s0, s4
; GFX8-NEXT: s_and_b32 s4, 0xffff, s5
-; GFX8-NEXT: s_sub_i32 s6, s10, s6
-; GFX8-NEXT: s_sext_i32_i16 s8, s8
; GFX8-NEXT: s_sext_i32_i16 s7, s7
+; GFX8-NEXT: s_sext_i32_i16 s8, s8
; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
; GFX8-NEXT: s_lshl_b32 s4, s4, 16
-; GFX8-NEXT: s_min_i32 s7, s8, s7
+; GFX8-NEXT: s_min_i32 s7, s7, s8
; GFX8-NEXT: s_or_b32 s1, s1, s4
; GFX8-NEXT: s_and_b32 s4, 0xffff, s6
; GFX8-NEXT: s_sub_i32 s7, s11, s7
diff --git a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
index 99b6ab7a6401b..38f2310b5c62d 100644
--- a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
@@ -736,19 +736,18 @@ define amdgpu_kernel void @vec_smax_smin_sgpr(ptr addrspace(1) %out, <2 x i16> i
; GISEL-VI-NEXT: s_load_dword s2, s[4:5], 0x2c
; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: s_lshr_b32 s3, s2, 16
-; GISEL-VI-NEXT: s_sext_i32_i16 s3, s3
-; GISEL-VI-NEXT: s_sext_i32_i16 s2, s2
-; GISEL-VI-NEXT: s_max_i32 s3, s3, 0
+; GISEL-VI-NEXT: s_sext_i32_i16 s3, s2
+; GISEL-VI-NEXT: s_bfe_i32 s2, s2, 0x100010
; GISEL-VI-NEXT: s_max_i32 s2, s2, 0
-; GISEL-VI-NEXT: s_sext_i32_i16 s3, s3
+; GISEL-VI-NEXT: s_max_i32 s3, s3, 0
; GISEL-VI-NEXT: s_sext_i32_i16 s2, s2
-; GISEL-VI-NEXT: s_min_i32 s3, s3, 0xff
+; GISEL-VI-NEXT: s_sext_i32_i16 s3, s3
; GISEL-VI-NEXT: s_min_i32 s2, s2, 0xff
-; GISEL-VI-NEXT: s_and_b32 s3, 0xffff, s3
+; GISEL-VI-NEXT: s_min_i32 s3, s3, 0xff
; GISEL-VI-NEXT: s_and_b32 s2, 0xffff, s2
-; GISEL-VI-NEXT: s_lshl_b32 s3, s3, 16
-; GISEL-VI-NEXT: s_or_b32 s2, s2, s3
+; GISEL-VI-NEXT: s_and_b32 s3, 0xffff, s3
+; GISEL-VI-NEXT: s_lshl_b32 s2, s2, 16
+; GISEL-VI-NEXT: s_or_b32 s2, s3, s2
; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
; GISEL-VI-NEXT: v_mov_b32_e32 v2, s2
; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
>From 5619d1c7581c1f55fcdb0e6c9b7e2f0dd2ee5f3c Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Thu, 4 Dec 2025 10:45:23 +0100
Subject: [PATCH 2/2] Rebase + Fix issues due to new RBSelect
---
.../lib/CodeGen/GlobalISel/CombinerHelper.cpp | 22 ++-
llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll | 178 +++++++++---------
llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll | 18 +-
.../CodeGen/AMDGPU/GlobalISel/llvm.abs.ll | 11 +-
4 files changed, 122 insertions(+), 107 deletions(-)
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index eb52154df4bb5..c602d9044099f 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -4760,7 +4760,10 @@ bool CombinerHelper::matchBitfieldExtractFromSExtInReg(
if (ShiftImm < 0 || ShiftImm + Width > Ty.getScalarSizeInBits())
return false;
- const RegisterBank *RB = getRegBank(ShiftSrc);
+ // FIXME: Propagate RBs better!
+ const RegisterBank *RB = nullptr;
+ if (MI.getMF()->getProperties().hasRegBankSelected())
+ RB = getRegBank(ShiftSrc);
MatchInfo = [=](MachineIRBuilder &B) {
auto Cst1 = B.buildConstant(ExtractTy, ShiftImm);
@@ -4806,7 +4809,10 @@ bool CombinerHelper::matchBitfieldExtractFromAnd(MachineInstr &MI,
uint64_t Width = APInt(Size, AndImm).countr_one();
- const RegisterBank *RB = getRegBank(ShiftSrc);
+ // FIXME: Propagate RBs better!
+ const RegisterBank *RB = nullptr;
+ if (MI.getMF()->getProperties().hasRegBankSelected())
+ RB = getRegBank(ShiftSrc);
MatchInfo = [=](MachineIRBuilder &B) {
auto WidthCst = B.buildConstant(ExtractTy, Width);
@@ -4863,7 +4869,10 @@ bool CombinerHelper::matchBitfieldExtractFromShr(
const int64_t Pos = ShrAmt - ShlAmt;
const int64_t Width = Size - ShrAmt;
- const RegisterBank *RB = getRegBank(ShlSrc);
+ // FIXME: Propagate RBs better!
+ const RegisterBank *RB = nullptr;
+ if (MI.getMF()->getProperties().hasRegBankSelected())
+ RB = getRegBank(ShlSrc);
MatchInfo = [=](MachineIRBuilder &B) {
auto WidthCst = B.buildConstant(ExtractTy, Width);
@@ -4928,7 +4937,10 @@ bool CombinerHelper::matchBitfieldExtractFromShrAnd(
if (Opcode == TargetOpcode::G_ASHR && Width + ShrAmt == Size)
return false;
- const RegisterBank *RB = getRegBank(AndSrc);
+ // FIXME: Propagate RBs better!
+ const RegisterBank *RB = nullptr;
+ if (MI.getMF()->getProperties().hasRegBankSelected())
+ RB = getRegBank(AndSrc);
MatchInfo = [=](MachineIRBuilder &B) {
auto WidthCst = B.buildConstant(ExtractTy, Width);
@@ -8540,4 +8552,4 @@ bool CombinerHelper::matchSuboCarryOut(const MachineInstr &MI,
}
return false;
-}
\ No newline at end of file
+}
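
For reference, the four CombinerHelper.cpp hunks above all repeat the same guard, so here it is once, consolidated, roughly as it sits inside one of the matchBitfieldExtract* matchers. This is a sketch for readers of the thread, not something the patch applies on its own: SrcReg, Pos, Width and ExtrOpcode stand in for the per-combine values (ShiftSrc/ShlSrc/AndSrc and their positions/widths), and the fragment assumes the surrounding matcher context (MI, MRI, Dst, ExtractTy, MatchInfo) from CombinerHelper; the calls themselves (getRegBank, hasRegBankSelected, buildConstant, buildInstr, setRegBank) are the ones visible in the diff.

  // Only query a register bank once RegBankSelect has run; before that the
  // lookup is meaningless and the new constants need no bank assigned.
  // FIXME (carried over from the patch): Propagate RBs better!
  const RegisterBank *RB = nullptr;
  if (MI.getMF()->getProperties().hasRegBankSelected())
    RB = getRegBank(SrcReg);

  MatchInfo = [=](MachineIRBuilder &B) {
    auto WidthCst = B.buildConstant(ExtractTy, Width);
    auto PosCst = B.buildConstant(ExtractTy, Pos);
    B.buildInstr(ExtrOpcode, {Dst}, {SrcReg, PosCst, WidthCst});
    if (RB) {
      // Borrow the source register's bank for the freshly created constants
      // so the MIR stays consistent when this runs in the RegBankCombiner.
      MRI.setRegBank(WidthCst.getReg(0), *RB);
      MRI.setRegBank(PosCst.getReg(0), *RB);
    }
  };
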
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
index 1b96f2f840938..4145bf37f39ab 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
@@ -916,39 +916,39 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
;
; GFX8-LABEL: s_fshl_v4i8:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_and_b32 s11, s2, 7
+; GFX8-NEXT: s_and_b32 s9, s2, 7
; GFX8-NEXT: s_lshr_b32 s3, s0, 8
; GFX8-NEXT: s_lshr_b32 s4, s0, 16
; GFX8-NEXT: s_lshr_b32 s5, s0, 24
-; GFX8-NEXT: s_lshl_b32 s0, s0, s11
-; GFX8-NEXT: s_and_b32 s11, s1, 0xff
-; GFX8-NEXT: s_lshr_b32 s8, s2, 8
-; GFX8-NEXT: s_lshr_b32 s9, s2, 16
-; GFX8-NEXT: s_lshr_b32 s10, s2, 24
-; GFX8-NEXT: s_lshr_b32 s11, s11, 1
+; GFX8-NEXT: s_lshl_b32 s0, s0, s9
+; GFX8-NEXT: s_bfe_u32 s9, s1, 0x70001
+; GFX8-NEXT: s_lshr_b32 s6, s2, 8
+; GFX8-NEXT: s_lshr_b32 s7, s2, 16
+; GFX8-NEXT: s_lshr_b32 s8, s2, 24
; GFX8-NEXT: s_andn2_b32 s2, 7, s2
-; GFX8-NEXT: s_lshr_b32 s2, s11, s2
-; GFX8-NEXT: s_lshr_b32 s6, s1, 8
+; GFX8-NEXT: s_and_b32 s9, 0xffff, s9
+; GFX8-NEXT: s_lshr_b32 s2, s9, s2
; GFX8-NEXT: s_or_b32 s0, s0, s2
-; GFX8-NEXT: s_and_b32 s2, s8, 7
+; GFX8-NEXT: s_and_b32 s2, s6, 7
; GFX8-NEXT: s_lshl_b32 s2, s3, s2
-; GFX8-NEXT: s_and_b32 s3, s6, 0xff
+; GFX8-NEXT: s_bfe_u32 s3, s1, 0x80008
+; GFX8-NEXT: s_and_b32 s3, 0xffff, s3
; GFX8-NEXT: s_lshr_b32 s3, s3, 1
-; GFX8-NEXT: s_andn2_b32 s6, 7, s8
+; GFX8-NEXT: s_andn2_b32 s6, 7, s6
; GFX8-NEXT: s_lshr_b32 s3, s3, s6
-; GFX8-NEXT: s_lshr_b32 s7, s1, 16
; GFX8-NEXT: s_or_b32 s2, s2, s3
-; GFX8-NEXT: s_and_b32 s3, s9, 7
+; GFX8-NEXT: s_and_b32 s3, s7, 7
; GFX8-NEXT: s_lshl_b32 s3, s4, s3
-; GFX8-NEXT: s_and_b32 s4, s7, 0xff
+; GFX8-NEXT: s_bfe_u32 s4, s1, 0x80010
+; GFX8-NEXT: s_and_b32 s4, 0xffff, s4
; GFX8-NEXT: s_lshr_b32 s4, s4, 1
-; GFX8-NEXT: s_andn2_b32 s6, 7, s9
+; GFX8-NEXT: s_andn2_b32 s6, 7, s7
; GFX8-NEXT: s_lshr_b32 s4, s4, s6
; GFX8-NEXT: s_or_b32 s3, s3, s4
-; GFX8-NEXT: s_and_b32 s4, s10, 7
+; GFX8-NEXT: s_and_b32 s4, s8, 7
; GFX8-NEXT: s_lshl_b32 s4, s5, s4
; GFX8-NEXT: s_lshr_b32 s1, s1, 25
-; GFX8-NEXT: s_andn2_b32 s5, 7, s10
+; GFX8-NEXT: s_andn2_b32 s5, 7, s8
; GFX8-NEXT: s_and_b32 s2, s2, 0xff
; GFX8-NEXT: s_lshr_b32 s1, s1, s5
; GFX8-NEXT: s_and_b32 s0, s0, 0xff
@@ -965,39 +965,39 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
;
; GFX9-LABEL: s_fshl_v4i8:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_and_b32 s11, s2, 7
+; GFX9-NEXT: s_and_b32 s9, s2, 7
; GFX9-NEXT: s_lshr_b32 s3, s0, 8
; GFX9-NEXT: s_lshr_b32 s4, s0, 16
; GFX9-NEXT: s_lshr_b32 s5, s0, 24
-; GFX9-NEXT: s_lshl_b32 s0, s0, s11
-; GFX9-NEXT: s_and_b32 s11, s1, 0xff
-; GFX9-NEXT: s_lshr_b32 s8, s2, 8
-; GFX9-NEXT: s_lshr_b32 s9, s2, 16
-; GFX9-NEXT: s_lshr_b32 s10, s2, 24
-; GFX9-NEXT: s_lshr_b32 s11, s11, 1
+; GFX9-NEXT: s_lshl_b32 s0, s0, s9
+; GFX9-NEXT: s_bfe_u32 s9, s1, 0x70001
+; GFX9-NEXT: s_lshr_b32 s6, s2, 8
+; GFX9-NEXT: s_lshr_b32 s7, s2, 16
+; GFX9-NEXT: s_lshr_b32 s8, s2, 24
; GFX9-NEXT: s_andn2_b32 s2, 7, s2
-; GFX9-NEXT: s_lshr_b32 s2, s11, s2
-; GFX9-NEXT: s_lshr_b32 s6, s1, 8
+; GFX9-NEXT: s_and_b32 s9, 0xffff, s9
+; GFX9-NEXT: s_lshr_b32 s2, s9, s2
; GFX9-NEXT: s_or_b32 s0, s0, s2
-; GFX9-NEXT: s_and_b32 s2, s8, 7
+; GFX9-NEXT: s_and_b32 s2, s6, 7
; GFX9-NEXT: s_lshl_b32 s2, s3, s2
-; GFX9-NEXT: s_and_b32 s3, s6, 0xff
+; GFX9-NEXT: s_bfe_u32 s3, s1, 0x80008
+; GFX9-NEXT: s_and_b32 s3, 0xffff, s3
; GFX9-NEXT: s_lshr_b32 s3, s3, 1
-; GFX9-NEXT: s_andn2_b32 s6, 7, s8
+; GFX9-NEXT: s_andn2_b32 s6, 7, s6
; GFX9-NEXT: s_lshr_b32 s3, s3, s6
-; GFX9-NEXT: s_lshr_b32 s7, s1, 16
; GFX9-NEXT: s_or_b32 s2, s2, s3
-; GFX9-NEXT: s_and_b32 s3, s9, 7
+; GFX9-NEXT: s_and_b32 s3, s7, 7
; GFX9-NEXT: s_lshl_b32 s3, s4, s3
-; GFX9-NEXT: s_and_b32 s4, s7, 0xff
+; GFX9-NEXT: s_bfe_u32 s4, s1, 0x80010
+; GFX9-NEXT: s_and_b32 s4, 0xffff, s4
; GFX9-NEXT: s_lshr_b32 s4, s4, 1
-; GFX9-NEXT: s_andn2_b32 s6, 7, s9
+; GFX9-NEXT: s_andn2_b32 s6, 7, s7
; GFX9-NEXT: s_lshr_b32 s4, s4, s6
; GFX9-NEXT: s_or_b32 s3, s3, s4
-; GFX9-NEXT: s_and_b32 s4, s10, 7
+; GFX9-NEXT: s_and_b32 s4, s8, 7
; GFX9-NEXT: s_lshl_b32 s4, s5, s4
; GFX9-NEXT: s_lshr_b32 s1, s1, 25
-; GFX9-NEXT: s_andn2_b32 s5, 7, s10
+; GFX9-NEXT: s_andn2_b32 s5, 7, s8
; GFX9-NEXT: s_and_b32 s2, s2, 0xff
; GFX9-NEXT: s_lshr_b32 s1, s1, s5
; GFX9-NEXT: s_and_b32 s0, s0, 0xff
@@ -1014,37 +1014,37 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
;
; GFX10-LABEL: s_fshl_v4i8:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_lshr_b32 s6, s1, 8
-; GFX10-NEXT: s_and_b32 s11, s1, 0xff
-; GFX10-NEXT: s_lshr_b32 s8, s2, 8
-; GFX10-NEXT: s_lshr_b32 s9, s2, 16
-; GFX10-NEXT: s_lshr_b32 s10, s2, 24
-; GFX10-NEXT: s_and_b32 s12, s2, 7
-; GFX10-NEXT: s_lshr_b32 s11, s11, 1
+; GFX10-NEXT: s_bfe_u32 s10, s1, 0x70001
+; GFX10-NEXT: s_lshr_b32 s6, s2, 8
+; GFX10-NEXT: s_lshr_b32 s7, s2, 16
+; GFX10-NEXT: s_lshr_b32 s8, s2, 24
+; GFX10-NEXT: s_and_b32 s9, s2, 7
; GFX10-NEXT: s_andn2_b32 s2, 7, s2
-; GFX10-NEXT: s_and_b32 s6, s6, 0xff
+; GFX10-NEXT: s_and_b32 s10, 0xffff, s10
; GFX10-NEXT: s_lshr_b32 s3, s0, 8
-; GFX10-NEXT: s_lshr_b32 s2, s11, s2
-; GFX10-NEXT: s_and_b32 s11, s8, 7
-; GFX10-NEXT: s_lshr_b32 s6, s6, 1
-; GFX10-NEXT: s_andn2_b32 s8, 7, s8
; GFX10-NEXT: s_lshr_b32 s4, s0, 16
; GFX10-NEXT: s_lshr_b32 s5, s0, 24
-; GFX10-NEXT: s_lshr_b32 s7, s1, 16
-; GFX10-NEXT: s_lshl_b32 s0, s0, s12
-; GFX10-NEXT: s_lshl_b32 s3, s3, s11
-; GFX10-NEXT: s_lshr_b32 s6, s6, s8
+; GFX10-NEXT: s_lshl_b32 s0, s0, s9
+; GFX10-NEXT: s_lshr_b32 s2, s10, s2
+; GFX10-NEXT: s_bfe_u32 s9, s1, 0x80008
; GFX10-NEXT: s_or_b32 s0, s0, s2
-; GFX10-NEXT: s_or_b32 s2, s3, s6
-; GFX10-NEXT: s_and_b32 s3, s7, 0xff
-; GFX10-NEXT: s_and_b32 s6, s9, 7
+; GFX10-NEXT: s_and_b32 s2, 0xffff, s9
+; GFX10-NEXT: s_and_b32 s9, s6, 7
+; GFX10-NEXT: s_lshr_b32 s2, s2, 1
+; GFX10-NEXT: s_andn2_b32 s6, 7, s6
+; GFX10-NEXT: s_lshl_b32 s3, s3, s9
+; GFX10-NEXT: s_lshr_b32 s2, s2, s6
+; GFX10-NEXT: s_bfe_u32 s6, s1, 0x80010
+; GFX10-NEXT: s_or_b32 s2, s3, s2
+; GFX10-NEXT: s_and_b32 s3, 0xffff, s6
+; GFX10-NEXT: s_and_b32 s6, s7, 7
; GFX10-NEXT: s_lshr_b32 s3, s3, 1
-; GFX10-NEXT: s_andn2_b32 s7, 7, s9
+; GFX10-NEXT: s_andn2_b32 s7, 7, s7
; GFX10-NEXT: s_lshl_b32 s4, s4, s6
; GFX10-NEXT: s_lshr_b32 s3, s3, s7
-; GFX10-NEXT: s_and_b32 s6, s10, 7
+; GFX10-NEXT: s_and_b32 s6, s8, 7
; GFX10-NEXT: s_lshr_b32 s1, s1, 25
-; GFX10-NEXT: s_andn2_b32 s7, 7, s10
+; GFX10-NEXT: s_andn2_b32 s7, 7, s8
; GFX10-NEXT: s_lshl_b32 s5, s5, s6
; GFX10-NEXT: s_lshr_b32 s1, s1, s7
; GFX10-NEXT: s_or_b32 s3, s4, s3
@@ -1063,37 +1063,37 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
;
; GFX11-LABEL: s_fshl_v4i8:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_lshr_b32 s6, s1, 8
-; GFX11-NEXT: s_and_b32 s11, s1, 0xff
-; GFX11-NEXT: s_lshr_b32 s8, s2, 8
-; GFX11-NEXT: s_lshr_b32 s9, s2, 16
-; GFX11-NEXT: s_lshr_b32 s10, s2, 24
-; GFX11-NEXT: s_and_b32 s12, s2, 7
-; GFX11-NEXT: s_lshr_b32 s11, s11, 1
+; GFX11-NEXT: s_bfe_u32 s10, s1, 0x70001
+; GFX11-NEXT: s_lshr_b32 s6, s2, 8
+; GFX11-NEXT: s_lshr_b32 s7, s2, 16
+; GFX11-NEXT: s_lshr_b32 s8, s2, 24
+; GFX11-NEXT: s_and_b32 s9, s2, 7
; GFX11-NEXT: s_and_not1_b32 s2, 7, s2
-; GFX11-NEXT: s_and_b32 s6, s6, 0xff
+; GFX11-NEXT: s_and_b32 s10, 0xffff, s10
; GFX11-NEXT: s_lshr_b32 s3, s0, 8
-; GFX11-NEXT: s_lshr_b32 s2, s11, s2
-; GFX11-NEXT: s_and_b32 s11, s8, 7
-; GFX11-NEXT: s_lshr_b32 s6, s6, 1
-; GFX11-NEXT: s_and_not1_b32 s8, 7, s8
; GFX11-NEXT: s_lshr_b32 s4, s0, 16
; GFX11-NEXT: s_lshr_b32 s5, s0, 24
-; GFX11-NEXT: s_lshr_b32 s7, s1, 16
-; GFX11-NEXT: s_lshl_b32 s0, s0, s12
-; GFX11-NEXT: s_lshl_b32 s3, s3, s11
-; GFX11-NEXT: s_lshr_b32 s6, s6, s8
+; GFX11-NEXT: s_lshl_b32 s0, s0, s9
+; GFX11-NEXT: s_lshr_b32 s2, s10, s2
+; GFX11-NEXT: s_bfe_u32 s9, s1, 0x80008
; GFX11-NEXT: s_or_b32 s0, s0, s2
-; GFX11-NEXT: s_or_b32 s2, s3, s6
-; GFX11-NEXT: s_and_b32 s3, s7, 0xff
-; GFX11-NEXT: s_and_b32 s6, s9, 7
+; GFX11-NEXT: s_and_b32 s2, 0xffff, s9
+; GFX11-NEXT: s_and_b32 s9, s6, 7
+; GFX11-NEXT: s_lshr_b32 s2, s2, 1
+; GFX11-NEXT: s_and_not1_b32 s6, 7, s6
+; GFX11-NEXT: s_lshl_b32 s3, s3, s9
+; GFX11-NEXT: s_lshr_b32 s2, s2, s6
+; GFX11-NEXT: s_bfe_u32 s6, s1, 0x80010
+; GFX11-NEXT: s_or_b32 s2, s3, s2
+; GFX11-NEXT: s_and_b32 s3, 0xffff, s6
+; GFX11-NEXT: s_and_b32 s6, s7, 7
; GFX11-NEXT: s_lshr_b32 s3, s3, 1
-; GFX11-NEXT: s_and_not1_b32 s7, 7, s9
+; GFX11-NEXT: s_and_not1_b32 s7, 7, s7
; GFX11-NEXT: s_lshl_b32 s4, s4, s6
; GFX11-NEXT: s_lshr_b32 s3, s3, s7
-; GFX11-NEXT: s_and_b32 s6, s10, 7
+; GFX11-NEXT: s_and_b32 s6, s8, 7
; GFX11-NEXT: s_lshr_b32 s1, s1, 25
-; GFX11-NEXT: s_and_not1_b32 s7, 7, s10
+; GFX11-NEXT: s_and_not1_b32 s7, 7, s8
; GFX11-NEXT: s_lshl_b32 s5, s5, s6
; GFX11-NEXT: s_lshr_b32 s1, s1, s7
; GFX11-NEXT: s_or_b32 s3, s4, s3
@@ -2137,15 +2137,15 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
; GFX11-NEXT: s_and_b32 s11, s4, 0xff
; GFX11-NEXT: s_bfe_u32 s4, s4, 0x80010
; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GFX11-NEXT: s_and_b32 s4, s4, 0xff
-; GFX11-NEXT: s_and_b32 s11, s11, 0xff
-; GFX11-NEXT: s_lshl_b32 s10, s10, 8
-; GFX11-NEXT: s_lshl_b32 s11, s11, 16
-; GFX11-NEXT: s_or_b32 s4, s4, s10
-; GFX11-NEXT: s_lshr_b32 s13, s5, 8
-; GFX11-NEXT: s_and_b32 s5, s5, 0xff
-; GFX11-NEXT: s_or_b32 s4, s4, s11
-; GFX11-NEXT: s_lshl_b32 s5, s5, 8
+; GFX11-NEXT: s_lshl_b32 s12, s12, 8
+; GFX11-NEXT: s_and_b32 s4, 0xffff, s4
+; GFX11-NEXT: s_or_b32 s11, s11, s12
+; GFX11-NEXT: s_lshl_b32 s4, s4, 16
+; GFX11-NEXT: s_and_b32 s11, 0xffff, s11
+; GFX11-NEXT: s_and_b32 s13, s5, 0xff
+; GFX11-NEXT: s_bfe_u32 s5, s5, 0x80008
+; GFX11-NEXT: s_or_b32 s4, s11, s4
+; GFX11-NEXT: s_lshl_b32 s13, s13, 8
; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX11-NEXT: s_and_b32 s5, 0xffff, s5
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
index a68916135013a..e4193c33227bb 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
@@ -2163,15 +2163,15 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
; GFX11-NEXT: s_and_b32 s14, s4, 0xff
; GFX11-NEXT: s_bfe_u32 s4, s4, 0x80010
; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GFX11-NEXT: s_and_b32 s4, s4, 0xff
-; GFX11-NEXT: s_and_b32 s15, s15, 0xff
-; GFX11-NEXT: s_lshl_b32 s14, s14, 8
-; GFX11-NEXT: s_lshl_b32 s15, s15, 16
-; GFX11-NEXT: s_or_b32 s4, s4, s14
-; GFX11-NEXT: s_lshr_b32 s17, s5, 8
-; GFX11-NEXT: s_and_b32 s5, s5, 0xff
-; GFX11-NEXT: s_or_b32 s4, s4, s15
-; GFX11-NEXT: s_lshl_b32 s5, s5, 8
+; GFX11-NEXT: s_lshl_b32 s15, s15, 8
+; GFX11-NEXT: s_and_b32 s4, 0xffff, s4
+; GFX11-NEXT: s_or_b32 s14, s14, s15
+; GFX11-NEXT: s_lshl_b32 s4, s4, 16
+; GFX11-NEXT: s_and_b32 s14, 0xffff, s14
+; GFX11-NEXT: s_and_b32 s16, s5, 0xff
+; GFX11-NEXT: s_bfe_u32 s5, s5, 0x80008
+; GFX11-NEXT: s_or_b32 s4, s14, s4
+; GFX11-NEXT: s_lshl_b32 s16, s16, 8
; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX11-NEXT: s_and_b32 s5, 0xffff, s5
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll
index 7bb9ba1f184da..e94abafcb7585 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll
@@ -528,8 +528,10 @@ define amdgpu_cs <2 x i16> @abs_sgpr_v2i16(<2 x i16> inreg %arg) {
; GFX8-NEXT: s_sext_i32_i16 s1, s0
; GFX8-NEXT: s_bfe_i32 s0, s0, 0x100010
; GFX8-NEXT: s_abs_i32 s0, s0
-; GFX8-NEXT: s_lshl_b32 s1, s1, 16
-; GFX8-NEXT: s_or_b32 s0, s0, s1
+; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX8-NEXT: s_abs_i32 s1, s1
+; GFX8-NEXT: s_lshl_b32 s0, s0, 16
+; GFX8-NEXT: s_or_b32 s0, s1, s0
; GFX8-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: abs_sgpr_v2i16:
@@ -613,11 +615,12 @@ define amdgpu_cs <3 x i16> @abs_sgpr_v3i16(<3 x i16> inreg %arg) {
; GFX8-NEXT: s_sext_i32_i16 s2, s0
; GFX8-NEXT: s_bfe_i32 s0, s0, 0x100010
; GFX8-NEXT: s_abs_i32 s0, s0
+; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
; GFX8-NEXT: s_abs_i32 s2, s2
; GFX8-NEXT: s_sext_i32_i16 s1, s1
-; GFX8-NEXT: s_lshl_b32 s2, s2, 16
+; GFX8-NEXT: s_lshl_b32 s0, s0, 16
; GFX8-NEXT: s_abs_i32 s1, s1
-; GFX8-NEXT: s_or_b32 s0, s0, s2
+; GFX8-NEXT: s_or_b32 s0, s2, s0
; GFX8-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: abs_sgpr_v3i16: