[llvm] 1300677 - [AArch64][GlobalISel] combine and + [la]sr => ubfx

Jon Roelofs via llvm-commits llvm-commits at lists.llvm.org
Mon Oct 18 10:42:19 PDT 2021


Author: Jon Roelofs
Date: 2021-10-18T10:33:01-07:00
New Revision: 1300677f976e9975c21fd7bc4aa4e38e1bffeb27

URL: https://github.com/llvm/llvm-project/commit/1300677f976e9975c21fd7bc4aa4e38e1bffeb27
DIFF: https://github.com/llvm/llvm-project/commit/1300677f976e9975c21fd7bc4aa4e38e1bffeb27.diff

LOG: [AArch64][GlobalISel] combine and + [la]sr => ubfx

https://godbolt.org/z/h8ejrG4hb

rdar://83597585

Differential Revision: https://reviews.llvm.org/D111839

Added: 
    llvm/test/CodeGen/AArch64/GlobalISel/form-bitfield-extract-from-shr-and.mir

Modified: 
    llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
    llvm/include/llvm/Target/GlobalISel/Combine.td
    llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
    llvm/test/CodeGen/AArch64/GlobalISel/merge-stores-truncating.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index f9b9ded11ba84..cc07b8e7042d5 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -583,6 +583,9 @@ class CombinerHelper {
   /// Match: shr (shl x, n), k -> sbfx/ubfx x, pos, width
   bool matchBitfieldExtractFromShr(MachineInstr &MI, BuildFnTy &MatchInfo);
 
+  /// Match: shr (and x, n), k -> ubfx x, pos, width
+  bool matchBitfieldExtractFromShrAnd(MachineInstr &MI, BuildFnTy &MatchInfo);
+
   // Helpers for reassociation:
   bool matchReassocConstantInnerRHS(GPtrAdd &MI, MachineInstr *RHS,
                                     BuildFnTy &MatchInfo);

diff  --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 9087c136bb02f..87e827dca3047 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -691,9 +691,16 @@ def bitfield_extract_from_shr : GICombineRule<
     [{ return Helper.matchBitfieldExtractFromShr(*${root}, ${info}); }]),
   (apply [{ Helper.applyBuildFn(*${root}, ${info}); }])>;
 
+def bitfield_extract_from_shr_and : GICombineRule<
+  (defs root:$root, build_fn_matchinfo:$info),
+  (match (wip_match_opcode G_ASHR, G_LSHR):$root,
+    [{ return Helper.matchBitfieldExtractFromShrAnd(*${root}, ${info}); }]),
+  (apply [{ Helper.applyBuildFn(*${root}, ${info}); }])>;
+
 def form_bitfield_extract : GICombineGroup<[bitfield_extract_from_sext_inreg,
                                             bitfield_extract_from_and,
-                                            bitfield_extract_from_shr]>;
+                                            bitfield_extract_from_shr,
+                                            bitfield_extract_from_shr_and]>;
 
 def udiv_by_const : GICombineRule<
   (defs root:$root),

diff  --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index a3b2cc4dcd989..92cc28797a1a0 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -4124,6 +4124,55 @@ bool CombinerHelper::matchBitfieldExtractFromShr(
   return true;
 }
 
+bool CombinerHelper::matchBitfieldExtractFromShrAnd(
+    MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) {
+  const unsigned Opcode = MI.getOpcode();
+  assert(Opcode == TargetOpcode::G_LSHR || Opcode == TargetOpcode::G_ASHR);
+
+  const Register Dst = MI.getOperand(0).getReg();
+  LLT Ty = MRI.getType(Dst);
+  if (!getTargetLowering().isConstantUnsignedBitfieldExtactLegal(
+          TargetOpcode::G_UBFX, Ty, Ty))
+    return false;
+
+  // Try to match shr (and x, c1), c2
+  Register AndSrc;
+  int64_t ShrAmt;
+  int64_t SMask;
+  if (!mi_match(Dst, MRI,
+                m_BinOp(Opcode,
+                        m_OneNonDBGUse(m_GAnd(m_Reg(AndSrc), m_ICst(SMask))),
+                        m_ICst(ShrAmt))))
+    return false;
+
+  const unsigned Size = Ty.getScalarSizeInBits();
+  if (ShrAmt < 0 || ShrAmt >= Size)
+    return false;
+
+  // Check that ubfx can do the extraction, with no holes in the mask.
+  uint64_t UMask = SMask;
+  UMask |= maskTrailingOnes<uint64_t>(ShrAmt);
+  UMask &= maskTrailingOnes<uint64_t>(Size);
+  if (!isMask_64(UMask))
+    return false;
+
+  // Calculate start position and width of the extract.
+  const int64_t Pos = ShrAmt;
+  const int64_t Width = countTrailingOnes(UMask) - ShrAmt;
+
+  // It's preferable to keep the shift, rather than form G_SBFX.
+  // TODO: remove the G_AND via demanded bits analysis.
+  if (Opcode == TargetOpcode::G_ASHR && Width + ShrAmt == Size)
+    return false;
+
+  MatchInfo = [=](MachineIRBuilder &B) {
+    auto WidthCst = B.buildConstant(Ty, Width);
+    auto PosCst = B.buildConstant(Ty, Pos);
+    B.buildInstr(TargetOpcode::G_UBFX, {Dst}, {AndSrc, PosCst, WidthCst});
+  };
+  return true;
+}
+
 bool CombinerHelper::reassociationCanBreakAddressingModePattern(
     MachineInstr &PtrAdd) {
   assert(PtrAdd.getOpcode() == TargetOpcode::G_PTR_ADD);

diff  --git a/llvm/test/CodeGen/AArch64/GlobalISel/form-bitfield-extract-from-shr-and.mir b/llvm/test/CodeGen/AArch64/GlobalISel/form-bitfield-extract-from-shr-and.mir
new file mode 100644
index 0000000000000..55a75b1d03fda
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/form-bitfield-extract-from-shr-and.mir
@@ -0,0 +1,190 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple aarch64 -run-pass=aarch64-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
+
+# Check that we can fold a G_ASHR/G_LSHR fed by a G_AND into a G_SBFX/G_UBFX.
+
+---
+name:            mask_extract_unsigned_32
+legalized: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: mask_extract_unsigned_32
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; CHECK-NEXT: [[UBFX:%[0-9]+]]:_(s32) = G_UBFX [[COPY]], [[C]](s32), [[C]]
+    ; CHECK-NEXT: $w0 = COPY [[UBFX]](s32)
+    %0:_(s32) = COPY $w0
+    %1:_(s32) = G_CONSTANT i32 12
+    %2:_(s32) = G_CONSTANT i32 2
+    %3:_(s32) = G_AND %0, %1
+    %4:_(s32) = G_LSHR %3, %2
+    $w0 = COPY %4(s32)
+...
+---
+name:            mask_extract_unsigned_64
+legalized: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: mask_extract_unsigned_64
+    ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 56
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; CHECK-NEXT: [[UBFX:%[0-9]+]]:_(s64) = G_UBFX [[COPY]], [[C]](s64), [[C1]]
+    ; CHECK-NEXT: $x0 = COPY [[UBFX]](s64)
+    %0:_(s64) = COPY $x0
+    %1:_(s64) = G_CONSTANT i64 1080863910568919040
+    %2:_(s64) = G_CONSTANT i64 56
+    %3:_(s64) = G_AND %0, %1
+    %4:_(s64) = G_LSHR %3, %2
+    $x0 = COPY %4(s64)
+...
+---
+name:            no_mask_extract_unsigned_128
+legalized: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: no_mask_extract_unsigned_128
+    ; CHECK: [[COPY:%[0-9]+]]:_(s128) = COPY $q0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s128) = G_CONSTANT i128 1080863910568919040
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s128) = G_CONSTANT i128 56
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s128) = G_AND [[COPY]], [[C]]
+    ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s128) = G_LSHR [[AND]], [[C1]](s128)
+    ; CHECK-NEXT: $q0 = COPY [[LSHR]](s128)
+    %0:_(s128) = COPY $q0
+    %1:_(s128) = G_CONSTANT i128 1080863910568919040
+    %2:_(s128) = G_CONSTANT i128 56
+    %3:_(s128) = G_AND %0, %1
+    %4:_(s128) = G_LSHR %3, %2
+    $q0 = COPY %4(s128)
+...
+---
+name:            mask_extract_asr
+legalized: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: mask_extract_asr
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 29
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; CHECK-NEXT: [[UBFX:%[0-9]+]]:_(s32) = G_UBFX [[COPY]], [[C]](s32), [[C1]]
+    ; CHECK-NEXT: $w0 = COPY [[UBFX]](s32)
+    %0:_(s32) = COPY $w0
+    %1:_(s32) = G_CONSTANT i32 1610612736
+    %2:_(s32) = G_CONSTANT i32 29
+    %3:_(s32) = G_AND %0, %1
+    %4:_(s32) = G_ASHR %3, %2
+    $w0 = COPY %4(s32)
+...
+---
+name:            no_mask_extract_asr
+legalized: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: no_mask_extract_asr
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1073741824
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 30
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C]]
+    ; CHECK-NEXT: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[AND]], [[C1]](s32)
+    ; CHECK-NEXT: $w0 = COPY [[ASHR]](s32)
+    %0:_(s32) = COPY $w0
+    %1:_(s32) = G_CONSTANT i32 3221225472
+    %2:_(s32) = G_CONSTANT i32 30
+    %3:_(s32) = G_AND %0, %1
+    %4:_(s32) = G_ASHR %3, %2
+    $w0 = COPY %4(s32)
+...
+---
+name:            mask_extract_signed_nonneg
+legalized: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: mask_extract_signed_nonneg
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 29
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; CHECK-NEXT: [[UBFX:%[0-9]+]]:_(s32) = G_UBFX [[COPY]], [[C]](s32), [[C1]]
+    ; CHECK-NEXT: $w0 = COPY [[UBFX]](s32)
+    %0:_(s32) = COPY $w0
+    %1:_(s32) = G_CONSTANT i32 2147483647
+    %2:_(s32) = G_CONSTANT i32 29
+    %3:_(s32) = G_AND %0, %1
+    %4:_(s32) = G_ASHR %3, %2
+    $w0 = COPY %4(s32)
+...
+---
+name:            no_mask_extract_large_shift
+legalized: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: no_mask_extract_large_shift
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 33
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C]]
+    ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[AND]], [[C1]](s32)
+    ; CHECK-NEXT: $w0 = COPY [[LSHR]](s32)
+    %0:_(s32) = COPY $w0
+    %1:_(s32) = G_CONSTANT i32 12
+    %2:_(s32) = G_CONSTANT i32 33
+    %3:_(s32) = G_AND %0, %1
+    %4:_(s32) = G_LSHR %3, %2
+    $w0 = COPY %4(s32)
+...
+---
+name:            no_mask_extract_negative_shift
+legalized: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: no_mask_extract_negative_shift
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C]]
+    ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[AND]], [[C1]](s32)
+    ; CHECK-NEXT: $w0 = COPY [[LSHR]](s32)
+    %0:_(s32) = COPY $w0
+    %1:_(s32) = G_CONSTANT i32 12
+    %2:_(s32) = G_CONSTANT i32 -1
+    %3:_(s32) = G_AND %0, %1
+    %4:_(s32) = G_LSHR %3, %2
+    $w0 = COPY %4(s32)
+...
+---
+name:            no_mask_extract_disjoint
+legalized: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: no_mask_extract_disjoint
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 26
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C]]
+    ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[AND]], [[C1]](s32)
+    ; CHECK-NEXT: $w0 = COPY [[LSHR]](s32)
+    %0:_(s32) = COPY $w0
+    %1:_(s32) = G_CONSTANT i32 26
+    %2:_(s32) = G_CONSTANT i32 1
+    %3:_(s32) = G_AND %0, %1
+    %4:_(s32) = G_LSHR %3, %2
+    $w0 = COPY %4(s32)
+...
+---
+name:           no_mask_extract_extra_bits
+legalized: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: no_mask_extract_extra_bits
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 25
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C]]
+    ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[AND]], [[C1]](s32)
+    ; CHECK-NEXT: $w0 = COPY [[LSHR]](s32)
+    %0:_(s32) = COPY $w0
+    %1:_(s32) = G_CONSTANT i32 25
+    %2:_(s32) = G_CONSTANT i32 2
+    %3:_(s32) = G_AND %0, %1
+    %4:_(s32) = G_LSHR %3, %2
+    $w0 = COPY %4(s32)
+...

diff  --git a/llvm/test/CodeGen/AArch64/GlobalISel/merge-stores-truncating.ll b/llvm/test/CodeGen/AArch64/GlobalISel/merge-stores-truncating.ll
index 8c05f36722f27..80e7ab94575f8 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/merge-stores-truncating.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/merge-stores-truncating.ll
@@ -277,9 +277,8 @@ define dso_local i32 @load_between_stores(i32 %x, i16* %p, i32 *%ptr) {
 define dso_local void @invalid_shift(i16 %x, i8* %p) {
 ; CHECK-LABEL: invalid_shift:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    and w8, w0, #0xffff
+; CHECK-NEXT:    ubfx w8, w0, #4, #12
 ; CHECK-NEXT:    strb w0, [x1]
-; CHECK-NEXT:    lsr w8, w8, #4
 ; CHECK-NEXT:    strb w8, [x1, #1]
 ; CHECK-NEXT:    ret
   %t1 = trunc i16 %x to i8
@@ -316,9 +315,8 @@ define dso_local void @missing_store(i32 %x, i8* %p) {
 define dso_local void @
diff erent_base_reg(i16 %x, i8* %p, i8 *%p2) {
 ; CHECK-LABEL: 
diff erent_base_reg:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    and w8, w0, #0xffff
+; CHECK-NEXT:    ubfx w8, w0, #8, #8
 ; CHECK-NEXT:    strb w0, [x1]
-; CHECK-NEXT:    lsr w8, w8, #8
 ; CHECK-NEXT:    strb w8, [x2, #1]
 ; CHECK-NEXT:    ret
   %t1 = trunc i16 %x to i8
@@ -333,9 +331,8 @@ define dso_local void @
diff erent_base_reg(i16 %x, i8* %p, i8 *%p2) {
 define dso_local void @second_store_is_volatile(i16 %x, i8* %p) {
 ; CHECK-LABEL: second_store_is_volatile:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    and w8, w0, #0xffff
+; CHECK-NEXT:    ubfx w8, w0, #8, #8
 ; CHECK-NEXT:    strb w0, [x1]
-; CHECK-NEXT:    lsr w8, w8, #8
 ; CHECK-NEXT:    strb w8, [x1, #1]
 ; CHECK-NEXT:    ret
   %t1 = trunc i16 %x to i8

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll
index 3e70221e1a690..1a18d7f138cd6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll
@@ -392,8 +392,7 @@ define amdgpu_ps i16 @s_bswap_i16(i16 inreg %src) {
 ; GFX7-LABEL: s_bswap_i16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_lshl_b32 s1, s0, 8
-; GFX7-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX7-NEXT:    s_lshr_b32 s0, s0, 8
+; GFX7-NEXT:    s_bfe_u32 s0, s0, 0x80008
 ; GFX7-NEXT:    s_or_b32 s0, s0, s1
 ; GFX7-NEXT:    ; return to shader part epilog
 ;
@@ -427,8 +426,7 @@ define i16 @v_bswap_i16(i16 %src) {
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v0
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX7-NEXT:    v_bfe_u32 v0, v0, 8, 8
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -459,14 +457,12 @@ define i16 @v_bswap_i16(i16 %src) {
 define amdgpu_ps i32 @s_bswap_v2i16(<2 x i16> inreg %src) {
 ; GFX7-LABEL: s_bswap_v2i16:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_mov_b32 s3, 0xffff
+; GFX7-NEXT:    s_mov_b32 s3, 0x80008
 ; GFX7-NEXT:    s_lshl_b32 s2, s0, 8
-; GFX7-NEXT:    s_and_b32 s0, s0, s3
-; GFX7-NEXT:    s_lshr_b32 s0, s0, 8
+; GFX7-NEXT:    s_bfe_u32 s0, s0, s3
 ; GFX7-NEXT:    s_or_b32 s0, s0, s2
 ; GFX7-NEXT:    s_lshl_b32 s2, s1, 8
-; GFX7-NEXT:    s_and_b32 s1, s1, s3
-; GFX7-NEXT:    s_lshr_b32 s1, s1, 8
+; GFX7-NEXT:    s_bfe_u32 s1, s1, s3
 ; GFX7-NEXT:    s_or_b32 s1, s1, s2
 ; GFX7-NEXT:    s_bfe_u32 s1, s1, 0x100000
 ; GFX7-NEXT:    s_bfe_u32 s0, s0, 0x100000
@@ -505,8 +501,7 @@ define i32 @v_bswap_i16_zext_to_i32(i16 %src) {
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v0
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX7-NEXT:    v_bfe_u32 v0, v0, 8, 8
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7-NEXT:    v_bfe_u32 v0, v0, 0, 16
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
@@ -541,8 +536,7 @@ define i32 @v_bswap_i16_sext_to_i32(i16 %src) {
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v0
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX7-NEXT:    v_bfe_u32 v0, v0, 8, 8
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7-NEXT:    v_bfe_i32 v0, v0, 0, 16
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
@@ -579,14 +573,11 @@ define <2 x i16> @v_bswap_v2i16(<2 x i16> %src) {
 ; GFX7-LABEL: v_bswap_v2i16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v0
-; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX7-NEXT:    v_bfe_u32 v0, v0, 8, 8
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v1
-; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
-; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX7-NEXT:    v_bfe_u32 v1, v1, 8, 8
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
index 8ec1cc5a8fa32..d0aec6ca18531 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
@@ -687,8 +687,8 @@ define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v0, 0xff00, v0
-; SI-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
+; SI-NEXT:    v_bfe_u32 v0, v0, 8, 8
+; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
 ; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -704,8 +704,7 @@ define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_and_b32_e32 v0, 0xff00, v0
-; VI-NEXT:    v_cvt_f32_ubyte1_e32 v2, v0
+; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    flat_store_dword v[0:1], v2

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
index eea38ac3b061f..3db1fa678de42 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
@@ -11,10 +11,9 @@ define amdgpu_ps i7 @s_fshl_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) {
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX6-NEXT:    s_movk_i32 s3, 0x7f
 ; GFX6-NEXT:    s_and_b32 s2, s2, s3
-; GFX6-NEXT:    s_and_b32 s1, s1, s3
+; GFX6-NEXT:    s_bfe_u32 s1, s1, 0x60001
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX6-NEXT:    s_lshr_b32 s1, s1, 1
 ; GFX6-NEXT:    v_mul_lo_u32 v1, -7, v0
 ; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
@@ -142,6 +141,7 @@ define i7 @v_fshl_i7(i7 %lhs, i7 %rhs, i7 %amt) {
 ; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v3, 7
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v3
 ; GFX6-NEXT:    v_and_b32_e32 v2, 0x7f, v2
+; GFX6-NEXT:    v_bfe_u32 v1, v1, 1, 6
 ; GFX6-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; GFX6-NEXT:    v_mul_lo_u32 v4, -7, v3
@@ -149,8 +149,6 @@ define i7 @v_fshl_i7(i7 %lhs, i7 %rhs, i7 %amt) {
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
 ; GFX6-NEXT:    v_mul_hi_u32 v3, v2, v3
 ; GFX6-NEXT:    v_mov_b32_e32 v4, 0x7f
-; GFX6-NEXT:    v_and_b32_e32 v1, v1, v4
-; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
 ; GFX6-NEXT:    v_mul_lo_u32 v3, v3, 7
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v3
 ; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, 7, v2
@@ -267,10 +265,9 @@ define i7 @v_fshl_i7(i7 %lhs, i7 %rhs, i7 %amt) {
 define amdgpu_ps i8 @s_fshl_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) {
 ; GFX6-LABEL: s_fshl_i8:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_and_b32 s1, s1, 0xff
 ; GFX6-NEXT:    s_and_b32 s3, s2, 7
 ; GFX6-NEXT:    s_andn2_b32 s2, 7, s2
-; GFX6-NEXT:    s_lshr_b32 s1, s1, 1
+; GFX6-NEXT:    s_bfe_u32 s1, s1, 0x70001
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, s3
 ; GFX6-NEXT:    s_lshr_b32 s1, s1, s2
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
@@ -321,9 +318,8 @@ define i8 @v_fshl_i8(i8 %lhs, i8 %rhs, i8 %amt) {
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_and_b32_e32 v3, 7, v2
 ; GFX6-NEXT:    v_xor_b32_e32 v2, -1, v2
-; GFX6-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX6-NEXT:    v_and_b32_e32 v2, 7, v2
-; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
+; GFX6-NEXT:    v_bfe_u32 v1, v1, 1, 7
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v3, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, v2, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -375,9 +371,8 @@ define i8 @v_fshl_i8(i8 %lhs, i8 %rhs, i8 %amt) {
 define amdgpu_ps i8 @s_fshl_i8_4(i8 inreg %lhs, i8 inreg %rhs) {
 ; GFX6-LABEL: s_fshl_i8_4:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_and_b32 s1, s1, 0xff
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 4
-; GFX6-NEXT:    s_lshr_b32 s1, s1, 4
+; GFX6-NEXT:    s_bfe_u32 s1, s1, 0x40004
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
@@ -415,9 +410,8 @@ define i8 @v_fshl_i8_4(i8 %lhs, i8 %rhs) {
 ; GFX6-LABEL: v_fshl_i8_4:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
-; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 4, v1
+; GFX6-NEXT:    v_bfe_u32 v1, v1, 4, 4
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -455,9 +449,8 @@ define i8 @v_fshl_i8_4(i8 %lhs, i8 %rhs) {
 define amdgpu_ps i8 @s_fshl_i8_5(i8 inreg %lhs, i8 inreg %rhs) {
 ; GFX6-LABEL: s_fshl_i8_5:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_and_b32 s1, s1, 0xff
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 5
-; GFX6-NEXT:    s_lshr_b32 s1, s1, 3
+; GFX6-NEXT:    s_bfe_u32 s1, s1, 0x50003
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
@@ -495,9 +488,8 @@ define i8 @v_fshl_i8_5(i8 %lhs, i8 %rhs) {
 ; GFX6-LABEL: v_fshl_i8_5:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
-; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 3, v1
+; GFX6-NEXT:    v_bfe_u32 v1, v1, 3, 5
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -536,13 +528,11 @@ define amdgpu_ps i16 @s_fshl_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in
 ; GFX6-LABEL: s_fshl_v2i8:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_and_b32 s5, s2, 7
-; GFX6-NEXT:    s_movk_i32 s6, 0xff
 ; GFX6-NEXT:    s_lshr_b32 s3, s0, 8
-; GFX6-NEXT:    s_lshl_b32 s0, s0, s5
-; GFX6-NEXT:    s_and_b32 s5, s1, s6
 ; GFX6-NEXT:    s_lshr_b32 s4, s2, 8
 ; GFX6-NEXT:    s_andn2_b32 s2, 7, s2
-; GFX6-NEXT:    s_lshr_b32 s5, s5, 1
+; GFX6-NEXT:    s_lshl_b32 s0, s0, s5
+; GFX6-NEXT:    s_bfe_u32 s5, s1, 0x70001
 ; GFX6-NEXT:    s_lshr_b32 s2, s5, s2
 ; GFX6-NEXT:    s_bfe_u32 s1, s1, 0x80008
 ; GFX6-NEXT:    s_or_b32 s0, s0, s2
@@ -551,6 +541,7 @@ define amdgpu_ps i16 @s_fshl_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in
 ; GFX6-NEXT:    s_lshr_b32 s1, s1, 1
 ; GFX6-NEXT:    s_lshl_b32 s2, s3, s2
 ; GFX6-NEXT:    s_lshr_b32 s1, s1, s4
+; GFX6-NEXT:    s_movk_i32 s6, 0xff
 ; GFX6-NEXT:    s_or_b32 s1, s2, s1
 ; GFX6-NEXT:    s_and_b32 s1, s1, s6
 ; GFX6-NEXT:    s_and_b32 s0, s0, s6
@@ -656,15 +647,13 @@ define i16 @v_fshl_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) {
 ; GFX6-LABEL: v_fshl_v2i8:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_and_b32_e32 v5, 7, v2
-; GFX6-NEXT:    s_movk_i32 s4, 0xff
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 8, v2
+; GFX6-NEXT:    v_and_b32_e32 v5, 7, v2
 ; GFX6-NEXT:    v_xor_b32_e32 v2, -1, v2
-; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v5, v0
-; GFX6-NEXT:    v_and_b32_e32 v5, s4, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
 ; GFX6-NEXT:    v_and_b32_e32 v2, 7, v2
-; GFX6-NEXT:    v_lshrrev_b32_e32 v5, 1, v5
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v5, v0
+; GFX6-NEXT:    v_bfe_u32 v5, v1, 1, 7
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v2, v5
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX6-NEXT:    v_and_b32_e32 v2, 7, v4
@@ -675,8 +664,9 @@ define i16 @v_fshl_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) {
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, v2, v3
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, v4, v1
 ; GFX6-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
-; GFX6-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX6-NEXT:    v_mov_b32_e32 v2, 0xff
+; GFX6-NEXT:    v_and_b32_e32 v1, v1, v2
+; GFX6-NEXT:    v_and_b32_e32 v0, v0, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -772,17 +762,15 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
 ; GFX6-LABEL: s_fshl_v4i8:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_and_b32 s9, s2, 7
-; GFX6-NEXT:    s_movk_i32 s10, 0xff
 ; GFX6-NEXT:    s_lshr_b32 s3, s0, 8
 ; GFX6-NEXT:    s_lshr_b32 s4, s0, 16
 ; GFX6-NEXT:    s_lshr_b32 s5, s0, 24
-; GFX6-NEXT:    s_lshl_b32 s0, s0, s9
-; GFX6-NEXT:    s_and_b32 s9, s1, s10
 ; GFX6-NEXT:    s_lshr_b32 s6, s2, 8
 ; GFX6-NEXT:    s_lshr_b32 s7, s2, 16
 ; GFX6-NEXT:    s_lshr_b32 s8, s2, 24
 ; GFX6-NEXT:    s_andn2_b32 s2, 7, s2
-; GFX6-NEXT:    s_lshr_b32 s9, s9, 1
+; GFX6-NEXT:    s_lshl_b32 s0, s0, s9
+; GFX6-NEXT:    s_bfe_u32 s9, s1, 0x70001
 ; GFX6-NEXT:    s_lshr_b32 s2, s9, s2
 ; GFX6-NEXT:    s_or_b32 s0, s0, s2
 ; GFX6-NEXT:    s_and_b32 s2, s6, 7
@@ -797,6 +785,7 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
 ; GFX6-NEXT:    s_bfe_u32 s4, s1, 0x80010
 ; GFX6-NEXT:    s_andn2_b32 s6, 7, s7
 ; GFX6-NEXT:    s_lshr_b32 s4, s4, 1
+; GFX6-NEXT:    s_movk_i32 s10, 0xff
 ; GFX6-NEXT:    s_lshr_b32 s4, s4, s6
 ; GFX6-NEXT:    s_or_b32 s3, s3, s4
 ; GFX6-NEXT:    s_and_b32 s4, s8, 7
@@ -995,14 +984,13 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v8, 24, v2
 ; GFX6-NEXT:    v_and_b32_e32 v9, 7, v2
 ; GFX6-NEXT:    v_xor_b32_e32 v2, -1, v2
-; GFX6-NEXT:    v_and_b32_e32 v10, 0xff, v1
-; GFX6-NEXT:    v_and_b32_e32 v2, 7, v2
-; GFX6-NEXT:    v_lshrrev_b32_e32 v10, 1, v10
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
+; GFX6-NEXT:    v_and_b32_e32 v2, 7, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v9, v0
-; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v2, v10
+; GFX6-NEXT:    v_bfe_u32 v9, v1, 1, 7
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v2, v9
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX6-NEXT:    v_and_b32_e32 v2, 7, v6
 ; GFX6-NEXT:    v_xor_b32_e32 v6, -1, v6
@@ -1018,23 +1006,23 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
 ; GFX6-NEXT:    v_bfe_u32 v4, v1, 16, 8
 ; GFX6-NEXT:    v_and_b32_e32 v6, 7, v6
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 1, v4
-; GFX6-NEXT:    v_mov_b32_e32 v9, 0xff
+; GFX6-NEXT:    s_movk_i32 s4, 0xff
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v6, v4
 ; GFX6-NEXT:    v_xor_b32_e32 v6, -1, v8
 ; GFX6-NEXT:    v_or_b32_e32 v3, v3, v4
 ; GFX6-NEXT:    v_and_b32_e32 v4, 7, v8
 ; GFX6-NEXT:    v_and_b32_e32 v6, 7, v6
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 25, v1
-; GFX6-NEXT:    v_and_b32_e32 v2, v2, v9
+; GFX6-NEXT:    v_and_b32_e32 v2, s4, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, v4, v5
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, v6, v1
-; GFX6-NEXT:    v_and_b32_e32 v0, v0, v9
+; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; GFX6-NEXT:    v_or_b32_e32 v1, v4, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX6-NEXT:    v_and_b32_e32 v2, v3, v9
+; GFX6-NEXT:    v_and_b32_e32 v2, s4, v3
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT:    v_and_b32_e32 v1, v1, v9
+; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -1215,8 +1203,7 @@ define amdgpu_ps i24 @s_fshl_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt)
 ; GFX6-NEXT:    s_and_b32 s2, s2, s3
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX6-NEXT:    s_and_b32 s1, s1, s3
-; GFX6-NEXT:    s_lshr_b32 s1, s1, 1
+; GFX6-NEXT:    s_bfe_u32 s1, s1, 0x170001
 ; GFX6-NEXT:    v_mul_lo_u32 v1, v1, v0
 ; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
@@ -1247,8 +1234,7 @@ define amdgpu_ps i24 @s_fshl_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt)
 ; GFX8-NEXT:    s_and_b32 s2, s2, s3
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX8-NEXT:    s_and_b32 s1, s1, s3
-; GFX8-NEXT:    s_lshr_b32 s1, s1, 1
+; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x170001
 ; GFX8-NEXT:    v_mul_lo_u32 v1, v1, v0
 ; GFX8-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
@@ -1279,8 +1265,7 @@ define amdgpu_ps i24 @s_fshl_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt)
 ; GFX9-NEXT:    s_and_b32 s2, s2, s3
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    s_and_b32 s1, s1, s3
-; GFX9-NEXT:    s_lshr_b32 s1, s1, 1
+; GFX9-NEXT:    s_bfe_u32 s1, s1, 0x170001
 ; GFX9-NEXT:    v_mul_lo_u32 v1, v1, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
@@ -1305,10 +1290,9 @@ define amdgpu_ps i24 @s_fshl_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt)
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, 24
 ; GFX10-NEXT:    s_mov_b32 s3, 0xffffff
+; GFX10-NEXT:    s_bfe_u32 s1, s1, 0x170001
 ; GFX10-NEXT:    s_and_b32 s2, s2, s3
-; GFX10-NEXT:    s_and_b32 s1, s1, s3
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX10-NEXT:    s_lshr_b32 s1, s1, 1
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX10-NEXT:    v_mul_lo_u32 v1, 0xffffffe8, v0
@@ -1342,6 +1326,7 @@ define i24 @v_fshl_i24(i24 %lhs, i24 %rhs, i24 %amt) {
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v3
 ; GFX6-NEXT:    v_mov_b32_e32 v4, 0xffffffe8
 ; GFX6-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
+; GFX6-NEXT:    v_bfe_u32 v1, v1, 1, 23
 ; GFX6-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; GFX6-NEXT:    v_mul_lo_u32 v4, v4, v3
@@ -1349,8 +1334,6 @@ define i24 @v_fshl_i24(i24 %lhs, i24 %rhs, i24 %amt) {
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
 ; GFX6-NEXT:    v_mul_hi_u32 v3, v2, v3
 ; GFX6-NEXT:    v_mov_b32_e32 v4, 0xffffff
-; GFX6-NEXT:    v_and_b32_e32 v1, v1, v4
-; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
 ; GFX6-NEXT:    v_mul_lo_u32 v3, v3, 24
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v3
 ; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, 24, v2
@@ -1374,6 +1357,7 @@ define i24 @v_fshl_i24(i24 %lhs, i24 %rhs, i24 %amt) {
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v3, v3
 ; GFX8-NEXT:    v_mov_b32_e32 v4, 0xffffffe8
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
+; GFX8-NEXT:    v_bfe_u32 v1, v1, 1, 23
 ; GFX8-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; GFX8-NEXT:    v_mul_lo_u32 v4, v4, v3
@@ -1381,8 +1365,6 @@ define i24 @v_fshl_i24(i24 %lhs, i24 %rhs, i24 %amt) {
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v4
 ; GFX8-NEXT:    v_mul_hi_u32 v3, v2, v3
 ; GFX8-NEXT:    v_mov_b32_e32 v4, 0xffffff
-; GFX8-NEXT:    v_and_b32_e32 v1, v1, v4
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
 ; GFX8-NEXT:    v_mul_lo_u32 v3, v3, 24
 ; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, v2, v3
 ; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, 24, v2
@@ -1406,6 +1388,7 @@ define i24 @v_fshl_i24(i24 %lhs, i24 %rhs, i24 %amt) {
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v3
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0xffffffe8
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
+; GFX9-NEXT:    v_bfe_u32 v1, v1, 1, 23
 ; GFX9-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; GFX9-NEXT:    v_mul_lo_u32 v4, v4, v3
@@ -1413,8 +1396,6 @@ define i24 @v_fshl_i24(i24 %lhs, i24 %rhs, i24 %amt) {
 ; GFX9-NEXT:    v_add_u32_e32 v3, v3, v4
 ; GFX9-NEXT:    v_mul_hi_u32 v3, v2, v3
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0xffffff
-; GFX9-NEXT:    v_and_b32_e32 v1, v1, v4
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
 ; GFX9-NEXT:    v_mul_lo_u32 v3, v3, 24
 ; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v3
 ; GFX9-NEXT:    v_subrev_u32_e32 v3, 24, v2
@@ -1436,16 +1417,14 @@ define i24 @v_fshl_i24(i24 %lhs, i24 %rhs, i24 %amt) {
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v3, 24
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
+; GFX10-NEXT:    v_bfe_u32 v1, v1, 1, 23
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v3, v3
 ; GFX10-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; GFX10-NEXT:    v_mul_lo_u32 v4, 0xffffffe8, v3
 ; GFX10-NEXT:    v_mul_hi_u32 v4, v3, v4
 ; GFX10-NEXT:    v_add_nc_u32_e32 v3, v3, v4
-; GFX10-NEXT:    v_mov_b32_e32 v4, 0xffffff
 ; GFX10-NEXT:    v_mul_hi_u32 v3, v2, v3
-; GFX10-NEXT:    v_and_b32_e32 v1, v1, v4
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
 ; GFX10-NEXT:    v_mul_lo_u32 v3, v3, 24
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v2, v2, v3
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, 24, v2
@@ -1454,10 +1433,11 @@ define i24 @v_fshl_i24(i24 %lhs, i24 %rhs, i24 %amt) {
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, 24, v2
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
-; GFX10-NEXT:    v_sub_nc_u32_e32 v3, 23, v2
-; GFX10-NEXT:    v_and_b32_e32 v2, v2, v4
-; GFX10-NEXT:    v_and_b32_e32 v3, v3, v4
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, v3, v1
+; GFX10-NEXT:    v_mov_b32_e32 v3, 0xffffff
+; GFX10-NEXT:    v_sub_nc_u32_e32 v4, 23, v2
+; GFX10-NEXT:    v_and_b32_e32 v2, v2, v3
+; GFX10-NEXT:    v_and_b32_e32 v4, v4, v3
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, v4, v1
 ; GFX10-NEXT:    v_lshl_or_b32 v0, v0, v2, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %result = call i24 @llvm.fshl.i24(i24 %lhs, i24 %rhs, i24 %amt)
@@ -1998,41 +1978,40 @@ define <2 x i24> @v_fshl_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v6, 24
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v6, v6
 ; GFX6-NEXT:    v_mov_b32_e32 v7, 0xffffffe8
-; GFX6-NEXT:    v_and_b32_e32 v4, 0xffffff, v4
 ; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v9, 24
+; GFX6-NEXT:    v_and_b32_e32 v4, 0xffffff, v4
 ; GFX6-NEXT:    v_mul_f32_e32 v6, 0x4f7ffffe, v6
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v6, v6
+; GFX6-NEXT:    v_bfe_u32 v2, v2, 1, 23
 ; GFX6-NEXT:    v_mul_lo_u32 v8, v7, v6
 ; GFX6-NEXT:    v_mul_hi_u32 v8, v6, v8
 ; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
-; GFX6-NEXT:    v_mul_hi_u32 v6, v4, v6
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v8, v9
+; GFX6-NEXT:    v_mul_hi_u32 v6, v4, v6
 ; GFX6-NEXT:    v_mov_b32_e32 v9, 0xffffff
-; GFX6-NEXT:    v_and_b32_e32 v2, v2, v9
-; GFX6-NEXT:    v_mul_lo_u32 v6, v6, 24
+; GFX6-NEXT:    v_and_b32_e32 v5, v5, v9
 ; GFX6-NEXT:    v_mul_f32_e32 v8, 0x4f7ffffe, v8
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v8, v8
-; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 1, v2
+; GFX6-NEXT:    v_mul_lo_u32 v6, v6, 24
+; GFX6-NEXT:    v_mul_lo_u32 v7, v7, v8
 ; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, v4, v6
 ; GFX6-NEXT:    v_subrev_i32_e32 v6, vcc, 24, v4
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v4
+; GFX6-NEXT:    v_mul_hi_u32 v7, v8, v7
 ; GFX6-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
 ; GFX6-NEXT:    v_subrev_i32_e32 v6, vcc, 24, v4
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v4
 ; GFX6-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v6, v7, v8
-; GFX6-NEXT:    v_sub_i32_e32 v7, vcc, 23, v4
+; GFX6-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
+; GFX6-NEXT:    v_mul_hi_u32 v7, v5, v7
+; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, 23, v4
 ; GFX6-NEXT:    v_and_b32_e32 v4, v4, v9
-; GFX6-NEXT:    v_mul_hi_u32 v6, v8, v6
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v4, v0
-; GFX6-NEXT:    v_and_b32_e32 v4, v5, v9
-; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v8, v6
-; GFX6-NEXT:    v_mul_hi_u32 v5, v4, v5
-; GFX6-NEXT:    v_and_b32_e32 v6, v7, v9
-; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v6, v2
+; GFX6-NEXT:    v_and_b32_e32 v4, v6, v9
+; GFX6-NEXT:    v_mul_lo_u32 v6, v7, 24
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v4, v2
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX6-NEXT:    v_mul_lo_u32 v5, v5, 24
-; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v4, v5
+; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v5, v6
 ; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, 24, v2
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v2
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
@@ -2042,8 +2021,7 @@ define <2 x i24> @v_fshl_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, 23, v2
 ; GFX6-NEXT:    v_and_b32_e32 v2, v2, v9
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, v2, v1
-; GFX6-NEXT:    v_and_b32_e32 v2, v3, v9
-; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 1, v2
+; GFX6-NEXT:    v_bfe_u32 v2, v3, 1, 23
 ; GFX6-NEXT:    v_and_b32_e32 v3, v4, v9
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v3, v2
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
@@ -2055,41 +2033,40 @@ define <2 x i24> @v_fshl_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX8-NEXT:    v_cvt_f32_ubyte0_e32 v6, 24
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v6, v6
 ; GFX8-NEXT:    v_mov_b32_e32 v7, 0xffffffe8
-; GFX8-NEXT:    v_and_b32_e32 v4, 0xffffff, v4
 ; GFX8-NEXT:    v_cvt_f32_ubyte0_e32 v9, 24
+; GFX8-NEXT:    v_and_b32_e32 v4, 0xffffff, v4
 ; GFX8-NEXT:    v_mul_f32_e32 v6, 0x4f7ffffe, v6
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v6, v6
+; GFX8-NEXT:    v_bfe_u32 v2, v2, 1, 23
 ; GFX8-NEXT:    v_mul_lo_u32 v8, v7, v6
 ; GFX8-NEXT:    v_mul_hi_u32 v8, v6, v8
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v8
-; GFX8-NEXT:    v_mul_hi_u32 v6, v4, v6
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v8, v9
+; GFX8-NEXT:    v_mul_hi_u32 v6, v4, v6
 ; GFX8-NEXT:    v_mov_b32_e32 v9, 0xffffff
-; GFX8-NEXT:    v_and_b32_e32 v2, v2, v9
-; GFX8-NEXT:    v_mul_lo_u32 v6, v6, 24
+; GFX8-NEXT:    v_and_b32_e32 v5, v5, v9
 ; GFX8-NEXT:    v_mul_f32_e32 v8, 0x4f7ffffe, v8
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v8, v8
-; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 1, v2
+; GFX8-NEXT:    v_mul_lo_u32 v6, v6, 24
+; GFX8-NEXT:    v_mul_lo_u32 v7, v7, v8
 ; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, v4, v6
 ; GFX8-NEXT:    v_subrev_u32_e32 v6, vcc, 24, v4
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v4
+; GFX8-NEXT:    v_mul_hi_u32 v7, v8, v7
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
 ; GFX8-NEXT:    v_subrev_u32_e32 v6, vcc, 24, v4
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v4
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
-; GFX8-NEXT:    v_mul_lo_u32 v6, v7, v8
-; GFX8-NEXT:    v_sub_u32_e32 v7, vcc, 23, v4
+; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v8, v7
+; GFX8-NEXT:    v_mul_hi_u32 v7, v5, v7
+; GFX8-NEXT:    v_sub_u32_e32 v6, vcc, 23, v4
 ; GFX8-NEXT:    v_and_b32_e32 v4, v4, v9
-; GFX8-NEXT:    v_mul_hi_u32 v6, v8, v6
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, v4, v0
-; GFX8-NEXT:    v_and_b32_e32 v4, v5, v9
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v8, v6
-; GFX8-NEXT:    v_mul_hi_u32 v5, v4, v5
-; GFX8-NEXT:    v_and_b32_e32 v6, v7, v9
-; GFX8-NEXT:    v_lshrrev_b32_e32 v2, v6, v2
+; GFX8-NEXT:    v_and_b32_e32 v4, v6, v9
+; GFX8-NEXT:    v_mul_lo_u32 v6, v7, 24
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, v4, v2
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX8-NEXT:    v_mul_lo_u32 v5, v5, 24
-; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, v4, v5
+; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, v5, v6
 ; GFX8-NEXT:    v_subrev_u32_e32 v4, vcc, 24, v2
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
@@ -2099,8 +2076,7 @@ define <2 x i24> @v_fshl_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, 23, v2
 ; GFX8-NEXT:    v_and_b32_e32 v2, v2, v9
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, v2, v1
-; GFX8-NEXT:    v_and_b32_e32 v2, v3, v9
-; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 1, v2
+; GFX8-NEXT:    v_bfe_u32 v2, v3, 1, 23
 ; GFX8-NEXT:    v_and_b32_e32 v3, v4, v9
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, v3, v2
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
@@ -2118,7 +2094,9 @@ define <2 x i24> @v_fshl_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v6, v6
 ; GFX9-NEXT:    v_and_b32_e32 v4, 0xffffff, v4
 ; GFX9-NEXT:    v_mul_f32_e32 v9, 0x4f7ffffe, v9
+; GFX9-NEXT:    v_bfe_u32 v2, v2, 1, 23
 ; GFX9-NEXT:    v_mul_lo_u32 v8, v7, v6
+; GFX9-NEXT:    v_bfe_u32 v3, v3, 1, 23
 ; GFX9-NEXT:    v_mul_hi_u32 v8, v6, v8
 ; GFX9-NEXT:    v_add_u32_e32 v6, v6, v8
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v8, v9
@@ -2127,25 +2105,23 @@ define <2 x i24> @v_fshl_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX9-NEXT:    v_and_b32_e32 v5, v5, v9
 ; GFX9-NEXT:    v_mul_lo_u32 v7, v7, v8
 ; GFX9-NEXT:    v_mul_lo_u32 v6, v6, 24
-; GFX9-NEXT:    v_and_b32_e32 v2, v2, v9
-; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 1, v2
 ; GFX9-NEXT:    v_mul_hi_u32 v7, v8, v7
 ; GFX9-NEXT:    v_sub_u32_e32 v4, v4, v6
 ; GFX9-NEXT:    v_subrev_u32_e32 v6, 24, v4
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v4
-; GFX9-NEXT:    v_add_u32_e32 v7, v8, v7
-; GFX9-NEXT:    v_mul_hi_u32 v7, v5, v7
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
 ; GFX9-NEXT:    v_subrev_u32_e32 v6, 24, v4
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v7, v7, 24
-; GFX9-NEXT:    v_sub_u32_e32 v6, 23, v4
-; GFX9-NEXT:    v_and_b32_e32 v6, v6, v9
+; GFX9-NEXT:    v_add_u32_e32 v6, v8, v7
+; GFX9-NEXT:    v_mul_hi_u32 v6, v5, v6
+; GFX9-NEXT:    v_sub_u32_e32 v7, 23, v4
+; GFX9-NEXT:    v_and_b32_e32 v7, v7, v9
 ; GFX9-NEXT:    v_and_b32_e32 v4, v4, v9
-; GFX9-NEXT:    v_lshrrev_b32_e32 v2, v6, v2
+; GFX9-NEXT:    v_mul_lo_u32 v6, v6, 24
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, v7, v2
 ; GFX9-NEXT:    v_lshl_or_b32 v0, v0, v4, v2
-; GFX9-NEXT:    v_sub_u32_e32 v2, v5, v7
+; GFX9-NEXT:    v_sub_u32_e32 v2, v5, v6
 ; GFX9-NEXT:    v_subrev_u32_e32 v4, 24, v2
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
@@ -2153,8 +2129,6 @@ define <2 x i24> @v_fshl_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX9-NEXT:    v_sub_u32_e32 v4, 23, v2
-; GFX9-NEXT:    v_and_b32_e32 v3, v3, v9
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 1, v3
 ; GFX9-NEXT:    v_and_b32_e32 v4, v4, v9
 ; GFX9-NEXT:    v_and_b32_e32 v2, v2, v9
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v3, v4, v3
@@ -2169,13 +2143,11 @@ define <2 x i24> @v_fshl_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v7, 24
 ; GFX10-NEXT:    v_mov_b32_e32 v10, 0xffffff
 ; GFX10-NEXT:    v_and_b32_e32 v4, 0xffffff, v4
+; GFX10-NEXT:    v_bfe_u32 v2, v2, 1, 23
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v6, v6
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v7, v7
 ; GFX10-NEXT:    v_and_b32_e32 v5, v5, v10
-; GFX10-NEXT:    v_and_b32_e32 v2, v2, v10
-; GFX10-NEXT:    v_and_b32_e32 v3, v3, v10
-; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 1, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 1, v3
+; GFX10-NEXT:    v_bfe_u32 v3, v3, 1, 23
 ; GFX10-NEXT:    v_mul_f32_e32 v6, 0x4f7ffffe, v6
 ; GFX10-NEXT:    v_mul_f32_e32 v7, 0x4f7ffffe, v7
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v6, v6
@@ -2773,9 +2745,8 @@ define amdgpu_ps i16 @s_fshl_i16(i16 inreg %lhs, i16 inreg %rhs, i16 inreg %amt)
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_and_b32 s3, s2, 15
 ; GFX6-NEXT:    s_andn2_b32 s2, 15, s2
-; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GFX6-NEXT:    s_bfe_u32 s3, s3, 0x100000
-; GFX6-NEXT:    s_lshr_b32 s1, s1, 1
+; GFX6-NEXT:    s_bfe_u32 s1, s1, 0xf0001
 ; GFX6-NEXT:    s_bfe_u32 s2, s2, 0x100000
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, s3
 ; GFX6-NEXT:    s_lshr_b32 s1, s1, s2
@@ -2830,9 +2801,8 @@ define amdgpu_ps i16 @s_fshl_i16(i16 inreg %lhs, i16 inreg %rhs, i16 inreg %amt)
 define amdgpu_ps i16 @s_fshl_i16_4(i16 inreg %lhs, i16 inreg %rhs) {
 ; GFX6-LABEL: s_fshl_i16_4:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 4
-; GFX6-NEXT:    s_lshr_b32 s1, s1, 12
+; GFX6-NEXT:    s_bfe_u32 s1, s1, 0x4000c
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
@@ -2872,9 +2842,8 @@ define amdgpu_ps i16 @s_fshl_i16_4(i16 inreg %lhs, i16 inreg %rhs) {
 define amdgpu_ps i16 @s_fshl_i16_5(i16 inreg %lhs, i16 inreg %rhs) {
 ; GFX6-LABEL: s_fshl_i16_5:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 5
-; GFX6-NEXT:    s_lshr_b32 s1, s1, 11
+; GFX6-NEXT:    s_bfe_u32 s1, s1, 0x5000b
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
@@ -2918,9 +2887,8 @@ define i16 @v_fshl_i16(i16 %lhs, i16 %rhs, i16 %amt) {
 ; GFX6-NEXT:    v_and_b32_e32 v3, 15, v2
 ; GFX6-NEXT:    v_xor_b32_e32 v2, -1, v2
 ; GFX6-NEXT:    v_and_b32_e32 v2, 15, v2
-; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    v_bfe_u32 v3, v3, 0, 16
-; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
+; GFX6-NEXT:    v_bfe_u32 v1, v1, 1, 15
 ; GFX6-NEXT:    v_bfe_u32 v2, v2, 0, 16
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v3, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, v2, v1
@@ -2971,9 +2939,8 @@ define i16 @v_fshl_i16_4(i16 %lhs, i16 %rhs) {
 ; GFX6-LABEL: v_fshl_i16_4:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
-; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 12, v1
+; GFX6-NEXT:    v_bfe_u32 v1, v1, 12, 4
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3009,9 +2976,8 @@ define i16 @v_fshl_i16_5(i16 %lhs, i16 %rhs) {
 ; GFX6-LABEL: v_fshl_i16_5:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
-; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 11, v1
+; GFX6-NEXT:    v_bfe_u32 v1, v1, 11, 5
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3048,11 +3014,10 @@ define amdgpu_ps half @v_fshl_i16_ssv(i16 inreg %lhs, i16 inreg %rhs, i16 %amt)
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    v_and_b32_e32 v1, 15, v0
 ; GFX6-NEXT:    v_xor_b32_e32 v0, -1, v0
-; GFX6-NEXT:    v_bfe_u32 v1, v1, 0, 16
 ; GFX6-NEXT:    v_and_b32_e32 v0, 15, v0
+; GFX6-NEXT:    v_bfe_u32 v1, v1, 0, 16
 ; GFX6-NEXT:    v_lshl_b32_e32 v1, s0, v1
-; GFX6-NEXT:    s_and_b32 s0, s1, 0xffff
-; GFX6-NEXT:    s_lshr_b32 s0, s0, 1
+; GFX6-NEXT:    s_bfe_u32 s0, s1, 0xf0001
 ; GFX6-NEXT:    v_bfe_u32 v0, v0, 0, 16
 ; GFX6-NEXT:    v_lshr_b32_e32 v0, s0, v0
 ; GFX6-NEXT:    v_or_b32_e32 v0, v1, v0
@@ -3106,9 +3071,8 @@ define amdgpu_ps half @v_fshl_i16_svs(i16 inreg %lhs, i16 %rhs, i16 inreg %amt)
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_and_b32 s2, s1, 15
 ; GFX6-NEXT:    s_andn2_b32 s1, 15, s1
-; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    s_bfe_u32 s2, s2, 0x100000
-; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 1, v0
+; GFX6-NEXT:    v_bfe_u32 v0, v0, 1, 15
 ; GFX6-NEXT:    s_bfe_u32 s1, s1, 0x100000
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, s2
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, s1, v0
@@ -3157,9 +3121,8 @@ define amdgpu_ps half @v_fshl_i16_vss(i16 %lhs, i16 inreg %rhs, i16 inreg %amt)
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_and_b32 s2, s1, 15
 ; GFX6-NEXT:    s_andn2_b32 s1, 15, s1
-; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX6-NEXT:    s_bfe_u32 s2, s2, 0x100000
-; GFX6-NEXT:    s_lshr_b32 s0, s0, 1
+; GFX6-NEXT:    s_bfe_u32 s0, s0, 0xf0001
 ; GFX6-NEXT:    s_bfe_u32 s1, s1, 0x100000
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, s2, v0
 ; GFX6-NEXT:    s_lshr_b32 s0, s0, s1
@@ -3214,20 +3177,18 @@ define amdgpu_ps i32 @s_fshl_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs, <
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_and_b32 s6, s4, 15
 ; GFX6-NEXT:    s_bfe_u32 s6, s6, 0x100000
-; GFX6-NEXT:    s_lshl_b32 s0, s0, s6
-; GFX6-NEXT:    s_mov_b32 s6, 0xffff
 ; GFX6-NEXT:    s_andn2_b32 s4, 15, s4
-; GFX6-NEXT:    s_and_b32 s2, s2, s6
-; GFX6-NEXT:    s_lshr_b32 s2, s2, 1
+; GFX6-NEXT:    s_lshl_b32 s0, s0, s6
+; GFX6-NEXT:    s_mov_b32 s6, 0xf0001
+; GFX6-NEXT:    s_bfe_u32 s2, s2, s6
 ; GFX6-NEXT:    s_bfe_u32 s4, s4, 0x100000
 ; GFX6-NEXT:    s_lshr_b32 s2, s2, s4
 ; GFX6-NEXT:    s_or_b32 s0, s0, s2
 ; GFX6-NEXT:    s_and_b32 s2, s5, 15
-; GFX6-NEXT:    s_bfe_u32 s2, s2, 0x100000
 ; GFX6-NEXT:    s_andn2_b32 s4, 15, s5
+; GFX6-NEXT:    s_bfe_u32 s2, s2, 0x100000
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, s2
-; GFX6-NEXT:    s_and_b32 s2, s3, s6
-; GFX6-NEXT:    s_lshr_b32 s2, s2, 1
+; GFX6-NEXT:    s_bfe_u32 s2, s3, s6
 ; GFX6-NEXT:    s_bfe_u32 s3, s4, 0x100000
 ; GFX6-NEXT:    s_lshr_b32 s2, s2, s3
 ; GFX6-NEXT:    s_or_b32 s1, s1, s2
@@ -3328,22 +3289,19 @@ define <2 x i16> @v_fshl_v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt) {
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_and_b32_e32 v6, 15, v4
 ; GFX6-NEXT:    v_xor_b32_e32 v4, -1, v4
-; GFX6-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX6-NEXT:    v_and_b32_e32 v4, 15, v4
-; GFX6-NEXT:    v_and_b32_e32 v2, s4, v2
 ; GFX6-NEXT:    v_bfe_u32 v6, v6, 0, 16
-; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 1, v2
+; GFX6-NEXT:    v_bfe_u32 v2, v2, 1, 15
 ; GFX6-NEXT:    v_bfe_u32 v4, v4, 0, 16
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v6, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v4, v2
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX6-NEXT:    v_and_b32_e32 v2, 15, v5
 ; GFX6-NEXT:    v_xor_b32_e32 v4, -1, v5
-; GFX6-NEXT:    v_bfe_u32 v2, v2, 0, 16
 ; GFX6-NEXT:    v_and_b32_e32 v4, 15, v4
+; GFX6-NEXT:    v_bfe_u32 v2, v2, 0, 16
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, v2, v1
-; GFX6-NEXT:    v_and_b32_e32 v2, s4, v3
-; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 1, v2
+; GFX6-NEXT:    v_bfe_u32 v2, v3, 1, 15
 ; GFX6-NEXT:    v_bfe_u32 v3, v4, 0, 16
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v3, v2
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
@@ -3409,14 +3367,15 @@ define <2 x i16> @v_fshl_v2i16_4_8(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_bfe_u32 s4, 4, 0x100000
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, s4, v0
-; GFX6-NEXT:    s_mov_b32 s4, 0xffff
-; GFX6-NEXT:    v_and_b32_e32 v2, s4, v2
-; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 12, v2
+; GFX6-NEXT:    v_bfe_u32 v2, v2, 1, 15
+; GFX6-NEXT:    s_bfe_u32 s4, 11, 0x100000
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, s4, v2
+; GFX6-NEXT:    s_bfe_u32 s4, 8, 0x100000
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX6-NEXT:    s_bfe_u32 s5, 8, 0x100000
-; GFX6-NEXT:    v_and_b32_e32 v2, s4, v3
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, s5, v1
-; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 8, v2
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, s4, v1
+; GFX6-NEXT:    v_bfe_u32 v2, v3, 1, 15
+; GFX6-NEXT:    s_bfe_u32 s4, 7, 0x100000
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, s4, v2
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3462,22 +3421,20 @@ define amdgpu_ps float @v_fshl_v2i16_ssv(<2 x i16> inreg %lhs, <2 x i16> inreg %
 ; GFX6-LABEL: v_fshl_v2i16_ssv:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    v_and_b32_e32 v2, 15, v0
-; GFX6-NEXT:    v_bfe_u32 v2, v2, 0, 16
 ; GFX6-NEXT:    v_xor_b32_e32 v0, -1, v0
-; GFX6-NEXT:    v_lshl_b32_e32 v2, s0, v2
-; GFX6-NEXT:    s_mov_b32 s0, 0xffff
+; GFX6-NEXT:    v_bfe_u32 v2, v2, 0, 16
 ; GFX6-NEXT:    v_and_b32_e32 v0, 15, v0
-; GFX6-NEXT:    s_and_b32 s2, s2, s0
-; GFX6-NEXT:    s_lshr_b32 s2, s2, 1
+; GFX6-NEXT:    v_lshl_b32_e32 v2, s0, v2
+; GFX6-NEXT:    s_mov_b32 s0, 0xf0001
+; GFX6-NEXT:    s_bfe_u32 s2, s2, s0
 ; GFX6-NEXT:    v_bfe_u32 v0, v0, 0, 16
 ; GFX6-NEXT:    v_lshr_b32_e32 v0, s2, v0
 ; GFX6-NEXT:    v_or_b32_e32 v0, v2, v0
 ; GFX6-NEXT:    v_and_b32_e32 v2, 15, v1
 ; GFX6-NEXT:    v_xor_b32_e32 v1, -1, v1
 ; GFX6-NEXT:    v_and_b32_e32 v1, 15, v1
-; GFX6-NEXT:    s_and_b32 s0, s3, s0
 ; GFX6-NEXT:    v_bfe_u32 v2, v2, 0, 16
-; GFX6-NEXT:    s_lshr_b32 s0, s0, 1
+; GFX6-NEXT:    s_bfe_u32 s0, s3, s0
 ; GFX6-NEXT:    v_bfe_u32 v1, v1, 0, 16
 ; GFX6-NEXT:    v_lshl_b32_e32 v2, s1, v2
 ; GFX6-NEXT:    v_lshr_b32_e32 v1, s0, v1
@@ -3554,21 +3511,18 @@ define amdgpu_ps float @v_fshl_v2i16_svs(<2 x i16> inreg %lhs, <2 x i16> %rhs, <
 ; GFX6-LABEL: v_fshl_v2i16_svs:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_and_b32 s4, s2, 15
-; GFX6-NEXT:    s_bfe_u32 s4, s4, 0x100000
-; GFX6-NEXT:    s_lshl_b32 s0, s0, s4
-; GFX6-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX6-NEXT:    s_andn2_b32 s2, 15, s2
-; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
-; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 1, v0
+; GFX6-NEXT:    s_bfe_u32 s4, s4, 0x100000
+; GFX6-NEXT:    v_bfe_u32 v0, v0, 1, 15
 ; GFX6-NEXT:    s_bfe_u32 s2, s2, 0x100000
+; GFX6-NEXT:    s_lshl_b32 s0, s0, s4
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, s2, v0
 ; GFX6-NEXT:    v_or_b32_e32 v0, s0, v0
 ; GFX6-NEXT:    s_and_b32 s0, s3, 15
 ; GFX6-NEXT:    s_andn2_b32 s2, 15, s3
 ; GFX6-NEXT:    s_bfe_u32 s0, s0, 0x100000
-; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
 ; GFX6-NEXT:    s_lshl_b32 s0, s1, s0
-; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
+; GFX6-NEXT:    v_bfe_u32 v1, v1, 1, 15
 ; GFX6-NEXT:    s_bfe_u32 s1, s2, 0x100000
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, s1, v1
 ; GFX6-NEXT:    v_or_b32_e32 v1, s0, v1
@@ -3641,20 +3595,18 @@ define amdgpu_ps float @v_fshl_v2i16_vss(<2 x i16> %lhs, <2 x i16> inreg %rhs, <
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_and_b32 s4, s2, 15
 ; GFX6-NEXT:    s_bfe_u32 s4, s4, 0x100000
-; GFX6-NEXT:    v_lshlrev_b32_e32 v0, s4, v0
-; GFX6-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX6-NEXT:    s_andn2_b32 s2, 15, s2
-; GFX6-NEXT:    s_and_b32 s0, s0, s4
-; GFX6-NEXT:    s_lshr_b32 s0, s0, 1
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, s4, v0
+; GFX6-NEXT:    s_mov_b32 s4, 0xf0001
+; GFX6-NEXT:    s_bfe_u32 s0, s0, s4
 ; GFX6-NEXT:    s_bfe_u32 s2, s2, 0x100000
 ; GFX6-NEXT:    s_lshr_b32 s0, s0, s2
 ; GFX6-NEXT:    v_or_b32_e32 v0, s0, v0
 ; GFX6-NEXT:    s_and_b32 s0, s3, 15
-; GFX6-NEXT:    s_bfe_u32 s0, s0, 0x100000
 ; GFX6-NEXT:    s_andn2_b32 s2, 15, s3
+; GFX6-NEXT:    s_bfe_u32 s0, s0, 0x100000
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, s0, v1
-; GFX6-NEXT:    s_and_b32 s0, s1, s4
-; GFX6-NEXT:    s_lshr_b32 s0, s0, 1
+; GFX6-NEXT:    s_bfe_u32 s0, s1, s4
 ; GFX6-NEXT:    s_bfe_u32 s1, s2, 0x100000
 ; GFX6-NEXT:    s_lshr_b32 s0, s0, s1
 ; GFX6-NEXT:    v_or_b32_e32 v1, s0, v1
@@ -3757,38 +3709,34 @@ define amdgpu_ps <2 x i32> @s_fshl_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_and_b32 s12, s8, 15
 ; GFX6-NEXT:    s_bfe_u32 s12, s12, 0x100000
-; GFX6-NEXT:    s_lshl_b32 s0, s0, s12
-; GFX6-NEXT:    s_mov_b32 s12, 0xffff
 ; GFX6-NEXT:    s_andn2_b32 s8, 15, s8
-; GFX6-NEXT:    s_and_b32 s4, s4, s12
-; GFX6-NEXT:    s_lshr_b32 s4, s4, 1
+; GFX6-NEXT:    s_lshl_b32 s0, s0, s12
+; GFX6-NEXT:    s_mov_b32 s12, 0xf0001
+; GFX6-NEXT:    s_bfe_u32 s4, s4, s12
 ; GFX6-NEXT:    s_bfe_u32 s8, s8, 0x100000
 ; GFX6-NEXT:    s_lshr_b32 s4, s4, s8
 ; GFX6-NEXT:    s_or_b32 s0, s0, s4
 ; GFX6-NEXT:    s_and_b32 s4, s9, 15
-; GFX6-NEXT:    s_bfe_u32 s4, s4, 0x100000
 ; GFX6-NEXT:    s_andn2_b32 s8, 15, s9
+; GFX6-NEXT:    s_bfe_u32 s4, s4, 0x100000
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, s4
-; GFX6-NEXT:    s_and_b32 s4, s5, s12
-; GFX6-NEXT:    s_lshr_b32 s4, s4, 1
+; GFX6-NEXT:    s_bfe_u32 s4, s5, s12
 ; GFX6-NEXT:    s_bfe_u32 s5, s8, 0x100000
 ; GFX6-NEXT:    s_lshr_b32 s4, s4, s5
 ; GFX6-NEXT:    s_or_b32 s1, s1, s4
 ; GFX6-NEXT:    s_and_b32 s4, s10, 15
-; GFX6-NEXT:    s_bfe_u32 s4, s4, 0x100000
 ; GFX6-NEXT:    s_andn2_b32 s5, 15, s10
+; GFX6-NEXT:    s_bfe_u32 s4, s4, 0x100000
 ; GFX6-NEXT:    s_lshl_b32 s2, s2, s4
-; GFX6-NEXT:    s_and_b32 s4, s6, s12
-; GFX6-NEXT:    s_lshr_b32 s4, s4, 1
+; GFX6-NEXT:    s_bfe_u32 s4, s6, s12
 ; GFX6-NEXT:    s_bfe_u32 s5, s5, 0x100000
 ; GFX6-NEXT:    s_lshr_b32 s4, s4, s5
 ; GFX6-NEXT:    s_or_b32 s2, s2, s4
 ; GFX6-NEXT:    s_and_b32 s4, s11, 15
-; GFX6-NEXT:    s_bfe_u32 s4, s4, 0x100000
 ; GFX6-NEXT:    s_andn2_b32 s5, 15, s11
+; GFX6-NEXT:    s_bfe_u32 s4, s4, 0x100000
 ; GFX6-NEXT:    s_lshl_b32 s3, s3, s4
-; GFX6-NEXT:    s_and_b32 s4, s7, s12
-; GFX6-NEXT:    s_lshr_b32 s4, s4, 1
+; GFX6-NEXT:    s_bfe_u32 s4, s7, s12
 ; GFX6-NEXT:    s_bfe_u32 s5, s5, 0x100000
 ; GFX6-NEXT:    s_lshr_b32 s4, s4, s5
 ; GFX6-NEXT:    s_bfe_u32 s1, s1, 0x100000
@@ -3957,43 +3905,37 @@ define <4 x half> @v_fshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt)
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_and_b32_e32 v12, 15, v8
 ; GFX6-NEXT:    v_xor_b32_e32 v8, -1, v8
-; GFX6-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX6-NEXT:    v_and_b32_e32 v8, 15, v8
-; GFX6-NEXT:    v_and_b32_e32 v4, s4, v4
 ; GFX6-NEXT:    v_bfe_u32 v12, v12, 0, 16
-; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 1, v4
+; GFX6-NEXT:    v_bfe_u32 v4, v4, 1, 15
 ; GFX6-NEXT:    v_bfe_u32 v8, v8, 0, 16
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v12, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v8, v4
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v4
 ; GFX6-NEXT:    v_and_b32_e32 v4, 15, v9
 ; GFX6-NEXT:    v_xor_b32_e32 v8, -1, v9
-; GFX6-NEXT:    v_bfe_u32 v4, v4, 0, 16
 ; GFX6-NEXT:    v_and_b32_e32 v8, 15, v8
+; GFX6-NEXT:    v_bfe_u32 v4, v4, 0, 16
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, v4, v1
-; GFX6-NEXT:    v_and_b32_e32 v4, s4, v5
-; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 1, v4
+; GFX6-NEXT:    v_bfe_u32 v4, v5, 1, 15
 ; GFX6-NEXT:    v_bfe_u32 v5, v8, 0, 16
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v5, v4
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v4
 ; GFX6-NEXT:    v_and_b32_e32 v4, 15, v10
-; GFX6-NEXT:    v_mov_b32_e32 v12, 0xffff
 ; GFX6-NEXT:    v_xor_b32_e32 v5, -1, v10
-; GFX6-NEXT:    v_bfe_u32 v4, v4, 0, 16
 ; GFX6-NEXT:    v_and_b32_e32 v5, 15, v5
+; GFX6-NEXT:    v_bfe_u32 v4, v4, 0, 16
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
-; GFX6-NEXT:    v_and_b32_e32 v4, v6, v12
-; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 1, v4
+; GFX6-NEXT:    v_bfe_u32 v4, v6, 1, 15
 ; GFX6-NEXT:    v_bfe_u32 v5, v5, 0, 16
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v5, v4
 ; GFX6-NEXT:    v_or_b32_e32 v2, v2, v4
 ; GFX6-NEXT:    v_and_b32_e32 v4, 15, v11
 ; GFX6-NEXT:    v_xor_b32_e32 v5, -1, v11
-; GFX6-NEXT:    v_bfe_u32 v4, v4, 0, 16
 ; GFX6-NEXT:    v_and_b32_e32 v5, 15, v5
+; GFX6-NEXT:    v_bfe_u32 v4, v4, 0, 16
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, v4, v3
-; GFX6-NEXT:    v_and_b32_e32 v4, v7, v12
-; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 1, v4
+; GFX6-NEXT:    v_bfe_u32 v4, v7, 1, 15
 ; GFX6-NEXT:    v_bfe_u32 v5, v5, 0, 16
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v5, v4
 ; GFX6-NEXT:    v_or_b32_e32 v3, v3, v4

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
index f76ccdf2f2d2e..766d9cdc60e60 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
@@ -370,9 +370,8 @@ define i8 @v_fshr_i8(i8 %lhs, i8 %rhs, i8 %amt) {
 define amdgpu_ps i8 @s_fshr_i8_4(i8 inreg %lhs, i8 inreg %rhs) {
 ; GFX6-LABEL: s_fshr_i8_4:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_and_b32 s1, s1, 0xff
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 4
-; GFX6-NEXT:    s_lshr_b32 s1, s1, 4
+; GFX6-NEXT:    s_bfe_u32 s1, s1, 0x40004
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
@@ -410,9 +409,8 @@ define i8 @v_fshr_i8_4(i8 %lhs, i8 %rhs) {
 ; GFX6-LABEL: v_fshr_i8_4:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
-; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 4, v1
+; GFX6-NEXT:    v_bfe_u32 v1, v1, 4, 4
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -450,9 +448,8 @@ define i8 @v_fshr_i8_4(i8 %lhs, i8 %rhs) {
 define amdgpu_ps i8 @s_fshr_i8_5(i8 inreg %lhs, i8 inreg %rhs) {
 ; GFX6-LABEL: s_fshr_i8_5:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_and_b32 s1, s1, 0xff
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 3
-; GFX6-NEXT:    s_lshr_b32 s1, s1, 5
+; GFX6-NEXT:    s_bfe_u32 s1, s1, 0x30005
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
@@ -490,9 +487,8 @@ define i8 @v_fshr_i8_5(i8 %lhs, i8 %rhs) {
 ; GFX6-LABEL: v_fshr_i8_5:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
-; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 5, v1
+; GFX6-NEXT:    v_bfe_u32 v1, v1, 5, 3
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2657,9 +2653,8 @@ define amdgpu_ps i16 @s_fshr_i16(i16 inreg %lhs, i16 inreg %rhs, i16 inreg %amt)
 define amdgpu_ps i16 @s_fshr_i16_4(i16 inreg %lhs, i16 inreg %rhs) {
 ; GFX6-LABEL: s_fshr_i16_4:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 12
-; GFX6-NEXT:    s_lshr_b32 s1, s1, 4
+; GFX6-NEXT:    s_bfe_u32 s1, s1, 0xc0004
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
@@ -2699,9 +2694,8 @@ define amdgpu_ps i16 @s_fshr_i16_4(i16 inreg %lhs, i16 inreg %rhs) {
 define amdgpu_ps i16 @s_fshr_i16_5(i16 inreg %lhs, i16 inreg %rhs) {
 ; GFX6-LABEL: s_fshr_i16_5:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 11
-; GFX6-NEXT:    s_lshr_b32 s1, s1, 5
+; GFX6-NEXT:    s_bfe_u32 s1, s1, 0xb0005
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
@@ -2798,9 +2792,8 @@ define i16 @v_fshr_i16_4(i16 %lhs, i16 %rhs) {
 ; GFX6-LABEL: v_fshr_i16_4:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 12, v0
-; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 4, v1
+; GFX6-NEXT:    v_bfe_u32 v1, v1, 4, 12
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2836,9 +2829,8 @@ define i16 @v_fshr_i16_5(i16 %lhs, i16 %rhs) {
 ; GFX6-LABEL: v_fshr_i16_5:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 11, v0
-; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 5, v1
+; GFX6-NEXT:    v_bfe_u32 v1, v1, 5, 11
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3036,38 +3028,37 @@ define amdgpu_ps half @v_fshr_i16_vss(i16 %lhs, i16 inreg %rhs, i16 inreg %amt)
 define amdgpu_ps i32 @s_fshr_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs, <2 x i16> inreg %amt) {
 ; GFX6-LABEL: s_fshr_v2i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_mov_b32 s6, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s5, s5, 16
-; GFX6-NEXT:    s_and_b32 s4, s4, s6
+; GFX6-NEXT:    s_and_b32 s4, s4, 0xffff
 ; GFX6-NEXT:    s_or_b32 s4, s5, s4
 ; GFX6-NEXT:    s_bfe_u32 s5, 1, 0x100000
+; GFX6-NEXT:    s_mov_b32 s6, 0xf0001
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, s5
-; GFX6-NEXT:    s_and_b32 s7, s2, s6
+; GFX6-NEXT:    s_bfe_u32 s7, s2, s6
+; GFX6-NEXT:    s_bfe_u32 s8, 14, 0x100000
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, s5
-; GFX6-NEXT:    s_and_b32 s5, s3, s6
-; GFX6-NEXT:    s_lshr_b32 s7, s7, 15
-; GFX6-NEXT:    s_lshr_b32 s5, s5, 15
-; GFX6-NEXT:    s_lshl_b32 s2, s2, 1
+; GFX6-NEXT:    s_bfe_u32 s5, s3, s6
+; GFX6-NEXT:    s_lshr_b32 s7, s7, s8
+; GFX6-NEXT:    s_lshr_b32 s5, s5, s8
 ; GFX6-NEXT:    s_xor_b32 s4, s4, -1
 ; GFX6-NEXT:    s_or_b32 s0, s0, s7
 ; GFX6-NEXT:    s_or_b32 s1, s1, s5
+; GFX6-NEXT:    s_lshl_b32 s2, s2, 1
 ; GFX6-NEXT:    s_lshr_b32 s5, s4, 16
 ; GFX6-NEXT:    s_and_b32 s7, s4, 15
 ; GFX6-NEXT:    s_andn2_b32 s4, 15, s4
-; GFX6-NEXT:    s_and_b32 s2, s2, s6
 ; GFX6-NEXT:    s_bfe_u32 s7, s7, 0x100000
-; GFX6-NEXT:    s_lshr_b32 s2, s2, 1
+; GFX6-NEXT:    s_bfe_u32 s2, s2, s6
 ; GFX6-NEXT:    s_bfe_u32 s4, s4, 0x100000
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, s7
 ; GFX6-NEXT:    s_lshr_b32 s2, s2, s4
 ; GFX6-NEXT:    s_or_b32 s0, s0, s2
 ; GFX6-NEXT:    s_and_b32 s2, s5, 15
 ; GFX6-NEXT:    s_lshl_b32 s3, s3, 1
-; GFX6-NEXT:    s_bfe_u32 s2, s2, 0x100000
 ; GFX6-NEXT:    s_andn2_b32 s4, 15, s5
+; GFX6-NEXT:    s_bfe_u32 s2, s2, 0x100000
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, s2
-; GFX6-NEXT:    s_and_b32 s2, s3, s6
-; GFX6-NEXT:    s_lshr_b32 s2, s2, 1
+; GFX6-NEXT:    s_bfe_u32 s2, s3, s6
 ; GFX6-NEXT:    s_bfe_u32 s3, s4, 0x100000
 ; GFX6-NEXT:    s_lshr_b32 s2, s2, s3
 ; GFX6-NEXT:    s_or_b32 s1, s1, s2
@@ -3176,41 +3167,38 @@ define <2 x i16> @v_fshr_v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt) {
 ; GFX6-LABEL: v_fshr_v2i16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v6, 0xffff
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX6-NEXT:    v_and_b32_e32 v4, v4, v6
-; GFX6-NEXT:    s_mov_b32 s5, 0xffff
+; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v4
 ; GFX6-NEXT:    v_or_b32_e32 v4, v5, v4
 ; GFX6-NEXT:    s_bfe_u32 s4, 1, 0x100000
-; GFX6-NEXT:    v_and_b32_e32 v5, s5, v2
+; GFX6-NEXT:    v_bfe_u32 v5, v2, 1, 15
+; GFX6-NEXT:    s_bfe_u32 s5, 14, 0x100000
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, s4, v0
-; GFX6-NEXT:    v_lshrrev_b32_e32 v5, 15, v5
+; GFX6-NEXT:    v_lshrrev_b32_e32 v5, s5, v5
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v5
-; GFX6-NEXT:    v_and_b32_e32 v5, s5, v3
+; GFX6-NEXT:    v_bfe_u32 v5, v3, 1, 15
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, s4, v1
-; GFX6-NEXT:    v_lshrrev_b32_e32 v5, 15, v5
+; GFX6-NEXT:    v_lshrrev_b32_e32 v5, s5, v5
 ; GFX6-NEXT:    v_xor_b32_e32 v4, -1, v4
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v5
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 1, v2
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
-; GFX6-NEXT:    v_and_b32_e32 v7, 15, v4
+; GFX6-NEXT:    v_and_b32_e32 v6, 15, v4
 ; GFX6-NEXT:    v_xor_b32_e32 v4, -1, v4
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 1, v2
 ; GFX6-NEXT:    v_and_b32_e32 v4, 15, v4
-; GFX6-NEXT:    v_and_b32_e32 v2, v2, v6
-; GFX6-NEXT:    v_bfe_u32 v7, v7, 0, 16
-; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 1, v2
+; GFX6-NEXT:    v_bfe_u32 v6, v6, 0, 16
+; GFX6-NEXT:    v_bfe_u32 v2, v2, 1, 15
 ; GFX6-NEXT:    v_bfe_u32 v4, v4, 0, 16
-; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v7, v0
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v6, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v4, v2
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX6-NEXT:    v_and_b32_e32 v2, 15, v5
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 1, v3
 ; GFX6-NEXT:    v_xor_b32_e32 v4, -1, v5
-; GFX6-NEXT:    v_bfe_u32 v2, v2, 0, 16
+; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 1, v3
 ; GFX6-NEXT:    v_and_b32_e32 v4, 15, v4
+; GFX6-NEXT:    v_bfe_u32 v2, v2, 0, 16
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, v2, v1
-; GFX6-NEXT:    v_and_b32_e32 v2, v3, v6
-; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 1, v2
+; GFX6-NEXT:    v_bfe_u32 v2, v3, 1, 15
 ; GFX6-NEXT:    v_bfe_u32 v3, v4, 0, 16
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v3, v2
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
@@ -3286,14 +3274,15 @@ define <2 x i16> @v_fshr_v2i16_4_8(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_bfe_u32 s4, 12, 0x100000
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, s4, v0
-; GFX6-NEXT:    s_mov_b32 s4, 0xffff
-; GFX6-NEXT:    v_and_b32_e32 v2, s4, v2
-; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 4, v2
+; GFX6-NEXT:    v_bfe_u32 v2, v2, 1, 15
+; GFX6-NEXT:    s_bfe_u32 s4, 3, 0x100000
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, s4, v2
+; GFX6-NEXT:    s_bfe_u32 s4, 8, 0x100000
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX6-NEXT:    s_bfe_u32 s5, 8, 0x100000
-; GFX6-NEXT:    v_and_b32_e32 v2, s4, v3
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, s5, v1
-; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 8, v2
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, s4, v1
+; GFX6-NEXT:    v_bfe_u32 v2, v3, 1, 15
+; GFX6-NEXT:    s_bfe_u32 s4, 7, 0x100000
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, s4, v2
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3341,36 +3330,35 @@ define amdgpu_ps float @v_fshr_v2i16_ssv(<2 x i16> inreg %lhs, <2 x i16> inreg %
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX6-NEXT:    s_mov_b32 s5, 0xffff
+; GFX6-NEXT:    s_mov_b32 s5, 0xf0001
 ; GFX6-NEXT:    s_bfe_u32 s4, 1, 0x100000
-; GFX6-NEXT:    s_and_b32 s6, s2, s5
+; GFX6-NEXT:    s_bfe_u32 s6, s2, s5
+; GFX6-NEXT:    s_bfe_u32 s7, 14, 0x100000
 ; GFX6-NEXT:    v_xor_b32_e32 v0, -1, v0
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, s4
-; GFX6-NEXT:    s_lshr_b32 s6, s6, 15
+; GFX6-NEXT:    s_lshr_b32 s6, s6, s7
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX6-NEXT:    v_and_b32_e32 v2, 15, v0
+; GFX6-NEXT:    v_xor_b32_e32 v0, -1, v0
 ; GFX6-NEXT:    s_or_b32 s0, s0, s6
 ; GFX6-NEXT:    s_lshl_b32 s2, s2, 1
-; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX6-NEXT:    v_xor_b32_e32 v0, -1, v0
-; GFX6-NEXT:    v_bfe_u32 v2, v2, 0, 16
 ; GFX6-NEXT:    v_and_b32_e32 v0, 15, v0
+; GFX6-NEXT:    v_bfe_u32 v2, v2, 0, 16
 ; GFX6-NEXT:    v_lshl_b32_e32 v2, s0, v2
-; GFX6-NEXT:    s_and_b32 s0, s2, s5
-; GFX6-NEXT:    s_lshr_b32 s0, s0, 1
+; GFX6-NEXT:    s_bfe_u32 s0, s2, s5
 ; GFX6-NEXT:    v_bfe_u32 v0, v0, 0, 16
 ; GFX6-NEXT:    v_lshr_b32_e32 v0, s0, v0
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, s4
-; GFX6-NEXT:    s_and_b32 s4, s3, s5
-; GFX6-NEXT:    s_lshl_b32 s3, s3, 1
+; GFX6-NEXT:    s_bfe_u32 s4, s3, s5
 ; GFX6-NEXT:    v_or_b32_e32 v0, v2, v0
 ; GFX6-NEXT:    v_and_b32_e32 v2, 15, v1
 ; GFX6-NEXT:    v_xor_b32_e32 v1, -1, v1
-; GFX6-NEXT:    s_lshr_b32 s4, s4, 15
+; GFX6-NEXT:    s_lshr_b32 s4, s4, s7
+; GFX6-NEXT:    s_lshl_b32 s3, s3, 1
 ; GFX6-NEXT:    v_and_b32_e32 v1, 15, v1
-; GFX6-NEXT:    s_and_b32 s0, s3, s5
 ; GFX6-NEXT:    s_or_b32 s1, s1, s4
 ; GFX6-NEXT:    v_bfe_u32 v2, v2, 0, 16
-; GFX6-NEXT:    s_lshr_b32 s0, s0, 1
+; GFX6-NEXT:    s_bfe_u32 s0, s3, s5
 ; GFX6-NEXT:    v_bfe_u32 v1, v1, 0, 16
 ; GFX6-NEXT:    v_lshl_b32_e32 v2, s1, v2
 ; GFX6-NEXT:    v_lshr_b32_e32 v1, s0, v1
@@ -3456,38 +3444,36 @@ define amdgpu_ps float @v_fshr_v2i16_ssv(<2 x i16> inreg %lhs, <2 x i16> inreg %
 define amdgpu_ps float @v_fshr_v2i16_svs(<2 x i16> inreg %lhs, <2 x i16> %rhs, <2 x i16> inreg %amt) {
 ; GFX6-LABEL: v_fshr_v2i16_svs:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
-; GFX6-NEXT:    s_and_b32 s2, s2, s4
+; GFX6-NEXT:    s_and_b32 s2, s2, 0xffff
 ; GFX6-NEXT:    s_or_b32 s2, s3, s2
 ; GFX6-NEXT:    s_bfe_u32 s3, 1, 0x100000
-; GFX6-NEXT:    v_and_b32_e32 v2, s4, v0
+; GFX6-NEXT:    v_bfe_u32 v2, v0, 1, 15
+; GFX6-NEXT:    s_bfe_u32 s4, 14, 0x100000
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, s3
-; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 15, v2
-; GFX6-NEXT:    v_and_b32_e32 v3, s4, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, s4, v2
+; GFX6-NEXT:    v_bfe_u32 v3, v1, 1, 15
 ; GFX6-NEXT:    v_or_b32_e32 v2, s0, v2
 ; GFX6-NEXT:    s_lshl_b32 s0, s1, s3
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 15, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v3, s4, v3
 ; GFX6-NEXT:    v_or_b32_e32 v3, s0, v3
-; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX6-NEXT:    s_xor_b32 s0, s2, -1
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX6-NEXT:    s_lshr_b32 s1, s0, 16
 ; GFX6-NEXT:    s_and_b32 s2, s0, 15
 ; GFX6-NEXT:    s_andn2_b32 s0, 15, s0
-; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
-; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 1, v0
+; GFX6-NEXT:    v_bfe_u32 v0, v0, 1, 15
 ; GFX6-NEXT:    s_bfe_u32 s0, s0, 0x100000
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 1, v1
 ; GFX6-NEXT:    s_bfe_u32 s2, s2, 0x100000
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, s0, v0
 ; GFX6-NEXT:    s_and_b32 s0, s1, 15
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 1, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, s2, v2
 ; GFX6-NEXT:    s_andn2_b32 s1, 15, s1
 ; GFX6-NEXT:    s_bfe_u32 s0, s0, 0x100000
-; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v2, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, s0, v3
-; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
+; GFX6-NEXT:    v_bfe_u32 v1, v1, 1, 15
 ; GFX6-NEXT:    s_bfe_u32 s0, s1, 0x100000
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, s0, v1
 ; GFX6-NEXT:    v_or_b32_e32 v1, v2, v1
@@ -3573,38 +3559,37 @@ define amdgpu_ps float @v_fshr_v2i16_svs(<2 x i16> inreg %lhs, <2 x i16> %rhs, <
 define amdgpu_ps float @v_fshr_v2i16_vss(<2 x i16> %lhs, <2 x i16> inreg %rhs, <2 x i16> inreg %amt) {
 ; GFX6-LABEL: v_fshr_v2i16_vss:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
-; GFX6-NEXT:    s_and_b32 s2, s2, s4
+; GFX6-NEXT:    s_and_b32 s2, s2, 0xffff
 ; GFX6-NEXT:    s_or_b32 s2, s3, s2
 ; GFX6-NEXT:    s_bfe_u32 s3, 1, 0x100000
+; GFX6-NEXT:    s_mov_b32 s4, 0xf0001
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, s3, v0
-; GFX6-NEXT:    s_and_b32 s5, s0, s4
+; GFX6-NEXT:    s_bfe_u32 s5, s0, s4
+; GFX6-NEXT:    s_bfe_u32 s6, 14, 0x100000
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, s3, v1
-; GFX6-NEXT:    s_and_b32 s3, s1, s4
-; GFX6-NEXT:    s_lshr_b32 s5, s5, 15
-; GFX6-NEXT:    s_lshr_b32 s3, s3, 15
-; GFX6-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX6-NEXT:    s_bfe_u32 s3, s1, s4
+; GFX6-NEXT:    s_lshr_b32 s5, s5, s6
+; GFX6-NEXT:    s_lshr_b32 s3, s3, s6
 ; GFX6-NEXT:    s_xor_b32 s2, s2, -1
 ; GFX6-NEXT:    v_or_b32_e32 v0, s5, v0
 ; GFX6-NEXT:    v_or_b32_e32 v1, s3, v1
+; GFX6-NEXT:    s_lshl_b32 s0, s0, 1
 ; GFX6-NEXT:    s_lshr_b32 s3, s2, 16
 ; GFX6-NEXT:    s_and_b32 s5, s2, 15
 ; GFX6-NEXT:    s_andn2_b32 s2, 15, s2
-; GFX6-NEXT:    s_and_b32 s0, s0, s4
 ; GFX6-NEXT:    s_bfe_u32 s5, s5, 0x100000
-; GFX6-NEXT:    s_lshr_b32 s0, s0, 1
+; GFX6-NEXT:    s_bfe_u32 s0, s0, s4
 ; GFX6-NEXT:    s_bfe_u32 s2, s2, 0x100000
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, s5, v0
 ; GFX6-NEXT:    s_lshr_b32 s0, s0, s2
 ; GFX6-NEXT:    v_or_b32_e32 v0, s0, v0
 ; GFX6-NEXT:    s_and_b32 s0, s3, 15
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 1
-; GFX6-NEXT:    s_bfe_u32 s0, s0, 0x100000
 ; GFX6-NEXT:    s_andn2_b32 s2, 15, s3
+; GFX6-NEXT:    s_bfe_u32 s0, s0, 0x100000
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, s0, v1
-; GFX6-NEXT:    s_and_b32 s0, s1, s4
-; GFX6-NEXT:    s_lshr_b32 s0, s0, 1
+; GFX6-NEXT:    s_bfe_u32 s0, s1, s4
 ; GFX6-NEXT:    s_bfe_u32 s1, s2, 0x100000
 ; GFX6-NEXT:    s_lshr_b32 s0, s0, s1
 ; GFX6-NEXT:    v_or_b32_e32 v1, s0, v1
@@ -3715,35 +3700,35 @@ define amdgpu_ps <2 x i32> @s_fshr_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %
 ; GFX6-NEXT:    s_or_b32 s8, s9, s8
 ; GFX6-NEXT:    s_lshl_b32 s9, s11, 16
 ; GFX6-NEXT:    s_and_b32 s10, s10, s12
+; GFX6-NEXT:    s_mov_b32 s11, 0xf0001
 ; GFX6-NEXT:    s_or_b32 s9, s9, s10
 ; GFX6-NEXT:    s_bfe_u32 s10, 1, 0x100000
-; GFX6-NEXT:    s_and_b32 s11, s4, s12
+; GFX6-NEXT:    s_bfe_u32 s12, s4, s11
+; GFX6-NEXT:    s_bfe_u32 s13, 14, 0x100000
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, s10
-; GFX6-NEXT:    s_lshr_b32 s11, s11, 15
-; GFX6-NEXT:    s_or_b32 s0, s0, s11
-; GFX6-NEXT:    s_and_b32 s11, s5, s12
+; GFX6-NEXT:    s_lshr_b32 s12, s12, s13
+; GFX6-NEXT:    s_or_b32 s0, s0, s12
+; GFX6-NEXT:    s_bfe_u32 s12, s5, s11
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, s10
-; GFX6-NEXT:    s_lshr_b32 s11, s11, 15
-; GFX6-NEXT:    s_lshl_b32 s4, s4, 1
+; GFX6-NEXT:    s_lshr_b32 s12, s12, s13
 ; GFX6-NEXT:    s_xor_b32 s8, s8, -1
-; GFX6-NEXT:    s_or_b32 s1, s1, s11
-; GFX6-NEXT:    s_lshr_b32 s11, s8, 16
-; GFX6-NEXT:    s_and_b32 s13, s8, 15
+; GFX6-NEXT:    s_or_b32 s1, s1, s12
+; GFX6-NEXT:    s_lshl_b32 s4, s4, 1
+; GFX6-NEXT:    s_lshr_b32 s12, s8, 16
+; GFX6-NEXT:    s_and_b32 s14, s8, 15
 ; GFX6-NEXT:    s_andn2_b32 s8, 15, s8
-; GFX6-NEXT:    s_and_b32 s4, s4, s12
-; GFX6-NEXT:    s_bfe_u32 s13, s13, 0x100000
-; GFX6-NEXT:    s_lshr_b32 s4, s4, 1
+; GFX6-NEXT:    s_bfe_u32 s14, s14, 0x100000
+; GFX6-NEXT:    s_bfe_u32 s4, s4, s11
 ; GFX6-NEXT:    s_bfe_u32 s8, s8, 0x100000
-; GFX6-NEXT:    s_lshl_b32 s0, s0, s13
+; GFX6-NEXT:    s_lshl_b32 s0, s0, s14
 ; GFX6-NEXT:    s_lshr_b32 s4, s4, s8
 ; GFX6-NEXT:    s_or_b32 s0, s0, s4
-; GFX6-NEXT:    s_and_b32 s4, s11, 15
+; GFX6-NEXT:    s_and_b32 s4, s12, 15
 ; GFX6-NEXT:    s_lshl_b32 s5, s5, 1
+; GFX6-NEXT:    s_andn2_b32 s8, 15, s12
 ; GFX6-NEXT:    s_bfe_u32 s4, s4, 0x100000
-; GFX6-NEXT:    s_andn2_b32 s8, 15, s11
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, s4
-; GFX6-NEXT:    s_and_b32 s4, s5, s12
-; GFX6-NEXT:    s_lshr_b32 s4, s4, 1
+; GFX6-NEXT:    s_bfe_u32 s4, s5, s11
 ; GFX6-NEXT:    s_bfe_u32 s5, s8, 0x100000
 ; GFX6-NEXT:    s_lshr_b32 s4, s4, s5
 ; GFX6-NEXT:    s_or_b32 s1, s1, s4
@@ -3752,32 +3737,30 @@ define amdgpu_ps <2 x i32> @s_fshr_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    s_lshl_b32 s1, s2, s10
-; GFX6-NEXT:    s_and_b32 s2, s6, s12
-; GFX6-NEXT:    s_lshr_b32 s2, s2, 15
+; GFX6-NEXT:    s_bfe_u32 s2, s6, s11
+; GFX6-NEXT:    s_lshr_b32 s2, s2, s13
 ; GFX6-NEXT:    s_or_b32 s1, s1, s2
 ; GFX6-NEXT:    s_lshl_b32 s2, s3, s10
-; GFX6-NEXT:    s_and_b32 s3, s7, s12
-; GFX6-NEXT:    s_lshr_b32 s3, s3, 15
+; GFX6-NEXT:    s_bfe_u32 s3, s7, s11
+; GFX6-NEXT:    s_lshr_b32 s3, s3, s13
+; GFX6-NEXT:    s_xor_b32 s5, s9, -1
 ; GFX6-NEXT:    s_or_b32 s2, s2, s3
 ; GFX6-NEXT:    s_lshl_b32 s3, s6, 1
-; GFX6-NEXT:    s_xor_b32 s5, s9, -1
 ; GFX6-NEXT:    s_lshl_b32 s4, s7, 1
 ; GFX6-NEXT:    s_lshr_b32 s6, s5, 16
 ; GFX6-NEXT:    s_and_b32 s7, s5, 15
 ; GFX6-NEXT:    s_andn2_b32 s5, 15, s5
-; GFX6-NEXT:    s_and_b32 s3, s3, s12
 ; GFX6-NEXT:    s_bfe_u32 s7, s7, 0x100000
-; GFX6-NEXT:    s_lshr_b32 s3, s3, 1
+; GFX6-NEXT:    s_bfe_u32 s3, s3, s11
 ; GFX6-NEXT:    s_bfe_u32 s5, s5, 0x100000
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, s7
 ; GFX6-NEXT:    s_lshr_b32 s3, s3, s5
 ; GFX6-NEXT:    s_or_b32 s1, s1, s3
 ; GFX6-NEXT:    s_and_b32 s3, s6, 15
-; GFX6-NEXT:    s_bfe_u32 s3, s3, 0x100000
 ; GFX6-NEXT:    s_andn2_b32 s5, 15, s6
+; GFX6-NEXT:    s_bfe_u32 s3, s3, 0x100000
 ; GFX6-NEXT:    s_lshl_b32 s2, s2, s3
-; GFX6-NEXT:    s_and_b32 s3, s4, s12
-; GFX6-NEXT:    s_lshr_b32 s3, s3, 1
+; GFX6-NEXT:    s_bfe_u32 s3, s4, s11
 ; GFX6-NEXT:    s_bfe_u32 s4, s5, 0x100000
 ; GFX6-NEXT:    s_lshr_b32 s3, s3, s4
 ; GFX6-NEXT:    s_or_b32 s2, s2, s3
@@ -3967,48 +3950,46 @@ define <4 x half> @v_fshr_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt)
 ; GFX6-NEXT:    v_or_b32_e32 v8, v9, v8
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v9, 16, v11
 ; GFX6-NEXT:    v_and_b32_e32 v10, v10, v12
-; GFX6-NEXT:    s_mov_b32 s5, 0xffff
 ; GFX6-NEXT:    v_or_b32_e32 v9, v9, v10
 ; GFX6-NEXT:    s_bfe_u32 s4, 1, 0x100000
-; GFX6-NEXT:    v_and_b32_e32 v10, s5, v4
+; GFX6-NEXT:    v_bfe_u32 v10, v4, 1, 15
+; GFX6-NEXT:    s_bfe_u32 s5, 14, 0x100000
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, s4, v0
-; GFX6-NEXT:    v_lshrrev_b32_e32 v10, 15, v10
+; GFX6-NEXT:    v_lshrrev_b32_e32 v10, s5, v10
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v10
-; GFX6-NEXT:    v_and_b32_e32 v10, s5, v5
+; GFX6-NEXT:    v_bfe_u32 v10, v5, 1, 15
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, s4, v1
-; GFX6-NEXT:    v_lshrrev_b32_e32 v10, 15, v10
+; GFX6-NEXT:    v_lshrrev_b32_e32 v10, s5, v10
 ; GFX6-NEXT:    v_xor_b32_e32 v8, -1, v8
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v10
-; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 1, v4
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v10, 16, v8
 ; GFX6-NEXT:    v_and_b32_e32 v11, 15, v8
 ; GFX6-NEXT:    v_xor_b32_e32 v8, -1, v8
+; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 1, v4
 ; GFX6-NEXT:    v_and_b32_e32 v8, 15, v8
-; GFX6-NEXT:    v_and_b32_e32 v4, v4, v12
 ; GFX6-NEXT:    v_bfe_u32 v11, v11, 0, 16
-; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 1, v4
+; GFX6-NEXT:    v_bfe_u32 v4, v4, 1, 15
 ; GFX6-NEXT:    v_bfe_u32 v8, v8, 0, 16
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v11, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v8, v4
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v4
 ; GFX6-NEXT:    v_and_b32_e32 v4, 15, v10
-; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 1, v5
 ; GFX6-NEXT:    v_xor_b32_e32 v8, -1, v10
-; GFX6-NEXT:    v_bfe_u32 v4, v4, 0, 16
+; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 1, v5
 ; GFX6-NEXT:    v_and_b32_e32 v8, 15, v8
+; GFX6-NEXT:    v_bfe_u32 v4, v4, 0, 16
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, v4, v1
-; GFX6-NEXT:    v_and_b32_e32 v4, v5, v12
-; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 1, v4
+; GFX6-NEXT:    v_bfe_u32 v4, v5, 1, 15
 ; GFX6-NEXT:    v_bfe_u32 v5, v8, 0, 16
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v5, v4
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v4
-; GFX6-NEXT:    v_and_b32_e32 v4, v6, v12
+; GFX6-NEXT:    v_bfe_u32 v4, v6, 1, 15
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, s4, v2
-; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 15, v4
+; GFX6-NEXT:    v_lshrrev_b32_e32 v4, s5, v4
 ; GFX6-NEXT:    v_or_b32_e32 v2, v2, v4
-; GFX6-NEXT:    v_and_b32_e32 v4, v7, v12
+; GFX6-NEXT:    v_bfe_u32 v4, v7, 1, 15
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, s4, v3
-; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 15, v4
+; GFX6-NEXT:    v_lshrrev_b32_e32 v4, s5, v4
 ; GFX6-NEXT:    v_or_b32_e32 v3, v3, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 1, v6
 ; GFX6-NEXT:    v_xor_b32_e32 v6, -1, v9
@@ -4017,20 +3998,18 @@ define <4 x half> @v_fshr_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt)
 ; GFX6-NEXT:    v_and_b32_e32 v8, 15, v6
 ; GFX6-NEXT:    v_xor_b32_e32 v6, -1, v6
 ; GFX6-NEXT:    v_and_b32_e32 v6, 15, v6
-; GFX6-NEXT:    v_and_b32_e32 v4, v4, v12
 ; GFX6-NEXT:    v_bfe_u32 v8, v8, 0, 16
-; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 1, v4
+; GFX6-NEXT:    v_bfe_u32 v4, v4, 1, 15
 ; GFX6-NEXT:    v_bfe_u32 v6, v6, 0, 16
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, v8, v2
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v6, v4
 ; GFX6-NEXT:    v_or_b32_e32 v2, v2, v4
 ; GFX6-NEXT:    v_and_b32_e32 v4, 15, v7
 ; GFX6-NEXT:    v_xor_b32_e32 v6, -1, v7
-; GFX6-NEXT:    v_bfe_u32 v4, v4, 0, 16
 ; GFX6-NEXT:    v_and_b32_e32 v6, 15, v6
+; GFX6-NEXT:    v_bfe_u32 v4, v4, 0, 16
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, v4, v3
-; GFX6-NEXT:    v_and_b32_e32 v4, v5, v12
-; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 1, v4
+; GFX6-NEXT:    v_bfe_u32 v4, v5, 1, 15
 ; GFX6-NEXT:    v_bfe_u32 v5, v6, 0, 16
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v5, v4
 ; GFX6-NEXT:    v_or_b32_e32 v3, v3, v4

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll
index a87e8f8e3a27a..900f53ba38713 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll
@@ -1020,8 +1020,7 @@ define amdgpu_kernel void @and_lshr(i32 addrspace(1)* %out, i32 %a) #0 {
 ; GFX6-NEXT:    s_mov_b32 s6, -1
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_and_b32 s0, s0, 0x1c0
-; GFX6-NEXT:    s_lshr_b32 s0, s0, 6
+; GFX6-NEXT:    s_bfe_u32 s0, s0, 0x30006
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
@@ -1039,8 +1038,7 @@ define amdgpu_kernel void @and_lshr2(i32 addrspace(1)* %out, i32 %a) #0 {
 ; GFX6-NEXT:    s_mov_b32 s6, -1
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_and_b32 s0, s0, 0x1ff
-; GFX6-NEXT:    s_lshr_b32 s0, s0, 6
+; GFX6-NEXT:    s_bfe_u32 s0, s0, 0x30006
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
index 4b3fc2e1a6102..d0e71fb9ecc25 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
@@ -42,8 +42,7 @@ define i8 @v_lshr_i8_7(i8 %value) {
 ; GFX6-LABEL: v_lshr_i8_7:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 7, v0
+; GFX6-NEXT:    v_bfe_u32 v0, v0, 7, 1
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_lshr_i8_7:
@@ -110,14 +109,12 @@ define amdgpu_ps i8 @s_lshr_i8(i8 inreg %value, i8 inreg %amount) {
 define amdgpu_ps i8 @s_lshr_i8_7(i8 inreg %value) {
 ; GCN-LABEL: s_lshr_i8_7:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_and_b32 s0, s0, 0xff
-; GCN-NEXT:    s_lshr_b32 s0, s0, 7
+; GCN-NEXT:    s_bfe_u32 s0, s0, 0x10007
 ; GCN-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_lshr_i8_7:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_and_b32 s0, s0, 0xff
-; GFX10-NEXT:    s_lshr_b32 s0, s0, 7
+; GFX10-NEXT:    s_bfe_u32 s0, s0, 0x10007
 ; GFX10-NEXT:    ; return to shader part epilog
   %result = lshr i8 %value, 7
   ret i8 %result
@@ -151,16 +148,14 @@ define i24 @v_lshr_i24_7(i24 %value) {
 ; GCN-LABEL: v_lshr_i24_7:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
-; GCN-NEXT:    v_lshrrev_b32_e32 v0, 7, v0
+; GCN-NEXT:    v_bfe_u32 v0, v0, 7, 17
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_lshr_i24_7:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 7, v0
+; GFX10-NEXT:    v_bfe_u32 v0, v0, 7, 17
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %result = lshr i24 %value, 7
   ret i24 %result
@@ -189,14 +184,12 @@ define amdgpu_ps i24 @s_lshr_i24(i24 inreg %value, i24 inreg %amount) {
 define amdgpu_ps i24 @s_lshr_i24_7(i24 inreg %value) {
 ; GCN-LABEL: s_lshr_i24_7:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_and_b32 s0, s0, 0xffffff
-; GCN-NEXT:    s_lshr_b32 s0, s0, 7
+; GCN-NEXT:    s_bfe_u32 s0, s0, 0x110007
 ; GCN-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_lshr_i24_7:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_and_b32 s0, s0, 0xffffff
-; GFX10-NEXT:    s_lshr_b32 s0, s0, 7
+; GFX10-NEXT:    s_bfe_u32 s0, s0, 0x110007
 ; GFX10-NEXT:    ; return to shader part epilog
   %result = lshr i24 %value, 7
   ret i24 %result
@@ -664,14 +657,12 @@ define amdgpu_ps i16 @s_lshr_i16(i16 inreg %value, i16 inreg %amount) {
 define amdgpu_ps i16 @s_lshr_i16_15(i16 inreg %value) {
 ; GCN-LABEL: s_lshr_i16_15:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_and_b32 s0, s0, 0xffff
-; GCN-NEXT:    s_lshr_b32 s0, s0, 15
+; GCN-NEXT:    s_bfe_u32 s0, s0, 0x1000f
 ; GCN-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_lshr_i16_15:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX10-NEXT:    s_lshr_b32 s0, s0, 15
+; GFX10-NEXT:    s_bfe_u32 s0, s0, 0x1000f
 ; GFX10-NEXT:    ; return to shader part epilog
   %result = lshr i16 %value, 15
   ret i16 %result
@@ -774,11 +765,8 @@ define <2 x i16> @v_lshr_v2i16_15(<2 x i16> %value) {
 ; GFX6-LABEL: v_lshr_v2i16_15:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, 0xffff
-; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
-; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
-; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 15, v0
-; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 15, v1
+; GFX6-NEXT:    v_bfe_u32 v0, v0, 15, 1
+; GFX6-NEXT:    v_bfe_u32 v1, v1, 15, 1
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_lshr_v2i16_15:


        


More information about the llvm-commits mailing list