[llvm] 40e269e - [GlobalISel] Add a combine for ashr(shl x, c), c --> sext_inreg x, c'

Amara Emerson via llvm-commits llvm-commits at lists.llvm.org
Tue Aug 18 10:42:26 PDT 2020


Author: Amara Emerson
Date: 2020-08-18T10:42:15-07:00
New Revision: 40e269ea6db9c755c27e2ee1e201a640ac085afd

URL: https://github.com/llvm/llvm-project/commit/40e269ea6db9c755c27e2ee1e201a640ac085afd
DIFF: https://github.com/llvm/llvm-project/commit/40e269ea6db9c755c27e2ee1e201a640ac085afd.diff

LOG: [GlobalISel] Add a combine for ashr(shl x, c), c --> sext_inreg x, c'

By detecting this sign-extension pattern early, we can uncover opportunities for
more optimizations.
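
As a rough illustration (a schematic sketch, not taken from the patch; the
register numbers and the s32/24-bit choice are hypothetical), the combine
rewrites a shift pair like this:

  ; Before: shift left then arithmetic shift right by the same constant
  %1:_(s32) = G_CONSTANT i32 24
  %2:_(s32) = G_SHL %0, %1(s32)
  %3:_(s32) = G_ASHR %2, %1(s32)

  ; After: recognized as a sign extension of the low 32 - 24 = 8 bits
  %3:_(s32) = G_SEXT_INREG %0, 8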

Differential Revision: https://reviews.llvm.org/D85965

Added: 
    llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-ashr-shl-to-sext-inreg.mir

Modified: 
    llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
    llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h
    llvm/include/llvm/Target/GlobalISel/Combine.td
    llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll

Removed: 
    


################################################################################
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index e632f5fd05ec..e5f2700f6de9 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -294,6 +294,12 @@ class CombinerHelper {
   bool applyBuildInstructionSteps(MachineInstr &MI,
                                   InstructionStepsMatchInfo &MatchInfo);
 
+  /// Match ashr (shl x, C), C -> sext_inreg x, (bitwidth - C)
+  bool matchAshrShlToSextInreg(MachineInstr &MI,
+                               std::tuple<Register, int64_t> &MatchInfo);
+  bool applyAshShlToSextInreg(MachineInstr &MI,
+                              std::tuple<Register, int64_t> &MatchInfo);
+
   /// Try to transform \p MI by using all of the above
   /// combine functions. Returns true if changed.
   bool tryCombine(MachineInstr &MI);

diff --git a/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h b/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h
index 043be086ff41..4e216a284088 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h
@@ -251,6 +251,12 @@ m_GLShr(const LHS &L, const RHS &R) {
   return BinaryOp_match<LHS, RHS, TargetOpcode::G_LSHR, false>(L, R);
 }
 
+template <typename LHS, typename RHS>
+inline BinaryOp_match<LHS, RHS, TargetOpcode::G_ASHR, false>
+m_GAShr(const LHS &L, const RHS &R) {
+  return BinaryOp_match<LHS, RHS, TargetOpcode::G_ASHR, false>(L, R);
+}
+
 // Helper for unary instructions (G_[ZSA]EXT/G_TRUNC) etc
 template <typename SrcTy, unsigned Opcode> struct UnaryOp_match {
   SrcTy L;

diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 9cb45e2bfc11..4647afad4185 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -284,6 +284,15 @@ def hoist_logic_op_with_same_opcode_hands: GICombineRule <
   (apply [{ return Helper.applyBuildInstructionSteps(*${root}, ${info});}])
 >;
 
+// Fold ashr (shl x, C), C -> sext_inreg x, (bitwidth - C)
+def shl_ashr_to_sext_inreg_matchinfo : GIDefMatchData<"std::tuple<Register, int64_t>">;
+def shl_ashr_to_sext_inreg : GICombineRule<
+  (defs root:$root, shl_ashr_to_sext_inreg_matchinfo:$info),
+  (match (wip_match_opcode G_ASHR): $root,
+    [{ return Helper.matchAshrShlToSextInreg(*${root}, ${info}); }]),
+  (apply [{ return Helper.applyAshShlToSextInreg(*${root}, ${info});}])
+>;
+
 // FIXME: These should use the custom predicate feature once it lands.
 def undef_combines : GICombineGroup<[undef_to_fp_zero, undef_to_int_zero,
                                      undef_to_negative_one,
@@ -301,4 +310,5 @@ def trivial_combines : GICombineGroup<[copy_prop, mul_to_shl]>;
 def all_combines : GICombineGroup<[trivial_combines, ptr_add_immed_chain,
     combines_for_extload, combine_indexed_load_store, undef_combines,
     identity_combines, simplify_add_to_sub,
-    hoist_logic_op_with_same_opcode_hands]>;
+    hoist_logic_op_with_same_opcode_hands,
+    shl_ashr_to_sext_inreg]>;

diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index b922f6988a2c..48294a07597f 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -1887,6 +1887,36 @@ bool CombinerHelper::applyBuildInstructionSteps(
   return true;
 }
 
+bool CombinerHelper::matchAshrShlToSextInreg(
+    MachineInstr &MI, std::tuple<Register, int64_t> &MatchInfo) {
+  assert(MI.getOpcode() == TargetOpcode::G_ASHR);
+  int64_t ShlCst, AshrCst;
+  Register Src;
+  // FIXME: detect splat constant vectors.
+  if (!mi_match(MI.getOperand(0).getReg(), MRI,
+                m_GAShr(m_GShl(m_Reg(Src), m_ICst(ShlCst)), m_ICst(AshrCst))))
+    return false;
+  if (ShlCst != AshrCst)
+    return false;
+  if (!isLegalOrBeforeLegalizer(
+          {TargetOpcode::G_SEXT_INREG, {MRI.getType(Src)}}))
+    return false;
+  MatchInfo = {Src, ShlCst};
+  return true;
+}
+bool CombinerHelper::applyAshShlToSextInreg(
+    MachineInstr &MI, std::tuple<Register, int64_t> &MatchInfo) {
+  assert(MI.getOpcode() == TargetOpcode::G_ASHR);
+  Register Src;
+  int64_t ShiftAmt;
+  std::tie(Src, ShiftAmt) = MatchInfo;
+  unsigned Size = MRI.getType(Src).getScalarSizeInBits();
+  Builder.setInstrAndDebugLoc(MI);
+  Builder.buildSExtInReg(MI.getOperand(0).getReg(), Src, Size - ShiftAmt);
+  MI.eraseFromParent();
+  return true;
+}
+
 bool CombinerHelper::tryCombine(MachineInstr &MI) {
   if (tryCombineCopy(MI))
     return true;

diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-ashr-shl-to-sext-inreg.mir b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-ashr-shl-to-sext-inreg.mir
new file mode 100644
index 000000000000..14bda863d2c2
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-ashr-shl-to-sext-inreg.mir
@@ -0,0 +1,90 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple aarch64 -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
+---
+name:            ashr_shl_to_sext_inreg
+alignment:       4
+tracksRegLiveness: true
+liveins:
+  - { reg: '$w0' }
+body:             |
+  bb.1:
+    liveins: $w0
+
+    ; CHECK-LABEL: name: ashr_shl_to_sext_inreg
+    ; CHECK: liveins: $w0
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+    ; CHECK: [[SEXT_INREG:%[0-9]+]]:_(s16) = G_SEXT_INREG [[TRUNC]], 8
+    ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SEXT_INREG]](s16)
+    ; CHECK: $w0 = COPY [[ANYEXT]](s32)
+    ; CHECK: RET_ReallyLR implicit $w0
+    %1:_(s32) = COPY $w0
+    %0:_(s16) = G_TRUNC %1(s32)
+    %2:_(s16) = G_CONSTANT i16 8
+    %3:_(s16) = G_SHL %0, %2(s16)
+    %4:_(s16) = exact G_ASHR %3, %2(s16)
+    %5:_(s32) = G_ANYEXT %4(s16)
+    $w0 = COPY %5(s32)
+    RET_ReallyLR implicit $w0
+
+...
+---
+name:            different_shift_amts
+alignment:       4
+tracksRegLiveness: true
+liveins:
+  - { reg: '$w0' }
+body:             |
+  bb.1:
+    liveins: $w0
+
+    ; CHECK-LABEL: name: different_shift_amts
+    ; CHECK: liveins: $w0
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+    ; CHECK: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 12
+    ; CHECK: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
+    ; CHECK: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[C]](s16)
+    ; CHECK: [[ASHR:%[0-9]+]]:_(s16) = exact G_ASHR [[SHL]], [[C1]](s16)
+    ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[ASHR]](s16)
+    ; CHECK: $w0 = COPY [[ANYEXT]](s32)
+    ; CHECK: RET_ReallyLR implicit $w0
+    %1:_(s32) = COPY $w0
+    %0:_(s16) = G_TRUNC %1(s32)
+    %2:_(s16) = G_CONSTANT i16 12
+    %4:_(s16) = G_CONSTANT i16 8
+    %3:_(s16) = G_SHL %0, %2(s16)
+    %5:_(s16) = exact G_ASHR %3, %4(s16)
+    %6:_(s32) = G_ANYEXT %5(s16)
+    $w0 = COPY %6(s32)
+    RET_ReallyLR implicit $w0
+
+...
+---
+name:            ashr_shl_to_sext_inreg_vector
+alignment:       4
+tracksRegLiveness: true
+liveins:
+  - { reg: '$d0' }
+body:             |
+  bb.1:
+    liveins: $d0
+    ; We don't currently support this for vectors; this test will need updating
+    ; when we do.
+    ; CHECK-LABEL: name: ashr_shl_to_sext_inreg_vector
+    ; CHECK: liveins: $d0
+    ; CHECK: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $d0
+    ; CHECK: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
+    ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16)
+    ; CHECK: [[SHL:%[0-9]+]]:_(<4 x s16>) = G_SHL [[COPY]], [[BUILD_VECTOR]](<4 x s16>)
+    ; CHECK: [[ASHR:%[0-9]+]]:_(<4 x s16>) = exact G_ASHR [[SHL]], [[BUILD_VECTOR]](<4 x s16>)
+    ; CHECK: $d0 = COPY [[ASHR]](<4 x s16>)
+    ; CHECK: RET_ReallyLR implicit $d0
+    %0:_(<4 x s16>) = COPY $d0
+    %2:_(s16) = G_CONSTANT i16 8
+    %1:_(<4 x s16>) = G_BUILD_VECTOR %2(s16), %2(s16), %2(s16), %2(s16)
+    %3:_(<4 x s16>) = G_SHL %0, %1(<4 x s16>)
+    %4:_(<4 x s16>) = exact G_ASHR %3, %1(<4 x s16>)
+    $d0 = COPY %4(<4 x s16>)
+    RET_ReallyLR implicit $d0
+...

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll
index e5d26476e942..f3a53fb7d22d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll
@@ -674,8 +674,7 @@ define amdgpu_kernel void @bfe_sext_in_reg_i24(i32 addrspace(1)* %out, i32 addrs
 ; GFX6-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_bfe_i32 s0, s0, 0x180000
-; GFX6-NEXT:    s_lshl_b32 s0, s0, 8
-; GFX6-NEXT:    s_ashr_i32 s0, s0, 8
+; GFX6-NEXT:    s_bfe_i32 s0, s0, 0x180000
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
@@ -830,8 +829,7 @@ define amdgpu_kernel void @sext_in_reg_i8_to_i32_bfe(i32 addrspace(1)* %out, i32
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_add_i32 s2, s2, s0
 ; GFX6-NEXT:    s_bfe_i32 s0, s2, 0x80000
-; GFX6-NEXT:    s_lshl_b32 s0, s0, 24
-; GFX6-NEXT:    s_ashr_i32 s0, s0, 24
+; GFX6-NEXT:    s_sext_i32_i8 s0, s0
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
@@ -854,8 +852,7 @@ define amdgpu_kernel void @sext_in_reg_i8_to_i32_bfe_wrong(i32 addrspace(1)* %ou
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_add_i32 s2, s2, s0
 ; GFX6-NEXT:    s_bfe_i32 s0, s2, 8
-; GFX6-NEXT:    s_lshl_b32 s0, s0, 24
-; GFX6-NEXT:    s_ashr_i32 s0, s0, 24
+; GFX6-NEXT:    s_sext_i32_i8 s0, s0
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
@@ -879,8 +876,7 @@ define amdgpu_kernel void @sextload_i8_to_i32_bfe(i32 addrspace(1)* %out, i8 add
 ; GFX6-NEXT:    buffer_load_sbyte v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 8
-; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
-; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 24, v0
+; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 8
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
   %load = load i8, i8 addrspace(1)* %ptr, align 1
@@ -904,8 +900,7 @@ define amdgpu_kernel void @sextload_i8_to_i32_bfe_0(i32 addrspace(1)* %out, i8 a
 ; GFX6-NEXT:    buffer_load_sbyte v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_bfe_i32 v0, v0, 8, 0
-; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
-; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 24, v0
+; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 8
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
   %load = load i8, i8 addrspace(1)* %ptr, align 1
@@ -927,8 +922,7 @@ define amdgpu_kernel void @sext_in_reg_i1_bfe_offset_0(i32 addrspace(1)* %out, i
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_lshl_b32 s0, s0, 31
-; GFX6-NEXT:    s_ashr_i32 s0, s0, 31
+; GFX6-NEXT:    s_bfe_i32 s0, s0, 0x10000
 ; GFX6-NEXT:    s_bfe_i32 s0, s0, 0x10000
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
@@ -951,8 +945,7 @@ define amdgpu_kernel void @sext_in_reg_i1_bfe_offset_1(i32 addrspace(1)* %out, i
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_lshl_b32 s0, s0, 30
-; GFX6-NEXT:    s_ashr_i32 s0, s0, 30
+; GFX6-NEXT:    s_bfe_i32 s0, s0, 0x20000
 ; GFX6-NEXT:    s_bfe_i32 s0, s0, 0x10001
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
@@ -975,8 +968,7 @@ define amdgpu_kernel void @sext_in_reg_i2_bfe_offset_1(i32 addrspace(1)* %out, i
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_lshl_b32 s0, s0, 30
-; GFX6-NEXT:    s_ashr_i32 s0, s0, 30
+; GFX6-NEXT:    s_bfe_i32 s0, s0, 0x20000
 ; GFX6-NEXT:    s_bfe_i32 s0, s0, 0x20001
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll
index ab3fbc03e81d..a8098b7dd9d1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll
@@ -423,8 +423,7 @@ define amdgpu_kernel void @bfe_u32_test_5(i32 addrspace(1)* %out, i32 addrspace(
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_lshl_b32 s0, s0, 31
-; GFX6-NEXT:    s_ashr_i32 s0, s0, 31
+; GFX6-NEXT:    s_bfe_i32 s0, s0, 0x10000
 ; GFX6-NEXT:    s_bfe_u32 s0, s0, 0x10000
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
@@ -950,22 +949,22 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_18(i32 addrspace(1)* %out)
 define amdgpu_kernel void @simplify_bfe_u32_multi_use_arg(i32 addrspace(1)* %out0,
 ; GFX6-LABEL: simplify_bfe_u32_multi_use_arg:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:        s_load_dwordx2 s[4:5], s[0:1], 0x9
-; GFX6-NEXT:        s_load_dwordx2 s[8:9], s[0:1], 0xb
-; GFX6-NEXT:        s_load_dwordx2 s[0:1], s[0:1], 0xd
-; GFX6-NEXT:        s_mov_b32 s6, -1
-; GFX6-NEXT:        s_mov_b32 s7, 0xf000
-; GFX6-NEXT:        s_mov_b64 s[10:11], s[6:7]
-; GFX6-NEXT:        s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:        s_load_dword s0, s[0:1], 0x0
-; GFX6-NEXT:        s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:        s_and_b32 s0, s0, 63
-; GFX6-NEXT:        s_bfe_u32 s1, s0, 0x20002
-; GFX6-NEXT:        v_mov_b32_e32 v1, s1
-; GFX6-NEXT:        v_mov_b32_e32 v0, s0
-; GFX6-NEXT:        buffer_store_dword v1, off, s[4:7], 0
-; GFX6-NEXT:        buffer_store_dword v0, off, s[8:11], 0
-; GFX6-NEXT:        s_endpgm
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
+; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xb
+; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
+; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_and_b32 s0, s0, 63
+; GFX6-NEXT:    s_bfe_u32 s1, s0, 0x20002
+; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s0
+; GFX6-NEXT:    buffer_store_dword v1, off, s[4:7], 0
+; GFX6-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; GFX6-NEXT:    s_endpgm
                                             i32 addrspace(1)* %out1,
                                             i32 addrspace(1)* %in) #0 {
   %src = load i32, i32 addrspace(1)* %in, align 4

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
index f6565fe1b6e2..db9e75dd582c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
@@ -3415,8 +3415,7 @@ define i64 @v_sdiv_i64_24bit(i64 %num, i64 %den) {
 ; CGP-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v2|
 ; CGP-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
 ; CGP-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
-; CGP-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
-; CGP-NEXT:    v_ashrrev_i32_e32 v0, 7, v0
+; CGP-NEXT:    v_bfe_i32 v0, v0, 0, 25
 ; CGP-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %num.mask = and i64 %num, 16777215
@@ -3736,10 +3735,8 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_cmp_ge_f32_e64 vcc, |v5|, |v3|
 ; CGP-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
 ; CGP-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
-; CGP-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
-; CGP-NEXT:    v_lshlrev_b32_e32 v2, 7, v2
-; CGP-NEXT:    v_ashrrev_i32_e32 v0, 7, v0
-; CGP-NEXT:    v_ashrrev_i32_e32 v2, 7, v2
+; CGP-NEXT:    v_bfe_i32 v0, v0, 0, 25
+; CGP-NEXT:    v_bfe_i32 v2, v2, 0, 25
 ; CGP-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; CGP-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
 ; CGP-NEXT:    s_setpc_b64 s[30:31]

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
index 06d46321a59b..7f55c7358597 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
@@ -3363,8 +3363,7 @@ define i64 @v_srem_i64_24bit(i64 %num, i64 %den) {
 ; CGP-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
 ; CGP-NEXT:    v_mul_lo_u32 v1, v2, v1
 ; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
-; CGP-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
-; CGP-NEXT:    v_ashrrev_i32_e32 v0, 7, v0
+; CGP-NEXT:    v_bfe_i32 v0, v0, 0, 25
 ; CGP-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %num.mask = and i64 %num, 16777215
@@ -3677,20 +3676,18 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_rcp_f32_e32 v5, v4
 ; CGP-NEXT:    v_ashrrev_i32_e32 v6, 30, v6
 ; CGP-NEXT:    v_or_b32_e32 v6, 1, v6
-; CGP-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
+; CGP-NEXT:    v_bfe_i32 v0, v0, 0, 25
 ; CGP-NEXT:    v_mul_f32_e32 v5, v1, v5
 ; CGP-NEXT:    v_trunc_f32_e32 v5, v5
 ; CGP-NEXT:    v_mad_f32 v1, -v5, v4, v1
 ; CGP-NEXT:    v_cvt_i32_f32_e32 v5, v5
 ; CGP-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v4|
 ; CGP-NEXT:    v_cndmask_b32_e32 v1, 0, v6, vcc
-; CGP-NEXT:    v_ashrrev_i32_e32 v0, 7, v0
 ; CGP-NEXT:    v_add_i32_e32 v1, vcc, v5, v1
 ; CGP-NEXT:    v_mul_lo_u32 v3, v1, v3
 ; CGP-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v2, v3
-; CGP-NEXT:    v_lshlrev_b32_e32 v2, 7, v2
-; CGP-NEXT:    v_ashrrev_i32_e32 v2, 7, v2
+; CGP-NEXT:    v_bfe_i32 v2, v2, 0, 25
 ; CGP-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %num.mask = and <2 x i64> %num, <i64 16777215, i64 16777215>


        


More information about the llvm-commits mailing list