[llvm] 70cd9f5 - AMDGPU/GlobalISel: Start implementing computeKnownBitsForTargetInstr

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Mon Aug 24 06:53:33 PDT 2020


Author: Matt Arsenault
Date: 2020-08-24T09:53:27-04:00
New Revision: 70cd9f5b779c04d1b32c790cb289c9f00f548b57

URL: https://github.com/llvm/llvm-project/commit/70cd9f5b779c04d1b32c790cb289c9f00f548b57
DIFF: https://github.com/llvm/llvm-project/commit/70cd9f5b779c04d1b32c790cb289c9f00f548b57.diff

LOG: AMDGPU/GlobalISel: Start implementing computeKnownBitsForTargetInstr

Handle workitem intrinsics. There isn't really a way to adequately test
this right now, since none of the known bits users are fine grained
enough to test the edge conditions. This triggers a number of
instances of the new 64-bit to 32-bit shift combine in the existing
tests.

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
    llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
    llvm/lib/Target/AMDGPU/SIISelLowering.cpp
    llvm/lib/Target/AMDGPU/SIISelLowering.h
    llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 3698881ffb7b..c03d24016cac 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -438,6 +438,21 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
   return Requested;
 }
 
+static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
+  const MDNode *MD = Kernel.getMetadata("reqd_work_group_size");
+  if (!MD || MD->getNumOperands() != 3) // Absent or malformed: no requirement.
+    return std::numeric_limits<unsigned>::max();
+  return mdconst::extract<ConstantInt>(MD->getOperand(Dim))->getZExtValue();
+}
+
+unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
+                                           unsigned Dimension) const {
+  // A reqd_work_group_size entry pins the size of this dimension exactly.
+  unsigned Required = getReqdWorkGroupSize(Kernel, Dimension);
+  bool HasRequired = Required != std::numeric_limits<unsigned>::max();
+  return (HasRequired ? Required : getFlatWorkGroupSizes(Kernel).second) - 1;
+}
+
 bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
   Function *Kernel = I->getParent()->getParent();
   unsigned MinSize = 0;
@@ -474,11 +489,11 @@ bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
       default:
         break;
       }
+
       if (Dim <= 3) {
-        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
-          if (Node->getNumOperands() == 3)
-            MinSize = MaxSize = mdconst::extract<ConstantInt>(
-                                  Node->getOperand(Dim))->getZExtValue();
+        unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
+        if (ReqdSize != std::numeric_limits<unsigned>::max())
+          MinSize = MaxSize = ReqdSize;
       }
     }
   }

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index d5fca0313d75..97b15bac774a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -239,6 +239,10 @@ class AMDGPUSubtarget {
   /// subtarget without any kind of limitation.
   unsigned getMaxWavesPerEU() const { return MaxWavesPerEU; }
 
+  /// Return the maximum workitem ID value in the function, for the given (0, 1,
+  /// 2) dimension.
+  unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const;
+
   /// Creates value range metadata on an workitemid.* inrinsic call or load.
   bool makeLIDRangeMetadata(Instruction *I) const;
 

diff  --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index db117706812d..dac9bdf2fb7c 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -11395,6 +11395,42 @@ void SITargetLowering::computeKnownBitsForFrameIndex(
   Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
 }
 
+static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelKnownBits &KB,
+                                   KnownBits &Known, unsigned Dim) {
+  const Function &Kernel = KB.getMachineFunction().getFunction();
+  // Every bit above the largest possible ID is known to be zero.
+  Known.Zero.setHighBits(countLeadingZeros(ST.getMaxWorkitemID(Kernel, Dim)));
+}
+
+void SITargetLowering::computeKnownBitsForTargetInstr(
+    GISelKnownBits &KB, Register R, KnownBits &Known, const APInt &DemandedElts,
+    const MachineRegisterInfo &MRI, unsigned Depth) const {
+  const MachineInstr *MI = MRI.getVRegDef(R);
+  switch (MI->getOpcode()) {
+  case AMDGPU::G_INTRINSIC: // Unhandled intrinsic IDs leave Known unchanged.
+    switch (MI->getIntrinsicID()) {
+    case Intrinsic::amdgcn_workitem_id_x:
+      knownBitsForWorkitemID(*getSubtarget(), KB, Known, 0);
+      break;
+    case Intrinsic::amdgcn_workitem_id_y:
+      knownBitsForWorkitemID(*getSubtarget(), KB, Known, 1);
+      break;
+    case Intrinsic::amdgcn_workitem_id_z:
+      knownBitsForWorkitemID(*getSubtarget(), KB, Known, 2);
+      break;
+    case Intrinsic::amdgcn_mbcnt_lo:
+    case Intrinsic::amdgcn_mbcnt_hi: {
+      // These return at most the wavefront size - 1.
+      unsigned Size = MRI.getType(R).getSizeInBits();
+      Known.Zero.setHighBits(Size - getSubtarget()->getWavefrontSizeLog2());
+      break;
+    }
+    default:
+      break;
+    }
+  }
+}
+
 Align SITargetLowering::computeKnownAlignForTargetInstr(
   GISelKnownBits &KB, Register R, const MachineRegisterInfo &MRI,
   unsigned Depth) const {

diff  --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 8dfb87bdc98a..3e8220ad9db2 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -420,6 +420,11 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   void computeKnownBitsForFrameIndex(int FrameIdx,
                                      KnownBits &Known,
                                      const MachineFunction &MF) const override;
+  void computeKnownBitsForTargetInstr(GISelKnownBits &Analysis, Register R,
+                                      KnownBits &Known,
+                                      const APInt &DemandedElts,
+                                      const MachineRegisterInfo &MRI,
+                                      unsigned Depth = 0) const override;
 
   Align computeKnownAlignForTargetInstr(GISelKnownBits &Analysis, Register R,
                                         const MachineRegisterInfo &MRI,

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
index d5d991288cce..37ea037ab1a7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
@@ -581,8 +581,8 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
-; SI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 2
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT:    v_mov_b32_e32 v1, 0
 ; SI-NEXT:    s_mov_b32 s2, 0
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -612,13 +612,12 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
-; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; VI-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 1, v0
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, 2, v0
@@ -688,8 +687,8 @@ define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
-; SI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 2
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT:    v_mov_b32_e32 v1, 0
 ; SI-NEXT:    s_mov_b32 s2, 0
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -707,13 +706,12 @@ define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
-; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; VI-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 2, v0
@@ -737,8 +735,8 @@ define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
-; SI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 2
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT:    v_mov_b32_e32 v1, 0
 ; SI-NEXT:    s_mov_b32 s2, 0
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -755,13 +753,12 @@ define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
-; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; VI-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_and_b32_e32 v0, 0xff00, v0
@@ -830,8 +827,8 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* no
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
-; SI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 2
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT:    v_mov_b32_e32 v1, 0
 ; SI-NEXT:    s_mov_b32 s2, 0
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -861,13 +858,12 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* no
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
-; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; VI-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 1, v0
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, 2, v0
@@ -904,8 +900,8 @@ define amdgpu_kernel void @extract_byte0_to_f32(float addrspace(1)* noalias %out
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
-; SI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 2
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT:    v_mov_b32_e32 v1, 0
 ; SI-NEXT:    s_mov_b32 s2, 0
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -922,13 +918,12 @@ define amdgpu_kernel void @extract_byte0_to_f32(float addrspace(1)* noalias %out
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
-; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; VI-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
@@ -950,8 +945,8 @@ define amdgpu_kernel void @extract_byte1_to_f32(float addrspace(1)* noalias %out
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
-; SI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 2
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT:    v_mov_b32_e32 v1, 0
 ; SI-NEXT:    s_mov_b32 s2, 0
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -969,13 +964,12 @@ define amdgpu_kernel void @extract_byte1_to_f32(float addrspace(1)* noalias %out
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
-; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; VI-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
@@ -999,8 +993,8 @@ define amdgpu_kernel void @extract_byte2_to_f32(float addrspace(1)* noalias %out
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
-; SI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 2
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT:    v_mov_b32_e32 v1, 0
 ; SI-NEXT:    s_mov_b32 s2, 0
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1018,13 +1012,12 @@ define amdgpu_kernel void @extract_byte2_to_f32(float addrspace(1)* noalias %out
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
-; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; VI-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v1, 0xff
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1049,8 +1042,8 @@ define amdgpu_kernel void @extract_byte3_to_f32(float addrspace(1)* noalias %out
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
-; SI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 2
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT:    v_mov_b32_e32 v1, 0
 ; SI-NEXT:    s_mov_b32 s2, 0
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1066,13 +1059,12 @@ define amdgpu_kernel void @extract_byte3_to_f32(float addrspace(1)* noalias %out
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
-; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; VI-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f32_ubyte3_e32 v2, v0
@@ -1094,8 +1086,8 @@ define amdgpu_kernel void @cvt_ubyte0_or_multiuse(i32 addrspace(1)* %in, float a
 ; SI-LABEL: cvt_ubyte0_or_multiuse:
 ; SI:       ; %bb.0: ; %bb
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 2
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT:    v_mov_b32_e32 v1, 0
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1114,13 +1106,12 @@ define amdgpu_kernel void @cvt_ubyte0_or_multiuse(i32 addrspace(1)* %in, float a
 ; VI-LABEL: cvt_ubyte0_or_multiuse:
 ; VI:       ; %bb.0: ; %bb
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; VI-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_or_b32_e32 v0, 0x80000001, v0

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll
index 5d1468eba04e..43692dc81535 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll
@@ -5,122 +5,61 @@ define amdgpu_kernel void @v_insert_v64i32_37(<64 x i32> addrspace(1)* %ptr.in,
 ; GCN-LABEL: v_insert_v64i32_37:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GCN-NEXT:    v_lshlrev_b64 v[0:1], 8, v[0:1]
+; GCN-NEXT:    v_lshlrev_b32_e32 v64, 8, v0
+; GCN-NEXT:    s_movk_i32 s4, 0x80
+; GCN-NEXT:    s_mov_b32 s5, 0
+; GCN-NEXT:    v_mov_b32_e32 v2, s4
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v3, s1
-; GCN-NEXT:    v_mov_b32_e32 v2, s0
-; GCN-NEXT:    v_add_co_u32_e32 v8, vcc, v2, v0
-; GCN-NEXT:    s_mov_b32 s1, 0
-; GCN-NEXT:    v_addc_co_u32_e32 v9, vcc, v3, v1, vcc
-; GCN-NEXT:    s_movk_i32 s0, 0x80
-; GCN-NEXT:    v_mov_b32_e32 v3, s1
-; GCN-NEXT:    v_mov_b32_e32 v2, s0
-; GCN-NEXT:    v_add_co_u32_e32 v12, vcc, v8, v2
-; GCN-NEXT:    s_movk_i32 s0, 0xc0
-; GCN-NEXT:    v_mov_b32_e32 v65, s1
-; GCN-NEXT:    v_mov_b32_e32 v5, s3
-; GCN-NEXT:    v_mov_b32_e32 v64, s0
-; GCN-NEXT:    s_movk_i32 s0, 0x50
-; GCN-NEXT:    v_mov_b32_e32 v69, s1
-; GCN-NEXT:    v_addc_co_u32_e32 v13, vcc, v9, v3, vcc
-; GCN-NEXT:    v_mov_b32_e32 v4, s2
-; GCN-NEXT:    v_add_co_u32_e32 v66, vcc, v4, v0
-; GCN-NEXT:    v_mov_b32_e32 v68, s0
-; GCN-NEXT:    s_movk_i32 s0, 0x60
-; GCN-NEXT:    v_mov_b32_e32 v71, s1
-; GCN-NEXT:    v_addc_co_u32_e32 v67, vcc, v5, v1, vcc
-; GCN-NEXT:    v_mov_b32_e32 v70, s0
-; GCN-NEXT:    s_movk_i32 s0, 0x70
-; GCN-NEXT:    v_mov_b32_e32 v73, s1
-; GCN-NEXT:    v_add_co_u32_e32 v74, vcc, v66, v2
-; GCN-NEXT:    v_mov_b32_e32 v72, s0
-; GCN-NEXT:    s_movk_i32 s0, 0x90
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-NEXT:    v_addc_co_u32_e32 v75, vcc, v67, v3, vcc
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
-; GCN-NEXT:    v_add_co_u32_e32 v76, vcc, v66, v0
-; GCN-NEXT:    v_addc_co_u32_e32 v77, vcc, v67, v1, vcc
-; GCN-NEXT:    global_load_dwordx4 v[4:7], v[12:13], off offset:16
-; GCN-NEXT:    global_load_dwordx4 v[0:3], v[12:13], off
-; GCN-NEXT:    v_add_co_u32_e32 v10, vcc, 64, v8
-; GCN-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v9, vcc
-; GCN-NEXT:    v_add_co_u32_e32 v28, vcc, v8, v64
-; GCN-NEXT:    v_addc_co_u32_e32 v29, vcc, v9, v65, vcc
-; GCN-NEXT:    global_load_dwordx4 v[32:35], v[8:9], off
-; GCN-NEXT:    global_load_dwordx4 v[36:39], v[8:9], off offset:16
-; GCN-NEXT:    global_load_dwordx4 v[40:43], v[8:9], off offset:32
-; GCN-NEXT:    global_load_dwordx4 v[44:47], v[8:9], off offset:48
-; GCN-NEXT:    global_load_dwordx4 v[48:51], v[10:11], off
-; GCN-NEXT:    global_load_dwordx4 v[52:55], v[10:11], off offset:16
-; GCN-NEXT:    global_load_dwordx4 v[56:59], v[10:11], off offset:32
-; GCN-NEXT:    global_load_dwordx4 v[60:63], v[10:11], off offset:48
-; GCN-NEXT:    global_load_dwordx4 v[8:11], v[12:13], off offset:32
-; GCN-NEXT:    global_load_dwordx4 v[12:15], v[12:13], off offset:48
-; GCN-NEXT:    global_load_dwordx4 v[16:19], v[28:29], off
-; GCN-NEXT:    global_load_dwordx4 v[20:23], v[28:29], off offset:16
-; GCN-NEXT:    global_load_dwordx4 v[24:27], v[28:29], off offset:32
-; GCN-NEXT:    global_load_dwordx4 v[28:31], v[28:29], off offset:48
-; GCN-NEXT:    s_movk_i32 s0, 0xa0
+; GCN-NEXT:    v_add_co_u32_e32 v6, vcc, v0, v64
+; GCN-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
+; GCN-NEXT:    v_add_co_u32_e32 v0, vcc, 64, v6
+; GCN-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v7, vcc
+; GCN-NEXT:    v_mov_b32_e32 v3, s5
+; GCN-NEXT:    v_add_co_u32_e32 v2, vcc, v6, v2
+; GCN-NEXT:    s_movk_i32 s4, 0xc0
+; GCN-NEXT:    v_mov_b32_e32 v4, s4
+; GCN-NEXT:    v_addc_co_u32_e32 v3, vcc, v7, v3, vcc
+; GCN-NEXT:    v_mov_b32_e32 v5, s5
+; GCN-NEXT:    v_add_co_u32_e32 v16, vcc, v6, v4
+; GCN-NEXT:    v_addc_co_u32_e32 v17, vcc, v7, v5, vcc
+; GCN-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off offset:16
+; GCN-NEXT:    global_load_dwordx4 v[8:11], v[2:3], off offset:32
+; GCN-NEXT:    global_load_dwordx4 v[32:35], v[0:1], off offset:16
+; GCN-NEXT:    global_load_dwordx4 v[36:39], v[0:1], off offset:32
+; GCN-NEXT:    global_load_dwordx4 v[40:43], v[0:1], off offset:48
+; GCN-NEXT:    global_load_dwordx4 v[44:47], v64, s[0:1]
+; GCN-NEXT:    global_load_dwordx4 v[48:51], v64, s[0:1] offset:16
+; GCN-NEXT:    global_load_dwordx4 v[52:55], v64, s[0:1] offset:32
+; GCN-NEXT:    global_load_dwordx4 v[56:59], v64, s[0:1] offset:48
+; GCN-NEXT:    global_load_dwordx4 v[60:63], v64, s[0:1] offset:64
+; GCN-NEXT:    global_load_dwordx4 v[12:15], v[2:3], off offset:48
+; GCN-NEXT:    global_load_dwordx4 v[20:23], v[16:17], off offset:16
+; GCN-NEXT:    global_load_dwordx4 v[24:27], v[16:17], off offset:32
+; GCN-NEXT:    global_load_dwordx4 v[28:31], v[16:17], off offset:48
+; GCN-NEXT:    global_load_dwordx4 v[0:3], v64, s[0:1] offset:128
+; GCN-NEXT:    global_load_dwordx4 v[16:19], v64, s[0:1] offset:192
 ; GCN-NEXT:    s_waitcnt vmcnt(15)
 ; GCN-NEXT:    v_mov_b32_e32 v5, 0x3e7
-; GCN-NEXT:    s_waitcnt vmcnt(14)
-; GCN-NEXT:    global_store_dwordx4 v[74:75], v[0:3], off
-; GCN-NEXT:    global_store_dwordx4 v[76:77], v[4:7], off
-; GCN-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-NEXT:    v_mov_b32_e32 v1, s1
-; GCN-NEXT:    v_add_co_u32_e32 v0, vcc, v66, v0
-; GCN-NEXT:    s_movk_i32 s0, 0xb0
-; GCN-NEXT:    v_mov_b32_e32 v3, s1
-; GCN-NEXT:    v_mov_b32_e32 v2, s0
-; GCN-NEXT:    v_addc_co_u32_e32 v1, vcc, v67, v1, vcc
-; GCN-NEXT:    v_add_co_u32_e32 v2, vcc, v66, v2
-; GCN-NEXT:    v_addc_co_u32_e32 v3, vcc, v67, v3, vcc
-; GCN-NEXT:    s_waitcnt vmcnt(7)
-; GCN-NEXT:    global_store_dwordx4 v[0:1], v[8:11], off
-; GCN-NEXT:    s_waitcnt vmcnt(7)
-; GCN-NEXT:    global_store_dwordx4 v[2:3], v[12:15], off
-; GCN-NEXT:    v_add_co_u32_e32 v0, vcc, v66, v64
-; GCN-NEXT:    s_movk_i32 s0, 0xd0
-; GCN-NEXT:    v_mov_b32_e32 v3, s1
-; GCN-NEXT:    v_mov_b32_e32 v2, s0
-; GCN-NEXT:    v_addc_co_u32_e32 v1, vcc, v67, v65, vcc
-; GCN-NEXT:    v_add_co_u32_e32 v2, vcc, v66, v2
-; GCN-NEXT:    s_movk_i32 s0, 0xe0
-; GCN-NEXT:    v_addc_co_u32_e32 v3, vcc, v67, v3, vcc
-; GCN-NEXT:    s_waitcnt vmcnt(7)
-; GCN-NEXT:    global_store_dwordx4 v[0:1], v[16:19], off
-; GCN-NEXT:    s_waitcnt vmcnt(7)
-; GCN-NEXT:    global_store_dwordx4 v[2:3], v[20:23], off
-; GCN-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-NEXT:    v_mov_b32_e32 v1, s1
-; GCN-NEXT:    v_add_co_u32_e32 v0, vcc, v66, v0
-; GCN-NEXT:    s_movk_i32 s0, 0xf0
-; GCN-NEXT:    v_mov_b32_e32 v3, s1
-; GCN-NEXT:    v_addc_co_u32_e32 v1, vcc, v67, v1, vcc
-; GCN-NEXT:    v_mov_b32_e32 v2, s0
-; GCN-NEXT:    v_add_co_u32_e32 v2, vcc, v66, v2
-; GCN-NEXT:    v_addc_co_u32_e32 v3, vcc, v67, v3, vcc
-; GCN-NEXT:    s_waitcnt vmcnt(7)
-; GCN-NEXT:    global_store_dwordx4 v[0:1], v[24:27], off
-; GCN-NEXT:    s_waitcnt vmcnt(7)
-; GCN-NEXT:    global_store_dwordx4 v[2:3], v[28:31], off
-; GCN-NEXT:    v_add_co_u32_e32 v0, vcc, 64, v66
-; GCN-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v67, vcc
-; GCN-NEXT:    global_store_dwordx4 v[0:1], v[36:39], off offset:-48
-; GCN-NEXT:    global_store_dwordx4 v[0:1], v[40:43], off offset:-32
-; GCN-NEXT:    global_store_dwordx4 v[0:1], v[44:47], off offset:-16
-; GCN-NEXT:    global_store_dwordx4 v[0:1], v[48:51], off
-; GCN-NEXT:    v_add_co_u32_e32 v0, vcc, v66, v68
-; GCN-NEXT:    v_addc_co_u32_e32 v1, vcc, v67, v69, vcc
-; GCN-NEXT:    global_store_dwordx4 v[66:67], v[32:35], off
-; GCN-NEXT:    global_store_dwordx4 v[0:1], v[52:55], off
-; GCN-NEXT:    v_add_co_u32_e32 v0, vcc, v66, v70
-; GCN-NEXT:    v_addc_co_u32_e32 v1, vcc, v67, v71, vcc
-; GCN-NEXT:    v_add_co_u32_e32 v2, vcc, v66, v72
-; GCN-NEXT:    v_addc_co_u32_e32 v3, vcc, v67, v73, vcc
-; GCN-NEXT:    global_store_dwordx4 v[0:1], v[56:59], off
-; GCN-NEXT:    global_store_dwordx4 v[2:3], v[60:63], off
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    global_store_dwordx4 v64, v[0:3], s[2:3] offset:128
+; GCN-NEXT:    global_store_dwordx4 v64, v[4:7], s[2:3] offset:144
+; GCN-NEXT:    global_store_dwordx4 v64, v[8:11], s[2:3] offset:160
+; GCN-NEXT:    global_store_dwordx4 v64, v[12:15], s[2:3] offset:176
+; GCN-NEXT:    s_waitcnt vmcnt(4)
+; GCN-NEXT:    global_store_dwordx4 v64, v[16:19], s[2:3] offset:192
+; GCN-NEXT:    global_store_dwordx4 v64, v[20:23], s[2:3] offset:208
+; GCN-NEXT:    global_store_dwordx4 v64, v[24:27], s[2:3] offset:224
+; GCN-NEXT:    global_store_dwordx4 v64, v[44:47], s[2:3]
+; GCN-NEXT:    global_store_dwordx4 v64, v[48:51], s[2:3] offset:16
+; GCN-NEXT:    global_store_dwordx4 v64, v[52:55], s[2:3] offset:32
+; GCN-NEXT:    global_store_dwordx4 v64, v[56:59], s[2:3] offset:48
+; GCN-NEXT:    global_store_dwordx4 v64, v[28:31], s[2:3] offset:240
+; GCN-NEXT:    global_store_dwordx4 v64, v[60:63], s[2:3] offset:64
+; GCN-NEXT:    global_store_dwordx4 v64, v[32:35], s[2:3] offset:80
+; GCN-NEXT:    global_store_dwordx4 v64, v[36:39], s[2:3] offset:96
+; GCN-NEXT:    global_store_dwordx4 v64, v[40:43], s[2:3] offset:112
 ; GCN-NEXT:    s_endpgm
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.in = getelementptr <64 x i32>, <64 x i32> addrspace(1)* %ptr.in, i32 %id

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll
index 8817de69bdba..fc8df81dff84 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll
@@ -365,19 +365,18 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(i32 addrspace
 ; CI-LABEL: global_atomic_dec_ret_i32_offset_addr64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; CI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 2
+; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    v_mov_b32_e32 v2, s2
-; CI-NEXT:    v_mov_b32_e32 v3, s3
-; CI-NEXT:    v_add_i32_e32 v4, vcc, v2, v0
-; CI-NEXT:    v_addc_u32_e32 v5, vcc, v3, v1, vcc
-; CI-NEXT:    v_mov_b32_e32 v3, s1
-; CI-NEXT:    v_mov_b32_e32 v2, s0
-; CI-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
-; CI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
-; CI-NEXT:    v_add_i32_e32 v2, vcc, 20, v4
-; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
+; CI-NEXT:    v_mov_b32_e32 v0, s2
+; CI-NEXT:    v_mov_b32_e32 v1, s3
+; CI-NEXT:    v_add_i32_e32 v3, vcc, v0, v2
+; CI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; CI-NEXT:    v_mov_b32_e32 v0, s0
+; CI-NEXT:    v_mov_b32_e32 v1, s1
+; CI-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT:    v_add_i32_e32 v2, vcc, 20, v3
+; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
 ; CI-NEXT:    v_mov_b32_e32 v4, 42
 ; CI-NEXT:    flat_atomic_dec v2, v[2:3], v4 glc
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -387,19 +386,18 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(i32 addrspace
 ; VI-LABEL: global_atomic_dec_ret_i32_offset_addr64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; VI-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    v_mov_b32_e32 v3, s3
-; VI-NEXT:    v_add_u32_e32 v4, vcc, v2, v0
-; VI-NEXT:    v_addc_u32_e32 v5, vcc, v3, v1, vcc
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
-; VI-NEXT:    v_add_u32_e32 v2, vcc, 20, v4
-; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v3, vcc, v0, v2
+; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    v_add_u32_e32 v2, vcc, 20, v3
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
 ; VI-NEXT:    v_mov_b32_e32 v4, 42
 ; VI-NEXT:    flat_atomic_dec v2, v[2:3], v4 glc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -442,13 +440,12 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(i32 addrspa
 ; CI-LABEL: global_atomic_dec_noret_i32_offset_addr64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; CI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; CI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 2
+; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    v_mov_b32_e32 v3, s1
-; CI-NEXT:    v_mov_b32_e32 v2, s0
-; CI-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
-; CI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; CI-NEXT:    v_mov_b32_e32 v0, s0
+; CI-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; CI-NEXT:    v_mov_b32_e32 v1, s1
+; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, 20, v0
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
@@ -458,13 +455,12 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(i32 addrspa
 ; VI-LABEL: global_atomic_dec_noret_i32_offset_addr64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; VI-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 20, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
@@ -672,19 +668,18 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(i32* %out, i32*
 ; CI-LABEL: flat_atomic_dec_ret_i32_offset_addr64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; CI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 2
+; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    v_mov_b32_e32 v2, s2
-; CI-NEXT:    v_mov_b32_e32 v3, s3
-; CI-NEXT:    v_add_i32_e32 v4, vcc, v2, v0
-; CI-NEXT:    v_addc_u32_e32 v5, vcc, v3, v1, vcc
-; CI-NEXT:    v_mov_b32_e32 v3, s1
-; CI-NEXT:    v_mov_b32_e32 v2, s0
-; CI-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
-; CI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
-; CI-NEXT:    v_add_i32_e32 v2, vcc, 20, v4
-; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
+; CI-NEXT:    v_mov_b32_e32 v0, s2
+; CI-NEXT:    v_mov_b32_e32 v1, s3
+; CI-NEXT:    v_add_i32_e32 v3, vcc, v0, v2
+; CI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; CI-NEXT:    v_mov_b32_e32 v0, s0
+; CI-NEXT:    v_mov_b32_e32 v1, s1
+; CI-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT:    v_add_i32_e32 v2, vcc, 20, v3
+; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
 ; CI-NEXT:    v_mov_b32_e32 v4, 42
 ; CI-NEXT:    flat_atomic_dec v2, v[2:3], v4 glc
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -694,19 +689,18 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(i32* %out, i32*
 ; VI-LABEL: flat_atomic_dec_ret_i32_offset_addr64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; VI-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    v_mov_b32_e32 v3, s3
-; VI-NEXT:    v_add_u32_e32 v4, vcc, v2, v0
-; VI-NEXT:    v_addc_u32_e32 v5, vcc, v3, v1, vcc
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
-; VI-NEXT:    v_add_u32_e32 v2, vcc, 20, v4
-; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v3, vcc, v0, v2
+; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    v_add_u32_e32 v2, vcc, 20, v3
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
 ; VI-NEXT:    v_mov_b32_e32 v4, 42
 ; VI-NEXT:    flat_atomic_dec v2, v[2:3], v4 glc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -749,13 +743,12 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(i32* %ptr) #0
 ; CI-LABEL: flat_atomic_dec_noret_i32_offset_addr64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; CI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; CI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 2
+; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    v_mov_b32_e32 v3, s1
-; CI-NEXT:    v_mov_b32_e32 v2, s0
-; CI-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
-; CI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; CI-NEXT:    v_mov_b32_e32 v0, s0
+; CI-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; CI-NEXT:    v_mov_b32_e32 v1, s1
+; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, 20, v0
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
@@ -765,13 +758,12 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(i32* %ptr) #0
 ; VI-LABEL: flat_atomic_dec_noret_i32_offset_addr64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; VI-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 20, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
@@ -991,19 +983,18 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(i64* %out, i64*
 ; CI-LABEL: flat_atomic_dec_ret_i64_offset_addr64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; CI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 3
+; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    v_mov_b32_e32 v2, s2
-; CI-NEXT:    v_mov_b32_e32 v3, s3
-; CI-NEXT:    v_add_i32_e32 v4, vcc, v2, v0
-; CI-NEXT:    v_addc_u32_e32 v5, vcc, v3, v1, vcc
-; CI-NEXT:    v_mov_b32_e32 v3, s1
-; CI-NEXT:    v_mov_b32_e32 v2, s0
-; CI-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
-; CI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
-; CI-NEXT:    v_add_i32_e32 v2, vcc, 40, v4
-; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
+; CI-NEXT:    v_mov_b32_e32 v0, s2
+; CI-NEXT:    v_mov_b32_e32 v1, s3
+; CI-NEXT:    v_add_i32_e32 v3, vcc, v0, v2
+; CI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; CI-NEXT:    v_mov_b32_e32 v0, s0
+; CI-NEXT:    v_mov_b32_e32 v1, s1
+; CI-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT:    v_add_i32_e32 v2, vcc, 40, v3
+; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
 ; CI-NEXT:    v_mov_b32_e32 v4, 42
 ; CI-NEXT:    v_mov_b32_e32 v5, 0
 ; CI-NEXT:    flat_atomic_dec_x2 v[2:3], v[2:3], v[4:5] glc
@@ -1014,19 +1005,18 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(i64* %out, i64*
 ; VI-LABEL: flat_atomic_dec_ret_i64_offset_addr64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; VI-NEXT:    v_lshlrev_b64 v[0:1], 3, v[0:1]
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    v_mov_b32_e32 v3, s3
-; VI-NEXT:    v_add_u32_e32 v4, vcc, v2, v0
-; VI-NEXT:    v_addc_u32_e32 v5, vcc, v3, v1, vcc
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
-; VI-NEXT:    v_add_u32_e32 v2, vcc, 40, v4
-; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v3, vcc, v0, v2
+; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    v_add_u32_e32 v2, vcc, 40, v3
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
 ; VI-NEXT:    v_mov_b32_e32 v4, 42
 ; VI-NEXT:    v_mov_b32_e32 v5, 0
 ; VI-NEXT:    flat_atomic_dec_x2 v[2:3], v[2:3], v[4:5] glc
@@ -1071,13 +1061,12 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(i64* %ptr) #0
 ; CI-LABEL: flat_atomic_dec_noret_i64_offset_addr64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; CI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; CI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 3
+; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    v_mov_b32_e32 v3, s1
-; CI-NEXT:    v_mov_b32_e32 v2, s0
-; CI-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
-; CI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; CI-NEXT:    v_mov_b32_e32 v0, s0
+; CI-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; CI-NEXT:    v_mov_b32_e32 v1, s1
+; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, 40, v0
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
@@ -1088,13 +1077,12 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(i64* %ptr) #0
 ; VI-LABEL: flat_atomic_dec_noret_i64_offset_addr64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; VI-NEXT:    v_lshlrev_b64 v[0:1], 3, v[0:1]
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 40, v0
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
@@ -1556,19 +1544,18 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(i64 addrspace
 ; CI-LABEL: global_atomic_dec_ret_i64_offset_addr64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; CI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 3
+; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    v_mov_b32_e32 v2, s2
-; CI-NEXT:    v_mov_b32_e32 v3, s3
-; CI-NEXT:    v_add_i32_e32 v4, vcc, v2, v0
-; CI-NEXT:    v_addc_u32_e32 v5, vcc, v3, v1, vcc
-; CI-NEXT:    v_mov_b32_e32 v3, s1
-; CI-NEXT:    v_mov_b32_e32 v2, s0
-; CI-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
-; CI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
-; CI-NEXT:    v_add_i32_e32 v2, vcc, 40, v4
-; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
+; CI-NEXT:    v_mov_b32_e32 v0, s2
+; CI-NEXT:    v_mov_b32_e32 v1, s3
+; CI-NEXT:    v_add_i32_e32 v3, vcc, v0, v2
+; CI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; CI-NEXT:    v_mov_b32_e32 v0, s0
+; CI-NEXT:    v_mov_b32_e32 v1, s1
+; CI-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT:    v_add_i32_e32 v2, vcc, 40, v3
+; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
 ; CI-NEXT:    v_mov_b32_e32 v4, 42
 ; CI-NEXT:    v_mov_b32_e32 v5, 0
 ; CI-NEXT:    flat_atomic_dec_x2 v[2:3], v[2:3], v[4:5] glc
@@ -1579,19 +1566,18 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(i64 addrspace
 ; VI-LABEL: global_atomic_dec_ret_i64_offset_addr64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; VI-NEXT:    v_lshlrev_b64 v[0:1], 3, v[0:1]
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    v_mov_b32_e32 v3, s3
-; VI-NEXT:    v_add_u32_e32 v4, vcc, v2, v0
-; VI-NEXT:    v_addc_u32_e32 v5, vcc, v3, v1, vcc
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
-; VI-NEXT:    v_add_u32_e32 v2, vcc, 40, v4
-; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v3, vcc, v0, v2
+; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    v_add_u32_e32 v2, vcc, 40, v3
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
 ; VI-NEXT:    v_mov_b32_e32 v4, 42
 ; VI-NEXT:    v_mov_b32_e32 v5, 0
 ; VI-NEXT:    flat_atomic_dec_x2 v[2:3], v[2:3], v[4:5] glc
@@ -1636,13 +1622,12 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(i64 addrspa
 ; CI-LABEL: global_atomic_dec_noret_i64_offset_addr64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; CI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; CI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 3
+; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    v_mov_b32_e32 v3, s1
-; CI-NEXT:    v_mov_b32_e32 v2, s0
-; CI-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
-; CI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; CI-NEXT:    v_mov_b32_e32 v0, s0
+; CI-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; CI-NEXT:    v_mov_b32_e32 v1, s1
+; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, 40, v0
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
@@ -1653,13 +1638,12 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(i64 addrspa
 ; VI-LABEL: global_atomic_dec_noret_i64_offset_addr64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; VI-NEXT:    v_lshlrev_b64 v[0:1], 3, v[0:1]
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 40, v0
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll
index ce898c2a73d4..af02b77c51d7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll
@@ -365,19 +365,18 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(i32 addrspace
 ; CI-LABEL: global_atomic_inc_ret_i32_offset_addr64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; CI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 2
+; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    v_mov_b32_e32 v2, s2
-; CI-NEXT:    v_mov_b32_e32 v3, s3
-; CI-NEXT:    v_add_i32_e32 v4, vcc, v2, v0
-; CI-NEXT:    v_addc_u32_e32 v5, vcc, v3, v1, vcc
-; CI-NEXT:    v_mov_b32_e32 v3, s1
-; CI-NEXT:    v_mov_b32_e32 v2, s0
-; CI-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
-; CI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
-; CI-NEXT:    v_add_i32_e32 v2, vcc, 20, v4
-; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
+; CI-NEXT:    v_mov_b32_e32 v0, s2
+; CI-NEXT:    v_mov_b32_e32 v1, s3
+; CI-NEXT:    v_add_i32_e32 v3, vcc, v0, v2
+; CI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; CI-NEXT:    v_mov_b32_e32 v0, s0
+; CI-NEXT:    v_mov_b32_e32 v1, s1
+; CI-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT:    v_add_i32_e32 v2, vcc, 20, v3
+; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
 ; CI-NEXT:    v_mov_b32_e32 v4, 42
 ; CI-NEXT:    flat_atomic_inc v2, v[2:3], v4 glc
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -387,19 +386,18 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(i32 addrspace
 ; VI-LABEL: global_atomic_inc_ret_i32_offset_addr64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; VI-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    v_mov_b32_e32 v3, s3
-; VI-NEXT:    v_add_u32_e32 v4, vcc, v2, v0
-; VI-NEXT:    v_addc_u32_e32 v5, vcc, v3, v1, vcc
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
-; VI-NEXT:    v_add_u32_e32 v2, vcc, 20, v4
-; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v3, vcc, v0, v2
+; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    v_add_u32_e32 v2, vcc, 20, v3
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
 ; VI-NEXT:    v_mov_b32_e32 v4, 42
 ; VI-NEXT:    flat_atomic_inc v2, v[2:3], v4 glc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -409,23 +407,12 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(i32 addrspace
 ; GFX9-LABEL: global_atomic_inc_ret_i32_offset_addr64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 42
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v2, s2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s3
-; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v2, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v3, v1, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NEXT:    v_mov_b32_e32 v2, s0
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, 20, v4
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v5, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v4, 42
-; GFX9-NEXT:    global_atomic_inc v2, v[2:3], v4, off glc
+; GFX9-NEXT:    global_atomic_inc v1, v0, v1, s[2:3] offset:20 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.tid = getelementptr i32, i32 addrspace(1)* %ptr, i32 %id
@@ -440,13 +427,12 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(i32 addrspa
 ; CI-LABEL: global_atomic_inc_noret_i32_offset_addr64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; CI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; CI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 2
+; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    v_mov_b32_e32 v3, s1
-; CI-NEXT:    v_mov_b32_e32 v2, s0
-; CI-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
-; CI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; CI-NEXT:    v_mov_b32_e32 v0, s0
+; CI-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; CI-NEXT:    v_mov_b32_e32 v1, s1
+; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, 20, v0
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
@@ -456,13 +442,12 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(i32 addrspa
 ; VI-LABEL: global_atomic_inc_noret_i32_offset_addr64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; VI-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 20, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
@@ -472,17 +457,10 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(i32 addrspa
 ; GFX9-LABEL: global_atomic_inc_noret_i32_offset_addr64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 42
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NEXT:    v_mov_b32_e32 v2, s0
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 20, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v2, 42
-; GFX9-NEXT:    global_atomic_inc v0, v[0:1], v2, off glc
+; GFX9-NEXT:    global_atomic_inc v0, v0, v1, s[0:1] offset:20 glc
 ; GFX9-NEXT:    s_endpgm
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.tid = getelementptr i32, i32 addrspace(1)* %ptr, i32 %id
@@ -925,19 +903,18 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(i64 addrspace
 ; CI-LABEL: global_atomic_inc_ret_i64_offset_addr64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; CI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 3
+; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    v_mov_b32_e32 v2, s2
-; CI-NEXT:    v_mov_b32_e32 v3, s3
-; CI-NEXT:    v_add_i32_e32 v4, vcc, v2, v0
-; CI-NEXT:    v_addc_u32_e32 v5, vcc, v3, v1, vcc
-; CI-NEXT:    v_mov_b32_e32 v3, s1
-; CI-NEXT:    v_mov_b32_e32 v2, s0
-; CI-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
-; CI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
-; CI-NEXT:    v_add_i32_e32 v2, vcc, 40, v4
-; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
+; CI-NEXT:    v_mov_b32_e32 v0, s2
+; CI-NEXT:    v_mov_b32_e32 v1, s3
+; CI-NEXT:    v_add_i32_e32 v3, vcc, v0, v2
+; CI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; CI-NEXT:    v_mov_b32_e32 v0, s0
+; CI-NEXT:    v_mov_b32_e32 v1, s1
+; CI-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT:    v_add_i32_e32 v2, vcc, 40, v3
+; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
 ; CI-NEXT:    v_mov_b32_e32 v4, 42
 ; CI-NEXT:    v_mov_b32_e32 v5, 0
 ; CI-NEXT:    flat_atomic_inc_x2 v[2:3], v[2:3], v[4:5] glc
@@ -948,19 +925,18 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(i64 addrspace
 ; VI-LABEL: global_atomic_inc_ret_i64_offset_addr64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; VI-NEXT:    v_lshlrev_b64 v[0:1], 3, v[0:1]
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    v_mov_b32_e32 v3, s3
-; VI-NEXT:    v_add_u32_e32 v4, vcc, v2, v0
-; VI-NEXT:    v_addc_u32_e32 v5, vcc, v3, v1, vcc
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
-; VI-NEXT:    v_add_u32_e32 v2, vcc, 40, v4
-; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v3, vcc, v0, v2
+; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    v_add_u32_e32 v2, vcc, 40, v3
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
 ; VI-NEXT:    v_mov_b32_e32 v4, 42
 ; VI-NEXT:    v_mov_b32_e32 v5, 0
 ; VI-NEXT:    flat_atomic_inc_x2 v[2:3], v[2:3], v[4:5] glc
@@ -971,24 +947,13 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(i64 addrspace
 ; GFX9-LABEL: global_atomic_inc_ret_i64_offset_addr64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 3, v[0:1]
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 42
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v2, s2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s3
-; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v2, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v3, v1, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NEXT:    v_mov_b32_e32 v2, s0
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, 40, v4
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v5, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v4, 42
-; GFX9-NEXT:    v_mov_b32_e32 v5, 0
-; GFX9-NEXT:    global_atomic_inc_x2 v[2:3], v[2:3], v[4:5], off glc
+; GFX9-NEXT:    global_atomic_inc_x2 v[0:1], v2, v[0:1], s[2:3] offset:40 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.tid = getelementptr i64, i64 addrspace(1)* %ptr, i32 %id
@@ -1003,13 +968,12 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(i64 addrspa
 ; CI-LABEL: global_atomic_inc_noret_i64_offset_addr64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; CI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; CI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 3
+; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    v_mov_b32_e32 v3, s1
-; CI-NEXT:    v_mov_b32_e32 v2, s0
-; CI-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
-; CI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; CI-NEXT:    v_mov_b32_e32 v0, s0
+; CI-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; CI-NEXT:    v_mov_b32_e32 v1, s1
+; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, 40, v0
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
@@ -1020,13 +984,12 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(i64 addrspa
 ; VI-LABEL: global_atomic_inc_noret_i64_offset_addr64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; VI-NEXT:    v_lshlrev_b64 v[0:1], 3, v[0:1]
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 40, v0
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
@@ -1037,18 +1000,11 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(i64 addrspa
 ; GFX9-LABEL: global_atomic_inc_noret_i64_offset_addr64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 3, v[0:1]
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 42
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NEXT:    v_mov_b32_e32 v2, s0
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 40, v0
-; GFX9-NEXT:    v_mov_b32_e32 v2, 42
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0
-; GFX9-NEXT:    global_atomic_inc_x2 v[0:1], v[0:1], v[2:3], off glc
+; GFX9-NEXT:    global_atomic_inc_x2 v[0:1], v2, v[0:1], s[0:1] offset:40 glc
 ; GFX9-NEXT:    s_endpgm
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.tid = getelementptr i64, i64 addrspace(1)* %ptr, i32 %id
@@ -1133,19 +1089,18 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(i32* %out, i32*
 ; CI-LABEL: flat_atomic_inc_ret_i32_offset_addr64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; CI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 2
+; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    v_mov_b32_e32 v2, s2
-; CI-NEXT:    v_mov_b32_e32 v3, s3
-; CI-NEXT:    v_add_i32_e32 v4, vcc, v2, v0
-; CI-NEXT:    v_addc_u32_e32 v5, vcc, v3, v1, vcc
-; CI-NEXT:    v_mov_b32_e32 v3, s1
-; CI-NEXT:    v_mov_b32_e32 v2, s0
-; CI-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
-; CI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
-; CI-NEXT:    v_add_i32_e32 v2, vcc, 20, v4
-; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
+; CI-NEXT:    v_mov_b32_e32 v0, s2
+; CI-NEXT:    v_mov_b32_e32 v1, s3
+; CI-NEXT:    v_add_i32_e32 v3, vcc, v0, v2
+; CI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; CI-NEXT:    v_mov_b32_e32 v0, s0
+; CI-NEXT:    v_mov_b32_e32 v1, s1
+; CI-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT:    v_add_i32_e32 v2, vcc, 20, v3
+; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
 ; CI-NEXT:    v_mov_b32_e32 v4, 42
 ; CI-NEXT:    flat_atomic_inc v2, v[2:3], v4 glc
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1155,19 +1110,18 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(i32* %out, i32*
 ; VI-LABEL: flat_atomic_inc_ret_i32_offset_addr64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; VI-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    v_mov_b32_e32 v3, s3
-; VI-NEXT:    v_add_u32_e32 v4, vcc, v2, v0
-; VI-NEXT:    v_addc_u32_e32 v5, vcc, v3, v1, vcc
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
-; VI-NEXT:    v_add_u32_e32 v2, vcc, 20, v4
-; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v3, vcc, v0, v2
+; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    v_add_u32_e32 v2, vcc, 20, v3
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
 ; VI-NEXT:    v_mov_b32_e32 v4, 42
 ; VI-NEXT:    flat_atomic_inc v2, v[2:3], v4 glc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1177,19 +1131,18 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(i32* %out, i32*
 ; GFX9-LABEL: flat_atomic_inc_ret_i32_offset_addr64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v2, s2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s3
-; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v2, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v3, v1, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NEXT:    v_mov_b32_e32 v2, s0
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, 20, v4
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v5, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v0, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v1, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, 20, v3
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 42
 ; GFX9-NEXT:    flat_atomic_inc v2, v[2:3], v4 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1208,13 +1161,12 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(i32* %ptr) #0
 ; CI-LABEL: flat_atomic_inc_noret_i32_offset_addr64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; CI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; CI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 2
+; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    v_mov_b32_e32 v3, s1
-; CI-NEXT:    v_mov_b32_e32 v2, s0
-; CI-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
-; CI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; CI-NEXT:    v_mov_b32_e32 v0, s0
+; CI-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; CI-NEXT:    v_mov_b32_e32 v1, s1
+; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, 20, v0
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
@@ -1224,13 +1176,12 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(i32* %ptr) #0
 ; VI-LABEL: flat_atomic_inc_noret_i32_offset_addr64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; VI-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 20, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
@@ -1240,13 +1191,12 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(i32* %ptr) #0
 ; GFX9-LABEL: flat_atomic_inc_noret_i32_offset_addr64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NEXT:    v_mov_b32_e32 v2, s0
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 20, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 42
@@ -1406,19 +1356,18 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(i64* %out, i64*
 ; CI-LABEL: flat_atomic_inc_ret_i64_offset_addr64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; CI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 3
+; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    v_mov_b32_e32 v2, s2
-; CI-NEXT:    v_mov_b32_e32 v3, s3
-; CI-NEXT:    v_add_i32_e32 v4, vcc, v2, v0
-; CI-NEXT:    v_addc_u32_e32 v5, vcc, v3, v1, vcc
-; CI-NEXT:    v_mov_b32_e32 v3, s1
-; CI-NEXT:    v_mov_b32_e32 v2, s0
-; CI-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
-; CI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
-; CI-NEXT:    v_add_i32_e32 v2, vcc, 40, v4
-; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
+; CI-NEXT:    v_mov_b32_e32 v0, s2
+; CI-NEXT:    v_mov_b32_e32 v1, s3
+; CI-NEXT:    v_add_i32_e32 v3, vcc, v0, v2
+; CI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; CI-NEXT:    v_mov_b32_e32 v0, s0
+; CI-NEXT:    v_mov_b32_e32 v1, s1
+; CI-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT:    v_add_i32_e32 v2, vcc, 40, v3
+; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
 ; CI-NEXT:    v_mov_b32_e32 v4, 42
 ; CI-NEXT:    v_mov_b32_e32 v5, 0
 ; CI-NEXT:    flat_atomic_inc_x2 v[2:3], v[2:3], v[4:5] glc
@@ -1429,19 +1378,18 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(i64* %out, i64*
 ; VI-LABEL: flat_atomic_inc_ret_i64_offset_addr64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; VI-NEXT:    v_lshlrev_b64 v[0:1], 3, v[0:1]
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    v_mov_b32_e32 v3, s3
-; VI-NEXT:    v_add_u32_e32 v4, vcc, v2, v0
-; VI-NEXT:    v_addc_u32_e32 v5, vcc, v3, v1, vcc
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
-; VI-NEXT:    v_add_u32_e32 v2, vcc, 40, v4
-; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v3, vcc, v0, v2
+; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    v_add_u32_e32 v2, vcc, 40, v3
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
 ; VI-NEXT:    v_mov_b32_e32 v4, 42
 ; VI-NEXT:    v_mov_b32_e32 v5, 0
 ; VI-NEXT:    flat_atomic_inc_x2 v[2:3], v[2:3], v[4:5] glc
@@ -1452,19 +1400,18 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(i64* %out, i64*
 ; GFX9-LABEL: flat_atomic_inc_ret_i64_offset_addr64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 3, v[0:1]
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v2, s2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s3
-; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v2, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v3, v1, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NEXT:    v_mov_b32_e32 v2, s0
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, 40, v4
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v5, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v0, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v1, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, 40, v3
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 42
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-NEXT:    flat_atomic_inc_x2 v[2:3], v[2:3], v[4:5] glc
@@ -1484,13 +1431,12 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(i64* %ptr) #0
 ; CI-LABEL: flat_atomic_inc_noret_i64_offset_addr64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; CI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; CI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 3
+; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    v_mov_b32_e32 v3, s1
-; CI-NEXT:    v_mov_b32_e32 v2, s0
-; CI-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
-; CI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; CI-NEXT:    v_mov_b32_e32 v0, s0
+; CI-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; CI-NEXT:    v_mov_b32_e32 v1, s1
+; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, 40, v0
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
@@ -1501,13 +1447,12 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(i64* %ptr) #0
 ; VI-LABEL: flat_atomic_inc_noret_i64_offset_addr64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; VI-NEXT:    v_lshlrev_b64 v[0:1], 3, v[0:1]
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 40, v0
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
@@ -1518,13 +1463,12 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(i64* %ptr) #0
 ; GFX9-LABEL: flat_atomic_inc_noret_i64_offset_addr64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 3, v[0:1]
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NEXT:    v_mov_b32_e32 v2, s0
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 40, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
index 80f86c6b1f50..a122a1b2e39d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
@@ -840,9 +840,9 @@ define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(float addrspace
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
 ; GFX7-NEXT:    s_load_dword s8, s[0:1], 0x15
-; GFX7-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GFX7-NEXT:    v_lshl_b64 v[1:2], v[0:1], 2
 ; GFX7-NEXT:    s_mov_b32 s2, 0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GFX7-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX7-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b64 s[0:1], s[6:7]
@@ -866,13 +866,12 @@ define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(float addrspace
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x54
-; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GFX8-NEXT:    v_lshlrev_b64 v[1:2], 2, v[0:1]
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 2, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v3, s6
-; GFX8-NEXT:    v_mov_b32_e32 v4, s7
-; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v3, v1
-; GFX8-NEXT:    v_addc_u32_e32 v2, vcc, v4, v2, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v1, s6
+; GFX8-NEXT:    v_mov_b32_e32 v2, s7
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v3
+; GFX8-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 4, v1
 ; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v2, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 8, v1
@@ -899,31 +898,24 @@ define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(float addrspace
 ; GFX10_W32-LABEL: test_div_fmas_f32_logical_cond_to_vcc:
 ; GFX10_W32:       ; %bb.0:
 ; GFX10_W32-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10_W32-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX10_W32-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
 ; GFX10_W32-NEXT:    s_load_dword s2, s[0:1], 0x54
+; GFX10_W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10_W32-NEXT:    ; implicit-def: $vcc_hi
-; GFX10_W32-NEXT:    v_lshlrev_b64 v[1:2], 2, v[0:1]
 ; GFX10_W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10_W32-NEXT:    v_mov_b32_e32 v3, s6
-; GFX10_W32-NEXT:    v_mov_b32_e32 v4, s7
+; GFX10_W32-NEXT:    s_clause 0x2
+; GFX10_W32-NEXT:    global_load_dword v2, v1, s[6:7]
+; GFX10_W32-NEXT:    global_load_dword v3, v1, s[6:7] offset:4
+; GFX10_W32-NEXT:    global_load_dword v1, v1, s[6:7] offset:8
 ; GFX10_W32-NEXT:    s_add_u32 s0, s4, 8
 ; GFX10_W32-NEXT:    s_addc_u32 s1, s5, 0
 ; GFX10_W32-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX10_W32-NEXT:    v_add_co_u32_e64 v1, vcc_lo, v3, v1
 ; GFX10_W32-NEXT:    s_cselect_b32 s2, 1, 0
-; GFX10_W32-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v4, v2, vcc_lo
 ; GFX10_W32-NEXT:    s_and_b32 s2, 1, s2
-; GFX10_W32-NEXT:    v_add_co_u32_e64 v3, vcc_lo, v1, 8
 ; GFX10_W32-NEXT:    v_cmp_ne_u32_e64 s2, 0, s2
-; GFX10_W32-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v2, vcc_lo
-; GFX10_W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10_W32-NEXT:    s_clause 0x2
-; GFX10_W32-NEXT:    global_load_dword v1, v[1:2], off
-; GFX10_W32-NEXT:    global_load_dword v2, v[3:4], off offset:-4
-; GFX10_W32-NEXT:    global_load_dword v3, v[3:4], off
 ; GFX10_W32-NEXT:    s_and_b32 vcc_lo, vcc_lo, s2
 ; GFX10_W32-NEXT:    s_waitcnt vmcnt(0)
-; GFX10_W32-NEXT:    v_div_fmas_f32 v2, v1, v2, v3
+; GFX10_W32-NEXT:    v_div_fmas_f32 v2, v2, v3, v1
 ; GFX10_W32-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10_W32-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10_W32-NEXT:    global_store_dword v[0:1], v2, off
@@ -932,30 +924,23 @@ define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(float addrspace
 ; GFX10_W64-LABEL: test_div_fmas_f32_logical_cond_to_vcc:
 ; GFX10_W64:       ; %bb.0:
 ; GFX10_W64-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10_W64-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX10_W64-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
 ; GFX10_W64-NEXT:    s_load_dword s2, s[0:1], 0x54
-; GFX10_W64-NEXT:    v_lshlrev_b64 v[1:2], 2, v[0:1]
+; GFX10_W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX10_W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10_W64-NEXT:    v_mov_b32_e32 v3, s6
-; GFX10_W64-NEXT:    v_mov_b32_e32 v4, s7
+; GFX10_W64-NEXT:    s_clause 0x2
+; GFX10_W64-NEXT:    global_load_dword v2, v1, s[6:7]
+; GFX10_W64-NEXT:    global_load_dword v3, v1, s[6:7] offset:4
+; GFX10_W64-NEXT:    global_load_dword v1, v1, s[6:7] offset:8
 ; GFX10_W64-NEXT:    s_add_u32 s0, s4, 8
 ; GFX10_W64-NEXT:    s_addc_u32 s1, s5, 0
 ; GFX10_W64-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX10_W64-NEXT:    v_add_co_u32_e64 v1, vcc, v3, v1
 ; GFX10_W64-NEXT:    s_cselect_b32 s2, 1, 0
-; GFX10_W64-NEXT:    v_add_co_ci_u32_e32 v2, vcc, v4, v2, vcc
 ; GFX10_W64-NEXT:    s_and_b32 s2, 1, s2
-; GFX10_W64-NEXT:    v_add_co_u32_e64 v3, vcc, v1, 8
 ; GFX10_W64-NEXT:    v_cmp_ne_u32_e64 s[2:3], 0, s2
-; GFX10_W64-NEXT:    v_add_co_ci_u32_e32 v4, vcc, 0, v2, vcc
-; GFX10_W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX10_W64-NEXT:    s_clause 0x2
-; GFX10_W64-NEXT:    global_load_dword v1, v[1:2], off
-; GFX10_W64-NEXT:    global_load_dword v2, v[3:4], off offset:-4
-; GFX10_W64-NEXT:    global_load_dword v3, v[3:4], off
 ; GFX10_W64-NEXT:    s_and_b64 vcc, vcc, s[2:3]
 ; GFX10_W64-NEXT:    s_waitcnt vmcnt(0)
-; GFX10_W64-NEXT:    v_div_fmas_f32 v2, v1, v2, v3
+; GFX10_W64-NEXT:    v_div_fmas_f32 v2, v2, v3, v1
 ; GFX10_W64-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10_W64-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10_W64-NEXT:    global_store_dword v[0:1], v2, off
@@ -984,8 +969,8 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x13
-; GFX7-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GFX7-NEXT:    v_lshl_b64 v[1:2], v[0:1], 2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GFX7-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX7-NEXT:    s_mov_b32 s10, 0
 ; GFX7-NEXT:    s_mov_b32 s11, 0xf000
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1017,14 +1002,13 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out
 ; GFX8:       ; %bb.0: ; %entry
 ; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
 ; GFX8-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x4c
-; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GFX8-NEXT:    v_lshlrev_b64 v[1:2], 2, v[0:1]
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 2, v0
 ; GFX8-NEXT:    s_mov_b32 s2, 0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v3, s6
-; GFX8-NEXT:    v_mov_b32_e32 v4, s7
-; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v3, v1
-; GFX8-NEXT:    v_addc_u32_e32 v2, vcc, v4, v2, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v1, s6
+; GFX8-NEXT:    v_mov_b32_e32 v2, s7
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v3
+; GFX8-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
 ; GFX8-NEXT:    flat_load_dwordx3 v[1:3], v[1:2]
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX8-NEXT:    s_and_saveexec_b64 s[6:7], vcc
@@ -1053,18 +1037,14 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out
 ; GFX10_W32-LABEL: test_div_fmas_f32_i1_phi_vcc:
 ; GFX10_W32:       ; %bb.0: ; %entry
 ; GFX10_W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; GFX10_W32-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX10_W32-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GFX10_W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10_W32-NEXT:    s_mov_b32 s4, 0
 ; GFX10_W32-NEXT:    ; implicit-def: $vcc_hi
-; GFX10_W32-NEXT:    v_lshlrev_b64 v[1:2], 2, v[0:1]
 ; GFX10_W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10_W32-NEXT:    v_mov_b32_e32 v4, s3
-; GFX10_W32-NEXT:    v_mov_b32_e32 v3, s2
+; GFX10_W32-NEXT:    global_load_dwordx3 v[1:3], v1, s[2:3]
+; GFX10_W32-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10_W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX10_W32-NEXT:    v_add_co_u32_e64 v1, vcc_lo, v3, v1
-; GFX10_W32-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v4, v2, vcc_lo
-; GFX10_W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10_W32-NEXT:    global_load_dwordx3 v[1:3], v[1:2], off
 ; GFX10_W32-NEXT:    s_and_saveexec_b32 s5, vcc_lo
 ; GFX10_W32-NEXT:    s_cbranch_execz BB13_2
 ; GFX10_W32-NEXT:  ; %bb.1: ; %bb
@@ -1075,7 +1055,6 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out
 ; GFX10_W32-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX10_W32-NEXT:    s_cselect_b32 s4, 1, 0
 ; GFX10_W32-NEXT:  BB13_2: ; %exit
-; GFX10_W32-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10_W32-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX10_W32-NEXT:    s_and_b32 s0, 1, s4
 ; GFX10_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
@@ -1092,17 +1071,13 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out
 ; GFX10_W64-LABEL: test_div_fmas_f32_i1_phi_vcc:
 ; GFX10_W64:       ; %bb.0: ; %entry
 ; GFX10_W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; GFX10_W64-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX10_W64-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GFX10_W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX10_W64-NEXT:    s_mov_b32 s6, 0
-; GFX10_W64-NEXT:    v_lshlrev_b64 v[1:2], 2, v[0:1]
 ; GFX10_W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10_W64-NEXT:    v_mov_b32_e32 v4, s3
-; GFX10_W64-NEXT:    v_mov_b32_e32 v3, s2
+; GFX10_W64-NEXT:    global_load_dwordx3 v[1:3], v1, s[2:3]
+; GFX10_W64-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10_W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX10_W64-NEXT:    v_add_co_u32_e64 v1, vcc, v3, v1
-; GFX10_W64-NEXT:    v_add_co_ci_u32_e32 v2, vcc, v4, v2, vcc
-; GFX10_W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX10_W64-NEXT:    global_load_dwordx3 v[1:3], v[1:2], off
 ; GFX10_W64-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX10_W64-NEXT:    s_cbranch_execz BB13_2
 ; GFX10_W64-NEXT:  ; %bb.1: ; %bb
@@ -1113,7 +1088,6 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out
 ; GFX10_W64-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX10_W64-NEXT:    s_cselect_b32 s6, 1, 0
 ; GFX10_W64-NEXT:  BB13_2: ; %exit
-; GFX10_W64-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10_W64-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX10_W64-NEXT:    s_and_b32 s0, 1, s6
 ; GFX10_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll
index de2b58aa6fed..cd427c7d00d0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll
@@ -8,8 +8,8 @@ define amdgpu_kernel void @test_div_scale_f32_1(float addrspace(1)* %out, float
 ; GFX7-LABEL: test_div_scale_f32_1:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; GFX7-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GFX7-NEXT:    v_lshl_b64 v[0:1], v[0:1], 2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -26,13 +26,12 @@ define amdgpu_kernel void @test_div_scale_f32_1(float addrspace(1)* %out, float
 ; GFX8-LABEL: test_div_scale_f32_1:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GFX8-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v2, s2
-; GFX8-NEXT:    v_mov_b32_e32 v3, s3
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 4, v0
 ; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dword v0, v[0:1]
@@ -47,21 +46,14 @@ define amdgpu_kernel void @test_div_scale_f32_1(float addrspace(1)* %out, float
 ; GFX10-LABEL: test_div_scale_f32_1:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, v2, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
-; GFX10-NEXT:    v_add_co_u32_e64 v2, vcc_lo, v0, 4
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_dword v0, v[0:1], off
-; GFX10-NEXT:    global_load_dword v1, v[2:3], off
+; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] offset:4
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_div_scale_f32 v2, s2, v1, v1, v0
+; GFX10-NEXT:    v_div_scale_f32 v2, s2, v0, v0, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-NEXT:    global_store_dword v[0:1], v2, off
@@ -83,8 +75,8 @@ define amdgpu_kernel void @test_div_scale_f32_2(float addrspace(1)* %out, float
 ; GFX7-LABEL: test_div_scale_f32_2:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; GFX7-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GFX7-NEXT:    v_lshl_b64 v[0:1], v[0:1], 2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -101,13 +93,12 @@ define amdgpu_kernel void @test_div_scale_f32_2(float addrspace(1)* %out, float
 ; GFX8-LABEL: test_div_scale_f32_2:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GFX8-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v2, s2
-; GFX8-NEXT:    v_mov_b32_e32 v3, s3
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 4, v0
 ; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dword v0, v[0:1]
@@ -122,21 +113,14 @@ define amdgpu_kernel void @test_div_scale_f32_2(float addrspace(1)* %out, float
 ; GFX10-LABEL: test_div_scale_f32_2:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, v2, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
-; GFX10-NEXT:    v_add_co_u32_e64 v2, vcc_lo, v0, 4
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_dword v0, v[0:1], off
-; GFX10-NEXT:    global_load_dword v1, v[2:3], off
+; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] offset:4
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_div_scale_f32 v2, s2, v0, v1, v0
+; GFX10-NEXT:    v_div_scale_f32 v2, s2, v1, v0, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-NEXT:    global_store_dword v[0:1], v2, off
@@ -158,14 +142,13 @@ define amdgpu_kernel void @test_div_scale_f64_1(double addrspace(1)* %out, doubl
 ; GFX7-LABEL: test_div_scale_f64_1:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
-; GFX7-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GFX7-NEXT:    v_lshl_b64 v[0:1], v[0:1], 3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
-; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; GFX7-NEXT:    v_mov_b32_e32 v0, s2
+; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 8, v0
 ; GFX7-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
 ; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
@@ -180,14 +163,13 @@ define amdgpu_kernel void @test_div_scale_f64_1(double addrspace(1)* %out, doubl
 ; GFX8-LABEL: test_div_scale_f64_1:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GFX8-NEXT:    v_lshlrev_b64 v[0:1], 3, v[0:1]
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v2, s2
-; GFX8-NEXT:    v_mov_b32_e32 v3, s3
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 8, v0
 ; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
@@ -202,20 +184,13 @@ define amdgpu_kernel void @test_div_scale_f64_1(double addrspace(1)* %out, doubl
 ; GFX10-LABEL: test_div_scale_f64_1:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX10-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 3, v[0:1]
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, v2, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
-; GFX10-NEXT:    v_add_co_u32_e64 v2, vcc_lo, v0, 8
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
-; GFX10-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX10-NEXT:    global_load_dwordx2 v[2:3], v2, s[2:3] offset:8
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_div_scale_f64 v[0:1], s2, v[2:3], v[2:3], v[0:1]
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s1
@@ -239,14 +214,13 @@ define amdgpu_kernel void @test_div_scale_f64_2(double addrspace(1)* %out, doubl
 ; GFX7-LABEL: test_div_scale_f64_2:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
-; GFX7-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GFX7-NEXT:    v_lshl_b64 v[0:1], v[0:1], 3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
-; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; GFX7-NEXT:    v_mov_b32_e32 v0, s2
+; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 8, v0
 ; GFX7-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
 ; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
@@ -261,14 +235,13 @@ define amdgpu_kernel void @test_div_scale_f64_2(double addrspace(1)* %out, doubl
 ; GFX8-LABEL: test_div_scale_f64_2:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GFX8-NEXT:    v_lshlrev_b64 v[0:1], 3, v[0:1]
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v2, s2
-; GFX8-NEXT:    v_mov_b32_e32 v3, s3
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 8, v0
 ; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
@@ -283,20 +256,13 @@ define amdgpu_kernel void @test_div_scale_f64_2(double addrspace(1)* %out, doubl
 ; GFX10-LABEL: test_div_scale_f64_2:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX10-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 3, v[0:1]
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, v2, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
-; GFX10-NEXT:    v_add_co_u32_e64 v2, vcc_lo, v0, 8
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
-; GFX10-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX10-NEXT:    global_load_dwordx2 v[2:3], v2, s[2:3] offset:8
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_div_scale_f64 v[0:1], s2, v[0:1], v[2:3], v[0:1]
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s1
@@ -321,8 +287,8 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_num_1(float addrspace(1)* %
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
 ; GFX7-NEXT:    s_load_dword s8, s[0:1], 0x15
-; GFX7-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GFX7-NEXT:    v_lshl_b64 v[0:1], v[0:1], 2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX7-NEXT:    s_mov_b32 s2, 0
 ; GFX7-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -339,13 +305,12 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_num_1(float addrspace(1)* %
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT:    s_load_dword s0, s[0:1], 0x54
-; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GFX8-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v2, s6
-; GFX8-NEXT:    v_mov_b32_e32 v3, s7
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v0, s6
+; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dword v0, v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_div_scale_f32 v2, s[0:1], v0, v0, s0
@@ -357,16 +322,11 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_num_1(float addrspace(1)* %
 ; GFX10-LABEL: test_div_scale_f32_scalar_num_1:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x54
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v2, s6
-; GFX10-NEXT:    v_mov_b32_e32 v3, s7
-; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, v2, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
-; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_div_scale_f32 v2, s0, v0, v0, s0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s4
@@ -389,8 +349,8 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_num_2(float addrspace(1)* %
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
 ; GFX7-NEXT:    s_load_dword s8, s[0:1], 0xd
-; GFX7-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GFX7-NEXT:    v_lshl_b64 v[0:1], v[0:1], 2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX7-NEXT:    s_mov_b32 s2, 0
 ; GFX7-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -407,13 +367,12 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_num_2(float addrspace(1)* %
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT:    s_load_dword s0, s[0:1], 0x34
-; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GFX8-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v2, s6
-; GFX8-NEXT:    v_mov_b32_e32 v3, s7
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v0, s6
+; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dword v0, v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_div_scale_f32 v2, s[0:1], s0, v0, s0
@@ -425,16 +384,11 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_num_2(float addrspace(1)* %
 ; GFX10-LABEL: test_div_scale_f32_scalar_num_2:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x34
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v2, s6
-; GFX10-NEXT:    v_mov_b32_e32 v3, s7
-; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, v2, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
-; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_div_scale_f32 v2, s0, s0, v0, s0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s4
@@ -457,8 +411,8 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_den_1(float addrspace(1)* %
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
 ; GFX7-NEXT:    s_load_dword s8, s[0:1], 0xd
-; GFX7-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GFX7-NEXT:    v_lshl_b64 v[0:1], v[0:1], 2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX7-NEXT:    s_mov_b32 s2, 0
 ; GFX7-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -475,13 +429,12 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_den_1(float addrspace(1)* %
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT:    s_load_dword s0, s[0:1], 0x34
-; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GFX8-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v2, s6
-; GFX8-NEXT:    v_mov_b32_e32 v3, s7
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v0, s6
+; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dword v0, v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_div_scale_f32 v2, s[0:1], s0, s0, v0
@@ -493,16 +446,11 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_den_1(float addrspace(1)* %
 ; GFX10-LABEL: test_div_scale_f32_scalar_den_1:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x34
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v2, s6
-; GFX10-NEXT:    v_mov_b32_e32 v3, s7
-; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, v2, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
-; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_div_scale_f32 v2, s0, s0, s0, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s4
@@ -525,8 +473,8 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_den_2(float addrspace(1)* %
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
 ; GFX7-NEXT:    s_load_dword s8, s[0:1], 0xd
-; GFX7-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GFX7-NEXT:    v_lshl_b64 v[0:1], v[0:1], 2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX7-NEXT:    s_mov_b32 s2, 0
 ; GFX7-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -543,13 +491,12 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_den_2(float addrspace(1)* %
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT:    s_load_dword s0, s[0:1], 0x34
-; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GFX8-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v2, s6
-; GFX8-NEXT:    v_mov_b32_e32 v3, s7
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v0, s6
+; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dword v0, v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_div_scale_f32 v2, s[0:1], v0, s0, v0
@@ -561,16 +508,11 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_den_2(float addrspace(1)* %
 ; GFX10-LABEL: test_div_scale_f32_scalar_den_2:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x34
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v2, s6
-; GFX10-NEXT:    v_mov_b32_e32 v3, s7
-; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, v2, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
-; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_div_scale_f32 v2, s0, v0, s0, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s4
@@ -593,13 +535,12 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_num_1(double addrspace(1)*
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
 ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x15
-; GFX7-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GFX7-NEXT:    v_lshl_b64 v[0:1], v[0:1], 3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v2, s6
-; GFX7-NEXT:    v_mov_b32_e32 v3, s7
-; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
-; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s5
@@ -612,13 +553,12 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_num_1(double addrspace(1)*
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x54
-; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GFX8-NEXT:    v_lshlrev_b64 v[0:1], 3, v[0:1]
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v2, s6
-; GFX8-NEXT:    v_mov_b32_e32 v3, s7
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v0, s6
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s5
@@ -630,18 +570,13 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_num_1(double addrspace(1)*
 ; GFX10-LABEL: test_div_scale_f64_scalar_num_1:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x54
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 3, v[0:1]
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v2, s6
-; GFX10-NEXT:    v_mov_b32_e32 v3, s7
-; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, v2, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v0, s[6:7]
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s5
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_div_scale_f64 v[0:1], s0, v[0:1], v[0:1], s[0:1]
 ; GFX10-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
@@ -662,13 +597,12 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_num_2(double addrspace(1)*
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
 ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x15
-; GFX7-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GFX7-NEXT:    v_lshl_b64 v[0:1], v[0:1], 3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v2, s6
-; GFX7-NEXT:    v_mov_b32_e32 v3, s7
-; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
-; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s5
@@ -681,13 +615,12 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_num_2(double addrspace(1)*
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x54
-; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GFX8-NEXT:    v_lshlrev_b64 v[0:1], 3, v[0:1]
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v2, s6
-; GFX8-NEXT:    v_mov_b32_e32 v3, s7
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v0, s6
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s5
@@ -699,18 +632,13 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_num_2(double addrspace(1)*
 ; GFX10-LABEL: test_div_scale_f64_scalar_num_2:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x54
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 3, v[0:1]
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v2, s6
-; GFX10-NEXT:    v_mov_b32_e32 v3, s7
-; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, v2, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v0, s[6:7]
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s5
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_div_scale_f64 v[0:1], s0, s[0:1], v[0:1], s[0:1]
 ; GFX10-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
@@ -731,13 +659,12 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_den_1(double addrspace(1)*
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
 ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x15
-; GFX7-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GFX7-NEXT:    v_lshl_b64 v[0:1], v[0:1], 3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v2, s6
-; GFX7-NEXT:    v_mov_b32_e32 v3, s7
-; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
-; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s5
@@ -750,13 +677,12 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_den_1(double addrspace(1)*
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x54
-; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GFX8-NEXT:    v_lshlrev_b64 v[0:1], 3, v[0:1]
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v2, s6
-; GFX8-NEXT:    v_mov_b32_e32 v3, s7
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v0, s6
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s5
@@ -768,18 +694,13 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_den_1(double addrspace(1)*
 ; GFX10-LABEL: test_div_scale_f64_scalar_den_1:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x54
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 3, v[0:1]
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v2, s6
-; GFX10-NEXT:    v_mov_b32_e32 v3, s7
-; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, v2, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v0, s[6:7]
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s5
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_div_scale_f64 v[0:1], s0, s[0:1], s[0:1], v[0:1]
 ; GFX10-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
@@ -800,13 +721,12 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_den_2(double addrspace(1)*
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
 ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x15
-; GFX7-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GFX7-NEXT:    v_lshl_b64 v[0:1], v[0:1], 3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v2, s6
-; GFX7-NEXT:    v_mov_b32_e32 v3, s7
-; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
-; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s5
@@ -819,13 +739,12 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_den_2(double addrspace(1)*
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x54
-; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GFX8-NEXT:    v_lshlrev_b64 v[0:1], 3, v[0:1]
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v2, s6
-; GFX8-NEXT:    v_mov_b32_e32 v3, s7
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v0, s6
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s5
@@ -837,18 +756,13 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_den_2(double addrspace(1)*
 ; GFX10-LABEL: test_div_scale_f64_scalar_den_2:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x54
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 3, v[0:1]
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v2, s6
-; GFX10-NEXT:    v_mov_b32_e32 v3, s7
-; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, v2, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v0, s[6:7]
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s5
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_div_scale_f64 v[0:1], s0, v[0:1], s[0:1], v[0:1]
 ; GFX10-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
@@ -1056,8 +970,8 @@ define amdgpu_kernel void @test_div_scale_f32_inline_imm_num(float addrspace(1)*
 ; GFX7-LABEL: test_div_scale_f32_inline_imm_num:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; GFX7-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GFX7-NEXT:    v_lshl_b64 v[0:1], v[0:1], 2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1073,13 +987,12 @@ define amdgpu_kernel void @test_div_scale_f32_inline_imm_num(float addrspace(1)*
 ; GFX8-LABEL: test_div_scale_f32_inline_imm_num:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GFX8-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v2, s2
-; GFX8-NEXT:    v_mov_b32_e32 v3, s3
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dword v0, v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_div_scale_f32 v2, s[2:3], v0, v0, 1.0
@@ -1091,15 +1004,10 @@ define amdgpu_kernel void @test_div_scale_f32_inline_imm_num(float addrspace(1)*
 ; GFX10-LABEL: test_div_scale_f32_inline_imm_num:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, v2, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
-; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_div_scale_f32 v2, s2, v0, v0, 1.0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s0
@@ -1120,8 +1028,8 @@ define amdgpu_kernel void @test_div_scale_f32_inline_imm_den(float addrspace(1)*
 ; GFX7-LABEL: test_div_scale_f32_inline_imm_den:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; GFX7-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GFX7-NEXT:    v_lshl_b64 v[0:1], v[0:1], 2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1137,13 +1045,12 @@ define amdgpu_kernel void @test_div_scale_f32_inline_imm_den(float addrspace(1)*
 ; GFX8-LABEL: test_div_scale_f32_inline_imm_den:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GFX8-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v2, s2
-; GFX8-NEXT:    v_mov_b32_e32 v3, s3
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dword v0, v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_div_scale_f32 v2, s[2:3], 2.0, 2.0, v0
@@ -1155,15 +1062,10 @@ define amdgpu_kernel void @test_div_scale_f32_inline_imm_den(float addrspace(1)*
 ; GFX10-LABEL: test_div_scale_f32_inline_imm_den:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, v2, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
-; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_div_scale_f32 v2, s2, 2.0, 2.0, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s0
@@ -1184,8 +1086,8 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_num(float addrspace(1)* %out,
 ; GFX7-LABEL: test_div_scale_f32_fabs_num:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; GFX7-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GFX7-NEXT:    v_lshl_b64 v[0:1], v[0:1], 2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1204,13 +1106,12 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_num(float addrspace(1)* %out,
 ; GFX8-LABEL: test_div_scale_f32_fabs_num:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GFX8-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v2, s2
-; GFX8-NEXT:    v_mov_b32_e32 v3, s3
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 4, v0
 ; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dword v0, v[0:1]
@@ -1227,23 +1128,16 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_num(float addrspace(1)* %out,
 ; GFX10-LABEL: test_div_scale_f32_fabs_num:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, v2, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
-; GFX10-NEXT:    v_add_co_u32_e64 v2, vcc_lo, v0, 4
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_dword v0, v[0:1], off
-; GFX10-NEXT:    global_load_dword v1, v[2:3], off
+; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] offset:4
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
+; GFX10-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_div_scale_f32 v2, s2, v1, v1, v0
+; GFX10-NEXT:    v_div_scale_f32 v2, s2, v0, v0, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-NEXT:    global_store_dword v[0:1], v2, off
@@ -1267,8 +1161,8 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_den(float addrspace(1)* %out,
 ; GFX7-LABEL: test_div_scale_f32_fabs_den:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; GFX7-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GFX7-NEXT:    v_lshl_b64 v[0:1], v[0:1], 2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1286,13 +1180,12 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_den(float addrspace(1)* %out,
 ; GFX8-LABEL: test_div_scale_f32_fabs_den:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GFX8-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v2, s2
-; GFX8-NEXT:    v_mov_b32_e32 v3, s3
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 4, v0
 ; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dword v0, v[0:1]
@@ -1308,22 +1201,15 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_den(float addrspace(1)* %out,
 ; GFX10-LABEL: test_div_scale_f32_fabs_den:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, v2, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
-; GFX10-NEXT:    v_add_co_u32_e64 v2, vcc_lo, v0, 4
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_dword v0, v[0:1], off
-; GFX10-NEXT:    global_load_dword v1, v[2:3], off
+; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] offset:4
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v1
-; GFX10-NEXT:    v_div_scale_f32 v2, s2, v1, v1, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
+; GFX10-NEXT:    v_div_scale_f32 v2, s2, v0, v0, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-NEXT:    global_store_dword v[0:1], v2, off

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll
index 3dedbec19679..88c82b1c3f7c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll
@@ -8,13 +8,12 @@ define amdgpu_kernel void @is_private_vgpr(i8* addrspace(1)* %ptr.ptr) {
 ; CI-LABEL: is_private_vgpr:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; CI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 3
+; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    v_mov_b32_e32 v3, s1
-; CI-NEXT:    v_mov_b32_e32 v2, s0
-; CI-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
-; CI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; CI-NEXT:    v_mov_b32_e32 v0, s0
+; CI-NEXT:    v_mov_b32_e32 v1, s1
+; CI-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; CI-NEXT:    s_load_dword s0, s[4:5], 0x11
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -26,14 +25,9 @@ define amdgpu_kernel void @is_private_vgpr(i8* addrspace(1)* %ptr.ptr) {
 ; GFX9-LABEL: is_private_vgpr:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 3, v[0:1]
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NEXT:    v_mov_b32_e32 v2, s0
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v0, s[0:1]
 ; GFX9-NEXT:    s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 0, 16)
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll
index f0eb57cef219..ec477c9925c9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll
@@ -8,13 +8,12 @@ define amdgpu_kernel void @is_local_vgpr(i8* addrspace(1)* %ptr.ptr) {
 ; CI-LABEL: is_local_vgpr:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; CI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 3
+; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    v_mov_b32_e32 v3, s1
-; CI-NEXT:    v_mov_b32_e32 v2, s0
-; CI-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
-; CI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; CI-NEXT:    v_mov_b32_e32 v0, s0
+; CI-NEXT:    v_mov_b32_e32 v1, s1
+; CI-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; CI-NEXT:    s_load_dword s0, s[4:5], 0x10
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -26,14 +25,9 @@ define amdgpu_kernel void @is_local_vgpr(i8* addrspace(1)* %ptr.ptr) {
 ; GFX9-LABEL: is_local_vgpr:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 3, v[0:1]
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NEXT:    v_mov_b32_e32 v2, s0
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v0, s[0:1]
 ; GFX9-NEXT:    s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 16, 16)
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll
index 05717460bade..8eab3e78b0d5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll
@@ -39,13 +39,12 @@ define amdgpu_kernel void @update_dpp64_test(i64 addrspace(1)* %arg, i64 %in1, i
 ; GFX8-LABEL: update_dpp64_test:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GFX8-NEXT:    v_lshlrev_b64 v[0:1], 3, v[0:1]
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v3, s1
-; GFX8-NEXT:    v_mov_b32_e32 v2, s0
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
 ; GFX8-NEXT:    v_mov_b32_e32 v5, s3
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s2
@@ -59,21 +58,16 @@ define amdgpu_kernel void @update_dpp64_test(i64 addrspace(1)* %arg, i64 %in1, i
 ; GFX10-LABEL: update_dpp64_test:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 3, v[0:1]
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v3, s1
-; GFX10-NEXT:    v_mov_b32_e32 v2, s0
-; GFX10-NEXT:    v_mov_b32_e32 v5, s3
-; GFX10-NEXT:    v_mov_b32_e32 v4, s2
-; GFX10-NEXT:    v_add_co_u32_e64 v6, vcc_lo, v2, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, v3, v1, vcc_lo
-; GFX10-NEXT:    global_load_dwordx2 v[2:3], v[6:7], off
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[0:1]
+; GFX10-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
-; GFX10-NEXT:    v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
-; GFX10-NEXT:    global_store_dwordx2 v[6:7], v[4:5], off
+; GFX10-NEXT:    v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX10-NEXT:    v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX10-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1]
 ; GFX10-NEXT:    s_endpgm
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %id

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll
index 375b8713c4ae..95a32223ac3f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll
@@ -202,72 +202,61 @@ define amdgpu_kernel void @muli24_shl64(i64 addrspace(1)* nocapture %arg, i32 ad
 ; GFX7-LABEL: muli24_shl64:
 ; GFX7:       ; %bb.0: ; %bb
 ; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; GFX7-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GFX7-NEXT:    v_lshl_b64 v[2:3], v[0:1], 2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GFX7-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[2:3]
-; GFX7-NEXT:    buffer_load_dword v7, v[2:3], s[4:7], 0 addr64
-; GFX7-NEXT:    v_lshl_b64 v[3:4], v[0:1], 3
-; GFX7-NEXT:    v_mov_b32_e32 v6, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, 0
-; GFX7-NEXT:    v_mov_b32_e32 v5, s0
+; GFX7-NEXT:    buffer_load_dword v1, v[1:2], s[4:7], 0 addr64
+; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v4, s1
+; GFX7-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_or_b32_e32 v0, 0xff800000, v7
+; GFX7-NEXT:    v_or_b32_e32 v0, 0xff800000, v1
 ; GFX7-NEXT:    v_mul_i32_i24_e32 v1, -7, v0
 ; GFX7-NEXT:    v_lshl_b64 v[0:1], v[1:2], 3
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, v5, v3
-; GFX7-NEXT:    v_addc_u32_e32 v3, vcc, v6, v4, vcc
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, v3, v5
+; GFX7-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
 ; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: muli24_shl64:
 ; GFX8:       ; %bb.0: ; %bb
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GFX8-NEXT:    v_lshlrev_b64 v[2:3], 2, v[0:1]
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 2, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 3, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v5, s3
-; GFX8-NEXT:    v_mov_b32_e32 v4, s2
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v4, v2
-; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, v5, v3, vcc
-; GFX8-NEXT:    flat_load_dword v7, v[2:3]
-; GFX8-NEXT:    v_lshlrev_b64 v[3:4], 3, v[0:1]
-; GFX8-NEXT:    v_mov_b32_e32 v6, s1
-; GFX8-NEXT:    v_mov_b32_e32 v2, 0
-; GFX8-NEXT:    v_mov_b32_e32 v5, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s2
+; GFX8-NEXT:    v_mov_b32_e32 v2, s3
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v3
+; GFX8-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
+; GFX8-NEXT:    flat_load_dword v4, v[1:2]
+; GFX8-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v5
+; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_or_b32_e32 v0, 0xff800000, v7
-; GFX8-NEXT:    v_mul_i32_i24_e32 v1, -7, v0
-; GFX8-NEXT:    v_lshlrev_b64 v[0:1], 3, v[1:2]
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v5, v3
-; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, v6, v4, vcc
+; GFX8-NEXT:    v_or_b32_e32 v0, 0xff800000, v4
+; GFX8-NEXT:    v_mul_i32_i24_e32 v0, -7, v0
+; GFX8-NEXT:    v_lshlrev_b64 v[0:1], 3, v[0:1]
 ; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: muli24_shl64:
 ; GFX9:       ; %bb.0: ; %bb
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GFX9-NEXT:    v_lshlrev_b64 v[2:3], 2, v[0:1]
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v5, s3
-; GFX9-NEXT:    v_mov_b32_e32 v4, s2
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v4, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v3, vcc
-; GFX9-NEXT:    global_load_dword v7, v[2:3], off
-; GFX9-NEXT:    v_lshlrev_b64 v[3:4], 3, v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v6, s1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    v_mov_b32_e32 v5, s0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v1, v1, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_or_b32_e32 v0, 0xff800000, v7
-; GFX9-NEXT:    v_mul_i32_i24_e32 v1, -7, v0
-; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 3, v[1:2]
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v5, v3
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v4, vcc
-; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT:    v_or_b32_e32 v1, 0xff800000, v1
+; GFX9-NEXT:    v_mul_i32_i24_e32 v1, -7, v1
+; GFX9-NEXT:    v_lshlrev_b64 v[1:2], 3, v[1:2]
+; GFX9-NEXT:    global_store_dwordx2 v0, v[1:2], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()


        


More information about the llvm-commits mailing list