[llvm] c0581f7 - Revert D109159 : Revert "[amdgpu] Enable selection of `s_cselect_b64`."

David Salinas via llvm-commits llvm-commits at lists.llvm.org
Tue Jan 11 13:14:40 PST 2022


Author: David Salinas
Date: 2022-01-11T21:14:09Z
New Revision: c0581f7df6856e4c7450f3752ed0146dc62f1873

URL: https://github.com/llvm/llvm-project/commit/c0581f7df6856e4c7450f3752ed0146dc62f1873
DIFF: https://github.com/llvm/llvm-project/commit/c0581f7df6856e4c7450f3752ed0146dc62f1873.diff

LOG: Revert D109159 : Revert "[amdgpu] Enable selection of `s_cselect_b64`."

This reverts commit 640beb38e7710b939b3cfb3f4c54accc694b1d30.

That commit caused a performance degradation in Quicksilver test QS:sGPU and a functional test failure in rocPRIM (rocprim.device_segmented_radix_sort).
Reverting until we have a better solution to s_cselect_b64 codegen cleanup.

Change-Id: Ifc167b3c2dae7a65920676f22a97ba76485f3456

Reviewed By: kzhuravl

Differential Revision: https://reviews.llvm.org/D116686

Change-Id: I1abf49b74a7e2ba0e0205f747a4154a468b9d7f2

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/SIISelLowering.cpp
    llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
    llvm/lib/Target/AMDGPU/SIInstrInfo.h
    llvm/lib/Target/AMDGPU/SOPInstructions.td
    llvm/test/CodeGen/AMDGPU/addrspacecast.ll
    llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
    llvm/test/CodeGen/AMDGPU/dagcombine-select.ll
    llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
    llvm/test/CodeGen/AMDGPU/extract_vector_elt-f64.ll
    llvm/test/CodeGen/AMDGPU/extract_vector_elt-i64.ll
    llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll
    llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
    llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
    llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
    llvm/test/CodeGen/AMDGPU/load-select-ptr.ll
    llvm/test/CodeGen/AMDGPU/select64.ll
    llvm/test/CodeGen/AMDGPU/selectcc.ll
    llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll
    llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index bcf003843d4d..eb3ea41fd20e 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -8436,16 +8436,6 @@ SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
   SDLoc DL(Op);
   SDValue Cond = Op.getOperand(0);
 
-  if (Subtarget->hasScalarCompareEq64() && Op->getOperand(0)->hasOneUse() &&
-      !Op->isDivergent()) {
-    if (VT == MVT::i64)
-      return Op;
-    SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Op.getOperand(1));
-    SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Op.getOperand(2));
-    return DAG.getNode(ISD::BITCAST, DL, VT,
-                       DAG.getSelect(DL, MVT::i64, Cond, LHS, RHS));
-  }
-
   SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
   SDValue One = DAG.getConstant(1, DL, MVT::i32);
 

diff  --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 1755b93538ce..4b7f06996ed6 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -6122,11 +6122,8 @@ MachineBasicBlock *SIInstrInfo::moveToVALU(MachineInstr &TopInst,
       continue;
 
     case AMDGPU::S_CSELECT_B32:
-      lowerSelect32(Worklist, Inst, MDT);
-      Inst.eraseFromParent();
-      continue;
     case AMDGPU::S_CSELECT_B64:
-      splitSelect64(Worklist, Inst, MDT);
+      lowerSelect(Worklist, Inst, MDT);
       Inst.eraseFromParent();
       continue;
     case AMDGPU::S_CMP_EQ_I32:
@@ -6304,8 +6301,8 @@ SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst,
   return std::make_pair(false, nullptr);
 }
 
-void SIInstrInfo::lowerSelect32(SetVectorType &Worklist, MachineInstr &Inst,
-                                MachineDominatorTree *MDT) const {
+void SIInstrInfo::lowerSelect(SetVectorType &Worklist, MachineInstr &Inst,
+                              MachineDominatorTree *MDT) const {
 
   MachineBasicBlock &MBB = *Inst.getParent();
   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
@@ -6380,95 +6377,6 @@ void SIInstrInfo::lowerSelect32(SetVectorType &Worklist, MachineInstr &Inst,
   addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
 }
 
-void SIInstrInfo::splitSelect64(SetVectorType &Worklist, MachineInstr &Inst,
-                                MachineDominatorTree *MDT) const {
-  // Split S_CSELECT_B64 into a pair of S_CSELECT_B32 and lower them
-  // further.
-  const DebugLoc &DL = Inst.getDebugLoc();
-  MachineBasicBlock::iterator MII = Inst;
-  MachineBasicBlock &MBB = *Inst.getParent();
-  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
-
-  // Get the original operands.
-  MachineOperand &Dest = Inst.getOperand(0);
-  MachineOperand &Src0 = Inst.getOperand(1);
-  MachineOperand &Src1 = Inst.getOperand(2);
-  MachineOperand &Cond = Inst.getOperand(3);
-
-  Register SCCSource = Cond.getReg();
-  bool IsSCC = (SCCSource == AMDGPU::SCC);
-
-  // If this is a trivial select where the condition is effectively not SCC
-  // (SCCSource is a source of copy to SCC), then the select is semantically
-  // equivalent to copying SCCSource. Hence, there is no need to create
-  // V_CNDMASK, we can just use that and bail out.
-  if (!IsSCC && (Src0.isImm() && Src0.getImm() == -1) &&
-      (Src1.isImm() && Src1.getImm() == 0)) {
-    MRI.replaceRegWith(Dest.getReg(), SCCSource);
-    return;
-  }
-
-  // Prepare the split destination.
-  Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
-  Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-  Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-
-  // Split the source operands.
-  const TargetRegisterClass *Src0RC = nullptr;
-  const TargetRegisterClass *Src0SubRC = nullptr;
-  if (Src0.isReg()) {
-    Src0RC = MRI.getRegClass(Src0.getReg());
-    Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
-  }
-  const TargetRegisterClass *Src1RC = nullptr;
-  const TargetRegisterClass *Src1SubRC = nullptr;
-  if (Src1.isReg()) {
-    Src1RC = MRI.getRegClass(Src1.getReg());
-    Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0);
-  }
-  // Split lo.
-  MachineOperand SrcReg0Sub0 =
-      buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
-  MachineOperand SrcReg1Sub0 =
-      buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
-  // Split hi.
-  MachineOperand SrcReg0Sub1 =
-      buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
-  MachineOperand SrcReg1Sub1 =
-      buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
-  // Select the lo part.
-  MachineInstr *LoHalf =
-      BuildMI(MBB, MII, DL, get(AMDGPU::S_CSELECT_B32), DestSub0)
-          .add(SrcReg0Sub0)
-          .add(SrcReg1Sub0);
-  // Replace the condition operand with the original one.
-  LoHalf->getOperand(3).setReg(SCCSource);
-  Worklist.insert(LoHalf);
-  // Select the hi part.
-  MachineInstr *HiHalf =
-      BuildMI(MBB, MII, DL, get(AMDGPU::S_CSELECT_B32), DestSub1)
-          .add(SrcReg0Sub1)
-          .add(SrcReg1Sub1);
-  // Replace the condition operand with the original one.
-  HiHalf->getOperand(3).setReg(SCCSource);
-  Worklist.insert(HiHalf);
-  // Merge them back to the original 64-bit one.
-  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
-      .addReg(DestSub0)
-      .addImm(AMDGPU::sub0)
-      .addReg(DestSub1)
-      .addImm(AMDGPU::sub1);
-  MRI.replaceRegWith(Dest.getReg(), FullDestReg);
-
-  // Try to legalize the operands in case we need to swap the order to keep
-  // it valid.
-  legalizeOperands(*LoHalf, MDT);
-  legalizeOperands(*HiHalf, MDT);
-
-  // Move all users of this moved value.
-  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
-}
-
 void SIInstrInfo::lowerScalarAbs(SetVectorType &Worklist,
                                  MachineInstr &Inst) const {
   MachineBasicBlock &MBB = *Inst.getParent();

diff  --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index dd9ea2b53ca2..70a48cd58e38 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -78,11 +78,8 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
   moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst,
                    MachineDominatorTree *MDT = nullptr) const;
 
-  void lowerSelect32(SetVectorType &Worklist, MachineInstr &Inst,
-                     MachineDominatorTree *MDT = nullptr) const;
-
-  void splitSelect64(SetVectorType &Worklist, MachineInstr &Inst,
-                     MachineDominatorTree *MDT = nullptr) const;
+  void lowerSelect(SetVectorType &Worklist, MachineInstr &Inst,
+                   MachineDominatorTree *MDT = nullptr) const;
 
   void lowerScalarAbs(SetVectorType &Worklist,
                       MachineInstr &Inst) const;

diff  --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 453b2eee44de..324d36091827 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -518,10 +518,9 @@ let Uses = [SCC] in {
     def S_CSELECT_B32 : SOP2_32 <"s_cselect_b32",
       [(set i32:$sdst, (SelectPat<select> i32:$src0, i32:$src1))]
     >;
-    def S_CSELECT_B64 : SOP2_64 <"s_cselect_b64",
-      [(set i64:$sdst, (SelectPat<select> i64:$src0, i64:$src1))]
-    >;
   }
+
+  def S_CSELECT_B64 : SOP2_64 <"s_cselect_b64">;
 } // End Uses = [SCC]
 
 let Defs = [SCC] in {

diff  --git a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll
index 27215568482b..355158783b52 100644
--- a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll
+++ b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll
@@ -17,15 +17,17 @@
 ; CI-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]
 
 ; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
-; GFX9-DAG: s_load_dword s[[PTR:[0-9]+]], s[4:5], 0x0{{$}}
+; GFX9-DAG: s_load_dword [[PTR:s[0-9]+]], s[4:5], 0x0{{$}}
 ; GFX9-DAG: s_getreg_b32 [[SSRC_SHARED:s[0-9]+]], hwreg(HW_REG_SH_MEM_BASES, 16, 16)
-; GFX9-DAG: s_lshl_b32 s[[SSRC_SHARED_BASE:[0-9]+]], [[SSRC_SHARED]], 16
+; GFX9-DAG: s_lshl_b32 [[SSRC_SHARED_BASE:s[0-9]+]], [[SSRC_SHARED]], 16
+; GFX9-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[SSRC_SHARED_BASE]]
 
 ; GFX9-XXX: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], src_shared_base
-; GFX9: s_cmp_lg_u32 s[[PTR]], -1
-; GFX9: s_cselect_b64 s{{\[}}[[SEL_LO:[0-9]+]]:[[SEL_HI:[0-9]+]]{{\]}}, s{{\[}}[[PTR]]:[[SSRC_SHARED_BASE]]{{\]}}, 0
-; GFX9-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s[[SEL_LO]]
-; GFX9-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s[[SEL_HI]]
+; GFX9: s_cmp_lg_u32 [[PTR]], -1
+; GFX9: s_cselect_b64 vcc, -1, 0
+; GFX9: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc
+; GFX9-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
+; GFX9-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]
 
 ; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, [[K]]
 
@@ -82,17 +84,19 @@ define void @use_group_to_flat_addrspacecast_func(i32 addrspace(3)* %ptr) #0 {
 ; CI-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
 ; CI-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]
 
-; GFX9-DAG: s_load_dword s[[PTR:[0-9]+]], s[4:5], 0x0{{$}}
+; GFX9-DAG: s_load_dword [[PTR:s[0-9]+]], s[4:5], 0x0{{$}}
 ; GFX9-DAG: s_getreg_b32 [[SSRC_PRIVATE:s[0-9]+]], hwreg(HW_REG_SH_MEM_BASES, 0, 16)
-; GFX9-DAG: s_lshl_b32 s[[SSRC_PRIVATE_BASE:[0-9]+]], [[SSRC_PRIVATE]], 16
+; GFX9-DAG: s_lshl_b32 [[SSRC_PRIVATE_BASE:s[0-9]+]], [[SSRC_PRIVATE]], 16
+; GFX9-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[SSRC_PRIVATE_BASE]]
 
 ; GFX9-XXX: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], src_private_base
 
 ; GFX9-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
-; GFX9: s_cmp_lg_u32 s[[PTR]], -1
-; GFX9: s_cselect_b64 s{{\[}}[[SEL_LO:[0-9]+]]:[[SEL_HI:[0-9]+]]{{\]}}, s{{\[}}[[PTR]]:[[SSRC_PRIVATE_BASE]]{{\]}}, 0
-; GFX9-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s[[SEL_LO]]
-; GFX9-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s[[SEL_HI]]
+; GFX9: s_cmp_lg_u32 [[PTR]], -1
+; GFX9: s_cselect_b64 vcc, -1, 0
+; GFX9: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc
+; GFX9: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
+; GFX9-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]
 
 ; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, [[K]]
 

diff  --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
index a8cefd4e50cf..e7a0847383b8 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
@@ -2,7 +2,6 @@
 ; RUN: opt -S -mtriple=amdgcn-- -mcpu=tahiti -amdgpu-codegenprepare -amdgpu-bypass-slow-div=0 %s | FileCheck %s
 ; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GFX6 %s
 ; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GFX9 %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=gfx90a -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GFX90A %s
 
 define amdgpu_kernel void @udiv_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
 ; CHECK-LABEL: @udiv_i32(
@@ -95,34 +94,6 @@ define amdgpu_kernel void @udiv_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
-;
-; GFX90A-LABEL: udiv_i32:
-; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
-; GFX90A-NEXT:    v_mov_b32_e32 v1, 0
-; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, s3
-; GFX90A-NEXT:    s_sub_i32 s4, 0, s3
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX90A-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX90A-NEXT:    v_mul_lo_u32 v2, s4, v0
-; GFX90A-NEXT:    v_mul_hi_u32 v2, v0, v2
-; GFX90A-NEXT:    v_add_u32_e32 v0, v0, v2
-; GFX90A-NEXT:    v_mul_hi_u32 v0, s2, v0
-; GFX90A-NEXT:    v_mul_lo_u32 v2, v0, s3
-; GFX90A-NEXT:    v_sub_u32_e32 v2, s2, v2
-; GFX90A-NEXT:    v_add_u32_e32 v3, 1, v0
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s3, v2
-; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX90A-NEXT:    v_subrev_u32_e32 v3, s3, v2
-; GFX90A-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; GFX90A-NEXT:    v_add_u32_e32 v3, 1, v0
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s3, v2
-; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX90A-NEXT:    global_store_dword v1, v0, s[0:1]
-; GFX90A-NEXT:    s_endpgm
   %r = udiv i32 %x, %y
   store i32 %r, i32 addrspace(1)* %out
   ret void
@@ -213,32 +184,6 @@ define amdgpu_kernel void @urem_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
-;
-; GFX90A-LABEL: urem_i32:
-; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
-; GFX90A-NEXT:    v_mov_b32_e32 v1, 0
-; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, s3
-; GFX90A-NEXT:    s_sub_i32 s4, 0, s3
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX90A-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX90A-NEXT:    v_mul_lo_u32 v2, s4, v0
-; GFX90A-NEXT:    v_mul_hi_u32 v2, v0, v2
-; GFX90A-NEXT:    v_add_u32_e32 v0, v0, v2
-; GFX90A-NEXT:    v_mul_hi_u32 v0, s2, v0
-; GFX90A-NEXT:    v_mul_lo_u32 v0, v0, s3
-; GFX90A-NEXT:    v_sub_u32_e32 v0, s2, v0
-; GFX90A-NEXT:    v_subrev_u32_e32 v2, s3, v0
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s3, v0
-; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX90A-NEXT:    v_subrev_u32_e32 v2, s3, v0
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s3, v0
-; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX90A-NEXT:    global_store_dword v1, v0, s[0:1]
-; GFX90A-NEXT:    s_endpgm
   %r = urem i32 %x, %y
   store i32 %r, i32 addrspace(1)* %out
   ret void
@@ -362,43 +307,6 @@ define amdgpu_kernel void @sdiv_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
 ; GFX9-NEXT:    v_subrev_u32_e32 v0, s4, v0
 ; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
-;
-; GFX90A-LABEL: sdiv_i32:
-; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
-; GFX90A-NEXT:    v_mov_b32_e32 v1, 0
-; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_ashr_i32 s4, s3, 31
-; GFX90A-NEXT:    s_add_i32 s3, s3, s4
-; GFX90A-NEXT:    s_xor_b32 s3, s3, s4
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, s3
-; GFX90A-NEXT:    s_ashr_i32 s5, s2, 31
-; GFX90A-NEXT:    s_add_i32 s2, s2, s5
-; GFX90A-NEXT:    s_xor_b32 s4, s5, s4
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX90A-NEXT:    s_xor_b32 s2, s2, s5
-; GFX90A-NEXT:    s_sub_i32 s5, 0, s3
-; GFX90A-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX90A-NEXT:    v_mul_lo_u32 v2, s5, v0
-; GFX90A-NEXT:    v_mul_hi_u32 v2, v0, v2
-; GFX90A-NEXT:    v_add_u32_e32 v0, v0, v2
-; GFX90A-NEXT:    v_mul_hi_u32 v0, s2, v0
-; GFX90A-NEXT:    v_mul_lo_u32 v2, v0, s3
-; GFX90A-NEXT:    v_sub_u32_e32 v2, s2, v2
-; GFX90A-NEXT:    v_add_u32_e32 v3, 1, v0
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s3, v2
-; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX90A-NEXT:    v_subrev_u32_e32 v3, s3, v2
-; GFX90A-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; GFX90A-NEXT:    v_add_u32_e32 v3, 1, v0
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s3, v2
-; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX90A-NEXT:    v_xor_b32_e32 v0, s4, v0
-; GFX90A-NEXT:    v_subrev_u32_e32 v0, s4, v0
-; GFX90A-NEXT:    global_store_dword v1, v0, s[0:1]
-; GFX90A-NEXT:    s_endpgm
   %r = sdiv i32 %x, %y
   store i32 %r, i32 addrspace(1)* %out
   ret void
@@ -513,40 +421,6 @@ define amdgpu_kernel void @srem_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
-;
-; GFX90A-LABEL: srem_i32:
-; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
-; GFX90A-NEXT:    v_mov_b32_e32 v1, 0
-; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_ashr_i32 s4, s3, 31
-; GFX90A-NEXT:    s_add_i32 s3, s3, s4
-; GFX90A-NEXT:    s_xor_b32 s3, s3, s4
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, s3
-; GFX90A-NEXT:    s_sub_i32 s5, 0, s3
-; GFX90A-NEXT:    s_ashr_i32 s4, s2, 31
-; GFX90A-NEXT:    s_add_i32 s2, s2, s4
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX90A-NEXT:    s_xor_b32 s2, s2, s4
-; GFX90A-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX90A-NEXT:    v_mul_lo_u32 v2, s5, v0
-; GFX90A-NEXT:    v_mul_hi_u32 v2, v0, v2
-; GFX90A-NEXT:    v_add_u32_e32 v0, v0, v2
-; GFX90A-NEXT:    v_mul_hi_u32 v0, s2, v0
-; GFX90A-NEXT:    v_mul_lo_u32 v0, v0, s3
-; GFX90A-NEXT:    v_sub_u32_e32 v0, s2, v0
-; GFX90A-NEXT:    v_subrev_u32_e32 v2, s3, v0
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s3, v0
-; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX90A-NEXT:    v_subrev_u32_e32 v2, s3, v0
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s3, v0
-; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX90A-NEXT:    v_xor_b32_e32 v0, s4, v0
-; GFX90A-NEXT:    v_subrev_u32_e32 v0, s4, v0
-; GFX90A-NEXT:    global_store_dword v1, v0, s[0:1]
-; GFX90A-NEXT:    s_endpgm
   %r = srem i32 %x, %y
   store i32 %r, i32 addrspace(1)* %out
   ret void
@@ -614,26 +488,6 @@ define amdgpu_kernel void @udiv_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
 ; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v4, vcc
 ; GFX9-NEXT:    global_store_short v3, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
-;
-; GFX90A-LABEL: udiv_i16:
-; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_load_dword s2, s[0:1], 0x2c
-; GFX90A-NEXT:    v_mov_b32_e32 v3, 0
-; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_lshr_b32 s3, s2, 16
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, s3
-; GFX90A-NEXT:    s_and_b32 s2, s2, 0xffff
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v1, s2
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v2, v0
-; GFX90A-NEXT:    v_mul_f32_e32 v2, v1, v2
-; GFX90A-NEXT:    v_trunc_f32_e32 v2, v2
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v4, v2
-; GFX90A-NEXT:    v_mad_f32 v1, -v2, v0, v1
-; GFX90A-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
-; GFX90A-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v4, vcc
-; GFX90A-NEXT:    global_store_short v3, v0, s[0:1]
-; GFX90A-NEXT:    s_endpgm
   %r = udiv i16 %x, %y
   store i16 %r, i16 addrspace(1)* %out
   ret void
@@ -708,28 +562,6 @@ define amdgpu_kernel void @urem_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_store_short v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
-;
-; GFX90A-LABEL: urem_i16:
-; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_load_dword s2, s[0:1], 0x2c
-; GFX90A-NEXT:    v_mov_b32_e32 v3, 0
-; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_lshr_b32 s3, s2, 16
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, s3
-; GFX90A-NEXT:    s_and_b32 s4, s2, 0xffff
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v1, s4
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v2, v0
-; GFX90A-NEXT:    v_mul_f32_e32 v2, v1, v2
-; GFX90A-NEXT:    v_trunc_f32_e32 v2, v2
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v4, v2
-; GFX90A-NEXT:    v_mad_f32 v1, -v2, v0, v1
-; GFX90A-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
-; GFX90A-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v4, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v0, v0, s3
-; GFX90A-NEXT:    v_sub_u32_e32 v0, s2, v0
-; GFX90A-NEXT:    global_store_short v3, v0, s[0:1]
-; GFX90A-NEXT:    s_endpgm
   %r = urem i16 %x, %y
   store i16 %r, i16 addrspace(1)* %out
   ret void
@@ -811,31 +643,6 @@ define amdgpu_kernel void @sdiv_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
 ; GFX9-NEXT:    v_add_u32_e32 v0, s0, v3
 ; GFX9-NEXT:    global_store_short v1, v0, s[2:3]
 ; GFX9-NEXT:    s_endpgm
-;
-; GFX90A-LABEL: sdiv_i16:
-; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_load_dword s4, s[0:1], 0x2c
-; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX90A-NEXT:    v_mov_b32_e32 v1, 0
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_ashr_i32 s0, s4, 16
-; GFX90A-NEXT:    v_cvt_f32_i32_e32 v0, s0
-; GFX90A-NEXT:    s_sext_i32_i16 s1, s4
-; GFX90A-NEXT:    v_cvt_f32_i32_e32 v2, s1
-; GFX90A-NEXT:    s_xor_b32 s0, s1, s0
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v3, v0
-; GFX90A-NEXT:    s_ashr_i32 s0, s0, 30
-; GFX90A-NEXT:    s_or_b32 s4, s0, 1
-; GFX90A-NEXT:    v_mul_f32_e32 v3, v2, v3
-; GFX90A-NEXT:    v_trunc_f32_e32 v3, v3
-; GFX90A-NEXT:    v_mad_f32 v2, -v3, v0, v2
-; GFX90A-NEXT:    v_cvt_i32_f32_e32 v3, v3
-; GFX90A-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v2|, |v0|
-; GFX90A-NEXT:    s_and_b64 s[0:1], s[0:1], exec
-; GFX90A-NEXT:    s_cselect_b32 s0, s4, 0
-; GFX90A-NEXT:    v_add_u32_e32 v0, s0, v3
-; GFX90A-NEXT:    global_store_short v1, v0, s[2:3]
-; GFX90A-NEXT:    s_endpgm
   %r = sdiv i16 %x, %y
   store i16 %r, i16 addrspace(1)* %out
   ret void
@@ -924,33 +731,6 @@ define amdgpu_kernel void @srem_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_store_short v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
-;
-; GFX90A-LABEL: srem_i16:
-; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_load_dword s4, s[0:1], 0x2c
-; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX90A-NEXT:    v_mov_b32_e32 v1, 0
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_ashr_i32 s5, s4, 16
-; GFX90A-NEXT:    v_cvt_f32_i32_e32 v0, s5
-; GFX90A-NEXT:    s_sext_i32_i16 s0, s4
-; GFX90A-NEXT:    v_cvt_f32_i32_e32 v2, s0
-; GFX90A-NEXT:    s_xor_b32 s0, s0, s5
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v3, v0
-; GFX90A-NEXT:    s_ashr_i32 s0, s0, 30
-; GFX90A-NEXT:    s_or_b32 s6, s0, 1
-; GFX90A-NEXT:    v_mul_f32_e32 v3, v2, v3
-; GFX90A-NEXT:    v_trunc_f32_e32 v3, v3
-; GFX90A-NEXT:    v_mad_f32 v2, -v3, v0, v2
-; GFX90A-NEXT:    v_cvt_i32_f32_e32 v3, v3
-; GFX90A-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v2|, |v0|
-; GFX90A-NEXT:    s_and_b64 s[0:1], s[0:1], exec
-; GFX90A-NEXT:    s_cselect_b32 s0, s6, 0
-; GFX90A-NEXT:    v_add_u32_e32 v0, s0, v3
-; GFX90A-NEXT:    v_mul_lo_u32 v0, v0, s5
-; GFX90A-NEXT:    v_sub_u32_e32 v0, s4, v0
-; GFX90A-NEXT:    global_store_short v1, v0, s[2:3]
-; GFX90A-NEXT:    s_endpgm
   %r = srem i16 %x, %y
   store i16 %r, i16 addrspace(1)* %out
   ret void
@@ -1014,24 +794,6 @@ define amdgpu_kernel void @udiv_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
 ; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v4, vcc
 ; GFX9-NEXT:    global_store_byte v2, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
-;
-; GFX90A-LABEL: udiv_i8:
-; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_load_dword s2, s[0:1], 0x2c
-; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
-; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_cvt_f32_ubyte1_e32 v0, s2
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v1, v0
-; GFX90A-NEXT:    v_cvt_f32_ubyte0_e32 v3, s2
-; GFX90A-NEXT:    v_mul_f32_e32 v1, v3, v1
-; GFX90A-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v4, v1
-; GFX90A-NEXT:    v_mad_f32 v1, -v1, v0, v3
-; GFX90A-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
-; GFX90A-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v4, vcc
-; GFX90A-NEXT:    global_store_byte v2, v0, s[0:1]
-; GFX90A-NEXT:    s_endpgm
   %r = udiv i8 %x, %y
   store i8 %r, i8 addrspace(1)* %out
   ret void
@@ -1104,27 +866,6 @@ define amdgpu_kernel void @urem_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_store_byte v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
-;
-; GFX90A-LABEL: urem_i8:
-; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_load_dword s4, s[0:1], 0x2c
-; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_cvt_f32_ubyte1_e32 v0, s4
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v1, v0
-; GFX90A-NEXT:    v_cvt_f32_ubyte0_e32 v3, s4
-; GFX90A-NEXT:    s_lshr_b32 s0, s4, 8
-; GFX90A-NEXT:    v_mul_f32_e32 v1, v3, v1
-; GFX90A-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v4, v1
-; GFX90A-NEXT:    v_mad_f32 v1, -v1, v0, v3
-; GFX90A-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
-; GFX90A-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v4, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v0, v0, s0
-; GFX90A-NEXT:    v_sub_u32_e32 v0, s4, v0
-; GFX90A-NEXT:    global_store_byte v2, v0, s[2:3]
-; GFX90A-NEXT:    s_endpgm
   %r = urem i8 %x, %y
   store i8 %r, i8 addrspace(1)* %out
   ret void
@@ -1206,31 +947,6 @@ define amdgpu_kernel void @sdiv_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
 ; GFX9-NEXT:    v_add_u32_e32 v0, s0, v3
 ; GFX9-NEXT:    global_store_byte v1, v0, s[2:3]
 ; GFX9-NEXT:    s_endpgm
-;
-; GFX90A-LABEL: sdiv_i8:
-; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_load_dword s4, s[0:1], 0x2c
-; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX90A-NEXT:    v_mov_b32_e32 v1, 0
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_bfe_i32 s0, s4, 0x80008
-; GFX90A-NEXT:    v_cvt_f32_i32_e32 v0, s0
-; GFX90A-NEXT:    s_sext_i32_i8 s1, s4
-; GFX90A-NEXT:    v_cvt_f32_i32_e32 v2, s1
-; GFX90A-NEXT:    s_xor_b32 s0, s1, s0
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v3, v0
-; GFX90A-NEXT:    s_ashr_i32 s0, s0, 30
-; GFX90A-NEXT:    s_or_b32 s4, s0, 1
-; GFX90A-NEXT:    v_mul_f32_e32 v3, v2, v3
-; GFX90A-NEXT:    v_trunc_f32_e32 v3, v3
-; GFX90A-NEXT:    v_mad_f32 v2, -v3, v0, v2
-; GFX90A-NEXT:    v_cvt_i32_f32_e32 v3, v3
-; GFX90A-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v2|, |v0|
-; GFX90A-NEXT:    s_and_b64 s[0:1], s[0:1], exec
-; GFX90A-NEXT:    s_cselect_b32 s0, s4, 0
-; GFX90A-NEXT:    v_add_u32_e32 v0, s0, v3
-; GFX90A-NEXT:    global_store_byte v1, v0, s[2:3]
-; GFX90A-NEXT:    s_endpgm
   %r = sdiv i8 %x, %y
   store i8 %r, i8 addrspace(1)* %out
   ret void
@@ -1320,34 +1036,6 @@ define amdgpu_kernel void @srem_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
 ; GFX9-NEXT:    v_sub_u32_e32 v0, s4, v0
 ; GFX9-NEXT:    global_store_byte v1, v0, s[2:3]
 ; GFX9-NEXT:    s_endpgm
-;
-; GFX90A-LABEL: srem_i8:
-; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_load_dword s4, s[0:1], 0x2c
-; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_bfe_i32 s0, s4, 0x80008
-; GFX90A-NEXT:    v_cvt_f32_i32_e32 v1, s0
-; GFX90A-NEXT:    s_sext_i32_i8 s1, s4
-; GFX90A-NEXT:    v_cvt_f32_i32_e32 v2, s1
-; GFX90A-NEXT:    s_xor_b32 s0, s1, s0
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v3, v1
-; GFX90A-NEXT:    s_ashr_i32 s0, s0, 30
-; GFX90A-NEXT:    s_lshr_b32 s5, s4, 8
-; GFX90A-NEXT:    s_or_b32 s6, s0, 1
-; GFX90A-NEXT:    v_mul_f32_e32 v3, v2, v3
-; GFX90A-NEXT:    v_trunc_f32_e32 v3, v3
-; GFX90A-NEXT:    v_mad_f32 v2, -v3, v1, v2
-; GFX90A-NEXT:    v_cvt_i32_f32_e32 v3, v3
-; GFX90A-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v2|, |v1|
-; GFX90A-NEXT:    s_and_b64 s[0:1], s[0:1], exec
-; GFX90A-NEXT:    s_cselect_b32 s0, s6, 0
-; GFX90A-NEXT:    v_add_u32_e32 v1, s0, v3
-; GFX90A-NEXT:    v_mul_lo_u32 v1, v1, s5
-; GFX90A-NEXT:    v_sub_u32_e32 v1, s4, v1
-; GFX90A-NEXT:    global_store_byte v0, v1, s[2:3]
-; GFX90A-NEXT:    s_endpgm
   %r = srem i8 %x, %y
   store i8 %r, i8 addrspace(1)* %out
   ret void
@@ -1658,92 +1346,6 @@ define amdgpu_kernel void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
 ; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX9-NEXT:    s_endpgm
-;
-; GFX90A-LABEL: udiv_v4i32:
-; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
-; GFX90A-NEXT:    s_mov_b32 s3, 0x4f7ffffe
-; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, s8
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v1, s9
-; GFX90A-NEXT:    s_sub_i32 s2, 0, s8
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; GFX90A-NEXT:    v_mul_f32_e32 v0, s3, v0
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX90A-NEXT:    v_mul_f32_e32 v1, s3, v1
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX90A-NEXT:    v_mul_lo_u32 v2, s2, v0
-; GFX90A-NEXT:    v_mul_hi_u32 v2, v0, v2
-; GFX90A-NEXT:    v_add_u32_e32 v0, v0, v2
-; GFX90A-NEXT:    v_mul_hi_u32 v0, s4, v0
-; GFX90A-NEXT:    v_mul_lo_u32 v2, v0, s8
-; GFX90A-NEXT:    v_sub_u32_e32 v2, s4, v2
-; GFX90A-NEXT:    v_add_u32_e32 v3, 1, v0
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s8, v2
-; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX90A-NEXT:    v_subrev_u32_e32 v3, s8, v2
-; GFX90A-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; GFX90A-NEXT:    s_sub_i32 s2, 0, s9
-; GFX90A-NEXT:    v_add_u32_e32 v3, 1, v0
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s8, v2
-; GFX90A-NEXT:    v_mul_lo_u32 v2, s2, v1
-; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX90A-NEXT:    v_mul_hi_u32 v2, v1, v2
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v3, s10
-; GFX90A-NEXT:    v_add_u32_e32 v1, v1, v2
-; GFX90A-NEXT:    v_mul_hi_u32 v1, s5, v1
-; GFX90A-NEXT:    v_mul_lo_u32 v2, v1, s9
-; GFX90A-NEXT:    v_sub_u32_e32 v2, s5, v2
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v3, v3
-; GFX90A-NEXT:    v_add_u32_e32 v5, 1, v1
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s9, v2
-; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
-; GFX90A-NEXT:    v_subrev_u32_e32 v5, s9, v2
-; GFX90A-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
-; GFX90A-NEXT:    v_add_u32_e32 v5, 1, v1
-; GFX90A-NEXT:    v_mul_f32_e32 v3, s3, v3
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s9, v2
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v5, s11
-; GFX90A-NEXT:    s_sub_i32 s2, 0, s10
-; GFX90A-NEXT:    v_mul_lo_u32 v2, s2, v3
-; GFX90A-NEXT:    v_mul_hi_u32 v2, v3, v2
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v5, v5
-; GFX90A-NEXT:    v_add_u32_e32 v2, v3, v2
-; GFX90A-NEXT:    v_mul_hi_u32 v2, s6, v2
-; GFX90A-NEXT:    v_mul_lo_u32 v3, v2, s10
-; GFX90A-NEXT:    v_mul_f32_e32 v5, s3, v5
-; GFX90A-NEXT:    v_sub_u32_e32 v3, s6, v3
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; GFX90A-NEXT:    v_add_u32_e32 v6, 1, v2
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s10, v3
-; GFX90A-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
-; GFX90A-NEXT:    v_subrev_u32_e32 v6, s10, v3
-; GFX90A-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
-; GFX90A-NEXT:    s_sub_i32 s2, 0, s11
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s10, v3
-; GFX90A-NEXT:    v_mul_lo_u32 v3, s2, v5
-; GFX90A-NEXT:    v_mul_hi_u32 v3, v5, v3
-; GFX90A-NEXT:    v_add_u32_e32 v3, v5, v3
-; GFX90A-NEXT:    v_mul_hi_u32 v3, s7, v3
-; GFX90A-NEXT:    v_mul_lo_u32 v5, v3, s11
-; GFX90A-NEXT:    v_add_u32_e32 v6, 1, v2
-; GFX90A-NEXT:    v_sub_u32_e32 v5, s7, v5
-; GFX90A-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
-; GFX90A-NEXT:    v_add_u32_e32 v6, 1, v3
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s11, v5
-; GFX90A-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
-; GFX90A-NEXT:    v_subrev_u32_e32 v6, s11, v5
-; GFX90A-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
-; GFX90A-NEXT:    v_add_u32_e32 v6, 1, v3
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s11, v5
-; GFX90A-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
-; GFX90A-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
-; GFX90A-NEXT:    s_endpgm
   %r = udiv <4 x i32> %x, %y
   store <4 x i32> %r, <4 x i32> addrspace(1)* %out
   ret void
@@ -2030,84 +1632,6 @@ define amdgpu_kernel void @urem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
 ; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX9-NEXT:    s_endpgm
-;
-; GFX90A-LABEL: urem_v4i32:
-; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
-; GFX90A-NEXT:    s_mov_b32 s12, 0x4f7ffffe
-; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, s8
-; GFX90A-NEXT:    s_sub_i32 s2, 0, s8
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v1, s9
-; GFX90A-NEXT:    s_sub_i32 s3, 0, s9
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; GFX90A-NEXT:    v_mul_f32_e32 v0, s12, v0
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX90A-NEXT:    v_mul_f32_e32 v1, s12, v1
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX90A-NEXT:    v_mul_lo_u32 v2, s2, v0
-; GFX90A-NEXT:    v_mul_hi_u32 v2, v0, v2
-; GFX90A-NEXT:    v_add_u32_e32 v0, v0, v2
-; GFX90A-NEXT:    v_mul_hi_u32 v0, s4, v0
-; GFX90A-NEXT:    v_mul_lo_u32 v0, v0, s8
-; GFX90A-NEXT:    v_sub_u32_e32 v0, s4, v0
-; GFX90A-NEXT:    v_subrev_u32_e32 v2, s8, v0
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
-; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX90A-NEXT:    v_subrev_u32_e32 v2, s8, v0
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
-; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v2, s10
-; GFX90A-NEXT:    v_mul_lo_u32 v3, s3, v1
-; GFX90A-NEXT:    v_mul_hi_u32 v3, v1, v3
-; GFX90A-NEXT:    v_add_u32_e32 v1, v1, v3
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v2, v2
-; GFX90A-NEXT:    v_mul_hi_u32 v1, s5, v1
-; GFX90A-NEXT:    v_mul_lo_u32 v1, v1, s9
-; GFX90A-NEXT:    v_sub_u32_e32 v1, s5, v1
-; GFX90A-NEXT:    v_mul_f32_e32 v2, s12, v2
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GFX90A-NEXT:    v_subrev_u32_e32 v3, s9, v1
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
-; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX90A-NEXT:    v_subrev_u32_e32 v3, s9, v1
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
-; GFX90A-NEXT:    s_sub_i32 s2, 0, s10
-; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v3, s2, v2
-; GFX90A-NEXT:    v_mul_hi_u32 v3, v2, v3
-; GFX90A-NEXT:    v_add_u32_e32 v2, v2, v3
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v3, s11
-; GFX90A-NEXT:    v_mul_hi_u32 v2, s6, v2
-; GFX90A-NEXT:    v_mul_lo_u32 v2, v2, s10
-; GFX90A-NEXT:    v_sub_u32_e32 v2, s6, v2
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v3, v3
-; GFX90A-NEXT:    v_subrev_u32_e32 v5, s10, v2
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
-; GFX90A-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
-; GFX90A-NEXT:    v_mul_f32_e32 v3, s12, v3
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GFX90A-NEXT:    v_subrev_u32_e32 v5, s10, v2
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
-; GFX90A-NEXT:    s_sub_i32 s2, 0, s11
-; GFX90A-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v5, s2, v3
-; GFX90A-NEXT:    v_mul_hi_u32 v5, v3, v5
-; GFX90A-NEXT:    v_add_u32_e32 v3, v3, v5
-; GFX90A-NEXT:    v_mul_hi_u32 v3, s7, v3
-; GFX90A-NEXT:    v_mul_lo_u32 v3, v3, s11
-; GFX90A-NEXT:    v_sub_u32_e32 v3, s7, v3
-; GFX90A-NEXT:    v_subrev_u32_e32 v5, s11, v3
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
-; GFX90A-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
-; GFX90A-NEXT:    v_subrev_u32_e32 v5, s11, v3
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
-; GFX90A-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
-; GFX90A-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
-; GFX90A-NEXT:    s_endpgm
   %r = urem <4 x i32> %x, %y
   store <4 x i32> %r, <4 x i32> addrspace(1)* %out
   ret void
@@ -2526,128 +2050,6 @@ define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
 ; GFX9-NEXT:    v_subrev_u32_e32 v3, s2, v3
 ; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX9-NEXT:    s_endpgm
-;
-; GFX90A-LABEL: sdiv_v4i32:
-; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
-; GFX90A-NEXT:    s_mov_b32 s13, 0x4f7ffffe
-; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_ashr_i32 s2, s8, 31
-; GFX90A-NEXT:    s_add_i32 s3, s8, s2
-; GFX90A-NEXT:    s_xor_b32 s3, s3, s2
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, s3
-; GFX90A-NEXT:    s_ashr_i32 s8, s4, 31
-; GFX90A-NEXT:    s_add_i32 s4, s4, s8
-; GFX90A-NEXT:    s_xor_b32 s2, s8, s2
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX90A-NEXT:    s_xor_b32 s4, s4, s8
-; GFX90A-NEXT:    s_sub_i32 s8, 0, s3
-; GFX90A-NEXT:    s_ashr_i32 s12, s9, 31
-; GFX90A-NEXT:    v_mul_f32_e32 v0, s13, v0
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX90A-NEXT:    v_mul_lo_u32 v1, s8, v0
-; GFX90A-NEXT:    v_mul_hi_u32 v1, v0, v1
-; GFX90A-NEXT:    v_add_u32_e32 v0, v0, v1
-; GFX90A-NEXT:    v_mul_hi_u32 v0, s4, v0
-; GFX90A-NEXT:    v_mul_lo_u32 v1, v0, s3
-; GFX90A-NEXT:    v_sub_u32_e32 v1, s4, v1
-; GFX90A-NEXT:    s_add_i32 s4, s9, s12
-; GFX90A-NEXT:    s_xor_b32 s4, s4, s12
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v3, s4
-; GFX90A-NEXT:    v_add_u32_e32 v2, 1, v0
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
-; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX90A-NEXT:    v_subrev_u32_e32 v2, s3, v1
-; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v1, v3
-; GFX90A-NEXT:    v_add_u32_e32 v2, 1, v0
-; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX90A-NEXT:    v_xor_b32_e32 v0, s2, v0
-; GFX90A-NEXT:    v_mul_f32_e32 v1, s13, v1
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX90A-NEXT:    v_subrev_u32_e32 v0, s2, v0
-; GFX90A-NEXT:    s_ashr_i32 s2, s5, 31
-; GFX90A-NEXT:    s_add_i32 s5, s5, s2
-; GFX90A-NEXT:    s_xor_b32 s3, s2, s12
-; GFX90A-NEXT:    s_xor_b32 s2, s5, s2
-; GFX90A-NEXT:    s_sub_i32 s5, 0, s4
-; GFX90A-NEXT:    v_mul_lo_u32 v2, s5, v1
-; GFX90A-NEXT:    v_mul_hi_u32 v2, v1, v2
-; GFX90A-NEXT:    v_add_u32_e32 v1, v1, v2
-; GFX90A-NEXT:    v_mul_hi_u32 v1, s2, v1
-; GFX90A-NEXT:    v_mul_lo_u32 v2, v1, s4
-; GFX90A-NEXT:    v_sub_u32_e32 v2, s2, v2
-; GFX90A-NEXT:    s_ashr_i32 s2, s10, 31
-; GFX90A-NEXT:    s_add_i32 s5, s10, s2
-; GFX90A-NEXT:    s_xor_b32 s5, s5, s2
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v5, s5
-; GFX90A-NEXT:    v_add_u32_e32 v3, 1, v1
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s4, v2
-; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX90A-NEXT:    v_subrev_u32_e32 v3, s4, v2
-; GFX90A-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s4, v2
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v2, v5
-; GFX90A-NEXT:    v_add_u32_e32 v3, 1, v1
-; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX90A-NEXT:    v_xor_b32_e32 v1, s3, v1
-; GFX90A-NEXT:    v_mul_f32_e32 v2, s13, v2
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GFX90A-NEXT:    v_subrev_u32_e32 v1, s3, v1
-; GFX90A-NEXT:    s_ashr_i32 s3, s6, 31
-; GFX90A-NEXT:    s_add_i32 s4, s6, s3
-; GFX90A-NEXT:    s_xor_b32 s2, s3, s2
-; GFX90A-NEXT:    s_xor_b32 s3, s4, s3
-; GFX90A-NEXT:    s_sub_i32 s4, 0, s5
-; GFX90A-NEXT:    v_mul_lo_u32 v3, s4, v2
-; GFX90A-NEXT:    v_mul_hi_u32 v3, v2, v3
-; GFX90A-NEXT:    v_add_u32_e32 v2, v2, v3
-; GFX90A-NEXT:    v_mul_hi_u32 v2, s3, v2
-; GFX90A-NEXT:    v_mul_lo_u32 v3, v2, s5
-; GFX90A-NEXT:    v_sub_u32_e32 v3, s3, v3
-; GFX90A-NEXT:    s_ashr_i32 s3, s11, 31
-; GFX90A-NEXT:    s_add_i32 s4, s11, s3
-; GFX90A-NEXT:    s_xor_b32 s4, s4, s3
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v6, s4
-; GFX90A-NEXT:    v_add_u32_e32 v5, 1, v2
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s5, v3
-; GFX90A-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
-; GFX90A-NEXT:    v_subrev_u32_e32 v5, s5, v3
-; GFX90A-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s5, v3
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v3, v6
-; GFX90A-NEXT:    v_add_u32_e32 v5, 1, v2
-; GFX90A-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
-; GFX90A-NEXT:    v_xor_b32_e32 v2, s2, v2
-; GFX90A-NEXT:    v_mul_f32_e32 v3, s13, v3
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GFX90A-NEXT:    v_subrev_u32_e32 v2, s2, v2
-; GFX90A-NEXT:    s_ashr_i32 s2, s7, 31
-; GFX90A-NEXT:    s_add_i32 s5, s7, s2
-; GFX90A-NEXT:    s_xor_b32 s3, s2, s3
-; GFX90A-NEXT:    s_xor_b32 s2, s5, s2
-; GFX90A-NEXT:    s_sub_i32 s5, 0, s4
-; GFX90A-NEXT:    v_mul_lo_u32 v5, s5, v3
-; GFX90A-NEXT:    v_mul_hi_u32 v5, v3, v5
-; GFX90A-NEXT:    v_add_u32_e32 v3, v3, v5
-; GFX90A-NEXT:    v_mul_hi_u32 v3, s2, v3
-; GFX90A-NEXT:    v_mul_lo_u32 v5, v3, s4
-; GFX90A-NEXT:    v_sub_u32_e32 v5, s2, v5
-; GFX90A-NEXT:    v_add_u32_e32 v6, 1, v3
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s4, v5
-; GFX90A-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
-; GFX90A-NEXT:    v_subrev_u32_e32 v6, s4, v5
-; GFX90A-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
-; GFX90A-NEXT:    v_add_u32_e32 v6, 1, v3
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s4, v5
-; GFX90A-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
-; GFX90A-NEXT:    v_xor_b32_e32 v3, s3, v3
-; GFX90A-NEXT:    v_subrev_u32_e32 v3, s3, v3
-; GFX90A-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
-; GFX90A-NEXT:    s_endpgm
   %r = sdiv <4 x i32> %x, %y
   store <4 x i32> %r, <4 x i32> addrspace(1)* %out
   ret void
@@ -3030,116 +2432,6 @@ define amdgpu_kernel void @srem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
 ; GFX9-NEXT:    v_subrev_u32_e32 v3, s5, v3
 ; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX9-NEXT:    s_endpgm
-;
-; GFX90A-LABEL: srem_v4i32:
-; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
-; GFX90A-NEXT:    s_mov_b32 s12, 0x4f7ffffe
-; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_ashr_i32 s2, s8, 31
-; GFX90A-NEXT:    s_add_i32 s3, s8, s2
-; GFX90A-NEXT:    s_xor_b32 s2, s3, s2
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, s2
-; GFX90A-NEXT:    s_ashr_i32 s8, s9, 31
-; GFX90A-NEXT:    s_add_i32 s9, s9, s8
-; GFX90A-NEXT:    s_xor_b32 s8, s9, s8
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v1, s8
-; GFX90A-NEXT:    s_sub_i32 s9, 0, s2
-; GFX90A-NEXT:    s_ashr_i32 s3, s4, 31
-; GFX90A-NEXT:    v_mul_f32_e32 v0, s12, v0
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX90A-NEXT:    s_add_i32 s4, s4, s3
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; GFX90A-NEXT:    s_xor_b32 s4, s4, s3
-; GFX90A-NEXT:    v_mul_lo_u32 v2, s9, v0
-; GFX90A-NEXT:    v_mul_hi_u32 v2, v0, v2
-; GFX90A-NEXT:    v_add_u32_e32 v0, v0, v2
-; GFX90A-NEXT:    v_mul_hi_u32 v0, s4, v0
-; GFX90A-NEXT:    v_mul_lo_u32 v0, v0, s2
-; GFX90A-NEXT:    v_sub_u32_e32 v0, s4, v0
-; GFX90A-NEXT:    v_mul_f32_e32 v1, s12, v1
-; GFX90A-NEXT:    v_subrev_u32_e32 v2, s2, v0
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s2, v0
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX90A-NEXT:    v_subrev_u32_e32 v2, s2, v0
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s2, v0
-; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX90A-NEXT:    s_sub_i32 s4, 0, s8
-; GFX90A-NEXT:    v_xor_b32_e32 v0, s3, v0
-; GFX90A-NEXT:    s_ashr_i32 s2, s5, 31
-; GFX90A-NEXT:    v_mul_lo_u32 v2, s4, v1
-; GFX90A-NEXT:    v_subrev_u32_e32 v0, s3, v0
-; GFX90A-NEXT:    s_add_i32 s3, s5, s2
-; GFX90A-NEXT:    v_mul_hi_u32 v2, v1, v2
-; GFX90A-NEXT:    s_xor_b32 s3, s3, s2
-; GFX90A-NEXT:    v_add_u32_e32 v1, v1, v2
-; GFX90A-NEXT:    v_mul_hi_u32 v1, s3, v1
-; GFX90A-NEXT:    v_mul_lo_u32 v1, v1, s8
-; GFX90A-NEXT:    v_sub_u32_e32 v1, s3, v1
-; GFX90A-NEXT:    s_ashr_i32 s3, s10, 31
-; GFX90A-NEXT:    s_add_i32 s4, s10, s3
-; GFX90A-NEXT:    v_subrev_u32_e32 v2, s8, v1
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s8, v1
-; GFX90A-NEXT:    s_xor_b32 s3, s4, s3
-; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v2, s3
-; GFX90A-NEXT:    v_subrev_u32_e32 v3, s8, v1
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s8, v1
-; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v2, v2
-; GFX90A-NEXT:    v_xor_b32_e32 v1, s2, v1
-; GFX90A-NEXT:    s_sub_i32 s5, 0, s3
-; GFX90A-NEXT:    v_subrev_u32_e32 v1, s2, v1
-; GFX90A-NEXT:    v_mul_f32_e32 v2, s12, v2
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GFX90A-NEXT:    s_ashr_i32 s2, s6, 31
-; GFX90A-NEXT:    s_add_i32 s4, s6, s2
-; GFX90A-NEXT:    s_xor_b32 s4, s4, s2
-; GFX90A-NEXT:    v_mul_lo_u32 v3, s5, v2
-; GFX90A-NEXT:    v_mul_hi_u32 v3, v2, v3
-; GFX90A-NEXT:    v_add_u32_e32 v2, v2, v3
-; GFX90A-NEXT:    v_mul_hi_u32 v2, s4, v2
-; GFX90A-NEXT:    v_mul_lo_u32 v2, v2, s3
-; GFX90A-NEXT:    v_sub_u32_e32 v2, s4, v2
-; GFX90A-NEXT:    s_ashr_i32 s4, s11, 31
-; GFX90A-NEXT:    s_add_i32 s5, s11, s4
-; GFX90A-NEXT:    v_subrev_u32_e32 v3, s3, v2
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s3, v2
-; GFX90A-NEXT:    s_xor_b32 s4, s5, s4
-; GFX90A-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v3, s4
-; GFX90A-NEXT:    v_subrev_u32_e32 v5, s3, v2
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s3, v2
-; GFX90A-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v3, v3
-; GFX90A-NEXT:    v_xor_b32_e32 v2, s2, v2
-; GFX90A-NEXT:    s_sub_i32 s5, 0, s4
-; GFX90A-NEXT:    v_subrev_u32_e32 v2, s2, v2
-; GFX90A-NEXT:    v_mul_f32_e32 v3, s12, v3
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GFX90A-NEXT:    s_ashr_i32 s2, s7, 31
-; GFX90A-NEXT:    s_add_i32 s3, s7, s2
-; GFX90A-NEXT:    s_xor_b32 s3, s3, s2
-; GFX90A-NEXT:    v_mul_lo_u32 v5, s5, v3
-; GFX90A-NEXT:    v_mul_hi_u32 v5, v3, v5
-; GFX90A-NEXT:    v_add_u32_e32 v3, v3, v5
-; GFX90A-NEXT:    v_mul_hi_u32 v3, s3, v3
-; GFX90A-NEXT:    v_mul_lo_u32 v3, v3, s4
-; GFX90A-NEXT:    v_sub_u32_e32 v3, s3, v3
-; GFX90A-NEXT:    v_subrev_u32_e32 v5, s4, v3
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s4, v3
-; GFX90A-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
-; GFX90A-NEXT:    v_subrev_u32_e32 v5, s4, v3
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s4, v3
-; GFX90A-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
-; GFX90A-NEXT:    v_xor_b32_e32 v3, s2, v3
-; GFX90A-NEXT:    v_subrev_u32_e32 v3, s2, v3
-; GFX90A-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
-; GFX90A-NEXT:    s_endpgm
   %r = srem <4 x i32> %x, %y
   store <4 x i32> %r, <4 x i32> addrspace(1)* %out
   ret void
@@ -3349,65 +2641,6 @@ define amdgpu_kernel void @udiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; GFX9-NEXT:    v_lshl_or_b32 v0, v3, 16, v0
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
-;
-; GFX90A-LABEL: udiv_v4i16:
-; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
-; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX90A-NEXT:    s_mov_b32 s0, 0xffff
-; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_and_b32 s8, s6, s0
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, s8
-; GFX90A-NEXT:    s_lshr_b32 s1, s4, 16
-; GFX90A-NEXT:    s_and_b32 s4, s4, s0
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v1, s4
-; GFX90A-NEXT:    s_lshr_b32 s4, s6, 16
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v3, v0
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v4, s4
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v5, s1
-; GFX90A-NEXT:    s_and_b32 s1, s7, s0
-; GFX90A-NEXT:    v_mul_f32_e32 v3, v1, v3
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v6, v4
-; GFX90A-NEXT:    v_trunc_f32_e32 v3, v3
-; GFX90A-NEXT:    v_mad_f32 v1, -v3, v0, v1
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GFX90A-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
-; GFX90A-NEXT:    v_mul_f32_e32 v1, v5, v6
-; GFX90A-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX90A-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
-; GFX90A-NEXT:    v_mad_f32 v3, -v1, v4, v5
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v5, s1
-; GFX90A-NEXT:    s_lshr_b32 s6, s7, 16
-; GFX90A-NEXT:    s_and_b32 s0, s5, s0
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v6, s0
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v7, v5
-; GFX90A-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v4
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v4, s6
-; GFX90A-NEXT:    s_lshr_b32 s8, s5, 16
-; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
-; GFX90A-NEXT:    v_mul_f32_e32 v1, v6, v7
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v7, s8
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v8, v4
-; GFX90A-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX90A-NEXT:    v_mad_f32 v6, -v1, v5, v6
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX90A-NEXT:    v_cmp_ge_f32_e64 vcc, |v6|, v5
-; GFX90A-NEXT:    v_mul_f32_e32 v5, v7, v8
-; GFX90A-NEXT:    v_trunc_f32_e32 v5, v5
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v6, v5
-; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX90A-NEXT:    v_mad_f32 v5, -v5, v4, v7
-; GFX90A-NEXT:    v_cmp_ge_f32_e64 vcc, |v5|, v4
-; GFX90A-NEXT:    v_mov_b32_e32 v5, 0xffff
-; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v6, vcc
-; GFX90A-NEXT:    v_and_b32_e32 v1, v5, v1
-; GFX90A-NEXT:    v_and_b32_e32 v0, v5, v0
-; GFX90A-NEXT:    v_lshl_or_b32 v1, v4, 16, v1
-; GFX90A-NEXT:    v_lshl_or_b32 v0, v3, 16, v0
-; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
-; GFX90A-NEXT:    s_endpgm
   %r = udiv <4 x i16> %x, %y
   store <4 x i16> %r, <4 x i16> addrspace(1)* %out
   ret void
@@ -3641,73 +2874,6 @@ define amdgpu_kernel void @urem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; GFX9-NEXT:    v_lshl_or_b32 v0, v5, 16, v0
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
-;
-; GFX90A-LABEL: urem_v4i16:
-; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
-; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX90A-NEXT:    s_mov_b32 s0, 0xffff
-; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_and_b32 s8, s6, s0
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, s8
-; GFX90A-NEXT:    s_and_b32 s9, s4, s0
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v1, s9
-; GFX90A-NEXT:    s_lshr_b32 s9, s6, 16
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v3, v0
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v4, s9
-; GFX90A-NEXT:    s_lshr_b32 s1, s4, 16
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v5, s1
-; GFX90A-NEXT:    v_mul_f32_e32 v3, v1, v3
-; GFX90A-NEXT:    v_trunc_f32_e32 v3, v3
-; GFX90A-NEXT:    v_mad_f32 v1, -v3, v0, v1
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v6, v4
-; GFX90A-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
-; GFX90A-NEXT:    s_lshr_b32 s10, s7, 16
-; GFX90A-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v0, v0, s6
-; GFX90A-NEXT:    v_mul_f32_e32 v1, v5, v6
-; GFX90A-NEXT:    v_sub_u32_e32 v0, s4, v0
-; GFX90A-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX90A-NEXT:    s_and_b32 s4, s7, s0
-; GFX90A-NEXT:    v_mad_f32 v3, -v1, v4, v5
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v5, s4
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX90A-NEXT:    s_and_b32 s0, s5, s0
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v6, s0
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v7, v5
-; GFX90A-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v4
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v4, s10
-; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX90A-NEXT:    s_lshr_b32 s8, s5, 16
-; GFX90A-NEXT:    v_mul_lo_u32 v1, v1, s9
-; GFX90A-NEXT:    v_sub_u32_e32 v3, s1, v1
-; GFX90A-NEXT:    v_mul_f32_e32 v1, v6, v7
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v7, s8
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v8, v4
-; GFX90A-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX90A-NEXT:    v_mad_f32 v6, -v1, v5, v6
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX90A-NEXT:    v_cmp_ge_f32_e64 vcc, |v6|, v5
-; GFX90A-NEXT:    v_mul_f32_e32 v5, v7, v8
-; GFX90A-NEXT:    v_trunc_f32_e32 v5, v5
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v6, v5
-; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX90A-NEXT:    v_mad_f32 v5, -v5, v4, v7
-; GFX90A-NEXT:    v_cmp_ge_f32_e64 vcc, |v5|, v4
-; GFX90A-NEXT:    v_mul_lo_u32 v1, v1, s7
-; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v6, vcc
-; GFX90A-NEXT:    v_sub_u32_e32 v1, s5, v1
-; GFX90A-NEXT:    v_mul_lo_u32 v4, v4, s10
-; GFX90A-NEXT:    v_mov_b32_e32 v5, 0xffff
-; GFX90A-NEXT:    v_sub_u32_e32 v4, s8, v4
-; GFX90A-NEXT:    v_and_b32_e32 v1, v5, v1
-; GFX90A-NEXT:    v_and_b32_e32 v0, v5, v0
-; GFX90A-NEXT:    v_lshl_or_b32 v1, v4, 16, v1
-; GFX90A-NEXT:    v_lshl_or_b32 v0, v3, 16, v0
-; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
-; GFX90A-NEXT:    s_endpgm
   %r = urem <4 x i16> %x, %y
   store <4 x i16> %r, <4 x i16> addrspace(1)* %out
   ret void
@@ -3972,84 +3138,6 @@ define amdgpu_kernel void @sdiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; GFX9-NEXT:    v_lshl_or_b32 v0, v4, 16, v0
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
-;
-; GFX90A-LABEL: sdiv_v4i16:
-; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
-; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_sext_i32_i16 s0, s6
-; GFX90A-NEXT:    v_cvt_f32_i32_e32 v0, s0
-; GFX90A-NEXT:    s_sext_i32_i16 s1, s4
-; GFX90A-NEXT:    v_cvt_f32_i32_e32 v1, s1
-; GFX90A-NEXT:    s_xor_b32 s0, s1, s0
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v3, v0
-; GFX90A-NEXT:    s_ashr_i32 s0, s0, 30
-; GFX90A-NEXT:    s_or_b32 s8, s0, 1
-; GFX90A-NEXT:    v_mul_f32_e32 v3, v1, v3
-; GFX90A-NEXT:    v_trunc_f32_e32 v3, v3
-; GFX90A-NEXT:    v_mad_f32 v1, -v3, v0, v1
-; GFX90A-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, |v0|
-; GFX90A-NEXT:    s_and_b64 s[0:1], s[0:1], exec
-; GFX90A-NEXT:    s_cselect_b32 s0, s8, 0
-; GFX90A-NEXT:    s_ashr_i32 s1, s6, 16
-; GFX90A-NEXT:    v_cvt_f32_i32_e32 v0, s1
-; GFX90A-NEXT:    s_ashr_i32 s4, s4, 16
-; GFX90A-NEXT:    v_cvt_f32_i32_e32 v1, s4
-; GFX90A-NEXT:    v_cvt_i32_f32_e32 v3, v3
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v4, v0
-; GFX90A-NEXT:    v_add_u32_e32 v3, s0, v3
-; GFX90A-NEXT:    v_mul_f32_e32 v4, v1, v4
-; GFX90A-NEXT:    s_xor_b32 s0, s4, s1
-; GFX90A-NEXT:    v_trunc_f32_e32 v4, v4
-; GFX90A-NEXT:    s_ashr_i32 s0, s0, 30
-; GFX90A-NEXT:    v_mad_f32 v1, -v4, v0, v1
-; GFX90A-NEXT:    s_or_b32 s4, s0, 1
-; GFX90A-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, |v0|
-; GFX90A-NEXT:    s_and_b64 s[0:1], s[0:1], exec
-; GFX90A-NEXT:    v_cvt_i32_f32_e32 v4, v4
-; GFX90A-NEXT:    s_sext_i32_i16 s1, s7
-; GFX90A-NEXT:    v_cvt_f32_i32_e32 v0, s1
-; GFX90A-NEXT:    s_cselect_b32 s0, s4, 0
-; GFX90A-NEXT:    v_add_u32_e32 v4, s0, v4
-; GFX90A-NEXT:    s_sext_i32_i16 s0, s5
-; GFX90A-NEXT:    v_cvt_f32_i32_e32 v1, s0
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v5, v0
-; GFX90A-NEXT:    s_xor_b32 s0, s0, s1
-; GFX90A-NEXT:    s_ashr_i32 s0, s0, 30
-; GFX90A-NEXT:    s_or_b32 s4, s0, 1
-; GFX90A-NEXT:    v_mul_f32_e32 v5, v1, v5
-; GFX90A-NEXT:    v_trunc_f32_e32 v5, v5
-; GFX90A-NEXT:    v_mad_f32 v1, -v5, v0, v1
-; GFX90A-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, |v0|
-; GFX90A-NEXT:    s_and_b64 s[0:1], s[0:1], exec
-; GFX90A-NEXT:    v_cvt_i32_f32_e32 v5, v5
-; GFX90A-NEXT:    s_cselect_b32 s0, s4, 0
-; GFX90A-NEXT:    s_ashr_i32 s1, s7, 16
-; GFX90A-NEXT:    v_cvt_f32_i32_e32 v0, s1
-; GFX90A-NEXT:    v_add_u32_e32 v1, s0, v5
-; GFX90A-NEXT:    s_ashr_i32 s0, s5, 16
-; GFX90A-NEXT:    v_cvt_f32_i32_e32 v5, s0
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v6, v0
-; GFX90A-NEXT:    s_xor_b32 s0, s0, s1
-; GFX90A-NEXT:    s_ashr_i32 s0, s0, 30
-; GFX90A-NEXT:    s_or_b32 s4, s0, 1
-; GFX90A-NEXT:    v_mul_f32_e32 v6, v5, v6
-; GFX90A-NEXT:    v_trunc_f32_e32 v6, v6
-; GFX90A-NEXT:    v_mad_f32 v5, -v6, v0, v5
-; GFX90A-NEXT:    v_cvt_i32_f32_e32 v6, v6
-; GFX90A-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v5|, |v0|
-; GFX90A-NEXT:    s_and_b64 s[0:1], s[0:1], exec
-; GFX90A-NEXT:    s_cselect_b32 s0, s4, 0
-; GFX90A-NEXT:    v_mov_b32_e32 v5, 0xffff
-; GFX90A-NEXT:    v_add_u32_e32 v0, s0, v6
-; GFX90A-NEXT:    v_and_b32_e32 v1, v5, v1
-; GFX90A-NEXT:    v_lshl_or_b32 v1, v0, 16, v1
-; GFX90A-NEXT:    v_and_b32_e32 v0, v5, v3
-; GFX90A-NEXT:    v_lshl_or_b32 v0, v4, 16, v0
-; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
-; GFX90A-NEXT:    s_endpgm
   %r = sdiv <4 x i16> %x, %y
   store <4 x i16> %r, <4 x i16> addrspace(1)* %out
   ret void
@@ -4338,92 +3426,6 @@ define amdgpu_kernel void @srem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v3
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
-;
-; GFX90A-LABEL: srem_v4i16:
-; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
-; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_sext_i32_i16 s0, s6
-; GFX90A-NEXT:    v_cvt_f32_i32_e32 v0, s0
-; GFX90A-NEXT:    s_sext_i32_i16 s1, s4
-; GFX90A-NEXT:    v_cvt_f32_i32_e32 v1, s1
-; GFX90A-NEXT:    s_xor_b32 s0, s1, s0
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v3, v0
-; GFX90A-NEXT:    s_ashr_i32 s0, s0, 30
-; GFX90A-NEXT:    s_or_b32 s8, s0, 1
-; GFX90A-NEXT:    v_mul_f32_e32 v3, v1, v3
-; GFX90A-NEXT:    v_trunc_f32_e32 v3, v3
-; GFX90A-NEXT:    v_mad_f32 v1, -v3, v0, v1
-; GFX90A-NEXT:    v_cvt_i32_f32_e32 v3, v3
-; GFX90A-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, |v0|
-; GFX90A-NEXT:    s_and_b64 s[0:1], s[0:1], exec
-; GFX90A-NEXT:    s_cselect_b32 s0, s8, 0
-; GFX90A-NEXT:    s_ashr_i32 s8, s6, 16
-; GFX90A-NEXT:    v_cvt_f32_i32_e32 v1, s8
-; GFX90A-NEXT:    v_add_u32_e32 v0, s0, v3
-; GFX90A-NEXT:    v_mul_lo_u32 v0, v0, s6
-; GFX90A-NEXT:    v_sub_u32_e32 v0, s4, v0
-; GFX90A-NEXT:    s_ashr_i32 s4, s4, 16
-; GFX90A-NEXT:    v_cvt_f32_i32_e32 v3, s4
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v4, v1
-; GFX90A-NEXT:    s_xor_b32 s0, s4, s8
-; GFX90A-NEXT:    s_ashr_i32 s0, s0, 30
-; GFX90A-NEXT:    s_or_b32 s6, s0, 1
-; GFX90A-NEXT:    v_mul_f32_e32 v4, v3, v4
-; GFX90A-NEXT:    v_trunc_f32_e32 v4, v4
-; GFX90A-NEXT:    v_mad_f32 v3, -v4, v1, v3
-; GFX90A-NEXT:    v_cvt_i32_f32_e32 v4, v4
-; GFX90A-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v3|, |v1|
-; GFX90A-NEXT:    s_and_b64 s[0:1], s[0:1], exec
-; GFX90A-NEXT:    s_cselect_b32 s0, s6, 0
-; GFX90A-NEXT:    v_add_u32_e32 v1, s0, v4
-; GFX90A-NEXT:    s_sext_i32_i16 s0, s7
-; GFX90A-NEXT:    v_cvt_f32_i32_e32 v3, s0
-; GFX90A-NEXT:    v_mul_lo_u32 v1, v1, s8
-; GFX90A-NEXT:    s_sext_i32_i16 s1, s5
-; GFX90A-NEXT:    v_sub_u32_e32 v4, s4, v1
-; GFX90A-NEXT:    v_cvt_f32_i32_e32 v1, s1
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v5, v3
-; GFX90A-NEXT:    s_xor_b32 s0, s1, s0
-; GFX90A-NEXT:    s_ashr_i32 s0, s0, 30
-; GFX90A-NEXT:    s_or_b32 s4, s0, 1
-; GFX90A-NEXT:    v_mul_f32_e32 v5, v1, v5
-; GFX90A-NEXT:    v_trunc_f32_e32 v5, v5
-; GFX90A-NEXT:    v_mad_f32 v1, -v5, v3, v1
-; GFX90A-NEXT:    v_cvt_i32_f32_e32 v5, v5
-; GFX90A-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, |v3|
-; GFX90A-NEXT:    s_and_b64 s[0:1], s[0:1], exec
-; GFX90A-NEXT:    s_cselect_b32 s0, s4, 0
-; GFX90A-NEXT:    s_ashr_i32 s4, s7, 16
-; GFX90A-NEXT:    v_cvt_f32_i32_e32 v3, s4
-; GFX90A-NEXT:    v_add_u32_e32 v1, s0, v5
-; GFX90A-NEXT:    v_mul_lo_u32 v1, v1, s7
-; GFX90A-NEXT:    v_sub_u32_e32 v1, s5, v1
-; GFX90A-NEXT:    s_ashr_i32 s5, s5, 16
-; GFX90A-NEXT:    v_cvt_f32_i32_e32 v5, s5
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v6, v3
-; GFX90A-NEXT:    s_xor_b32 s0, s5, s4
-; GFX90A-NEXT:    s_ashr_i32 s0, s0, 30
-; GFX90A-NEXT:    s_or_b32 s6, s0, 1
-; GFX90A-NEXT:    v_mul_f32_e32 v6, v5, v6
-; GFX90A-NEXT:    v_trunc_f32_e32 v6, v6
-; GFX90A-NEXT:    v_mad_f32 v5, -v6, v3, v5
-; GFX90A-NEXT:    v_cvt_i32_f32_e32 v6, v6
-; GFX90A-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v5|, |v3|
-; GFX90A-NEXT:    s_and_b64 s[0:1], s[0:1], exec
-; GFX90A-NEXT:    s_cselect_b32 s0, s6, 0
-; GFX90A-NEXT:    v_add_u32_e32 v3, s0, v6
-; GFX90A-NEXT:    v_mul_lo_u32 v3, v3, s4
-; GFX90A-NEXT:    v_mov_b32_e32 v5, 0xffff
-; GFX90A-NEXT:    v_sub_u32_e32 v3, s5, v3
-; GFX90A-NEXT:    v_and_b32_e32 v1, v5, v1
-; GFX90A-NEXT:    v_and_b32_e32 v0, v5, v0
-; GFX90A-NEXT:    v_lshl_or_b32 v1, v3, 16, v1
-; GFX90A-NEXT:    v_lshl_or_b32 v0, v4, 16, v0
-; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
-; GFX90A-NEXT:    s_endpgm
   %r = srem <4 x i16> %x, %y
   store <4 x i16> %r, <4 x i16> addrspace(1)* %out
   ret void
@@ -4493,27 +3495,6 @@ define amdgpu_kernel void @udiv_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
 ; GFX9-NEXT:    v_and_b32_e32 v0, 7, v0
 ; GFX9-NEXT:    global_store_byte v2, v0, s[2:3]
 ; GFX9-NEXT:    s_endpgm
-;
-; GFX90A-LABEL: udiv_i3:
-; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_load_dword s4, s[0:1], 0x2c
-; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_bfe_u32 s0, s4, 0x30008
-; GFX90A-NEXT:    v_cvt_f32_ubyte0_e32 v0, s0
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v1, v0
-; GFX90A-NEXT:    s_and_b32 s0, s4, 7
-; GFX90A-NEXT:    v_cvt_f32_ubyte0_e32 v3, s0
-; GFX90A-NEXT:    v_mul_f32_e32 v1, v3, v1
-; GFX90A-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v4, v1
-; GFX90A-NEXT:    v_mad_f32 v1, -v1, v0, v3
-; GFX90A-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
-; GFX90A-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v4, vcc
-; GFX90A-NEXT:    v_and_b32_e32 v0, 7, v0
-; GFX90A-NEXT:    global_store_byte v2, v0, s[2:3]
-; GFX90A-NEXT:    s_endpgm
   %r = udiv i3 %x, %y
   store i3 %r, i3 addrspace(1)* %out
   ret void
@@ -4592,30 +3573,6 @@ define amdgpu_kernel void @urem_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_store_byte v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
-;
-; GFX90A-LABEL: urem_i3:
-; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_load_dword s4, s[0:1], 0x2c
-; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_bfe_u32 s0, s4, 0x30008
-; GFX90A-NEXT:    v_cvt_f32_ubyte0_e32 v1, s0
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v2, v1
-; GFX90A-NEXT:    s_and_b32 s1, s4, 7
-; GFX90A-NEXT:    v_cvt_f32_ubyte0_e32 v3, s1
-; GFX90A-NEXT:    s_lshr_b32 s0, s4, 8
-; GFX90A-NEXT:    v_mul_f32_e32 v2, v3, v2
-; GFX90A-NEXT:    v_trunc_f32_e32 v2, v2
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v4, v2
-; GFX90A-NEXT:    v_mad_f32 v2, -v2, v1, v3
-; GFX90A-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v1
-; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v4, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v1, v1, s0
-; GFX90A-NEXT:    v_sub_u32_e32 v1, s4, v1
-; GFX90A-NEXT:    v_and_b32_e32 v1, 7, v1
-; GFX90A-NEXT:    global_store_byte v0, v1, s[2:3]
-; GFX90A-NEXT:    s_endpgm
   %r = urem i3 %x, %y
   store i3 %r, i3 addrspace(1)* %out
   ret void
@@ -4699,32 +3656,6 @@ define amdgpu_kernel void @sdiv_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
 ; GFX9-NEXT:    v_and_b32_e32 v0, 7, v0
 ; GFX9-NEXT:    global_store_byte v1, v0, s[2:3]
 ; GFX9-NEXT:    s_endpgm
-;
-; GFX90A-LABEL: sdiv_i3:
-; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_load_dword s4, s[0:1], 0x2c
-; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX90A-NEXT:    v_mov_b32_e32 v1, 0
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_bfe_i32 s0, s4, 0x30008
-; GFX90A-NEXT:    v_cvt_f32_i32_e32 v0, s0
-; GFX90A-NEXT:    s_bfe_i32 s1, s4, 0x30000
-; GFX90A-NEXT:    v_cvt_f32_i32_e32 v2, s1
-; GFX90A-NEXT:    s_xor_b32 s0, s1, s0
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v3, v0
-; GFX90A-NEXT:    s_ashr_i32 s0, s0, 30
-; GFX90A-NEXT:    s_or_b32 s4, s0, 1
-; GFX90A-NEXT:    v_mul_f32_e32 v3, v2, v3
-; GFX90A-NEXT:    v_trunc_f32_e32 v3, v3
-; GFX90A-NEXT:    v_mad_f32 v2, -v3, v0, v2
-; GFX90A-NEXT:    v_cvt_i32_f32_e32 v3, v3
-; GFX90A-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v2|, |v0|
-; GFX90A-NEXT:    s_and_b64 s[0:1], s[0:1], exec
-; GFX90A-NEXT:    s_cselect_b32 s0, s4, 0
-; GFX90A-NEXT:    v_add_u32_e32 v0, s0, v3
-; GFX90A-NEXT:    v_and_b32_e32 v0, 7, v0
-; GFX90A-NEXT:    global_store_byte v1, v0, s[2:3]
-; GFX90A-NEXT:    s_endpgm
   %r = sdiv i3 %x, %y
   store i3 %r, i3 addrspace(1)* %out
   ret void
@@ -4817,35 +3748,6 @@ define amdgpu_kernel void @srem_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_store_byte v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
-;
-; GFX90A-LABEL: srem_i3:
-; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_load_dword s4, s[0:1], 0x2c
-; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_bfe_i32 s0, s4, 0x30008
-; GFX90A-NEXT:    v_cvt_f32_i32_e32 v1, s0
-; GFX90A-NEXT:    s_bfe_i32 s1, s4, 0x30000
-; GFX90A-NEXT:    v_cvt_f32_i32_e32 v2, s1
-; GFX90A-NEXT:    s_xor_b32 s0, s1, s0
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v3, v1
-; GFX90A-NEXT:    s_ashr_i32 s0, s0, 30
-; GFX90A-NEXT:    s_lshr_b32 s5, s4, 8
-; GFX90A-NEXT:    s_or_b32 s6, s0, 1
-; GFX90A-NEXT:    v_mul_f32_e32 v3, v2, v3
-; GFX90A-NEXT:    v_trunc_f32_e32 v3, v3
-; GFX90A-NEXT:    v_mad_f32 v2, -v3, v1, v2
-; GFX90A-NEXT:    v_cvt_i32_f32_e32 v3, v3
-; GFX90A-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v2|, |v1|
-; GFX90A-NEXT:    s_and_b64 s[0:1], s[0:1], exec
-; GFX90A-NEXT:    s_cselect_b32 s0, s6, 0
-; GFX90A-NEXT:    v_add_u32_e32 v1, s0, v3
-; GFX90A-NEXT:    v_mul_lo_u32 v1, v1, s5
-; GFX90A-NEXT:    v_sub_u32_e32 v1, s4, v1
-; GFX90A-NEXT:    v_and_b32_e32 v1, 7, v1
-; GFX90A-NEXT:    global_store_byte v0, v1, s[2:3]
-; GFX90A-NEXT:    s_endpgm
   %r = srem i3 %x, %y
   store i3 %r, i3 addrspace(1)* %out
   ret void
@@ -5011,53 +3913,6 @@ define amdgpu_kernel void @udiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x
 ; GFX9-NEXT:    global_store_short v1, v3, s[4:5] offset:4
 ; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-NEXT:    s_endpgm
-;
-; GFX90A-LABEL: udiv_v3i16:
-; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
-; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x2c
-; GFX90A-NEXT:    s_mov_b32 s0, 0xffff
-; GFX90A-NEXT:    v_mov_b32_e32 v1, 0
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_and_b32 s1, s2, s0
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, s1
-; GFX90A-NEXT:    s_and_b32 s1, s6, s0
-; GFX90A-NEXT:    s_lshr_b32 s2, s2, 16
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v2, s1
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v3, v0
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v4, s2
-; GFX90A-NEXT:    s_lshr_b32 s1, s6, 16
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v5, s1
-; GFX90A-NEXT:    v_mul_f32_e32 v3, v2, v3
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v6, v4
-; GFX90A-NEXT:    v_trunc_f32_e32 v3, v3
-; GFX90A-NEXT:    v_mad_f32 v2, -v3, v0, v2
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GFX90A-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v0
-; GFX90A-NEXT:    v_mul_f32_e32 v2, v5, v6
-; GFX90A-NEXT:    v_trunc_f32_e32 v2, v2
-; GFX90A-NEXT:    s_and_b32 s1, s3, s0
-; GFX90A-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
-; GFX90A-NEXT:    v_mad_f32 v3, -v2, v4, v5
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v5, s1
-; GFX90A-NEXT:    s_and_b32 s0, s7, s0
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v6, s0
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v7, v5
-; GFX90A-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v4
-; GFX90A-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX90A-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
-; GFX90A-NEXT:    v_mul_f32_e32 v3, v6, v7
-; GFX90A-NEXT:    v_trunc_f32_e32 v3, v3
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v4, v3
-; GFX90A-NEXT:    v_mad_f32 v3, -v3, v5, v6
-; GFX90A-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v5
-; GFX90A-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
-; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
-; GFX90A-NEXT:    global_store_short v1, v3, s[4:5] offset:4
-; GFX90A-NEXT:    global_store_dword v1, v0, s[4:5]
-; GFX90A-NEXT:    s_endpgm
   %r = udiv <3 x i16> %x, %y
   store <3 x i16> %r, <3 x i16> addrspace(1)* %out
   ret void
@@ -5245,59 +4100,6 @@ define amdgpu_kernel void @urem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x
 ; GFX9-NEXT:    global_store_short v3, v2, s[6:7] offset:4
 ; GFX9-NEXT:    global_store_dword v3, v0, s[6:7]
 ; GFX9-NEXT:    s_endpgm
-;
-; GFX90A-LABEL: urem_v3i16:
-; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
-; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x2c
-; GFX90A-NEXT:    s_mov_b32 s0, 0xffff
-; GFX90A-NEXT:    v_mov_b32_e32 v1, 0
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_and_b32 s1, s2, s0
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, s1
-; GFX90A-NEXT:    s_and_b32 s8, s6, s0
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v2, s8
-; GFX90A-NEXT:    s_lshr_b32 s2, s2, 16
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v3, v0
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v4, s2
-; GFX90A-NEXT:    s_lshr_b32 s6, s6, 16
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v5, s6
-; GFX90A-NEXT:    v_mul_f32_e32 v3, v2, v3
-; GFX90A-NEXT:    v_trunc_f32_e32 v3, v3
-; GFX90A-NEXT:    v_mad_f32 v2, -v3, v0, v2
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v6, v4
-; GFX90A-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v0
-; GFX90A-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
-; GFX90A-NEXT:    v_mul_f32_e32 v2, v5, v6
-; GFX90A-NEXT:    v_mul_lo_u32 v0, v0, s1
-; GFX90A-NEXT:    v_trunc_f32_e32 v2, v2
-; GFX90A-NEXT:    s_and_b32 s1, s3, s0
-; GFX90A-NEXT:    v_mad_f32 v3, -v2, v4, v5
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v5, s1
-; GFX90A-NEXT:    s_and_b32 s0, s7, s0
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v6, s0
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v7, v5
-; GFX90A-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v4
-; GFX90A-NEXT:    v_sub_u32_e32 v0, s8, v0
-; GFX90A-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
-; GFX90A-NEXT:    v_mul_f32_e32 v3, v6, v7
-; GFX90A-NEXT:    v_trunc_f32_e32 v3, v3
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v4, v3
-; GFX90A-NEXT:    v_mad_f32 v3, -v3, v5, v6
-; GFX90A-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v5
-; GFX90A-NEXT:    v_mul_lo_u32 v2, v2, s2
-; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v3, v3, s1
-; GFX90A-NEXT:    v_sub_u32_e32 v2, s6, v2
-; GFX90A-NEXT:    v_sub_u32_e32 v3, s0, v3
-; GFX90A-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX90A-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
-; GFX90A-NEXT:    global_store_short v1, v3, s[4:5] offset:4
-; GFX90A-NEXT:    global_store_dword v1, v0, s[4:5]
-; GFX90A-NEXT:    s_endpgm
   %r = urem <3 x i16> %x, %y
   store <3 x i16> %r, <3 x i16> addrspace(1)* %out
   ret void
@@ -5503,67 +4305,6 @@ define amdgpu_kernel void @sdiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x
 ; GFX9-NEXT:    global_store_short v1, v0, s[6:7] offset:4
 ; GFX9-NEXT:    global_store_dword v1, v2, s[6:7]
 ; GFX9-NEXT:    s_endpgm
-;
-; GFX90A-LABEL: sdiv_v3i16:
-; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
-; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x24
-; GFX90A-NEXT:    v_mov_b32_e32 v1, 0
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_sext_i32_i16 s0, s2
-; GFX90A-NEXT:    v_cvt_f32_i32_e32 v0, s0
-; GFX90A-NEXT:    s_sext_i32_i16 s1, s4
-; GFX90A-NEXT:    v_cvt_f32_i32_e32 v2, s1
-; GFX90A-NEXT:    s_xor_b32 s0, s1, s0
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v3, v0
-; GFX90A-NEXT:    s_ashr_i32 s0, s0, 30
-; GFX90A-NEXT:    s_or_b32 s8, s0, 1
-; GFX90A-NEXT:    v_mul_f32_e32 v3, v2, v3
-; GFX90A-NEXT:    v_trunc_f32_e32 v3, v3
-; GFX90A-NEXT:    v_mad_f32 v2, -v3, v0, v2
-; GFX90A-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v2|, |v0|
-; GFX90A-NEXT:    s_and_b64 s[0:1], s[0:1], exec
-; GFX90A-NEXT:    s_cselect_b32 s0, s8, 0
-; GFX90A-NEXT:    s_ashr_i32 s1, s2, 16
-; GFX90A-NEXT:    v_cvt_i32_f32_e32 v3, v3
-; GFX90A-NEXT:    v_cvt_f32_i32_e32 v0, s1
-; GFX90A-NEXT:    s_ashr_i32 s2, s4, 16
-; GFX90A-NEXT:    v_add_u32_e32 v2, s0, v3
-; GFX90A-NEXT:    v_cvt_f32_i32_e32 v3, s2
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v4, v0
-; GFX90A-NEXT:    s_xor_b32 s0, s2, s1
-; GFX90A-NEXT:    s_ashr_i32 s0, s0, 30
-; GFX90A-NEXT:    s_or_b32 s2, s0, 1
-; GFX90A-NEXT:    v_mul_f32_e32 v4, v3, v4
-; GFX90A-NEXT:    v_trunc_f32_e32 v4, v4
-; GFX90A-NEXT:    v_mad_f32 v3, -v4, v0, v3
-; GFX90A-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v3|, |v0|
-; GFX90A-NEXT:    s_and_b64 s[0:1], s[0:1], exec
-; GFX90A-NEXT:    v_cvt_i32_f32_e32 v4, v4
-; GFX90A-NEXT:    s_sext_i32_i16 s1, s3
-; GFX90A-NEXT:    v_cvt_f32_i32_e32 v0, s1
-; GFX90A-NEXT:    s_cselect_b32 s0, s2, 0
-; GFX90A-NEXT:    v_add_u32_e32 v3, s0, v4
-; GFX90A-NEXT:    s_sext_i32_i16 s0, s5
-; GFX90A-NEXT:    v_cvt_f32_i32_e32 v4, s0
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v5, v0
-; GFX90A-NEXT:    s_xor_b32 s0, s0, s1
-; GFX90A-NEXT:    s_ashr_i32 s0, s0, 30
-; GFX90A-NEXT:    s_or_b32 s2, s0, 1
-; GFX90A-NEXT:    v_mul_f32_e32 v5, v4, v5
-; GFX90A-NEXT:    v_trunc_f32_e32 v5, v5
-; GFX90A-NEXT:    v_mad_f32 v4, -v5, v0, v4
-; GFX90A-NEXT:    v_cvt_i32_f32_e32 v5, v5
-; GFX90A-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v4|, |v0|
-; GFX90A-NEXT:    s_and_b64 s[0:1], s[0:1], exec
-; GFX90A-NEXT:    s_cselect_b32 s0, s2, 0
-; GFX90A-NEXT:    v_add_u32_e32 v0, s0, v5
-; GFX90A-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX90A-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
-; GFX90A-NEXT:    global_store_short v1, v0, s[6:7] offset:4
-; GFX90A-NEXT:    global_store_dword v1, v2, s[6:7]
-; GFX90A-NEXT:    s_endpgm
   %r = sdiv <3 x i16> %x, %y
   store <3 x i16> %r, <3 x i16> addrspace(1)* %out
   ret void
@@ -5791,73 +4532,6 @@ define amdgpu_kernel void @srem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x
 ; GFX9-NEXT:    global_store_short v3, v2, s[0:1] offset:4
 ; GFX9-NEXT:    global_store_dword v3, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
-;
-; GFX90A-LABEL: srem_v3i16:
-; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
-; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x24
-; GFX90A-NEXT:    v_mov_b32_e32 v1, 0
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_sext_i32_i16 s8, s2
-; GFX90A-NEXT:    v_cvt_f32_i32_e32 v0, s8
-; GFX90A-NEXT:    s_sext_i32_i16 s9, s4
-; GFX90A-NEXT:    v_cvt_f32_i32_e32 v2, s9
-; GFX90A-NEXT:    s_xor_b32 s0, s9, s8
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v3, v0
-; GFX90A-NEXT:    s_ashr_i32 s0, s0, 30
-; GFX90A-NEXT:    s_or_b32 s10, s0, 1
-; GFX90A-NEXT:    v_mul_f32_e32 v3, v2, v3
-; GFX90A-NEXT:    v_trunc_f32_e32 v3, v3
-; GFX90A-NEXT:    v_mad_f32 v2, -v3, v0, v2
-; GFX90A-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v2|, |v0|
-; GFX90A-NEXT:    s_and_b64 s[0:1], s[0:1], exec
-; GFX90A-NEXT:    s_cselect_b32 s0, s10, 0
-; GFX90A-NEXT:    s_ashr_i32 s2, s2, 16
-; GFX90A-NEXT:    v_cvt_i32_f32_e32 v3, v3
-; GFX90A-NEXT:    v_cvt_f32_i32_e32 v2, s2
-; GFX90A-NEXT:    s_ashr_i32 s4, s4, 16
-; GFX90A-NEXT:    v_add_u32_e32 v0, s0, v3
-; GFX90A-NEXT:    v_cvt_f32_i32_e32 v3, s4
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v4, v2
-; GFX90A-NEXT:    s_xor_b32 s0, s4, s2
-; GFX90A-NEXT:    s_ashr_i32 s0, s0, 30
-; GFX90A-NEXT:    v_mul_lo_u32 v0, v0, s8
-; GFX90A-NEXT:    v_mul_f32_e32 v4, v3, v4
-; GFX90A-NEXT:    v_trunc_f32_e32 v4, v4
-; GFX90A-NEXT:    v_mad_f32 v3, -v4, v2, v3
-; GFX90A-NEXT:    v_cvt_i32_f32_e32 v4, v4
-; GFX90A-NEXT:    s_or_b32 s8, s0, 1
-; GFX90A-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v3|, |v2|
-; GFX90A-NEXT:    s_and_b64 s[0:1], s[0:1], exec
-; GFX90A-NEXT:    s_cselect_b32 s0, s8, 0
-; GFX90A-NEXT:    v_add_u32_e32 v2, s0, v4
-; GFX90A-NEXT:    v_mul_lo_u32 v2, v2, s2
-; GFX90A-NEXT:    s_sext_i32_i16 s2, s3
-; GFX90A-NEXT:    v_cvt_f32_i32_e32 v3, s2
-; GFX90A-NEXT:    s_sext_i32_i16 s3, s5
-; GFX90A-NEXT:    v_cvt_f32_i32_e32 v4, s3
-; GFX90A-NEXT:    s_xor_b32 s0, s3, s2
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v5, v3
-; GFX90A-NEXT:    s_ashr_i32 s0, s0, 30
-; GFX90A-NEXT:    v_sub_u32_e32 v2, s4, v2
-; GFX90A-NEXT:    s_or_b32 s4, s0, 1
-; GFX90A-NEXT:    v_mul_f32_e32 v5, v4, v5
-; GFX90A-NEXT:    v_trunc_f32_e32 v5, v5
-; GFX90A-NEXT:    v_mad_f32 v4, -v5, v3, v4
-; GFX90A-NEXT:    v_cvt_i32_f32_e32 v5, v5
-; GFX90A-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v4|, |v3|
-; GFX90A-NEXT:    s_and_b64 s[0:1], s[0:1], exec
-; GFX90A-NEXT:    s_cselect_b32 s0, s4, 0
-; GFX90A-NEXT:    v_add_u32_e32 v3, s0, v5
-; GFX90A-NEXT:    v_sub_u32_e32 v0, s9, v0
-; GFX90A-NEXT:    v_mul_lo_u32 v3, v3, s2
-; GFX90A-NEXT:    v_sub_u32_e32 v3, s3, v3
-; GFX90A-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX90A-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
-; GFX90A-NEXT:    global_store_short v1, v3, s[6:7] offset:4
-; GFX90A-NEXT:    global_store_dword v1, v0, s[6:7]
-; GFX90A-NEXT:    s_endpgm
   %r = srem <3 x i16> %x, %y
   store <3 x i16> %r, <3 x i16> addrspace(1)* %out
   ret void
@@ -6041,62 +4715,6 @@ define amdgpu_kernel void @udiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
 ; GFX9-NEXT:    global_store_short v2, v0, s[4:5] offset:4
 ; GFX9-NEXT:    s_endpgm
-;
-; GFX90A-LABEL: udiv_v3i15:
-; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
-; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
-; GFX90A-NEXT:    s_movk_i32 s6, 0x7fff
-; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-NEXT:    v_alignbit_b32 v0, s3, v0, 30
-; GFX90A-NEXT:    s_and_b32 s3, s2, s6
-; GFX90A-NEXT:    s_and_b32 s7, s0, s6
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v1, s7
-; GFX90A-NEXT:    v_mov_b32_e32 v3, s0
-; GFX90A-NEXT:    s_bfe_u32 s0, s0, 0xf000f
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v4, s3
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v5, v1
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v6, s0
-; GFX90A-NEXT:    s_bfe_u32 s2, s2, 0xf000f
-; GFX90A-NEXT:    v_alignbit_b32 v3, s1, v3, 30
-; GFX90A-NEXT:    v_mul_f32_e32 v5, v4, v5
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v7, s2
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v8, v6
-; GFX90A-NEXT:    v_and_b32_e32 v3, s6, v3
-; GFX90A-NEXT:    v_trunc_f32_e32 v5, v5
-; GFX90A-NEXT:    v_mad_f32 v4, -v5, v1, v4
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v3, v3
-; GFX90A-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, v1
-; GFX90A-NEXT:    v_mul_f32_e32 v1, v7, v8
-; GFX90A-NEXT:    v_and_b32_e32 v0, s6, v0
-; GFX90A-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v5, vcc
-; GFX90A-NEXT:    v_mad_f32 v5, -v1, v6, v7
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, v0
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v7, v3
-; GFX90A-NEXT:    v_cmp_ge_f32_e64 vcc, |v5|, v6
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
-; GFX90A-NEXT:    v_mul_f32_e32 v1, v0, v7
-; GFX90A-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v6, v1
-; GFX90A-NEXT:    v_mad_f32 v0, -v1, v3, v0
-; GFX90A-NEXT:    v_cmp_ge_f32_e64 vcc, |v0|, v3
-; GFX90A-NEXT:    v_and_b32_e32 v3, s6, v4
-; GFX90A-NEXT:    v_and_b32_e32 v4, s6, v5
-; GFX90A-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v6, vcc
-; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 15, v4
-; GFX90A-NEXT:    v_lshlrev_b64 v[0:1], 30, v[0:1]
-; GFX90A-NEXT:    v_or_b32_e32 v3, v3, v4
-; GFX90A-NEXT:    v_or_b32_e32 v0, v3, v0
-; GFX90A-NEXT:    global_store_dword v2, v0, s[4:5]
-; GFX90A-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
-; GFX90A-NEXT:    global_store_short v2, v0, s[4:5] offset:4
-; GFX90A-NEXT:    s_endpgm
   %r = udiv <3 x i15> %x, %y
   store <3 x i15> %r, <3 x i15> addrspace(1)* %out
   ret void
@@ -6302,70 +4920,6 @@ define amdgpu_kernel void @urem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
 ; GFX9-NEXT:    global_store_short v2, v0, s[4:5] offset:4
 ; GFX9-NEXT:    s_endpgm
-;
-; GFX90A-LABEL: urem_v3i15:
-; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
-; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
-; GFX90A-NEXT:    s_movk_i32 s6, 0x7fff
-; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_and_b32 s7, s2, s6
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v4, s7
-; GFX90A-NEXT:    s_bfe_u32 s8, s2, 0xf000f
-; GFX90A-NEXT:    s_and_b32 s9, s0, s6
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v1, s9
-; GFX90A-NEXT:    s_bfe_u32 s7, s0, 0xf000f
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v6, s7
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v7, s8
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v5, v1
-; GFX90A-NEXT:    v_mov_b32_e32 v3, s0
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v8, v6
-; GFX90A-NEXT:    v_alignbit_b32 v3, s1, v3, 30
-; GFX90A-NEXT:    v_mul_f32_e32 v5, v4, v5
-; GFX90A-NEXT:    v_trunc_f32_e32 v5, v5
-; GFX90A-NEXT:    v_mad_f32 v4, -v5, v1, v4
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; GFX90A-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, v1
-; GFX90A-NEXT:    v_and_b32_e32 v3, s6, v3
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v5, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v1, v1, s0
-; GFX90A-NEXT:    v_sub_u32_e32 v4, s2, v1
-; GFX90A-NEXT:    v_mul_f32_e32 v1, v7, v8
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v5, v3
-; GFX90A-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX90A-NEXT:    v_alignbit_b32 v0, s3, v0, 30
-; GFX90A-NEXT:    v_mad_f32 v7, -v1, v6, v7
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX90A-NEXT:    v_and_b32_e32 v0, s6, v0
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v8, v0
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v9, v5
-; GFX90A-NEXT:    v_cmp_ge_f32_e64 vcc, |v7|, v6
-; GFX90A-NEXT:    s_lshr_b32 s1, s0, 15
-; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX90A-NEXT:    s_lshr_b32 s3, s2, 15
-; GFX90A-NEXT:    v_mul_lo_u32 v1, v1, s1
-; GFX90A-NEXT:    v_sub_u32_e32 v6, s3, v1
-; GFX90A-NEXT:    v_mul_f32_e32 v1, v8, v9
-; GFX90A-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v7, v1
-; GFX90A-NEXT:    v_mad_f32 v1, -v1, v5, v8
-; GFX90A-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v5
-; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v7, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v1, v1, v3
-; GFX90A-NEXT:    v_and_b32_e32 v3, s6, v4
-; GFX90A-NEXT:    v_and_b32_e32 v4, s6, v6
-; GFX90A-NEXT:    v_sub_u32_e32 v0, v0, v1
-; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 15, v4
-; GFX90A-NEXT:    v_lshlrev_b64 v[0:1], 30, v[0:1]
-; GFX90A-NEXT:    v_or_b32_e32 v3, v3, v4
-; GFX90A-NEXT:    v_or_b32_e32 v0, v3, v0
-; GFX90A-NEXT:    global_store_dword v2, v0, s[4:5]
-; GFX90A-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
-; GFX90A-NEXT:    global_store_short v2, v0, s[4:5] offset:4
-; GFX90A-NEXT:    s_endpgm
   %r = urem <3 x i15> %x, %y
   store <3 x i15> %r, <3 x i15> addrspace(1)* %out
   ret void
@@ -6589,76 +5143,6 @@ define amdgpu_kernel void @sdiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
 ; GFX9-NEXT:    global_store_short v2, v0, s[6:7] offset:4
 ; GFX9-NEXT:    s_endpgm
-;
-; GFX90A-LABEL: sdiv_v3i15:
-; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
-; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x24
-; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_bfe_i32 s1, s2, 0xf0000
-; GFX90A-NEXT:    s_bfe_i32 s0, s4, 0xf0000
-; GFX90A-NEXT:    v_cvt_f32_i32_e32 v3, s0
-; GFX90A-NEXT:    v_cvt_f32_i32_e32 v4, s1
-; GFX90A-NEXT:    s_xor_b32 s0, s1, s0
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v5, v3
-; GFX90A-NEXT:    s_ashr_i32 s0, s0, 30
-; GFX90A-NEXT:    v_alignbit_b32 v0, s3, v0, 30
-; GFX90A-NEXT:    s_or_b32 s3, s0, 1
-; GFX90A-NEXT:    v_mul_f32_e32 v5, v4, v5
-; GFX90A-NEXT:    v_trunc_f32_e32 v5, v5
-; GFX90A-NEXT:    v_mad_f32 v4, -v5, v3, v4
-; GFX90A-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v4|, |v3|
-; GFX90A-NEXT:    s_and_b64 s[0:1], s[0:1], exec
-; GFX90A-NEXT:    v_cvt_i32_f32_e32 v5, v5
-; GFX90A-NEXT:    s_cselect_b32 s0, s3, 0
-; GFX90A-NEXT:    s_bfe_i32 s1, s4, 0xf000f
-; GFX90A-NEXT:    v_cvt_f32_i32_e32 v3, s1
-; GFX90A-NEXT:    v_add_u32_e32 v4, s0, v5
-; GFX90A-NEXT:    s_bfe_i32 s0, s2, 0xf000f
-; GFX90A-NEXT:    v_cvt_f32_i32_e32 v5, s0
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v6, v3
-; GFX90A-NEXT:    v_mov_b32_e32 v1, s4
-; GFX90A-NEXT:    v_alignbit_b32 v1, s5, v1, 30
-; GFX90A-NEXT:    s_xor_b32 s0, s0, s1
-; GFX90A-NEXT:    v_mul_f32_e32 v6, v5, v6
-; GFX90A-NEXT:    v_trunc_f32_e32 v6, v6
-; GFX90A-NEXT:    s_ashr_i32 s0, s0, 30
-; GFX90A-NEXT:    v_mad_f32 v5, -v6, v3, v5
-; GFX90A-NEXT:    v_bfe_i32 v1, v1, 0, 15
-; GFX90A-NEXT:    s_or_b32 s2, s0, 1
-; GFX90A-NEXT:    v_cvt_i32_f32_e32 v6, v6
-; GFX90A-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v5|, |v3|
-; GFX90A-NEXT:    v_cvt_f32_i32_e32 v3, v1
-; GFX90A-NEXT:    s_and_b64 s[0:1], s[0:1], exec
-; GFX90A-NEXT:    s_cselect_b32 s0, s2, 0
-; GFX90A-NEXT:    v_bfe_i32 v0, v0, 0, 15
-; GFX90A-NEXT:    v_add_u32_e32 v5, s0, v6
-; GFX90A-NEXT:    v_cvt_f32_i32_e32 v6, v0
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v7, v3
-; GFX90A-NEXT:    v_xor_b32_e32 v0, v0, v1
-; GFX90A-NEXT:    v_ashrrev_i32_e32 v0, 30, v0
-; GFX90A-NEXT:    v_or_b32_e32 v0, 1, v0
-; GFX90A-NEXT:    v_mul_f32_e32 v1, v6, v7
-; GFX90A-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX90A-NEXT:    v_cvt_i32_f32_e32 v7, v1
-; GFX90A-NEXT:    v_mad_f32 v1, -v1, v3, v6
-; GFX90A-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v3|
-; GFX90A-NEXT:    s_movk_i32 s0, 0x7fff
-; GFX90A-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GFX90A-NEXT:    v_and_b32_e32 v3, s0, v4
-; GFX90A-NEXT:    v_and_b32_e32 v4, s0, v5
-; GFX90A-NEXT:    v_add_u32_e32 v0, v7, v0
-; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 15, v4
-; GFX90A-NEXT:    v_lshlrev_b64 v[0:1], 30, v[0:1]
-; GFX90A-NEXT:    v_or_b32_e32 v3, v3, v4
-; GFX90A-NEXT:    v_or_b32_e32 v0, v3, v0
-; GFX90A-NEXT:    global_store_dword v2, v0, s[6:7]
-; GFX90A-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
-; GFX90A-NEXT:    global_store_short v2, v0, s[6:7] offset:4
-; GFX90A-NEXT:    s_endpgm
   %r = sdiv <3 x i15> %x, %y
   store <3 x i15> %r, <3 x i15> addrspace(1)* %out
   ret void
@@ -6916,90 +5400,6 @@ define amdgpu_kernel void @srem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
 ; GFX9-NEXT:    global_store_short v4, v0, s[4:5] offset:4
 ; GFX9-NEXT:    s_endpgm
-;
-; GFX90A-LABEL: srem_v3i15:
-; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
-; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
-; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_and_b32 s6, s2, s8
-; GFX90A-NEXT:    s_bfe_i32 s6, s6, 0xf0000
-; GFX90A-NEXT:    v_cvt_f32_i32_e32 v4, s6
-; GFX90A-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90A-NEXT:    v_alignbit_b32 v1, s1, v1, 30
-; GFX90A-NEXT:    s_and_b32 s1, s0, s8
-; GFX90A-NEXT:    s_bfe_i32 s1, s1, 0xf0000
-; GFX90A-NEXT:    v_cvt_f32_i32_e32 v3, s1
-; GFX90A-NEXT:    s_xor_b32 s1, s6, s1
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-NEXT:    s_ashr_i32 s1, s1, 30
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v5, v3
-; GFX90A-NEXT:    v_alignbit_b32 v0, s3, v0, 30
-; GFX90A-NEXT:    s_lshr_b32 s3, s2, 15
-; GFX90A-NEXT:    s_bfe_u32 s9, s2, 0xf000f
-; GFX90A-NEXT:    v_mul_f32_e32 v5, v4, v5
-; GFX90A-NEXT:    v_trunc_f32_e32 v5, v5
-; GFX90A-NEXT:    v_mad_f32 v4, -v5, v3, v4
-; GFX90A-NEXT:    v_cvt_i32_f32_e32 v5, v5
-; GFX90A-NEXT:    s_lshr_b32 s10, s0, 15
-; GFX90A-NEXT:    s_bfe_u32 s11, s0, 0xf000f
-; GFX90A-NEXT:    s_or_b32 s1, s1, 1
-; GFX90A-NEXT:    v_cmp_ge_f32_e64 s[6:7], |v4|, |v3|
-; GFX90A-NEXT:    s_and_b64 s[6:7], s[6:7], exec
-; GFX90A-NEXT:    s_cselect_b32 s1, s1, 0
-; GFX90A-NEXT:    v_add_u32_e32 v3, s1, v5
-; GFX90A-NEXT:    v_mul_lo_u32 v3, v3, s0
-; GFX90A-NEXT:    s_bfe_i32 s0, s11, 0xf0000
-; GFX90A-NEXT:    v_cvt_f32_i32_e32 v4, s0
-; GFX90A-NEXT:    s_bfe_i32 s1, s9, 0xf0000
-; GFX90A-NEXT:    v_cvt_f32_i32_e32 v5, s1
-; GFX90A-NEXT:    s_xor_b32 s0, s1, s0
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v6, v4
-; GFX90A-NEXT:    s_ashr_i32 s0, s0, 30
-; GFX90A-NEXT:    v_sub_u32_e32 v3, s2, v3
-; GFX90A-NEXT:    s_or_b32 s2, s0, 1
-; GFX90A-NEXT:    v_mul_f32_e32 v6, v5, v6
-; GFX90A-NEXT:    v_trunc_f32_e32 v6, v6
-; GFX90A-NEXT:    v_mad_f32 v5, -v6, v4, v5
-; GFX90A-NEXT:    v_cvt_i32_f32_e32 v6, v6
-; GFX90A-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v5|, |v4|
-; GFX90A-NEXT:    v_and_b32_e32 v1, s8, v1
-; GFX90A-NEXT:    s_and_b64 s[0:1], s[0:1], exec
-; GFX90A-NEXT:    s_cselect_b32 s0, s2, 0
-; GFX90A-NEXT:    v_bfe_i32 v5, v1, 0, 15
-; GFX90A-NEXT:    v_add_u32_e32 v4, s0, v6
-; GFX90A-NEXT:    v_cvt_f32_i32_e32 v6, v5
-; GFX90A-NEXT:    v_and_b32_e32 v0, s8, v0
-; GFX90A-NEXT:    v_bfe_i32 v7, v0, 0, 15
-; GFX90A-NEXT:    v_cvt_f32_i32_e32 v8, v7
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v9, v6
-; GFX90A-NEXT:    v_xor_b32_e32 v5, v7, v5
-; GFX90A-NEXT:    v_ashrrev_i32_e32 v5, 30, v5
-; GFX90A-NEXT:    v_or_b32_e32 v5, 1, v5
-; GFX90A-NEXT:    v_mul_f32_e32 v7, v8, v9
-; GFX90A-NEXT:    v_trunc_f32_e32 v7, v7
-; GFX90A-NEXT:    v_cvt_i32_f32_e32 v9, v7
-; GFX90A-NEXT:    v_mad_f32 v7, -v7, v6, v8
-; GFX90A-NEXT:    v_cmp_ge_f32_e64 vcc, |v7|, |v6|
-; GFX90A-NEXT:    v_mul_lo_u32 v4, v4, s10
-; GFX90A-NEXT:    v_cndmask_b32_e32 v5, 0, v5, vcc
-; GFX90A-NEXT:    v_sub_u32_e32 v4, s3, v4
-; GFX90A-NEXT:    v_add_u32_e32 v5, v9, v5
-; GFX90A-NEXT:    v_mul_lo_u32 v1, v5, v1
-; GFX90A-NEXT:    v_and_b32_e32 v4, s8, v4
-; GFX90A-NEXT:    v_sub_u32_e32 v0, v0, v1
-; GFX90A-NEXT:    v_and_b32_e32 v3, s8, v3
-; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 15, v4
-; GFX90A-NEXT:    v_lshlrev_b64 v[0:1], 30, v[0:1]
-; GFX90A-NEXT:    v_or_b32_e32 v3, v3, v4
-; GFX90A-NEXT:    v_or_b32_e32 v0, v3, v0
-; GFX90A-NEXT:    global_store_dword v2, v0, s[4:5]
-; GFX90A-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
-; GFX90A-NEXT:    global_store_short v2, v0, s[4:5] offset:4
-; GFX90A-NEXT:    s_endpgm
   %r = srem <3 x i15> %x, %y
   store <3 x i15> %r, <3 x i15> addrspace(1)* %out
   ret void
@@ -7041,21 +5441,6 @@ define amdgpu_kernel void @udiv_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) {
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
 ; GFX9-NEXT:    s_endpgm
-;
-; GFX90A-LABEL: udiv_i32_oddk_denom:
-; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_load_dword s4, s[0:1], 0x2c
-; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_mul_hi_u32 s0, s4, 0xb2a50881
-; GFX90A-NEXT:    s_sub_i32 s1, s4, s0
-; GFX90A-NEXT:    s_lshr_b32 s1, s1, 1
-; GFX90A-NEXT:    s_add_i32 s1, s1, s0
-; GFX90A-NEXT:    s_lshr_b32 s0, s1, 20
-; GFX90A-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90A-NEXT:    global_store_dword v0, v1, s[2:3]
-; GFX90A-NEXT:    s_endpgm
   %r = udiv i32 %x, 1235195
   store i32 %r, i32 addrspace(1)* %out
   ret void
@@ -7089,17 +5474,6 @@ define amdgpu_kernel void @udiv_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
 ; GFX9-NEXT:    s_endpgm
-;
-; GFX90A-LABEL: udiv_i32_pow2k_denom:
-; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_load_dword s4, s[0:1], 0x2c
-; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_lshr_b32 s0, s4, 12
-; GFX90A-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90A-NEXT:    global_store_dword v0, v1, s[2:3]
-; GFX90A-NEXT:    s_endpgm
   %r = udiv i32 %x, 4096
   store i32 %r, i32 addrspace(1)* %out
   ret void
@@ -7136,18 +5510,6 @@ define amdgpu_kernel void @udiv_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
-;
-; GFX90A-LABEL: udiv_i32_pow2_shl_denom:
-; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
-; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
-; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_add_i32 s0, s3, 12
-; GFX90A-NEXT:    s_lshr_b32 s0, s2, s0
-; GFX90A-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90A-NEXT:    global_store_dword v0, v1, s[4:5]
-; GFX90A-NEXT:    s_endpgm
   %shl.y = shl i32 4096, %y
   %r = udiv i32 %x, %shl.y
   store i32 %r, i32 addrspace(1)* %out
@@ -7191,19 +5553,6 @@ define amdgpu_kernel void @udiv_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out,
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
-;
-; GFX90A-LABEL: udiv_v2i32_pow2k_denom:
-; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
-; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
-; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_lshr_b32 s0, s2, 12
-; GFX90A-NEXT:    s_lshr_b32 s1, s3, 12
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NEXT:    v_mov_b32_e32 v1, s1
-; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
-; GFX90A-NEXT:    s_endpgm
   %r = udiv <2 x i32> %x, <i32 4096, i32 4096>
   store <2 x i32> %r, <2 x i32> addrspace(1)* %out
   ret void
@@ -7254,23 +5603,6 @@ define amdgpu_kernel void @udiv_v2i32_mixed_pow2k_denom(<2 x i32> addrspace(1)*
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
-;
-; GFX90A-LABEL: udiv_v2i32_mixed_pow2k_denom:
-; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
-; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
-; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_mul_hi_u32 s1, s3, 0x100101
-; GFX90A-NEXT:    s_lshr_b32 s0, s2, 12
-; GFX90A-NEXT:    s_sub_i32 s2, s3, s1
-; GFX90A-NEXT:    s_lshr_b32 s2, s2, 1
-; GFX90A-NEXT:    s_add_i32 s2, s2, s1
-; GFX90A-NEXT:    s_lshr_b32 s1, s2, 11
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NEXT:    v_mov_b32_e32 v1, s1
-; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
-; GFX90A-NEXT:    s_endpgm
   %r = udiv <2 x i32> %x, <i32 4096, i32 4095>
   store <2 x i32> %r, <2 x i32> addrspace(1)* %out
   ret void
@@ -7452,58 +5784,6 @@ define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
-;
-; GFX90A-LABEL: udiv_v2i32_pow2_shl_denom:
-; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX90A-NEXT:    s_movk_i32 s8, 0x1000
-; GFX90A-NEXT:    s_mov_b32 s9, 0x4f7ffffe
-; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
-; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x2c
-; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_lshl_b32 s2, s8, s2
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, s2
-; GFX90A-NEXT:    s_lshl_b32 s0, s8, s3
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v1, s0
-; GFX90A-NEXT:    s_sub_i32 s1, 0, s2
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; GFX90A-NEXT:    v_mul_f32_e32 v0, s9, v0
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX90A-NEXT:    v_mul_f32_e32 v1, s9, v1
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX90A-NEXT:    v_mul_lo_u32 v3, s1, v0
-; GFX90A-NEXT:    v_mul_hi_u32 v3, v0, v3
-; GFX90A-NEXT:    v_add_u32_e32 v0, v0, v3
-; GFX90A-NEXT:    v_mul_hi_u32 v0, s6, v0
-; GFX90A-NEXT:    v_mul_lo_u32 v3, v0, s2
-; GFX90A-NEXT:    v_sub_u32_e32 v3, s6, v3
-; GFX90A-NEXT:    v_add_u32_e32 v4, 1, v0
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s2, v3
-; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; GFX90A-NEXT:    v_subrev_u32_e32 v4, s2, v3
-; GFX90A-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
-; GFX90A-NEXT:    s_sub_i32 s1, 0, s0
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s2, v3
-; GFX90A-NEXT:    v_mul_lo_u32 v3, s1, v1
-; GFX90A-NEXT:    v_mul_hi_u32 v3, v1, v3
-; GFX90A-NEXT:    v_add_u32_e32 v1, v1, v3
-; GFX90A-NEXT:    v_mul_hi_u32 v1, s7, v1
-; GFX90A-NEXT:    v_mul_lo_u32 v3, v1, s0
-; GFX90A-NEXT:    v_add_u32_e32 v4, 1, v0
-; GFX90A-NEXT:    v_sub_u32_e32 v3, s7, v3
-; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; GFX90A-NEXT:    v_add_u32_e32 v4, 1, v1
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s0, v3
-; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX90A-NEXT:    v_subrev_u32_e32 v4, s0, v3
-; GFX90A-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
-; GFX90A-NEXT:    v_add_u32_e32 v4, 1, v1
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s0, v3
-; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
-; GFX90A-NEXT:    s_endpgm
   %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
   %r = udiv <2 x i32> %x, %shl.y
   store <2 x i32> %r, <2 x i32> addrspace(1)* %out
@@ -7551,23 +5831,6 @@ define amdgpu_kernel void @urem_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) {
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
 ; GFX9-NEXT:    s_endpgm
-;
-; GFX90A-LABEL: urem_i32_oddk_denom:
-; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_load_dword s4, s[0:1], 0x2c
-; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_mul_hi_u32 s0, s4, 0xb2a50881
-; GFX90A-NEXT:    s_sub_i32 s1, s4, s0
-; GFX90A-NEXT:    s_lshr_b32 s1, s1, 1
-; GFX90A-NEXT:    s_add_i32 s1, s1, s0
-; GFX90A-NEXT:    s_lshr_b32 s0, s1, 20
-; GFX90A-NEXT:    s_mul_i32 s0, s0, 0x12d8fb
-; GFX90A-NEXT:    s_sub_i32 s0, s4, s0
-; GFX90A-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90A-NEXT:    global_store_dword v0, v1, s[2:3]
-; GFX90A-NEXT:    s_endpgm
   %r = urem i32 %x, 1235195
   store i32 %r, i32 addrspace(1)* %out
   ret void
@@ -7601,17 +5864,6 @@ define amdgpu_kernel void @urem_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
 ; GFX9-NEXT:    s_endpgm
-;
-; GFX90A-LABEL: urem_i32_pow2k_denom:
-; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_load_dword s4, s[0:1], 0x2c
-; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_and_b32 s0, s4, 0xfff
-; GFX90A-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90A-NEXT:    global_store_dword v0, v1, s[2:3]
-; GFX90A-NEXT:    s_endpgm
   %r = urem i32 %x, 4096
   store i32 %r, i32 addrspace(1)* %out
   ret void
@@ -7650,19 +5902,6 @@ define amdgpu_kernel void @urem_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
-;
-; GFX90A-LABEL: urem_i32_pow2_shl_denom:
-; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
-; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
-; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_lshl_b32 s0, 0x1000, s3
-; GFX90A-NEXT:    s_add_i32 s0, s0, -1
-; GFX90A-NEXT:    s_and_b32 s0, s2, s0
-; GFX90A-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90A-NEXT:    global_store_dword v0, v1, s[4:5]
-; GFX90A-NEXT:    s_endpgm
   %shl.y = shl i32 4096, %y
   %r = urem i32 %x, %shl.y
   store i32 %r, i32 addrspace(1)* %out
@@ -7708,20 +5947,6 @@ define amdgpu_kernel void @urem_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out,
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
-;
-; GFX90A-LABEL: urem_v2i32_pow2k_denom:
-; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
-; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
-; GFX90A-NEXT:    s_movk_i32 s0, 0xfff
-; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_and_b32 s1, s2, s0
-; GFX90A-NEXT:    s_and_b32 s0, s3, s0
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90A-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
-; GFX90A-NEXT:    s_endpgm
   %r = urem <2 x i32> %x, <i32 4096, i32 4096>
   store <2 x i32> %r, <2 x i32> addrspace(1)* %out
   ret void
@@ -7891,54 +6116,6 @@ define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
-;
-; GFX90A-LABEL: urem_v2i32_pow2_shl_denom:
-; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX90A-NEXT:    s_movk_i32 s8, 0x1000
-; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
-; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x2c
-; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_lshl_b32 s2, s8, s2
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, s2
-; GFX90A-NEXT:    s_lshl_b32 s0, s8, s3
-; GFX90A-NEXT:    s_mov_b32 s3, 0x4f7ffffe
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v1, s0
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX90A-NEXT:    s_sub_i32 s1, 0, s2
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; GFX90A-NEXT:    v_mul_f32_e32 v0, s3, v0
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX90A-NEXT:    v_mul_f32_e32 v1, s3, v1
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX90A-NEXT:    v_mul_lo_u32 v3, s1, v0
-; GFX90A-NEXT:    v_mul_hi_u32 v3, v0, v3
-; GFX90A-NEXT:    v_add_u32_e32 v0, v0, v3
-; GFX90A-NEXT:    v_mul_hi_u32 v0, s6, v0
-; GFX90A-NEXT:    v_mul_lo_u32 v0, v0, s2
-; GFX90A-NEXT:    v_sub_u32_e32 v0, s6, v0
-; GFX90A-NEXT:    v_subrev_u32_e32 v3, s2, v0
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s2, v0
-; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX90A-NEXT:    v_subrev_u32_e32 v3, s2, v0
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s2, v0
-; GFX90A-NEXT:    s_sub_i32 s1, 0, s0
-; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v3, s1, v1
-; GFX90A-NEXT:    v_mul_hi_u32 v3, v1, v3
-; GFX90A-NEXT:    v_add_u32_e32 v1, v1, v3
-; GFX90A-NEXT:    v_mul_hi_u32 v1, s7, v1
-; GFX90A-NEXT:    v_mul_lo_u32 v1, v1, s0
-; GFX90A-NEXT:    v_sub_u32_e32 v1, s7, v1
-; GFX90A-NEXT:    v_subrev_u32_e32 v3, s0, v1
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s0, v1
-; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX90A-NEXT:    v_subrev_u32_e32 v3, s0, v1
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s0, v1
-; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
-; GFX90A-NEXT:    s_endpgm
   %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
   %r = urem <2 x i32> %x, %shl.y
   store <2 x i32> %r, <2 x i32> addrspace(1)* %out
@@ -7981,21 +6158,6 @@ define amdgpu_kernel void @sdiv_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) {
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
 ; GFX9-NEXT:    s_endpgm
-;
-; GFX90A-LABEL: sdiv_i32_oddk_denom:
-; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_load_dword s4, s[0:1], 0x2c
-; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_mul_hi_i32 s0, s4, 0xd9528441
-; GFX90A-NEXT:    s_add_i32 s0, s0, s4
-; GFX90A-NEXT:    s_lshr_b32 s1, s0, 31
-; GFX90A-NEXT:    s_ashr_i32 s0, s0, 20
-; GFX90A-NEXT:    s_add_i32 s0, s0, s1
-; GFX90A-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90A-NEXT:    global_store_dword v0, v1, s[2:3]
-; GFX90A-NEXT:    s_endpgm
   %r = sdiv i32 %x, 1235195
   store i32 %r, i32 addrspace(1)* %out
   ret void
@@ -8035,20 +6197,6 @@ define amdgpu_kernel void @sdiv_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
 ; GFX9-NEXT:    s_endpgm
-;
-; GFX90A-LABEL: sdiv_i32_pow2k_denom:
-; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_load_dword s4, s[0:1], 0x2c
-; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_ashr_i32 s0, s4, 31
-; GFX90A-NEXT:    s_lshr_b32 s0, s0, 20
-; GFX90A-NEXT:    s_add_i32 s4, s4, s0
-; GFX90A-NEXT:    s_ashr_i32 s0, s4, 12
-; GFX90A-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90A-NEXT:    global_store_dword v0, v1, s[2:3]
-; GFX90A-NEXT:    s_endpgm
   %r = sdiv i32 %x, 4096
   store i32 %r, i32 addrspace(1)* %out
   ret void
@@ -8138,44 +6286,6 @@ define amdgpu_kernel void @sdiv_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %
 ; GFX9-NEXT:    v_subrev_u32_e32 v0, s2, v0
 ; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
-;
-; GFX90A-LABEL: sdiv_i32_pow2_shl_denom:
-; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
-; GFX90A-NEXT:    v_mov_b32_e32 v1, 0
-; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_lshl_b32 s3, 0x1000, s3
-; GFX90A-NEXT:    s_ashr_i32 s4, s3, 31
-; GFX90A-NEXT:    s_add_i32 s3, s3, s4
-; GFX90A-NEXT:    s_xor_b32 s3, s3, s4
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, s3
-; GFX90A-NEXT:    s_sub_i32 s6, 0, s3
-; GFX90A-NEXT:    s_ashr_i32 s5, s2, 31
-; GFX90A-NEXT:    s_add_i32 s2, s2, s5
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX90A-NEXT:    s_xor_b32 s2, s2, s5
-; GFX90A-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX90A-NEXT:    v_mul_lo_u32 v2, s6, v0
-; GFX90A-NEXT:    v_mul_hi_u32 v2, v0, v2
-; GFX90A-NEXT:    v_add_u32_e32 v0, v0, v2
-; GFX90A-NEXT:    v_mul_hi_u32 v0, s2, v0
-; GFX90A-NEXT:    v_mul_lo_u32 v3, v0, s3
-; GFX90A-NEXT:    v_sub_u32_e32 v3, s2, v3
-; GFX90A-NEXT:    v_add_u32_e32 v2, 1, v0
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s3, v3
-; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX90A-NEXT:    v_subrev_u32_e32 v2, s3, v3
-; GFX90A-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
-; GFX90A-NEXT:    v_add_u32_e32 v4, 1, v0
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s3, v2
-; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; GFX90A-NEXT:    s_xor_b32 s2, s5, s4
-; GFX90A-NEXT:    v_xor_b32_e32 v0, s2, v0
-; GFX90A-NEXT:    v_subrev_u32_e32 v0, s2, v0
-; GFX90A-NEXT:    global_store_dword v1, v0, s[0:1]
-; GFX90A-NEXT:    s_endpgm
   %shl.y = shl i32 4096, %y
   %r = sdiv i32 %x, %shl.y
   store i32 %r, i32 addrspace(1)* %out
@@ -8231,25 +6341,6 @@ define amdgpu_kernel void @sdiv_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out,
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
-;
-; GFX90A-LABEL: sdiv_v2i32_pow2k_denom:
-; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
-; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
-; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_ashr_i32 s0, s2, 31
-; GFX90A-NEXT:    s_ashr_i32 s1, s3, 31
-; GFX90A-NEXT:    s_lshr_b32 s0, s0, 20
-; GFX90A-NEXT:    s_lshr_b32 s1, s1, 20
-; GFX90A-NEXT:    s_add_i32 s0, s2, s0
-; GFX90A-NEXT:    s_add_i32 s1, s3, s1
-; GFX90A-NEXT:    s_ashr_i32 s0, s0, 12
-; GFX90A-NEXT:    s_ashr_i32 s1, s1, 12
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NEXT:    v_mov_b32_e32 v1, s1
-; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
-; GFX90A-NEXT:    s_endpgm
   %r = sdiv <2 x i32> %x, <i32 4096, i32 4096>
   store <2 x i32> %r, <2 x i32> addrspace(1)* %out
   ret void
@@ -8306,26 +6397,6 @@ define amdgpu_kernel void @ssdiv_v2i32_mixed_pow2k_denom(<2 x i32> addrspace(1)*
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
-;
-; GFX90A-LABEL: ssdiv_v2i32_mixed_pow2k_denom:
-; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
-; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
-; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_ashr_i32 s0, s2, 31
-; GFX90A-NEXT:    s_mul_hi_i32 s1, s3, 0x80080081
-; GFX90A-NEXT:    s_lshr_b32 s0, s0, 20
-; GFX90A-NEXT:    s_add_i32 s1, s1, s3
-; GFX90A-NEXT:    s_add_i32 s0, s2, s0
-; GFX90A-NEXT:    s_lshr_b32 s2, s1, 31
-; GFX90A-NEXT:    s_ashr_i32 s1, s1, 11
-; GFX90A-NEXT:    s_ashr_i32 s0, s0, 12
-; GFX90A-NEXT:    s_add_i32 s1, s1, s2
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NEXT:    v_mov_b32_e32 v1, s1
-; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
-; GFX90A-NEXT:    s_endpgm
   %r = sdiv <2 x i32> %x, <i32 4096, i32 4095>
   store <2 x i32> %r, <2 x i32> addrspace(1)* %out
   ret void
@@ -8559,76 +6630,6 @@ define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
 ; GFX9-NEXT:    v_subrev_u32_e32 v1, s1, v1
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
-;
-; GFX90A-LABEL: sdiv_v2i32_pow2_shl_denom:
-; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX90A-NEXT:    s_movk_i32 s8, 0x1000
-; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
-; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x2c
-; GFX90A-NEXT:    s_mov_b32 s10, 0x4f7ffffe
-; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_lshl_b32 s2, s8, s2
-; GFX90A-NEXT:    s_ashr_i32 s9, s2, 31
-; GFX90A-NEXT:    s_add_i32 s2, s2, s9
-; GFX90A-NEXT:    s_xor_b32 s2, s2, s9
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, s2
-; GFX90A-NEXT:    s_ashr_i32 s1, s6, 31
-; GFX90A-NEXT:    s_lshl_b32 s0, s8, s3
-; GFX90A-NEXT:    s_add_i32 s3, s6, s1
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX90A-NEXT:    s_xor_b32 s6, s1, s9
-; GFX90A-NEXT:    s_xor_b32 s1, s3, s1
-; GFX90A-NEXT:    s_sub_i32 s3, 0, s2
-; GFX90A-NEXT:    v_mul_f32_e32 v0, s10, v0
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX90A-NEXT:    v_mul_lo_u32 v1, s3, v0
-; GFX90A-NEXT:    v_mul_hi_u32 v1, v0, v1
-; GFX90A-NEXT:    v_add_u32_e32 v0, v0, v1
-; GFX90A-NEXT:    v_mul_hi_u32 v0, s1, v0
-; GFX90A-NEXT:    v_mul_lo_u32 v1, v0, s2
-; GFX90A-NEXT:    v_sub_u32_e32 v1, s1, v1
-; GFX90A-NEXT:    s_ashr_i32 s1, s0, 31
-; GFX90A-NEXT:    s_add_i32 s0, s0, s1
-; GFX90A-NEXT:    s_xor_b32 s0, s0, s1
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v4, s0
-; GFX90A-NEXT:    v_add_u32_e32 v3, 1, v0
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s2, v1
-; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX90A-NEXT:    v_subrev_u32_e32 v3, s2, v1
-; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s2, v1
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v1, v4
-; GFX90A-NEXT:    s_ashr_i32 s2, s7, 31
-; GFX90A-NEXT:    s_add_i32 s3, s7, s2
-; GFX90A-NEXT:    v_add_u32_e32 v3, 1, v0
-; GFX90A-NEXT:    v_mul_f32_e32 v1, s10, v1
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX90A-NEXT:    s_xor_b32 s1, s2, s1
-; GFX90A-NEXT:    s_xor_b32 s2, s3, s2
-; GFX90A-NEXT:    s_sub_i32 s3, 0, s0
-; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v3, s3, v1
-; GFX90A-NEXT:    v_mul_hi_u32 v3, v1, v3
-; GFX90A-NEXT:    v_add_u32_e32 v1, v1, v3
-; GFX90A-NEXT:    v_mul_hi_u32 v1, s2, v1
-; GFX90A-NEXT:    v_mul_lo_u32 v3, v1, s0
-; GFX90A-NEXT:    v_sub_u32_e32 v3, s2, v3
-; GFX90A-NEXT:    v_add_u32_e32 v4, 1, v1
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s0, v3
-; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX90A-NEXT:    v_subrev_u32_e32 v4, s0, v3
-; GFX90A-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
-; GFX90A-NEXT:    v_add_u32_e32 v4, 1, v1
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s0, v3
-; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX90A-NEXT:    v_xor_b32_e32 v0, s6, v0
-; GFX90A-NEXT:    v_xor_b32_e32 v1, s1, v1
-; GFX90A-NEXT:    v_subrev_u32_e32 v0, s6, v0
-; GFX90A-NEXT:    v_subrev_u32_e32 v1, s1, v1
-; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
-; GFX90A-NEXT:    s_endpgm
   %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
   %r = sdiv <2 x i32> %x, %shl.y
   store <2 x i32> %r, <2 x i32> addrspace(1)* %out
@@ -8676,23 +6677,6 @@ define amdgpu_kernel void @srem_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) {
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
 ; GFX9-NEXT:    s_endpgm
-;
-; GFX90A-LABEL: srem_i32_oddk_denom:
-; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_load_dword s4, s[0:1], 0x2c
-; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_mul_hi_i32 s0, s4, 0xd9528441
-; GFX90A-NEXT:    s_add_i32 s0, s0, s4
-; GFX90A-NEXT:    s_lshr_b32 s1, s0, 31
-; GFX90A-NEXT:    s_ashr_i32 s0, s0, 20
-; GFX90A-NEXT:    s_add_i32 s0, s0, s1
-; GFX90A-NEXT:    s_mul_i32 s0, s0, 0x12d8fb
-; GFX90A-NEXT:    s_sub_i32 s0, s4, s0
-; GFX90A-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90A-NEXT:    global_store_dword v0, v1, s[2:3]
-; GFX90A-NEXT:    s_endpgm
   %r = srem i32 %x, 1235195
   store i32 %r, i32 addrspace(1)* %out
   ret void
@@ -8734,21 +6718,6 @@ define amdgpu_kernel void @srem_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
 ; GFX9-NEXT:    s_endpgm
-;
-; GFX90A-LABEL: srem_i32_pow2k_denom:
-; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_load_dword s4, s[0:1], 0x2c
-; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_ashr_i32 s0, s4, 31
-; GFX90A-NEXT:    s_lshr_b32 s0, s0, 20
-; GFX90A-NEXT:    s_add_i32 s0, s4, s0
-; GFX90A-NEXT:    s_and_b32 s0, s0, 0xfffff000
-; GFX90A-NEXT:    s_sub_i32 s0, s4, s0
-; GFX90A-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90A-NEXT:    global_store_dword v0, v1, s[2:3]
-; GFX90A-NEXT:    s_endpgm
   %r = srem i32 %x, 4096
   store i32 %r, i32 addrspace(1)* %out
   ret void
@@ -8832,41 +6801,6 @@ define amdgpu_kernel void @srem_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
-;
-; GFX90A-LABEL: srem_i32_pow2_shl_denom:
-; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
-; GFX90A-NEXT:    v_mov_b32_e32 v1, 0
-; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_lshl_b32 s3, 0x1000, s3
-; GFX90A-NEXT:    s_ashr_i32 s4, s3, 31
-; GFX90A-NEXT:    s_add_i32 s3, s3, s4
-; GFX90A-NEXT:    s_xor_b32 s3, s3, s4
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, s3
-; GFX90A-NEXT:    s_sub_i32 s5, 0, s3
-; GFX90A-NEXT:    s_ashr_i32 s4, s2, 31
-; GFX90A-NEXT:    s_add_i32 s2, s2, s4
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX90A-NEXT:    s_xor_b32 s2, s2, s4
-; GFX90A-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX90A-NEXT:    v_mul_lo_u32 v2, s5, v0
-; GFX90A-NEXT:    v_mul_hi_u32 v2, v0, v2
-; GFX90A-NEXT:    v_add_u32_e32 v0, v0, v2
-; GFX90A-NEXT:    v_mul_hi_u32 v0, s2, v0
-; GFX90A-NEXT:    v_mul_lo_u32 v0, v0, s3
-; GFX90A-NEXT:    v_sub_u32_e32 v0, s2, v0
-; GFX90A-NEXT:    v_subrev_u32_e32 v2, s3, v0
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s3, v0
-; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX90A-NEXT:    v_subrev_u32_e32 v2, s3, v0
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s3, v0
-; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX90A-NEXT:    v_xor_b32_e32 v0, s4, v0
-; GFX90A-NEXT:    v_subrev_u32_e32 v0, s4, v0
-; GFX90A-NEXT:    global_store_dword v1, v0, s[0:1]
-; GFX90A-NEXT:    s_endpgm
   %shl.y = shl i32 4096, %y
   %r = srem i32 %x, %shl.y
   store i32 %r, i32 addrspace(1)* %out
@@ -8928,28 +6862,6 @@ define amdgpu_kernel void @srem_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out,
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
-;
-; GFX90A-LABEL: srem_v2i32_pow2k_denom:
-; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
-; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
-; GFX90A-NEXT:    s_movk_i32 s0, 0xf000
-; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_ashr_i32 s1, s2, 31
-; GFX90A-NEXT:    s_lshr_b32 s1, s1, 20
-; GFX90A-NEXT:    s_add_i32 s1, s2, s1
-; GFX90A-NEXT:    s_ashr_i32 s6, s3, 31
-; GFX90A-NEXT:    s_and_b32 s1, s1, s0
-; GFX90A-NEXT:    s_sub_i32 s1, s2, s1
-; GFX90A-NEXT:    s_lshr_b32 s2, s6, 20
-; GFX90A-NEXT:    s_add_i32 s2, s3, s2
-; GFX90A-NEXT:    s_and_b32 s0, s2, s0
-; GFX90A-NEXT:    s_sub_i32 s0, s3, s0
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90A-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
-; GFX90A-NEXT:    s_endpgm
   %r = srem <2 x i32> %x, <i32 4096, i32 4096>
   store <2 x i32> %r, <2 x i32> addrspace(1)* %out
   ret void
@@ -9166,70 +7078,6 @@ define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
 ; GFX9-NEXT:    v_subrev_u32_e32 v1, s6, v1
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
-;
-; GFX90A-LABEL: srem_v2i32_pow2_shl_denom:
-; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
-; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x2c
-; GFX90A-NEXT:    s_movk_i32 s0, 0x1000
-; GFX90A-NEXT:    s_mov_b32 s9, 0x4f7ffffe
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_lshl_b32 s1, s0, s2
-; GFX90A-NEXT:    s_ashr_i32 s2, s1, 31
-; GFX90A-NEXT:    s_add_i32 s1, s1, s2
-; GFX90A-NEXT:    s_xor_b32 s1, s1, s2
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, s1
-; GFX90A-NEXT:    s_sub_i32 s8, 0, s1
-; GFX90A-NEXT:    s_ashr_i32 s2, s6, 31
-; GFX90A-NEXT:    s_lshl_b32 s0, s0, s3
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX90A-NEXT:    s_add_i32 s3, s6, s2
-; GFX90A-NEXT:    s_xor_b32 s3, s3, s2
-; GFX90A-NEXT:    s_ashr_i32 s6, s0, 31
-; GFX90A-NEXT:    v_mul_f32_e32 v0, s9, v0
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX90A-NEXT:    s_add_i32 s0, s0, s6
-; GFX90A-NEXT:    s_xor_b32 s0, s0, s6
-; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
-; GFX90A-NEXT:    v_mul_lo_u32 v1, s8, v0
-; GFX90A-NEXT:    v_mul_hi_u32 v1, v0, v1
-; GFX90A-NEXT:    v_add_u32_e32 v0, v0, v1
-; GFX90A-NEXT:    v_mul_hi_u32 v0, s3, v0
-; GFX90A-NEXT:    v_mul_lo_u32 v0, v0, s1
-; GFX90A-NEXT:    v_sub_u32_e32 v0, s3, v0
-; GFX90A-NEXT:    v_subrev_u32_e32 v1, s1, v0
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s1, v0
-; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v1, s0
-; GFX90A-NEXT:    v_subrev_u32_e32 v3, s1, v0
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s1, v0
-; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; GFX90A-NEXT:    s_sub_i32 s3, 0, s0
-; GFX90A-NEXT:    v_xor_b32_e32 v0, s2, v0
-; GFX90A-NEXT:    s_ashr_i32 s1, s7, 31
-; GFX90A-NEXT:    v_mul_f32_e32 v1, s9, v1
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX90A-NEXT:    v_subrev_u32_e32 v0, s2, v0
-; GFX90A-NEXT:    s_add_i32 s2, s7, s1
-; GFX90A-NEXT:    s_xor_b32 s2, s2, s1
-; GFX90A-NEXT:    v_mul_lo_u32 v3, s3, v1
-; GFX90A-NEXT:    v_mul_hi_u32 v3, v1, v3
-; GFX90A-NEXT:    v_add_u32_e32 v1, v1, v3
-; GFX90A-NEXT:    v_mul_hi_u32 v1, s2, v1
-; GFX90A-NEXT:    v_mul_lo_u32 v1, v1, s0
-; GFX90A-NEXT:    v_sub_u32_e32 v1, s2, v1
-; GFX90A-NEXT:    v_subrev_u32_e32 v3, s0, v1
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s0, v1
-; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX90A-NEXT:    v_subrev_u32_e32 v3, s0, v1
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s0, v1
-; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX90A-NEXT:    v_xor_b32_e32 v1, s1, v1
-; GFX90A-NEXT:    v_subrev_u32_e32 v1, s1, v1
-; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
-; GFX90A-NEXT:    s_endpgm
   %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
   %r = srem <2 x i32> %x, %shl.y
   store <2 x i32> %r, <2 x i32> addrspace(1)* %out
@@ -9461,138 +7309,26 @@ define amdgpu_kernel void @udiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s2, v4
 ; GFX9-NEXT:    v_cndmask_b32_e64 v4, v7, v5, s[0:1]
+; GFX9-NEXT:    v_add_co_u32_e64 v5, s[0:1], 2, v0
+; GFX9-NEXT:    v_addc_co_u32_e64 v7, s[0:1], 0, v1, s[0:1]
+; GFX9-NEXT:    v_add_co_u32_e64 v8, s[0:1], 1, v0
+; GFX9-NEXT:    v_addc_co_u32_e64 v9, s[0:1], 0, v1, s[0:1]
+; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v9, v7, s[0:1]
 ; GFX9-NEXT:    v_mov_b32_e32 v7, s7
 ; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v7, v2, vcc
 ; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s3, v2
-; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
 ; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s6, v3
-; GFX9-NEXT:    v_cndmask_b32_e64 v4, 1, 2, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s2, v2
-; GFX9-NEXT:    v_add_co_u32_e64 v4, s[0:1], v0, v4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc
-; GFX9-NEXT:    v_addc_co_u32_e64 v5, s[0:1], 0, v1, s[0:1]
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v8, v5, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX9-NEXT:    global_store_dwordx2 v6, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
-;
-; GFX90A-LABEL: udiv_i64_oddk_denom:
-; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x4f176a73
-; GFX90A-NEXT:    v_mov_b32_e32 v1, 0x4f800000
-; GFX90A-NEXT:    v_madmk_f32 v0, v1, 0x438f8000, v0
-; GFX90A-NEXT:    v_rcp_f32_e32 v0, v0
-; GFX90A-NEXT:    s_movk_i32 s2, 0xfee0
-; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX90A-NEXT:    s_mov_b32 s0, 0x68958c89
-; GFX90A-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
-; GFX90A-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
-; GFX90A-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX90A-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
-; GFX90A-NEXT:    s_mov_b32 s3, 0x976a7377
-; GFX90A-NEXT:    v_mul_lo_u32 v3, v0, s2
-; GFX90A-NEXT:    v_mul_hi_u32 v4, v0, s0
-; GFX90A-NEXT:    v_add_u32_e32 v3, v4, v3
-; GFX90A-NEXT:    v_mul_lo_u32 v4, v1, s0
-; GFX90A-NEXT:    v_add_u32_e32 v3, v3, v4
-; GFX90A-NEXT:    v_mul_lo_u32 v6, v0, s0
-; GFX90A-NEXT:    v_mul_lo_u32 v5, v0, v3
-; GFX90A-NEXT:    v_mul_hi_u32 v7, v0, v6
-; GFX90A-NEXT:    v_mul_hi_u32 v4, v0, v3
-; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v7, v5
-; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
-; GFX90A-NEXT:    v_mul_hi_u32 v8, v1, v6
-; GFX90A-NEXT:    v_mul_lo_u32 v6, v1, v6
-; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v6
-; GFX90A-NEXT:    v_mul_hi_u32 v7, v1, v3
-; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v8, vcc
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v7, v2, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v3, v1, v3
-; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
-; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v5, vcc
-; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
-; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v4, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v4, v0, s2
-; GFX90A-NEXT:    v_mul_hi_u32 v5, v0, s0
-; GFX90A-NEXT:    v_mul_lo_u32 v3, v1, s0
-; GFX90A-NEXT:    v_add_u32_e32 v4, v5, v4
-; GFX90A-NEXT:    v_add_u32_e32 v3, v4, v3
-; GFX90A-NEXT:    v_mul_lo_u32 v6, v0, s0
-; GFX90A-NEXT:    v_mul_lo_u32 v5, v0, v3
-; GFX90A-NEXT:    v_mul_hi_u32 v7, v0, v6
-; GFX90A-NEXT:    v_mul_hi_u32 v4, v0, v3
-; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v7, v5
-; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
-; GFX90A-NEXT:    v_mul_hi_u32 v8, v1, v6
-; GFX90A-NEXT:    v_mul_lo_u32 v6, v1, v6
-; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v6
-; GFX90A-NEXT:    v_mul_hi_u32 v7, v1, v3
-; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v8, vcc
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v7, v2, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v3, v1, v3
-; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
-; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v5, vcc
-; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
-; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v4, vcc
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_mul_lo_u32 v4, s6, v1
-; GFX90A-NEXT:    v_mul_hi_u32 v5, s6, v0
-; GFX90A-NEXT:    v_mul_hi_u32 v3, s6, v1
-; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, v5, v4
-; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX90A-NEXT:    v_mul_hi_u32 v6, s7, v0
-; GFX90A-NEXT:    v_mul_lo_u32 v0, s7, v0
-; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v4, v0
-; GFX90A-NEXT:    v_mul_hi_u32 v5, s7, v1
-; GFX90A-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v6, vcc
-; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v2, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v1, s7, v1
-; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
-; GFX90A-NEXT:    s_movk_i32 s2, 0x11f
-; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v3, v0, s2
-; GFX90A-NEXT:    v_mul_hi_u32 v4, v0, s3
-; GFX90A-NEXT:    v_add_u32_e32 v3, v4, v3
-; GFX90A-NEXT:    v_mul_lo_u32 v4, v1, s3
-; GFX90A-NEXT:    v_add_u32_e32 v3, v3, v4
-; GFX90A-NEXT:    v_mul_lo_u32 v5, v0, s3
-; GFX90A-NEXT:    v_sub_u32_e32 v4, s7, v3
-; GFX90A-NEXT:    v_mov_b32_e32 v6, s2
-; GFX90A-NEXT:    v_sub_co_u32_e32 v5, vcc, s6, v5
-; GFX90A-NEXT:    v_subb_co_u32_e64 v4, s[0:1], v4, v6, vcc
-; GFX90A-NEXT:    v_subrev_co_u32_e64 v6, s[0:1], s3, v5
-; GFX90A-NEXT:    v_subbrev_co_u32_e64 v4, s[0:1], 0, v4, s[0:1]
-; GFX90A-NEXT:    s_movk_i32 s3, 0x11e
-; GFX90A-NEXT:    v_cmp_lt_u32_e64 s[0:1], s3, v4
-; GFX90A-NEXT:    s_mov_b32 s6, 0x976a7376
-; GFX90A-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[0:1]
-; GFX90A-NEXT:    v_cmp_lt_u32_e64 s[0:1], s6, v6
-; GFX90A-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
-; GFX90A-NEXT:    v_cmp_eq_u32_e64 s[0:1], s2, v4
-; GFX90A-NEXT:    v_cndmask_b32_e64 v4, v7, v6, s[0:1]
-; GFX90A-NEXT:    v_mov_b32_e32 v7, s7
-; GFX90A-NEXT:    v_subb_co_u32_e32 v3, vcc, v7, v3, vcc
-; GFX90A-NEXT:    v_cmp_lt_u32_e32 vcc, s3, v3
-; GFX90A-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
-; GFX90A-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
-; GFX90A-NEXT:    v_cmp_lt_u32_e32 vcc, s6, v5
-; GFX90A-NEXT:    v_cndmask_b32_e64 v4, 1, 2, s[0:1]
-; GFX90A-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
-; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, s2, v3
-; GFX90A-NEXT:    v_add_co_u32_e64 v4, s[0:1], v0, v4
-; GFX90A-NEXT:    v_cndmask_b32_e32 v3, v7, v5, vcc
-; GFX90A-NEXT:    v_addc_co_u32_e64 v6, s[0:1], 0, v1, s[0:1]
-; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
-; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
-; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
-; GFX90A-NEXT:    s_endpgm
   %r = udiv i64 %x, 1235195949943
   store i64 %r, i64 addrspace(1)* %out
   ret void
@@ -9628,16 +7364,6 @@ define amdgpu_kernel void @udiv_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
-;
-; GFX90A-LABEL: udiv_i64_pow2k_denom:
-; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_lshr_b64 s[2:3], s[2:3], 12
-; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX90A-NEXT:    s_endpgm
   %r = udiv i64 %x, 4096
   store i64 %r, i64 addrspace(1)* %out
   ret void
@@ -9678,18 +7404,6 @@ define amdgpu_kernel void @udiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
-;
-; GFX90A-LABEL: udiv_i64_pow2_shl_denom:
-; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_load_dword s2, s[0:1], 0x34
-; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_add_i32 s2, s2, 12
-; GFX90A-NEXT:    s_lshr_b64 s[0:1], s[6:7], s2
-; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
-; GFX90A-NEXT:    s_endpgm
   %shl.y = shl i64 4096, %y
   %r = udiv i64 %x, %shl.y
   store i64 %r, i64 addrspace(1)* %out
@@ -9737,21 +7451,6 @@ define amdgpu_kernel void @udiv_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out,
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
 ; GFX9-NEXT:    s_endpgm
-;
-; GFX90A-LABEL: udiv_v2i64_pow2k_denom:
-; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
-; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_lshr_b64 s[0:1], s[4:5], 12
-; GFX90A-NEXT:    s_lshr_b64 s[4:5], s[6:7], 12
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NEXT:    v_mov_b32_e32 v1, s1
-; GFX90A-NEXT:    v_mov_b32_e32 v2, s4
-; GFX90A-NEXT:    v_mov_b32_e32 v3, s5
-; GFX90A-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
-; GFX90A-NEXT:    s_endpgm
   %r = udiv <2 x i64> %x, <i64 4096, i64 4096>
   store <2 x i64> %r, <2 x i64> addrspace(1)* %out
   ret void
@@ -9884,15 +7583,12 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX9-NEXT:    s_movk_i32 s2, 0xf001
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT:    s_movk_i32 s8, 0xfff
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    v_mul_hi_u32 v2, v0, s2
 ; GFX9-NEXT:    v_mul_lo_u32 v4, v1, s2
 ; GFX9-NEXT:    v_mul_lo_u32 v3, v0, s2
@@ -9917,8 +7613,8 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX9-NEXT:    v_mul_hi_u32 v2, v0, s2
 ; GFX9-NEXT:    v_mul_lo_u32 v3, v1, s2
 ; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s2
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshr_b64 s[2:3], s[4:5], 12
+; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
 ; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v0
 ; GFX9-NEXT:    v_add_u32_e32 v2, v2, v3
 ; GFX9-NEXT:    v_mul_lo_u32 v3, v0, v2
@@ -9930,7 +7626,9 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v7, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v7, v1, v4
 ; GFX9-NEXT:    v_mul_hi_u32 v4, v1, v4
-; GFX9-NEXT:    s_movk_i32 s4, 0xffe
+; GFX9-NEXT:    s_movk_i32 s0, 0xfff
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_lshr_b64 s[4:5], s[4:5], 12
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v7
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v4, vcc
 ; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v8, v5, vcc
@@ -9952,138 +7650,38 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v6, v5, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v2, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v2, v1, s8
-; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s8
-; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s8
-; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s7
-; GFX9-NEXT:    v_sub_co_u32_e32 v4, vcc, s6, v4
-; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v3, v2, vcc
-; GFX9-NEXT:    v_subrev_co_u32_e32 v3, vcc, s8, v4
-; GFX9-NEXT:    v_subbrev_co_u32_e32 v6, vcc, 0, v2, vcc
-; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s4, v3
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v6
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, -1, v3, vcc
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, 1, 2, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v0, v3
-; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s4, v4
-; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, -1, v4, vcc
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v0, v3, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v1, v6, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1]
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, 2, v0
+; GFX9-NEXT:    v_mul_lo_u32 v4, v1, s0
+; GFX9-NEXT:    v_mul_hi_u32 v6, v0, s0
+; GFX9-NEXT:    v_mul_lo_u32 v9, v0, s0
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, 1, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v1, vcc
+; GFX9-NEXT:    v_add_u32_e32 v4, v6, v4
+; GFX9-NEXT:    v_mov_b32_e32 v6, s7
+; GFX9-NEXT:    v_sub_co_u32_e32 v9, vcc, s6, v9
+; GFX9-NEXT:    v_subb_co_u32_e32 v4, vcc, v6, v4, vcc
+; GFX9-NEXT:    v_subrev_co_u32_e32 v6, vcc, s0, v9
+; GFX9-NEXT:    v_subbrev_co_u32_e32 v10, vcc, 0, v4, vcc
+; GFX9-NEXT:    s_movk_i32 s0, 0xffe
+; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s0, v6
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v10
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, -1, v6, vcc
+; GFX9-NEXT:    v_cmp_lt_u32_e64 s[0:1], s0, v9
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v4
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, -1, v6, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v8, v3, vcc
+; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v1, v3, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v0, v1, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    global_store_dwordx4 v5, v[0:3], s[2:3]
 ; GFX9-NEXT:    s_endpgm
-;
-; GFX90A-LABEL: udiv_v2i64_mixed_pow2k_denom:
-; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x4f800000
-; GFX90A-NEXT:    v_madak_f32 v0, 0, v0, 0x457ff000
-; GFX90A-NEXT:    v_rcp_f32_e32 v0, v0
-; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
-; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
-; GFX90A-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
-; GFX90A-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
-; GFX90A-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX90A-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_lshr_b64 s[0:1], s[4:5], 12
-; GFX90A-NEXT:    s_movk_i32 s4, 0xf001
-; GFX90A-NEXT:    v_mul_hi_u32 v2, v0, s4
-; GFX90A-NEXT:    v_sub_u32_e32 v2, v2, v0
-; GFX90A-NEXT:    v_mul_lo_u32 v3, v1, s4
-; GFX90A-NEXT:    v_add_u32_e32 v2, v2, v3
-; GFX90A-NEXT:    v_mul_lo_u32 v6, v0, s4
-; GFX90A-NEXT:    v_mul_lo_u32 v5, v0, v2
-; GFX90A-NEXT:    v_mul_hi_u32 v7, v0, v6
-; GFX90A-NEXT:    v_mul_hi_u32 v3, v0, v2
-; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v7, v5
-; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX90A-NEXT:    v_mul_hi_u32 v8, v1, v6
-; GFX90A-NEXT:    v_mul_lo_u32 v6, v1, v6
-; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v6
-; GFX90A-NEXT:    v_mul_hi_u32 v7, v1, v2
-; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v8, vcc
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v7, v4, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v2, v1, v2
-; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
-; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v5, vcc
-; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX90A-NEXT:    v_mul_hi_u32 v3, v0, s4
-; GFX90A-NEXT:    v_mul_lo_u32 v2, v1, s4
-; GFX90A-NEXT:    v_sub_u32_e32 v3, v3, v0
-; GFX90A-NEXT:    v_add_u32_e32 v2, v3, v2
-; GFX90A-NEXT:    v_mul_lo_u32 v6, v0, s4
-; GFX90A-NEXT:    v_mul_lo_u32 v5, v0, v2
-; GFX90A-NEXT:    v_mul_hi_u32 v7, v0, v6
-; GFX90A-NEXT:    v_mul_hi_u32 v3, v0, v2
-; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v7, v5
-; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX90A-NEXT:    v_mul_hi_u32 v8, v1, v6
-; GFX90A-NEXT:    v_mul_lo_u32 v6, v1, v6
-; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v6
-; GFX90A-NEXT:    v_mul_hi_u32 v7, v1, v2
-; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v8, vcc
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v7, v4, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v2, v1, v2
-; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
-; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v5, vcc
-; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v3, s6, v1
-; GFX90A-NEXT:    v_mul_hi_u32 v5, s6, v0
-; GFX90A-NEXT:    v_mul_hi_u32 v2, s6, v1
-; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v5, v3
-; GFX90A-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
-; GFX90A-NEXT:    v_mul_hi_u32 v6, s7, v0
-; GFX90A-NEXT:    v_mul_lo_u32 v0, s7, v0
-; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v3, v0
-; GFX90A-NEXT:    v_mul_hi_u32 v5, s7, v1
-; GFX90A-NEXT:    v_addc_co_u32_e32 v0, vcc, v2, v6, vcc
-; GFX90A-NEXT:    v_addc_co_u32_e32 v2, vcc, v5, v4, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v1, s7, v1
-; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
-; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v2, vcc
-; GFX90A-NEXT:    s_movk_i32 s4, 0xfff
-; GFX90A-NEXT:    v_mul_lo_u32 v2, v1, s4
-; GFX90A-NEXT:    v_mul_hi_u32 v3, v0, s4
-; GFX90A-NEXT:    v_add_u32_e32 v2, v3, v2
-; GFX90A-NEXT:    v_mul_lo_u32 v3, v0, s4
-; GFX90A-NEXT:    v_mov_b32_e32 v5, s7
-; GFX90A-NEXT:    v_sub_co_u32_e32 v3, vcc, s6, v3
-; GFX90A-NEXT:    v_subb_co_u32_e32 v2, vcc, v5, v2, vcc
-; GFX90A-NEXT:    v_subrev_co_u32_e32 v5, vcc, s4, v3
-; GFX90A-NEXT:    v_subbrev_co_u32_e32 v6, vcc, 0, v2, vcc
-; GFX90A-NEXT:    s_movk_i32 s4, 0xffe
-; GFX90A-NEXT:    v_cmp_lt_u32_e32 vcc, s4, v5
-; GFX90A-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
-; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v6
-; GFX90A-NEXT:    v_cndmask_b32_e32 v5, -1, v5, vcc
-; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
-; GFX90A-NEXT:    v_cndmask_b32_e64 v5, 1, 2, vcc
-; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v0, v5
-; GFX90A-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v1, vcc
-; GFX90A-NEXT:    v_cmp_lt_u32_e32 vcc, s4, v3
-; GFX90A-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
-; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
-; GFX90A-NEXT:    v_cndmask_b32_e32 v2, -1, v3, vcc
-; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; GFX90A-NEXT:    v_cndmask_b32_e32 v2, v0, v5, vcc
-; GFX90A-NEXT:    v_cndmask_b32_e32 v3, v1, v6, vcc
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NEXT:    v_mov_b32_e32 v1, s1
-; GFX90A-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
-; GFX90A-NEXT:    s_endpgm
   %r = udiv <2 x i64> %x, <i64 4096, i64 4095>
   store <2 x i64> %r, <2 x i64> addrspace(1)* %out
   ret void
@@ -10139,24 +7737,6 @@ define amdgpu_kernel void @udiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
 ; GFX9-NEXT:    s_endpgm
-;
-; GFX90A-LABEL: udiv_v2i64_pow2_shl_denom:
-; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x44
-; GFX90A-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
-; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_add_i32 s0, s4, 12
-; GFX90A-NEXT:    s_add_i32 s4, s6, 12
-; GFX90A-NEXT:    s_lshr_b64 s[0:1], s[8:9], s0
-; GFX90A-NEXT:    s_lshr_b64 s[4:5], s[10:11], s4
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NEXT:    v_mov_b32_e32 v1, s1
-; GFX90A-NEXT:    v_mov_b32_e32 v2, s4
-; GFX90A-NEXT:    v_mov_b32_e32 v3, s5
-; GFX90A-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
-; GFX90A-NEXT:    s_endpgm
   %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
   %r = udiv <2 x i64> %x, %shl.y
   store <2 x i64> %r, <2 x i64> addrspace(1)* %out
@@ -10391,138 +7971,21 @@ define amdgpu_kernel void @urem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, v8, s[2:3]
 ; GFX9-NEXT:    v_subbrev_co_u32_e64 v2, s[0:1], 0, v2, s[0:1]
 ; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v7
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v4, v3, s[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v4, s7
-; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v4, v1, vcc
-; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s6, v1
-; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s10, v0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v5, s7
+; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v5, v1, vcc
+; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s6, v1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
+; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s10, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v4, v3, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX9-NEXT:    global_store_dwordx2 v6, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
-;
-; GFX90A-LABEL: urem_i64_oddk_denom:
-; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x4f1761f8
-; GFX90A-NEXT:    v_mov_b32_e32 v1, 0x4f800000
-; GFX90A-NEXT:    v_madmk_f32 v0, v1, 0x438f8000, v0
-; GFX90A-NEXT:    v_rcp_f32_e32 v0, v0
-; GFX90A-NEXT:    s_movk_i32 s2, 0xfee0
-; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX90A-NEXT:    s_mov_b32 s0, 0x689e0837
-; GFX90A-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
-; GFX90A-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
-; GFX90A-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX90A-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
-; GFX90A-NEXT:    s_movk_i32 s8, 0x11f
-; GFX90A-NEXT:    v_mul_lo_u32 v3, v0, s2
-; GFX90A-NEXT:    v_mul_hi_u32 v4, v0, s0
-; GFX90A-NEXT:    v_add_u32_e32 v3, v4, v3
-; GFX90A-NEXT:    v_mul_lo_u32 v4, v1, s0
-; GFX90A-NEXT:    v_add_u32_e32 v3, v3, v4
-; GFX90A-NEXT:    v_mul_lo_u32 v6, v0, s0
-; GFX90A-NEXT:    v_mul_lo_u32 v5, v0, v3
-; GFX90A-NEXT:    v_mul_hi_u32 v7, v0, v6
-; GFX90A-NEXT:    v_mul_hi_u32 v4, v0, v3
-; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v7, v5
-; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
-; GFX90A-NEXT:    v_mul_hi_u32 v8, v1, v6
-; GFX90A-NEXT:    v_mul_lo_u32 v6, v1, v6
-; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v6
-; GFX90A-NEXT:    v_mul_hi_u32 v7, v1, v3
-; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v8, vcc
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v7, v2, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v3, v1, v3
-; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
-; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v5, vcc
-; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
-; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v4, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v4, v0, s2
-; GFX90A-NEXT:    v_mul_hi_u32 v5, v0, s0
-; GFX90A-NEXT:    v_mul_lo_u32 v3, v1, s0
-; GFX90A-NEXT:    v_add_u32_e32 v4, v5, v4
-; GFX90A-NEXT:    v_add_u32_e32 v3, v4, v3
-; GFX90A-NEXT:    v_mul_lo_u32 v6, v0, s0
-; GFX90A-NEXT:    v_mul_lo_u32 v5, v0, v3
-; GFX90A-NEXT:    v_mul_hi_u32 v7, v0, v6
-; GFX90A-NEXT:    v_mul_hi_u32 v4, v0, v3
-; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v7, v5
-; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
-; GFX90A-NEXT:    v_mul_hi_u32 v8, v1, v6
-; GFX90A-NEXT:    v_mul_lo_u32 v6, v1, v6
-; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v6
-; GFX90A-NEXT:    v_mul_hi_u32 v7, v1, v3
-; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v8, vcc
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v7, v2, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v3, v1, v3
-; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
-; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v5, vcc
-; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
-; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v4, vcc
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_mul_lo_u32 v4, s6, v1
-; GFX90A-NEXT:    v_mul_hi_u32 v5, s6, v0
-; GFX90A-NEXT:    v_mul_hi_u32 v3, s6, v1
-; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, v5, v4
-; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX90A-NEXT:    v_mul_hi_u32 v6, s7, v0
-; GFX90A-NEXT:    v_mul_lo_u32 v0, s7, v0
-; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v4, v0
-; GFX90A-NEXT:    v_mul_hi_u32 v5, s7, v1
-; GFX90A-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v6, vcc
-; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v2, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v1, s7, v1
-; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
-; GFX90A-NEXT:    s_mov_b32 s9, 0x9761f7c9
-; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v3, v0, s8
-; GFX90A-NEXT:    v_mul_hi_u32 v4, v0, s9
-; GFX90A-NEXT:    v_add_u32_e32 v3, v4, v3
-; GFX90A-NEXT:    v_mul_lo_u32 v1, v1, s9
-; GFX90A-NEXT:    v_add_u32_e32 v1, v3, v1
-; GFX90A-NEXT:    v_mul_lo_u32 v0, v0, s9
-; GFX90A-NEXT:    v_sub_u32_e32 v3, s7, v1
-; GFX90A-NEXT:    v_mov_b32_e32 v4, s8
-; GFX90A-NEXT:    v_sub_co_u32_e32 v0, vcc, s6, v0
-; GFX90A-NEXT:    v_subb_co_u32_e64 v3, s[0:1], v3, v4, vcc
-; GFX90A-NEXT:    v_subrev_co_u32_e64 v5, s[0:1], s9, v0
-; GFX90A-NEXT:    v_subbrev_co_u32_e64 v6, s[2:3], 0, v3, s[0:1]
-; GFX90A-NEXT:    s_movk_i32 s6, 0x11e
-; GFX90A-NEXT:    v_cmp_lt_u32_e64 s[2:3], s6, v6
-; GFX90A-NEXT:    s_mov_b32 s10, 0x9761f7c8
-; GFX90A-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[2:3]
-; GFX90A-NEXT:    v_cmp_lt_u32_e64 s[2:3], s10, v5
-; GFX90A-NEXT:    v_subb_co_u32_e64 v3, s[0:1], v3, v4, s[0:1]
-; GFX90A-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[2:3]
-; GFX90A-NEXT:    v_cmp_eq_u32_e64 s[2:3], s8, v6
-; GFX90A-NEXT:    v_subrev_co_u32_e64 v4, s[0:1], s9, v5
-; GFX90A-NEXT:    v_cndmask_b32_e64 v7, v7, v8, s[2:3]
-; GFX90A-NEXT:    v_subbrev_co_u32_e64 v3, s[0:1], 0, v3, s[0:1]
-; GFX90A-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v7
-; GFX90A-NEXT:    v_cndmask_b32_e64 v4, v5, v4, s[0:1]
-; GFX90A-NEXT:    v_mov_b32_e32 v5, s7
-; GFX90A-NEXT:    v_subb_co_u32_e32 v1, vcc, v5, v1, vcc
-; GFX90A-NEXT:    v_cmp_lt_u32_e32 vcc, s6, v1
-; GFX90A-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
-; GFX90A-NEXT:    v_cmp_lt_u32_e32 vcc, s10, v0
-; GFX90A-NEXT:    v_cndmask_b32_e64 v3, v6, v3, s[0:1]
-; GFX90A-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
-; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v1
-; GFX90A-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
-; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
-; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
-; GFX90A-NEXT:    s_endpgm
   %r = urem i64 %x, 1235195393993
   store i64 %r, i64 addrspace(1)* %out
   ret void
@@ -10557,16 +8020,6 @@ define amdgpu_kernel void @urem_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
-;
-; GFX90A-LABEL: urem_i64_pow2k_denom:
-; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX90A-NEXT:    v_mov_b32_e32 v1, 0
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_and_b32 s2, s2, 0xfff
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
-; GFX90A-NEXT:    s_endpgm
   %r = urem i64 %x, 4096
   store i64 %r, i64 addrspace(1)* %out
   ret void
@@ -10613,21 +8066,6 @@ define amdgpu_kernel void @urem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
-;
-; GFX90A-LABEL: urem_i64_pow2_shl_denom:
-; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_load_dword s2, s[0:1], 0x34
-; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX90A-NEXT:    s_mov_b64 s[0:1], 0x1000
-; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_lshl_b64 s[0:1], s[0:1], s2
-; GFX90A-NEXT:    s_add_u32 s0, s0, -1
-; GFX90A-NEXT:    s_addc_u32 s1, s1, -1
-; GFX90A-NEXT:    s_and_b64 s[0:1], s[6:7], s[0:1]
-; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
-; GFX90A-NEXT:    s_endpgm
   %shl.y = shl i64 4096, %y
   %r = urem i64 %x, %shl.y
   store i64 %r, i64 addrspace(1)* %out
@@ -10676,21 +8114,6 @@ define amdgpu_kernel void @urem_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out,
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX9-NEXT:    global_store_dwordx4 v1, v[0:3], s[2:3]
 ; GFX9-NEXT:    s_endpgm
-;
-; GFX90A-LABEL: urem_v2i64_pow2k_denom:
-; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
-; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX90A-NEXT:    s_movk_i32 s0, 0xfff
-; GFX90A-NEXT:    v_mov_b32_e32 v1, 0
-; GFX90A-NEXT:    v_mov_b32_e32 v3, v1
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_and_b32 s1, s4, s0
-; GFX90A-NEXT:    s_and_b32 s0, s6, s0
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90A-NEXT:    v_mov_b32_e32 v2, s0
-; GFX90A-NEXT:    global_store_dwordx4 v1, v[0:3], s[2:3]
-; GFX90A-NEXT:    s_endpgm
   %r = urem <2 x i64> %x, <i64 4096, i64 4096>
   store <2 x i64> %r, <2 x i64> addrspace(1)* %out
   ret void
@@ -10756,29 +8179,6 @@ define amdgpu_kernel void @urem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
 ; GFX9-NEXT:    s_endpgm
-;
-; GFX90A-LABEL: urem_v2i64_pow2_shl_denom:
-; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x44
-; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX90A-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
-; GFX90A-NEXT:    s_mov_b64 s[0:1], 0x1000
-; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_lshl_b64 s[6:7], s[0:1], s6
-; GFX90A-NEXT:    s_lshl_b64 s[0:1], s[0:1], s4
-; GFX90A-NEXT:    s_add_u32 s0, s0, -1
-; GFX90A-NEXT:    s_addc_u32 s1, s1, -1
-; GFX90A-NEXT:    s_and_b64 s[0:1], s[8:9], s[0:1]
-; GFX90A-NEXT:    s_add_u32 s4, s6, -1
-; GFX90A-NEXT:    s_addc_u32 s5, s7, -1
-; GFX90A-NEXT:    s_and_b64 s[4:5], s[10:11], s[4:5]
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NEXT:    v_mov_b32_e32 v1, s1
-; GFX90A-NEXT:    v_mov_b32_e32 v2, s4
-; GFX90A-NEXT:    v_mov_b32_e32 v3, s5
-; GFX90A-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
-; GFX90A-NEXT:    s_endpgm
   %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
   %r = urem <2 x i64> %x, %shl.y
   store <2 x i64> %r, <2 x i64> addrspace(1)* %out
@@ -10913,18 +8313,18 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x4f800000
 ; GFX9-NEXT:    v_madak_f32 v0, 0, v0, 0x4996c7d8
 ; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
-; GFX9-NEXT:    s_mov_b32 s4, 0xffed2705
+; GFX9-NEXT:    s_mov_b32 s2, 0xffed2705
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    v_mul_lo_u32 v2, v1, s4
-; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s4
-; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s4
+; GFX9-NEXT:    v_mul_lo_u32 v2, v1, s2
+; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s2
+; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s2
 ; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
 ; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v3, v0, v4
@@ -10943,12 +8343,12 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v2, v1, s4
-; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s4
-; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s4
+; GFX9-NEXT:    v_mul_lo_u32 v2, v1, s2
+; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s2
+; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s2
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
-; GFX9-NEXT:    s_add_u32 s2, s2, s4
+; GFX9-NEXT:    s_ashr_i32 s2, s7, 31
+; GFX9-NEXT:    s_add_u32 s0, s6, s2
 ; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
 ; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v7, v0, v2
@@ -10966,167 +8366,60 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v4, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT:    s_mov_b32 s5, s4
-; GFX9-NEXT:    s_addc_u32 s3, s3, s4
+; GFX9-NEXT:    s_mov_b32 s3, s2
+; GFX9-NEXT:    s_addc_u32 s1, s7, s2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-NEXT:    s_xor_b64 s[2:3], s[2:3], s[4:5]
-; GFX9-NEXT:    v_mul_lo_u32 v2, s2, v1
-; GFX9-NEXT:    v_mul_hi_u32 v3, s2, v0
-; GFX9-NEXT:    v_mul_hi_u32 v4, s2, v1
-; GFX9-NEXT:    v_mul_hi_u32 v6, s3, v1
-; GFX9-NEXT:    v_mul_lo_u32 v1, s3, v1
+; GFX9-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT:    v_mul_lo_u32 v2, s0, v1
+; GFX9-NEXT:    v_mul_hi_u32 v3, s0, v0
+; GFX9-NEXT:    v_mul_hi_u32 v4, s0, v1
+; GFX9-NEXT:    v_mul_hi_u32 v6, s1, v1
+; GFX9-NEXT:    v_mul_lo_u32 v1, s1, v1
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v4, s3, v0
-; GFX9-NEXT:    v_mul_hi_u32 v0, s3, v0
-; GFX9-NEXT:    s_mov_b32 s5, 0x12d8fb
+; GFX9-NEXT:    v_mul_lo_u32 v4, s1, v0
+; GFX9-NEXT:    v_mul_hi_u32 v0, s1, v0
+; GFX9-NEXT:    s_mov_b32 s3, 0x12d8fb
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
 ; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v0, vcc
 ; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v6, v5, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v2, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v2, v1, s5
-; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s5
-; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s5
-; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s3
-; GFX9-NEXT:    v_sub_co_u32_e32 v4, vcc, s2, v4
-; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v3, v2, vcc
-; GFX9-NEXT:    v_subrev_co_u32_e32 v3, vcc, s5, v4
-; GFX9-NEXT:    v_subbrev_co_u32_e32 v6, vcc, 0, v2, vcc
-; GFX9-NEXT:    s_mov_b32 s2, 0x12d8fa
-; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s2, v3
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v6
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, -1, v3, vcc
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, 1, 2, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v0, v3
-; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s2, v4
-; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, -1, v4, vcc
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
-; GFX9-NEXT:    v_xor_b32_e32 v0, s4, v0
-; GFX9-NEXT:    v_xor_b32_e32 v1, s4, v1
-; GFX9-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, s4, v0
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, 2, v0
+; GFX9-NEXT:    v_mul_lo_u32 v4, v1, s3
+; GFX9-NEXT:    v_mul_hi_u32 v6, v0, s3
+; GFX9-NEXT:    v_mul_lo_u32 v9, v0, s3
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, 1, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v1, vcc
+; GFX9-NEXT:    v_add_u32_e32 v4, v6, v4
+; GFX9-NEXT:    v_mov_b32_e32 v6, s1
+; GFX9-NEXT:    v_sub_co_u32_e32 v9, vcc, s0, v9
+; GFX9-NEXT:    v_subb_co_u32_e32 v4, vcc, v6, v4, vcc
+; GFX9-NEXT:    v_subrev_co_u32_e32 v6, vcc, s3, v9
+; GFX9-NEXT:    v_subbrev_co_u32_e32 v10, vcc, 0, v4, vcc
+; GFX9-NEXT:    s_mov_b32 s0, 0x12d8fa
+; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s0, v6
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v10
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, -1, v6, vcc
+; GFX9-NEXT:    v_cmp_lt_u32_e64 s[0:1], s0, v9
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v4
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, -1, v6, s[0:1]
+; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v8, v3, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
+; GFX9-NEXT:    v_xor_b32_e32 v0, s2, v0
+; GFX9-NEXT:    v_xor_b32_e32 v1, s2, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, s2, v0
 ; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v2, vcc
-; GFX9-NEXT:    global_store_dwordx2 v5, v[0:1], s[0:1]
+; GFX9-NEXT:    global_store_dwordx2 v5, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
-;
-; GFX90A-LABEL: sdiv_i64_oddk_denom:
-; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x4f800000
-; GFX90A-NEXT:    v_madak_f32 v0, 0, v0, 0x4996c7d8
-; GFX90A-NEXT:    v_rcp_f32_e32 v0, v0
-; GFX90A-NEXT:    s_mov_b32 s4, 0xffed2705
-; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX90A-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
-; GFX90A-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
-; GFX90A-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX90A-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX90A-NEXT:    v_mul_lo_u32 v3, v1, s4
-; GFX90A-NEXT:    v_mul_hi_u32 v4, v0, s4
-; GFX90A-NEXT:    v_add_u32_e32 v3, v4, v3
-; GFX90A-NEXT:    v_sub_u32_e32 v3, v3, v0
-; GFX90A-NEXT:    v_mul_lo_u32 v6, v0, s4
-; GFX90A-NEXT:    v_mul_lo_u32 v5, v0, v3
-; GFX90A-NEXT:    v_mul_hi_u32 v7, v0, v6
-; GFX90A-NEXT:    v_mul_hi_u32 v4, v0, v3
-; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v7, v5
-; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
-; GFX90A-NEXT:    v_mul_hi_u32 v8, v1, v6
-; GFX90A-NEXT:    v_mul_lo_u32 v6, v1, v6
-; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v6
-; GFX90A-NEXT:    v_mul_hi_u32 v7, v1, v3
-; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v8, vcc
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v7, v2, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v3, v1, v3
-; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
-; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v5, vcc
-; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
-; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v4, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v3, v1, s4
-; GFX90A-NEXT:    v_mul_hi_u32 v4, v0, s4
-; GFX90A-NEXT:    v_add_u32_e32 v3, v4, v3
-; GFX90A-NEXT:    v_sub_u32_e32 v3, v3, v0
-; GFX90A-NEXT:    v_mul_lo_u32 v5, v0, s4
-; GFX90A-NEXT:    v_mul_hi_u32 v6, v1, v5
-; GFX90A-NEXT:    v_mul_lo_u32 v7, v1, v5
-; GFX90A-NEXT:    v_mul_lo_u32 v9, v0, v3
-; GFX90A-NEXT:    v_mul_hi_u32 v5, v0, v5
-; GFX90A-NEXT:    v_mul_hi_u32 v8, v0, v3
-; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v9
-; GFX90A-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v8, vcc
-; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v7
-; GFX90A-NEXT:    v_mul_hi_u32 v4, v1, v3
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v8, v6, vcc
-; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v2, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v3, v1, v3
-; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v5, v3
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_ashr_i32 s4, s3, 31
-; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
-; GFX90A-NEXT:    s_add_u32 s2, s2, s4
-; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
-; GFX90A-NEXT:    s_mov_b32 s5, s4
-; GFX90A-NEXT:    s_addc_u32 s3, s3, s4
-; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v4, vcc
-; GFX90A-NEXT:    s_xor_b64 s[2:3], s[2:3], s[4:5]
-; GFX90A-NEXT:    v_mul_lo_u32 v4, s2, v1
-; GFX90A-NEXT:    v_mul_hi_u32 v5, s2, v0
-; GFX90A-NEXT:    v_mul_hi_u32 v3, s2, v1
-; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, v5, v4
-; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX90A-NEXT:    v_mul_hi_u32 v6, s3, v0
-; GFX90A-NEXT:    v_mul_lo_u32 v0, s3, v0
-; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v4, v0
-; GFX90A-NEXT:    v_mul_hi_u32 v5, s3, v1
-; GFX90A-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v6, vcc
-; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v2, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v1, s3, v1
-; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
-; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
-; GFX90A-NEXT:    s_mov_b32 s5, 0x12d8fb
-; GFX90A-NEXT:    v_mul_lo_u32 v3, v1, s5
-; GFX90A-NEXT:    v_mul_hi_u32 v4, v0, s5
-; GFX90A-NEXT:    v_add_u32_e32 v3, v4, v3
-; GFX90A-NEXT:    v_mul_lo_u32 v4, v0, s5
-; GFX90A-NEXT:    v_mov_b32_e32 v5, s3
-; GFX90A-NEXT:    v_sub_co_u32_e32 v4, vcc, s2, v4
-; GFX90A-NEXT:    v_subb_co_u32_e32 v3, vcc, v5, v3, vcc
-; GFX90A-NEXT:    v_subrev_co_u32_e32 v5, vcc, s5, v4
-; GFX90A-NEXT:    v_subbrev_co_u32_e32 v6, vcc, 0, v3, vcc
-; GFX90A-NEXT:    s_mov_b32 s2, 0x12d8fa
-; GFX90A-NEXT:    v_cmp_lt_u32_e32 vcc, s2, v5
-; GFX90A-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
-; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v6
-; GFX90A-NEXT:    v_cndmask_b32_e32 v5, -1, v5, vcc
-; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
-; GFX90A-NEXT:    v_cndmask_b32_e64 v5, 1, 2, vcc
-; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v0, v5
-; GFX90A-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v1, vcc
-; GFX90A-NEXT:    v_cmp_lt_u32_e32 vcc, s2, v4
-; GFX90A-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
-; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
-; GFX90A-NEXT:    v_cndmask_b32_e32 v3, -1, v4, vcc
-; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
-; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
-; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
-; GFX90A-NEXT:    v_xor_b32_e32 v0, s4, v0
-; GFX90A-NEXT:    v_xor_b32_e32 v1, s4, v1
-; GFX90A-NEXT:    v_mov_b32_e32 v3, s4
-; GFX90A-NEXT:    v_subrev_co_u32_e32 v0, vcc, s4, v0
-; GFX90A-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX90A-NEXT:    s_endpgm
   %r = sdiv i64 %x, 1235195
   store i64 %r, i64 addrspace(1)* %out
   ret void
@@ -11170,20 +8463,6 @@ define amdgpu_kernel void @sdiv_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
-;
-; GFX90A-LABEL: sdiv_i64_pow2k_denom:
-; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_ashr_i32 s4, s3, 31
-; GFX90A-NEXT:    s_lshr_b32 s4, s4, 20
-; GFX90A-NEXT:    s_add_u32 s2, s2, s4
-; GFX90A-NEXT:    s_addc_u32 s3, s3, 0
-; GFX90A-NEXT:    s_ashr_i64 s[2:3], s[2:3], 12
-; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX90A-NEXT:    s_endpgm
   %r = sdiv i64 %x, 4096
   store i64 %r, i64 addrspace(1)* %out
   ret void
@@ -11441,22 +8720,25 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
 ; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s9, v5
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, v7, v6, s[0:1]
+; GFX9-NEXT:    v_add_co_u32_e64 v6, s[0:1], 2, v0
+; GFX9-NEXT:    v_addc_co_u32_e64 v7, s[0:1], 0, v1, s[0:1]
+; GFX9-NEXT:    v_add_co_u32_e64 v8, s[0:1], 1, v0
+; GFX9-NEXT:    v_addc_co_u32_e64 v9, s[0:1], 0, v1, s[0:1]
+; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v5
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v9, v7, s[0:1]
 ; GFX9-NEXT:    v_mov_b32_e32 v7, s7
 ; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v7, v3, vcc
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v3
-; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v5
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v4
-; GFX9-NEXT:    v_cndmask_b32_e64 v5, 1, 2, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s9, v3
-; GFX9-NEXT:    v_add_co_u32_e64 v5, s[0:1], v0, v5
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v7, v4, vcc
-; GFX9-NEXT:    v_addc_co_u32_e64 v6, s[0:1], 0, v1, s[0:1]
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v8, v6, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX9-NEXT:    s_xor_b64 s[0:1], s[10:11], s[2:3]
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
 ; GFX9-NEXT:    v_xor_b32_e32 v0, s0, v0
 ; GFX9-NEXT:    v_xor_b32_e32 v1, s1, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
@@ -11464,137 +8746,6 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
 ; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
-;
-; GFX90A-LABEL: sdiv_i64_pow2_shl_denom:
-; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_load_dword s4, s[0:1], 0x34
-; GFX90A-NEXT:    s_mov_b64 s[2:3], 0x1000
-; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_lshl_b64 s[4:5], s[2:3], s4
-; GFX90A-NEXT:    s_ashr_i32 s2, s5, 31
-; GFX90A-NEXT:    s_add_u32 s4, s4, s2
-; GFX90A-NEXT:    s_mov_b32 s3, s2
-; GFX90A-NEXT:    s_addc_u32 s5, s5, s2
-; GFX90A-NEXT:    s_xor_b64 s[8:9], s[4:5], s[2:3]
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, s8
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v1, s9
-; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX90A-NEXT:    s_sub_u32 s0, 0, s8
-; GFX90A-NEXT:    s_subb_u32 s1, 0, s9
-; GFX90A-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
-; GFX90A-NEXT:    v_rcp_f32_e32 v0, v0
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_ashr_i32 s10, s7, 31
-; GFX90A-NEXT:    s_mov_b32 s11, s10
-; GFX90A-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
-; GFX90A-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
-; GFX90A-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX90A-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX90A-NEXT:    v_mul_lo_u32 v3, s0, v1
-; GFX90A-NEXT:    v_mul_hi_u32 v5, s0, v0
-; GFX90A-NEXT:    v_mul_lo_u32 v4, s1, v0
-; GFX90A-NEXT:    v_add_u32_e32 v3, v5, v3
-; GFX90A-NEXT:    v_mul_lo_u32 v6, s0, v0
-; GFX90A-NEXT:    v_add_u32_e32 v3, v3, v4
-; GFX90A-NEXT:    v_mul_lo_u32 v5, v0, v3
-; GFX90A-NEXT:    v_mul_hi_u32 v7, v0, v6
-; GFX90A-NEXT:    v_mul_hi_u32 v4, v0, v3
-; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v7, v5
-; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
-; GFX90A-NEXT:    v_mul_hi_u32 v8, v1, v6
-; GFX90A-NEXT:    v_mul_lo_u32 v6, v1, v6
-; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v6
-; GFX90A-NEXT:    v_mul_hi_u32 v7, v1, v3
-; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v8, vcc
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v7, v2, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v3, v1, v3
-; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
-; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v5, vcc
-; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
-; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v4, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v3, s0, v1
-; GFX90A-NEXT:    v_mul_hi_u32 v4, s0, v0
-; GFX90A-NEXT:    v_add_u32_e32 v3, v4, v3
-; GFX90A-NEXT:    v_mul_lo_u32 v4, s1, v0
-; GFX90A-NEXT:    v_add_u32_e32 v3, v3, v4
-; GFX90A-NEXT:    v_mul_lo_u32 v5, s0, v0
-; GFX90A-NEXT:    v_mul_hi_u32 v6, v1, v5
-; GFX90A-NEXT:    v_mul_lo_u32 v7, v1, v5
-; GFX90A-NEXT:    v_mul_lo_u32 v9, v0, v3
-; GFX90A-NEXT:    v_mul_hi_u32 v5, v0, v5
-; GFX90A-NEXT:    v_mul_hi_u32 v8, v0, v3
-; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v9
-; GFX90A-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v8, vcc
-; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v7
-; GFX90A-NEXT:    v_mul_hi_u32 v4, v1, v3
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v8, v6, vcc
-; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v2, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v3, v1, v3
-; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v5, v3
-; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
-; GFX90A-NEXT:    s_add_u32 s0, s6, s10
-; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
-; GFX90A-NEXT:    s_addc_u32 s1, s7, s10
-; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v4, vcc
-; GFX90A-NEXT:    s_xor_b64 s[6:7], s[0:1], s[10:11]
-; GFX90A-NEXT:    v_mul_lo_u32 v4, s6, v1
-; GFX90A-NEXT:    v_mul_hi_u32 v5, s6, v0
-; GFX90A-NEXT:    v_mul_hi_u32 v3, s6, v1
-; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, v5, v4
-; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX90A-NEXT:    v_mul_hi_u32 v6, s7, v0
-; GFX90A-NEXT:    v_mul_lo_u32 v0, s7, v0
-; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v4, v0
-; GFX90A-NEXT:    v_mul_hi_u32 v5, s7, v1
-; GFX90A-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v6, vcc
-; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v2, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v1, s7, v1
-; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
-; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v3, s8, v1
-; GFX90A-NEXT:    v_mul_hi_u32 v4, s8, v0
-; GFX90A-NEXT:    v_add_u32_e32 v3, v4, v3
-; GFX90A-NEXT:    v_mul_lo_u32 v4, s9, v0
-; GFX90A-NEXT:    v_add_u32_e32 v3, v3, v4
-; GFX90A-NEXT:    v_mul_lo_u32 v5, s8, v0
-; GFX90A-NEXT:    v_sub_u32_e32 v4, s7, v3
-; GFX90A-NEXT:    v_mov_b32_e32 v6, s9
-; GFX90A-NEXT:    v_sub_co_u32_e32 v5, vcc, s6, v5
-; GFX90A-NEXT:    v_subb_co_u32_e64 v4, s[0:1], v4, v6, vcc
-; GFX90A-NEXT:    v_subrev_co_u32_e64 v6, s[0:1], s8, v5
-; GFX90A-NEXT:    v_subbrev_co_u32_e64 v4, s[0:1], 0, v4, s[0:1]
-; GFX90A-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v4
-; GFX90A-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[0:1]
-; GFX90A-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v6
-; GFX90A-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
-; GFX90A-NEXT:    v_cmp_eq_u32_e64 s[0:1], s9, v4
-; GFX90A-NEXT:    v_cndmask_b32_e64 v4, v7, v6, s[0:1]
-; GFX90A-NEXT:    v_mov_b32_e32 v7, s7
-; GFX90A-NEXT:    v_subb_co_u32_e32 v3, vcc, v7, v3, vcc
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s9, v3
-; GFX90A-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
-; GFX90A-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s8, v5
-; GFX90A-NEXT:    v_cndmask_b32_e64 v4, 1, 2, s[0:1]
-; GFX90A-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
-; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, s9, v3
-; GFX90A-NEXT:    v_add_co_u32_e64 v4, s[0:1], v0, v4
-; GFX90A-NEXT:    v_cndmask_b32_e32 v3, v7, v5, vcc
-; GFX90A-NEXT:    v_addc_co_u32_e64 v6, s[0:1], 0, v1, s[0:1]
-; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
-; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; GFX90A-NEXT:    s_xor_b64 s[0:1], s[10:11], s[2:3]
-; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
-; GFX90A-NEXT:    v_xor_b32_e32 v0, s0, v0
-; GFX90A-NEXT:    v_xor_b32_e32 v1, s1, v1
-; GFX90A-NEXT:    v_mov_b32_e32 v3, s1
-; GFX90A-NEXT:    v_subrev_co_u32_e32 v0, vcc, s0, v0
-; GFX90A-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
-; GFX90A-NEXT:    s_endpgm
   %shl.y = shl i64 4096, %y
   %r = sdiv i64 %x, %shl.y
   store i64 %r, i64 addrspace(1)* %out
@@ -11658,29 +8809,6 @@ define amdgpu_kernel void @sdiv_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out,
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
 ; GFX9-NEXT:    s_endpgm
-;
-; GFX90A-LABEL: sdiv_v2i64_pow2k_denom:
-; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
-; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_ashr_i32 s0, s5, 31
-; GFX90A-NEXT:    s_lshr_b32 s0, s0, 20
-; GFX90A-NEXT:    s_add_u32 s0, s4, s0
-; GFX90A-NEXT:    s_addc_u32 s1, s5, 0
-; GFX90A-NEXT:    s_ashr_i32 s4, s7, 31
-; GFX90A-NEXT:    s_ashr_i64 s[0:1], s[0:1], 12
-; GFX90A-NEXT:    s_lshr_b32 s4, s4, 20
-; GFX90A-NEXT:    s_add_u32 s4, s6, s4
-; GFX90A-NEXT:    s_addc_u32 s5, s7, 0
-; GFX90A-NEXT:    s_ashr_i64 s[4:5], s[4:5], 12
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NEXT:    v_mov_b32_e32 v1, s1
-; GFX90A-NEXT:    v_mov_b32_e32 v2, s4
-; GFX90A-NEXT:    v_mov_b32_e32 v3, s5
-; GFX90A-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
-; GFX90A-NEXT:    s_endpgm
   %r = sdiv <2 x i64> %x, <i64 4096, i64 4096>
   store <2 x i64> %r, <2 x i64> addrspace(1)* %out
   ret void
@@ -11836,14 +8964,9 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i32 s2, s5, 31
-; GFX9-NEXT:    s_lshr_b32 s2, s2, 20
 ; GFX9-NEXT:    v_mul_hi_u32 v2, v0, s8
 ; GFX9-NEXT:    v_mul_lo_u32 v3, v1, s8
 ; GFX9-NEXT:    v_mul_lo_u32 v5, v0, s8
-; GFX9-NEXT:    s_add_u32 s2, s4, s2
-; GFX9-NEXT:    s_addc_u32 s3, s5, 0
 ; GFX9-NEXT:    v_add_u32_e32 v2, v2, v3
 ; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v3, v0, v2
@@ -11855,8 +8978,6 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v7, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v7, v1, v5
 ; GFX9-NEXT:    v_mul_hi_u32 v5, v1, v5
-; GFX9-NEXT:    s_ashr_i64 s[2:3], s[2:3], 12
-; GFX9-NEXT:    s_ashr_i32 s4, s7, 31
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v7
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v5, vcc
 ; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v8, v4, vcc
@@ -11867,8 +8988,9 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX9-NEXT:    v_mul_lo_u32 v2, v1, s8
 ; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s8
 ; GFX9-NEXT:    v_mul_lo_u32 v5, v0, s8
-; GFX9-NEXT:    s_add_u32 s6, s6, s4
-; GFX9-NEXT:    s_mov_b32 s5, s4
+; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x24
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_ashr_i32 s2, s5, 31
 ; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
 ; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v7, v0, v2
@@ -11880,12 +9002,19 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v8, v7
 ; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v9, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
+; GFX9-NEXT:    s_lshr_b32 s2, s2, 20
 ; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v7, v5
+; GFX9-NEXT:    s_add_u32 s2, s4, s2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v8, v6, vcc
+; GFX9-NEXT:    s_addc_u32 s3, s5, 0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v4, vcc
+; GFX9-NEXT:    s_ashr_i64 s[2:3], s[2:3], 12
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v5, v2
+; GFX9-NEXT:    s_ashr_i32 s4, s7, 31
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT:    s_add_u32 s6, s6, s4
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT:    s_mov_b32 s5, s4
 ; GFX9-NEXT:    s_addc_u32 s7, s7, s4
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX9-NEXT:    s_xor_b64 s[6:7], s[6:7], s[4:5]
@@ -11898,38 +9027,40 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v5, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v5, s7, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v0, s7, v0
-; GFX9-NEXT:    s_movk_i32 s5, 0xfff
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    s_movk_i32 s0, 0xfff
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v5
 ; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v0, vcc
 ; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v6, v4, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v2, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v2, v1, s5
-; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s5
-; GFX9-NEXT:    v_mul_lo_u32 v5, v0, s5
-; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s7
-; GFX9-NEXT:    v_sub_co_u32_e32 v5, vcc, s6, v5
-; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v3, v2, vcc
-; GFX9-NEXT:    v_subrev_co_u32_e32 v3, vcc, s5, v5
-; GFX9-NEXT:    v_subbrev_co_u32_e32 v6, vcc, 0, v2, vcc
-; GFX9-NEXT:    s_movk_i32 s5, 0xffe
-; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s5, v3
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v6
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, -1, v3, vcc
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, 1, 2, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v0, v3
-; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s5, v5
-; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, -1, v5, vcc
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, 2, v0
+; GFX9-NEXT:    v_mul_lo_u32 v5, v1, s0
+; GFX9-NEXT:    v_mul_hi_u32 v6, v0, s0
+; GFX9-NEXT:    v_mul_lo_u32 v9, v0, s0
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, 1, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v1, vcc
+; GFX9-NEXT:    v_add_u32_e32 v5, v6, v5
+; GFX9-NEXT:    v_mov_b32_e32 v6, s7
+; GFX9-NEXT:    v_sub_co_u32_e32 v9, vcc, s6, v9
+; GFX9-NEXT:    v_subb_co_u32_e32 v5, vcc, v6, v5, vcc
+; GFX9-NEXT:    v_subrev_co_u32_e32 v6, vcc, s0, v9
+; GFX9-NEXT:    v_subbrev_co_u32_e32 v10, vcc, 0, v5, vcc
+; GFX9-NEXT:    s_movk_i32 s0, 0xffe
+; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s0, v6
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v10
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, -1, v6, vcc
+; GFX9-NEXT:    v_cmp_lt_u32_e64 s[0:1], s0, v9
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v5
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, -1, v6, s[0:1]
+; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v8, v3, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
 ; GFX9-NEXT:    v_xor_b32_e32 v0, s4, v0
 ; GFX9-NEXT:    v_xor_b32_e32 v1, s4, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s4
@@ -11937,128 +9068,8 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[8:9]
 ; GFX9-NEXT:    s_endpgm
-;
-; GFX90A-LABEL: ssdiv_v2i64_mixed_pow2k_denom:
-; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x457ff000
-; GFX90A-NEXT:    v_mov_b32_e32 v1, 0x4f800000
-; GFX90A-NEXT:    v_mac_f32_e32 v0, 0, v1
-; GFX90A-NEXT:    v_rcp_f32_e32 v0, v0
-; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
-; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
-; GFX90A-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
-; GFX90A-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
-; GFX90A-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX90A-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_ashr_i32 s0, s5, 31
-; GFX90A-NEXT:    s_lshr_b32 s0, s0, 20
-; GFX90A-NEXT:    s_add_u32 s0, s4, s0
-; GFX90A-NEXT:    s_movk_i32 s4, 0xf001
-; GFX90A-NEXT:    v_mul_hi_u32 v2, v0, s4
-; GFX90A-NEXT:    v_mul_lo_u32 v3, v1, s4
-; GFX90A-NEXT:    v_add_u32_e32 v2, v2, v3
-; GFX90A-NEXT:    v_sub_u32_e32 v2, v2, v0
-; GFX90A-NEXT:    v_mul_lo_u32 v6, v0, s4
-; GFX90A-NEXT:    v_mul_lo_u32 v5, v0, v2
-; GFX90A-NEXT:    v_mul_hi_u32 v7, v0, v6
-; GFX90A-NEXT:    v_mul_hi_u32 v3, v0, v2
-; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v7, v5
-; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX90A-NEXT:    v_mul_hi_u32 v8, v1, v6
-; GFX90A-NEXT:    v_mul_lo_u32 v6, v1, v6
-; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v6
-; GFX90A-NEXT:    v_mul_hi_u32 v7, v1, v2
-; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v8, vcc
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v7, v4, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v2, v1, v2
-; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
-; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v5, vcc
-; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v2, v1, s4
-; GFX90A-NEXT:    v_mul_hi_u32 v3, v0, s4
-; GFX90A-NEXT:    v_add_u32_e32 v2, v3, v2
-; GFX90A-NEXT:    v_sub_u32_e32 v2, v2, v0
-; GFX90A-NEXT:    v_mul_lo_u32 v5, v0, s4
-; GFX90A-NEXT:    v_mul_hi_u32 v6, v1, v5
-; GFX90A-NEXT:    v_mul_lo_u32 v7, v1, v5
-; GFX90A-NEXT:    v_mul_lo_u32 v9, v0, v2
-; GFX90A-NEXT:    v_mul_hi_u32 v5, v0, v5
-; GFX90A-NEXT:    v_mul_hi_u32 v8, v0, v2
-; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v9
-; GFX90A-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v8, vcc
-; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v7
-; GFX90A-NEXT:    v_mul_hi_u32 v3, v1, v2
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v8, v6, vcc
-; GFX90A-NEXT:    s_addc_u32 s1, s5, 0
-; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v4, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v2, v1, v2
-; GFX90A-NEXT:    s_ashr_i64 s[0:1], s[0:1], 12
-; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v5, v2
-; GFX90A-NEXT:    s_ashr_i32 s4, s7, 31
-; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX90A-NEXT:    s_add_u32 s6, s6, s4
-; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX90A-NEXT:    s_mov_b32 s5, s4
-; GFX90A-NEXT:    s_addc_u32 s7, s7, s4
-; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX90A-NEXT:    s_xor_b64 s[6:7], s[6:7], s[4:5]
-; GFX90A-NEXT:    v_mul_lo_u32 v3, s6, v1
-; GFX90A-NEXT:    v_mul_hi_u32 v5, s6, v0
-; GFX90A-NEXT:    v_mul_hi_u32 v2, s6, v1
-; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v5, v3
-; GFX90A-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
-; GFX90A-NEXT:    v_mul_hi_u32 v6, s7, v0
-; GFX90A-NEXT:    v_mul_lo_u32 v0, s7, v0
-; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v3, v0
-; GFX90A-NEXT:    v_mul_hi_u32 v5, s7, v1
-; GFX90A-NEXT:    v_addc_co_u32_e32 v0, vcc, v2, v6, vcc
-; GFX90A-NEXT:    v_addc_co_u32_e32 v2, vcc, v5, v4, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v1, s7, v1
-; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
-; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v2, vcc
-; GFX90A-NEXT:    s_movk_i32 s5, 0xfff
-; GFX90A-NEXT:    v_mul_lo_u32 v2, v1, s5
-; GFX90A-NEXT:    v_mul_hi_u32 v3, v0, s5
-; GFX90A-NEXT:    v_add_u32_e32 v2, v3, v2
-; GFX90A-NEXT:    v_mul_lo_u32 v3, v0, s5
-; GFX90A-NEXT:    v_mov_b32_e32 v5, s7
-; GFX90A-NEXT:    v_sub_co_u32_e32 v3, vcc, s6, v3
-; GFX90A-NEXT:    v_subb_co_u32_e32 v2, vcc, v5, v2, vcc
-; GFX90A-NEXT:    v_subrev_co_u32_e32 v5, vcc, s5, v3
-; GFX90A-NEXT:    v_subbrev_co_u32_e32 v6, vcc, 0, v2, vcc
-; GFX90A-NEXT:    s_movk_i32 s5, 0xffe
-; GFX90A-NEXT:    v_cmp_lt_u32_e32 vcc, s5, v5
-; GFX90A-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
-; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v6
-; GFX90A-NEXT:    v_cndmask_b32_e32 v5, -1, v5, vcc
-; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
-; GFX90A-NEXT:    v_cndmask_b32_e64 v5, 1, 2, vcc
-; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v0, v5
-; GFX90A-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v1, vcc
-; GFX90A-NEXT:    v_cmp_lt_u32_e32 vcc, s5, v3
-; GFX90A-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
-; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
-; GFX90A-NEXT:    v_cndmask_b32_e32 v2, -1, v3, vcc
-; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
-; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
-; GFX90A-NEXT:    v_xor_b32_e32 v0, s4, v0
-; GFX90A-NEXT:    v_xor_b32_e32 v1, s4, v1
-; GFX90A-NEXT:    v_mov_b32_e32 v3, s4
-; GFX90A-NEXT:    v_subrev_co_u32_e32 v2, vcc, s4, v0
-; GFX90A-NEXT:    v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NEXT:    v_mov_b32_e32 v1, s1
-; GFX90A-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
-; GFX90A-NEXT:    s_endpgm
   %r = sdiv <2 x i64> %x, <i64 4096, i64 4095>
   store <2 x i64> %r, <2 x i64> addrspace(1)* %out
   ret void
@@ -12432,6 +9443,7 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX9-NEXT:    v_mul_lo_u32 v5, s5, v2
 ; GFX9-NEXT:    v_mul_hi_u32 v2, s5, v2
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT:    s_xor_b64 s[12:13], s[14:15], s[12:13]
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
 ; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v4, v2, vcc
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v0, vcc
@@ -12455,68 +9467,70 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s11, v5
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, v7, v6, s[0:1]
-; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v5
-; GFX9-NEXT:    v_cndmask_b32_e64 v5, 1, 2, s[0:1]
-; GFX9-NEXT:    v_add_co_u32_e64 v5, s[0:1], v1, v5
-; GFX9-NEXT:    v_addc_co_u32_e64 v6, s[0:1], 0, v2, s[0:1]
-; GFX9-NEXT:    s_xor_b64 s[0:1], s[14:15], s[12:13]
+; GFX9-NEXT:    v_add_co_u32_e64 v6, s[0:1], 2, v1
+; GFX9-NEXT:    v_addc_co_u32_e64 v7, s[0:1], 0, v2, s[0:1]
+; GFX9-NEXT:    v_add_co_u32_e64 v8, s[0:1], 1, v1
+; GFX9-NEXT:    v_addc_co_u32_e64 v9, s[0:1], 0, v2, s[0:1]
 ; GFX9-NEXT:    s_ashr_i32 s4, s9, 31
+; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v5
 ; GFX9-NEXT:    s_add_u32 s8, s8, s4
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v9, v7, s[0:1]
 ; GFX9-NEXT:    v_mov_b32_e32 v7, s5
 ; GFX9-NEXT:    s_mov_b32 s5, s4
 ; GFX9-NEXT:    s_addc_u32 s9, s9, s4
 ; GFX9-NEXT:    s_xor_b64 s[8:9], s[8:9], s[4:5]
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v9, s8
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v10, s9
 ; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v7, v3, vcc
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v7, s8
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v8, s9
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
-; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, -1, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s10, v4
-; GFX9-NEXT:    v_mac_f32_e32 v7, s16, v8
-; GFX9-NEXT:    v_rcp_f32_e32 v7, v7
 ; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s11, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v9, v4, vcc
+; GFX9-NEXT:    v_mac_f32_e32 v9, s16, v10
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v7, v4, vcc
+; GFX9-NEXT:    v_rcp_f32_e32 v4, v9
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
-; GFX9-NEXT:    v_mul_f32_e32 v3, s17, v7
-; GFX9-NEXT:    v_mul_f32_e32 v4, s18, v3
-; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
-; GFX9-NEXT:    v_mac_f32_e32 v3, s19, v4
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v8, v6, s[0:1]
+; GFX9-NEXT:    v_mul_f32_e32 v4, s17, v4
+; GFX9-NEXT:    v_mul_f32_e32 v5, s18, v4
+; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
+; GFX9-NEXT:    v_mac_f32_e32 v4, s19, v5
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v4
-; GFX9-NEXT:    s_sub_u32 s10, 0, s8
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
-; GFX9-NEXT:    s_subb_u32 s11, 0, s9
-; GFX9-NEXT:    v_mul_hi_u32 v5, s10, v3
-; GFX9-NEXT:    v_mul_lo_u32 v7, s10, v4
-; GFX9-NEXT:    v_mul_lo_u32 v8, s11, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v6, s10, v3
-; GFX9-NEXT:    v_add_u32_e32 v5, v5, v7
-; GFX9-NEXT:    v_add_u32_e32 v5, v5, v8
-; GFX9-NEXT:    v_mul_lo_u32 v7, v3, v5
-; GFX9-NEXT:    v_mul_hi_u32 v8, v3, v6
-; GFX9-NEXT:    v_mul_hi_u32 v9, v3, v5
-; GFX9-NEXT:    v_mul_hi_u32 v10, v4, v5
-; GFX9-NEXT:    v_mul_lo_u32 v5, v4, v5
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v5, v5
+; GFX9-NEXT:    s_sub_u32 s0, 0, s8
+; GFX9-NEXT:    s_subb_u32 s1, 0, s9
+; GFX9-NEXT:    v_mul_hi_u32 v6, s0, v4
+; GFX9-NEXT:    v_mul_lo_u32 v7, s0, v5
+; GFX9-NEXT:    v_mul_lo_u32 v8, s1, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v3, s0, v4
+; GFX9-NEXT:    v_add_u32_e32 v6, v6, v7
+; GFX9-NEXT:    v_add_u32_e32 v6, v6, v8
+; GFX9-NEXT:    v_mul_lo_u32 v7, v4, v6
+; GFX9-NEXT:    v_mul_hi_u32 v8, v4, v3
+; GFX9-NEXT:    v_mul_hi_u32 v9, v4, v6
+; GFX9-NEXT:    v_mul_hi_u32 v10, v5, v6
+; GFX9-NEXT:    v_mul_lo_u32 v6, v5, v6
 ; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v8, v7
 ; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v9, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v9, v4, v6
-; GFX9-NEXT:    v_mul_hi_u32 v6, v4, v6
-; GFX9-NEXT:    v_xor_b32_e32 v1, s0, v1
-; GFX9-NEXT:    v_xor_b32_e32 v2, s1, v2
+; GFX9-NEXT:    v_mul_lo_u32 v9, v5, v3
+; GFX9-NEXT:    v_mul_hi_u32 v3, v5, v3
+; GFX9-NEXT:    s_ashr_i32 s10, s7, 31
+; GFX9-NEXT:    s_mov_b32 s11, s10
 ; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v9
-; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v8, v6, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v8, v3, vcc
 ; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v10, v0, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v6, v5
+; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v6
 ; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v7, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v5, s10, v4
-; GFX9-NEXT:    v_mul_hi_u32 v6, s10, v3
-; GFX9-NEXT:    v_mul_lo_u32 v7, s11, v3
-; GFX9-NEXT:    v_mul_lo_u32 v8, s10, v3
-; GFX9-NEXT:    s_ashr_i32 s10, s7, 31
+; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
+; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v5, v6, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v5, s0, v4
+; GFX9-NEXT:    v_mul_hi_u32 v6, s0, v3
+; GFX9-NEXT:    v_mul_lo_u32 v7, s1, v3
+; GFX9-NEXT:    v_mul_lo_u32 v8, s0, v3
+; GFX9-NEXT:    s_add_u32 s0, s6, s10
 ; GFX9-NEXT:    v_add_u32_e32 v5, v6, v5
 ; GFX9-NEXT:    v_add_u32_e32 v5, v5, v7
 ; GFX9-NEXT:    v_mul_lo_u32 v9, v3, v5
@@ -12533,12 +9547,10 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v6, v0, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v7, v5
 ; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
-; GFX9-NEXT:    s_add_u32 s6, s6, s10
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
-; GFX9-NEXT:    s_mov_b32 s11, s10
-; GFX9-NEXT:    s_addc_u32 s7, s7, s10
+; GFX9-NEXT:    s_addc_u32 s1, s7, s10
 ; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
-; GFX9-NEXT:    s_xor_b64 s[6:7], s[6:7], s[10:11]
+; GFX9-NEXT:    s_xor_b64 s[6:7], s[0:1], s[10:11]
 ; GFX9-NEXT:    v_mul_lo_u32 v5, s6, v4
 ; GFX9-NEXT:    v_mul_hi_u32 v6, s6, v3
 ; GFX9-NEXT:    v_mul_hi_u32 v8, s6, v4
@@ -12548,7 +9560,8 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v8, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v8, s7, v3
 ; GFX9-NEXT:    v_mul_hi_u32 v3, s7, v3
-; GFX9-NEXT:    v_mov_b32_e32 v7, s1
+; GFX9-NEXT:    v_xor_b32_e32 v1, s12, v1
+; GFX9-NEXT:    v_xor_b32_e32 v2, s13, v2
 ; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v8
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v3, vcc
 ; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v9, v0, vcc
@@ -12557,7 +9570,8 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX9-NEXT:    v_mul_lo_u32 v5, s8, v4
 ; GFX9-NEXT:    v_mul_hi_u32 v6, s8, v3
 ; GFX9-NEXT:    v_mul_lo_u32 v8, s9, v3
-; GFX9-NEXT:    v_subrev_co_u32_e32 v1, vcc, s0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v7, s13
+; GFX9-NEXT:    v_subrev_co_u32_e32 v1, vcc, s12, v1
 ; GFX9-NEXT:    v_add_u32_e32 v5, v6, v5
 ; GFX9-NEXT:    v_mul_lo_u32 v6, s8, v3
 ; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v2, v7, vcc
@@ -12574,22 +9588,25 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[0:1]
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s9, v7
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, v9, v8, s[0:1]
+; GFX9-NEXT:    v_add_co_u32_e64 v8, s[0:1], 2, v3
+; GFX9-NEXT:    v_addc_co_u32_e64 v9, s[0:1], 0, v4, s[0:1]
+; GFX9-NEXT:    v_add_co_u32_e64 v10, s[0:1], 1, v3
+; GFX9-NEXT:    v_addc_co_u32_e64 v11, s[0:1], 0, v4, s[0:1]
+; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v7
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, v11, v9, s[0:1]
 ; GFX9-NEXT:    v_mov_b32_e32 v9, s7
 ; GFX9-NEXT:    v_subb_co_u32_e32 v5, vcc, v9, v5, vcc
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v5
-; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v7
 ; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, -1, vcc
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v6
-; GFX9-NEXT:    v_cndmask_b32_e64 v7, 1, 2, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s9, v5
-; GFX9-NEXT:    v_add_co_u32_e64 v7, s[0:1], v3, v7
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v9, v6, vcc
-; GFX9-NEXT:    v_addc_co_u32_e64 v8, s[0:1], 0, v4, s[0:1]
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v10, v8, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
 ; GFX9-NEXT:    s_xor_b64 s[0:1], s[10:11], s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v8, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc
 ; GFX9-NEXT:    v_xor_b32_e32 v3, s0, v3
 ; GFX9-NEXT:    v_xor_b32_e32 v4, s1, v4
 ; GFX9-NEXT:    v_mov_b32_e32 v5, s1
@@ -12598,262 +9615,6 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_store_dwordx4 v0, v[1:4], s[2:3]
 ; GFX9-NEXT:    s_endpgm
-;
-; GFX90A-LABEL: sdiv_v2i64_pow2_shl_denom:
-; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x44
-; GFX90A-NEXT:    s_mov_b64 s[2:3], 0x1000
-; GFX90A-NEXT:    s_mov_b32 s16, 0x4f800000
-; GFX90A-NEXT:    s_mov_b32 s17, 0x5f7ffffc
-; GFX90A-NEXT:    s_mov_b32 s18, 0x2f800000
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_lshl_b64 s[8:9], s[2:3], s6
-; GFX90A-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
-; GFX90A-NEXT:    s_ashr_i32 s10, s3, 31
-; GFX90A-NEXT:    s_add_u32 s2, s2, s10
-; GFX90A-NEXT:    s_mov_b32 s11, s10
-; GFX90A-NEXT:    s_addc_u32 s3, s3, s10
-; GFX90A-NEXT:    s_xor_b64 s[12:13], s[2:3], s[10:11]
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, s12
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v1, s13
-; GFX90A-NEXT:    s_mov_b32 s19, 0xcf800000
-; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
-; GFX90A-NEXT:    s_sub_u32 s0, 0, s12
-; GFX90A-NEXT:    v_mac_f32_e32 v0, s16, v1
-; GFX90A-NEXT:    v_rcp_f32_e32 v0, v0
-; GFX90A-NEXT:    s_subb_u32 s1, 0, s13
-; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_ashr_i32 s14, s5, 31
-; GFX90A-NEXT:    v_mul_f32_e32 v0, s17, v0
-; GFX90A-NEXT:    v_mul_f32_e32 v1, s18, v0
-; GFX90A-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX90A-NEXT:    v_mac_f32_e32 v0, s19, v1
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX90A-NEXT:    s_mov_b32 s15, s14
-; GFX90A-NEXT:    v_mul_hi_u32 v3, s0, v0
-; GFX90A-NEXT:    v_mul_lo_u32 v5, s0, v1
-; GFX90A-NEXT:    v_mul_lo_u32 v2, s1, v0
-; GFX90A-NEXT:    v_add_u32_e32 v3, v3, v5
-; GFX90A-NEXT:    v_add_u32_e32 v2, v3, v2
-; GFX90A-NEXT:    v_mul_lo_u32 v6, s0, v0
-; GFX90A-NEXT:    v_mul_lo_u32 v5, v0, v2
-; GFX90A-NEXT:    v_mul_hi_u32 v7, v0, v6
-; GFX90A-NEXT:    v_mul_hi_u32 v3, v0, v2
-; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v7, v5
-; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX90A-NEXT:    v_mul_hi_u32 v8, v1, v6
-; GFX90A-NEXT:    v_mul_lo_u32 v6, v1, v6
-; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v6
-; GFX90A-NEXT:    v_mul_hi_u32 v7, v1, v2
-; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v8, vcc
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v7, v4, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v2, v1, v2
-; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
-; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v5, vcc
-; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v2, s0, v1
-; GFX90A-NEXT:    v_mul_hi_u32 v3, s0, v0
-; GFX90A-NEXT:    v_add_u32_e32 v2, v3, v2
-; GFX90A-NEXT:    v_mul_lo_u32 v3, s1, v0
-; GFX90A-NEXT:    v_add_u32_e32 v2, v2, v3
-; GFX90A-NEXT:    v_mul_lo_u32 v5, s0, v0
-; GFX90A-NEXT:    v_mul_hi_u32 v6, v1, v5
-; GFX90A-NEXT:    v_mul_lo_u32 v7, v1, v5
-; GFX90A-NEXT:    v_mul_lo_u32 v9, v0, v2
-; GFX90A-NEXT:    v_mul_hi_u32 v5, v0, v5
-; GFX90A-NEXT:    v_mul_hi_u32 v8, v0, v2
-; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v9
-; GFX90A-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v8, vcc
-; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v7
-; GFX90A-NEXT:    v_mul_hi_u32 v3, v1, v2
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v8, v6, vcc
-; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v4, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v2, v1, v2
-; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v5, v2
-; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX90A-NEXT:    s_add_u32 s0, s4, s14
-; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX90A-NEXT:    s_addc_u32 s1, s5, s14
-; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX90A-NEXT:    s_xor_b64 s[4:5], s[0:1], s[14:15]
-; GFX90A-NEXT:    v_mul_lo_u32 v3, s4, v1
-; GFX90A-NEXT:    v_mul_hi_u32 v5, s4, v0
-; GFX90A-NEXT:    v_mul_hi_u32 v2, s4, v1
-; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v5, v3
-; GFX90A-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
-; GFX90A-NEXT:    v_mul_hi_u32 v6, s5, v0
-; GFX90A-NEXT:    v_mul_lo_u32 v0, s5, v0
-; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v3, v0
-; GFX90A-NEXT:    v_mul_hi_u32 v5, s5, v1
-; GFX90A-NEXT:    v_addc_co_u32_e32 v0, vcc, v2, v6, vcc
-; GFX90A-NEXT:    v_addc_co_u32_e32 v2, vcc, v5, v4, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v1, s5, v1
-; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
-; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v2, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v2, s12, v1
-; GFX90A-NEXT:    v_mul_hi_u32 v3, s12, v0
-; GFX90A-NEXT:    v_add_u32_e32 v2, v3, v2
-; GFX90A-NEXT:    v_mul_lo_u32 v3, s13, v0
-; GFX90A-NEXT:    v_add_u32_e32 v2, v2, v3
-; GFX90A-NEXT:    v_mul_lo_u32 v5, s12, v0
-; GFX90A-NEXT:    v_sub_u32_e32 v3, s5, v2
-; GFX90A-NEXT:    v_mov_b32_e32 v6, s13
-; GFX90A-NEXT:    v_sub_co_u32_e32 v5, vcc, s4, v5
-; GFX90A-NEXT:    v_subb_co_u32_e64 v3, s[0:1], v3, v6, vcc
-; GFX90A-NEXT:    v_subrev_co_u32_e64 v6, s[0:1], s12, v5
-; GFX90A-NEXT:    v_subbrev_co_u32_e64 v3, s[0:1], 0, v3, s[0:1]
-; GFX90A-NEXT:    v_cmp_le_u32_e64 s[0:1], s13, v3
-; GFX90A-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[0:1]
-; GFX90A-NEXT:    v_cmp_le_u32_e64 s[0:1], s12, v6
-; GFX90A-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
-; GFX90A-NEXT:    v_cmp_eq_u32_e64 s[0:1], s13, v3
-; GFX90A-NEXT:    v_cndmask_b32_e64 v3, v7, v6, s[0:1]
-; GFX90A-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v3
-; GFX90A-NEXT:    v_cndmask_b32_e64 v3, 1, 2, s[0:1]
-; GFX90A-NEXT:    v_mov_b32_e32 v7, s5
-; GFX90A-NEXT:    v_add_co_u32_e64 v3, s[0:1], v0, v3
-; GFX90A-NEXT:    v_subb_co_u32_e32 v2, vcc, v7, v2, vcc
-; GFX90A-NEXT:    v_addc_co_u32_e64 v6, s[0:1], 0, v1, s[0:1]
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s13, v2
-; GFX90A-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s12, v5
-; GFX90A-NEXT:    s_xor_b64 s[0:1], s[14:15], s[10:11]
-; GFX90A-NEXT:    s_ashr_i32 s4, s9, 31
-; GFX90A-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
-; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, s13, v2
-; GFX90A-NEXT:    s_add_u32 s8, s8, s4
-; GFX90A-NEXT:    v_cndmask_b32_e32 v2, v7, v5, vcc
-; GFX90A-NEXT:    s_mov_b32 s5, s4
-; GFX90A-NEXT:    s_addc_u32 s9, s9, s4
-; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; GFX90A-NEXT:    s_xor_b64 s[8:9], s[8:9], s[4:5]
-; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v2, s8
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v3, s9
-; GFX90A-NEXT:    v_xor_b32_e32 v0, s0, v0
-; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
-; GFX90A-NEXT:    v_subrev_co_u32_e32 v0, vcc, s0, v0
-; GFX90A-NEXT:    v_mac_f32_e32 v2, s16, v3
-; GFX90A-NEXT:    v_rcp_f32_e32 v2, v2
-; GFX90A-NEXT:    s_sub_u32 s0, 0, s8
-; GFX90A-NEXT:    v_xor_b32_e32 v1, s1, v1
-; GFX90A-NEXT:    v_mov_b32_e32 v5, s1
-; GFX90A-NEXT:    v_mul_f32_e32 v2, s17, v2
-; GFX90A-NEXT:    v_mul_f32_e32 v3, s18, v2
-; GFX90A-NEXT:    v_trunc_f32_e32 v3, v3
-; GFX90A-NEXT:    v_mac_f32_e32 v2, s19, v3
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GFX90A-NEXT:    s_subb_u32 s1, 0, s9
-; GFX90A-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v5, vcc
-; GFX90A-NEXT:    v_mul_hi_u32 v6, s0, v2
-; GFX90A-NEXT:    v_mul_lo_u32 v7, s0, v3
-; GFX90A-NEXT:    v_mul_lo_u32 v5, s1, v2
-; GFX90A-NEXT:    v_add_u32_e32 v6, v6, v7
-; GFX90A-NEXT:    v_add_u32_e32 v5, v6, v5
-; GFX90A-NEXT:    v_mul_lo_u32 v8, s0, v2
-; GFX90A-NEXT:    v_mul_lo_u32 v7, v2, v5
-; GFX90A-NEXT:    v_mul_hi_u32 v9, v2, v8
-; GFX90A-NEXT:    v_mul_hi_u32 v6, v2, v5
-; GFX90A-NEXT:    v_add_co_u32_e32 v7, vcc, v9, v7
-; GFX90A-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
-; GFX90A-NEXT:    v_mul_hi_u32 v10, v3, v8
-; GFX90A-NEXT:    v_mul_lo_u32 v8, v3, v8
-; GFX90A-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v8
-; GFX90A-NEXT:    v_mul_hi_u32 v9, v3, v5
-; GFX90A-NEXT:    v_addc_co_u32_e32 v6, vcc, v6, v10, vcc
-; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, v9, v4, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v5, v3, v5
-; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v6, v5
-; GFX90A-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v7, vcc
-; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v5
-; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v6, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v5, s0, v3
-; GFX90A-NEXT:    v_mul_hi_u32 v6, s0, v2
-; GFX90A-NEXT:    v_add_u32_e32 v5, v6, v5
-; GFX90A-NEXT:    v_mul_lo_u32 v6, s1, v2
-; GFX90A-NEXT:    v_add_u32_e32 v5, v5, v6
-; GFX90A-NEXT:    v_mul_lo_u32 v7, s0, v2
-; GFX90A-NEXT:    v_mul_hi_u32 v8, v3, v7
-; GFX90A-NEXT:    v_mul_lo_u32 v9, v3, v7
-; GFX90A-NEXT:    v_mul_lo_u32 v11, v2, v5
-; GFX90A-NEXT:    v_mul_hi_u32 v7, v2, v7
-; GFX90A-NEXT:    v_mul_hi_u32 v10, v2, v5
-; GFX90A-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v11
-; GFX90A-NEXT:    v_addc_co_u32_e32 v10, vcc, 0, v10, vcc
-; GFX90A-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v9
-; GFX90A-NEXT:    v_mul_hi_u32 v6, v3, v5
-; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, v10, v8, vcc
-; GFX90A-NEXT:    v_addc_co_u32_e32 v6, vcc, v6, v4, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v5, v3, v5
-; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v7, v5
-; GFX90A-NEXT:    s_ashr_i32 s10, s7, 31
-; GFX90A-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
-; GFX90A-NEXT:    s_add_u32 s0, s6, s10
-; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v5
-; GFX90A-NEXT:    s_mov_b32 s11, s10
-; GFX90A-NEXT:    s_addc_u32 s1, s7, s10
-; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v6, vcc
-; GFX90A-NEXT:    s_xor_b64 s[6:7], s[0:1], s[10:11]
-; GFX90A-NEXT:    v_mul_lo_u32 v6, s6, v3
-; GFX90A-NEXT:    v_mul_hi_u32 v7, s6, v2
-; GFX90A-NEXT:    v_mul_hi_u32 v5, s6, v3
-; GFX90A-NEXT:    v_add_co_u32_e32 v6, vcc, v7, v6
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
-; GFX90A-NEXT:    v_mul_hi_u32 v8, s7, v2
-; GFX90A-NEXT:    v_mul_lo_u32 v2, s7, v2
-; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v6, v2
-; GFX90A-NEXT:    v_mul_hi_u32 v7, s7, v3
-; GFX90A-NEXT:    v_addc_co_u32_e32 v2, vcc, v5, v8, vcc
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v7, v4, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v3, s7, v3
-; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v3
-; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v5, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v5, s8, v3
-; GFX90A-NEXT:    v_mul_hi_u32 v6, s8, v2
-; GFX90A-NEXT:    v_add_u32_e32 v5, v6, v5
-; GFX90A-NEXT:    v_mul_lo_u32 v6, s9, v2
-; GFX90A-NEXT:    v_add_u32_e32 v5, v5, v6
-; GFX90A-NEXT:    v_mul_lo_u32 v7, s8, v2
-; GFX90A-NEXT:    v_sub_u32_e32 v6, s7, v5
-; GFX90A-NEXT:    v_mov_b32_e32 v8, s9
-; GFX90A-NEXT:    v_sub_co_u32_e32 v7, vcc, s6, v7
-; GFX90A-NEXT:    v_subb_co_u32_e64 v6, s[0:1], v6, v8, vcc
-; GFX90A-NEXT:    v_subrev_co_u32_e64 v8, s[0:1], s8, v7
-; GFX90A-NEXT:    v_subbrev_co_u32_e64 v6, s[0:1], 0, v6, s[0:1]
-; GFX90A-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v6
-; GFX90A-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[0:1]
-; GFX90A-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v8
-; GFX90A-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[0:1]
-; GFX90A-NEXT:    v_cmp_eq_u32_e64 s[0:1], s9, v6
-; GFX90A-NEXT:    v_cndmask_b32_e64 v6, v9, v8, s[0:1]
-; GFX90A-NEXT:    v_mov_b32_e32 v9, s7
-; GFX90A-NEXT:    v_subb_co_u32_e32 v5, vcc, v9, v5, vcc
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s9, v5
-; GFX90A-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
-; GFX90A-NEXT:    v_cndmask_b32_e64 v9, 0, -1, vcc
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s8, v7
-; GFX90A-NEXT:    v_cndmask_b32_e64 v6, 1, 2, s[0:1]
-; GFX90A-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
-; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, s9, v5
-; GFX90A-NEXT:    v_add_co_u32_e64 v6, s[0:1], v2, v6
-; GFX90A-NEXT:    v_cndmask_b32_e32 v5, v9, v7, vcc
-; GFX90A-NEXT:    v_addc_co_u32_e64 v8, s[0:1], 0, v3, s[0:1]
-; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
-; GFX90A-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
-; GFX90A-NEXT:    s_xor_b64 s[0:1], s[10:11], s[4:5]
-; GFX90A-NEXT:    v_cndmask_b32_e32 v3, v3, v8, vcc
-; GFX90A-NEXT:    v_xor_b32_e32 v2, s0, v2
-; GFX90A-NEXT:    v_xor_b32_e32 v3, s1, v3
-; GFX90A-NEXT:    v_mov_b32_e32 v5, s1
-; GFX90A-NEXT:    v_subrev_co_u32_e32 v2, vcc, s0, v2
-; GFX90A-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v5, vcc
-; GFX90A-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
-; GFX90A-NEXT:    s_endpgm
   %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
   %r = sdiv <2 x i64> %x, %shl.y
   store <2 x i64> %r, <2 x i64> addrspace(1)* %out
@@ -12986,18 +9747,18 @@ define amdgpu_kernel void @srem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x4f800000
 ; GFX9-NEXT:    v_madak_f32 v0, 0, v0, 0x4996c7d8
 ; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
-; GFX9-NEXT:    s_mov_b32 s4, 0xffed2705
+; GFX9-NEXT:    s_mov_b32 s2, 0xffed2705
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    v_mul_lo_u32 v2, v1, s4
-; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s4
-; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s4
+; GFX9-NEXT:    v_mul_lo_u32 v2, v1, s2
+; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s2
+; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s2
 ; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
 ; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v3, v0, v4
@@ -13016,12 +9777,12 @@ define amdgpu_kernel void @srem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v2, v1, s4
-; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s4
-; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s4
+; GFX9-NEXT:    v_mul_lo_u32 v2, v1, s2
+; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s2
+; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s2
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
-; GFX9-NEXT:    s_add_u32 s2, s2, s4
+; GFX9-NEXT:    s_ashr_i32 s2, s7, 31
+; GFX9-NEXT:    s_add_u32 s0, s6, s2
 ; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
 ; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v7, v0, v2
@@ -13039,169 +9800,58 @@ define amdgpu_kernel void @srem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v4, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT:    s_mov_b32 s5, s4
-; GFX9-NEXT:    s_addc_u32 s3, s3, s4
+; GFX9-NEXT:    s_mov_b32 s3, s2
+; GFX9-NEXT:    s_addc_u32 s1, s7, s2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-NEXT:    s_xor_b64 s[2:3], s[2:3], s[4:5]
-; GFX9-NEXT:    v_mul_lo_u32 v2, s2, v1
-; GFX9-NEXT:    v_mul_hi_u32 v3, s2, v0
-; GFX9-NEXT:    v_mul_hi_u32 v4, s2, v1
-; GFX9-NEXT:    v_mul_hi_u32 v6, s3, v1
-; GFX9-NEXT:    v_mul_lo_u32 v1, s3, v1
+; GFX9-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT:    v_mul_lo_u32 v2, s0, v1
+; GFX9-NEXT:    v_mul_hi_u32 v3, s0, v0
+; GFX9-NEXT:    v_mul_hi_u32 v4, s0, v1
+; GFX9-NEXT:    v_mul_hi_u32 v6, s1, v1
+; GFX9-NEXT:    v_mul_lo_u32 v1, s1, v1
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v4, s3, v0
-; GFX9-NEXT:    v_mul_hi_u32 v0, s3, v0
-; GFX9-NEXT:    s_mov_b32 s5, 0x12d8fb
+; GFX9-NEXT:    v_mul_lo_u32 v4, s1, v0
+; GFX9-NEXT:    v_mul_hi_u32 v0, s1, v0
+; GFX9-NEXT:    s_mov_b32 s3, 0x12d8fb
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
 ; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v0, vcc
 ; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v6, v5, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v2, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s5
-; GFX9-NEXT:    v_mul_hi_u32 v2, v0, s5
-; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s5
+; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s3
+; GFX9-NEXT:    v_mul_hi_u32 v2, v0, s3
+; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s3
 ; GFX9-NEXT:    v_add_u32_e32 v1, v2, v1
-; GFX9-NEXT:    v_mov_b32_e32 v2, s3
-; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, s1
+; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s0, v0
 ; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
-; GFX9-NEXT:    v_subrev_co_u32_e32 v2, vcc, s5, v0
+; GFX9-NEXT:    v_subrev_co_u32_e32 v2, vcc, s3, v0
 ; GFX9-NEXT:    v_subbrev_co_u32_e32 v3, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_subrev_co_u32_e32 v4, vcc, s5, v2
+; GFX9-NEXT:    v_subrev_co_u32_e32 v4, vcc, s3, v2
 ; GFX9-NEXT:    v_subbrev_co_u32_e32 v6, vcc, 0, v3, vcc
-; GFX9-NEXT:    s_mov_b32 s2, 0x12d8fa
-; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s2, v2
+; GFX9-NEXT:    s_mov_b32 s0, 0x12d8fa
+; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s0, v2
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v7, -1, v7, vcc
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX9-NEXT:    v_cmp_lt_u32_e64 s[0:1], s0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
-; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s2, v0
-; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX9-NEXT:    v_xor_b32_e32 v0, s4, v0
-; GFX9-NEXT:    v_xor_b32_e32 v1, s4, v1
-; GFX9-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, s4, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v1
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, -1, v6, s[0:1]
+; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
+; GFX9-NEXT:    v_xor_b32_e32 v0, s2, v0
+; GFX9-NEXT:    v_xor_b32_e32 v1, s2, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, s2, v0
 ; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v2, vcc
-; GFX9-NEXT:    global_store_dwordx2 v5, v[0:1], s[0:1]
+; GFX9-NEXT:    global_store_dwordx2 v5, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
-;
-; GFX90A-LABEL: srem_i64_oddk_denom:
-; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x4f800000
-; GFX90A-NEXT:    v_madak_f32 v0, 0, v0, 0x4996c7d8
-; GFX90A-NEXT:    v_rcp_f32_e32 v0, v0
-; GFX90A-NEXT:    s_mov_b32 s4, 0xffed2705
-; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX90A-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
-; GFX90A-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
-; GFX90A-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX90A-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX90A-NEXT:    v_mul_lo_u32 v3, v1, s4
-; GFX90A-NEXT:    v_mul_hi_u32 v4, v0, s4
-; GFX90A-NEXT:    v_add_u32_e32 v3, v4, v3
-; GFX90A-NEXT:    v_sub_u32_e32 v3, v3, v0
-; GFX90A-NEXT:    v_mul_lo_u32 v6, v0, s4
-; GFX90A-NEXT:    v_mul_lo_u32 v5, v0, v3
-; GFX90A-NEXT:    v_mul_hi_u32 v7, v0, v6
-; GFX90A-NEXT:    v_mul_hi_u32 v4, v0, v3
-; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v7, v5
-; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
-; GFX90A-NEXT:    v_mul_hi_u32 v8, v1, v6
-; GFX90A-NEXT:    v_mul_lo_u32 v6, v1, v6
-; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v6
-; GFX90A-NEXT:    v_mul_hi_u32 v7, v1, v3
-; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v8, vcc
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v7, v2, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v3, v1, v3
-; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
-; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v5, vcc
-; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
-; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v4, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v3, v1, s4
-; GFX90A-NEXT:    v_mul_hi_u32 v4, v0, s4
-; GFX90A-NEXT:    v_add_u32_e32 v3, v4, v3
-; GFX90A-NEXT:    v_sub_u32_e32 v3, v3, v0
-; GFX90A-NEXT:    v_mul_lo_u32 v5, v0, s4
-; GFX90A-NEXT:    v_mul_hi_u32 v6, v1, v5
-; GFX90A-NEXT:    v_mul_lo_u32 v7, v1, v5
-; GFX90A-NEXT:    v_mul_lo_u32 v9, v0, v3
-; GFX90A-NEXT:    v_mul_hi_u32 v5, v0, v5
-; GFX90A-NEXT:    v_mul_hi_u32 v8, v0, v3
-; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v9
-; GFX90A-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v8, vcc
-; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v7
-; GFX90A-NEXT:    v_mul_hi_u32 v4, v1, v3
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v8, v6, vcc
-; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v2, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v3, v1, v3
-; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v5, v3
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_ashr_i32 s4, s3, 31
-; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
-; GFX90A-NEXT:    s_add_u32 s2, s2, s4
-; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
-; GFX90A-NEXT:    s_mov_b32 s5, s4
-; GFX90A-NEXT:    s_addc_u32 s3, s3, s4
-; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v4, vcc
-; GFX90A-NEXT:    s_xor_b64 s[2:3], s[2:3], s[4:5]
-; GFX90A-NEXT:    v_mul_lo_u32 v4, s2, v1
-; GFX90A-NEXT:    v_mul_hi_u32 v5, s2, v0
-; GFX90A-NEXT:    v_mul_hi_u32 v3, s2, v1
-; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, v5, v4
-; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX90A-NEXT:    v_mul_hi_u32 v6, s3, v0
-; GFX90A-NEXT:    v_mul_lo_u32 v0, s3, v0
-; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v4, v0
-; GFX90A-NEXT:    v_mul_hi_u32 v5, s3, v1
-; GFX90A-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v6, vcc
-; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v2, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v1, s3, v1
-; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
-; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
-; GFX90A-NEXT:    s_mov_b32 s5, 0x12d8fb
-; GFX90A-NEXT:    v_mul_lo_u32 v1, v1, s5
-; GFX90A-NEXT:    v_mul_hi_u32 v3, v0, s5
-; GFX90A-NEXT:    v_mul_lo_u32 v0, v0, s5
-; GFX90A-NEXT:    v_add_u32_e32 v1, v3, v1
-; GFX90A-NEXT:    v_mov_b32_e32 v3, s3
-; GFX90A-NEXT:    v_sub_co_u32_e32 v0, vcc, s2, v0
-; GFX90A-NEXT:    v_subb_co_u32_e32 v1, vcc, v3, v1, vcc
-; GFX90A-NEXT:    v_subrev_co_u32_e32 v3, vcc, s5, v0
-; GFX90A-NEXT:    v_subbrev_co_u32_e32 v4, vcc, 0, v1, vcc
-; GFX90A-NEXT:    v_subrev_co_u32_e32 v5, vcc, s5, v3
-; GFX90A-NEXT:    v_subbrev_co_u32_e32 v6, vcc, 0, v4, vcc
-; GFX90A-NEXT:    s_mov_b32 s2, 0x12d8fa
-; GFX90A-NEXT:    v_cmp_lt_u32_e32 vcc, s2, v3
-; GFX90A-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
-; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
-; GFX90A-NEXT:    v_cndmask_b32_e32 v7, -1, v7, vcc
-; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
-; GFX90A-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
-; GFX90A-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
-; GFX90A-NEXT:    v_cmp_lt_u32_e32 vcc, s2, v0
-; GFX90A-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
-; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX90A-NEXT:    v_cndmask_b32_e32 v5, -1, v5, vcc
-; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
-; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX90A-NEXT:    v_xor_b32_e32 v0, s4, v0
-; GFX90A-NEXT:    v_xor_b32_e32 v1, s4, v1
-; GFX90A-NEXT:    v_mov_b32_e32 v3, s4
-; GFX90A-NEXT:    v_subrev_co_u32_e32 v0, vcc, s4, v0
-; GFX90A-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX90A-NEXT:    s_endpgm
   %r = srem i64 %x, 1235195
   store i64 %r, i64 addrspace(1)* %out
   ret void
@@ -13249,22 +9899,6 @@ define amdgpu_kernel void @srem_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
-;
-; GFX90A-LABEL: srem_i64_pow2k_denom:
-; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_ashr_i32 s4, s3, 31
-; GFX90A-NEXT:    s_lshr_b32 s4, s4, 20
-; GFX90A-NEXT:    s_add_u32 s4, s2, s4
-; GFX90A-NEXT:    s_addc_u32 s5, s3, 0
-; GFX90A-NEXT:    s_and_b32 s4, s4, 0xfffff000
-; GFX90A-NEXT:    s_sub_u32 s2, s2, s4
-; GFX90A-NEXT:    s_subb_u32 s3, s3, s5
-; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX90A-NEXT:    s_endpgm
   %r = srem i64 %x, 4096
   store i64 %r, i64 addrspace(1)* %out
   ret void
@@ -13524,19 +10158,19 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, v8, s[2:3]
 ; GFX9-NEXT:    v_subbrev_co_u32_e64 v3, s[0:1], 0, v3, s[0:1]
 ; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v7
-; GFX9-NEXT:    v_cndmask_b32_e64 v4, v5, v4, s[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v5, s7
-; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v5, v1, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
-; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, v6, v3, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v6, s7
+; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v6, v1, vcc
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s9, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v5, v4, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX9-NEXT:    v_xor_b32_e32 v0, s10, v0
 ; GFX9-NEXT:    v_xor_b32_e32 v1, s10, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s10
@@ -13544,138 +10178,6 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
 ; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
-;
-; GFX90A-LABEL: srem_i64_pow2_shl_denom:
-; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_load_dword s4, s[0:1], 0x34
-; GFX90A-NEXT:    s_mov_b64 s[2:3], 0x1000
-; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
-; GFX90A-NEXT:    s_ashr_i32 s4, s3, 31
-; GFX90A-NEXT:    s_add_u32 s2, s2, s4
-; GFX90A-NEXT:    s_mov_b32 s5, s4
-; GFX90A-NEXT:    s_addc_u32 s3, s3, s4
-; GFX90A-NEXT:    s_xor_b64 s[8:9], s[2:3], s[4:5]
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, s8
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v1, s9
-; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX90A-NEXT:    s_sub_u32 s0, 0, s8
-; GFX90A-NEXT:    s_subb_u32 s1, 0, s9
-; GFX90A-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
-; GFX90A-NEXT:    v_rcp_f32_e32 v0, v0
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_ashr_i32 s10, s7, 31
-; GFX90A-NEXT:    s_mov_b32 s11, s10
-; GFX90A-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
-; GFX90A-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
-; GFX90A-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX90A-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX90A-NEXT:    v_mul_lo_u32 v3, s0, v1
-; GFX90A-NEXT:    v_mul_hi_u32 v5, s0, v0
-; GFX90A-NEXT:    v_mul_lo_u32 v4, s1, v0
-; GFX90A-NEXT:    v_add_u32_e32 v3, v5, v3
-; GFX90A-NEXT:    v_mul_lo_u32 v6, s0, v0
-; GFX90A-NEXT:    v_add_u32_e32 v3, v3, v4
-; GFX90A-NEXT:    v_mul_lo_u32 v5, v0, v3
-; GFX90A-NEXT:    v_mul_hi_u32 v7, v0, v6
-; GFX90A-NEXT:    v_mul_hi_u32 v4, v0, v3
-; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v7, v5
-; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
-; GFX90A-NEXT:    v_mul_hi_u32 v8, v1, v6
-; GFX90A-NEXT:    v_mul_lo_u32 v6, v1, v6
-; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v6
-; GFX90A-NEXT:    v_mul_hi_u32 v7, v1, v3
-; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v8, vcc
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v7, v2, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v3, v1, v3
-; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
-; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v5, vcc
-; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
-; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v4, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v3, s0, v1
-; GFX90A-NEXT:    v_mul_hi_u32 v4, s0, v0
-; GFX90A-NEXT:    v_add_u32_e32 v3, v4, v3
-; GFX90A-NEXT:    v_mul_lo_u32 v4, s1, v0
-; GFX90A-NEXT:    v_add_u32_e32 v3, v3, v4
-; GFX90A-NEXT:    v_mul_lo_u32 v5, s0, v0
-; GFX90A-NEXT:    v_mul_hi_u32 v6, v1, v5
-; GFX90A-NEXT:    v_mul_lo_u32 v7, v1, v5
-; GFX90A-NEXT:    v_mul_lo_u32 v9, v0, v3
-; GFX90A-NEXT:    v_mul_hi_u32 v5, v0, v5
-; GFX90A-NEXT:    v_mul_hi_u32 v8, v0, v3
-; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v9
-; GFX90A-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v8, vcc
-; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v7
-; GFX90A-NEXT:    v_mul_hi_u32 v4, v1, v3
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v8, v6, vcc
-; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v2, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v3, v1, v3
-; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v5, v3
-; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
-; GFX90A-NEXT:    s_add_u32 s0, s6, s10
-; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
-; GFX90A-NEXT:    s_addc_u32 s1, s7, s10
-; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v4, vcc
-; GFX90A-NEXT:    s_xor_b64 s[6:7], s[0:1], s[10:11]
-; GFX90A-NEXT:    v_mul_lo_u32 v4, s6, v1
-; GFX90A-NEXT:    v_mul_hi_u32 v5, s6, v0
-; GFX90A-NEXT:    v_mul_hi_u32 v3, s6, v1
-; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, v5, v4
-; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX90A-NEXT:    v_mul_hi_u32 v6, s7, v0
-; GFX90A-NEXT:    v_mul_lo_u32 v0, s7, v0
-; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v4, v0
-; GFX90A-NEXT:    v_mul_hi_u32 v5, s7, v1
-; GFX90A-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v6, vcc
-; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v2, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v1, s7, v1
-; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
-; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v1, s8, v1
-; GFX90A-NEXT:    v_mul_hi_u32 v3, s8, v0
-; GFX90A-NEXT:    v_add_u32_e32 v1, v3, v1
-; GFX90A-NEXT:    v_mul_lo_u32 v3, s9, v0
-; GFX90A-NEXT:    v_add_u32_e32 v1, v1, v3
-; GFX90A-NEXT:    v_mul_lo_u32 v0, s8, v0
-; GFX90A-NEXT:    v_sub_u32_e32 v3, s7, v1
-; GFX90A-NEXT:    v_mov_b32_e32 v4, s9
-; GFX90A-NEXT:    v_sub_co_u32_e32 v0, vcc, s6, v0
-; GFX90A-NEXT:    v_subb_co_u32_e64 v3, s[0:1], v3, v4, vcc
-; GFX90A-NEXT:    v_subrev_co_u32_e64 v5, s[0:1], s8, v0
-; GFX90A-NEXT:    v_subbrev_co_u32_e64 v6, s[2:3], 0, v3, s[0:1]
-; GFX90A-NEXT:    v_cmp_le_u32_e64 s[2:3], s9, v6
-; GFX90A-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[2:3]
-; GFX90A-NEXT:    v_cmp_le_u32_e64 s[2:3], s8, v5
-; GFX90A-NEXT:    v_subb_co_u32_e64 v3, s[0:1], v3, v4, s[0:1]
-; GFX90A-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[2:3]
-; GFX90A-NEXT:    v_cmp_eq_u32_e64 s[2:3], s9, v6
-; GFX90A-NEXT:    v_subrev_co_u32_e64 v4, s[0:1], s8, v5
-; GFX90A-NEXT:    v_cndmask_b32_e64 v7, v7, v8, s[2:3]
-; GFX90A-NEXT:    v_subbrev_co_u32_e64 v3, s[0:1], 0, v3, s[0:1]
-; GFX90A-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v7
-; GFX90A-NEXT:    v_cndmask_b32_e64 v4, v5, v4, s[0:1]
-; GFX90A-NEXT:    v_mov_b32_e32 v5, s7
-; GFX90A-NEXT:    v_subb_co_u32_e32 v1, vcc, v5, v1, vcc
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
-; GFX90A-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
-; GFX90A-NEXT:    v_cndmask_b32_e64 v3, v6, v3, s[0:1]
-; GFX90A-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
-; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, s9, v1
-; GFX90A-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
-; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
-; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX90A-NEXT:    v_xor_b32_e32 v0, s10, v0
-; GFX90A-NEXT:    v_xor_b32_e32 v1, s10, v1
-; GFX90A-NEXT:    v_mov_b32_e32 v3, s10
-; GFX90A-NEXT:    v_subrev_co_u32_e32 v0, vcc, s10, v0
-; GFX90A-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
-; GFX90A-NEXT:    s_endpgm
   %shl.y = shl i64 4096, %y
   %r = srem i64 %x, %shl.y
   store i64 %r, i64 addrspace(1)* %out
@@ -13749,34 +10251,6 @@ define amdgpu_kernel void @srem_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out,
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
 ; GFX9-NEXT:    s_endpgm
-;
-; GFX90A-LABEL: srem_v2i64_pow2k_denom:
-; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
-; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX90A-NEXT:    s_movk_i32 s0, 0xf000
-; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_ashr_i32 s1, s5, 31
-; GFX90A-NEXT:    s_lshr_b32 s1, s1, 20
-; GFX90A-NEXT:    s_add_u32 s1, s4, s1
-; GFX90A-NEXT:    s_addc_u32 s8, s5, 0
-; GFX90A-NEXT:    s_and_b32 s1, s1, s0
-; GFX90A-NEXT:    s_sub_u32 s1, s4, s1
-; GFX90A-NEXT:    s_subb_u32 s4, s5, s8
-; GFX90A-NEXT:    s_ashr_i32 s5, s7, 31
-; GFX90A-NEXT:    s_lshr_b32 s5, s5, 20
-; GFX90A-NEXT:    s_add_u32 s5, s6, s5
-; GFX90A-NEXT:    s_addc_u32 s8, s7, 0
-; GFX90A-NEXT:    s_and_b32 s0, s5, s0
-; GFX90A-NEXT:    s_sub_u32 s0, s6, s0
-; GFX90A-NEXT:    s_subb_u32 s5, s7, s8
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90A-NEXT:    v_mov_b32_e32 v1, s4
-; GFX90A-NEXT:    v_mov_b32_e32 v2, s0
-; GFX90A-NEXT:    v_mov_b32_e32 v3, s5
-; GFX90A-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
-; GFX90A-NEXT:    s_endpgm
   %r = srem <2 x i64> %x, <i64 4096, i64 4096>
   store <2 x i64> %r, <2 x i64> addrspace(1)* %out
   ret void
@@ -14166,35 +10640,35 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX9-NEXT:    v_cmp_le_u32_e64 s[2:3], s13, v6
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[2:3]
 ; GFX9-NEXT:    v_cmp_le_u32_e64 s[2:3], s12, v5
-; GFX9-NEXT:    v_subb_co_u32_e64 v3, s[0:1], v3, v4, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[2:3]
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], s13, v6
-; GFX9-NEXT:    v_subrev_co_u32_e64 v4, s[0:1], s12, v5
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, v8, s[2:3]
+; GFX9-NEXT:    s_ashr_i32 s2, s11, 31
+; GFX9-NEXT:    v_subb_co_u32_e64 v3, s[0:1], v3, v4, s[0:1]
+; GFX9-NEXT:    s_add_u32 s10, s10, s2
+; GFX9-NEXT:    v_subrev_co_u32_e64 v4, s[0:1], s12, v5
+; GFX9-NEXT:    s_mov_b32 s3, s2
+; GFX9-NEXT:    s_addc_u32 s11, s11, s2
 ; GFX9-NEXT:    v_subbrev_co_u32_e64 v3, s[0:1], 0, v3, s[0:1]
+; GFX9-NEXT:    s_xor_b64 s[10:11], s[10:11], s[2:3]
 ; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v7
-; GFX9-NEXT:    v_cndmask_b32_e64 v4, v5, v4, s[0:1]
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v7, s10
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v8, s11
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, v6, v3, s[0:1]
-; GFX9-NEXT:    s_ashr_i32 s0, s11, 31
-; GFX9-NEXT:    s_add_u32 s2, s10, s0
-; GFX9-NEXT:    s_mov_b32 s1, s0
-; GFX9-NEXT:    s_addc_u32 s3, s11, s0
-; GFX9-NEXT:    v_mov_b32_e32 v5, s15
-; GFX9-NEXT:    s_xor_b64 s[10:11], s[2:3], s[0:1]
-; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v5, v2, vcc
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s10
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v6, s11
+; GFX9-NEXT:    v_mov_b32_e32 v6, s15
+; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v6, v2, vcc
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s13, v2
-; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
+; GFX9-NEXT:    v_mac_f32_e32 v7, s16, v8
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s12, v1
-; GFX9-NEXT:    v_mac_f32_e32 v5, s16, v6
-; GFX9-NEXT:    v_rcp_f32_e32 v5, v5
-; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
+; GFX9-NEXT:    v_rcp_f32_e32 v7, v7
+; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, -1, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s13, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, v7, v8, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v9, vcc
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX9-NEXT:    v_mul_f32_e32 v4, s17, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v5, v4, s[0:1]
+; GFX9-NEXT:    v_mul_f32_e32 v4, s17, v7
 ; GFX9-NEXT:    v_mul_f32_e32 v5, s18, v4
 ; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
 ; GFX9-NEXT:    v_mac_f32_e32 v4, s19, v5
@@ -14205,7 +10679,7 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX9-NEXT:    v_mul_hi_u32 v6, s0, v4
 ; GFX9-NEXT:    v_mul_lo_u32 v7, s0, v5
 ; GFX9-NEXT:    v_mul_lo_u32 v8, s1, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v3, s0, v4
 ; GFX9-NEXT:    v_add_u32_e32 v6, v6, v7
 ; GFX9-NEXT:    v_add_u32_e32 v6, v6, v8
@@ -14293,19 +10767,19 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX9-NEXT:    v_cndmask_b32_e64 v9, v9, v10, s[2:3]
 ; GFX9-NEXT:    v_subbrev_co_u32_e64 v5, s[0:1], 0, v5, s[0:1]
 ; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v9
-; GFX9-NEXT:    v_cndmask_b32_e64 v6, v7, v6, s[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v7, s7
-; GFX9-NEXT:    v_subb_co_u32_e32 v4, vcc, v7, v4, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s11, v4
-; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s10, v3
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, v8, v5, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v8, s7
+; GFX9-NEXT:    v_subb_co_u32_e32 v4, vcc, v8, v4, vcc
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s11, v4
 ; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s10, v3
+; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, -1, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s11, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v7, v7, v8, vcc
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v8
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v7, v6, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
 ; GFX9-NEXT:    v_xor_b32_e32 v3, s12, v3
 ; GFX9-NEXT:    v_xor_b32_e32 v4, s12, v4
 ; GFX9-NEXT:    v_mov_b32_e32 v5, s12
@@ -14314,264 +10788,6 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_store_dwordx4 v0, v[1:4], s[4:5]
 ; GFX9-NEXT:    s_endpgm
-;
-; GFX90A-LABEL: srem_v2i64_pow2_shl_denom:
-; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x44
-; GFX90A-NEXT:    s_mov_b64 s[2:3], 0x1000
-; GFX90A-NEXT:    s_mov_b32 s16, 0x4f800000
-; GFX90A-NEXT:    s_mov_b32 s17, 0x5f7ffffc
-; GFX90A-NEXT:    s_mov_b32 s18, 0x2f800000
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_lshl_b64 s[10:11], s[2:3], s6
-; GFX90A-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
-; GFX90A-NEXT:    s_ashr_i32 s4, s3, 31
-; GFX90A-NEXT:    s_add_u32 s2, s2, s4
-; GFX90A-NEXT:    s_mov_b32 s5, s4
-; GFX90A-NEXT:    s_addc_u32 s3, s3, s4
-; GFX90A-NEXT:    s_xor_b64 s[12:13], s[2:3], s[4:5]
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, s12
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v1, s13
-; GFX90A-NEXT:    s_mov_b32 s19, 0xcf800000
-; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x24
-; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
-; GFX90A-NEXT:    s_sub_u32 s0, 0, s12
-; GFX90A-NEXT:    v_mac_f32_e32 v0, s16, v1
-; GFX90A-NEXT:    v_rcp_f32_e32 v0, v0
-; GFX90A-NEXT:    s_subb_u32 s1, 0, s13
-; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_ashr_i32 s14, s5, 31
-; GFX90A-NEXT:    v_mul_f32_e32 v0, s17, v0
-; GFX90A-NEXT:    v_mul_f32_e32 v1, s18, v0
-; GFX90A-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX90A-NEXT:    v_mac_f32_e32 v0, s19, v1
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX90A-NEXT:    s_mov_b32 s15, s14
-; GFX90A-NEXT:    v_mul_hi_u32 v3, s0, v0
-; GFX90A-NEXT:    v_mul_lo_u32 v5, s0, v1
-; GFX90A-NEXT:    v_mul_lo_u32 v2, s1, v0
-; GFX90A-NEXT:    v_add_u32_e32 v3, v3, v5
-; GFX90A-NEXT:    v_add_u32_e32 v2, v3, v2
-; GFX90A-NEXT:    v_mul_lo_u32 v6, s0, v0
-; GFX90A-NEXT:    v_mul_lo_u32 v5, v0, v2
-; GFX90A-NEXT:    v_mul_hi_u32 v7, v0, v6
-; GFX90A-NEXT:    v_mul_hi_u32 v3, v0, v2
-; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v7, v5
-; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX90A-NEXT:    v_mul_hi_u32 v8, v1, v6
-; GFX90A-NEXT:    v_mul_lo_u32 v6, v1, v6
-; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v6
-; GFX90A-NEXT:    v_mul_hi_u32 v7, v1, v2
-; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v8, vcc
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v7, v4, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v2, v1, v2
-; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
-; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v5, vcc
-; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v2, s0, v1
-; GFX90A-NEXT:    v_mul_hi_u32 v3, s0, v0
-; GFX90A-NEXT:    v_add_u32_e32 v2, v3, v2
-; GFX90A-NEXT:    v_mul_lo_u32 v3, s1, v0
-; GFX90A-NEXT:    v_add_u32_e32 v2, v2, v3
-; GFX90A-NEXT:    v_mul_lo_u32 v5, s0, v0
-; GFX90A-NEXT:    v_mul_hi_u32 v6, v1, v5
-; GFX90A-NEXT:    v_mul_lo_u32 v7, v1, v5
-; GFX90A-NEXT:    v_mul_lo_u32 v9, v0, v2
-; GFX90A-NEXT:    v_mul_hi_u32 v5, v0, v5
-; GFX90A-NEXT:    v_mul_hi_u32 v8, v0, v2
-; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v9
-; GFX90A-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v8, vcc
-; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v7
-; GFX90A-NEXT:    v_mul_hi_u32 v3, v1, v2
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v8, v6, vcc
-; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v4, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v2, v1, v2
-; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v5, v2
-; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX90A-NEXT:    s_add_u32 s0, s4, s14
-; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX90A-NEXT:    s_addc_u32 s1, s5, s14
-; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX90A-NEXT:    s_xor_b64 s[4:5], s[0:1], s[14:15]
-; GFX90A-NEXT:    v_mul_lo_u32 v3, s4, v1
-; GFX90A-NEXT:    v_mul_hi_u32 v5, s4, v0
-; GFX90A-NEXT:    v_mul_hi_u32 v2, s4, v1
-; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v5, v3
-; GFX90A-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
-; GFX90A-NEXT:    v_mul_hi_u32 v6, s5, v0
-; GFX90A-NEXT:    v_mul_lo_u32 v0, s5, v0
-; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v3, v0
-; GFX90A-NEXT:    v_mul_hi_u32 v5, s5, v1
-; GFX90A-NEXT:    v_addc_co_u32_e32 v0, vcc, v2, v6, vcc
-; GFX90A-NEXT:    v_addc_co_u32_e32 v2, vcc, v5, v4, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v1, s5, v1
-; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
-; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v2, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v1, s12, v1
-; GFX90A-NEXT:    v_mul_hi_u32 v2, s12, v0
-; GFX90A-NEXT:    v_add_u32_e32 v1, v2, v1
-; GFX90A-NEXT:    v_mul_lo_u32 v2, s13, v0
-; GFX90A-NEXT:    v_add_u32_e32 v1, v1, v2
-; GFX90A-NEXT:    v_mul_lo_u32 v0, s12, v0
-; GFX90A-NEXT:    v_sub_u32_e32 v2, s5, v1
-; GFX90A-NEXT:    v_mov_b32_e32 v3, s13
-; GFX90A-NEXT:    v_sub_co_u32_e32 v0, vcc, s4, v0
-; GFX90A-NEXT:    v_subb_co_u32_e64 v2, s[0:1], v2, v3, vcc
-; GFX90A-NEXT:    v_subrev_co_u32_e64 v5, s[0:1], s12, v0
-; GFX90A-NEXT:    v_subbrev_co_u32_e64 v6, s[2:3], 0, v2, s[0:1]
-; GFX90A-NEXT:    v_cmp_le_u32_e64 s[2:3], s13, v6
-; GFX90A-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[2:3]
-; GFX90A-NEXT:    v_cmp_le_u32_e64 s[2:3], s12, v5
-; GFX90A-NEXT:    v_subb_co_u32_e64 v2, s[0:1], v2, v3, s[0:1]
-; GFX90A-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[2:3]
-; GFX90A-NEXT:    v_cmp_eq_u32_e64 s[2:3], s13, v6
-; GFX90A-NEXT:    v_subrev_co_u32_e64 v3, s[0:1], s12, v5
-; GFX90A-NEXT:    v_cndmask_b32_e64 v7, v7, v8, s[2:3]
-; GFX90A-NEXT:    v_subbrev_co_u32_e64 v2, s[0:1], 0, v2, s[0:1]
-; GFX90A-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v7
-; GFX90A-NEXT:    v_cndmask_b32_e64 v3, v5, v3, s[0:1]
-; GFX90A-NEXT:    v_mov_b32_e32 v5, s5
-; GFX90A-NEXT:    v_subb_co_u32_e32 v1, vcc, v5, v1, vcc
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s13, v1
-; GFX90A-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[0:1]
-; GFX90A-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s12, v0
-; GFX90A-NEXT:    s_ashr_i32 s0, s11, 31
-; GFX90A-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
-; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, s13, v1
-; GFX90A-NEXT:    s_add_u32 s2, s10, s0
-; GFX90A-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
-; GFX90A-NEXT:    s_mov_b32 s1, s0
-; GFX90A-NEXT:    s_addc_u32 s3, s11, s0
-; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
-; GFX90A-NEXT:    s_xor_b64 s[4:5], s[2:3], s[0:1]
-; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v2, s4
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v3, s5
-; GFX90A-NEXT:    v_xor_b32_e32 v0, s14, v0
-; GFX90A-NEXT:    s_sub_u32 s0, 0, s4
-; GFX90A-NEXT:    v_xor_b32_e32 v1, s14, v1
-; GFX90A-NEXT:    v_mac_f32_e32 v2, s16, v3
-; GFX90A-NEXT:    v_rcp_f32_e32 v2, v2
-; GFX90A-NEXT:    v_mov_b32_e32 v5, s14
-; GFX90A-NEXT:    v_subrev_co_u32_e32 v0, vcc, s14, v0
-; GFX90A-NEXT:    v_mul_f32_e32 v2, s17, v2
-; GFX90A-NEXT:    v_mul_f32_e32 v3, s18, v2
-; GFX90A-NEXT:    v_trunc_f32_e32 v3, v3
-; GFX90A-NEXT:    v_mac_f32_e32 v2, s19, v3
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GFX90A-NEXT:    s_subb_u32 s1, 0, s5
-; GFX90A-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v5, vcc
-; GFX90A-NEXT:    v_mul_hi_u32 v6, s0, v2
-; GFX90A-NEXT:    v_mul_lo_u32 v7, s0, v3
-; GFX90A-NEXT:    v_mul_lo_u32 v5, s1, v2
-; GFX90A-NEXT:    v_add_u32_e32 v6, v6, v7
-; GFX90A-NEXT:    v_add_u32_e32 v5, v6, v5
-; GFX90A-NEXT:    v_mul_lo_u32 v8, s0, v2
-; GFX90A-NEXT:    v_mul_lo_u32 v7, v2, v5
-; GFX90A-NEXT:    v_mul_hi_u32 v9, v2, v8
-; GFX90A-NEXT:    v_mul_hi_u32 v6, v2, v5
-; GFX90A-NEXT:    v_add_co_u32_e32 v7, vcc, v9, v7
-; GFX90A-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
-; GFX90A-NEXT:    v_mul_hi_u32 v10, v3, v8
-; GFX90A-NEXT:    v_mul_lo_u32 v8, v3, v8
-; GFX90A-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v8
-; GFX90A-NEXT:    v_mul_hi_u32 v9, v3, v5
-; GFX90A-NEXT:    v_addc_co_u32_e32 v6, vcc, v6, v10, vcc
-; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, v9, v4, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v5, v3, v5
-; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v6, v5
-; GFX90A-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v7, vcc
-; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v5
-; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v6, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v5, s0, v3
-; GFX90A-NEXT:    v_mul_hi_u32 v6, s0, v2
-; GFX90A-NEXT:    v_add_u32_e32 v5, v6, v5
-; GFX90A-NEXT:    v_mul_lo_u32 v6, s1, v2
-; GFX90A-NEXT:    v_add_u32_e32 v5, v5, v6
-; GFX90A-NEXT:    v_mul_lo_u32 v7, s0, v2
-; GFX90A-NEXT:    v_mul_hi_u32 v8, v3, v7
-; GFX90A-NEXT:    v_mul_lo_u32 v9, v3, v7
-; GFX90A-NEXT:    v_mul_lo_u32 v11, v2, v5
-; GFX90A-NEXT:    v_mul_hi_u32 v7, v2, v7
-; GFX90A-NEXT:    v_mul_hi_u32 v10, v2, v5
-; GFX90A-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v11
-; GFX90A-NEXT:    v_addc_co_u32_e32 v10, vcc, 0, v10, vcc
-; GFX90A-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v9
-; GFX90A-NEXT:    v_mul_hi_u32 v6, v3, v5
-; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, v10, v8, vcc
-; GFX90A-NEXT:    v_addc_co_u32_e32 v6, vcc, v6, v4, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v5, v3, v5
-; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v7, v5
-; GFX90A-NEXT:    s_ashr_i32 s10, s7, 31
-; GFX90A-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
-; GFX90A-NEXT:    s_add_u32 s0, s6, s10
-; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v5
-; GFX90A-NEXT:    s_mov_b32 s11, s10
-; GFX90A-NEXT:    s_addc_u32 s1, s7, s10
-; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v6, vcc
-; GFX90A-NEXT:    s_xor_b64 s[6:7], s[0:1], s[10:11]
-; GFX90A-NEXT:    v_mul_lo_u32 v6, s6, v3
-; GFX90A-NEXT:    v_mul_hi_u32 v7, s6, v2
-; GFX90A-NEXT:    v_mul_hi_u32 v5, s6, v3
-; GFX90A-NEXT:    v_add_co_u32_e32 v6, vcc, v7, v6
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
-; GFX90A-NEXT:    v_mul_hi_u32 v8, s7, v2
-; GFX90A-NEXT:    v_mul_lo_u32 v2, s7, v2
-; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v6, v2
-; GFX90A-NEXT:    v_mul_hi_u32 v7, s7, v3
-; GFX90A-NEXT:    v_addc_co_u32_e32 v2, vcc, v5, v8, vcc
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v7, v4, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v3, s7, v3
-; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v3
-; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v5, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v3, s4, v3
-; GFX90A-NEXT:    v_mul_hi_u32 v5, s4, v2
-; GFX90A-NEXT:    v_add_u32_e32 v3, v5, v3
-; GFX90A-NEXT:    v_mul_lo_u32 v5, s5, v2
-; GFX90A-NEXT:    v_add_u32_e32 v3, v3, v5
-; GFX90A-NEXT:    v_mul_lo_u32 v2, s4, v2
-; GFX90A-NEXT:    v_sub_u32_e32 v5, s7, v3
-; GFX90A-NEXT:    v_mov_b32_e32 v6, s5
-; GFX90A-NEXT:    v_sub_co_u32_e32 v2, vcc, s6, v2
-; GFX90A-NEXT:    v_subb_co_u32_e64 v5, s[0:1], v5, v6, vcc
-; GFX90A-NEXT:    v_subrev_co_u32_e64 v7, s[0:1], s4, v2
-; GFX90A-NEXT:    v_subbrev_co_u32_e64 v8, s[2:3], 0, v5, s[0:1]
-; GFX90A-NEXT:    v_cmp_le_u32_e64 s[2:3], s5, v8
-; GFX90A-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[2:3]
-; GFX90A-NEXT:    v_cmp_le_u32_e64 s[2:3], s4, v7
-; GFX90A-NEXT:    v_subb_co_u32_e64 v5, s[0:1], v5, v6, s[0:1]
-; GFX90A-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[2:3]
-; GFX90A-NEXT:    v_cmp_eq_u32_e64 s[2:3], s5, v8
-; GFX90A-NEXT:    v_subrev_co_u32_e64 v6, s[0:1], s4, v7
-; GFX90A-NEXT:    v_cndmask_b32_e64 v9, v9, v10, s[2:3]
-; GFX90A-NEXT:    v_subbrev_co_u32_e64 v5, s[0:1], 0, v5, s[0:1]
-; GFX90A-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v9
-; GFX90A-NEXT:    v_cndmask_b32_e64 v6, v7, v6, s[0:1]
-; GFX90A-NEXT:    v_mov_b32_e32 v7, s7
-; GFX90A-NEXT:    v_subb_co_u32_e32 v3, vcc, v7, v3, vcc
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s5, v3
-; GFX90A-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s4, v2
-; GFX90A-NEXT:    v_cndmask_b32_e64 v5, v8, v5, s[0:1]
-; GFX90A-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
-; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, s5, v3
-; GFX90A-NEXT:    v_cndmask_b32_e32 v7, v7, v8, vcc
-; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
-; GFX90A-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
-; GFX90A-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
-; GFX90A-NEXT:    v_xor_b32_e32 v2, s10, v2
-; GFX90A-NEXT:    v_xor_b32_e32 v3, s10, v3
-; GFX90A-NEXT:    v_mov_b32_e32 v5, s10
-; GFX90A-NEXT:    v_subrev_co_u32_e32 v2, vcc, s10, v2
-; GFX90A-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v5, vcc
-; GFX90A-NEXT:    global_store_dwordx4 v4, v[0:3], s[8:9]
-; GFX90A-NEXT:    s_endpgm
   %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
   %r = srem <2 x i64> %x, %shl.y
   store <2 x i64> %r, <2 x i64> addrspace(1)* %out

diff  --git a/llvm/test/CodeGen/AMDGPU/dagcombine-select.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-select.ll
index 0be2e867e3cf..d10d0dd74741 100644
--- a/llvm/test/CodeGen/AMDGPU/dagcombine-select.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcombine-select.ll
@@ -159,7 +159,7 @@ define amdgpu_kernel void @sel_constants_sub_constant_sel_constants_v4i32(<4 x i
 }
 
 ; GCN-LABEL: {{^}}sdiv_constant_sel_constants_i64:
-; GCN: s_cselect_b64 s[{{[0-9]+}}:{{[0-9]+}}], 0, 5
+; GCN: s_cselect_b32 s{{[0-9]+}}, 0, 5
 define amdgpu_kernel void @sdiv_constant_sel_constants_i64(i64 addrspace(1)* %p, i1 %cond) {
   %sel = select i1 %cond, i64 121, i64 23
   %bo = sdiv i64 120, %sel
@@ -177,7 +177,7 @@ define amdgpu_kernel void @sdiv_constant_sel_constants_i32(i32 addrspace(1)* %p,
 }
 
 ; GCN-LABEL: {{^}}udiv_constant_sel_constants_i64:
-; GCN: s_cselect_b64 s[{{[0-9]+}}:{{[0-9]+}}], 0, 5
+; GCN: s_cselect_b32 s{{[0-9]+}}, 0, 5
 define amdgpu_kernel void @udiv_constant_sel_constants_i64(i64 addrspace(1)* %p, i1 %cond) {
   %sel = select i1 %cond, i64 -4, i64 23
   %bo = udiv i64 120, %sel
@@ -186,7 +186,7 @@ define amdgpu_kernel void @udiv_constant_sel_constants_i64(i64 addrspace(1)* %p,
 }
 
 ; GCN-LABEL: {{^}}srem_constant_sel_constants:
-; GCN: s_cselect_b64 s[{{[0-9]+}}:{{[0-9]+}}], 33, 3
+; GCN: s_cselect_b32 s{{[0-9]+}}, 33, 3
 define amdgpu_kernel void @srem_constant_sel_constants(i64 addrspace(1)* %p, i1 %cond) {
   %sel = select i1 %cond, i64 34, i64 15
   %bo = srem i64 33, %sel
@@ -195,7 +195,7 @@ define amdgpu_kernel void @srem_constant_sel_constants(i64 addrspace(1)* %p, i1
 }
 
 ; GCN-LABEL: {{^}}urem_constant_sel_constants:
-; GCN: s_cselect_b64 s[{{[0-9]+}}:{{[0-9]+}}], 33, 3
+; GCN: s_cselect_b32 s{{[0-9]+}}, 33, 3
 define amdgpu_kernel void @urem_constant_sel_constants(i64 addrspace(1)* %p, i1 %cond) {
   %sel = select i1 %cond, i64 34, i64 15
   %bo = urem i64 33, %sel

diff  --git a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
index b66ab4e577aa..ad255818c9fe 100644
--- a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
@@ -38,23 +38,16 @@ entry:
 
 ; GCN-LABEL: {{^}}double4_extelt:
 ; GCN-NOT: buffer_
-; GCN-DAG: s_mov_b32 s[[L0LO:[0-9]+]], 0x47ae147b
-; GCN-DAG: s_mov_b32 s[[L0HI:[0-9]+]], 0x3f847ae1
-; GCN-DAG: s_mov_b32 s[[L1LO:[0-9]+]], 0xc28f5c29
-; GCN-DAG: s_mov_b32 s[[L1HI:[0-9]+]], 0x3ff028f5
 ; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
-; GCN: s_cselect_b64 s{{\[}}[[T0LO:[0-9]+]]:[[T0HI:[0-9]+]]{{\]}}, s{{\[}}[[L1LO]]:[[L1HI]]{{\]}}, s{{\[}}[[L0LO]]:[[L0HI]]{{\]}}
-; GCN-DAG: s_mov_b32 s[[L2LO:[0-9]+]], 0xe147ae14
-; GCN-DAG: s_mov_b32 s[[L2HI:[0-9]+]], 0x4000147a
-; GCN-DAG: s_cmp_lg_u32 [[IDX]], 2
-; GCN: s_cselect_b64 s{{\[}}[[T1LO:[0-9]+]]:[[T1HI:[0-9]+]]{{\]}}, s{{\[}}[[T0LO]]:[[T0HI]]{{\]}}, s{{\[}}[[L2LO]]:[[L2HI]]{{\]}}
-; GCN-DAG: s_mov_b32 s[[L3LO:[0-9]+]], 0x70a3d70a
-; GCN-DAG: s_mov_b32 s[[L3HI:[0-9]+]], 0x40100a3d
-; GCN-DAG: s_cmp_lg_u32 [[IDX]], 3
-; GCN: s_cselect_b64 s{{\[}}[[T2LO:[0-9]+]]:[[T2HI:[0-9]+]]{{\]}}, s{{\[}}[[T1LO]]:[[T1HI]]{{\]}}, s{{\[}}[[L3LO]]:[[L3HI]]{{\]}}
-; GCN-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[T2LO]]
-; GCN-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[T2HI]]
-; GCN: store_dwordx2 v[{{[0-9:]+}}], v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
+; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0
+; GCN-DAG: s_cmp_eq_u32 [[IDX]], 2
+; GCN-DAG: s_cselect_b64 [[C2:[^,]+]], -1, 0
+; GCN-DAG: s_cmp_eq_u32 [[IDX]], 3
+; GCN-DAG: s_cselect_b64 [[C3:[^,]+]], -1, 0
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C1]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C2]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C3]]
+; GCN: store_dwordx2 v[{{[0-9:]+}}]
 define amdgpu_kernel void @double4_extelt(double addrspace(1)* %out, i32 %sel) {
 entry:
   %ext = extractelement <4 x double> <double 0.01, double 1.01, double 2.01, double 4.01>, i32 %sel
@@ -64,27 +57,19 @@ entry:
 
 ; GCN-LABEL: {{^}}double5_extelt:
 ; GCN-NOT: buffer_
-; GCN-DAG: s_mov_b32 s[[L0LO:[0-9]+]], 0x47ae147b
-; GCN-DAG: s_mov_b32 s[[L0HI:[0-9]+]], 0x3f847ae1
-; GCN-DAG: s_mov_b32 s[[L1LO:[0-9]+]], 0xc28f5c29
-; GCN-DAG: s_mov_b32 s[[L1HI:[0-9]+]], 0x3ff028f5
 ; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
-; GCN: s_cselect_b64 s{{\[}}[[T0LO:[0-9]+]]:[[T0HI:[0-9]+]]{{\]}}, s{{\[}}[[L1LO]]:[[L1HI]]{{\]}}, s{{\[}}[[L0LO]]:[[L0HI]]{{\]}}
-; GCN-DAG: s_mov_b32 s[[L2LO:[0-9]+]], 0xe147ae14
-; GCN-DAG: s_mov_b32 s[[L2HI:[0-9]+]], 0x4000147a
-; GCN-DAG: s_cmp_lg_u32 [[IDX]], 2
-; GCN: s_cselect_b64 s{{\[}}[[T1LO:[0-9]+]]:[[T1HI:[0-9]+]]{{\]}}, s{{\[}}[[T0LO]]:[[T0HI]]{{\]}}, s{{\[}}[[L2LO]]:[[L2HI]]{{\]}}
-; GCN-DAG: s_mov_b32 s[[L3LO:[0-9]+]], 0x70a3d70a
-; GCN-DAG: s_mov_b32 s[[L3HI:[0-9]+]], 0x40100a3d
-; GCN-DAG: s_cmp_lg_u32 [[IDX]], 3
-; GCN: s_cselect_b64 s{{\[}}[[T2LO:[0-9]+]]:[[T2HI:[0-9]+]]{{\]}}, s{{\[}}[[T1LO]]:[[T1HI]]{{\]}}, s{{\[}}[[L3LO]]:[[L3HI]]{{\]}}
-; Double literals 5.01 and 4.01 share the same low 32 bits.
-; GCN-DAG: s_mov_b32 s[[L4HI:[0-9]+]], 0x40140a3d
-; GCN-DAG: s_cmp_lg_u32 [[IDX]], 4
-; GCN: s_cselect_b64 s{{\[}}[[T3LO:[0-9]+]]:[[T3HI:[0-9]+]]{{\]}}, s{{\[}}[[T2LO]]:[[T2HI]]{{\]}}, s{{\[}}[[L3LO]]:[[L4HI]]{{\]}}
-; GCN-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[T3LO]]
-; GCN-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[T3HI]]
-; GCN: store_dwordx2 v[{{[0-9:]+}}], v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
+; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0
+; GCN-DAG: s_cmp_eq_u32 [[IDX]], 2
+; GCN-DAG: s_cselect_b64 [[C2:[^,]+]], -1, 0
+; GCN-DAG: s_cmp_eq_u32 [[IDX]], 3
+; GCN-DAG: s_cselect_b64 [[C3:[^,]+]], -1, 0
+; GCN-DAG: s_cmp_eq_u32 [[IDX]], 4
+; GCN-DAG: s_cselect_b64 [[C4:[^,]+]], -1, 0
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C1]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C2]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C3]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C4]]
+; GCN: store_dwordx2 v[{{[0-9:]+}}]
 define amdgpu_kernel void @double5_extelt(double addrspace(1)* %out, i32 %sel) {
 entry:
   %ext = extractelement <5 x double> <double 0.01, double 1.01, double 2.01, double 4.01, double 5.01>, i32 %sel
@@ -122,15 +107,11 @@ entry:
 
 ; GCN-LABEL: {{^}}double2_extelt:
 ; GCN-NOT: buffer_
-; GCN-DAG: s_mov_b32 s[[L0LO:[0-9]+]], 0x47ae147b
-; GCN-DAG: s_mov_b32 s[[L0HI:[0-9]+]], 0x3f847ae1
-; GCN-DAG: s_mov_b32 s[[L1LO:[0-9]+]], 0xc28f5c29
-; GCN-DAG: s_mov_b32 s[[L1HI:[0-9]+]], 0x3ff028f5
 ; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
-; GCN: s_cselect_b64 s{{\[}}[[T0LO:[0-9]+]]:[[T0HI:[0-9]+]]{{\]}}, s{{\[}}[[L1LO]]:[[L1HI]]{{\]}}, s{{\[}}[[L0LO]]:[[L0HI]]{{\]}}
-; GCN-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[T0LO]]
-; GCN-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[T0HI]]
-; GCN: store_dwordx2 v[{{[0-9:]+}}], v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
+; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C1]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C1]]
+; GCN: store_dwordx2 v[{{[0-9:]+}}]
 define amdgpu_kernel void @double2_extelt(double addrspace(1)* %out, i32 %sel) {
 entry:
   %ext = extractelement <2 x double> <double 0.01, double 1.01>, i32 %sel

diff  --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f64.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f64.ll
index 35b2d4d8306d..094ae27b5c57 100644
--- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f64.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
 
 ; GCN-LABEL: {{^}}extract_vector_elt_v3f64_2:
 ; GCN: buffer_load_dwordx4
@@ -14,22 +14,15 @@ define amdgpu_kernel void @extract_vector_elt_v3f64_2(double addrspace(1)* %out,
 
 ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v3f64:
 ; GCN-NOT: buffer_load
-; SI-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
-; SI-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0
-; SI-DAG: s_cmp_eq_u32 [[IDX]], 2
-; SI-DAG: s_cselect_b64 [[C2:[^,]+]], -1, 0
-; SI-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
-; SI-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
-; SI-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]]
-; SI-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]]
-; SI: store_dwordx2 v[{{[0-9:]+}}]
-; VI: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
-; VI: s_cselect_b64 s{{\[}}[[T0LO:[0-9]+]]:[[T0HI:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}]
-; VI: s_cmp_eq_u32 [[IDX:s[0-9]+]], 2
-; VI: s_cselect_b64 s{{\[}}[[T1LO:[0-9]+]]:[[T1HI:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], s{{\[}}[[T0LO]]:[[T0HI]]{{\]}}
-; VI-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[T1LO]]
-; VI-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[T1HI]]
-; VI: store_dwordx2 v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
+; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
+; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0
+; GCN-DAG: s_cmp_eq_u32 [[IDX]], 2
+; GCN-DAG: s_cselect_b64 [[C2:[^,]+]], -1, 0
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]]
+; GCN: store_dwordx2 v[{{[0-9:]+}}]
 define amdgpu_kernel void @dyn_extract_vector_elt_v3f64(double addrspace(1)* %out, <3 x double> %foo, i32 %elt) #0 {
   %dynelt = extractelement <3 x double> %foo, i32 %elt
   store volatile double %dynelt, double addrspace(1)* %out
@@ -38,28 +31,19 @@ define amdgpu_kernel void @dyn_extract_vector_elt_v3f64(double addrspace(1)* %ou
 
 ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v4f64:
 ; GCN-NOT: buffer_load
-; SI-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
-; SI-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0
-; SI-DAG: s_cmp_eq_u32 [[IDX]], 2
-; SI-DAG: s_cselect_b64 [[C2:[^,]+]], -1, 0
-; SI-DAG: s_cmp_eq_u32 [[IDX]], 3
-; SI-DAG: s_cselect_b64 [[C3:[^,]+]], -1, 0
-; SI-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
-; SI-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
-; SI-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]]
-; SI-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]]
-; SI-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C3]]
-; SI-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C3]]
-; SI: store_dwordx2 v[{{[0-9:]+}}]
-; VI: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
-; VI: s_cselect_b64 s{{\[}}[[T0LO:[0-9]+]]:[[T0HI:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}]
-; VI: s_cmp_eq_u32 [[IDX:s[0-9]+]], 2
-; VI: s_cselect_b64 s{{\[}}[[T1LO:[0-9]+]]:[[T1HI:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], s{{\[}}[[T0LO]]:[[T0HI]]{{\]}}
-; VI: s_cmp_eq_u32 [[IDX:s[0-9]+]], 3
-; VI: s_cselect_b64 s{{\[}}[[T2LO:[0-9]+]]:[[T2HI:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], s{{\[}}[[T1LO]]:[[T1HI]]{{\]}}
-; VI-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[T2LO]]
-; VI-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[T2HI]]
-; VI: store_dwordx2 v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
+; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
+; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0
+; GCN-DAG: s_cmp_eq_u32 [[IDX]], 2
+; GCN-DAG: s_cselect_b64 [[C2:[^,]+]], -1, 0
+; GCN-DAG: s_cmp_eq_u32 [[IDX]], 3
+; GCN-DAG: s_cselect_b64 [[C3:[^,]+]], -1, 0
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C3]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C3]]
+; GCN: store_dwordx2 v[{{[0-9:]+}}]
 define amdgpu_kernel void @dyn_extract_vector_elt_v4f64(double addrspace(1)* %out, <4 x double> %foo, i32 %elt) #0 {
   %dynelt = extractelement <4 x double> %foo, i32 %elt
   store volatile double %dynelt, double addrspace(1)* %out

diff  --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i64.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i64.ll
index 985490592487..248f5fc985ee 100644
--- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i64.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
 ; How the replacement of i64 stores with v2i32 stores resulted in
 ; breaking other users of the bitcast if they already existed
@@ -32,14 +32,10 @@ define amdgpu_kernel void @extract_vector_elt_v2i64(i64 addrspace(1)* %out, <2 x
 ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v2i64:
 ; GCN-NOT: buffer_load
 ; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
-; SI-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0
-; SI-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
-; SI-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
-; SI: store_dwordx2 v[{{[0-9:]+}}]
-; VI: s_cselect_b64 s{{\[}}[[S_LO:[0-9]+]]:[[S_HI:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}]
-; VI-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[S_LO]]
-; VI-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[S_HI]]
-; VI: store_dwordx2 v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
+; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
+; GCN: store_dwordx2 v[{{[0-9:]+}}]
 define amdgpu_kernel void @dyn_extract_vector_elt_v2i64(i64 addrspace(1)* %out, <2 x i64> %foo, i32 %elt) #0 {
   %dynelt = extractelement <2 x i64> %foo, i32 %elt
   store volatile i64 %dynelt, i64 addrspace(1)* %out
@@ -63,23 +59,16 @@ define amdgpu_kernel void @dyn_extract_vector_elt_v2i64_2(i64 addrspace(1)* %out
 }
 
 ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v3i64:
-; SI-NOT: buffer_load
-; SI-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
-; SI-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0
-; SI-DAG: s_cmp_eq_u32 [[IDX]], 2
-; SI-DAG: s_cselect_b64 [[C2:[^,]+]], -1, 0
-; SI-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
-; SI-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]]
-; SI-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
-; SI-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]]
-; SI: store_dwordx2 v[{{[0-9:]+}}]
-; VI: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
-; VI: s_cselect_b64 s{{\[}}[[T0LO:[0-9]+]]:[[T0HI:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}]
-; VI: s_cmp_eq_u32 [[IDX:s[0-9]+]], 2
-; VI: s_cselect_b64 s{{\[}}[[T1LO:[0-9]+]]:[[T1HI:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], s{{\[}}[[T0LO]]:[[T0HI]]{{\]}}
-; VI-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[T1LO]]
-; VI-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[T1HI]]
-; VI: store_dwordx2 v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
+; GCN-NOT: buffer_load
+; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
+; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0
+; GCN-DAG: s_cmp_eq_u32 [[IDX]], 2
+; GCN-DAG: s_cselect_b64 [[C2:[^,]+]], -1, 0
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]]
+; GCN: store_dwordx2 v[{{[0-9:]+}}]
 define amdgpu_kernel void @dyn_extract_vector_elt_v3i64(i64 addrspace(1)* %out, <3 x i64> %foo, i32 %elt) #0 {
   %dynelt = extractelement <3 x i64> %foo, i32 %elt
   store volatile i64 %dynelt, i64 addrspace(1)* %out
@@ -88,28 +77,19 @@ define amdgpu_kernel void @dyn_extract_vector_elt_v3i64(i64 addrspace(1)* %out,
 
 ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v4i64:
 ; GCN-NOT: buffer_load
-; SI-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
-; SI-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0
-; SI-DAG: s_cmp_eq_u32 [[IDX]], 2
-; SI-DAG: s_cselect_b64 [[C2:[^,]+]], -1, 0
-; SI-DAG: s_cmp_eq_u32 [[IDX]], 3
-; SI-DAG: s_cselect_b64 [[C3:[^,]+]], -1, 0
-; SI-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
-; SI-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]]
-; SI-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
-; SI-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]]
-; SI-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C3]]
-; SI-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C3]]
-; SI: store_dwordx2 v[{{[0-9:]+}}]
-; VI: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
-; VI: s_cselect_b64 s{{\[}}[[T0LO:[0-9]+]]:[[T0HI:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}]
-; VI: s_cmp_eq_u32 [[IDX:s[0-9]+]], 2
-; VI: s_cselect_b64 s{{\[}}[[T1LO:[0-9]+]]:[[T1HI:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], s{{\[}}[[T0LO]]:[[T0HI]]{{\]}}
-; VI: s_cmp_eq_u32 [[IDX:s[0-9]+]], 3
-; VI: s_cselect_b64 s{{\[}}[[T2LO:[0-9]+]]:[[T2HI:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], s{{\[}}[[T1LO]]:[[T1HI]]{{\]}}
-; VI-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[T2LO]]
-; VI-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[T2HI]]
-; VI: store_dwordx2 v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
+; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
+; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0
+; GCN-DAG: s_cmp_eq_u32 [[IDX]], 2
+; GCN-DAG: s_cselect_b64 [[C2:[^,]+]], -1, 0
+; GCN-DAG: s_cmp_eq_u32 [[IDX]], 3
+; GCN-DAG: s_cselect_b64 [[C3:[^,]+]], -1, 0
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C3]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C3]]
+; GCN: store_dwordx2 v[{{[0-9:]+}}]
 define amdgpu_kernel void @dyn_extract_vector_elt_v4i64(i64 addrspace(1)* %out, <4 x i64> %foo, i32 %elt) #0 {
   %dynelt = extractelement <4 x i64> %foo, i32 %elt
   store volatile i64 %dynelt, i64 addrspace(1)* %out

diff  --git a/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll b/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll
index e216a0b4562c..2afce5352b60 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s
 
 ; We have an indirect call with a known set of callees, which are
@@ -6,12 +7,54 @@
 
 ; FIXME: Passing real values for workitem ID, and 0s that can be undef
 
+define amdgpu_kernel void @indirect_call_known_no_special_inputs() {
 ; CHECK-LABEL: indirect_call_known_no_special_inputs:
-
-; CHECK-DAG: s_cselect_b64 [[CALL_TARGET:s\[[0-9]+:[0-9]+\]]]
-; CHECK-DAG: s_mov_b64 s[8:9], 0
-; CHECK-DAG: v_mov_b32_e32 v31, v0
-; CHECK: s_swappc_b64 s[30:31], [[CALL_TARGET]]
+; CHECK:       ; %bb.0: ; %bb
+; CHECK-NEXT:    s_add_u32 flat_scratch_lo, s4, s7
+; CHECK-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
+; CHECK-NEXT:    s_mov_b64 s[4:5], 0
+; CHECK-NEXT:    s_load_dword s4, s[4:5], 0x0
+; CHECK-NEXT:    s_add_u32 s0, s0, s7
+; CHECK-NEXT:    s_addc_u32 s1, s1, 0
+; CHECK-NEXT:    s_mov_b32 s33, s6
+; CHECK-NEXT:    v_mov_b32_e32 v31, v0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_bitcmp1_b32 s4, 0
+; CHECK-NEXT:    s_cselect_b64 vcc, -1, 0
+; CHECK-NEXT:    s_getpc_b64 s[4:5]
+; CHECK-NEXT:    s_add_u32 s4, s4, wobble@gotpcrel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s5, s5, wobble@gotpcrel32@hi+12
+; CHECK-NEXT:    s_getpc_b64 s[6:7]
+; CHECK-NEXT:    s_add_u32 s6, s6, snork@gotpcrel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s7, s7, snork@gotpcrel32@hi+12
+; CHECK-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
+; CHECK-NEXT:    s_load_dwordx2 s[10:11], s[4:5], 0x0
+; CHECK-NEXT:    s_mov_b32 s32, 0
+; CHECK-NEXT:    s_mov_b64 s[4:5], exec
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v0, s9
+; CHECK-NEXT:    v_mov_b32_e32 v1, s11
+; CHECK-NEXT:    v_mov_b32_e32 v2, s8
+; CHECK-NEXT:    v_mov_b32_e32 v4, s10
+; CHECK-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0
+; CHECK-NEXT:  .LBB0_1: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    v_readfirstlane_b32 s4, v2
+; CHECK-NEXT:    v_readfirstlane_b32 s5, v3
+; CHECK-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[2:3]
+; CHECK-NEXT:    s_and_saveexec_b64 s[34:35], vcc
+; CHECK-NEXT:    s_mov_b64 s[8:9], 0
+; CHECK-NEXT:    s_mov_b32 s12, s33
+; CHECK-NEXT:    v_mov_b32_e32 v4, v1
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; CHECK-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; CHECK-NEXT:    ; implicit-def: $vgpr31
+; CHECK-NEXT:    ; implicit-def: $vgpr1
+; CHECK-NEXT:    s_xor_b64 exec, exec, s[34:35]
+; CHECK-NEXT:    s_cbranch_execnz .LBB0_1
+; CHECK-NEXT:  ; %bb.2:
+; CHECK-NEXT:    s_endpgm
 
 ; CHECK: .amdhsa_kernarg_size 0
 ; CHECK-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1
@@ -27,7 +70,6 @@
 ; CHECK-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0
 ; CHECK-NEXT: .amdhsa_system_sgpr_workgroup_info 0
 ; CHECK-NEXT: .amdhsa_system_vgpr_workitem_id 0
-define amdgpu_kernel void @indirect_call_known_no_special_inputs() {
 bb:
   %cond = load i1, i1 addrspace(4)* null
   %tmp = select i1 %cond, void (i8*, i32, i8*)* bitcast (void ()* @wobble to void (i8*, i32, i8*)*), void (i8*, i32, i8*)* bitcast (void ()* @snork to void (i8*, i32, i8*)*)
@@ -36,11 +78,19 @@ bb:
 }
 
 define void @wobble() {
+; CHECK-LABEL: wobble:
+; CHECK:       ; %bb.0: ; %bb
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
 bb:
   ret void
 }
 
 define void @snork() {
+; CHECK-LABEL: snork:
+; CHECK:       ; %bb.0: ; %bb
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
 bb:
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
index 2ea25fe7a31c..3a4923693470 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
@@ -285,18 +285,16 @@ entry:
 }
 
 ; GCN-LABEL: {{^}}double2_inselt:
-; GCN: s_load_dwordx4 s{{\[}}[[FIRST:[0-9]+]]:[[LAST:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}]
 ; GCN-NOT: v_movrel
 ; GCN-NOT: buffer_
-; GCN: s_cmp_lg_u32 [[IDX:s[0-9]+]], 1
-; GCN: s_cselect_b64 s{{\[}}[[P0_LO:[0-9]+]]:[[P0_HI:[0-9]+]]{{\]}}, s{{\[}}{{[0-9]+}}:[[LAST]]{{\]}}, 1.0
-; GCN: s_cmp_lg_u32 [[IDX]], 0
-; GCN: s_cselect_b64 s{{\[}}[[P1_LO:[0-9]+]]:[[P1_HI:[0-9]+]]{{\]}}, s{{\[}}[[FIRST]]:{{[0-9]+}}{{\]}}, 1.0
-; GCN: v_mov_b32_e32 v[[V_FIRST:[0-9]+]], s[[P1_LO]]
-; GCN: v_mov_b32_e32 v[[V_SECOND:[0-9]+]], s[[P1_HI]]
-; GCN: v_mov_b32_e32 v[[V_THIRD:[0-9]+]], s[[P0_LO]]
-; GCN: v_mov_b32_e32 v[[V_LAST:[0-9]+]], s[[P0_HI]]
-; GCN: flat_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{\[}}[[V_FIRST]]:[[V_LAST]]{{\]}}
+; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
+; GCN-DAG: s_cselect_b64 [[CC1:[^,]+]], -1, 0
+; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[CC1]]
+; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC1]]
+; GCN-DAG: s_cmp_eq_u32 [[IDX]], 0
+; GCN-DAG: s_cselect_b64 [[CC2:[^,]+]], -1, 0
+; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[CC2]]
+; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC2]]
 define amdgpu_kernel void @double2_inselt(<2 x double> addrspace(1)* %out, <2 x double> %vec, i32 %sel) {
 entry:
   %v = insertelement <2 x double> %vec, double 1.000000e+00, i32 %sel
@@ -307,7 +305,7 @@ entry:
 ; GCN-LABEL: {{^}}double5_inselt:
 ; GCN-NOT: v_movrel
 ; GCN-NOT: buffer_
-; GCN-COUNT-5: s_cselect_b64
+; GCN-COUNT-10: v_cndmask_b32
 define amdgpu_kernel void @double5_inselt(<5 x double> addrspace(1)* %out, <5 x double> %vec, i32 %sel) {
 entry:
   %v = insertelement <5 x double> %vec, double 1.000000e+00, i32 %sel

diff  --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
index bbdff9c4e897..89d319d24bd0 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -1627,23 +1627,26 @@ define amdgpu_kernel void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)
 ;
 ; VI-LABEL: dynamic_insertelement_v2f64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dword s10, s[4:5], 0x60
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x30
-; VI-NEXT:    s_mov_b32 s8, 0
-; VI-NEXT:    s_mov_b32 s9, 0x40200000
-; VI-NEXT:    s_mov_b32 s3, 0x1100f000
+; VI-NEXT:    s_load_dword s8, s[4:5], 0x60
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x30
+; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; VI-NEXT:    v_mov_b32_e32 v1, 0x40200000
+; VI-NEXT:    s_mov_b32 s7, 0x1100f000
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_cmp_lg_u32 s10, 1
-; VI-NEXT:    s_cselect_b64 s[6:7], s[6:7], s[8:9]
-; VI-NEXT:    s_cmp_lg_u32 s10, 0
-; VI-NEXT:    s_cselect_b64 s[4:5], s[4:5], s[8:9]
-; VI-NEXT:    s_mov_b32 s2, -1
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    v_mov_b32_e32 v2, s6
-; VI-NEXT:    v_mov_b32_e32 v3, s7
-; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VI-NEXT:    s_cmp_eq_u32 s8, 1
+; VI-NEXT:    v_mov_b32_e32 v0, s3
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
+; VI-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    s_cmp_eq_u32 s8, 0
+; VI-NEXT:    v_cndmask_b32_e64 v2, v0, 0, vcc
+; VI-NEXT:    v_mov_b32_e32 v0, s1
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
+; VI-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
+; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
   %vecins = insertelement <2 x double> %a, double 8.0, i32 %b
   store <2 x double> %vecins, <2 x double> addrspace(1)* %out, align 16
@@ -1676,20 +1679,24 @@ define amdgpu_kernel void @dynamic_insertelement_v2i64(<2 x i64> addrspace(1)* %
 ;
 ; VI-LABEL: dynamic_insertelement_v2i64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dword s8, s[4:5], 0x20
+; VI-NEXT:    s_load_dword s10, s[4:5], 0x20
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; VI-NEXT:    s_mov_b32 s7, 0x1100f000
 ; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_cmp_lg_u32 s8, 1
-; VI-NEXT:    s_cselect_b64 s[2:3], s[2:3], 5
-; VI-NEXT:    s_cmp_lg_u32 s8, 0
-; VI-NEXT:    s_cselect_b64 s[0:1], s[0:1], 5
+; VI-NEXT:    s_cmp_eq_u32 s10, 1
+; VI-NEXT:    v_mov_b32_e32 v0, s3
+; VI-NEXT:    s_cselect_b64 s[8:9], -1, 0
+; VI-NEXT:    v_cndmask_b32_e64 v3, v0, 0, s[8:9]
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    s_cmp_eq_u32 s10, 0
+; VI-NEXT:    v_cndmask_b32_e64 v2, v0, 5, s[8:9]
+; VI-NEXT:    v_mov_b32_e32 v0, s1
+; VI-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; VI-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[2:3]
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    v_mov_b32_e32 v3, s3
+; VI-NEXT:    v_cndmask_b32_e64 v0, v0, 5, s[2:3]
 ; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
   %vecins = insertelement <2 x i64> %a, i64 5, i32 %b
@@ -1737,20 +1744,26 @@ define amdgpu_kernel void @dynamic_insertelement_v3i64(<3 x i64> addrspace(1)* %
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x30
 ; VI-NEXT:    s_mov_b32 s3, 0x1100f000
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_cmp_lg_u32 s12, 1
-; VI-NEXT:    s_mov_b32 s2, -1
-; VI-NEXT:    s_cselect_b64 s[6:7], s[10:11], 5
-; VI-NEXT:    s_cmp_lg_u32 s12, 0
-; VI-NEXT:    s_cselect_b64 s[8:9], s[8:9], 5
-; VI-NEXT:    s_cmp_lg_u32 s12, 2
-; VI-NEXT:    s_cselect_b64 s[4:5], s[4:5], 5
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0 offset:16
+; VI-NEXT:    s_cmp_eq_u32 s12, 1
+; VI-NEXT:    s_cselect_b64 s[6:7], -1, 0
+; VI-NEXT:    v_mov_b32_e32 v0, s11
+; VI-NEXT:    v_cndmask_b32_e64 v3, v0, 0, s[6:7]
+; VI-NEXT:    v_mov_b32_e32 v0, s10
+; VI-NEXT:    s_cmp_eq_u32 s12, 0
+; VI-NEXT:    v_cndmask_b32_e64 v2, v0, 5, s[6:7]
+; VI-NEXT:    v_mov_b32_e32 v0, s9
+; VI-NEXT:    s_cselect_b64 s[6:7], -1, 0
+; VI-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[6:7]
 ; VI-NEXT:    v_mov_b32_e32 v0, s8
-; VI-NEXT:    v_mov_b32_e32 v1, s9
-; VI-NEXT:    v_mov_b32_e32 v2, s6
-; VI-NEXT:    v_mov_b32_e32 v3, s7
+; VI-NEXT:    s_cmp_eq_u32 s12, 2
+; VI-NEXT:    v_cndmask_b32_e64 v0, v0, 5, s[6:7]
+; VI-NEXT:    v_mov_b32_e32 v4, s5
+; VI-NEXT:    s_cselect_b64 s[6:7], -1, 0
+; VI-NEXT:    v_cndmask_b32_e64 v5, v4, 0, s[6:7]
+; VI-NEXT:    v_mov_b32_e32 v4, s4
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    v_cndmask_b32_e64 v4, v4, 5, s[6:7]
+; VI-NEXT:    buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:16
 ; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
   %vecins = insertelement <3 x i64> %a, i64 5, i32 %b
@@ -1798,32 +1811,38 @@ define amdgpu_kernel void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)
 ;
 ; VI-LABEL: dynamic_insertelement_v4f64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dword s16, s[4:5], 0x40
+; VI-NEXT:    s_load_dword s6, s[4:5], 0x40
 ; VI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x20
+; VI-NEXT:    v_mov_b32_e32 v4, 0x40200000
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; VI-NEXT:    s_mov_b32 s4, 0
-; VI-NEXT:    s_mov_b32 s5, 0x40200000
 ; VI-NEXT:    s_mov_b32 s3, 0x1100f000
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_cmp_lg_u32 s16, 1
-; VI-NEXT:    s_cselect_b64 s[6:7], s[10:11], s[4:5]
-; VI-NEXT:    s_cmp_lg_u32 s16, 0
-; VI-NEXT:    s_cselect_b64 s[8:9], s[8:9], s[4:5]
-; VI-NEXT:    s_cmp_lg_u32 s16, 3
-; VI-NEXT:    s_cselect_b64 s[10:11], s[14:15], s[4:5]
-; VI-NEXT:    s_cmp_lg_u32 s16, 2
-; VI-NEXT:    s_cselect_b64 s[4:5], s[12:13], s[4:5]
-; VI-NEXT:    s_mov_b32 s2, -1
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    v_mov_b32_e32 v2, s10
-; VI-NEXT:    v_mov_b32_e32 v3, s11
-; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
-; VI-NEXT:    s_nop 0
+; VI-NEXT:    s_cmp_eq_u32 s6, 1
+; VI-NEXT:    v_mov_b32_e32 v0, s11
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
+; VI-NEXT:    v_cndmask_b32_e32 v3, v0, v4, vcc
+; VI-NEXT:    v_mov_b32_e32 v0, s10
+; VI-NEXT:    s_cmp_eq_u32 s6, 0
+; VI-NEXT:    v_cndmask_b32_e64 v2, v0, 0, vcc
+; VI-NEXT:    v_mov_b32_e32 v0, s9
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
+; VI-NEXT:    v_cndmask_b32_e32 v1, v0, v4, vcc
 ; VI-NEXT:    v_mov_b32_e32 v0, s8
-; VI-NEXT:    v_mov_b32_e32 v1, s9
-; VI-NEXT:    v_mov_b32_e32 v2, s6
-; VI-NEXT:    v_mov_b32_e32 v3, s7
+; VI-NEXT:    s_cmp_eq_u32 s6, 3
+; VI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
+; VI-NEXT:    v_mov_b32_e32 v5, s15
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
+; VI-NEXT:    v_cndmask_b32_e32 v7, v5, v4, vcc
+; VI-NEXT:    v_mov_b32_e32 v5, s14
+; VI-NEXT:    s_cmp_eq_u32 s6, 2
+; VI-NEXT:    v_cndmask_b32_e64 v6, v5, 0, vcc
+; VI-NEXT:    v_mov_b32_e32 v5, s13
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
+; VI-NEXT:    v_cndmask_b32_e32 v5, v5, v4, vcc
+; VI-NEXT:    v_mov_b32_e32 v4, s12
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    v_cndmask_b32_e64 v4, v4, 0, vcc
+; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
 ; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
   %vecins = insertelement <4 x double> %a, double 8.0, i32 %b

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
index 618236dd8645..dc03f7180e86 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
@@ -345,39 +345,46 @@ define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) {
 ; GFX9:       ; %bb.0: ; %bb
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mul_i32 s9, s0, s3
-; GFX9-NEXT:    s_mul_hi_u32 s10, s0, s2
-; GFX9-NEXT:    s_mul_hi_u32 s5, s0, s3
-; GFX9-NEXT:    s_add_u32 s6, s10, s9
-; GFX9-NEXT:    s_mul_i32 s8, s1, s2
-; GFX9-NEXT:    s_addc_u32 s5, 0, s5
+; GFX9-NEXT:    s_mul_i32 s7, s0, s3
+; GFX9-NEXT:    s_mul_hi_u32 s8, s0, s2
+; GFX9-NEXT:    s_mul_hi_u32 s6, s0, s3
+; GFX9-NEXT:    s_add_u32 s9, s8, s7
+; GFX9-NEXT:    s_mul_i32 s5, s1, s2
+; GFX9-NEXT:    s_addc_u32 s6, 0, s6
 ; GFX9-NEXT:    s_mul_hi_u32 s4, s1, s2
-; GFX9-NEXT:    s_add_u32 s6, s6, s8
-; GFX9-NEXT:    s_mul_hi_i32 s7, s1, s3
-; GFX9-NEXT:    s_addc_u32 s4, s5, s4
-; GFX9-NEXT:    s_addc_u32 s5, s7, 0
-; GFX9-NEXT:    s_mul_i32 s6, s1, s3
-; GFX9-NEXT:    s_add_u32 s4, s4, s6
-; GFX9-NEXT:    s_addc_u32 s5, 0, s5
-; GFX9-NEXT:    s_sub_u32 s6, s4, s2
-; GFX9-NEXT:    s_subb_u32 s7, s5, 0
+; GFX9-NEXT:    s_add_u32 s9, s9, s5
+; GFX9-NEXT:    s_mul_hi_i32 s10, s1, s3
+; GFX9-NEXT:    s_addc_u32 s4, s6, s4
+; GFX9-NEXT:    s_addc_u32 s6, s10, 0
+; GFX9-NEXT:    s_mul_i32 s9, s1, s3
+; GFX9-NEXT:    s_add_u32 s4, s4, s9
+; GFX9-NEXT:    s_addc_u32 s6, 0, s6
+; GFX9-NEXT:    s_sub_u32 s9, s4, s2
+; GFX9-NEXT:    s_subb_u32 s10, s6, 0
 ; GFX9-NEXT:    s_cmp_lt_i32 s1, 0
-; GFX9-NEXT:    s_cselect_b64 s[4:5], s[6:7], s[4:5]
-; GFX9-NEXT:    s_sub_u32 s6, s4, s0
-; GFX9-NEXT:    s_subb_u32 s7, s5, 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-NEXT:    v_mov_b32_e32 v1, s10
+; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-NEXT:    v_mov_b32_e32 v2, s9
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v1, v2, vcc
+; GFX9-NEXT:    v_subrev_co_u32_e32 v3, vcc, s0, v2
+; GFX9-NEXT:    v_subbrev_co_u32_e32 v1, vcc, 0, v0, vcc
 ; GFX9-NEXT:    s_cmp_lt_i32 s3, 0
-; GFX9-NEXT:    s_cselect_b64 s[4:5], s[6:7], s[4:5]
-; GFX9-NEXT:    s_add_i32 s1, s10, s9
-; GFX9-NEXT:    s_add_i32 s1, s1, s8
-; GFX9-NEXT:    s_ashr_i32 s6, s1, 31
-; GFX9-NEXT:    s_mov_b32 s7, s6
-; GFX9-NEXT:    s_cmp_lg_u64 s[4:5], s[6:7]
-; GFX9-NEXT:    s_mul_i32 s2, s0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v0, s1
-; GFX9-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[0:1]
+; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX9-NEXT:    s_add_i32 s1, s8, s7
+; GFX9-NEXT:    s_add_i32 s1, s1, s5
+; GFX9-NEXT:    s_ashr_i32 s4, s1, 31
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX9-NEXT:    s_mov_b32 s5, s4
+; GFX9-NEXT:    s_mul_i32 s0, s0, s2
+; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, s[4:5], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v2, s1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v2, 0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
 ; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -385,37 +392,42 @@ define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) {
 ; GFX10:       ; %bb.0: ; %bb
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_mul_i32 s9, s0, s3
-; GFX10-NEXT:    s_mul_hi_u32 s10, s0, s2
-; GFX10-NEXT:    s_mul_hi_u32 s5, s0, s3
-; GFX10-NEXT:    s_mul_i32 s8, s1, s2
-; GFX10-NEXT:    s_add_u32 s11, s10, s9
+; GFX10-NEXT:    s_mul_i32 s7, s0, s3
+; GFX10-NEXT:    s_mul_hi_u32 s8, s0, s2
+; GFX10-NEXT:    s_mul_hi_u32 s6, s0, s3
+; GFX10-NEXT:    s_mul_i32 s5, s1, s2
+; GFX10-NEXT:    s_add_u32 s11, s8, s7
 ; GFX10-NEXT:    s_mul_hi_u32 s4, s1, s2
-; GFX10-NEXT:    s_addc_u32 s5, 0, s5
-; GFX10-NEXT:    s_mul_hi_i32 s6, s1, s3
-; GFX10-NEXT:    s_add_u32 s11, s11, s8
-; GFX10-NEXT:    s_mul_i32 s7, s1, s3
-; GFX10-NEXT:    s_addc_u32 s4, s5, s4
-; GFX10-NEXT:    s_addc_u32 s5, s6, 0
-; GFX10-NEXT:    s_add_u32 s4, s4, s7
-; GFX10-NEXT:    s_addc_u32 s5, 0, s5
-; GFX10-NEXT:    s_sub_u32 s6, s4, s2
-; GFX10-NEXT:    s_subb_u32 s7, s5, 0
+; GFX10-NEXT:    s_addc_u32 s6, 0, s6
+; GFX10-NEXT:    s_mul_hi_i32 s9, s1, s3
+; GFX10-NEXT:    s_add_u32 s11, s11, s5
+; GFX10-NEXT:    s_mul_i32 s10, s1, s3
+; GFX10-NEXT:    s_addc_u32 s4, s6, s4
+; GFX10-NEXT:    s_addc_u32 s6, s9, 0
+; GFX10-NEXT:    s_add_u32 s4, s4, s10
+; GFX10-NEXT:    s_addc_u32 s6, 0, s6
+; GFX10-NEXT:    s_sub_u32 s9, s4, s2
+; GFX10-NEXT:    s_subb_u32 s10, s6, 0
+; GFX10-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX10-NEXT:    s_cmp_lt_i32 s1, 0
-; GFX10-NEXT:    s_cselect_b64 s[4:5], s[6:7], s[4:5]
-; GFX10-NEXT:    s_sub_u32 s6, s4, s0
-; GFX10-NEXT:    s_subb_u32 s7, s5, 0
+; GFX10-NEXT:    v_mov_b32_e32 v0, s10
+; GFX10-NEXT:    s_cselect_b32 vcc_lo, -1, 0
 ; GFX10-NEXT:    s_cmp_lt_i32 s3, 0
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, s4, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, s6, v0, vcc_lo
+; GFX10-NEXT:    v_sub_co_u32 v3, vcc_lo, v2, s0
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v1, vcc_lo, 0, v0, vcc_lo
+; GFX10-NEXT:    s_cselect_b32 vcc_lo, -1, 0
+; GFX10-NEXT:    s_add_i32 s1, s8, s7
 ; GFX10-NEXT:    s_mul_i32 s0, s0, s2
-; GFX10-NEXT:    s_cselect_b64 s[4:5], s[6:7], s[4:5]
-; GFX10-NEXT:    s_add_i32 s1, s10, s9
-; GFX10-NEXT:    s_add_i32 s1, s1, s8
-; GFX10-NEXT:    s_ashr_i32 s6, s1, 31
-; GFX10-NEXT:    s_mov_b32 s7, s6
-; GFX10-NEXT:    s_cmp_lg_u64 s[4:5], s[6:7]
-; GFX10-NEXT:    s_cselect_b32 s2, -1, 0
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, s1, 0, s2
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, s0, 0, s2
+; GFX10-NEXT:    s_add_i32 s1, s1, s5
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc_lo
+; GFX10-NEXT:    s_ashr_i32 s4, s1, 31
+; GFX10-NEXT:    s_mov_b32 s5, s4
+; GFX10-NEXT:    v_cmp_ne_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, s1, 0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, s0, 0, vcc_lo
 ; GFX10-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
 ; GFX10-NEXT:    s_endpgm
 bb:

diff  --git a/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll b/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll
index 3634cedfb0bb..407a4e5f1b76 100644
--- a/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll
@@ -9,7 +9,8 @@
 ; GCN: s_load_dwordx2
 
 ; GCN: s_cmp_eq_u32
-; GCN: s_cselect_b64
+; GCN: v_cndmask_b32
+; GCN: v_cndmask_b32
 
 ; GCN-NOT: load_dword
 ; GCN: flat_load_dwordx2
@@ -34,7 +35,8 @@ define amdgpu_kernel void @select_ptr_crash_i64_flat(i32 %tmp, [8 x i32], i64* %
 ; GCN: s_load_dwordx2
 ; GCN: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
 ; GCN: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
-; GCN: s_cselect_b64
+; GCN: v_cndmask_b32
+; GCN: v_cndmask_b32
 ; GCN: flat_store_dwordx2
 define amdgpu_kernel void @select_ptr_crash_i64_global(i32 %tmp, [8 x i32], i64 addrspace(1)* %ptr0, [8 x i32], i64 addrspace(1)* %ptr1, [8 x i32], i64 addrspace(1)* %ptr2) {
   %tmp2 = icmp eq i32 %tmp, 0

diff  --git a/llvm/test/CodeGen/AMDGPU/select64.ll b/llvm/test/CodeGen/AMDGPU/select64.ll
index 5a8b83c52370..61ce9c12526d 100644
--- a/llvm/test/CodeGen/AMDGPU/select64.ll
+++ b/llvm/test/CodeGen/AMDGPU/select64.ll
@@ -1,53 +1,13 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck -check-prefix=SI %s
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=VI %s
-; RUN: llc < %s -march=amdgcn -mcpu=gfx90a -verify-machineinstrs | FileCheck -check-prefix=GFX90A %s
+; RUN: llc < %s -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck -check-prefixes=SI,GCN %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=VI,GCN %s
 
+; GCN-LABEL: {{^}}select0:
+; i64 select should be split into two i32 selects, and we shouldn't need
+; to use a shift to extract the hi dword of the input.
+; GCN-NOT: s_lshr_b64
+; GCN: v_cndmask
+; GCN: v_cndmask
 define amdgpu_kernel void @select0(i64 addrspace(1)* %out, i32 %cond, i64 %in) {
-; SI-LABEL: select0:
-; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dword s6, s[0:1], 0xb
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s2, -1
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_cmp_lt_u32 s6, 6
-; SI-NEXT:    v_mov_b32_e32 v0, s5
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
-; SI-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
-; SI-NEXT:    s_endpgm
-;
-; VI-LABEL: select0:
-; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_cmp_lt_u32 s4, 6
-; VI-NEXT:    s_cselect_b64 s[2:3], s[2:3], 0
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_mov_b32_e32 v3, s3
-; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
-; VI-NEXT:    s_endpgm
-;
-; GFX90A-LABEL: select0:
-; GFX90A:       ; %bb.0: ; %entry
-; GFX90A-NEXT:    s_load_dword s6, s[0:1], 0x2c
-; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
-; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_cmp_lt_u32 s6, 6
-; GFX90A-NEXT:    s_cselect_b64 s[0:1], s[2:3], 0
-; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
-; GFX90A-NEXT:    s_endpgm
 entry:
   %0 = icmp ugt i32 %cond, 5
   %1 = select i1 %0, i64 0, i64 %in
@@ -55,48 +15,12 @@ entry:
   ret void
 }
 
+; GCN-LABEL: {{^}}select_trunc_i64:
+; VI: s_cselect_b32
+; VI-NOT: s_cselect_b32
+; SI: v_cndmask_b32
+; SI-NOT: v_cndmask_b32
 define amdgpu_kernel void @select_trunc_i64(i32 addrspace(1)* %out, i32 %cond, i64 %in) nounwind {
-; SI-LABEL: select_trunc_i64:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
-; SI-NEXT:    s_load_dword s5, s[0:1], 0xd
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s2, -1
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_cmp_lt_u32 s4, 6
-; SI-NEXT:    v_mov_b32_e32 v0, s5
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
-; SI-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
-; SI-NEXT:    s_endpgm
-;
-; VI-LABEL: select_trunc_i64:
-; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT:    s_load_dword s3, s[0:1], 0x34
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_cmp_lt_u32 s2, 6
-; VI-NEXT:    s_cselect_b32 s2, s3, 0
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    flat_store_dword v[0:1], v2
-; VI-NEXT:    s_endpgm
-;
-; GFX90A-LABEL: select_trunc_i64:
-; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_load_dword s4, s[0:1], 0x2c
-; GFX90A-NEXT:    s_load_dword s5, s[0:1], 0x34
-; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_cmp_lt_u32 s4, 6
-; GFX90A-NEXT:    s_cselect_b32 s0, s5, 0
-; GFX90A-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90A-NEXT:    global_store_dword v0, v1, s[2:3]
-; GFX90A-NEXT:    s_endpgm
   %cmp = icmp ugt i32 %cond, 5
   %sel = select i1 %cmp, i64 0, i64 %in
   %trunc = trunc i64 %sel to i32
@@ -104,49 +28,12 @@ define amdgpu_kernel void @select_trunc_i64(i32 addrspace(1)* %out, i32 %cond, i
   ret void
 }
 
+; GCN-LABEL: {{^}}select_trunc_i64_2:
+; VI: s_cselect_b32
+; VI-NOT: s_cselect_b32
+; SI: v_cndmask_b32
+; SI-NOT: v_cndmask_b32
 define amdgpu_kernel void @select_trunc_i64_2(i32 addrspace(1)* %out, i32 %cond, i64 %a, i64 %b) nounwind {
-; SI-LABEL: select_trunc_i64_2:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dword s8, s[0:1], 0xb
-; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s2, -1
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_cmp_gt_u32 s8, 5
-; SI-NEXT:    v_mov_b32_e32 v0, s6
-; SI-NEXT:    v_mov_b32_e32 v1, s4
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
-; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
-; SI-NEXT:    s_endpgm
-;
-; VI-LABEL: select_trunc_i64_2:
-; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_cmp_gt_u32 s2, 5
-; VI-NEXT:    s_cselect_b32 s2, s4, s6
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    flat_store_dword v[0:1], v2
-; VI-NEXT:    s_endpgm
-;
-; GFX90A-LABEL: select_trunc_i64_2:
-; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_load_dword s8, s[0:1], 0x2c
-; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
-; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_cmp_gt_u32 s8, 5
-; GFX90A-NEXT:    s_cselect_b32 s0, s4, s6
-; GFX90A-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90A-NEXT:    global_store_dword v0, v1, s[2:3]
-; GFX90A-NEXT:    s_endpgm
   %cmp = icmp ugt i32 %cond, 5
   %sel = select i1 %cmp, i64 %a, i64 %b
   %trunc = trunc i64 %sel to i32
@@ -154,58 +41,12 @@ define amdgpu_kernel void @select_trunc_i64_2(i32 addrspace(1)* %out, i32 %cond,
   ret void
 }
 
+; GCN-LABEL: {{^}}v_select_trunc_i64_2:
+; VI: s_cselect_b32
+; VI-NOT: s_cselect_b32
+; SI: v_cndmask_b32
+; SI-NOT: v_cndmask_b32
 define amdgpu_kernel void @v_select_trunc_i64_2(i32 addrspace(1)* %out, i32 %cond, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
-; SI-LABEL: v_select_trunc_i64_2:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
-; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x9
-; SI-NEXT:    s_load_dword s0, s[0:1], 0xb
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, -1
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_load_dword s1, s[6:7], 0x0
-; SI-NEXT:    s_load_dword s2, s[4:5], 0x0
-; SI-NEXT:    s_cmp_gt_u32 s0, 5
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s1
-; SI-NEXT:    v_mov_b32_e32 v1, s2
-; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
-; SI-NEXT:    s_endpgm
-;
-; VI-LABEL: v_select_trunc_i64_2:
-; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
-; VI-NEXT:    s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_load_dword s3, s[4:5], 0x0
-; VI-NEXT:    s_load_dword s4, s[6:7], 0x0
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT:    s_cmp_gt_u32 s2, 5
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_cselect_b32 s2, s3, s4
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    flat_store_dword v[0:1], v2
-; VI-NEXT:    s_endpgm
-;
-; GFX90A-LABEL: v_select_trunc_i64_2:
-; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
-; GFX90A-NEXT:    s_load_dword s8, s[0:1], 0x2c
-; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_load_dword s9, s[4:5], 0x0
-; GFX90A-NEXT:    s_load_dword s10, s[6:7], 0x0
-; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX90A-NEXT:    s_cmp_gt_u32 s8, 5
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_cselect_b32 s0, s9, s10
-; GFX90A-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90A-NEXT:    global_store_dword v0, v1, s[2:3]
-; GFX90A-NEXT:    s_endpgm
   %cmp = icmp ugt i32 %cond, 5
   %a = load i64, i64 addrspace(1)* %aptr, align 8
   %b = load i64, i64 addrspace(1)* %bptr, align 8
@@ -215,61 +56,11 @@ define amdgpu_kernel void @v_select_trunc_i64_2(i32 addrspace(1)* %out, i32 %con
   ret void
 }
 
+; GCN-LABEL: {{^}}v_select_i64_split_imm:
+; GCN-DAG: v_cndmask_b32_e32 {{v[0-9]+}}, 0, {{v[0-9]+}}
+; GCN-DAG: v_cndmask_b32_e32 {{v[0-9]+}}, 63, {{v[0-9]+}}
+; GCN: s_endpgm
 define amdgpu_kernel void @v_select_i64_split_imm(i64 addrspace(1)* %out, i32 %cond, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
-; SI-LABEL: v_select_i64_split_imm:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
-; SI-NEXT:    s_load_dword s6, s[0:1], 0xb
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
-; SI-NEXT:    s_cmp_gt_u32 s6, 5
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
-; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s5
-; SI-NEXT:    v_mov_b32_e32 v2, s4
-; SI-NEXT:    s_mov_b32 s2, -1
-; SI-NEXT:    v_cndmask_b32_e32 v1, 63, v0, vcc
-; SI-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
-; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
-; SI-NEXT:    s_endpgm
-;
-; VI-LABEL: v_select_i64_split_imm:
-; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; VI-NEXT:    s_load_dword s6, s[0:1], 0x2c
-; VI-NEXT:    s_mov_b32 s4, 0
-; VI-NEXT:    s_mov_b32 s5, 63
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT:    s_cmp_gt_u32 s6, 5
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[4:5]
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_mov_b32_e32 v3, s3
-; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
-; VI-NEXT:    s_endpgm
-;
-; GFX90A-LABEL: v_select_i64_split_imm:
-; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX90A-NEXT:    s_load_dword s6, s[0:1], 0x2c
-; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
-; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GFX90A-NEXT:    s_mov_b32 s2, 0
-; GFX90A-NEXT:    s_cmp_gt_u32 s6, 5
-; GFX90A-NEXT:    s_mov_b32 s3, 63
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[2:3]
-; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
-; GFX90A-NEXT:    s_endpgm
   %cmp = icmp ugt i32 %cond, 5
   %a = load i64, i64 addrspace(1)* %aptr, align 8
   %b = load i64, i64 addrspace(1)* %bptr, align 8

diff  --git a/llvm/test/CodeGen/AMDGPU/selectcc.ll b/llvm/test/CodeGen/AMDGPU/selectcc.ll
index 48127d493fbc..54a26a4cf676 100644
--- a/llvm/test/CodeGen/AMDGPU/selectcc.ll
+++ b/llvm/test/CodeGen/AMDGPU/selectcc.ll
@@ -1,6 +1,6 @@
 ; RUN: llc -verify-machineinstrs -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=VI -check-prefix=FUNC %s
+; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,SI -check-prefix=FUNC %s
+; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,VI -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}selectcc_i64:
 ; EG: XOR_INT
@@ -9,10 +9,9 @@
 ; EG: CNDE_INT
 ; EG: CNDE_INT
 ; SI: v_cmp_eq_u64
-; SI: v_cndmask
-; SI: v_cndmask
 ; VI: s_cmp_eq_u64
-; VI: s_cselect_b64
+; GCN: v_cndmask
+; GCN: v_cndmask
 define amdgpu_kernel void @selectcc_i64(i64 addrspace(1) * %out, i64 %lhs, i64 %rhs, i64 %true, i64 %false) {
 entry:
   %0 = icmp eq i64 %lhs, %rhs

diff  --git a/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll b/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll
index 651567fe602a..62ae206572b6 100644
--- a/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll
@@ -16,10 +16,10 @@ define amdgpu_kernel void @sint_to_fp_i32_to_f64(double addrspace(1)* %out, i32
 
 ; GCN-LABEL: {{^}}sint_to_fp_i1_f64:
 ; VI-DAG: s_cmp_eq_u32
-; VI-DAG: s_cselect_b64 s{{\[}}[[S_LO:[0-9]+]]:[[S_HI:[0-9]+]]{{\]}}, -1.0, 0
-; VI-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[S_LO]]
-; VI-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[S_HI]]
-; VI: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
+; VI-DAG: s_cselect_b32 s[[SSEL:[0-9]+]], 0xbff00000, 0
+; VI-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
+; VI-DAG: v_mov_b32_e32 v[[SEL:[0-9]+]], s[[SSEL]]
+; VI: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[ZERO]]:[[SEL]]{{\]}}
 ; VI: s_endpgm
 
 ; SI-DAG: s_cmp_eq_u32

diff  --git a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
index d35af1510218..1f26cd39c4b8 100644
--- a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
@@ -76,15 +76,13 @@ define amdgpu_kernel void @s_uint_to_fp_v4i32_to_v4f64(<4 x double> addrspace(1)
 
 ; GCN-LABEL: {{^}}uint_to_fp_i1_to_f64:
 ; VI-DAG: s_cmp_eq_u32
-; VI-DAG: s_cselect_b64 s{{\[}}[[S_LO:[0-9]+]]:[[S_HI:[0-9]+]]{{\]}}, 1.0, 0
-; VI-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[S_LO]]
-; VI-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[S_HI]]
-; VI: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
+; VI-DAG: s_cselect_b32 s[[SSEL:[0-9]+]], 0x3ff00000, 0
+; VI-DAG: v_mov_b32_e32 v[[SEL:[0-9]+]], s[[SSEL]]
 ; SI-DAG: s_cmp_eq_u32
 ; SI-DAG: s_cselect_b64 vcc, -1, 0
 ; SI-DAG: v_cndmask_b32_e32 v[[SEL:[0-9]+]], 0, {{v[0-9]+}}, vcc
-; SI-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
-; SI: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[ZERO]]:[[SEL]]{{\]}}
+; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[ZERO]]:[[SEL]]{{\]}}
 ; GCN: s_endpgm
 define amdgpu_kernel void @uint_to_fp_i1_to_f64(double addrspace(1)* %out, i32 %in) {
   %cmp = icmp eq i32 %in, 0


        


More information about the llvm-commits mailing list