[llvm] 96db12d - AMDGPU/GlobalISel: Custom lower 32-bit G_UDIV/G_UREM

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Mon Feb 17 11:10:20 PST 2020


Author: Matt Arsenault
Date: 2020-02-17T11:05:50-08:00
New Revision: 96db12d507fbac8b67278775d3234fa9b8178d22

URL: https://github.com/llvm/llvm-project/commit/96db12d507fbac8b67278775d3234fa9b8178d22
DIFF: https://github.com/llvm/llvm-project/commit/96db12d507fbac8b67278775d3234fa9b8178d22.diff

LOG: AMDGPU/GlobalISel: Custom lower 32-bit G_UDIV/G_UREM

AMDGPUCodeGenPrepare expands this most of the time, but not always. We
will always at least need a fallback option here. This is the 3rd
implementation of the same expansion in the backend. Eventually I
would like to eliminate the IR expansion (and the DAG version
obviously).

Currently the new legalizer path produces a better result, since the
IR expansion introduces extra operations that need to be combined
away. Notably, the IR expansion produces multiplies by 0.
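
For intuition, here is a rough standalone sketch in plain C++ of the
recipe this expansion follows (the same expansion the DAG version
implements): estimate 2^32/Den from the hardware float reciprocal,
refine that estimate once, then correct the quotient by at most one.
A plain float divide stands in for the rcp instruction, the helper
name is made up, and this is illustrative only, not code from the
patch:

  #include <cstdint>

  // Illustrative only. Den == 0 gives an unspecified result, as with
  // the hardware expansion itself.
  static uint32_t udivrem32(uint32_t Num, uint32_t Den, bool IsRem) {
    // RCP ~= 2^32 / Den + e, where e is rounding error. 4294967296.0f
    // is the float with bits 0x4f800000; the cast mirrors G_FPTOUI and
    // assumes the product is in range.
    uint32_t RCP = (uint32_t)((1.0f / (float)Den) * 4294967296.0f);

    // Low and high halves of RCP * Den.
    uint32_t RCP_LO = RCP * Den;
    uint32_t RCP_HI = (uint32_t)(((uint64_t)RCP * Den) >> 32);

    // One refinement step on the reciprocal estimate.
    uint32_t ABS_RCP_LO = (RCP_HI == 0) ? (0u - RCP_LO) : RCP_LO;
    uint32_t E = (uint32_t)(((uint64_t)ABS_RCP_LO * RCP) >> 32);
    uint32_t Tmp0 = (RCP_HI == 0) ? (RCP + E) : (RCP - E);

    // Quotient estimate; it is off by at most one in either direction.
    uint32_t Quotient = (uint32_t)(((uint64_t)Tmp0 * Num) >> 32);
    uint32_t Num_S_Remainder = Quotient * Den;
    uint32_t Remainder = Num - Num_S_Remainder;

    bool Remainder_GE_Zero = Num >= Num_S_Remainder;
    bool FixupUp = Remainder_GE_Zero && Remainder >= Den;

    if (IsRem) {
      if (!Remainder_GE_Zero)
        return Remainder + Den;          // estimate was one too high
      return FixupUp ? Remainder - Den : Remainder;
    }
    if (!Remainder_GE_Zero)
      return Quotient - 1;               // estimate was one too high
    return FixupUp ? Quotient + 1 : Quotient;
  }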

Added: 
    llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
    llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
    llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sdiv.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-srem.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-udiv.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-urem.mir

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index dad841731505..4daf9095ce7f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -308,7 +308,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
 
   // FIXME: Not really legal. Placeholder for custom lowering.
   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
-    .legalFor({S32, S64})
+    .customFor({S32, S64})
     .clampScalar(0, S32, S64)
     .widenScalarToNextPow2(0, 32)
     .scalarize(0);
@@ -1350,6 +1350,9 @@ bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
     return legalizeFMad(MI, MRI, B);
   case TargetOpcode::G_FDIV:
     return legalizeFDIV(MI, MRI, B);
+  case TargetOpcode::G_UDIV:
+  case TargetOpcode::G_UREM:
+    return legalizeUDIV_UREM(MI, MRI, B);
   case TargetOpcode::G_ATOMIC_CMPXCHG:
     return legalizeAtomicCmpXChg(MI, MRI, B);
   case TargetOpcode::G_FLOG:
@@ -2315,6 +2318,122 @@ bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
   return false;
 }
 
+static Register buildDivRCP(MachineIRBuilder &B, Register Src) {
+  const LLT S32 = LLT::scalar(32);
+
+  auto Cvt0 = B.buildUITOFP(S32, Src);
+  auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Cvt0});
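+  // 0x4f800000 is 2^32 (one past UINT32_MAX) as a float.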
+  auto FPUIntMaxPlus1 = B.buildFConstant(S32, BitsToFloat(0x4f800000));
+  auto Mul = B.buildFMul(S32, RcpIFlag, FPUIntMaxPlus1);
+  return B.buildFPTOUI(S32, Mul).getReg(0);
+}
+
+bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
+                                              MachineRegisterInfo &MRI,
+                                              MachineIRBuilder &B) const {
+  B.setInstr(MI);
+  bool IsRem = MI.getOpcode() == AMDGPU::G_UREM;
+
+  const LLT S1 = LLT::scalar(1);
+  const LLT S32 = LLT::scalar(32);
+
+  Register DstReg = MI.getOperand(0).getReg();
+  Register Num = MI.getOperand(1).getReg();
+  Register Den = MI.getOperand(2).getReg();
+
+  // RCP = URECIP(Den) = 2^32 / Den + e
+  // e is rounding error.
+  auto RCP = buildDivRCP(B, Den);
+
+  // RCP_LO = mul(RCP, Den)
+  auto RCP_LO = B.buildMul(S32, RCP, Den);
+
+  // RCP_HI = mulhu(RCP, Den)
+  auto RCP_HI = B.buildUMulH(S32, RCP, Den);
+
+  // NEG_RCP_LO = -RCP_LO
+  auto Zero = B.buildConstant(S32, 0);
+  auto NEG_RCP_LO = B.buildSub(S32, Zero, RCP_LO);
+
+  // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
+  auto CmpRcpHiZero = B.buildICmp(CmpInst::ICMP_EQ, S1, RCP_HI, Zero);
+  auto ABS_RCP_LO = B.buildSelect(S32, CmpRcpHiZero, NEG_RCP_LO, RCP_LO);
+
+  // Calculate the rounding error from the URECIP instruction
+  // E = mulhu(ABS_RCP_LO, RCP)
+  auto E = B.buildUMulH(S32, ABS_RCP_LO, RCP);
+
+  // RCP_A_E = RCP + E
+  auto RCP_A_E = B.buildAdd(S32, RCP, E);
+
+  // RCP_S_E = RCP - E
+  auto RCP_S_E = B.buildSub(S32, RCP, E);
+
+  // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_S_E)
+  auto Tmp0 = B.buildSelect(S32, CmpRcpHiZero, RCP_A_E, RCP_S_E);
+
+  // Quotient = mulhu(Tmp0, Num)
+  auto Quotient = B.buildUMulH(S32, Tmp0, Num);
+
+  // Num_S_Remainder = Quotient * Den
+  auto Num_S_Remainder = B.buildMul(S32, Quotient, Den);
+
+  // Remainder = Num - Num_S_Remainder
+  auto Remainder = B.buildSub(S32, Num, Num_S_Remainder);
+
+  // Remainder_GE_Den = Remainder >= Den
+  auto Remainder_GE_Den = B.buildICmp(CmpInst::ICMP_UGE, S1, Remainder, Den);
+
+  // Remainder_GE_Zero = Num >= Num_S_Remainder
+  auto Remainder_GE_Zero = B.buildICmp(CmpInst::ICMP_UGE, S1,
+                                       Num, Num_S_Remainder);
+
+  // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
+  auto Tmp1 = B.buildAnd(S1, Remainder_GE_Den, Remainder_GE_Zero);
+
+  // Calculate Division result:
+
+  // Quotient_A_One = Quotient + 1
+  auto One = B.buildConstant(S32, 1);
+  auto Quotient_A_One = B.buildAdd(S32, Quotient, One);
+
+  // Quotient_S_One = Quotient - 1
+  auto Quotient_S_One = B.buildSub(S32, Quotient, One);
+
+  // Div = (Tmp1 == 0 ? Quotient_A_One : Quotient)
+  auto Div = B.buildSelect(S32, Tmp1, Quotient, Quotient_A_One);
+
+  // Div = (Remainder_GE_Zero ? Div : Quotient_S_One)
+  if (IsRem) {
+    Div = B.buildSelect(S32, Remainder_GE_Zero, Div, Quotient_S_One);
+
+    // Calculate Rem result:
+
+    // Remainder_S_Den = Remainder - Den
+    auto Remainder_S_Den = B.buildSub(S32, Remainder, Den);
+
+    // Remainder_A_Den = Remainder + Den
+    auto Remainder_A_Den = B.buildAdd(S32, Remainder, Den);
+
+    // Rem = (Tmp1 ? Remainder_S_Den : Remainder)
+    auto Rem = B.buildSelect(S32, Tmp1, Remainder_S_Den, Remainder);
+
+    // Rem = (Remainder_GE_Zero ? Rem : Remainder_A_Den)
+    B.buildSelect(DstReg, Remainder_GE_Zero, Rem, Remainder_A_Den);
+  } else {
+    B.buildSelect(DstReg, Remainder_GE_Zero, Div, Quotient_S_One);
+  }
+
+  MI.eraseFromParent();
+  return true;
+}
+
+bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
+                                            MachineRegisterInfo &MRI,
+                                            MachineIRBuilder &B) const {
+  if (MRI.getType(MI.getOperand(0).getReg()) == LLT::scalar(32))
+    return legalizeUDIV_UREM32(MI, MRI, B);
+  return false;
+}
+
 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
                                                  MachineRegisterInfo &MRI,
                                                  MachineIRBuilder &B) const {

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
index 184f4bda62f8..777c34cbd190 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -97,6 +97,11 @@ class AMDGPULegalizerInfo : public LegalizerInfo {
     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const;
 
+  bool legalizeUDIV_UREM(MachineInstr &MI, MachineRegisterInfo &MRI,
+                         MachineIRBuilder &B) const;
+  bool legalizeUDIV_UREM32(MachineInstr &MI, MachineRegisterInfo &MRI,
+                           MachineIRBuilder &B) const;
+
   bool legalizeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI,
                     MachineIRBuilder &B) const;
   bool legalizeFDIV16(MachineInstr &MI, MachineRegisterInfo &MRI,

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sdiv.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sdiv.mir
index 30065daa3bde..8530fa5fb70b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sdiv.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sdiv.mir
@@ -144,33 +144,36 @@ body: |
     ; GFX6-LABEL: name: test_sdiv_s16
     ; GFX6: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX6: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
-    ; GFX6: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
-    ; GFX6: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY2]], 16
-    ; GFX6: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
-    ; GFX6: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY3]], 16
-    ; GFX6: [[SDIV:%[0-9]+]]:_(s32) = G_SDIV [[SEXT_INREG]], [[SEXT_INREG1]]
-    ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY [[SDIV]](s32)
-    ; GFX6: $vgpr0 = COPY [[COPY4]](s32)
+    ; GFX6: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+    ; GFX6: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+    ; GFX6: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[TRUNC]](s16)
+    ; GFX6: [[SEXT1:%[0-9]+]]:_(s32) = G_SEXT [[TRUNC1]](s16)
+    ; GFX6: [[SDIV:%[0-9]+]]:_(s32) = G_SDIV [[SEXT]], [[SEXT1]]
+    ; GFX6: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[SDIV]](s32)
+    ; GFX6: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC2]](s16)
+    ; GFX6: $vgpr0 = COPY [[ANYEXT]](s32)
     ; GFX8-LABEL: name: test_sdiv_s16
     ; GFX8: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX8: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
-    ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
-    ; GFX8: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY2]], 16
-    ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
-    ; GFX8: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY3]], 16
-    ; GFX8: [[SDIV:%[0-9]+]]:_(s32) = G_SDIV [[SEXT_INREG]], [[SEXT_INREG1]]
-    ; GFX8: [[COPY4:%[0-9]+]]:_(s32) = COPY [[SDIV]](s32)
-    ; GFX8: $vgpr0 = COPY [[COPY4]](s32)
+    ; GFX8: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+    ; GFX8: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+    ; GFX8: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[TRUNC]](s16)
+    ; GFX8: [[SEXT1:%[0-9]+]]:_(s32) = G_SEXT [[TRUNC1]](s16)
+    ; GFX8: [[SDIV:%[0-9]+]]:_(s32) = G_SDIV [[SEXT]], [[SEXT1]]
+    ; GFX8: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[SDIV]](s32)
+    ; GFX8: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC2]](s16)
+    ; GFX8: $vgpr0 = COPY [[ANYEXT]](s32)
     ; GFX9-LABEL: name: test_sdiv_s16
     ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
-    ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
-    ; GFX9: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY2]], 16
-    ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
-    ; GFX9: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY3]], 16
-    ; GFX9: [[SDIV:%[0-9]+]]:_(s32) = G_SDIV [[SEXT_INREG]], [[SEXT_INREG1]]
-    ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY [[SDIV]](s32)
-    ; GFX9: $vgpr0 = COPY [[COPY4]](s32)
+    ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+    ; GFX9: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+    ; GFX9: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[TRUNC]](s16)
+    ; GFX9: [[SEXT1:%[0-9]+]]:_(s32) = G_SEXT [[TRUNC1]](s16)
+    ; GFX9: [[SDIV:%[0-9]+]]:_(s32) = G_SDIV [[SEXT]], [[SEXT1]]
+    ; GFX9: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[SDIV]](s32)
+    ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC2]](s16)
+    ; GFX9: $vgpr0 = COPY [[ANYEXT]](s32)
     %0:_(s32) = COPY $vgpr0
     %1:_(s32) = COPY $vgpr1
     %2:_(s16) = G_TRUNC %0
@@ -189,79 +192,39 @@ body: |
     ; GFX6-LABEL: name: test_sdiv_v2s16
     ; GFX6: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
     ; GFX6: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
-    ; GFX6: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
-    ; GFX6: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX6: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
-    ; GFX6: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>)
-    ; GFX6: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
-    ; GFX6: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
-    ; GFX6: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY2]], 16
-    ; GFX6: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32)
-    ; GFX6: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY3]], 16
-    ; GFX6: [[SDIV:%[0-9]+]]:_(s32) = G_SDIV [[SEXT_INREG]], [[SEXT_INREG1]]
-    ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
-    ; GFX6: [[SEXT_INREG2:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY4]], 16
-    ; GFX6: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
-    ; GFX6: [[SEXT_INREG3:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY5]], 16
-    ; GFX6: [[SDIV1:%[0-9]+]]:_(s32) = G_SDIV [[SEXT_INREG2]], [[SEXT_INREG3]]
-    ; GFX6: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
-    ; GFX6: [[COPY6:%[0-9]+]]:_(s32) = COPY [[SDIV]](s32)
-    ; GFX6: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C1]]
-    ; GFX6: [[COPY7:%[0-9]+]]:_(s32) = COPY [[SDIV1]](s32)
-    ; GFX6: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C1]]
-    ; GFX6: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32)
-    ; GFX6: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
-    ; GFX6: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
-    ; GFX6: $vgpr0 = COPY [[BITCAST2]](<2 x s16>)
+    ; GFX6: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY]](<2 x s16>)
+    ; GFX6: [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY1]](<2 x s16>)
+    ; GFX6: [[SDIV:%[0-9]+]]:_(s16) = G_SDIV [[UV]], [[UV2]]
+    ; GFX6: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[UV1]](s16)
+    ; GFX6: [[SEXT1:%[0-9]+]]:_(s32) = G_SEXT [[UV3]](s16)
+    ; GFX6: [[SDIV1:%[0-9]+]]:_(s32) = G_SDIV [[SEXT]], [[SEXT1]]
+    ; GFX6: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[SDIV1]](s32)
+    ; GFX6: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[SDIV]](s16), [[TRUNC]](s16)
+    ; GFX6: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>)
     ; GFX8-LABEL: name: test_sdiv_v2s16
     ; GFX8: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
     ; GFX8: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
-    ; GFX8: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
-    ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX8: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
-    ; GFX8: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>)
-    ; GFX8: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
-    ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
-    ; GFX8: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY2]], 16
-    ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32)
-    ; GFX8: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY3]], 16
-    ; GFX8: [[SDIV:%[0-9]+]]:_(s32) = G_SDIV [[SEXT_INREG]], [[SEXT_INREG1]]
-    ; GFX8: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
-    ; GFX8: [[SEXT_INREG2:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY4]], 16
-    ; GFX8: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
-    ; GFX8: [[SEXT_INREG3:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY5]], 16
-    ; GFX8: [[SDIV1:%[0-9]+]]:_(s32) = G_SDIV [[SEXT_INREG2]], [[SEXT_INREG3]]
-    ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
-    ; GFX8: [[COPY6:%[0-9]+]]:_(s32) = COPY [[SDIV]](s32)
-    ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C1]]
-    ; GFX8: [[COPY7:%[0-9]+]]:_(s32) = COPY [[SDIV1]](s32)
-    ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C1]]
-    ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32)
-    ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
-    ; GFX8: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
-    ; GFX8: $vgpr0 = COPY [[BITCAST2]](<2 x s16>)
+    ; GFX8: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY]](<2 x s16>)
+    ; GFX8: [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY1]](<2 x s16>)
+    ; GFX8: [[SDIV:%[0-9]+]]:_(s16) = G_SDIV [[UV]], [[UV2]]
+    ; GFX8: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[UV1]](s16)
+    ; GFX8: [[SEXT1:%[0-9]+]]:_(s32) = G_SEXT [[UV3]](s16)
+    ; GFX8: [[SDIV1:%[0-9]+]]:_(s32) = G_SDIV [[SEXT]], [[SEXT1]]
+    ; GFX8: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[SDIV1]](s32)
+    ; GFX8: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[SDIV]](s16), [[TRUNC]](s16)
+    ; GFX8: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>)
     ; GFX9-LABEL: name: test_sdiv_v2s16
     ; GFX9: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
     ; GFX9: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
-    ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
-    ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
-    ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>)
-    ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
-    ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
-    ; GFX9: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY2]], 16
-    ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32)
-    ; GFX9: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY3]], 16
-    ; GFX9: [[SDIV:%[0-9]+]]:_(s32) = G_SDIV [[SEXT_INREG]], [[SEXT_INREG1]]
-    ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
-    ; GFX9: [[SEXT_INREG2:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY4]], 16
-    ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
-    ; GFX9: [[SEXT_INREG3:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY5]], 16
-    ; GFX9: [[SDIV1:%[0-9]+]]:_(s32) = G_SDIV [[SEXT_INREG2]], [[SEXT_INREG3]]
-    ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY [[SDIV]](s32)
-    ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY [[SDIV1]](s32)
-    ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY6]](s32), [[COPY7]](s32)
-    ; GFX9: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>)
+    ; GFX9: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY]](<2 x s16>)
+    ; GFX9: [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY1]](<2 x s16>)
+    ; GFX9: [[SDIV:%[0-9]+]]:_(s16) = G_SDIV [[UV]], [[UV2]]
+    ; GFX9: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[UV1]](s16)
+    ; GFX9: [[SEXT1:%[0-9]+]]:_(s32) = G_SEXT [[UV3]](s16)
+    ; GFX9: [[SDIV1:%[0-9]+]]:_(s32) = G_SDIV [[SEXT]], [[SEXT1]]
+    ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[SDIV1]](s32)
+    ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[SDIV]](s16), [[TRUNC]](s16)
+    ; GFX9: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>)
     %0:_(<2 x s16>) = COPY $vgpr0
     %1:_(<2 x s16>) = COPY $vgpr1
     %2:_(<2 x s16>) = G_SDIV %0, %1
@@ -277,33 +240,36 @@ body: |
     ; GFX6-LABEL: name: test_sdiv_s7
     ; GFX6: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX6: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
-    ; GFX6: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
-    ; GFX6: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY2]], 7
-    ; GFX6: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
-    ; GFX6: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY3]], 7
-    ; GFX6: [[SDIV:%[0-9]+]]:_(s32) = G_SDIV [[SEXT_INREG]], [[SEXT_INREG1]]
-    ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY [[SDIV]](s32)
-    ; GFX6: $vgpr0 = COPY [[COPY4]](s32)
+    ; GFX6: [[TRUNC:%[0-9]+]]:_(s7) = G_TRUNC [[COPY]](s32)
+    ; GFX6: [[TRUNC1:%[0-9]+]]:_(s7) = G_TRUNC [[COPY1]](s32)
+    ; GFX6: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[TRUNC]](s7)
+    ; GFX6: [[SEXT1:%[0-9]+]]:_(s32) = G_SEXT [[TRUNC1]](s7)
+    ; GFX6: [[SDIV:%[0-9]+]]:_(s32) = G_SDIV [[SEXT]], [[SEXT1]]
+    ; GFX6: [[TRUNC2:%[0-9]+]]:_(s7) = G_TRUNC [[SDIV]](s32)
+    ; GFX6: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC2]](s7)
+    ; GFX6: $vgpr0 = COPY [[ANYEXT]](s32)
     ; GFX8-LABEL: name: test_sdiv_s7
     ; GFX8: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX8: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
-    ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
-    ; GFX8: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY2]], 7
-    ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
-    ; GFX8: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY3]], 7
-    ; GFX8: [[SDIV:%[0-9]+]]:_(s32) = G_SDIV [[SEXT_INREG]], [[SEXT_INREG1]]
-    ; GFX8: [[COPY4:%[0-9]+]]:_(s32) = COPY [[SDIV]](s32)
-    ; GFX8: $vgpr0 = COPY [[COPY4]](s32)
+    ; GFX8: [[TRUNC:%[0-9]+]]:_(s7) = G_TRUNC [[COPY]](s32)
+    ; GFX8: [[TRUNC1:%[0-9]+]]:_(s7) = G_TRUNC [[COPY1]](s32)
+    ; GFX8: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[TRUNC]](s7)
+    ; GFX8: [[SEXT1:%[0-9]+]]:_(s32) = G_SEXT [[TRUNC1]](s7)
+    ; GFX8: [[SDIV:%[0-9]+]]:_(s32) = G_SDIV [[SEXT]], [[SEXT1]]
+    ; GFX8: [[TRUNC2:%[0-9]+]]:_(s7) = G_TRUNC [[SDIV]](s32)
+    ; GFX8: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC2]](s7)
+    ; GFX8: $vgpr0 = COPY [[ANYEXT]](s32)
     ; GFX9-LABEL: name: test_sdiv_s7
     ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
-    ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
-    ; GFX9: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY2]], 7
-    ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
-    ; GFX9: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY3]], 7
-    ; GFX9: [[SDIV:%[0-9]+]]:_(s32) = G_SDIV [[SEXT_INREG]], [[SEXT_INREG1]]
-    ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY [[SDIV]](s32)
-    ; GFX9: $vgpr0 = COPY [[COPY4]](s32)
+    ; GFX9: [[TRUNC:%[0-9]+]]:_(s7) = G_TRUNC [[COPY]](s32)
+    ; GFX9: [[TRUNC1:%[0-9]+]]:_(s7) = G_TRUNC [[COPY1]](s32)
+    ; GFX9: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[TRUNC]](s7)
+    ; GFX9: [[SEXT1:%[0-9]+]]:_(s32) = G_SEXT [[TRUNC1]](s7)
+    ; GFX9: [[SDIV:%[0-9]+]]:_(s32) = G_SDIV [[SEXT]], [[SEXT1]]
+    ; GFX9: [[TRUNC2:%[0-9]+]]:_(s7) = G_TRUNC [[SDIV]](s32)
+    ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC2]](s7)
+    ; GFX9: $vgpr0 = COPY [[ANYEXT]](s32)
     %0:_(s32) = COPY $vgpr0
     %1:_(s32) = COPY $vgpr1
     %2:_(s7) = G_TRUNC %0
@@ -322,33 +288,36 @@ body: |
     ; GFX6-LABEL: name: test_sdiv_s17
     ; GFX6: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX6: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
-    ; GFX6: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
-    ; GFX6: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY2]], 17
-    ; GFX6: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
-    ; GFX6: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY3]], 17
-    ; GFX6: [[SDIV:%[0-9]+]]:_(s32) = G_SDIV [[SEXT_INREG]], [[SEXT_INREG1]]
-    ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY [[SDIV]](s32)
-    ; GFX6: $vgpr0 = COPY [[COPY4]](s32)
+    ; GFX6: [[TRUNC:%[0-9]+]]:_(s17) = G_TRUNC [[COPY]](s32)
+    ; GFX6: [[TRUNC1:%[0-9]+]]:_(s17) = G_TRUNC [[COPY1]](s32)
+    ; GFX6: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[TRUNC]](s17)
+    ; GFX6: [[SEXT1:%[0-9]+]]:_(s32) = G_SEXT [[TRUNC1]](s17)
+    ; GFX6: [[SDIV:%[0-9]+]]:_(s32) = G_SDIV [[SEXT]], [[SEXT1]]
+    ; GFX6: [[TRUNC2:%[0-9]+]]:_(s17) = G_TRUNC [[SDIV]](s32)
+    ; GFX6: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC2]](s17)
+    ; GFX6: $vgpr0 = COPY [[ANYEXT]](s32)
     ; GFX8-LABEL: name: test_sdiv_s17
     ; GFX8: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX8: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
-    ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
-    ; GFX8: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY2]], 17
-    ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
-    ; GFX8: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY3]], 17
-    ; GFX8: [[SDIV:%[0-9]+]]:_(s32) = G_SDIV [[SEXT_INREG]], [[SEXT_INREG1]]
-    ; GFX8: [[COPY4:%[0-9]+]]:_(s32) = COPY [[SDIV]](s32)
-    ; GFX8: $vgpr0 = COPY [[COPY4]](s32)
+    ; GFX8: [[TRUNC:%[0-9]+]]:_(s17) = G_TRUNC [[COPY]](s32)
+    ; GFX8: [[TRUNC1:%[0-9]+]]:_(s17) = G_TRUNC [[COPY1]](s32)
+    ; GFX8: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[TRUNC]](s17)
+    ; GFX8: [[SEXT1:%[0-9]+]]:_(s32) = G_SEXT [[TRUNC1]](s17)
+    ; GFX8: [[SDIV:%[0-9]+]]:_(s32) = G_SDIV [[SEXT]], [[SEXT1]]
+    ; GFX8: [[TRUNC2:%[0-9]+]]:_(s17) = G_TRUNC [[SDIV]](s32)
+    ; GFX8: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC2]](s17)
+    ; GFX8: $vgpr0 = COPY [[ANYEXT]](s32)
     ; GFX9-LABEL: name: test_sdiv_s17
     ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
-    ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
-    ; GFX9: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY2]], 17
-    ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
-    ; GFX9: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY3]], 17
-    ; GFX9: [[SDIV:%[0-9]+]]:_(s32) = G_SDIV [[SEXT_INREG]], [[SEXT_INREG1]]
-    ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY [[SDIV]](s32)
-    ; GFX9: $vgpr0 = COPY [[COPY4]](s32)
+    ; GFX9: [[TRUNC:%[0-9]+]]:_(s17) = G_TRUNC [[COPY]](s32)
+    ; GFX9: [[TRUNC1:%[0-9]+]]:_(s17) = G_TRUNC [[COPY1]](s32)
+    ; GFX9: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[TRUNC]](s17)
+    ; GFX9: [[SEXT1:%[0-9]+]]:_(s32) = G_SEXT [[TRUNC1]](s17)
+    ; GFX9: [[SDIV:%[0-9]+]]:_(s32) = G_SDIV [[SEXT]], [[SEXT1]]
+    ; GFX9: [[TRUNC2:%[0-9]+]]:_(s17) = G_TRUNC [[SDIV]](s32)
+    ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC2]](s17)
+    ; GFX9: $vgpr0 = COPY [[ANYEXT]](s32)
     %0:_(s32) = COPY $vgpr0
     %1:_(s32) = COPY $vgpr1
     %2:_(s17) = G_TRUNC %0
@@ -367,33 +336,36 @@ body: |
     ; GFX6-LABEL: name: test_sdiv_s33
     ; GFX6: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
     ; GFX6: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
-    ; GFX6: [[COPY2:%[0-9]+]]:_(s64) = COPY [[COPY]](s64)
-    ; GFX6: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[COPY2]], 33
-    ; GFX6: [[COPY3:%[0-9]+]]:_(s64) = COPY [[COPY1]](s64)
-    ; GFX6: [[SEXT_INREG1:%[0-9]+]]:_(s64) = G_SEXT_INREG [[COPY3]], 33
-    ; GFX6: [[SDIV:%[0-9]+]]:_(s64) = G_SDIV [[SEXT_INREG]], [[SEXT_INREG1]]
-    ; GFX6: [[COPY4:%[0-9]+]]:_(s64) = COPY [[SDIV]](s64)
-    ; GFX6: $vgpr0_vgpr1 = COPY [[COPY4]](s64)
+    ; GFX6: [[TRUNC:%[0-9]+]]:_(s33) = G_TRUNC [[COPY]](s64)
+    ; GFX6: [[TRUNC1:%[0-9]+]]:_(s33) = G_TRUNC [[COPY1]](s64)
+    ; GFX6: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[TRUNC]](s33)
+    ; GFX6: [[SEXT1:%[0-9]+]]:_(s64) = G_SEXT [[TRUNC1]](s33)
+    ; GFX6: [[SDIV:%[0-9]+]]:_(s64) = G_SDIV [[SEXT]], [[SEXT1]]
+    ; GFX6: [[TRUNC2:%[0-9]+]]:_(s33) = G_TRUNC [[SDIV]](s64)
+    ; GFX6: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[TRUNC2]](s33)
+    ; GFX6: $vgpr0_vgpr1 = COPY [[ANYEXT]](s64)
     ; GFX8-LABEL: name: test_sdiv_s33
     ; GFX8: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
     ; GFX8: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
-    ; GFX8: [[COPY2:%[0-9]+]]:_(s64) = COPY [[COPY]](s64)
-    ; GFX8: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[COPY2]], 33
-    ; GFX8: [[COPY3:%[0-9]+]]:_(s64) = COPY [[COPY1]](s64)
-    ; GFX8: [[SEXT_INREG1:%[0-9]+]]:_(s64) = G_SEXT_INREG [[COPY3]], 33
-    ; GFX8: [[SDIV:%[0-9]+]]:_(s64) = G_SDIV [[SEXT_INREG]], [[SEXT_INREG1]]
-    ; GFX8: [[COPY4:%[0-9]+]]:_(s64) = COPY [[SDIV]](s64)
-    ; GFX8: $vgpr0_vgpr1 = COPY [[COPY4]](s64)
+    ; GFX8: [[TRUNC:%[0-9]+]]:_(s33) = G_TRUNC [[COPY]](s64)
+    ; GFX8: [[TRUNC1:%[0-9]+]]:_(s33) = G_TRUNC [[COPY1]](s64)
+    ; GFX8: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[TRUNC]](s33)
+    ; GFX8: [[SEXT1:%[0-9]+]]:_(s64) = G_SEXT [[TRUNC1]](s33)
+    ; GFX8: [[SDIV:%[0-9]+]]:_(s64) = G_SDIV [[SEXT]], [[SEXT1]]
+    ; GFX8: [[TRUNC2:%[0-9]+]]:_(s33) = G_TRUNC [[SDIV]](s64)
+    ; GFX8: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[TRUNC2]](s33)
+    ; GFX8: $vgpr0_vgpr1 = COPY [[ANYEXT]](s64)
     ; GFX9-LABEL: name: test_sdiv_s33
     ; GFX9: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
     ; GFX9: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
-    ; GFX9: [[COPY2:%[0-9]+]]:_(s64) = COPY [[COPY]](s64)
-    ; GFX9: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[COPY2]], 33
-    ; GFX9: [[COPY3:%[0-9]+]]:_(s64) = COPY [[COPY1]](s64)
-    ; GFX9: [[SEXT_INREG1:%[0-9]+]]:_(s64) = G_SEXT_INREG [[COPY3]], 33
-    ; GFX9: [[SDIV:%[0-9]+]]:_(s64) = G_SDIV [[SEXT_INREG]], [[SEXT_INREG1]]
-    ; GFX9: [[COPY4:%[0-9]+]]:_(s64) = COPY [[SDIV]](s64)
-    ; GFX9: $vgpr0_vgpr1 = COPY [[COPY4]](s64)
+    ; GFX9: [[TRUNC:%[0-9]+]]:_(s33) = G_TRUNC [[COPY]](s64)
+    ; GFX9: [[TRUNC1:%[0-9]+]]:_(s33) = G_TRUNC [[COPY1]](s64)
+    ; GFX9: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[TRUNC]](s33)
+    ; GFX9: [[SEXT1:%[0-9]+]]:_(s64) = G_SEXT [[TRUNC1]](s33)
+    ; GFX9: [[SDIV:%[0-9]+]]:_(s64) = G_SDIV [[SEXT]], [[SEXT1]]
+    ; GFX9: [[TRUNC2:%[0-9]+]]:_(s33) = G_TRUNC [[SDIV]](s64)
+    ; GFX9: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[TRUNC2]](s33)
+    ; GFX9: $vgpr0_vgpr1 = COPY [[ANYEXT]](s64)
     %0:_(s64) = COPY $vgpr0_vgpr1
     %1:_(s64) = COPY $vgpr2_vgpr3
     %2:_(s33) = G_TRUNC %0

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-srem.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-srem.mir
index aaf6431f2d93..7b56c3e2ece0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-srem.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-srem.mir
@@ -144,39 +144,36 @@ body: |
     ; GFX6-LABEL: name: test_srem_s16
     ; GFX6: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX6: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
-    ; GFX6: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
-    ; GFX6: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY2]], 16
-    ; GFX6: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
-    ; GFX6: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY3]], 16
-    ; GFX6: [[SREM:%[0-9]+]]:_(s32) = G_SREM [[SEXT_INREG]], [[SEXT_INREG1]]
-    ; GFX6: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
-    ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY [[SREM]](s32)
-    ; GFX6: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C]]
-    ; GFX6: $vgpr0 = COPY [[AND]](s32)
+    ; GFX6: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+    ; GFX6: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+    ; GFX6: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[TRUNC]](s16)
+    ; GFX6: [[SEXT1:%[0-9]+]]:_(s32) = G_SEXT [[TRUNC1]](s16)
+    ; GFX6: [[SREM:%[0-9]+]]:_(s32) = G_SREM [[SEXT]], [[SEXT1]]
+    ; GFX6: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[SREM]](s32)
+    ; GFX6: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[TRUNC2]](s16)
+    ; GFX6: $vgpr0 = COPY [[ZEXT]](s32)
     ; GFX8-LABEL: name: test_srem_s16
     ; GFX8: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX8: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
-    ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
-    ; GFX8: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY2]], 16
-    ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
-    ; GFX8: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY3]], 16
-    ; GFX8: [[SREM:%[0-9]+]]:_(s32) = G_SREM [[SEXT_INREG]], [[SEXT_INREG1]]
-    ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
-    ; GFX8: [[COPY4:%[0-9]+]]:_(s32) = COPY [[SREM]](s32)
-    ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C]]
-    ; GFX8: $vgpr0 = COPY [[AND]](s32)
+    ; GFX8: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+    ; GFX8: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+    ; GFX8: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[TRUNC]](s16)
+    ; GFX8: [[SEXT1:%[0-9]+]]:_(s32) = G_SEXT [[TRUNC1]](s16)
+    ; GFX8: [[SREM:%[0-9]+]]:_(s32) = G_SREM [[SEXT]], [[SEXT1]]
+    ; GFX8: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[SREM]](s32)
+    ; GFX8: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[TRUNC2]](s16)
+    ; GFX8: $vgpr0 = COPY [[ZEXT]](s32)
     ; GFX9-LABEL: name: test_srem_s16
     ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
-    ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
-    ; GFX9: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY2]], 16
-    ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
-    ; GFX9: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY3]], 16
-    ; GFX9: [[SREM:%[0-9]+]]:_(s32) = G_SREM [[SEXT_INREG]], [[SEXT_INREG1]]
-    ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
-    ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY [[SREM]](s32)
-    ; GFX9: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C]]
-    ; GFX9: $vgpr0 = COPY [[AND]](s32)
+    ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+    ; GFX9: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+    ; GFX9: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[TRUNC]](s16)
+    ; GFX9: [[SEXT1:%[0-9]+]]:_(s32) = G_SEXT [[TRUNC1]](s16)
+    ; GFX9: [[SREM:%[0-9]+]]:_(s32) = G_SREM [[SEXT]], [[SEXT1]]
+    ; GFX9: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[SREM]](s32)
+    ; GFX9: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[TRUNC2]](s16)
+    ; GFX9: $vgpr0 = COPY [[ZEXT]](s32)
     %0:_(s32) = COPY $vgpr0
     %1:_(s32) = COPY $vgpr1
     %2:_(s16) = G_TRUNC %0
@@ -195,79 +192,39 @@ body: |
     ; GFX6-LABEL: name: test_srem_v2s16
     ; GFX6: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
     ; GFX6: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
-    ; GFX6: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
-    ; GFX6: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX6: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
-    ; GFX6: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>)
-    ; GFX6: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
-    ; GFX6: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
-    ; GFX6: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY2]], 16
-    ; GFX6: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32)
-    ; GFX6: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY3]], 16
-    ; GFX6: [[SREM:%[0-9]+]]:_(s32) = G_SREM [[SEXT_INREG]], [[SEXT_INREG1]]
-    ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
-    ; GFX6: [[SEXT_INREG2:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY4]], 16
-    ; GFX6: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
-    ; GFX6: [[SEXT_INREG3:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY5]], 16
-    ; GFX6: [[SREM1:%[0-9]+]]:_(s32) = G_SREM [[SEXT_INREG2]], [[SEXT_INREG3]]
-    ; GFX6: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
-    ; GFX6: [[COPY6:%[0-9]+]]:_(s32) = COPY [[SREM]](s32)
-    ; GFX6: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C1]]
-    ; GFX6: [[COPY7:%[0-9]+]]:_(s32) = COPY [[SREM1]](s32)
-    ; GFX6: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C1]]
-    ; GFX6: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32)
-    ; GFX6: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
-    ; GFX6: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
-    ; GFX6: $vgpr0 = COPY [[BITCAST2]](<2 x s16>)
+    ; GFX6: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY]](<2 x s16>)
+    ; GFX6: [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY1]](<2 x s16>)
+    ; GFX6: [[SREM:%[0-9]+]]:_(s16) = G_SREM [[UV]], [[UV2]]
+    ; GFX6: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[UV1]](s16)
+    ; GFX6: [[SEXT1:%[0-9]+]]:_(s32) = G_SEXT [[UV3]](s16)
+    ; GFX6: [[SREM1:%[0-9]+]]:_(s32) = G_SREM [[SEXT]], [[SEXT1]]
+    ; GFX6: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[SREM1]](s32)
+    ; GFX6: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[SREM]](s16), [[TRUNC]](s16)
+    ; GFX6: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>)
     ; GFX8-LABEL: name: test_srem_v2s16
     ; GFX8: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
     ; GFX8: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
-    ; GFX8: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
-    ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX8: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
-    ; GFX8: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>)
-    ; GFX8: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
-    ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
-    ; GFX8: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY2]], 16
-    ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32)
-    ; GFX8: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY3]], 16
-    ; GFX8: [[SREM:%[0-9]+]]:_(s32) = G_SREM [[SEXT_INREG]], [[SEXT_INREG1]]
-    ; GFX8: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
-    ; GFX8: [[SEXT_INREG2:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY4]], 16
-    ; GFX8: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
-    ; GFX8: [[SEXT_INREG3:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY5]], 16
-    ; GFX8: [[SREM1:%[0-9]+]]:_(s32) = G_SREM [[SEXT_INREG2]], [[SEXT_INREG3]]
-    ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
-    ; GFX8: [[COPY6:%[0-9]+]]:_(s32) = COPY [[SREM]](s32)
-    ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C1]]
-    ; GFX8: [[COPY7:%[0-9]+]]:_(s32) = COPY [[SREM1]](s32)
-    ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C1]]
-    ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32)
-    ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
-    ; GFX8: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
-    ; GFX8: $vgpr0 = COPY [[BITCAST2]](<2 x s16>)
+    ; GFX8: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY]](<2 x s16>)
+    ; GFX8: [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY1]](<2 x s16>)
+    ; GFX8: [[SREM:%[0-9]+]]:_(s16) = G_SREM [[UV]], [[UV2]]
+    ; GFX8: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[UV1]](s16)
+    ; GFX8: [[SEXT1:%[0-9]+]]:_(s32) = G_SEXT [[UV3]](s16)
+    ; GFX8: [[SREM1:%[0-9]+]]:_(s32) = G_SREM [[SEXT]], [[SEXT1]]
+    ; GFX8: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[SREM1]](s32)
+    ; GFX8: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[SREM]](s16), [[TRUNC]](s16)
+    ; GFX8: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>)
     ; GFX9-LABEL: name: test_srem_v2s16
     ; GFX9: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
     ; GFX9: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
-    ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
-    ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
-    ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>)
-    ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
-    ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
-    ; GFX9: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY2]], 16
-    ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32)
-    ; GFX9: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY3]], 16
-    ; GFX9: [[SREM:%[0-9]+]]:_(s32) = G_SREM [[SEXT_INREG]], [[SEXT_INREG1]]
-    ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
-    ; GFX9: [[SEXT_INREG2:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY4]], 16
-    ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
-    ; GFX9: [[SEXT_INREG3:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY5]], 16
-    ; GFX9: [[SREM1:%[0-9]+]]:_(s32) = G_SREM [[SEXT_INREG2]], [[SEXT_INREG3]]
-    ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY [[SREM]](s32)
-    ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY [[SREM1]](s32)
-    ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY6]](s32), [[COPY7]](s32)
-    ; GFX9: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>)
+    ; GFX9: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY]](<2 x s16>)
+    ; GFX9: [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY1]](<2 x s16>)
+    ; GFX9: [[SREM:%[0-9]+]]:_(s16) = G_SREM [[UV]], [[UV2]]
+    ; GFX9: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[UV1]](s16)
+    ; GFX9: [[SEXT1:%[0-9]+]]:_(s32) = G_SEXT [[UV3]](s16)
+    ; GFX9: [[SREM1:%[0-9]+]]:_(s32) = G_SREM [[SEXT]], [[SEXT1]]
+    ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[SREM1]](s32)
+    ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[SREM]](s16), [[TRUNC]](s16)
+    ; GFX9: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>)
     %0:_(<2 x s16>) = COPY $vgpr0
     %1:_(<2 x s16>) = COPY $vgpr1
     %2:_(<2 x s16>) = G_SREM %0, %1
@@ -283,33 +240,36 @@ body: |
     ; GFX6-LABEL: name: test_srem_s7
     ; GFX6: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX6: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
-    ; GFX6: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
-    ; GFX6: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY2]], 7
-    ; GFX6: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
-    ; GFX6: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY3]], 7
-    ; GFX6: [[SREM:%[0-9]+]]:_(s32) = G_SREM [[SEXT_INREG]], [[SEXT_INREG1]]
-    ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY [[SREM]](s32)
-    ; GFX6: $vgpr0 = COPY [[COPY4]](s32)
+    ; GFX6: [[TRUNC:%[0-9]+]]:_(s7) = G_TRUNC [[COPY]](s32)
+    ; GFX6: [[TRUNC1:%[0-9]+]]:_(s7) = G_TRUNC [[COPY1]](s32)
+    ; GFX6: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[TRUNC]](s7)
+    ; GFX6: [[SEXT1:%[0-9]+]]:_(s32) = G_SEXT [[TRUNC1]](s7)
+    ; GFX6: [[SREM:%[0-9]+]]:_(s32) = G_SREM [[SEXT]], [[SEXT1]]
+    ; GFX6: [[TRUNC2:%[0-9]+]]:_(s7) = G_TRUNC [[SREM]](s32)
+    ; GFX6: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC2]](s7)
+    ; GFX6: $vgpr0 = COPY [[ANYEXT]](s32)
     ; GFX8-LABEL: name: test_srem_s7
     ; GFX8: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX8: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
-    ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
-    ; GFX8: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY2]], 7
-    ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
-    ; GFX8: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY3]], 7
-    ; GFX8: [[SREM:%[0-9]+]]:_(s32) = G_SREM [[SEXT_INREG]], [[SEXT_INREG1]]
-    ; GFX8: [[COPY4:%[0-9]+]]:_(s32) = COPY [[SREM]](s32)
-    ; GFX8: $vgpr0 = COPY [[COPY4]](s32)
+    ; GFX8: [[TRUNC:%[0-9]+]]:_(s7) = G_TRUNC [[COPY]](s32)
+    ; GFX8: [[TRUNC1:%[0-9]+]]:_(s7) = G_TRUNC [[COPY1]](s32)
+    ; GFX8: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[TRUNC]](s7)
+    ; GFX8: [[SEXT1:%[0-9]+]]:_(s32) = G_SEXT [[TRUNC1]](s7)
+    ; GFX8: [[SREM:%[0-9]+]]:_(s32) = G_SREM [[SEXT]], [[SEXT1]]
+    ; GFX8: [[TRUNC2:%[0-9]+]]:_(s7) = G_TRUNC [[SREM]](s32)
+    ; GFX8: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC2]](s7)
+    ; GFX8: $vgpr0 = COPY [[ANYEXT]](s32)
     ; GFX9-LABEL: name: test_srem_s7
     ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
-    ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
-    ; GFX9: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY2]], 7
-    ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
-    ; GFX9: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY3]], 7
-    ; GFX9: [[SREM:%[0-9]+]]:_(s32) = G_SREM [[SEXT_INREG]], [[SEXT_INREG1]]
-    ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY [[SREM]](s32)
-    ; GFX9: $vgpr0 = COPY [[COPY4]](s32)
+    ; GFX9: [[TRUNC:%[0-9]+]]:_(s7) = G_TRUNC [[COPY]](s32)
+    ; GFX9: [[TRUNC1:%[0-9]+]]:_(s7) = G_TRUNC [[COPY1]](s32)
+    ; GFX9: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[TRUNC]](s7)
+    ; GFX9: [[SEXT1:%[0-9]+]]:_(s32) = G_SEXT [[TRUNC1]](s7)
+    ; GFX9: [[SREM:%[0-9]+]]:_(s32) = G_SREM [[SEXT]], [[SEXT1]]
+    ; GFX9: [[TRUNC2:%[0-9]+]]:_(s7) = G_TRUNC [[SREM]](s32)
+    ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC2]](s7)
+    ; GFX9: $vgpr0 = COPY [[ANYEXT]](s32)
     %0:_(s32) = COPY $vgpr0
     %1:_(s32) = COPY $vgpr1
     %2:_(s7) = G_TRUNC %0
@@ -328,33 +288,36 @@ body: |
     ; GFX6-LABEL: name: test_srem_s17
     ; GFX6: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX6: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
-    ; GFX6: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
-    ; GFX6: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY2]], 17
-    ; GFX6: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
-    ; GFX6: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY3]], 17
-    ; GFX6: [[SREM:%[0-9]+]]:_(s32) = G_SREM [[SEXT_INREG]], [[SEXT_INREG1]]
-    ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY [[SREM]](s32)
-    ; GFX6: $vgpr0 = COPY [[COPY4]](s32)
+    ; GFX6: [[TRUNC:%[0-9]+]]:_(s17) = G_TRUNC [[COPY]](s32)
+    ; GFX6: [[TRUNC1:%[0-9]+]]:_(s17) = G_TRUNC [[COPY1]](s32)
+    ; GFX6: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[TRUNC]](s17)
+    ; GFX6: [[SEXT1:%[0-9]+]]:_(s32) = G_SEXT [[TRUNC1]](s17)
+    ; GFX6: [[SREM:%[0-9]+]]:_(s32) = G_SREM [[SEXT]], [[SEXT1]]
+    ; GFX6: [[TRUNC2:%[0-9]+]]:_(s17) = G_TRUNC [[SREM]](s32)
+    ; GFX6: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC2]](s17)
+    ; GFX6: $vgpr0 = COPY [[ANYEXT]](s32)
     ; GFX8-LABEL: name: test_srem_s17
     ; GFX8: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX8: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
-    ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
-    ; GFX8: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY2]], 17
-    ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
-    ; GFX8: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY3]], 17
-    ; GFX8: [[SREM:%[0-9]+]]:_(s32) = G_SREM [[SEXT_INREG]], [[SEXT_INREG1]]
-    ; GFX8: [[COPY4:%[0-9]+]]:_(s32) = COPY [[SREM]](s32)
-    ; GFX8: $vgpr0 = COPY [[COPY4]](s32)
+    ; GFX8: [[TRUNC:%[0-9]+]]:_(s17) = G_TRUNC [[COPY]](s32)
+    ; GFX8: [[TRUNC1:%[0-9]+]]:_(s17) = G_TRUNC [[COPY1]](s32)
+    ; GFX8: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[TRUNC]](s17)
+    ; GFX8: [[SEXT1:%[0-9]+]]:_(s32) = G_SEXT [[TRUNC1]](s17)
+    ; GFX8: [[SREM:%[0-9]+]]:_(s32) = G_SREM [[SEXT]], [[SEXT1]]
+    ; GFX8: [[TRUNC2:%[0-9]+]]:_(s17) = G_TRUNC [[SREM]](s32)
+    ; GFX8: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC2]](s17)
+    ; GFX8: $vgpr0 = COPY [[ANYEXT]](s32)
     ; GFX9-LABEL: name: test_srem_s17
     ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
-    ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
-    ; GFX9: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY2]], 17
-    ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
-    ; GFX9: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY3]], 17
-    ; GFX9: [[SREM:%[0-9]+]]:_(s32) = G_SREM [[SEXT_INREG]], [[SEXT_INREG1]]
-    ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY [[SREM]](s32)
-    ; GFX9: $vgpr0 = COPY [[COPY4]](s32)
+    ; GFX9: [[TRUNC:%[0-9]+]]:_(s17) = G_TRUNC [[COPY]](s32)
+    ; GFX9: [[TRUNC1:%[0-9]+]]:_(s17) = G_TRUNC [[COPY1]](s32)
+    ; GFX9: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[TRUNC]](s17)
+    ; GFX9: [[SEXT1:%[0-9]+]]:_(s32) = G_SEXT [[TRUNC1]](s17)
+    ; GFX9: [[SREM:%[0-9]+]]:_(s32) = G_SREM [[SEXT]], [[SEXT1]]
+    ; GFX9: [[TRUNC2:%[0-9]+]]:_(s17) = G_TRUNC [[SREM]](s32)
+    ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC2]](s17)
+    ; GFX9: $vgpr0 = COPY [[ANYEXT]](s32)
     %0:_(s32) = COPY $vgpr0
     %1:_(s32) = COPY $vgpr1
     %2:_(s17) = G_TRUNC %0
@@ -373,33 +336,36 @@ body: |
     ; GFX6-LABEL: name: test_srem_s33
     ; GFX6: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
     ; GFX6: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
-    ; GFX6: [[COPY2:%[0-9]+]]:_(s64) = COPY [[COPY]](s64)
-    ; GFX6: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[COPY2]], 33
-    ; GFX6: [[COPY3:%[0-9]+]]:_(s64) = COPY [[COPY1]](s64)
-    ; GFX6: [[SEXT_INREG1:%[0-9]+]]:_(s64) = G_SEXT_INREG [[COPY3]], 33
-    ; GFX6: [[SREM:%[0-9]+]]:_(s64) = G_SREM [[SEXT_INREG]], [[SEXT_INREG1]]
-    ; GFX6: [[COPY4:%[0-9]+]]:_(s64) = COPY [[SREM]](s64)
-    ; GFX6: $vgpr0_vgpr1 = COPY [[COPY4]](s64)
+    ; GFX6: [[TRUNC:%[0-9]+]]:_(s33) = G_TRUNC [[COPY]](s64)
+    ; GFX6: [[TRUNC1:%[0-9]+]]:_(s33) = G_TRUNC [[COPY1]](s64)
+    ; GFX6: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[TRUNC]](s33)
+    ; GFX6: [[SEXT1:%[0-9]+]]:_(s64) = G_SEXT [[TRUNC1]](s33)
+    ; GFX6: [[SREM:%[0-9]+]]:_(s64) = G_SREM [[SEXT]], [[SEXT1]]
+    ; GFX6: [[TRUNC2:%[0-9]+]]:_(s33) = G_TRUNC [[SREM]](s64)
+    ; GFX6: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[TRUNC2]](s33)
+    ; GFX6: $vgpr0_vgpr1 = COPY [[ANYEXT]](s64)
     ; GFX8-LABEL: name: test_srem_s33
     ; GFX8: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
     ; GFX8: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
-    ; GFX8: [[COPY2:%[0-9]+]]:_(s64) = COPY [[COPY]](s64)
-    ; GFX8: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[COPY2]], 33
-    ; GFX8: [[COPY3:%[0-9]+]]:_(s64) = COPY [[COPY1]](s64)
-    ; GFX8: [[SEXT_INREG1:%[0-9]+]]:_(s64) = G_SEXT_INREG [[COPY3]], 33
-    ; GFX8: [[SREM:%[0-9]+]]:_(s64) = G_SREM [[SEXT_INREG]], [[SEXT_INREG1]]
-    ; GFX8: [[COPY4:%[0-9]+]]:_(s64) = COPY [[SREM]](s64)
-    ; GFX8: $vgpr0_vgpr1 = COPY [[COPY4]](s64)
+    ; GFX8: [[TRUNC:%[0-9]+]]:_(s33) = G_TRUNC [[COPY]](s64)
+    ; GFX8: [[TRUNC1:%[0-9]+]]:_(s33) = G_TRUNC [[COPY1]](s64)
+    ; GFX8: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[TRUNC]](s33)
+    ; GFX8: [[SEXT1:%[0-9]+]]:_(s64) = G_SEXT [[TRUNC1]](s33)
+    ; GFX8: [[SREM:%[0-9]+]]:_(s64) = G_SREM [[SEXT]], [[SEXT1]]
+    ; GFX8: [[TRUNC2:%[0-9]+]]:_(s33) = G_TRUNC [[SREM]](s64)
+    ; GFX8: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[TRUNC2]](s33)
+    ; GFX8: $vgpr0_vgpr1 = COPY [[ANYEXT]](s64)
     ; GFX9-LABEL: name: test_srem_s33
     ; GFX9: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
     ; GFX9: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
-    ; GFX9: [[COPY2:%[0-9]+]]:_(s64) = COPY [[COPY]](s64)
-    ; GFX9: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[COPY2]], 33
-    ; GFX9: [[COPY3:%[0-9]+]]:_(s64) = COPY [[COPY1]](s64)
-    ; GFX9: [[SEXT_INREG1:%[0-9]+]]:_(s64) = G_SEXT_INREG [[COPY3]], 33
-    ; GFX9: [[SREM:%[0-9]+]]:_(s64) = G_SREM [[SEXT_INREG]], [[SEXT_INREG1]]
-    ; GFX9: [[COPY4:%[0-9]+]]:_(s64) = COPY [[SREM]](s64)
-    ; GFX9: $vgpr0_vgpr1 = COPY [[COPY4]](s64)
+    ; GFX9: [[TRUNC:%[0-9]+]]:_(s33) = G_TRUNC [[COPY]](s64)
+    ; GFX9: [[TRUNC1:%[0-9]+]]:_(s33) = G_TRUNC [[COPY1]](s64)
+    ; GFX9: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[TRUNC]](s33)
+    ; GFX9: [[SEXT1:%[0-9]+]]:_(s64) = G_SEXT [[TRUNC1]](s33)
+    ; GFX9: [[SREM:%[0-9]+]]:_(s64) = G_SREM [[SEXT]], [[SEXT1]]
+    ; GFX9: [[TRUNC2:%[0-9]+]]:_(s33) = G_TRUNC [[SREM]](s64)
+    ; GFX9: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[TRUNC2]](s33)
+    ; GFX9: $vgpr0_vgpr1 = COPY [[ANYEXT]](s64)
     %0:_(s64) = COPY $vgpr0_vgpr1
     %1:_(s64) = COPY $vgpr2_vgpr3
     %2:_(s33) = G_TRUNC %0

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-udiv.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-udiv.mir
index 3a2e29474118..5a0d3879e55b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-udiv.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-udiv.mir
@@ -12,18 +12,93 @@ body: |
     ; GFX6-LABEL: name: test_udiv_s32
     ; GFX6: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX6: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
-    ; GFX6: [[UDIV:%[0-9]+]]:_(s32) = G_UDIV [[COPY]], [[COPY1]]
-    ; GFX6: $vgpr0 = COPY [[UDIV]](s32)
+    ; GFX6: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[COPY1]](s32)
+    ; GFX6: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[UITOFP]](s32)
+    ; GFX6: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000
+    ; GFX6: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C]]
+    ; GFX6: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI [[FMUL]](s32)
+    ; GFX6: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI]], [[COPY1]]
+    ; GFX6: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI]], [[COPY1]]
+    ; GFX6: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GFX6: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C1]], [[MUL]]
+    ; GFX6: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[UMULH]](s32), [[C1]]
+    ; GFX6: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[SUB]], [[MUL]]
+    ; GFX6: [[UMULH1:%[0-9]+]]:_(s32) = G_UMULH [[SELECT]], [[FPTOUI]]
+    ; GFX6: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI]], [[UMULH1]]
+    ; GFX6: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[FPTOUI]], [[UMULH1]]
+    ; GFX6: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[ADD]], [[SUB1]]
+    ; GFX6: [[UMULH2:%[0-9]+]]:_(s32) = G_UMULH [[SELECT1]], [[COPY]]
+    ; GFX6: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UMULH2]], [[COPY1]]
+    ; GFX6: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[COPY]], [[MUL1]]
+    ; GFX6: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SUB2]](s32), [[COPY1]]
+    ; GFX6: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[COPY]](s32), [[MUL1]]
+    ; GFX6: [[AND:%[0-9]+]]:_(s1) = G_AND [[ICMP1]], [[ICMP2]]
+    ; GFX6: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX6: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[UMULH2]], [[C2]]
+    ; GFX6: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[UMULH2]], [[C2]]
+    ; GFX6: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[AND]](s1), [[UMULH2]], [[ADD1]]
+    ; GFX6: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s1), [[SELECT2]], [[SUB3]]
+    ; GFX6: $vgpr0 = COPY [[SELECT3]](s32)
     ; GFX8-LABEL: name: test_udiv_s32
     ; GFX8: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX8: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
-    ; GFX8: [[UDIV:%[0-9]+]]:_(s32) = G_UDIV [[COPY]], [[COPY1]]
-    ; GFX8: $vgpr0 = COPY [[UDIV]](s32)
+    ; GFX8: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[COPY1]](s32)
+    ; GFX8: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[UITOFP]](s32)
+    ; GFX8: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000
+    ; GFX8: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C]]
+    ; GFX8: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI [[FMUL]](s32)
+    ; GFX8: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI]], [[COPY1]]
+    ; GFX8: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI]], [[COPY1]]
+    ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GFX8: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C1]], [[MUL]]
+    ; GFX8: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[UMULH]](s32), [[C1]]
+    ; GFX8: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[SUB]], [[MUL]]
+    ; GFX8: [[UMULH1:%[0-9]+]]:_(s32) = G_UMULH [[SELECT]], [[FPTOUI]]
+    ; GFX8: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI]], [[UMULH1]]
+    ; GFX8: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[FPTOUI]], [[UMULH1]]
+    ; GFX8: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[ADD]], [[SUB1]]
+    ; GFX8: [[UMULH2:%[0-9]+]]:_(s32) = G_UMULH [[SELECT1]], [[COPY]]
+    ; GFX8: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UMULH2]], [[COPY1]]
+    ; GFX8: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[COPY]], [[MUL1]]
+    ; GFX8: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SUB2]](s32), [[COPY1]]
+    ; GFX8: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[COPY]](s32), [[MUL1]]
+    ; GFX8: [[AND:%[0-9]+]]:_(s1) = G_AND [[ICMP1]], [[ICMP2]]
+    ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX8: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[UMULH2]], [[C2]]
+    ; GFX8: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[UMULH2]], [[C2]]
+    ; GFX8: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[AND]](s1), [[UMULH2]], [[ADD1]]
+    ; GFX8: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s1), [[SELECT2]], [[SUB3]]
+    ; GFX8: $vgpr0 = COPY [[SELECT3]](s32)
     ; GFX9-LABEL: name: test_udiv_s32
     ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
-    ; GFX9: [[UDIV:%[0-9]+]]:_(s32) = G_UDIV [[COPY]], [[COPY1]]
-    ; GFX9: $vgpr0 = COPY [[UDIV]](s32)
+    ; GFX9: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[COPY1]](s32)
+    ; GFX9: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[UITOFP]](s32)
+    ; GFX9: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000
+    ; GFX9: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C]]
+    ; GFX9: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI [[FMUL]](s32)
+    ; GFX9: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI]], [[COPY1]]
+    ; GFX9: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI]], [[COPY1]]
+    ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GFX9: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C1]], [[MUL]]
+    ; GFX9: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[UMULH]](s32), [[C1]]
+    ; GFX9: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[SUB]], [[MUL]]
+    ; GFX9: [[UMULH1:%[0-9]+]]:_(s32) = G_UMULH [[SELECT]], [[FPTOUI]]
+    ; GFX9: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI]], [[UMULH1]]
+    ; GFX9: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[FPTOUI]], [[UMULH1]]
+    ; GFX9: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[ADD]], [[SUB1]]
+    ; GFX9: [[UMULH2:%[0-9]+]]:_(s32) = G_UMULH [[SELECT1]], [[COPY]]
+    ; GFX9: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UMULH2]], [[COPY1]]
+    ; GFX9: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[COPY]], [[MUL1]]
+    ; GFX9: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SUB2]](s32), [[COPY1]]
+    ; GFX9: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[COPY]](s32), [[MUL1]]
+    ; GFX9: [[AND:%[0-9]+]]:_(s1) = G_AND [[ICMP1]], [[ICMP2]]
+    ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX9: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[UMULH2]], [[C2]]
+    ; GFX9: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[UMULH2]], [[C2]]
+    ; GFX9: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[AND]](s1), [[UMULH2]], [[ADD1]]
+    ; GFX9: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s1), [[SELECT2]], [[SUB3]]
+    ; GFX9: $vgpr0 = COPY [[SELECT3]](s32)
     %0:_(s32) = COPY $vgpr0
     %1:_(s32) = COPY $vgpr1
     %2:_(s32) = G_UDIV %0, %1
@@ -41,27 +116,171 @@ body: |
     ; GFX6: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr2_vgpr3
     ; GFX6: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>)
     ; GFX6: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>)
-    ; GFX6: [[UDIV:%[0-9]+]]:_(s32) = G_UDIV [[UV]], [[UV2]]
-    ; GFX6: [[UDIV1:%[0-9]+]]:_(s32) = G_UDIV [[UV1]], [[UV3]]
-    ; GFX6: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[UDIV]](s32), [[UDIV1]](s32)
+    ; GFX6: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[UV2]](s32)
+    ; GFX6: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[UITOFP]](s32)
+    ; GFX6: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000
+    ; GFX6: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C]]
+    ; GFX6: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI [[FMUL]](s32)
+    ; GFX6: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI]], [[UV2]]
+    ; GFX6: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI]], [[UV2]]
+    ; GFX6: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GFX6: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C1]], [[MUL]]
+    ; GFX6: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[UMULH]](s32), [[C1]]
+    ; GFX6: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[SUB]], [[MUL]]
+    ; GFX6: [[UMULH1:%[0-9]+]]:_(s32) = G_UMULH [[SELECT]], [[FPTOUI]]
+    ; GFX6: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI]], [[UMULH1]]
+    ; GFX6: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[FPTOUI]], [[UMULH1]]
+    ; GFX6: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[ADD]], [[SUB1]]
+    ; GFX6: [[UMULH2:%[0-9]+]]:_(s32) = G_UMULH [[SELECT1]], [[UV]]
+    ; GFX6: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UMULH2]], [[UV2]]
+    ; GFX6: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[UV]], [[MUL1]]
+    ; GFX6: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SUB2]](s32), [[UV2]]
+    ; GFX6: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[UV]](s32), [[MUL1]]
+    ; GFX6: [[AND:%[0-9]+]]:_(s1) = G_AND [[ICMP1]], [[ICMP2]]
+    ; GFX6: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX6: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[UMULH2]], [[C2]]
+    ; GFX6: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[UMULH2]], [[C2]]
+    ; GFX6: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[AND]](s1), [[UMULH2]], [[ADD1]]
+    ; GFX6: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s1), [[SELECT2]], [[SUB3]]
+    ; GFX6: [[UITOFP1:%[0-9]+]]:_(s32) = G_UITOFP [[UV3]](s32)
+    ; GFX6: [[AMDGPU_RCP_IFLAG1:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[UITOFP1]](s32)
+    ; GFX6: [[C3:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000
+    ; GFX6: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG1]], [[C3]]
+    ; GFX6: [[FPTOUI1:%[0-9]+]]:_(s32) = G_FPTOUI [[FMUL1]](s32)
+    ; GFX6: [[MUL2:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI1]], [[UV3]]
+    ; GFX6: [[UMULH3:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI1]], [[UV3]]
+    ; GFX6: [[SUB4:%[0-9]+]]:_(s32) = G_SUB [[C1]], [[MUL2]]
+    ; GFX6: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[UMULH3]](s32), [[C1]]
+    ; GFX6: [[SELECT4:%[0-9]+]]:_(s32) = G_SELECT [[ICMP3]](s1), [[SUB4]], [[MUL2]]
+    ; GFX6: [[UMULH4:%[0-9]+]]:_(s32) = G_UMULH [[SELECT4]], [[FPTOUI1]]
+    ; GFX6: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], [[UMULH4]]
+    ; GFX6: [[SUB5:%[0-9]+]]:_(s32) = G_SUB [[FPTOUI1]], [[UMULH4]]
+    ; GFX6: [[SELECT5:%[0-9]+]]:_(s32) = G_SELECT [[ICMP3]](s1), [[ADD2]], [[SUB5]]
+    ; GFX6: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[SELECT5]], [[UV1]]
+    ; GFX6: [[MUL3:%[0-9]+]]:_(s32) = G_MUL [[UMULH5]], [[UV3]]
+    ; GFX6: [[SUB6:%[0-9]+]]:_(s32) = G_SUB [[UV1]], [[MUL3]]
+    ; GFX6: [[ICMP4:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SUB6]](s32), [[UV3]]
+    ; GFX6: [[ICMP5:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[UV1]](s32), [[MUL3]]
+    ; GFX6: [[AND1:%[0-9]+]]:_(s1) = G_AND [[ICMP4]], [[ICMP5]]
+    ; GFX6: [[ADD3:%[0-9]+]]:_(s32) = G_ADD [[UMULH5]], [[C2]]
+    ; GFX6: [[SUB7:%[0-9]+]]:_(s32) = G_SUB [[UMULH5]], [[C2]]
+    ; GFX6: [[SELECT6:%[0-9]+]]:_(s32) = G_SELECT [[AND1]](s1), [[UMULH5]], [[ADD3]]
+    ; GFX6: [[SELECT7:%[0-9]+]]:_(s32) = G_SELECT [[ICMP5]](s1), [[SELECT6]], [[SUB7]]
+    ; GFX6: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[SELECT3]](s32), [[SELECT7]](s32)
     ; GFX6: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
     ; GFX8-LABEL: name: test_udiv_v2s32
     ; GFX8: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1
     ; GFX8: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr2_vgpr3
     ; GFX8: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>)
     ; GFX8: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>)
-    ; GFX8: [[UDIV:%[0-9]+]]:_(s32) = G_UDIV [[UV]], [[UV2]]
-    ; GFX8: [[UDIV1:%[0-9]+]]:_(s32) = G_UDIV [[UV1]], [[UV3]]
-    ; GFX8: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[UDIV]](s32), [[UDIV1]](s32)
+    ; GFX8: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[UV2]](s32)
+    ; GFX8: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[UITOFP]](s32)
+    ; GFX8: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000
+    ; GFX8: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C]]
+    ; GFX8: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI [[FMUL]](s32)
+    ; GFX8: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI]], [[UV2]]
+    ; GFX8: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI]], [[UV2]]
+    ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GFX8: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C1]], [[MUL]]
+    ; GFX8: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[UMULH]](s32), [[C1]]
+    ; GFX8: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[SUB]], [[MUL]]
+    ; GFX8: [[UMULH1:%[0-9]+]]:_(s32) = G_UMULH [[SELECT]], [[FPTOUI]]
+    ; GFX8: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI]], [[UMULH1]]
+    ; GFX8: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[FPTOUI]], [[UMULH1]]
+    ; GFX8: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[ADD]], [[SUB1]]
+    ; GFX8: [[UMULH2:%[0-9]+]]:_(s32) = G_UMULH [[SELECT1]], [[UV]]
+    ; GFX8: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UMULH2]], [[UV2]]
+    ; GFX8: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[UV]], [[MUL1]]
+    ; GFX8: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SUB2]](s32), [[UV2]]
+    ; GFX8: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[UV]](s32), [[MUL1]]
+    ; GFX8: [[AND:%[0-9]+]]:_(s1) = G_AND [[ICMP1]], [[ICMP2]]
+    ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX8: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[UMULH2]], [[C2]]
+    ; GFX8: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[UMULH2]], [[C2]]
+    ; GFX8: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[AND]](s1), [[UMULH2]], [[ADD1]]
+    ; GFX8: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s1), [[SELECT2]], [[SUB3]]
+    ; GFX8: [[UITOFP1:%[0-9]+]]:_(s32) = G_UITOFP [[UV3]](s32)
+    ; GFX8: [[AMDGPU_RCP_IFLAG1:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[UITOFP1]](s32)
+    ; GFX8: [[C3:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000
+    ; GFX8: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG1]], [[C3]]
+    ; GFX8: [[FPTOUI1:%[0-9]+]]:_(s32) = G_FPTOUI [[FMUL1]](s32)
+    ; GFX8: [[MUL2:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI1]], [[UV3]]
+    ; GFX8: [[UMULH3:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI1]], [[UV3]]
+    ; GFX8: [[SUB4:%[0-9]+]]:_(s32) = G_SUB [[C1]], [[MUL2]]
+    ; GFX8: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[UMULH3]](s32), [[C1]]
+    ; GFX8: [[SELECT4:%[0-9]+]]:_(s32) = G_SELECT [[ICMP3]](s1), [[SUB4]], [[MUL2]]
+    ; GFX8: [[UMULH4:%[0-9]+]]:_(s32) = G_UMULH [[SELECT4]], [[FPTOUI1]]
+    ; GFX8: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], [[UMULH4]]
+    ; GFX8: [[SUB5:%[0-9]+]]:_(s32) = G_SUB [[FPTOUI1]], [[UMULH4]]
+    ; GFX8: [[SELECT5:%[0-9]+]]:_(s32) = G_SELECT [[ICMP3]](s1), [[ADD2]], [[SUB5]]
+    ; GFX8: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[SELECT5]], [[UV1]]
+    ; GFX8: [[MUL3:%[0-9]+]]:_(s32) = G_MUL [[UMULH5]], [[UV3]]
+    ; GFX8: [[SUB6:%[0-9]+]]:_(s32) = G_SUB [[UV1]], [[MUL3]]
+    ; GFX8: [[ICMP4:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SUB6]](s32), [[UV3]]
+    ; GFX8: [[ICMP5:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[UV1]](s32), [[MUL3]]
+    ; GFX8: [[AND1:%[0-9]+]]:_(s1) = G_AND [[ICMP4]], [[ICMP5]]
+    ; GFX8: [[ADD3:%[0-9]+]]:_(s32) = G_ADD [[UMULH5]], [[C2]]
+    ; GFX8: [[SUB7:%[0-9]+]]:_(s32) = G_SUB [[UMULH5]], [[C2]]
+    ; GFX8: [[SELECT6:%[0-9]+]]:_(s32) = G_SELECT [[AND1]](s1), [[UMULH5]], [[ADD3]]
+    ; GFX8: [[SELECT7:%[0-9]+]]:_(s32) = G_SELECT [[ICMP5]](s1), [[SELECT6]], [[SUB7]]
+    ; GFX8: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[SELECT3]](s32), [[SELECT7]](s32)
     ; GFX8: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
     ; GFX9-LABEL: name: test_udiv_v2s32
     ; GFX9: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1
     ; GFX9: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr2_vgpr3
     ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>)
     ; GFX9: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>)
-    ; GFX9: [[UDIV:%[0-9]+]]:_(s32) = G_UDIV [[UV]], [[UV2]]
-    ; GFX9: [[UDIV1:%[0-9]+]]:_(s32) = G_UDIV [[UV1]], [[UV3]]
-    ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[UDIV]](s32), [[UDIV1]](s32)
+    ; GFX9: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[UV2]](s32)
+    ; GFX9: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[UITOFP]](s32)
+    ; GFX9: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000
+    ; GFX9: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C]]
+    ; GFX9: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI [[FMUL]](s32)
+    ; GFX9: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI]], [[UV2]]
+    ; GFX9: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI]], [[UV2]]
+    ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GFX9: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C1]], [[MUL]]
+    ; GFX9: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[UMULH]](s32), [[C1]]
+    ; GFX9: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[SUB]], [[MUL]]
+    ; GFX9: [[UMULH1:%[0-9]+]]:_(s32) = G_UMULH [[SELECT]], [[FPTOUI]]
+    ; GFX9: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI]], [[UMULH1]]
+    ; GFX9: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[FPTOUI]], [[UMULH1]]
+    ; GFX9: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[ADD]], [[SUB1]]
+    ; GFX9: [[UMULH2:%[0-9]+]]:_(s32) = G_UMULH [[SELECT1]], [[UV]]
+    ; GFX9: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UMULH2]], [[UV2]]
+    ; GFX9: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[UV]], [[MUL1]]
+    ; GFX9: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SUB2]](s32), [[UV2]]
+    ; GFX9: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[UV]](s32), [[MUL1]]
+    ; GFX9: [[AND:%[0-9]+]]:_(s1) = G_AND [[ICMP1]], [[ICMP2]]
+    ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX9: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[UMULH2]], [[C2]]
+    ; GFX9: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[UMULH2]], [[C2]]
+    ; GFX9: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[AND]](s1), [[UMULH2]], [[ADD1]]
+    ; GFX9: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s1), [[SELECT2]], [[SUB3]]
+    ; GFX9: [[UITOFP1:%[0-9]+]]:_(s32) = G_UITOFP [[UV3]](s32)
+    ; GFX9: [[AMDGPU_RCP_IFLAG1:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[UITOFP1]](s32)
+    ; GFX9: [[C3:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000
+    ; GFX9: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG1]], [[C3]]
+    ; GFX9: [[FPTOUI1:%[0-9]+]]:_(s32) = G_FPTOUI [[FMUL1]](s32)
+    ; GFX9: [[MUL2:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI1]], [[UV3]]
+    ; GFX9: [[UMULH3:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI1]], [[UV3]]
+    ; GFX9: [[SUB4:%[0-9]+]]:_(s32) = G_SUB [[C1]], [[MUL2]]
+    ; GFX9: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[UMULH3]](s32), [[C1]]
+    ; GFX9: [[SELECT4:%[0-9]+]]:_(s32) = G_SELECT [[ICMP3]](s1), [[SUB4]], [[MUL2]]
+    ; GFX9: [[UMULH4:%[0-9]+]]:_(s32) = G_UMULH [[SELECT4]], [[FPTOUI1]]
+    ; GFX9: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], [[UMULH4]]
+    ; GFX9: [[SUB5:%[0-9]+]]:_(s32) = G_SUB [[FPTOUI1]], [[UMULH4]]
+    ; GFX9: [[SELECT5:%[0-9]+]]:_(s32) = G_SELECT [[ICMP3]](s1), [[ADD2]], [[SUB5]]
+    ; GFX9: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[SELECT5]], [[UV1]]
+    ; GFX9: [[MUL3:%[0-9]+]]:_(s32) = G_MUL [[UMULH5]], [[UV3]]
+    ; GFX9: [[SUB6:%[0-9]+]]:_(s32) = G_SUB [[UV1]], [[MUL3]]
+    ; GFX9: [[ICMP4:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SUB6]](s32), [[UV3]]
+    ; GFX9: [[ICMP5:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[UV1]](s32), [[MUL3]]
+    ; GFX9: [[AND1:%[0-9]+]]:_(s1) = G_AND [[ICMP4]], [[ICMP5]]
+    ; GFX9: [[ADD3:%[0-9]+]]:_(s32) = G_ADD [[UMULH5]], [[C2]]
+    ; GFX9: [[SUB7:%[0-9]+]]:_(s32) = G_SUB [[UMULH5]], [[C2]]
+    ; GFX9: [[SELECT6:%[0-9]+]]:_(s32) = G_SELECT [[AND1]](s1), [[UMULH5]], [[ADD3]]
+    ; GFX9: [[SELECT7:%[0-9]+]]:_(s32) = G_SELECT [[ICMP5]](s1), [[SELECT6]], [[SUB7]]
+    ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[SELECT3]](s32), [[SELECT7]](s32)
     ; GFX9: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
     %0:_(<2 x s32>) = COPY $vgpr0_vgpr1
     %1:_(<2 x s32>) = COPY $vgpr2_vgpr3
@@ -149,10 +368,35 @@ body: |
     ; GFX6: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C]]
     ; GFX6: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
     ; GFX6: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C]]
-    ; GFX6: [[UDIV:%[0-9]+]]:_(s32) = G_UDIV [[AND]], [[AND1]]
-    ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY [[UDIV]](s32)
-    ; GFX6: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C]]
-    ; GFX6: $vgpr0 = COPY [[AND2]](s32)
+    ; GFX6: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[AND1]](s32)
+    ; GFX6: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[UITOFP]](s32)
+    ; GFX6: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000
+    ; GFX6: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C1]]
+    ; GFX6: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI [[FMUL]](s32)
+    ; GFX6: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI]], [[AND1]]
+    ; GFX6: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI]], [[AND1]]
+    ; GFX6: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GFX6: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C2]], [[MUL]]
+    ; GFX6: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[UMULH]](s32), [[C2]]
+    ; GFX6: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[SUB]], [[MUL]]
+    ; GFX6: [[UMULH1:%[0-9]+]]:_(s32) = G_UMULH [[SELECT]], [[FPTOUI]]
+    ; GFX6: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI]], [[UMULH1]]
+    ; GFX6: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[FPTOUI]], [[UMULH1]]
+    ; GFX6: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[ADD]], [[SUB1]]
+    ; GFX6: [[UMULH2:%[0-9]+]]:_(s32) = G_UMULH [[SELECT1]], [[AND]]
+    ; GFX6: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UMULH2]], [[AND1]]
+    ; GFX6: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[AND]], [[MUL1]]
+    ; GFX6: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SUB2]](s32), [[AND1]]
+    ; GFX6: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[AND]](s32), [[MUL1]]
+    ; GFX6: [[AND2:%[0-9]+]]:_(s1) = G_AND [[ICMP1]], [[ICMP2]]
+    ; GFX6: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX6: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[UMULH2]], [[C3]]
+    ; GFX6: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[UMULH2]], [[C3]]
+    ; GFX6: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[AND2]](s1), [[UMULH2]], [[ADD1]]
+    ; GFX6: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s1), [[SELECT2]], [[SUB3]]
+    ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY [[SELECT3]](s32)
+    ; GFX6: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C]]
+    ; GFX6: $vgpr0 = COPY [[AND3]](s32)
     ; GFX8-LABEL: name: test_udiv_s16
     ; GFX8: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX8: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -161,10 +405,35 @@ body: |
     ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C]]
     ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
     ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C]]
-    ; GFX8: [[UDIV:%[0-9]+]]:_(s32) = G_UDIV [[AND]], [[AND1]]
-    ; GFX8: [[COPY4:%[0-9]+]]:_(s32) = COPY [[UDIV]](s32)
-    ; GFX8: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C]]
-    ; GFX8: $vgpr0 = COPY [[AND2]](s32)
+    ; GFX8: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[AND1]](s32)
+    ; GFX8: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[UITOFP]](s32)
+    ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000
+    ; GFX8: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C1]]
+    ; GFX8: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI [[FMUL]](s32)
+    ; GFX8: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI]], [[AND1]]
+    ; GFX8: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI]], [[AND1]]
+    ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GFX8: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C2]], [[MUL]]
+    ; GFX8: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[UMULH]](s32), [[C2]]
+    ; GFX8: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[SUB]], [[MUL]]
+    ; GFX8: [[UMULH1:%[0-9]+]]:_(s32) = G_UMULH [[SELECT]], [[FPTOUI]]
+    ; GFX8: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI]], [[UMULH1]]
+    ; GFX8: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[FPTOUI]], [[UMULH1]]
+    ; GFX8: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[ADD]], [[SUB1]]
+    ; GFX8: [[UMULH2:%[0-9]+]]:_(s32) = G_UMULH [[SELECT1]], [[AND]]
+    ; GFX8: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UMULH2]], [[AND1]]
+    ; GFX8: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[AND]], [[MUL1]]
+    ; GFX8: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SUB2]](s32), [[AND1]]
+    ; GFX8: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[AND]](s32), [[MUL1]]
+    ; GFX8: [[AND2:%[0-9]+]]:_(s1) = G_AND [[ICMP1]], [[ICMP2]]
+    ; GFX8: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX8: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[UMULH2]], [[C3]]
+    ; GFX8: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[UMULH2]], [[C3]]
+    ; GFX8: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[AND2]](s1), [[UMULH2]], [[ADD1]]
+    ; GFX8: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s1), [[SELECT2]], [[SUB3]]
+    ; GFX8: [[COPY4:%[0-9]+]]:_(s32) = COPY [[SELECT3]](s32)
+    ; GFX8: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C]]
+    ; GFX8: $vgpr0 = COPY [[AND3]](s32)
     ; GFX9-LABEL: name: test_udiv_s16
     ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -173,10 +442,35 @@ body: |
     ; GFX9: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C]]
     ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
     ; GFX9: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C]]
-    ; GFX9: [[UDIV:%[0-9]+]]:_(s32) = G_UDIV [[AND]], [[AND1]]
-    ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY [[UDIV]](s32)
-    ; GFX9: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C]]
-    ; GFX9: $vgpr0 = COPY [[AND2]](s32)
+    ; GFX9: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[AND1]](s32)
+    ; GFX9: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[UITOFP]](s32)
+    ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000
+    ; GFX9: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C1]]
+    ; GFX9: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI [[FMUL]](s32)
+    ; GFX9: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI]], [[AND1]]
+    ; GFX9: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI]], [[AND1]]
+    ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GFX9: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C2]], [[MUL]]
+    ; GFX9: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[UMULH]](s32), [[C2]]
+    ; GFX9: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[SUB]], [[MUL]]
+    ; GFX9: [[UMULH1:%[0-9]+]]:_(s32) = G_UMULH [[SELECT]], [[FPTOUI]]
+    ; GFX9: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI]], [[UMULH1]]
+    ; GFX9: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[FPTOUI]], [[UMULH1]]
+    ; GFX9: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[ADD]], [[SUB1]]
+    ; GFX9: [[UMULH2:%[0-9]+]]:_(s32) = G_UMULH [[SELECT1]], [[AND]]
+    ; GFX9: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UMULH2]], [[AND1]]
+    ; GFX9: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[AND]], [[MUL1]]
+    ; GFX9: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SUB2]](s32), [[AND1]]
+    ; GFX9: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[AND]](s32), [[MUL1]]
+    ; GFX9: [[AND2:%[0-9]+]]:_(s1) = G_AND [[ICMP1]], [[ICMP2]]
+    ; GFX9: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX9: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[UMULH2]], [[C3]]
+    ; GFX9: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[UMULH2]], [[C3]]
+    ; GFX9: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[AND2]](s1), [[UMULH2]], [[ADD1]]
+    ; GFX9: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s1), [[SELECT2]], [[SUB3]]
+    ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY [[SELECT3]](s32)
+    ; GFX9: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C]]
+    ; GFX9: $vgpr0 = COPY [[AND3]](s32)
     %0:_(s32) = COPY $vgpr0
     %1:_(s32) = COPY $vgpr1
     %2:_(s16) = G_TRUNC %0
@@ -205,18 +499,66 @@ body: |
     ; GFX6: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]]
     ; GFX6: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32)
     ; GFX6: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]]
-    ; GFX6: [[UDIV:%[0-9]+]]:_(s32) = G_UDIV [[AND]], [[AND1]]
+    ; GFX6: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[AND1]](s32)
+    ; GFX6: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[UITOFP]](s32)
+    ; GFX6: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000
+    ; GFX6: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C2]]
+    ; GFX6: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI [[FMUL]](s32)
+    ; GFX6: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI]], [[AND1]]
+    ; GFX6: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI]], [[AND1]]
+    ; GFX6: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GFX6: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C3]], [[MUL]]
+    ; GFX6: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[UMULH]](s32), [[C3]]
+    ; GFX6: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[SUB]], [[MUL]]
+    ; GFX6: [[UMULH1:%[0-9]+]]:_(s32) = G_UMULH [[SELECT]], [[FPTOUI]]
+    ; GFX6: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI]], [[UMULH1]]
+    ; GFX6: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[FPTOUI]], [[UMULH1]]
+    ; GFX6: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[ADD]], [[SUB1]]
+    ; GFX6: [[UMULH2:%[0-9]+]]:_(s32) = G_UMULH [[SELECT1]], [[AND]]
+    ; GFX6: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UMULH2]], [[AND1]]
+    ; GFX6: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[AND]], [[MUL1]]
+    ; GFX6: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SUB2]](s32), [[AND1]]
+    ; GFX6: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[AND]](s32), [[MUL1]]
+    ; GFX6: [[AND2:%[0-9]+]]:_(s1) = G_AND [[ICMP1]], [[ICMP2]]
+    ; GFX6: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX6: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[UMULH2]], [[C4]]
+    ; GFX6: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[UMULH2]], [[C4]]
+    ; GFX6: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[AND2]](s1), [[UMULH2]], [[ADD1]]
+    ; GFX6: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s1), [[SELECT2]], [[SUB3]]
     ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
-    ; GFX6: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]]
+    ; GFX6: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]]
     ; GFX6: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
-    ; GFX6: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C1]]
-    ; GFX6: [[UDIV1:%[0-9]+]]:_(s32) = G_UDIV [[AND2]], [[AND3]]
-    ; GFX6: [[COPY6:%[0-9]+]]:_(s32) = COPY [[UDIV]](s32)
-    ; GFX6: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C1]]
-    ; GFX6: [[COPY7:%[0-9]+]]:_(s32) = COPY [[UDIV1]](s32)
-    ; GFX6: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C1]]
-    ; GFX6: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C]](s32)
-    ; GFX6: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL]]
+    ; GFX6: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C1]]
+    ; GFX6: [[UITOFP1:%[0-9]+]]:_(s32) = G_UITOFP [[AND4]](s32)
+    ; GFX6: [[AMDGPU_RCP_IFLAG1:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[UITOFP1]](s32)
+    ; GFX6: [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000
+    ; GFX6: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG1]], [[C5]]
+    ; GFX6: [[FPTOUI1:%[0-9]+]]:_(s32) = G_FPTOUI [[FMUL1]](s32)
+    ; GFX6: [[MUL2:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI1]], [[AND4]]
+    ; GFX6: [[UMULH3:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI1]], [[AND4]]
+    ; GFX6: [[SUB4:%[0-9]+]]:_(s32) = G_SUB [[C3]], [[MUL2]]
+    ; GFX6: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[UMULH3]](s32), [[C3]]
+    ; GFX6: [[SELECT4:%[0-9]+]]:_(s32) = G_SELECT [[ICMP3]](s1), [[SUB4]], [[MUL2]]
+    ; GFX6: [[UMULH4:%[0-9]+]]:_(s32) = G_UMULH [[SELECT4]], [[FPTOUI1]]
+    ; GFX6: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], [[UMULH4]]
+    ; GFX6: [[SUB5:%[0-9]+]]:_(s32) = G_SUB [[FPTOUI1]], [[UMULH4]]
+    ; GFX6: [[SELECT5:%[0-9]+]]:_(s32) = G_SELECT [[ICMP3]](s1), [[ADD2]], [[SUB5]]
+    ; GFX6: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[SELECT5]], [[AND3]]
+    ; GFX6: [[MUL3:%[0-9]+]]:_(s32) = G_MUL [[UMULH5]], [[AND4]]
+    ; GFX6: [[SUB6:%[0-9]+]]:_(s32) = G_SUB [[AND3]], [[MUL3]]
+    ; GFX6: [[ICMP4:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SUB6]](s32), [[AND4]]
+    ; GFX6: [[ICMP5:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[AND3]](s32), [[MUL3]]
+    ; GFX6: [[AND5:%[0-9]+]]:_(s1) = G_AND [[ICMP4]], [[ICMP5]]
+    ; GFX6: [[ADD3:%[0-9]+]]:_(s32) = G_ADD [[UMULH5]], [[C4]]
+    ; GFX6: [[SUB7:%[0-9]+]]:_(s32) = G_SUB [[UMULH5]], [[C4]]
+    ; GFX6: [[SELECT6:%[0-9]+]]:_(s32) = G_SELECT [[AND5]](s1), [[UMULH5]], [[ADD3]]
+    ; GFX6: [[SELECT7:%[0-9]+]]:_(s32) = G_SELECT [[ICMP5]](s1), [[SELECT6]], [[SUB7]]
+    ; GFX6: [[COPY6:%[0-9]+]]:_(s32) = COPY [[SELECT3]](s32)
+    ; GFX6: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C1]]
+    ; GFX6: [[COPY7:%[0-9]+]]:_(s32) = COPY [[SELECT7]](s32)
+    ; GFX6: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C1]]
+    ; GFX6: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C]](s32)
+    ; GFX6: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[SHL]]
     ; GFX6: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
     ; GFX6: $vgpr0 = COPY [[BITCAST2]](<2 x s16>)
     ; GFX8-LABEL: name: test_udiv_v2s16
@@ -232,18 +574,66 @@ body: |
     ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]]
     ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32)
     ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]]
-    ; GFX8: [[UDIV:%[0-9]+]]:_(s32) = G_UDIV [[AND]], [[AND1]]
+    ; GFX8: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[AND1]](s32)
+    ; GFX8: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[UITOFP]](s32)
+    ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000
+    ; GFX8: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C2]]
+    ; GFX8: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI [[FMUL]](s32)
+    ; GFX8: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI]], [[AND1]]
+    ; GFX8: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI]], [[AND1]]
+    ; GFX8: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GFX8: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C3]], [[MUL]]
+    ; GFX8: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[UMULH]](s32), [[C3]]
+    ; GFX8: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[SUB]], [[MUL]]
+    ; GFX8: [[UMULH1:%[0-9]+]]:_(s32) = G_UMULH [[SELECT]], [[FPTOUI]]
+    ; GFX8: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI]], [[UMULH1]]
+    ; GFX8: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[FPTOUI]], [[UMULH1]]
+    ; GFX8: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[ADD]], [[SUB1]]
+    ; GFX8: [[UMULH2:%[0-9]+]]:_(s32) = G_UMULH [[SELECT1]], [[AND]]
+    ; GFX8: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UMULH2]], [[AND1]]
+    ; GFX8: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[AND]], [[MUL1]]
+    ; GFX8: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SUB2]](s32), [[AND1]]
+    ; GFX8: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[AND]](s32), [[MUL1]]
+    ; GFX8: [[AND2:%[0-9]+]]:_(s1) = G_AND [[ICMP1]], [[ICMP2]]
+    ; GFX8: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX8: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[UMULH2]], [[C4]]
+    ; GFX8: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[UMULH2]], [[C4]]
+    ; GFX8: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[AND2]](s1), [[UMULH2]], [[ADD1]]
+    ; GFX8: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s1), [[SELECT2]], [[SUB3]]
     ; GFX8: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
-    ; GFX8: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]]
+    ; GFX8: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]]
     ; GFX8: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
-    ; GFX8: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C1]]
-    ; GFX8: [[UDIV1:%[0-9]+]]:_(s32) = G_UDIV [[AND2]], [[AND3]]
-    ; GFX8: [[COPY6:%[0-9]+]]:_(s32) = COPY [[UDIV]](s32)
-    ; GFX8: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C1]]
-    ; GFX8: [[COPY7:%[0-9]+]]:_(s32) = COPY [[UDIV1]](s32)
-    ; GFX8: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C1]]
-    ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C]](s32)
-    ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL]]
+    ; GFX8: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C1]]
+    ; GFX8: [[UITOFP1:%[0-9]+]]:_(s32) = G_UITOFP [[AND4]](s32)
+    ; GFX8: [[AMDGPU_RCP_IFLAG1:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[UITOFP1]](s32)
+    ; GFX8: [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000
+    ; GFX8: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG1]], [[C5]]
+    ; GFX8: [[FPTOUI1:%[0-9]+]]:_(s32) = G_FPTOUI [[FMUL1]](s32)
+    ; GFX8: [[MUL2:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI1]], [[AND4]]
+    ; GFX8: [[UMULH3:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI1]], [[AND4]]
+    ; GFX8: [[SUB4:%[0-9]+]]:_(s32) = G_SUB [[C3]], [[MUL2]]
+    ; GFX8: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[UMULH3]](s32), [[C3]]
+    ; GFX8: [[SELECT4:%[0-9]+]]:_(s32) = G_SELECT [[ICMP3]](s1), [[SUB4]], [[MUL2]]
+    ; GFX8: [[UMULH4:%[0-9]+]]:_(s32) = G_UMULH [[SELECT4]], [[FPTOUI1]]
+    ; GFX8: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], [[UMULH4]]
+    ; GFX8: [[SUB5:%[0-9]+]]:_(s32) = G_SUB [[FPTOUI1]], [[UMULH4]]
+    ; GFX8: [[SELECT5:%[0-9]+]]:_(s32) = G_SELECT [[ICMP3]](s1), [[ADD2]], [[SUB5]]
+    ; GFX8: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[SELECT5]], [[AND3]]
+    ; GFX8: [[MUL3:%[0-9]+]]:_(s32) = G_MUL [[UMULH5]], [[AND4]]
+    ; GFX8: [[SUB6:%[0-9]+]]:_(s32) = G_SUB [[AND3]], [[MUL3]]
+    ; GFX8: [[ICMP4:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SUB6]](s32), [[AND4]]
+    ; GFX8: [[ICMP5:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[AND3]](s32), [[MUL3]]
+    ; GFX8: [[AND5:%[0-9]+]]:_(s1) = G_AND [[ICMP4]], [[ICMP5]]
+    ; GFX8: [[ADD3:%[0-9]+]]:_(s32) = G_ADD [[UMULH5]], [[C4]]
+    ; GFX8: [[SUB7:%[0-9]+]]:_(s32) = G_SUB [[UMULH5]], [[C4]]
+    ; GFX8: [[SELECT6:%[0-9]+]]:_(s32) = G_SELECT [[AND5]](s1), [[UMULH5]], [[ADD3]]
+    ; GFX8: [[SELECT7:%[0-9]+]]:_(s32) = G_SELECT [[ICMP5]](s1), [[SELECT6]], [[SUB7]]
+    ; GFX8: [[COPY6:%[0-9]+]]:_(s32) = COPY [[SELECT3]](s32)
+    ; GFX8: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C1]]
+    ; GFX8: [[COPY7:%[0-9]+]]:_(s32) = COPY [[SELECT7]](s32)
+    ; GFX8: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C1]]
+    ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C]](s32)
+    ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[SHL]]
     ; GFX8: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
     ; GFX8: $vgpr0 = COPY [[BITCAST2]](<2 x s16>)
     ; GFX9-LABEL: name: test_udiv_v2s16
@@ -259,14 +649,62 @@ body: |
     ; GFX9: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]]
     ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32)
     ; GFX9: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]]
-    ; GFX9: [[UDIV:%[0-9]+]]:_(s32) = G_UDIV [[AND]], [[AND1]]
+    ; GFX9: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[AND1]](s32)
+    ; GFX9: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[UITOFP]](s32)
+    ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000
+    ; GFX9: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C2]]
+    ; GFX9: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI [[FMUL]](s32)
+    ; GFX9: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI]], [[AND1]]
+    ; GFX9: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI]], [[AND1]]
+    ; GFX9: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GFX9: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C3]], [[MUL]]
+    ; GFX9: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[UMULH]](s32), [[C3]]
+    ; GFX9: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[SUB]], [[MUL]]
+    ; GFX9: [[UMULH1:%[0-9]+]]:_(s32) = G_UMULH [[SELECT]], [[FPTOUI]]
+    ; GFX9: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI]], [[UMULH1]]
+    ; GFX9: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[FPTOUI]], [[UMULH1]]
+    ; GFX9: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[ADD]], [[SUB1]]
+    ; GFX9: [[UMULH2:%[0-9]+]]:_(s32) = G_UMULH [[SELECT1]], [[AND]]
+    ; GFX9: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UMULH2]], [[AND1]]
+    ; GFX9: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[AND]], [[MUL1]]
+    ; GFX9: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SUB2]](s32), [[AND1]]
+    ; GFX9: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[AND]](s32), [[MUL1]]
+    ; GFX9: [[AND2:%[0-9]+]]:_(s1) = G_AND [[ICMP1]], [[ICMP2]]
+    ; GFX9: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX9: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[UMULH2]], [[C4]]
+    ; GFX9: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[UMULH2]], [[C4]]
+    ; GFX9: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[AND2]](s1), [[UMULH2]], [[ADD1]]
+    ; GFX9: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s1), [[SELECT2]], [[SUB3]]
     ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
-    ; GFX9: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]]
+    ; GFX9: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]]
     ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
-    ; GFX9: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C1]]
-    ; GFX9: [[UDIV1:%[0-9]+]]:_(s32) = G_UDIV [[AND2]], [[AND3]]
-    ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY [[UDIV]](s32)
-    ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY [[UDIV1]](s32)
+    ; GFX9: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C1]]
+    ; GFX9: [[UITOFP1:%[0-9]+]]:_(s32) = G_UITOFP [[AND4]](s32)
+    ; GFX9: [[AMDGPU_RCP_IFLAG1:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[UITOFP1]](s32)
+    ; GFX9: [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000
+    ; GFX9: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG1]], [[C5]]
+    ; GFX9: [[FPTOUI1:%[0-9]+]]:_(s32) = G_FPTOUI [[FMUL1]](s32)
+    ; GFX9: [[MUL2:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI1]], [[AND4]]
+    ; GFX9: [[UMULH3:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI1]], [[AND4]]
+    ; GFX9: [[SUB4:%[0-9]+]]:_(s32) = G_SUB [[C3]], [[MUL2]]
+    ; GFX9: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[UMULH3]](s32), [[C3]]
+    ; GFX9: [[SELECT4:%[0-9]+]]:_(s32) = G_SELECT [[ICMP3]](s1), [[SUB4]], [[MUL2]]
+    ; GFX9: [[UMULH4:%[0-9]+]]:_(s32) = G_UMULH [[SELECT4]], [[FPTOUI1]]
+    ; GFX9: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], [[UMULH4]]
+    ; GFX9: [[SUB5:%[0-9]+]]:_(s32) = G_SUB [[FPTOUI1]], [[UMULH4]]
+    ; GFX9: [[SELECT5:%[0-9]+]]:_(s32) = G_SELECT [[ICMP3]](s1), [[ADD2]], [[SUB5]]
+    ; GFX9: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[SELECT5]], [[AND3]]
+    ; GFX9: [[MUL3:%[0-9]+]]:_(s32) = G_MUL [[UMULH5]], [[AND4]]
+    ; GFX9: [[SUB6:%[0-9]+]]:_(s32) = G_SUB [[AND3]], [[MUL3]]
+    ; GFX9: [[ICMP4:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SUB6]](s32), [[AND4]]
+    ; GFX9: [[ICMP5:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[AND3]](s32), [[MUL3]]
+    ; GFX9: [[AND5:%[0-9]+]]:_(s1) = G_AND [[ICMP4]], [[ICMP5]]
+    ; GFX9: [[ADD3:%[0-9]+]]:_(s32) = G_ADD [[UMULH5]], [[C4]]
+    ; GFX9: [[SUB7:%[0-9]+]]:_(s32) = G_SUB [[UMULH5]], [[C4]]
+    ; GFX9: [[SELECT6:%[0-9]+]]:_(s32) = G_SELECT [[AND5]](s1), [[UMULH5]], [[ADD3]]
+    ; GFX9: [[SELECT7:%[0-9]+]]:_(s32) = G_SELECT [[ICMP5]](s1), [[SELECT6]], [[SUB7]]
+    ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY [[SELECT3]](s32)
+    ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY [[SELECT7]](s32)
     ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY6]](s32), [[COPY7]](s32)
     ; GFX9: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>)
     %0:_(<2 x s16>) = COPY $vgpr0
@@ -289,8 +727,33 @@ body: |
     ; GFX6: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C]]
     ; GFX6: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
     ; GFX6: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C]]
-    ; GFX6: [[UDIV:%[0-9]+]]:_(s32) = G_UDIV [[AND]], [[AND1]]
-    ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY [[UDIV]](s32)
+    ; GFX6: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[AND1]](s32)
+    ; GFX6: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[UITOFP]](s32)
+    ; GFX6: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000
+    ; GFX6: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C1]]
+    ; GFX6: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI [[FMUL]](s32)
+    ; GFX6: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI]], [[AND1]]
+    ; GFX6: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI]], [[AND1]]
+    ; GFX6: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GFX6: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C2]], [[MUL]]
+    ; GFX6: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[UMULH]](s32), [[C2]]
+    ; GFX6: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[SUB]], [[MUL]]
+    ; GFX6: [[UMULH1:%[0-9]+]]:_(s32) = G_UMULH [[SELECT]], [[FPTOUI]]
+    ; GFX6: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI]], [[UMULH1]]
+    ; GFX6: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[FPTOUI]], [[UMULH1]]
+    ; GFX6: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[ADD]], [[SUB1]]
+    ; GFX6: [[UMULH2:%[0-9]+]]:_(s32) = G_UMULH [[SELECT1]], [[AND]]
+    ; GFX6: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UMULH2]], [[AND1]]
+    ; GFX6: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[AND]], [[MUL1]]
+    ; GFX6: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SUB2]](s32), [[AND1]]
+    ; GFX6: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[AND]](s32), [[MUL1]]
+    ; GFX6: [[AND2:%[0-9]+]]:_(s1) = G_AND [[ICMP1]], [[ICMP2]]
+    ; GFX6: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX6: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[UMULH2]], [[C3]]
+    ; GFX6: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[UMULH2]], [[C3]]
+    ; GFX6: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[AND2]](s1), [[UMULH2]], [[ADD1]]
+    ; GFX6: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s1), [[SELECT2]], [[SUB3]]
+    ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY [[SELECT3]](s32)
     ; GFX6: $vgpr0 = COPY [[COPY4]](s32)
     ; GFX8-LABEL: name: test_udiv_s7
     ; GFX8: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
@@ -300,8 +763,33 @@ body: |
     ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C]]
     ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
     ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C]]
-    ; GFX8: [[UDIV:%[0-9]+]]:_(s32) = G_UDIV [[AND]], [[AND1]]
-    ; GFX8: [[COPY4:%[0-9]+]]:_(s32) = COPY [[UDIV]](s32)
+    ; GFX8: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[AND1]](s32)
+    ; GFX8: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[UITOFP]](s32)
+    ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000
+    ; GFX8: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C1]]
+    ; GFX8: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI [[FMUL]](s32)
+    ; GFX8: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI]], [[AND1]]
+    ; GFX8: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI]], [[AND1]]
+    ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GFX8: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C2]], [[MUL]]
+    ; GFX8: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[UMULH]](s32), [[C2]]
+    ; GFX8: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[SUB]], [[MUL]]
+    ; GFX8: [[UMULH1:%[0-9]+]]:_(s32) = G_UMULH [[SELECT]], [[FPTOUI]]
+    ; GFX8: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI]], [[UMULH1]]
+    ; GFX8: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[FPTOUI]], [[UMULH1]]
+    ; GFX8: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[ADD]], [[SUB1]]
+    ; GFX8: [[UMULH2:%[0-9]+]]:_(s32) = G_UMULH [[SELECT1]], [[AND]]
+    ; GFX8: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UMULH2]], [[AND1]]
+    ; GFX8: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[AND]], [[MUL1]]
+    ; GFX8: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SUB2]](s32), [[AND1]]
+    ; GFX8: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[AND]](s32), [[MUL1]]
+    ; GFX8: [[AND2:%[0-9]+]]:_(s1) = G_AND [[ICMP1]], [[ICMP2]]
+    ; GFX8: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX8: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[UMULH2]], [[C3]]
+    ; GFX8: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[UMULH2]], [[C3]]
+    ; GFX8: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[AND2]](s1), [[UMULH2]], [[ADD1]]
+    ; GFX8: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s1), [[SELECT2]], [[SUB3]]
+    ; GFX8: [[COPY4:%[0-9]+]]:_(s32) = COPY [[SELECT3]](s32)
     ; GFX8: $vgpr0 = COPY [[COPY4]](s32)
     ; GFX9-LABEL: name: test_udiv_s7
     ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
@@ -311,8 +799,33 @@ body: |
     ; GFX9: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C]]
     ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
     ; GFX9: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C]]
-    ; GFX9: [[UDIV:%[0-9]+]]:_(s32) = G_UDIV [[AND]], [[AND1]]
-    ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY [[UDIV]](s32)
+    ; GFX9: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[AND1]](s32)
+    ; GFX9: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[UITOFP]](s32)
+    ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000
+    ; GFX9: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C1]]
+    ; GFX9: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI [[FMUL]](s32)
+    ; GFX9: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI]], [[AND1]]
+    ; GFX9: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI]], [[AND1]]
+    ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GFX9: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C2]], [[MUL]]
+    ; GFX9: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[UMULH]](s32), [[C2]]
+    ; GFX9: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[SUB]], [[MUL]]
+    ; GFX9: [[UMULH1:%[0-9]+]]:_(s32) = G_UMULH [[SELECT]], [[FPTOUI]]
+    ; GFX9: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI]], [[UMULH1]]
+    ; GFX9: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[FPTOUI]], [[UMULH1]]
+    ; GFX9: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[ADD]], [[SUB1]]
+    ; GFX9: [[UMULH2:%[0-9]+]]:_(s32) = G_UMULH [[SELECT1]], [[AND]]
+    ; GFX9: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UMULH2]], [[AND1]]
+    ; GFX9: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[AND]], [[MUL1]]
+    ; GFX9: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SUB2]](s32), [[AND1]]
+    ; GFX9: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[AND]](s32), [[MUL1]]
+    ; GFX9: [[AND2:%[0-9]+]]:_(s1) = G_AND [[ICMP1]], [[ICMP2]]
+    ; GFX9: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX9: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[UMULH2]], [[C3]]
+    ; GFX9: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[UMULH2]], [[C3]]
+    ; GFX9: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[AND2]](s1), [[UMULH2]], [[ADD1]]
+    ; GFX9: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s1), [[SELECT2]], [[SUB3]]
+    ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY [[SELECT3]](s32)
     ; GFX9: $vgpr0 = COPY [[COPY4]](s32)
     %0:_(s32) = COPY $vgpr0
     %1:_(s32) = COPY $vgpr1
@@ -337,8 +850,33 @@ body: |
     ; GFX6: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C]]
     ; GFX6: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
     ; GFX6: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C]]
-    ; GFX6: [[UDIV:%[0-9]+]]:_(s32) = G_UDIV [[AND]], [[AND1]]
-    ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY [[UDIV]](s32)
+    ; GFX6: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[AND1]](s32)
+    ; GFX6: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[UITOFP]](s32)
+    ; GFX6: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000
+    ; GFX6: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C1]]
+    ; GFX6: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI [[FMUL]](s32)
+    ; GFX6: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI]], [[AND1]]
+    ; GFX6: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI]], [[AND1]]
+    ; GFX6: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GFX6: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C2]], [[MUL]]
+    ; GFX6: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[UMULH]](s32), [[C2]]
+    ; GFX6: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[SUB]], [[MUL]]
+    ; GFX6: [[UMULH1:%[0-9]+]]:_(s32) = G_UMULH [[SELECT]], [[FPTOUI]]
+    ; GFX6: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI]], [[UMULH1]]
+    ; GFX6: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[FPTOUI]], [[UMULH1]]
+    ; GFX6: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[ADD]], [[SUB1]]
+    ; GFX6: [[UMULH2:%[0-9]+]]:_(s32) = G_UMULH [[SELECT1]], [[AND]]
+    ; GFX6: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UMULH2]], [[AND1]]
+    ; GFX6: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[AND]], [[MUL1]]
+    ; GFX6: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SUB2]](s32), [[AND1]]
+    ; GFX6: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[AND]](s32), [[MUL1]]
+    ; GFX6: [[AND2:%[0-9]+]]:_(s1) = G_AND [[ICMP1]], [[ICMP2]]
+    ; GFX6: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX6: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[UMULH2]], [[C3]]
+    ; GFX6: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[UMULH2]], [[C3]]
+    ; GFX6: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[AND2]](s1), [[UMULH2]], [[ADD1]]
+    ; GFX6: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s1), [[SELECT2]], [[SUB3]]
+    ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY [[SELECT3]](s32)
     ; GFX6: $vgpr0 = COPY [[COPY4]](s32)
     ; GFX8-LABEL: name: test_udiv_s17
     ; GFX8: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
@@ -348,8 +886,33 @@ body: |
     ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C]]
     ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
     ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C]]
-    ; GFX8: [[UDIV:%[0-9]+]]:_(s32) = G_UDIV [[AND]], [[AND1]]
-    ; GFX8: [[COPY4:%[0-9]+]]:_(s32) = COPY [[UDIV]](s32)
+    ; GFX8: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[AND1]](s32)
+    ; GFX8: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[UITOFP]](s32)
+    ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000
+    ; GFX8: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C1]]
+    ; GFX8: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI [[FMUL]](s32)
+    ; GFX8: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI]], [[AND1]]
+    ; GFX8: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI]], [[AND1]]
+    ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GFX8: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C2]], [[MUL]]
+    ; GFX8: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[UMULH]](s32), [[C2]]
+    ; GFX8: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[SUB]], [[MUL]]
+    ; GFX8: [[UMULH1:%[0-9]+]]:_(s32) = G_UMULH [[SELECT]], [[FPTOUI]]
+    ; GFX8: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI]], [[UMULH1]]
+    ; GFX8: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[FPTOUI]], [[UMULH1]]
+    ; GFX8: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[ADD]], [[SUB1]]
+    ; GFX8: [[UMULH2:%[0-9]+]]:_(s32) = G_UMULH [[SELECT1]], [[AND]]
+    ; GFX8: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UMULH2]], [[AND1]]
+    ; GFX8: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[AND]], [[MUL1]]
+    ; GFX8: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SUB2]](s32), [[AND1]]
+    ; GFX8: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[AND]](s32), [[MUL1]]
+    ; GFX8: [[AND2:%[0-9]+]]:_(s1) = G_AND [[ICMP1]], [[ICMP2]]
+    ; GFX8: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX8: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[UMULH2]], [[C3]]
+    ; GFX8: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[UMULH2]], [[C3]]
+    ; GFX8: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[AND2]](s1), [[UMULH2]], [[ADD1]]
+    ; GFX8: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s1), [[SELECT2]], [[SUB3]]
+    ; GFX8: [[COPY4:%[0-9]+]]:_(s32) = COPY [[SELECT3]](s32)
     ; GFX8: $vgpr0 = COPY [[COPY4]](s32)
     ; GFX9-LABEL: name: test_udiv_s17
     ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
@@ -359,8 +922,33 @@ body: |
     ; GFX9: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C]]
     ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
     ; GFX9: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C]]
-    ; GFX9: [[UDIV:%[0-9]+]]:_(s32) = G_UDIV [[AND]], [[AND1]]
-    ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY [[UDIV]](s32)
+    ; GFX9: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[AND1]](s32)
+    ; GFX9: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[UITOFP]](s32)
+    ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000
+    ; GFX9: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C1]]
+    ; GFX9: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI [[FMUL]](s32)
+    ; GFX9: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI]], [[AND1]]
+    ; GFX9: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI]], [[AND1]]
+    ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GFX9: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C2]], [[MUL]]
+    ; GFX9: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[UMULH]](s32), [[C2]]
+    ; GFX9: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[SUB]], [[MUL]]
+    ; GFX9: [[UMULH1:%[0-9]+]]:_(s32) = G_UMULH [[SELECT]], [[FPTOUI]]
+    ; GFX9: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI]], [[UMULH1]]
+    ; GFX9: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[FPTOUI]], [[UMULH1]]
+    ; GFX9: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[ADD]], [[SUB1]]
+    ; GFX9: [[UMULH2:%[0-9]+]]:_(s32) = G_UMULH [[SELECT1]], [[AND]]
+    ; GFX9: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UMULH2]], [[AND1]]
+    ; GFX9: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[AND]], [[MUL1]]
+    ; GFX9: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SUB2]](s32), [[AND1]]
+    ; GFX9: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[AND]](s32), [[MUL1]]
+    ; GFX9: [[AND2:%[0-9]+]]:_(s1) = G_AND [[ICMP1]], [[ICMP2]]
+    ; GFX9: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX9: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[UMULH2]], [[C3]]
+    ; GFX9: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[UMULH2]], [[C3]]
+    ; GFX9: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[AND2]](s1), [[UMULH2]], [[ADD1]]
+    ; GFX9: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s1), [[SELECT2]], [[SUB3]]
+    ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY [[SELECT3]](s32)
     ; GFX9: $vgpr0 = COPY [[COPY4]](s32)
     %0:_(s32) = COPY $vgpr0
     %1:_(s32) = COPY $vgpr1
@@ -380,36 +968,36 @@ body: |
     ; GFX6-LABEL: name: test_udiv_s33
     ; GFX6: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
     ; GFX6: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
-    ; GFX6: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 8589934591
-    ; GFX6: [[COPY2:%[0-9]+]]:_(s64) = COPY [[COPY]](s64)
-    ; GFX6: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY2]], [[C]]
-    ; GFX6: [[COPY3:%[0-9]+]]:_(s64) = COPY [[COPY1]](s64)
-    ; GFX6: [[AND1:%[0-9]+]]:_(s64) = G_AND [[COPY3]], [[C]]
-    ; GFX6: [[UDIV:%[0-9]+]]:_(s64) = G_UDIV [[AND]], [[AND1]]
-    ; GFX6: [[COPY4:%[0-9]+]]:_(s64) = COPY [[UDIV]](s64)
-    ; GFX6: $vgpr0_vgpr1 = COPY [[COPY4]](s64)
+    ; GFX6: [[TRUNC:%[0-9]+]]:_(s33) = G_TRUNC [[COPY]](s64)
+    ; GFX6: [[TRUNC1:%[0-9]+]]:_(s33) = G_TRUNC [[COPY1]](s64)
+    ; GFX6: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[TRUNC]](s33)
+    ; GFX6: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[TRUNC1]](s33)
+    ; GFX6: [[UDIV:%[0-9]+]]:_(s64) = G_UDIV [[ZEXT]], [[ZEXT1]]
+    ; GFX6: [[TRUNC2:%[0-9]+]]:_(s33) = G_TRUNC [[UDIV]](s64)
+    ; GFX6: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[TRUNC2]](s33)
+    ; GFX6: $vgpr0_vgpr1 = COPY [[ANYEXT]](s64)
     ; GFX8-LABEL: name: test_udiv_s33
     ; GFX8: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
     ; GFX8: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
-    ; GFX8: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 8589934591
-    ; GFX8: [[COPY2:%[0-9]+]]:_(s64) = COPY [[COPY]](s64)
-    ; GFX8: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY2]], [[C]]
-    ; GFX8: [[COPY3:%[0-9]+]]:_(s64) = COPY [[COPY1]](s64)
-    ; GFX8: [[AND1:%[0-9]+]]:_(s64) = G_AND [[COPY3]], [[C]]
-    ; GFX8: [[UDIV:%[0-9]+]]:_(s64) = G_UDIV [[AND]], [[AND1]]
-    ; GFX8: [[COPY4:%[0-9]+]]:_(s64) = COPY [[UDIV]](s64)
-    ; GFX8: $vgpr0_vgpr1 = COPY [[COPY4]](s64)
+    ; GFX8: [[TRUNC:%[0-9]+]]:_(s33) = G_TRUNC [[COPY]](s64)
+    ; GFX8: [[TRUNC1:%[0-9]+]]:_(s33) = G_TRUNC [[COPY1]](s64)
+    ; GFX8: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[TRUNC]](s33)
+    ; GFX8: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[TRUNC1]](s33)
+    ; GFX8: [[UDIV:%[0-9]+]]:_(s64) = G_UDIV [[ZEXT]], [[ZEXT1]]
+    ; GFX8: [[TRUNC2:%[0-9]+]]:_(s33) = G_TRUNC [[UDIV]](s64)
+    ; GFX8: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[TRUNC2]](s33)
+    ; GFX8: $vgpr0_vgpr1 = COPY [[ANYEXT]](s64)
     ; GFX9-LABEL: name: test_udiv_s33
     ; GFX9: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
     ; GFX9: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
-    ; GFX9: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 8589934591
-    ; GFX9: [[COPY2:%[0-9]+]]:_(s64) = COPY [[COPY]](s64)
-    ; GFX9: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY2]], [[C]]
-    ; GFX9: [[COPY3:%[0-9]+]]:_(s64) = COPY [[COPY1]](s64)
-    ; GFX9: [[AND1:%[0-9]+]]:_(s64) = G_AND [[COPY3]], [[C]]
-    ; GFX9: [[UDIV:%[0-9]+]]:_(s64) = G_UDIV [[AND]], [[AND1]]
-    ; GFX9: [[COPY4:%[0-9]+]]:_(s64) = COPY [[UDIV]](s64)
-    ; GFX9: $vgpr0_vgpr1 = COPY [[COPY4]](s64)
+    ; GFX9: [[TRUNC:%[0-9]+]]:_(s33) = G_TRUNC [[COPY]](s64)
+    ; GFX9: [[TRUNC1:%[0-9]+]]:_(s33) = G_TRUNC [[COPY1]](s64)
+    ; GFX9: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[TRUNC]](s33)
+    ; GFX9: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[TRUNC1]](s33)
+    ; GFX9: [[UDIV:%[0-9]+]]:_(s64) = G_UDIV [[ZEXT]], [[ZEXT1]]
+    ; GFX9: [[TRUNC2:%[0-9]+]]:_(s33) = G_TRUNC [[UDIV]](s64)
+    ; GFX9: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[TRUNC2]](s33)
+    ; GFX9: $vgpr0_vgpr1 = COPY [[ANYEXT]](s64)
     %0:_(s64) = COPY $vgpr0_vgpr1
     %1:_(s64) = COPY $vgpr2_vgpr3
     %2:_(s33) = G_TRUNC %0
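
A note on how the remaining udiv cases funnel into the same expansion: the
<2 x s32> test just scalarizes into two copies of it, and the s16, s7 and
s17 tests zero-extend both operands to s32 first, so the G_AND masks
wrapped around the expansion in those checks are the zero-extension. Only
s33 goes the other way, clamping up to s64, which is why its checks still
end in an s64 G_UDIV (now reached through explicit s33 trunc/zext steps
rather than a 64-bit mask); this patch only expands the 32-bit case. A
sketch of the narrow-type path, reusing the hypothetical udiv32_rcp model
from the notes above:

    // Narrow types reach the 32-bit expansion by zero-extension (a sketch
    // of the clampScalar/widenScalarToNextPow2 effect, not backend code;
    // udiv_narrow is an invented name).
    static uint32_t udiv_narrow(uint32_t Num, uint32_t Den, unsigned Bits) {
      uint32_t Mask = (1u << Bits) - 1;  // Bits is 7, 16, or 17 here
      // The paired G_ANDs in the checks are these two masks. The quotient
      // of in-range operands is itself in range, so the result re-mask the
      // s16 test performs is already a no-op.
      return udiv32_rcp(Num & Mask, Den & Mask) & Mask;
    }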

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-urem.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-urem.mir
index efaa4f39b190..973cbb61c588 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-urem.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-urem.mir
@@ -12,18 +12,90 @@ body: |
     ; GFX6-LABEL: name: test_urem_s32
     ; GFX6: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX6: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
-    ; GFX6: [[UREM:%[0-9]+]]:_(s32) = G_UREM [[COPY]], [[COPY1]]
-    ; GFX6: $vgpr0 = COPY [[UREM]](s32)
+    ; GFX6: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[COPY1]](s32)
+    ; GFX6: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[UITOFP]](s32)
+    ; GFX6: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000
+    ; GFX6: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C]]
+    ; GFX6: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI [[FMUL]](s32)
+    ; GFX6: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI]], [[COPY1]]
+    ; GFX6: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI]], [[COPY1]]
+    ; GFX6: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GFX6: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C1]], [[MUL]]
+    ; GFX6: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[UMULH]](s32), [[C1]]
+    ; GFX6: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[SUB]], [[MUL]]
+    ; GFX6: [[UMULH1:%[0-9]+]]:_(s32) = G_UMULH [[SELECT]], [[FPTOUI]]
+    ; GFX6: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI]], [[UMULH1]]
+    ; GFX6: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[FPTOUI]], [[UMULH1]]
+    ; GFX6: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[ADD]], [[SUB1]]
+    ; GFX6: [[UMULH2:%[0-9]+]]:_(s32) = G_UMULH [[SELECT1]], [[COPY]]
+    ; GFX6: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UMULH2]], [[COPY1]]
+    ; GFX6: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[COPY]], [[MUL1]]
+    ; GFX6: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SUB2]](s32), [[COPY1]]
+    ; GFX6: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[COPY]](s32), [[MUL1]]
+    ; GFX6: [[AND:%[0-9]+]]:_(s1) = G_AND [[ICMP1]], [[ICMP2]]
+    ; GFX6: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[SUB2]], [[COPY1]]
+    ; GFX6: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[SUB2]], [[COPY1]]
+    ; GFX6: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[AND]](s1), [[SUB3]], [[SUB2]]
+    ; GFX6: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s1), [[SELECT2]], [[ADD1]]
+    ; GFX6: $vgpr0 = COPY [[SELECT3]](s32)
     ; GFX8-LABEL: name: test_urem_s32
     ; GFX8: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX8: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
-    ; GFX8: [[UREM:%[0-9]+]]:_(s32) = G_UREM [[COPY]], [[COPY1]]
-    ; GFX8: $vgpr0 = COPY [[UREM]](s32)
+    ; GFX8: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[COPY1]](s32)
+    ; GFX8: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[UITOFP]](s32)
+    ; GFX8: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000
+    ; GFX8: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C]]
+    ; GFX8: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI [[FMUL]](s32)
+    ; GFX8: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI]], [[COPY1]]
+    ; GFX8: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI]], [[COPY1]]
+    ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GFX8: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C1]], [[MUL]]
+    ; GFX8: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[UMULH]](s32), [[C1]]
+    ; GFX8: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[SUB]], [[MUL]]
+    ; GFX8: [[UMULH1:%[0-9]+]]:_(s32) = G_UMULH [[SELECT]], [[FPTOUI]]
+    ; GFX8: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI]], [[UMULH1]]
+    ; GFX8: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[FPTOUI]], [[UMULH1]]
+    ; GFX8: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[ADD]], [[SUB1]]
+    ; GFX8: [[UMULH2:%[0-9]+]]:_(s32) = G_UMULH [[SELECT1]], [[COPY]]
+    ; GFX8: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UMULH2]], [[COPY1]]
+    ; GFX8: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[COPY]], [[MUL1]]
+    ; GFX8: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SUB2]](s32), [[COPY1]]
+    ; GFX8: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[COPY]](s32), [[MUL1]]
+    ; GFX8: [[AND:%[0-9]+]]:_(s1) = G_AND [[ICMP1]], [[ICMP2]]
+    ; GFX8: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[SUB2]], [[COPY1]]
+    ; GFX8: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[SUB2]], [[COPY1]]
+    ; GFX8: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[AND]](s1), [[SUB3]], [[SUB2]]
+    ; GFX8: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s1), [[SELECT2]], [[ADD1]]
+    ; GFX8: $vgpr0 = COPY [[SELECT3]](s32)
     ; GFX9-LABEL: name: test_urem_s32
     ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
-    ; GFX9: [[UREM:%[0-9]+]]:_(s32) = G_UREM [[COPY]], [[COPY1]]
-    ; GFX9: $vgpr0 = COPY [[UREM]](s32)
+    ; GFX9: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[COPY1]](s32)
+    ; GFX9: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[UITOFP]](s32)
+    ; GFX9: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000
+    ; GFX9: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C]]
+    ; GFX9: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI [[FMUL]](s32)
+    ; GFX9: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI]], [[COPY1]]
+    ; GFX9: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI]], [[COPY1]]
+    ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GFX9: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C1]], [[MUL]]
+    ; GFX9: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[UMULH]](s32), [[C1]]
+    ; GFX9: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[SUB]], [[MUL]]
+    ; GFX9: [[UMULH1:%[0-9]+]]:_(s32) = G_UMULH [[SELECT]], [[FPTOUI]]
+    ; GFX9: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI]], [[UMULH1]]
+    ; GFX9: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[FPTOUI]], [[UMULH1]]
+    ; GFX9: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[ADD]], [[SUB1]]
+    ; GFX9: [[UMULH2:%[0-9]+]]:_(s32) = G_UMULH [[SELECT1]], [[COPY]]
+    ; GFX9: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UMULH2]], [[COPY1]]
+    ; GFX9: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[COPY]], [[MUL1]]
+    ; GFX9: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SUB2]](s32), [[COPY1]]
+    ; GFX9: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[COPY]](s32), [[MUL1]]
+    ; GFX9: [[AND:%[0-9]+]]:_(s1) = G_AND [[ICMP1]], [[ICMP2]]
+    ; GFX9: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[SUB2]], [[COPY1]]
+    ; GFX9: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[SUB2]], [[COPY1]]
+    ; GFX9: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[AND]](s1), [[SUB3]], [[SUB2]]
+    ; GFX9: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s1), [[SELECT2]], [[ADD1]]
+    ; GFX9: $vgpr0 = COPY [[SELECT3]](s32)
     %0:_(s32) = COPY $vgpr0
     %1:_(s32) = COPY $vgpr1
     %2:_(s32) = G_UREM %0, %1
@@ -41,27 +113,168 @@ body: |
     ; GFX6: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr2_vgpr3
     ; GFX6: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>)
     ; GFX6: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>)
-    ; GFX6: [[UREM:%[0-9]+]]:_(s32) = G_UREM [[UV]], [[UV2]]
-    ; GFX6: [[UREM1:%[0-9]+]]:_(s32) = G_UREM [[UV1]], [[UV3]]
-    ; GFX6: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[UREM]](s32), [[UREM1]](s32)
+    ; GFX6: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[UV2]](s32)
+    ; GFX6: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[UITOFP]](s32)
+    ; GFX6: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000
+    ; GFX6: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C]]
+    ; GFX6: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI [[FMUL]](s32)
+    ; GFX6: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI]], [[UV2]]
+    ; GFX6: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI]], [[UV2]]
+    ; GFX6: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GFX6: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C1]], [[MUL]]
+    ; GFX6: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[UMULH]](s32), [[C1]]
+    ; GFX6: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[SUB]], [[MUL]]
+    ; GFX6: [[UMULH1:%[0-9]+]]:_(s32) = G_UMULH [[SELECT]], [[FPTOUI]]
+    ; GFX6: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI]], [[UMULH1]]
+    ; GFX6: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[FPTOUI]], [[UMULH1]]
+    ; GFX6: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[ADD]], [[SUB1]]
+    ; GFX6: [[UMULH2:%[0-9]+]]:_(s32) = G_UMULH [[SELECT1]], [[UV]]
+    ; GFX6: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UMULH2]], [[UV2]]
+    ; GFX6: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[UV]], [[MUL1]]
+    ; GFX6: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SUB2]](s32), [[UV2]]
+    ; GFX6: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[UV]](s32), [[MUL1]]
+    ; GFX6: [[AND:%[0-9]+]]:_(s1) = G_AND [[ICMP1]], [[ICMP2]]
+    ; GFX6: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[SUB2]], [[UV2]]
+    ; GFX6: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[SUB2]], [[UV2]]
+    ; GFX6: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[AND]](s1), [[SUB3]], [[SUB2]]
+    ; GFX6: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s1), [[SELECT2]], [[ADD1]]
+    ; GFX6: [[UITOFP1:%[0-9]+]]:_(s32) = G_UITOFP [[UV3]](s32)
+    ; GFX6: [[AMDGPU_RCP_IFLAG1:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[UITOFP1]](s32)
+    ; GFX6: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000
+    ; GFX6: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG1]], [[C2]]
+    ; GFX6: [[FPTOUI1:%[0-9]+]]:_(s32) = G_FPTOUI [[FMUL1]](s32)
+    ; GFX6: [[MUL2:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI1]], [[UV3]]
+    ; GFX6: [[UMULH3:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI1]], [[UV3]]
+    ; GFX6: [[SUB4:%[0-9]+]]:_(s32) = G_SUB [[C1]], [[MUL2]]
+    ; GFX6: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[UMULH3]](s32), [[C1]]
+    ; GFX6: [[SELECT4:%[0-9]+]]:_(s32) = G_SELECT [[ICMP3]](s1), [[SUB4]], [[MUL2]]
+    ; GFX6: [[UMULH4:%[0-9]+]]:_(s32) = G_UMULH [[SELECT4]], [[FPTOUI1]]
+    ; GFX6: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], [[UMULH4]]
+    ; GFX6: [[SUB5:%[0-9]+]]:_(s32) = G_SUB [[FPTOUI1]], [[UMULH4]]
+    ; GFX6: [[SELECT5:%[0-9]+]]:_(s32) = G_SELECT [[ICMP3]](s1), [[ADD2]], [[SUB5]]
+    ; GFX6: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[SELECT5]], [[UV1]]
+    ; GFX6: [[MUL3:%[0-9]+]]:_(s32) = G_MUL [[UMULH5]], [[UV3]]
+    ; GFX6: [[SUB6:%[0-9]+]]:_(s32) = G_SUB [[UV1]], [[MUL3]]
+    ; GFX6: [[ICMP4:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SUB6]](s32), [[UV3]]
+    ; GFX6: [[ICMP5:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[UV1]](s32), [[MUL3]]
+    ; GFX6: [[AND1:%[0-9]+]]:_(s1) = G_AND [[ICMP4]], [[ICMP5]]
+    ; GFX6: [[SUB7:%[0-9]+]]:_(s32) = G_SUB [[SUB6]], [[UV3]]
+    ; GFX6: [[ADD3:%[0-9]+]]:_(s32) = G_ADD [[SUB6]], [[UV3]]
+    ; GFX6: [[SELECT6:%[0-9]+]]:_(s32) = G_SELECT [[AND1]](s1), [[SUB7]], [[SUB6]]
+    ; GFX6: [[SELECT7:%[0-9]+]]:_(s32) = G_SELECT [[ICMP5]](s1), [[SELECT6]], [[ADD3]]
+    ; GFX6: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[SELECT3]](s32), [[SELECT7]](s32)
     ; GFX6: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
     ; GFX8-LABEL: name: test_urem_v2s32
     ; GFX8: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1
     ; GFX8: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr2_vgpr3
     ; GFX8: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>)
     ; GFX8: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>)
-    ; GFX8: [[UREM:%[0-9]+]]:_(s32) = G_UREM [[UV]], [[UV2]]
-    ; GFX8: [[UREM1:%[0-9]+]]:_(s32) = G_UREM [[UV1]], [[UV3]]
-    ; GFX8: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[UREM]](s32), [[UREM1]](s32)
+    ; GFX8: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[UV2]](s32)
+    ; GFX8: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[UITOFP]](s32)
+    ; GFX8: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000
+    ; GFX8: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C]]
+    ; GFX8: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI [[FMUL]](s32)
+    ; GFX8: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI]], [[UV2]]
+    ; GFX8: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI]], [[UV2]]
+    ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GFX8: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C1]], [[MUL]]
+    ; GFX8: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[UMULH]](s32), [[C1]]
+    ; GFX8: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[SUB]], [[MUL]]
+    ; GFX8: [[UMULH1:%[0-9]+]]:_(s32) = G_UMULH [[SELECT]], [[FPTOUI]]
+    ; GFX8: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI]], [[UMULH1]]
+    ; GFX8: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[FPTOUI]], [[UMULH1]]
+    ; GFX8: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[ADD]], [[SUB1]]
+    ; GFX8: [[UMULH2:%[0-9]+]]:_(s32) = G_UMULH [[SELECT1]], [[UV]]
+    ; GFX8: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UMULH2]], [[UV2]]
+    ; GFX8: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[UV]], [[MUL1]]
+    ; GFX8: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SUB2]](s32), [[UV2]]
+    ; GFX8: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[UV]](s32), [[MUL1]]
+    ; GFX8: [[AND:%[0-9]+]]:_(s1) = G_AND [[ICMP1]], [[ICMP2]]
+    ; GFX8: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[SUB2]], [[UV2]]
+    ; GFX8: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[SUB2]], [[UV2]]
+    ; GFX8: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[AND]](s1), [[SUB3]], [[SUB2]]
+    ; GFX8: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s1), [[SELECT2]], [[ADD1]]
+    ; GFX8: [[UITOFP1:%[0-9]+]]:_(s32) = G_UITOFP [[UV3]](s32)
+    ; GFX8: [[AMDGPU_RCP_IFLAG1:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[UITOFP1]](s32)
+    ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000
+    ; GFX8: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG1]], [[C2]]
+    ; GFX8: [[FPTOUI1:%[0-9]+]]:_(s32) = G_FPTOUI [[FMUL1]](s32)
+    ; GFX8: [[MUL2:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI1]], [[UV3]]
+    ; GFX8: [[UMULH3:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI1]], [[UV3]]
+    ; GFX8: [[SUB4:%[0-9]+]]:_(s32) = G_SUB [[C1]], [[MUL2]]
+    ; GFX8: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[UMULH3]](s32), [[C1]]
+    ; GFX8: [[SELECT4:%[0-9]+]]:_(s32) = G_SELECT [[ICMP3]](s1), [[SUB4]], [[MUL2]]
+    ; GFX8: [[UMULH4:%[0-9]+]]:_(s32) = G_UMULH [[SELECT4]], [[FPTOUI1]]
+    ; GFX8: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], [[UMULH4]]
+    ; GFX8: [[SUB5:%[0-9]+]]:_(s32) = G_SUB [[FPTOUI1]], [[UMULH4]]
+    ; GFX8: [[SELECT5:%[0-9]+]]:_(s32) = G_SELECT [[ICMP3]](s1), [[ADD2]], [[SUB5]]
+    ; GFX8: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[SELECT5]], [[UV1]]
+    ; GFX8: [[MUL3:%[0-9]+]]:_(s32) = G_MUL [[UMULH5]], [[UV3]]
+    ; GFX8: [[SUB6:%[0-9]+]]:_(s32) = G_SUB [[UV1]], [[MUL3]]
+    ; GFX8: [[ICMP4:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SUB6]](s32), [[UV3]]
+    ; GFX8: [[ICMP5:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[UV1]](s32), [[MUL3]]
+    ; GFX8: [[AND1:%[0-9]+]]:_(s1) = G_AND [[ICMP4]], [[ICMP5]]
+    ; GFX8: [[SUB7:%[0-9]+]]:_(s32) = G_SUB [[SUB6]], [[UV3]]
+    ; GFX8: [[ADD3:%[0-9]+]]:_(s32) = G_ADD [[SUB6]], [[UV3]]
+    ; GFX8: [[SELECT6:%[0-9]+]]:_(s32) = G_SELECT [[AND1]](s1), [[SUB7]], [[SUB6]]
+    ; GFX8: [[SELECT7:%[0-9]+]]:_(s32) = G_SELECT [[ICMP5]](s1), [[SELECT6]], [[ADD3]]
+    ; GFX8: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[SELECT3]](s32), [[SELECT7]](s32)
     ; GFX8: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
     ; GFX9-LABEL: name: test_urem_v2s32
     ; GFX9: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1
     ; GFX9: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr2_vgpr3
     ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>)
     ; GFX9: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>)
-    ; GFX9: [[UREM:%[0-9]+]]:_(s32) = G_UREM [[UV]], [[UV2]]
-    ; GFX9: [[UREM1:%[0-9]+]]:_(s32) = G_UREM [[UV1]], [[UV3]]
-    ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[UREM]](s32), [[UREM1]](s32)
+    ; GFX9: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[UV2]](s32)
+    ; GFX9: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[UITOFP]](s32)
+    ; GFX9: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000
+    ; GFX9: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C]]
+    ; GFX9: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI [[FMUL]](s32)
+    ; GFX9: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI]], [[UV2]]
+    ; GFX9: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI]], [[UV2]]
+    ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GFX9: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C1]], [[MUL]]
+    ; GFX9: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[UMULH]](s32), [[C1]]
+    ; GFX9: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[SUB]], [[MUL]]
+    ; GFX9: [[UMULH1:%[0-9]+]]:_(s32) = G_UMULH [[SELECT]], [[FPTOUI]]
+    ; GFX9: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI]], [[UMULH1]]
+    ; GFX9: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[FPTOUI]], [[UMULH1]]
+    ; GFX9: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[ADD]], [[SUB1]]
+    ; GFX9: [[UMULH2:%[0-9]+]]:_(s32) = G_UMULH [[SELECT1]], [[UV]]
+    ; GFX9: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UMULH2]], [[UV2]]
+    ; GFX9: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[UV]], [[MUL1]]
+    ; GFX9: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SUB2]](s32), [[UV2]]
+    ; GFX9: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[UV]](s32), [[MUL1]]
+    ; GFX9: [[AND:%[0-9]+]]:_(s1) = G_AND [[ICMP1]], [[ICMP2]]
+    ; GFX9: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[SUB2]], [[UV2]]
+    ; GFX9: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[SUB2]], [[UV2]]
+    ; GFX9: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[AND]](s1), [[SUB3]], [[SUB2]]
+    ; GFX9: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s1), [[SELECT2]], [[ADD1]]
+    ; GFX9: [[UITOFP1:%[0-9]+]]:_(s32) = G_UITOFP [[UV3]](s32)
+    ; GFX9: [[AMDGPU_RCP_IFLAG1:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[UITOFP1]](s32)
+    ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000
+    ; GFX9: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG1]], [[C2]]
+    ; GFX9: [[FPTOUI1:%[0-9]+]]:_(s32) = G_FPTOUI [[FMUL1]](s32)
+    ; GFX9: [[MUL2:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI1]], [[UV3]]
+    ; GFX9: [[UMULH3:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI1]], [[UV3]]
+    ; GFX9: [[SUB4:%[0-9]+]]:_(s32) = G_SUB [[C1]], [[MUL2]]
+    ; GFX9: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[UMULH3]](s32), [[C1]]
+    ; GFX9: [[SELECT4:%[0-9]+]]:_(s32) = G_SELECT [[ICMP3]](s1), [[SUB4]], [[MUL2]]
+    ; GFX9: [[UMULH4:%[0-9]+]]:_(s32) = G_UMULH [[SELECT4]], [[FPTOUI1]]
+    ; GFX9: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], [[UMULH4]]
+    ; GFX9: [[SUB5:%[0-9]+]]:_(s32) = G_SUB [[FPTOUI1]], [[UMULH4]]
+    ; GFX9: [[SELECT5:%[0-9]+]]:_(s32) = G_SELECT [[ICMP3]](s1), [[ADD2]], [[SUB5]]
+    ; GFX9: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[SELECT5]], [[UV1]]
+    ; GFX9: [[MUL3:%[0-9]+]]:_(s32) = G_MUL [[UMULH5]], [[UV3]]
+    ; GFX9: [[SUB6:%[0-9]+]]:_(s32) = G_SUB [[UV1]], [[MUL3]]
+    ; GFX9: [[ICMP4:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SUB6]](s32), [[UV3]]
+    ; GFX9: [[ICMP5:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[UV1]](s32), [[MUL3]]
+    ; GFX9: [[AND1:%[0-9]+]]:_(s1) = G_AND [[ICMP4]], [[ICMP5]]
+    ; GFX9: [[SUB7:%[0-9]+]]:_(s32) = G_SUB [[SUB6]], [[UV3]]
+    ; GFX9: [[ADD3:%[0-9]+]]:_(s32) = G_ADD [[SUB6]], [[UV3]]
+    ; GFX9: [[SELECT6:%[0-9]+]]:_(s32) = G_SELECT [[AND1]](s1), [[SUB7]], [[SUB6]]
+    ; GFX9: [[SELECT7:%[0-9]+]]:_(s32) = G_SELECT [[ICMP5]](s1), [[SELECT6]], [[ADD3]]
+    ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[SELECT3]](s32), [[SELECT7]](s32)
     ; GFX9: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
     %0:_(<2 x s32>) = COPY $vgpr0_vgpr1
     %1:_(<2 x s32>) = COPY $vgpr2_vgpr3
@@ -149,10 +362,34 @@ body: |
     ; GFX6: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C]]
     ; GFX6: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
     ; GFX6: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C]]
-    ; GFX6: [[UREM:%[0-9]+]]:_(s32) = G_UREM [[AND]], [[AND1]]
-    ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY [[UREM]](s32)
-    ; GFX6: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C]]
-    ; GFX6: $vgpr0 = COPY [[AND2]](s32)
+    ; GFX6: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[AND1]](s32)
+    ; GFX6: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[UITOFP]](s32)
+    ; GFX6: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000
+    ; GFX6: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C1]]
+    ; GFX6: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI [[FMUL]](s32)
+    ; GFX6: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI]], [[AND1]]
+    ; GFX6: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI]], [[AND1]]
+    ; GFX6: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GFX6: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C2]], [[MUL]]
+    ; GFX6: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[UMULH]](s32), [[C2]]
+    ; GFX6: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[SUB]], [[MUL]]
+    ; GFX6: [[UMULH1:%[0-9]+]]:_(s32) = G_UMULH [[SELECT]], [[FPTOUI]]
+    ; GFX6: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI]], [[UMULH1]]
+    ; GFX6: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[FPTOUI]], [[UMULH1]]
+    ; GFX6: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[ADD]], [[SUB1]]
+    ; GFX6: [[UMULH2:%[0-9]+]]:_(s32) = G_UMULH [[SELECT1]], [[AND]]
+    ; GFX6: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UMULH2]], [[AND1]]
+    ; GFX6: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[AND]], [[MUL1]]
+    ; GFX6: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SUB2]](s32), [[AND1]]
+    ; GFX6: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[AND]](s32), [[MUL1]]
+    ; GFX6: [[AND2:%[0-9]+]]:_(s1) = G_AND [[ICMP1]], [[ICMP2]]
+    ; GFX6: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[SUB2]], [[AND1]]
+    ; GFX6: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[SUB2]], [[AND1]]
+    ; GFX6: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[AND2]](s1), [[SUB3]], [[SUB2]]
+    ; GFX6: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s1), [[SELECT2]], [[ADD1]]
+    ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY [[SELECT3]](s32)
+    ; GFX6: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C]]
+    ; GFX6: $vgpr0 = COPY [[AND3]](s32)
     ; GFX8-LABEL: name: test_urem_s16
     ; GFX8: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX8: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -161,10 +398,34 @@ body: |
     ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C]]
     ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
     ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C]]
-    ; GFX8: [[UREM:%[0-9]+]]:_(s32) = G_UREM [[AND]], [[AND1]]
-    ; GFX8: [[COPY4:%[0-9]+]]:_(s32) = COPY [[UREM]](s32)
-    ; GFX8: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C]]
-    ; GFX8: $vgpr0 = COPY [[AND2]](s32)
+    ; GFX8: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[AND1]](s32)
+    ; GFX8: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[UITOFP]](s32)
+    ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000
+    ; GFX8: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C1]]
+    ; GFX8: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI [[FMUL]](s32)
+    ; GFX8: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI]], [[AND1]]
+    ; GFX8: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI]], [[AND1]]
+    ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GFX8: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C2]], [[MUL]]
+    ; GFX8: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[UMULH]](s32), [[C2]]
+    ; GFX8: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[SUB]], [[MUL]]
+    ; GFX8: [[UMULH1:%[0-9]+]]:_(s32) = G_UMULH [[SELECT]], [[FPTOUI]]
+    ; GFX8: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI]], [[UMULH1]]
+    ; GFX8: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[FPTOUI]], [[UMULH1]]
+    ; GFX8: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[ADD]], [[SUB1]]
+    ; GFX8: [[UMULH2:%[0-9]+]]:_(s32) = G_UMULH [[SELECT1]], [[AND]]
+    ; GFX8: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UMULH2]], [[AND1]]
+    ; GFX8: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[AND]], [[MUL1]]
+    ; GFX8: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SUB2]](s32), [[AND1]]
+    ; GFX8: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[AND]](s32), [[MUL1]]
+    ; GFX8: [[AND2:%[0-9]+]]:_(s1) = G_AND [[ICMP1]], [[ICMP2]]
+    ; GFX8: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[SUB2]], [[AND1]]
+    ; GFX8: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[SUB2]], [[AND1]]
+    ; GFX8: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[AND2]](s1), [[SUB3]], [[SUB2]]
+    ; GFX8: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s1), [[SELECT2]], [[ADD1]]
+    ; GFX8: [[COPY4:%[0-9]+]]:_(s32) = COPY [[SELECT3]](s32)
+    ; GFX8: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C]]
+    ; GFX8: $vgpr0 = COPY [[AND3]](s32)
     ; GFX9-LABEL: name: test_urem_s16
     ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -173,10 +434,34 @@ body: |
     ; GFX9: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C]]
     ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
     ; GFX9: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C]]
-    ; GFX9: [[UREM:%[0-9]+]]:_(s32) = G_UREM [[AND]], [[AND1]]
-    ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY [[UREM]](s32)
-    ; GFX9: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C]]
-    ; GFX9: $vgpr0 = COPY [[AND2]](s32)
+    ; GFX9: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[AND1]](s32)
+    ; GFX9: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[UITOFP]](s32)
+    ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000
+    ; GFX9: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C1]]
+    ; GFX9: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI [[FMUL]](s32)
+    ; GFX9: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI]], [[AND1]]
+    ; GFX9: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI]], [[AND1]]
+    ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GFX9: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C2]], [[MUL]]
+    ; GFX9: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[UMULH]](s32), [[C2]]
+    ; GFX9: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[SUB]], [[MUL]]
+    ; GFX9: [[UMULH1:%[0-9]+]]:_(s32) = G_UMULH [[SELECT]], [[FPTOUI]]
+    ; GFX9: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI]], [[UMULH1]]
+    ; GFX9: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[FPTOUI]], [[UMULH1]]
+    ; GFX9: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[ADD]], [[SUB1]]
+    ; GFX9: [[UMULH2:%[0-9]+]]:_(s32) = G_UMULH [[SELECT1]], [[AND]]
+    ; GFX9: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UMULH2]], [[AND1]]
+    ; GFX9: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[AND]], [[MUL1]]
+    ; GFX9: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SUB2]](s32), [[AND1]]
+    ; GFX9: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[AND]](s32), [[MUL1]]
+    ; GFX9: [[AND2:%[0-9]+]]:_(s1) = G_AND [[ICMP1]], [[ICMP2]]
+    ; GFX9: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[SUB2]], [[AND1]]
+    ; GFX9: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[SUB2]], [[AND1]]
+    ; GFX9: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[AND2]](s1), [[SUB3]], [[SUB2]]
+    ; GFX9: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s1), [[SELECT2]], [[ADD1]]
+    ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY [[SELECT3]](s32)
+    ; GFX9: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C]]
+    ; GFX9: $vgpr0 = COPY [[AND3]](s32)
     %0:_(s32) = COPY $vgpr0
     %1:_(s32) = COPY $vgpr1
     %2:_(s16) = G_TRUNC %0
@@ -205,18 +490,65 @@ body: |
     ; GFX6: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]]
     ; GFX6: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32)
     ; GFX6: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]]
-    ; GFX6: [[UREM:%[0-9]+]]:_(s32) = G_UREM [[AND]], [[AND1]]
+    ; GFX6: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[AND1]](s32)
+    ; GFX6: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[UITOFP]](s32)
+    ; GFX6: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000
+    ; GFX6: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C2]]
+    ; GFX6: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI [[FMUL]](s32)
+    ; GFX6: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI]], [[AND1]]
+    ; GFX6: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI]], [[AND1]]
+    ; GFX6: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GFX6: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C3]], [[MUL]]
+    ; GFX6: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[UMULH]](s32), [[C3]]
+    ; GFX6: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[SUB]], [[MUL]]
+    ; GFX6: [[UMULH1:%[0-9]+]]:_(s32) = G_UMULH [[SELECT]], [[FPTOUI]]
+    ; GFX6: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI]], [[UMULH1]]
+    ; GFX6: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[FPTOUI]], [[UMULH1]]
+    ; GFX6: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[ADD]], [[SUB1]]
+    ; GFX6: [[UMULH2:%[0-9]+]]:_(s32) = G_UMULH [[SELECT1]], [[AND]]
+    ; GFX6: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UMULH2]], [[AND1]]
+    ; GFX6: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[AND]], [[MUL1]]
+    ; GFX6: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SUB2]](s32), [[AND1]]
+    ; GFX6: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[AND]](s32), [[MUL1]]
+    ; GFX6: [[AND2:%[0-9]+]]:_(s1) = G_AND [[ICMP1]], [[ICMP2]]
+    ; GFX6: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[SUB2]], [[AND1]]
+    ; GFX6: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[SUB2]], [[AND1]]
+    ; GFX6: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[AND2]](s1), [[SUB3]], [[SUB2]]
+    ; GFX6: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s1), [[SELECT2]], [[ADD1]]
     ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
-    ; GFX6: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]]
+    ; GFX6: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]]
     ; GFX6: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
-    ; GFX6: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C1]]
-    ; GFX6: [[UREM1:%[0-9]+]]:_(s32) = G_UREM [[AND2]], [[AND3]]
-    ; GFX6: [[COPY6:%[0-9]+]]:_(s32) = COPY [[UREM]](s32)
-    ; GFX6: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C1]]
-    ; GFX6: [[COPY7:%[0-9]+]]:_(s32) = COPY [[UREM1]](s32)
-    ; GFX6: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C1]]
-    ; GFX6: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C]](s32)
-    ; GFX6: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL]]
+    ; GFX6: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C1]]
+    ; GFX6: [[UITOFP1:%[0-9]+]]:_(s32) = G_UITOFP [[AND4]](s32)
+    ; GFX6: [[AMDGPU_RCP_IFLAG1:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[UITOFP1]](s32)
+    ; GFX6: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000
+    ; GFX6: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG1]], [[C4]]
+    ; GFX6: [[FPTOUI1:%[0-9]+]]:_(s32) = G_FPTOUI [[FMUL1]](s32)
+    ; GFX6: [[MUL2:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI1]], [[AND4]]
+    ; GFX6: [[UMULH3:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI1]], [[AND4]]
+    ; GFX6: [[SUB4:%[0-9]+]]:_(s32) = G_SUB [[C3]], [[MUL2]]
+    ; GFX6: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[UMULH3]](s32), [[C3]]
+    ; GFX6: [[SELECT4:%[0-9]+]]:_(s32) = G_SELECT [[ICMP3]](s1), [[SUB4]], [[MUL2]]
+    ; GFX6: [[UMULH4:%[0-9]+]]:_(s32) = G_UMULH [[SELECT4]], [[FPTOUI1]]
+    ; GFX6: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], [[UMULH4]]
+    ; GFX6: [[SUB5:%[0-9]+]]:_(s32) = G_SUB [[FPTOUI1]], [[UMULH4]]
+    ; GFX6: [[SELECT5:%[0-9]+]]:_(s32) = G_SELECT [[ICMP3]](s1), [[ADD2]], [[SUB5]]
+    ; GFX6: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[SELECT5]], [[AND3]]
+    ; GFX6: [[MUL3:%[0-9]+]]:_(s32) = G_MUL [[UMULH5]], [[AND4]]
+    ; GFX6: [[SUB6:%[0-9]+]]:_(s32) = G_SUB [[AND3]], [[MUL3]]
+    ; GFX6: [[ICMP4:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SUB6]](s32), [[AND4]]
+    ; GFX6: [[ICMP5:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[AND3]](s32), [[MUL3]]
+    ; GFX6: [[AND5:%[0-9]+]]:_(s1) = G_AND [[ICMP4]], [[ICMP5]]
+    ; GFX6: [[SUB7:%[0-9]+]]:_(s32) = G_SUB [[SUB6]], [[AND4]]
+    ; GFX6: [[ADD3:%[0-9]+]]:_(s32) = G_ADD [[SUB6]], [[AND4]]
+    ; GFX6: [[SELECT6:%[0-9]+]]:_(s32) = G_SELECT [[AND5]](s1), [[SUB7]], [[SUB6]]
+    ; GFX6: [[SELECT7:%[0-9]+]]:_(s32) = G_SELECT [[ICMP5]](s1), [[SELECT6]], [[ADD3]]
+    ; GFX6: [[COPY6:%[0-9]+]]:_(s32) = COPY [[SELECT3]](s32)
+    ; GFX6: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C1]]
+    ; GFX6: [[COPY7:%[0-9]+]]:_(s32) = COPY [[SELECT7]](s32)
+    ; GFX6: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C1]]
+    ; GFX6: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C]](s32)
+    ; GFX6: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[SHL]]
     ; GFX6: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
     ; GFX6: $vgpr0 = COPY [[BITCAST2]](<2 x s16>)
     ; GFX8-LABEL: name: test_urem_v2s16
@@ -232,18 +564,65 @@ body: |
     ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]]
     ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32)
     ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]]
-    ; GFX8: [[UREM:%[0-9]+]]:_(s32) = G_UREM [[AND]], [[AND1]]
+    ; GFX8: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[AND1]](s32)
+    ; GFX8: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[UITOFP]](s32)
+    ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000
+    ; GFX8: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C2]]
+    ; GFX8: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI [[FMUL]](s32)
+    ; GFX8: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI]], [[AND1]]
+    ; GFX8: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI]], [[AND1]]
+    ; GFX8: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GFX8: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C3]], [[MUL]]
+    ; GFX8: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[UMULH]](s32), [[C3]]
+    ; GFX8: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[SUB]], [[MUL]]
+    ; GFX8: [[UMULH1:%[0-9]+]]:_(s32) = G_UMULH [[SELECT]], [[FPTOUI]]
+    ; GFX8: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI]], [[UMULH1]]
+    ; GFX8: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[FPTOUI]], [[UMULH1]]
+    ; GFX8: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[ADD]], [[SUB1]]
+    ; GFX8: [[UMULH2:%[0-9]+]]:_(s32) = G_UMULH [[SELECT1]], [[AND]]
+    ; GFX8: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UMULH2]], [[AND1]]
+    ; GFX8: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[AND]], [[MUL1]]
+    ; GFX8: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SUB2]](s32), [[AND1]]
+    ; GFX8: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[AND]](s32), [[MUL1]]
+    ; GFX8: [[AND2:%[0-9]+]]:_(s1) = G_AND [[ICMP1]], [[ICMP2]]
+    ; GFX8: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[SUB2]], [[AND1]]
+    ; GFX8: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[SUB2]], [[AND1]]
+    ; GFX8: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[AND2]](s1), [[SUB3]], [[SUB2]]
+    ; GFX8: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s1), [[SELECT2]], [[ADD1]]
     ; GFX8: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
-    ; GFX8: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]]
+    ; GFX8: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]]
     ; GFX8: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
-    ; GFX8: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C1]]
-    ; GFX8: [[UREM1:%[0-9]+]]:_(s32) = G_UREM [[AND2]], [[AND3]]
-    ; GFX8: [[COPY6:%[0-9]+]]:_(s32) = COPY [[UREM]](s32)
-    ; GFX8: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C1]]
-    ; GFX8: [[COPY7:%[0-9]+]]:_(s32) = COPY [[UREM1]](s32)
-    ; GFX8: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C1]]
-    ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C]](s32)
-    ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL]]
+    ; GFX8: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C1]]
+    ; GFX8: [[UITOFP1:%[0-9]+]]:_(s32) = G_UITOFP [[AND4]](s32)
+    ; GFX8: [[AMDGPU_RCP_IFLAG1:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[UITOFP1]](s32)
+    ; GFX8: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000
+    ; GFX8: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG1]], [[C4]]
+    ; GFX8: [[FPTOUI1:%[0-9]+]]:_(s32) = G_FPTOUI [[FMUL1]](s32)
+    ; GFX8: [[MUL2:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI1]], [[AND4]]
+    ; GFX8: [[UMULH3:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI1]], [[AND4]]
+    ; GFX8: [[SUB4:%[0-9]+]]:_(s32) = G_SUB [[C3]], [[MUL2]]
+    ; GFX8: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[UMULH3]](s32), [[C3]]
+    ; GFX8: [[SELECT4:%[0-9]+]]:_(s32) = G_SELECT [[ICMP3]](s1), [[SUB4]], [[MUL2]]
+    ; GFX8: [[UMULH4:%[0-9]+]]:_(s32) = G_UMULH [[SELECT4]], [[FPTOUI1]]
+    ; GFX8: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], [[UMULH4]]
+    ; GFX8: [[SUB5:%[0-9]+]]:_(s32) = G_SUB [[FPTOUI1]], [[UMULH4]]
+    ; GFX8: [[SELECT5:%[0-9]+]]:_(s32) = G_SELECT [[ICMP3]](s1), [[ADD2]], [[SUB5]]
+    ; GFX8: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[SELECT5]], [[AND3]]
+    ; GFX8: [[MUL3:%[0-9]+]]:_(s32) = G_MUL [[UMULH5]], [[AND4]]
+    ; GFX8: [[SUB6:%[0-9]+]]:_(s32) = G_SUB [[AND3]], [[MUL3]]
+    ; GFX8: [[ICMP4:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SUB6]](s32), [[AND4]]
+    ; GFX8: [[ICMP5:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[AND3]](s32), [[MUL3]]
+    ; GFX8: [[AND5:%[0-9]+]]:_(s1) = G_AND [[ICMP4]], [[ICMP5]]
+    ; GFX8: [[SUB7:%[0-9]+]]:_(s32) = G_SUB [[SUB6]], [[AND4]]
+    ; GFX8: [[ADD3:%[0-9]+]]:_(s32) = G_ADD [[SUB6]], [[AND4]]
+    ; GFX8: [[SELECT6:%[0-9]+]]:_(s32) = G_SELECT [[AND5]](s1), [[SUB7]], [[SUB6]]
+    ; GFX8: [[SELECT7:%[0-9]+]]:_(s32) = G_SELECT [[ICMP5]](s1), [[SELECT6]], [[ADD3]]
+    ; GFX8: [[COPY6:%[0-9]+]]:_(s32) = COPY [[SELECT3]](s32)
+    ; GFX8: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C1]]
+    ; GFX8: [[COPY7:%[0-9]+]]:_(s32) = COPY [[SELECT7]](s32)
+    ; GFX8: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C1]]
+    ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C]](s32)
+    ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[SHL]]
     ; GFX8: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
     ; GFX8: $vgpr0 = COPY [[BITCAST2]](<2 x s16>)
     ; GFX9-LABEL: name: test_urem_v2s16
@@ -259,14 +638,61 @@ body: |
     ; GFX9: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]]
     ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32)
     ; GFX9: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]]
-    ; GFX9: [[UREM:%[0-9]+]]:_(s32) = G_UREM [[AND]], [[AND1]]
+    ; GFX9: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[AND1]](s32)
+    ; GFX9: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[UITOFP]](s32)
+    ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000
+    ; GFX9: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C2]]
+    ; GFX9: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI [[FMUL]](s32)
+    ; GFX9: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI]], [[AND1]]
+    ; GFX9: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI]], [[AND1]]
+    ; GFX9: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GFX9: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C3]], [[MUL]]
+    ; GFX9: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[UMULH]](s32), [[C3]]
+    ; GFX9: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[SUB]], [[MUL]]
+    ; GFX9: [[UMULH1:%[0-9]+]]:_(s32) = G_UMULH [[SELECT]], [[FPTOUI]]
+    ; GFX9: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI]], [[UMULH1]]
+    ; GFX9: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[FPTOUI]], [[UMULH1]]
+    ; GFX9: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[ADD]], [[SUB1]]
+    ; GFX9: [[UMULH2:%[0-9]+]]:_(s32) = G_UMULH [[SELECT1]], [[AND]]
+    ; GFX9: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UMULH2]], [[AND1]]
+    ; GFX9: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[AND]], [[MUL1]]
+    ; GFX9: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SUB2]](s32), [[AND1]]
+    ; GFX9: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[AND]](s32), [[MUL1]]
+    ; GFX9: [[AND2:%[0-9]+]]:_(s1) = G_AND [[ICMP1]], [[ICMP2]]
+    ; GFX9: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[SUB2]], [[AND1]]
+    ; GFX9: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[SUB2]], [[AND1]]
+    ; GFX9: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[AND2]](s1), [[SUB3]], [[SUB2]]
+    ; GFX9: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s1), [[SELECT2]], [[ADD1]]
     ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
-    ; GFX9: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]]
+    ; GFX9: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]]
     ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
-    ; GFX9: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C1]]
-    ; GFX9: [[UREM1:%[0-9]+]]:_(s32) = G_UREM [[AND2]], [[AND3]]
-    ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY [[UREM]](s32)
-    ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY [[UREM1]](s32)
+    ; GFX9: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C1]]
+    ; GFX9: [[UITOFP1:%[0-9]+]]:_(s32) = G_UITOFP [[AND4]](s32)
+    ; GFX9: [[AMDGPU_RCP_IFLAG1:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[UITOFP1]](s32)
+    ; GFX9: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000
+    ; GFX9: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG1]], [[C4]]
+    ; GFX9: [[FPTOUI1:%[0-9]+]]:_(s32) = G_FPTOUI [[FMUL1]](s32)
+    ; GFX9: [[MUL2:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI1]], [[AND4]]
+    ; GFX9: [[UMULH3:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI1]], [[AND4]]
+    ; GFX9: [[SUB4:%[0-9]+]]:_(s32) = G_SUB [[C3]], [[MUL2]]
+    ; GFX9: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[UMULH3]](s32), [[C3]]
+    ; GFX9: [[SELECT4:%[0-9]+]]:_(s32) = G_SELECT [[ICMP3]](s1), [[SUB4]], [[MUL2]]
+    ; GFX9: [[UMULH4:%[0-9]+]]:_(s32) = G_UMULH [[SELECT4]], [[FPTOUI1]]
+    ; GFX9: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], [[UMULH4]]
+    ; GFX9: [[SUB5:%[0-9]+]]:_(s32) = G_SUB [[FPTOUI1]], [[UMULH4]]
+    ; GFX9: [[SELECT5:%[0-9]+]]:_(s32) = G_SELECT [[ICMP3]](s1), [[ADD2]], [[SUB5]]
+    ; GFX9: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[SELECT5]], [[AND3]]
+    ; GFX9: [[MUL3:%[0-9]+]]:_(s32) = G_MUL [[UMULH5]], [[AND4]]
+    ; GFX9: [[SUB6:%[0-9]+]]:_(s32) = G_SUB [[AND3]], [[MUL3]]
+    ; GFX9: [[ICMP4:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SUB6]](s32), [[AND4]]
+    ; GFX9: [[ICMP5:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[AND3]](s32), [[MUL3]]
+    ; GFX9: [[AND5:%[0-9]+]]:_(s1) = G_AND [[ICMP4]], [[ICMP5]]
+    ; GFX9: [[SUB7:%[0-9]+]]:_(s32) = G_SUB [[SUB6]], [[AND4]]
+    ; GFX9: [[ADD3:%[0-9]+]]:_(s32) = G_ADD [[SUB6]], [[AND4]]
+    ; GFX9: [[SELECT6:%[0-9]+]]:_(s32) = G_SELECT [[AND5]](s1), [[SUB7]], [[SUB6]]
+    ; GFX9: [[SELECT7:%[0-9]+]]:_(s32) = G_SELECT [[ICMP5]](s1), [[SELECT6]], [[ADD3]]
+    ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY [[SELECT3]](s32)
+    ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY [[SELECT7]](s32)
     ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY6]](s32), [[COPY7]](s32)
     ; GFX9: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>)
     %0:_(<2 x s16>) = COPY $vgpr0
@@ -289,8 +715,32 @@ body: |
     ; GFX6: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C]]
     ; GFX6: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
     ; GFX6: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C]]
-    ; GFX6: [[UREM:%[0-9]+]]:_(s32) = G_UREM [[AND]], [[AND1]]
-    ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY [[UREM]](s32)
+    ; GFX6: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[AND1]](s32)
+    ; GFX6: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[UITOFP]](s32)
+    ; GFX6: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000
+    ; GFX6: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C1]]
+    ; GFX6: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI [[FMUL]](s32)
+    ; GFX6: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI]], [[AND1]]
+    ; GFX6: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI]], [[AND1]]
+    ; GFX6: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GFX6: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C2]], [[MUL]]
+    ; GFX6: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[UMULH]](s32), [[C2]]
+    ; GFX6: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[SUB]], [[MUL]]
+    ; GFX6: [[UMULH1:%[0-9]+]]:_(s32) = G_UMULH [[SELECT]], [[FPTOUI]]
+    ; GFX6: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI]], [[UMULH1]]
+    ; GFX6: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[FPTOUI]], [[UMULH1]]
+    ; GFX6: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[ADD]], [[SUB1]]
+    ; GFX6: [[UMULH2:%[0-9]+]]:_(s32) = G_UMULH [[SELECT1]], [[AND]]
+    ; GFX6: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UMULH2]], [[AND1]]
+    ; GFX6: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[AND]], [[MUL1]]
+    ; GFX6: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SUB2]](s32), [[AND1]]
+    ; GFX6: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[AND]](s32), [[MUL1]]
+    ; GFX6: [[AND2:%[0-9]+]]:_(s1) = G_AND [[ICMP1]], [[ICMP2]]
+    ; GFX6: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[SUB2]], [[AND1]]
+    ; GFX6: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[SUB2]], [[AND1]]
+    ; GFX6: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[AND2]](s1), [[SUB3]], [[SUB2]]
+    ; GFX6: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s1), [[SELECT2]], [[ADD1]]
+    ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY [[SELECT3]](s32)
     ; GFX6: $vgpr0 = COPY [[COPY4]](s32)
     ; GFX8-LABEL: name: test_urem_s7
     ; GFX8: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
@@ -300,8 +750,32 @@ body: |
     ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C]]
     ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
     ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C]]
-    ; GFX8: [[UREM:%[0-9]+]]:_(s32) = G_UREM [[AND]], [[AND1]]
-    ; GFX8: [[COPY4:%[0-9]+]]:_(s32) = COPY [[UREM]](s32)
+    ; GFX8: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[AND1]](s32)
+    ; GFX8: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[UITOFP]](s32)
+    ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000
+    ; GFX8: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C1]]
+    ; GFX8: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI [[FMUL]](s32)
+    ; GFX8: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI]], [[AND1]]
+    ; GFX8: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI]], [[AND1]]
+    ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GFX8: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C2]], [[MUL]]
+    ; GFX8: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[UMULH]](s32), [[C2]]
+    ; GFX8: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[SUB]], [[MUL]]
+    ; GFX8: [[UMULH1:%[0-9]+]]:_(s32) = G_UMULH [[SELECT]], [[FPTOUI]]
+    ; GFX8: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI]], [[UMULH1]]
+    ; GFX8: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[FPTOUI]], [[UMULH1]]
+    ; GFX8: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[ADD]], [[SUB1]]
+    ; GFX8: [[UMULH2:%[0-9]+]]:_(s32) = G_UMULH [[SELECT1]], [[AND]]
+    ; GFX8: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UMULH2]], [[AND1]]
+    ; GFX8: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[AND]], [[MUL1]]
+    ; GFX8: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SUB2]](s32), [[AND1]]
+    ; GFX8: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[AND]](s32), [[MUL1]]
+    ; GFX8: [[AND2:%[0-9]+]]:_(s1) = G_AND [[ICMP1]], [[ICMP2]]
+    ; GFX8: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[SUB2]], [[AND1]]
+    ; GFX8: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[SUB2]], [[AND1]]
+    ; GFX8: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[AND2]](s1), [[SUB3]], [[SUB2]]
+    ; GFX8: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s1), [[SELECT2]], [[ADD1]]
+    ; GFX8: [[COPY4:%[0-9]+]]:_(s32) = COPY [[SELECT3]](s32)
     ; GFX8: $vgpr0 = COPY [[COPY4]](s32)
     ; GFX9-LABEL: name: test_urem_s7
     ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
@@ -311,8 +785,32 @@ body: |
     ; GFX9: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C]]
     ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
     ; GFX9: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C]]
-    ; GFX9: [[UREM:%[0-9]+]]:_(s32) = G_UREM [[AND]], [[AND1]]
-    ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY [[UREM]](s32)
+    ; GFX9: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[AND1]](s32)
+    ; GFX9: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[UITOFP]](s32)
+    ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000
+    ; GFX9: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C1]]
+    ; GFX9: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI [[FMUL]](s32)
+    ; GFX9: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI]], [[AND1]]
+    ; GFX9: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI]], [[AND1]]
+    ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GFX9: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C2]], [[MUL]]
+    ; GFX9: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[UMULH]](s32), [[C2]]
+    ; GFX9: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[SUB]], [[MUL]]
+    ; GFX9: [[UMULH1:%[0-9]+]]:_(s32) = G_UMULH [[SELECT]], [[FPTOUI]]
+    ; GFX9: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI]], [[UMULH1]]
+    ; GFX9: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[FPTOUI]], [[UMULH1]]
+    ; GFX9: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[ADD]], [[SUB1]]
+    ; GFX9: [[UMULH2:%[0-9]+]]:_(s32) = G_UMULH [[SELECT1]], [[AND]]
+    ; GFX9: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UMULH2]], [[AND1]]
+    ; GFX9: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[AND]], [[MUL1]]
+    ; GFX9: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SUB2]](s32), [[AND1]]
+    ; GFX9: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[AND]](s32), [[MUL1]]
+    ; GFX9: [[AND2:%[0-9]+]]:_(s1) = G_AND [[ICMP1]], [[ICMP2]]
+    ; GFX9: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[SUB2]], [[AND1]]
+    ; GFX9: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[SUB2]], [[AND1]]
+    ; GFX9: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[AND2]](s1), [[SUB3]], [[SUB2]]
+    ; GFX9: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s1), [[SELECT2]], [[ADD1]]
+    ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY [[SELECT3]](s32)
     ; GFX9: $vgpr0 = COPY [[COPY4]](s32)
     %0:_(s32) = COPY $vgpr0
     %1:_(s32) = COPY $vgpr1
@@ -337,8 +835,32 @@ body: |
     ; GFX6: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C]]
     ; GFX6: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
     ; GFX6: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C]]
-    ; GFX6: [[UREM:%[0-9]+]]:_(s32) = G_UREM [[AND]], [[AND1]]
-    ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY [[UREM]](s32)
+    ; GFX6: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[AND1]](s32)
+    ; GFX6: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[UITOFP]](s32)
+    ; GFX6: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000
+    ; GFX6: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C1]]
+    ; GFX6: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI [[FMUL]](s32)
+    ; GFX6: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI]], [[AND1]]
+    ; GFX6: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI]], [[AND1]]
+    ; GFX6: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GFX6: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C2]], [[MUL]]
+    ; GFX6: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[UMULH]](s32), [[C2]]
+    ; GFX6: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[SUB]], [[MUL]]
+    ; GFX6: [[UMULH1:%[0-9]+]]:_(s32) = G_UMULH [[SELECT]], [[FPTOUI]]
+    ; GFX6: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI]], [[UMULH1]]
+    ; GFX6: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[FPTOUI]], [[UMULH1]]
+    ; GFX6: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[ADD]], [[SUB1]]
+    ; GFX6: [[UMULH2:%[0-9]+]]:_(s32) = G_UMULH [[SELECT1]], [[AND]]
+    ; GFX6: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UMULH2]], [[AND1]]
+    ; GFX6: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[AND]], [[MUL1]]
+    ; GFX6: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SUB2]](s32), [[AND1]]
+    ; GFX6: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[AND]](s32), [[MUL1]]
+    ; GFX6: [[AND2:%[0-9]+]]:_(s1) = G_AND [[ICMP1]], [[ICMP2]]
+    ; GFX6: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[SUB2]], [[AND1]]
+    ; GFX6: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[SUB2]], [[AND1]]
+    ; GFX6: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[AND2]](s1), [[SUB3]], [[SUB2]]
+    ; GFX6: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s1), [[SELECT2]], [[ADD1]]
+    ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY [[SELECT3]](s32)
     ; GFX6: $vgpr0 = COPY [[COPY4]](s32)
     ; GFX8-LABEL: name: test_urem_s17
     ; GFX8: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
@@ -348,8 +870,32 @@ body: |
     ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C]]
     ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
     ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C]]
-    ; GFX8: [[UREM:%[0-9]+]]:_(s32) = G_UREM [[AND]], [[AND1]]
-    ; GFX8: [[COPY4:%[0-9]+]]:_(s32) = COPY [[UREM]](s32)
+    ; GFX8: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[AND1]](s32)
+    ; GFX8: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[UITOFP]](s32)
+    ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000
+    ; GFX8: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C1]]
+    ; GFX8: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI [[FMUL]](s32)
+    ; GFX8: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI]], [[AND1]]
+    ; GFX8: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI]], [[AND1]]
+    ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GFX8: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C2]], [[MUL]]
+    ; GFX8: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[UMULH]](s32), [[C2]]
+    ; GFX8: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[SUB]], [[MUL]]
+    ; GFX8: [[UMULH1:%[0-9]+]]:_(s32) = G_UMULH [[SELECT]], [[FPTOUI]]
+    ; GFX8: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI]], [[UMULH1]]
+    ; GFX8: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[FPTOUI]], [[UMULH1]]
+    ; GFX8: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[ADD]], [[SUB1]]
+    ; GFX8: [[UMULH2:%[0-9]+]]:_(s32) = G_UMULH [[SELECT1]], [[AND]]
+    ; GFX8: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UMULH2]], [[AND1]]
+    ; GFX8: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[AND]], [[MUL1]]
+    ; GFX8: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SUB2]](s32), [[AND1]]
+    ; GFX8: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[AND]](s32), [[MUL1]]
+    ; GFX8: [[AND2:%[0-9]+]]:_(s1) = G_AND [[ICMP1]], [[ICMP2]]
+    ; GFX8: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[SUB2]], [[AND1]]
+    ; GFX8: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[SUB2]], [[AND1]]
+    ; GFX8: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[AND2]](s1), [[SUB3]], [[SUB2]]
+    ; GFX8: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s1), [[SELECT2]], [[ADD1]]
+    ; GFX8: [[COPY4:%[0-9]+]]:_(s32) = COPY [[SELECT3]](s32)
     ; GFX8: $vgpr0 = COPY [[COPY4]](s32)
     ; GFX9-LABEL: name: test_urem_s17
     ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
@@ -359,8 +905,32 @@ body: |
     ; GFX9: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C]]
     ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
     ; GFX9: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C]]
-    ; GFX9: [[UREM:%[0-9]+]]:_(s32) = G_UREM [[AND]], [[AND1]]
-    ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY [[UREM]](s32)
+    ; GFX9: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[AND1]](s32)
+    ; GFX9: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[UITOFP]](s32)
+    ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41F0000000000000
+    ; GFX9: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C1]]
+    ; GFX9: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI [[FMUL]](s32)
+    ; GFX9: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI]], [[AND1]]
+    ; GFX9: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI]], [[AND1]]
+    ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GFX9: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C2]], [[MUL]]
+    ; GFX9: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[UMULH]](s32), [[C2]]
+    ; GFX9: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[SUB]], [[MUL]]
+    ; GFX9: [[UMULH1:%[0-9]+]]:_(s32) = G_UMULH [[SELECT]], [[FPTOUI]]
+    ; GFX9: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI]], [[UMULH1]]
+    ; GFX9: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[FPTOUI]], [[UMULH1]]
+    ; GFX9: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[ADD]], [[SUB1]]
+    ; GFX9: [[UMULH2:%[0-9]+]]:_(s32) = G_UMULH [[SELECT1]], [[AND]]
+    ; GFX9: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UMULH2]], [[AND1]]
+    ; GFX9: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[AND]], [[MUL1]]
+    ; GFX9: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SUB2]](s32), [[AND1]]
+    ; GFX9: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[AND]](s32), [[MUL1]]
+    ; GFX9: [[AND2:%[0-9]+]]:_(s1) = G_AND [[ICMP1]], [[ICMP2]]
+    ; GFX9: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[SUB2]], [[AND1]]
+    ; GFX9: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[SUB2]], [[AND1]]
+    ; GFX9: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[AND2]](s1), [[SUB3]], [[SUB2]]
+    ; GFX9: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s1), [[SELECT2]], [[ADD1]]
+    ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY [[SELECT3]](s32)
     ; GFX9: $vgpr0 = COPY [[COPY4]](s32)
     %0:_(s32) = COPY $vgpr0
     %1:_(s32) = COPY $vgpr1
@@ -380,36 +950,36 @@ body: |
     ; GFX6-LABEL: name: test_urem_s33
     ; GFX6: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
     ; GFX6: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
-    ; GFX6: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 8589934591
-    ; GFX6: [[COPY2:%[0-9]+]]:_(s64) = COPY [[COPY]](s64)
-    ; GFX6: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY2]], [[C]]
-    ; GFX6: [[COPY3:%[0-9]+]]:_(s64) = COPY [[COPY1]](s64)
-    ; GFX6: [[AND1:%[0-9]+]]:_(s64) = G_AND [[COPY3]], [[C]]
-    ; GFX6: [[UREM:%[0-9]+]]:_(s64) = G_UREM [[AND]], [[AND1]]
-    ; GFX6: [[COPY4:%[0-9]+]]:_(s64) = COPY [[UREM]](s64)
-    ; GFX6: $vgpr0_vgpr1 = COPY [[COPY4]](s64)
+    ; GFX6: [[TRUNC:%[0-9]+]]:_(s33) = G_TRUNC [[COPY]](s64)
+    ; GFX6: [[TRUNC1:%[0-9]+]]:_(s33) = G_TRUNC [[COPY1]](s64)
+    ; GFX6: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[TRUNC]](s33)
+    ; GFX6: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[TRUNC1]](s33)
+    ; GFX6: [[UREM:%[0-9]+]]:_(s64) = G_UREM [[ZEXT]], [[ZEXT1]]
+    ; GFX6: [[TRUNC2:%[0-9]+]]:_(s33) = G_TRUNC [[UREM]](s64)
+    ; GFX6: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[TRUNC2]](s33)
+    ; GFX6: $vgpr0_vgpr1 = COPY [[ANYEXT]](s64)
     ; GFX8-LABEL: name: test_urem_s33
     ; GFX8: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
     ; GFX8: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
-    ; GFX8: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 8589934591
-    ; GFX8: [[COPY2:%[0-9]+]]:_(s64) = COPY [[COPY]](s64)
-    ; GFX8: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY2]], [[C]]
-    ; GFX8: [[COPY3:%[0-9]+]]:_(s64) = COPY [[COPY1]](s64)
-    ; GFX8: [[AND1:%[0-9]+]]:_(s64) = G_AND [[COPY3]], [[C]]
-    ; GFX8: [[UREM:%[0-9]+]]:_(s64) = G_UREM [[AND]], [[AND1]]
-    ; GFX8: [[COPY4:%[0-9]+]]:_(s64) = COPY [[UREM]](s64)
-    ; GFX8: $vgpr0_vgpr1 = COPY [[COPY4]](s64)
+    ; GFX8: [[TRUNC:%[0-9]+]]:_(s33) = G_TRUNC [[COPY]](s64)
+    ; GFX8: [[TRUNC1:%[0-9]+]]:_(s33) = G_TRUNC [[COPY1]](s64)
+    ; GFX8: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[TRUNC]](s33)
+    ; GFX8: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[TRUNC1]](s33)
+    ; GFX8: [[UREM:%[0-9]+]]:_(s64) = G_UREM [[ZEXT]], [[ZEXT1]]
+    ; GFX8: [[TRUNC2:%[0-9]+]]:_(s33) = G_TRUNC [[UREM]](s64)
+    ; GFX8: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[TRUNC2]](s33)
+    ; GFX8: $vgpr0_vgpr1 = COPY [[ANYEXT]](s64)
     ; GFX9-LABEL: name: test_urem_s33
     ; GFX9: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
     ; GFX9: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
-    ; GFX9: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 8589934591
-    ; GFX9: [[COPY2:%[0-9]+]]:_(s64) = COPY [[COPY]](s64)
-    ; GFX9: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY2]], [[C]]
-    ; GFX9: [[COPY3:%[0-9]+]]:_(s64) = COPY [[COPY1]](s64)
-    ; GFX9: [[AND1:%[0-9]+]]:_(s64) = G_AND [[COPY3]], [[C]]
-    ; GFX9: [[UREM:%[0-9]+]]:_(s64) = G_UREM [[AND]], [[AND1]]
-    ; GFX9: [[COPY4:%[0-9]+]]:_(s64) = COPY [[UREM]](s64)
-    ; GFX9: $vgpr0_vgpr1 = COPY [[COPY4]](s64)
+    ; GFX9: [[TRUNC:%[0-9]+]]:_(s33) = G_TRUNC [[COPY]](s64)
+    ; GFX9: [[TRUNC1:%[0-9]+]]:_(s33) = G_TRUNC [[COPY1]](s64)
+    ; GFX9: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[TRUNC]](s33)
+    ; GFX9: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[TRUNC1]](s33)
+    ; GFX9: [[UREM:%[0-9]+]]:_(s64) = G_UREM [[ZEXT]], [[ZEXT1]]
+    ; GFX9: [[TRUNC2:%[0-9]+]]:_(s33) = G_TRUNC [[UREM]](s64)
+    ; GFX9: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[TRUNC2]](s33)
+    ; GFX9: $vgpr0_vgpr1 = COPY [[ANYEXT]](s64)
     %0:_(s64) = COPY $vgpr0_vgpr1
     %1:_(s64) = COPY $vgpr2_vgpr3
     %2:_(s33) = G_TRUNC %0

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll
new file mode 100644
index 000000000000..e23df92a3030
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll
@@ -0,0 +1,851 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -amdgpu-codegenprepare-disable-idiv-expansion=1 -mtriple=amdgcn-amd-amdhsa < %s | FileCheck -check-prefixes=CHECK,GISEL %s
+; RUN: llc -global-isel -amdgpu-codegenprepare-disable-idiv-expansion=0 -mtriple=amdgcn-amd-amdhsa < %s | FileCheck -check-prefixes=CHECK,CGP %s
+
+; The same 32-bit expansion is implemented in the legalizer and in AMDGPUCodeGenPrepare.
+
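+; A rough C model of that expansion, reconstructed from the GISEL checks
+; below. This is only an illustrative sketch: rcp_iflag() and umulh() are
+; hypothetical helpers standing in for V_RCP_IFLAG_F32 and a 32x32->high-32
+; unsigned multiply; they are not part of this patch.
+;
+;   uint32_t k = (uint32_t)(rcp_iflag((float)d) * 0x1p32f); // ~2^32 / d
+;   uint32_t lo = k * d, hi = umulh(k, d);
+;   uint32_t e = (hi == 0) ? 0u - lo : lo;             // low-product error
+;   k = (hi == 0) ? k + umulh(e, k) : k - umulh(e, k); // refine estimate
+;   uint32_t q = umulh(k, n);                          // quotient estimate
+;   uint32_t r = n - q * d;                            // remainder estimate
+;   if (n < q * d)   { q -= 1; r += d; }               // at most one
+;   else if (r >= d) { q += 1; r -= d; }               // correction step
+;   // udiv returns q; urem returns r.
+;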
+define i32 @v_udiv_i32(i32 %num, i32 %den) {
+; GISEL-LABEL: v_udiv_i32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v2, v1
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v2, v2
+; GISEL-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v2
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GISEL-NEXT:    v_mul_lo_u32 v3, v2, v1
+; GISEL-NEXT:    v_mul_hi_u32 v4, v2, v1
+; GISEL-NEXT:    v_sub_i32_e32 v5, vcc, 0, v3
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; GISEL-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; GISEL-NEXT:    v_mul_hi_u32 v3, v3, v2
+; GISEL-NEXT:    v_add_i32_e64 v4, s[4:5], v2, v3
+; GISEL-NEXT:    v_sub_i32_e64 v2, s[4:5], v2, v3
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GISEL-NEXT:    v_mul_hi_u32 v2, v2, v0
+; GISEL-NEXT:    v_mul_lo_u32 v3, v2, v1
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, 1, v2
+; GISEL-NEXT:    v_subrev_i32_e32 v5, vcc, 1, v2
+; GISEL-NEXT:    v_sub_i32_e32 v6, vcc, v0, v3
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v3
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v6, v1
+; GISEL-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, v4, v2, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; CGP-LABEL: v_udiv_i32:
+; CGP:       ; %bb.0:
+; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CGP-NEXT:    v_cvt_f32_u32_e32 v2, v1
+; CGP-NEXT:    v_mul_lo_u32 v3, 0, v1
+; CGP-NEXT:    v_mul_lo_u32 v4, 0, v0
+; CGP-NEXT:    v_rcp_f32_e32 v2, v2
+; CGP-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v2
+; CGP-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; CGP-NEXT:    v_mul_lo_u32 v5, v2, v1
+; CGP-NEXT:    v_mul_lo_u32 v6, v2, 0
+; CGP-NEXT:    v_mul_hi_u32 v7, v2, v1
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
+; CGP-NEXT:    v_sub_i32_e32 v8, vcc, 0, v5
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
+; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
+; CGP-NEXT:    v_cndmask_b32_e32 v3, v5, v8, vcc
+; CGP-NEXT:    v_mul_lo_u32 v5, v3, 0
+; CGP-NEXT:    v_mul_hi_u32 v3, v3, v2
+; CGP-NEXT:    v_add_i32_e64 v5, s[4:5], v6, v5
+; CGP-NEXT:    v_add_i32_e64 v3, s[4:5], v5, v3
+; CGP-NEXT:    v_add_i32_e64 v5, s[4:5], v2, v3
+; CGP-NEXT:    v_sub_i32_e64 v2, s[4:5], v2, v3
+; CGP-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
+; CGP-NEXT:    v_mul_lo_u32 v3, v2, 0
+; CGP-NEXT:    v_mul_hi_u32 v2, v2, v0
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; CGP-NEXT:    v_mul_lo_u32 v3, v2, v1
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, 1, v2
+; CGP-NEXT:    v_subrev_i32_e32 v5, vcc, 1, v2
+; CGP-NEXT:    v_sub_i32_e32 v6, vcc, v0, v3
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v3
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v6, v1
+; CGP-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
+; CGP-NEXT:    v_cndmask_b32_e64 v0, v2, v4, s[4:5]
+; CGP-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
+; CGP-NEXT:    s_setpc_b64 s[30:31]
+  %result = udiv i32 %num, %den
+  ret i32 %result
+}
+
+; FIXME: This is a workaround for not handling the uniform VGPR case.
+declare i32 @llvm.amdgcn.readfirstlane(i32)
+
+define amdgpu_ps i32 @s_udiv_i32(i32 inreg %num, i32 inreg %den) {
+; GISEL-LABEL: s_udiv_i32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v0, s1
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GISEL-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v0
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GISEL-NEXT:    v_mul_lo_u32 v1, v0, s1
+; GISEL-NEXT:    v_mul_hi_u32 v2, v0, s1
+; GISEL-NEXT:    v_sub_i32_e32 v3, vcc, 0, v1
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GISEL-NEXT:    v_mul_hi_u32 v1, v1, v0
+; GISEL-NEXT:    v_add_i32_e64 v2, s[2:3], v0, v1
+; GISEL-NEXT:    v_sub_i32_e64 v0, s[2:3], v0, v1
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GISEL-NEXT:    v_mul_hi_u32 v0, v0, s0
+; GISEL-NEXT:    v_mul_lo_u32 v1, v0, s1
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
+; GISEL-NEXT:    v_subrev_i32_e32 v3, vcc, 1, v0
+; GISEL-NEXT:    v_sub_i32_e32 v4, vcc, s0, v1
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, s0, v1
+; GISEL-NEXT:    v_cmp_le_u32_e64 s[0:1], s1, v4
+; GISEL-NEXT:    s_and_b64 s[0:1], s[0:1], vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[0:1]
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GISEL-NEXT:    v_readfirstlane_b32 s0, v0
+; GISEL-NEXT:    ; return to shader part epilog
+;
+; CGP-LABEL: s_udiv_i32:
+; CGP:       ; %bb.0:
+; CGP-NEXT:    s_mov_b32 s4, s1
+; CGP-NEXT:    v_cvt_f32_u32_e32 v0, s4
+; CGP-NEXT:    s_bfe_u64 s[2:3], s[4:5], 0x200000
+; CGP-NEXT:    s_bfe_u64 s[6:7], s[0:1], 0x200000
+; CGP-NEXT:    v_rcp_f32_e32 v0, v0
+; CGP-NEXT:    v_mul_lo_u32 v1, 0, s2
+; CGP-NEXT:    v_mul_lo_u32 v2, 0, s6
+; CGP-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v0
+; CGP-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; CGP-NEXT:    v_mul_lo_u32 v3, v0, s2
+; CGP-NEXT:    v_mul_lo_u32 v4, v0, s3
+; CGP-NEXT:    v_mul_hi_u32 v5, v0, s2
+; CGP-NEXT:    v_mul_lo_u32 v6, 0, v0
+; CGP-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
+; CGP-NEXT:    v_sub_i32_e32 v4, vcc, 0, v3
+; CGP-NEXT:    v_add_i32_e32 v1, vcc, v1, v5
+; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; CGP-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; CGP-NEXT:    v_mul_lo_u32 v3, v1, 0
+; CGP-NEXT:    v_mul_hi_u32 v1, v1, v0
+; CGP-NEXT:    v_add_i32_e64 v3, s[2:3], v6, v3
+; CGP-NEXT:    v_add_i32_e64 v1, s[2:3], v3, v1
+; CGP-NEXT:    v_add_i32_e64 v3, s[2:3], v0, v1
+; CGP-NEXT:    v_sub_i32_e64 v0, s[2:3], v0, v1
+; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; CGP-NEXT:    v_mul_lo_u32 v1, v0, s7
+; CGP-NEXT:    v_mul_hi_u32 v0, v0, s6
+; CGP-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
+; CGP-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
+; CGP-NEXT:    v_mul_lo_u32 v1, v0, s4
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
+; CGP-NEXT:    v_subrev_i32_e32 v3, vcc, 1, v0
+; CGP-NEXT:    v_sub_i32_e32 v4, vcc, s0, v1
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, s0, v1
+; CGP-NEXT:    v_cmp_le_u32_e64 s[0:1], s4, v4
+; CGP-NEXT:    s_and_b64 s[0:1], s[0:1], vcc
+; CGP-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; CGP-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; CGP-NEXT:    v_readfirstlane_b32 s0, v0
+; CGP-NEXT:    ; return to shader part epilog
+  %result = udiv i32 %num, %den
+  %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %result)
+  ret i32 %readlane
+}
+
+define <2 x i32> @v_udiv_v2i32(<2 x i32> %num, <2 x i32> %den) {
+; GISEL-LABEL: v_udiv_v2i32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, v2
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v5, v3
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v4, v4
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v5, v5
+; GISEL-NEXT:    v_mul_f32_e32 v4, 0x4f800000, v4
+; GISEL-NEXT:    v_mul_f32_e32 v5, 0x4f800000, v5
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v4, v4
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v5, v5
+; GISEL-NEXT:    v_mul_lo_u32 v6, v4, v2
+; GISEL-NEXT:    v_mul_hi_u32 v7, v4, v2
+; GISEL-NEXT:    v_mul_lo_u32 v8, v5, v3
+; GISEL-NEXT:    v_mul_hi_u32 v9, v5, v3
+; GISEL-NEXT:    v_sub_i32_e32 v10, vcc, 0, v6
+; GISEL-NEXT:    v_sub_i32_e32 v11, vcc, 0, v8
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
+; GISEL-NEXT:    v_cndmask_b32_e32 v6, v6, v10, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v9
+; GISEL-NEXT:    v_cndmask_b32_e64 v7, v8, v11, s[4:5]
+; GISEL-NEXT:    v_mul_hi_u32 v6, v6, v4
+; GISEL-NEXT:    v_mul_hi_u32 v7, v7, v5
+; GISEL-NEXT:    v_add_i32_e64 v8, s[6:7], v4, v6
+; GISEL-NEXT:    v_sub_i32_e64 v4, s[6:7], v4, v6
+; GISEL-NEXT:    v_add_i32_e64 v6, s[6:7], v5, v7
+; GISEL-NEXT:    v_sub_i32_e64 v5, s[6:7], v5, v7
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, v4, v8, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v5, v5, v6, s[4:5]
+; GISEL-NEXT:    v_mul_hi_u32 v4, v4, v0
+; GISEL-NEXT:    v_mul_hi_u32 v5, v5, v1
+; GISEL-NEXT:    v_mul_lo_u32 v6, v4, v2
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, 1, v4
+; GISEL-NEXT:    v_subrev_i32_e32 v8, vcc, 1, v4
+; GISEL-NEXT:    v_mul_lo_u32 v9, v5, v3
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, 1, v5
+; GISEL-NEXT:    v_subrev_i32_e32 v11, vcc, 1, v5
+; GISEL-NEXT:    v_sub_i32_e32 v12, vcc, v0, v6
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v6
+; GISEL-NEXT:    v_sub_i32_e64 v0, s[4:5], v1, v9
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v1, v9
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[6:7], v12, v2
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[8:9], v0, v3
+; GISEL-NEXT:    s_and_b64 s[6:7], s[6:7], vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, v7, v4, s[6:7]
+; GISEL-NEXT:    s_and_b64 s[6:7], s[8:9], s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v1, v10, v5, s[6:7]
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v1, v11, v1, s[4:5]
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; CGP-LABEL: v_udiv_v2i32:
+; CGP:       ; %bb.0:
+; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CGP-NEXT:    v_cvt_f32_u32_e32 v4, v2
+; CGP-NEXT:    v_mul_lo_u32 v5, 0, v2
+; CGP-NEXT:    v_mul_lo_u32 v6, 0, v0
+; CGP-NEXT:    v_cvt_f32_u32_e32 v7, v3
+; CGP-NEXT:    v_mul_lo_u32 v8, 0, v3
+; CGP-NEXT:    v_mul_lo_u32 v9, 0, v1
+; CGP-NEXT:    v_rcp_f32_e32 v4, v4
+; CGP-NEXT:    v_rcp_f32_e32 v7, v7
+; CGP-NEXT:    v_mul_f32_e32 v4, 0x4f800000, v4
+; CGP-NEXT:    v_mul_f32_e32 v7, 0x4f800000, v7
+; CGP-NEXT:    v_cvt_u32_f32_e32 v4, v4
+; CGP-NEXT:    v_cvt_u32_f32_e32 v7, v7
+; CGP-NEXT:    v_mul_lo_u32 v10, v4, v2
+; CGP-NEXT:    v_mul_lo_u32 v11, v4, 0
+; CGP-NEXT:    v_mul_hi_u32 v12, v4, v2
+; CGP-NEXT:    v_mul_lo_u32 v13, v7, v3
+; CGP-NEXT:    v_mul_lo_u32 v14, v7, 0
+; CGP-NEXT:    v_mul_hi_u32 v15, v7, v3
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v11
+; CGP-NEXT:    v_sub_i32_e32 v16, vcc, 0, v10
+; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v14
+; CGP-NEXT:    v_sub_i32_e32 v17, vcc, 0, v13
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v12
+; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v15
+; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
+; CGP-NEXT:    v_cndmask_b32_e32 v5, v10, v16, vcc
+; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v8
+; CGP-NEXT:    v_cndmask_b32_e64 v8, v13, v17, s[4:5]
+; CGP-NEXT:    v_mul_lo_u32 v10, v5, 0
+; CGP-NEXT:    v_mul_hi_u32 v5, v5, v4
+; CGP-NEXT:    v_mul_lo_u32 v12, v8, 0
+; CGP-NEXT:    v_mul_hi_u32 v8, v8, v7
+; CGP-NEXT:    v_add_i32_e64 v10, s[6:7], v11, v10
+; CGP-NEXT:    v_add_i32_e64 v11, s[6:7], v14, v12
+; CGP-NEXT:    v_add_i32_e64 v5, s[6:7], v10, v5
+; CGP-NEXT:    v_add_i32_e64 v8, s[6:7], v11, v8
+; CGP-NEXT:    v_add_i32_e64 v10, s[6:7], v4, v5
+; CGP-NEXT:    v_sub_i32_e64 v4, s[6:7], v4, v5
+; CGP-NEXT:    v_add_i32_e64 v5, s[6:7], v7, v8
+; CGP-NEXT:    v_sub_i32_e64 v7, s[6:7], v7, v8
+; CGP-NEXT:    v_cndmask_b32_e32 v4, v4, v10, vcc
+; CGP-NEXT:    v_cndmask_b32_e64 v5, v7, v5, s[4:5]
+; CGP-NEXT:    v_mul_lo_u32 v7, v4, 0
+; CGP-NEXT:    v_mul_hi_u32 v4, v4, v0
+; CGP-NEXT:    v_mul_lo_u32 v8, v5, 0
+; CGP-NEXT:    v_mul_hi_u32 v5, v5, v1
+; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
+; CGP-NEXT:    v_add_i32_e32 v7, vcc, v9, v8
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
+; CGP-NEXT:    v_mul_lo_u32 v6, v4, v2
+; CGP-NEXT:    v_add_i32_e32 v7, vcc, 1, v4
+; CGP-NEXT:    v_subrev_i32_e32 v8, vcc, 1, v4
+; CGP-NEXT:    v_mul_lo_u32 v9, v5, v3
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, 1, v5
+; CGP-NEXT:    v_subrev_i32_e32 v11, vcc, 1, v5
+; CGP-NEXT:    v_sub_i32_e32 v12, vcc, v0, v6
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v6
+; CGP-NEXT:    v_sub_i32_e64 v0, s[4:5], v1, v9
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v1, v9
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[6:7], v12, v2
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[8:9], v0, v3
+; CGP-NEXT:    s_and_b64 s[6:7], s[6:7], vcc
+; CGP-NEXT:    v_cndmask_b32_e64 v0, v4, v7, s[6:7]
+; CGP-NEXT:    s_and_b64 s[6:7], s[8:9], s[4:5]
+; CGP-NEXT:    v_cndmask_b32_e64 v1, v5, v10, s[6:7]
+; CGP-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
+; CGP-NEXT:    v_cndmask_b32_e64 v1, v11, v1, s[4:5]
+; CGP-NEXT:    s_setpc_b64 s[30:31]
+  %result = udiv <2 x i32> %num, %den
+  ret <2 x i32> %result
+}
+
+define i32 @v_udiv_i32_pow2k_denom(i32 %num) {
+; CHECK-LABEL: v_udiv_i32_pow2k_denom:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_movk_i32 s6, 0x1000
+; CHECK-NEXT:    v_cvt_f32_u32_e32 v1, s6
+; CHECK-NEXT:    v_rcp_iflag_f32_e32 v1, v1
+; CHECK-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v1
+; CHECK-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; CHECK-NEXT:    v_mul_lo_u32 v2, v1, s6
+; CHECK-NEXT:    v_mul_hi_u32 v3, v1, s6
+; CHECK-NEXT:    v_sub_i32_e32 v4, vcc, 0, v2
+; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
+; CHECK-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; CHECK-NEXT:    v_mul_hi_u32 v2, v2, v1
+; CHECK-NEXT:    v_add_i32_e64 v3, s[4:5], v1, v2
+; CHECK-NEXT:    v_sub_i32_e64 v1, s[4:5], v1, v2
+; CHECK-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; CHECK-NEXT:    v_mul_hi_u32 v1, v1, v0
+; CHECK-NEXT:    v_mul_lo_u32 v2, v1, s6
+; CHECK-NEXT:    v_add_i32_e32 v3, vcc, 1, v1
+; CHECK-NEXT:    v_subrev_i32_e32 v4, vcc, 1, v1
+; CHECK-NEXT:    v_sub_i32_e32 v5, vcc, v0, v2
+; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
+; CHECK-NEXT:    v_cmp_le_u32_e64 s[4:5], s6, v5
+; CHECK-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
+; CHECK-NEXT:    v_cndmask_b32_e64 v0, v3, v1, s[4:5]
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %result = udiv i32 %num, 4096
+  ret i32 %result
+}
+
+define <2 x i32> @v_udiv_v2i32_pow2k_denom(<2 x i32> %num) {
+; CHECK-LABEL: v_udiv_v2i32_pow2k_denom:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_movk_i32 s8, 0x1000
+; CHECK-NEXT:    v_mov_b32_e32 v2, 0x1000
+; CHECK-NEXT:    v_cvt_f32_u32_e32 v3, s8
+; CHECK-NEXT:    v_cvt_f32_u32_e32 v4, v2
+; CHECK-NEXT:    v_rcp_iflag_f32_e32 v3, v3
+; CHECK-NEXT:    v_rcp_iflag_f32_e32 v4, v4
+; CHECK-NEXT:    v_mul_f32_e32 v3, 0x4f800000, v3
+; CHECK-NEXT:    v_mul_f32_e32 v4, 0x4f800000, v4
+; CHECK-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; CHECK-NEXT:    v_cvt_u32_f32_e32 v4, v4
+; CHECK-NEXT:    v_mul_lo_u32 v5, v3, s8
+; CHECK-NEXT:    v_mul_hi_u32 v6, v3, s8
+; CHECK-NEXT:    v_mul_lo_u32 v7, v4, v2
+; CHECK-NEXT:    v_mul_hi_u32 v8, v4, v2
+; CHECK-NEXT:    v_sub_i32_e32 v9, vcc, 0, v5
+; CHECK-NEXT:    v_sub_i32_e32 v10, vcc, 0, v7
+; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v6
+; CHECK-NEXT:    v_cndmask_b32_e32 v5, v5, v9, vcc
+; CHECK-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v8
+; CHECK-NEXT:    v_cndmask_b32_e64 v6, v7, v10, s[4:5]
+; CHECK-NEXT:    v_mul_hi_u32 v5, v5, v3
+; CHECK-NEXT:    v_mul_hi_u32 v6, v6, v4
+; CHECK-NEXT:    v_add_i32_e64 v7, s[6:7], v3, v5
+; CHECK-NEXT:    v_sub_i32_e64 v3, s[6:7], v3, v5
+; CHECK-NEXT:    v_add_i32_e64 v5, s[6:7], v4, v6
+; CHECK-NEXT:    v_sub_i32_e64 v4, s[6:7], v4, v6
+; CHECK-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
+; CHECK-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[4:5]
+; CHECK-NEXT:    v_mul_hi_u32 v3, v3, v0
+; CHECK-NEXT:    v_mul_hi_u32 v4, v4, v1
+; CHECK-NEXT:    v_mul_lo_u32 v5, v3, s8
+; CHECK-NEXT:    v_add_i32_e32 v6, vcc, 1, v3
+; CHECK-NEXT:    v_subrev_i32_e32 v7, vcc, 1, v3
+; CHECK-NEXT:    v_mul_lo_u32 v8, v4, v2
+; CHECK-NEXT:    v_add_i32_e32 v9, vcc, 1, v4
+; CHECK-NEXT:    v_subrev_i32_e32 v10, vcc, 1, v4
+; CHECK-NEXT:    v_sub_i32_e32 v11, vcc, v0, v5
+; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v5
+; CHECK-NEXT:    v_sub_i32_e64 v0, s[4:5], v1, v8
+; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v1, v8
+; CHECK-NEXT:    v_cmp_le_u32_e64 s[6:7], s8, v11
+; CHECK-NEXT:    v_cmp_ge_u32_e64 s[8:9], v0, v2
+; CHECK-NEXT:    s_and_b64 s[6:7], s[6:7], vcc
+; CHECK-NEXT:    v_cndmask_b32_e64 v0, v6, v3, s[6:7]
+; CHECK-NEXT:    s_and_b64 s[6:7], s[8:9], s[4:5]
+; CHECK-NEXT:    v_cndmask_b32_e64 v1, v9, v4, s[6:7]
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc
+; CHECK-NEXT:    v_cndmask_b32_e64 v1, v10, v1, s[4:5]
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %result = udiv <2 x i32> %num, <i32 4096, i32 4096>
+  ret <2 x i32> %result
+}
+
+define i32 @v_udiv_i32_oddk_denom(i32 %num) {
+; CHECK-LABEL: v_udiv_i32_oddk_denom:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_mov_b32 s6, 0x12d8fb
+; CHECK-NEXT:    v_cvt_f32_u32_e32 v1, s6
+; CHECK-NEXT:    v_rcp_iflag_f32_e32 v1, v1
+; CHECK-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v1
+; CHECK-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; CHECK-NEXT:    v_mul_lo_u32 v2, v1, s6
+; CHECK-NEXT:    v_mul_hi_u32 v3, v1, s6
+; CHECK-NEXT:    v_sub_i32_e32 v4, vcc, 0, v2
+; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
+; CHECK-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; CHECK-NEXT:    v_mul_hi_u32 v2, v2, v1
+; CHECK-NEXT:    v_add_i32_e64 v3, s[4:5], v1, v2
+; CHECK-NEXT:    v_sub_i32_e64 v1, s[4:5], v1, v2
+; CHECK-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; CHECK-NEXT:    v_mul_hi_u32 v1, v1, v0
+; CHECK-NEXT:    v_mul_lo_u32 v2, v1, s6
+; CHECK-NEXT:    v_add_i32_e32 v3, vcc, 1, v1
+; CHECK-NEXT:    v_subrev_i32_e32 v4, vcc, 1, v1
+; CHECK-NEXT:    v_sub_i32_e32 v5, vcc, v0, v2
+; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
+; CHECK-NEXT:    v_cmp_le_u32_e64 s[4:5], s6, v5
+; CHECK-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
+; CHECK-NEXT:    v_cndmask_b32_e64 v0, v3, v1, s[4:5]
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %result = udiv i32 %num, 1235195
+  ret i32 %result
+}
+
+define <2 x i32> @v_udiv_v2i32_oddk_denom(<2 x i32> %num) {
+; CHECK-LABEL: v_udiv_v2i32_oddk_denom:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_mov_b32 s8, 0x12d8fb
+; CHECK-NEXT:    v_mov_b32_e32 v2, 0x12d8fb
+; CHECK-NEXT:    v_cvt_f32_u32_e32 v3, s8
+; CHECK-NEXT:    v_cvt_f32_u32_e32 v4, v2
+; CHECK-NEXT:    v_rcp_iflag_f32_e32 v3, v3
+; CHECK-NEXT:    v_rcp_iflag_f32_e32 v4, v4
+; CHECK-NEXT:    v_mul_f32_e32 v3, 0x4f800000, v3
+; CHECK-NEXT:    v_mul_f32_e32 v4, 0x4f800000, v4
+; CHECK-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; CHECK-NEXT:    v_cvt_u32_f32_e32 v4, v4
+; CHECK-NEXT:    v_mul_lo_u32 v5, v3, s8
+; CHECK-NEXT:    v_mul_hi_u32 v6, v3, s8
+; CHECK-NEXT:    v_mul_lo_u32 v7, v4, v2
+; CHECK-NEXT:    v_mul_hi_u32 v8, v4, v2
+; CHECK-NEXT:    v_sub_i32_e32 v9, vcc, 0, v5
+; CHECK-NEXT:    v_sub_i32_e32 v10, vcc, 0, v7
+; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v6
+; CHECK-NEXT:    v_cndmask_b32_e32 v5, v5, v9, vcc
+; CHECK-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v8
+; CHECK-NEXT:    v_cndmask_b32_e64 v6, v7, v10, s[4:5]
+; CHECK-NEXT:    v_mul_hi_u32 v5, v5, v3
+; CHECK-NEXT:    v_mul_hi_u32 v6, v6, v4
+; CHECK-NEXT:    v_add_i32_e64 v7, s[6:7], v3, v5
+; CHECK-NEXT:    v_sub_i32_e64 v3, s[6:7], v3, v5
+; CHECK-NEXT:    v_add_i32_e64 v5, s[6:7], v4, v6
+; CHECK-NEXT:    v_sub_i32_e64 v4, s[6:7], v4, v6
+; CHECK-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
+; CHECK-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[4:5]
+; CHECK-NEXT:    v_mul_hi_u32 v3, v3, v0
+; CHECK-NEXT:    v_mul_hi_u32 v4, v4, v1
+; CHECK-NEXT:    v_mul_lo_u32 v5, v3, s8
+; CHECK-NEXT:    v_add_i32_e32 v6, vcc, 1, v3
+; CHECK-NEXT:    v_subrev_i32_e32 v7, vcc, 1, v3
+; CHECK-NEXT:    v_mul_lo_u32 v8, v4, v2
+; CHECK-NEXT:    v_add_i32_e32 v9, vcc, 1, v4
+; CHECK-NEXT:    v_subrev_i32_e32 v10, vcc, 1, v4
+; CHECK-NEXT:    v_sub_i32_e32 v11, vcc, v0, v5
+; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v5
+; CHECK-NEXT:    v_sub_i32_e64 v0, s[4:5], v1, v8
+; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v1, v8
+; CHECK-NEXT:    v_cmp_le_u32_e64 s[6:7], s8, v11
+; CHECK-NEXT:    v_cmp_ge_u32_e64 s[8:9], v0, v2
+; CHECK-NEXT:    s_and_b64 s[6:7], s[6:7], vcc
+; CHECK-NEXT:    v_cndmask_b32_e64 v0, v6, v3, s[6:7]
+; CHECK-NEXT:    s_and_b64 s[6:7], s[8:9], s[4:5]
+; CHECK-NEXT:    v_cndmask_b32_e64 v1, v9, v4, s[6:7]
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc
+; CHECK-NEXT:    v_cndmask_b32_e64 v1, v10, v1, s[4:5]
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %result = udiv <2 x i32> %num, <i32 1235195, i32 1235195>
+  ret <2 x i32> %result
+}
+
+define i32 @v_udiv_i32_pow2_shl_denom(i32 %x, i32 %y) {
+; CHECK-LABEL: v_udiv_i32_pow2_shl_denom:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_lshl_b32_e32 v1, 0x1000, v1
+; CHECK-NEXT:    v_cvt_f32_u32_e32 v2, v1
+; CHECK-NEXT:    v_rcp_iflag_f32_e32 v2, v2
+; CHECK-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v2
+; CHECK-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; CHECK-NEXT:    v_mul_lo_u32 v3, v2, v1
+; CHECK-NEXT:    v_mul_hi_u32 v4, v2, v1
+; CHECK-NEXT:    v_sub_i32_e32 v5, vcc, 0, v3
+; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; CHECK-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; CHECK-NEXT:    v_mul_hi_u32 v3, v3, v2
+; CHECK-NEXT:    v_add_i32_e64 v4, s[4:5], v2, v3
+; CHECK-NEXT:    v_sub_i32_e64 v2, s[4:5], v2, v3
+; CHECK-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; CHECK-NEXT:    v_mul_hi_u32 v2, v2, v0
+; CHECK-NEXT:    v_mul_lo_u32 v3, v2, v1
+; CHECK-NEXT:    v_add_i32_e32 v4, vcc, 1, v2
+; CHECK-NEXT:    v_subrev_i32_e32 v5, vcc, 1, v2
+; CHECK-NEXT:    v_sub_i32_e32 v6, vcc, v0, v3
+; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v3
+; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v6, v1
+; CHECK-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
+; CHECK-NEXT:    v_cndmask_b32_e64 v0, v4, v2, s[4:5]
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %shl.y = shl i32 4096, %y
+  %r = udiv i32 %x, %shl.y
+  ret i32 %r
+}
+
+define <2 x i32> @v_udiv_v2i32_pow2_shl_denom(<2 x i32> %x, <2 x i32> %y) {
+; GISEL-LABEL: v_udiv_v2i32_pow2_shl_denom:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_movk_i32 s4, 0x1000
+; GISEL-NEXT:    v_lshl_b32_e32 v2, s4, v2
+; GISEL-NEXT:    v_lshl_b32_e32 v3, s4, v3
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, v2
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v5, v3
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v4, v4
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v5, v5
+; GISEL-NEXT:    v_mul_f32_e32 v4, 0x4f800000, v4
+; GISEL-NEXT:    v_mul_f32_e32 v5, 0x4f800000, v5
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v4, v4
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v5, v5
+; GISEL-NEXT:    v_mul_lo_u32 v6, v4, v2
+; GISEL-NEXT:    v_mul_hi_u32 v7, v4, v2
+; GISEL-NEXT:    v_mul_lo_u32 v8, v5, v3
+; GISEL-NEXT:    v_mul_hi_u32 v9, v5, v3
+; GISEL-NEXT:    v_sub_i32_e32 v10, vcc, 0, v6
+; GISEL-NEXT:    v_sub_i32_e32 v11, vcc, 0, v8
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
+; GISEL-NEXT:    v_cndmask_b32_e32 v6, v6, v10, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v9
+; GISEL-NEXT:    v_cndmask_b32_e64 v7, v8, v11, s[4:5]
+; GISEL-NEXT:    v_mul_hi_u32 v6, v6, v4
+; GISEL-NEXT:    v_mul_hi_u32 v7, v7, v5
+; GISEL-NEXT:    v_add_i32_e64 v8, s[6:7], v4, v6
+; GISEL-NEXT:    v_sub_i32_e64 v4, s[6:7], v4, v6
+; GISEL-NEXT:    v_add_i32_e64 v6, s[6:7], v5, v7
+; GISEL-NEXT:    v_sub_i32_e64 v5, s[6:7], v5, v7
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, v4, v8, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v5, v5, v6, s[4:5]
+; GISEL-NEXT:    v_mul_hi_u32 v4, v4, v0
+; GISEL-NEXT:    v_mul_hi_u32 v5, v5, v1
+; GISEL-NEXT:    v_mul_lo_u32 v6, v4, v2
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, 1, v4
+; GISEL-NEXT:    v_subrev_i32_e32 v8, vcc, 1, v4
+; GISEL-NEXT:    v_mul_lo_u32 v9, v5, v3
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, 1, v5
+; GISEL-NEXT:    v_subrev_i32_e32 v11, vcc, 1, v5
+; GISEL-NEXT:    v_sub_i32_e32 v12, vcc, v0, v6
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v6
+; GISEL-NEXT:    v_sub_i32_e64 v0, s[4:5], v1, v9
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v1, v9
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[6:7], v12, v2
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[8:9], v0, v3
+; GISEL-NEXT:    s_and_b64 s[6:7], s[6:7], vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, v7, v4, s[6:7]
+; GISEL-NEXT:    s_and_b64 s[6:7], s[8:9], s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v1, v10, v5, s[6:7]
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v1, v11, v1, s[4:5]
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; CGP-LABEL: v_udiv_v2i32_pow2_shl_denom:
+; CGP:       ; %bb.0:
+; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CGP-NEXT:    s_movk_i32 s4, 0x1000
+; CGP-NEXT:    v_mul_lo_u32 v4, 0, v0
+; CGP-NEXT:    v_mul_lo_u32 v5, 0, v1
+; CGP-NEXT:    v_lshl_b32_e32 v2, s4, v2
+; CGP-NEXT:    v_lshl_b32_e32 v3, s4, v3
+; CGP-NEXT:    v_cvt_f32_u32_e32 v6, v2
+; CGP-NEXT:    v_mul_lo_u32 v7, 0, v2
+; CGP-NEXT:    v_cvt_f32_u32_e32 v8, v3
+; CGP-NEXT:    v_mul_lo_u32 v9, 0, v3
+; CGP-NEXT:    v_rcp_f32_e32 v6, v6
+; CGP-NEXT:    v_rcp_f32_e32 v8, v8
+; CGP-NEXT:    v_mul_f32_e32 v6, 0x4f800000, v6
+; CGP-NEXT:    v_mul_f32_e32 v8, 0x4f800000, v8
+; CGP-NEXT:    v_cvt_u32_f32_e32 v6, v6
+; CGP-NEXT:    v_cvt_u32_f32_e32 v8, v8
+; CGP-NEXT:    v_mul_lo_u32 v10, v6, v2
+; CGP-NEXT:    v_mul_lo_u32 v11, v6, 0
+; CGP-NEXT:    v_mul_hi_u32 v12, v6, v2
+; CGP-NEXT:    v_mul_lo_u32 v13, v8, v3
+; CGP-NEXT:    v_mul_lo_u32 v14, v8, 0
+; CGP-NEXT:    v_mul_hi_u32 v15, v8, v3
+; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v11
+; CGP-NEXT:    v_sub_i32_e32 v16, vcc, 0, v10
+; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v14
+; CGP-NEXT:    v_sub_i32_e32 v17, vcc, 0, v13
+; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v12
+; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v15
+; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
+; CGP-NEXT:    v_cndmask_b32_e32 v7, v10, v16, vcc
+; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v9
+; CGP-NEXT:    v_cndmask_b32_e64 v9, v13, v17, s[4:5]
+; CGP-NEXT:    v_mul_lo_u32 v10, v7, 0
+; CGP-NEXT:    v_mul_hi_u32 v7, v7, v6
+; CGP-NEXT:    v_mul_lo_u32 v12, v9, 0
+; CGP-NEXT:    v_mul_hi_u32 v9, v9, v8
+; CGP-NEXT:    v_add_i32_e64 v10, s[6:7], v11, v10
+; CGP-NEXT:    v_add_i32_e64 v11, s[6:7], v14, v12
+; CGP-NEXT:    v_add_i32_e64 v7, s[6:7], v10, v7
+; CGP-NEXT:    v_add_i32_e64 v9, s[6:7], v11, v9
+; CGP-NEXT:    v_add_i32_e64 v10, s[6:7], v6, v7
+; CGP-NEXT:    v_sub_i32_e64 v6, s[6:7], v6, v7
+; CGP-NEXT:    v_add_i32_e64 v7, s[6:7], v8, v9
+; CGP-NEXT:    v_sub_i32_e64 v8, s[6:7], v8, v9
+; CGP-NEXT:    v_cndmask_b32_e32 v6, v6, v10, vcc
+; CGP-NEXT:    v_cndmask_b32_e64 v7, v8, v7, s[4:5]
+; CGP-NEXT:    v_mul_lo_u32 v8, v6, 0
+; CGP-NEXT:    v_mul_hi_u32 v6, v6, v0
+; CGP-NEXT:    v_mul_lo_u32 v9, v7, 0
+; CGP-NEXT:    v_mul_hi_u32 v7, v7, v1
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
+; CGP-NEXT:    v_mul_lo_u32 v6, v4, v2
+; CGP-NEXT:    v_add_i32_e32 v7, vcc, 1, v4
+; CGP-NEXT:    v_subrev_i32_e32 v8, vcc, 1, v4
+; CGP-NEXT:    v_mul_lo_u32 v9, v5, v3
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, 1, v5
+; CGP-NEXT:    v_subrev_i32_e32 v11, vcc, 1, v5
+; CGP-NEXT:    v_sub_i32_e32 v12, vcc, v0, v6
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v6
+; CGP-NEXT:    v_sub_i32_e64 v0, s[4:5], v1, v9
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v1, v9
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[6:7], v12, v2
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[8:9], v0, v3
+; CGP-NEXT:    s_and_b64 s[6:7], s[6:7], vcc
+; CGP-NEXT:    v_cndmask_b32_e64 v0, v4, v7, s[6:7]
+; CGP-NEXT:    s_and_b64 s[6:7], s[8:9], s[4:5]
+; CGP-NEXT:    v_cndmask_b32_e64 v1, v5, v10, s[6:7]
+; CGP-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
+; CGP-NEXT:    v_cndmask_b32_e64 v1, v11, v1, s[4:5]
+; CGP-NEXT:    s_setpc_b64 s[30:31]
+  %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
+  %r = udiv <2 x i32> %x, %shl.y
+  ret <2 x i32> %r
+}
+
+define i32 @v_udiv_i32_24bit(i32 %num, i32 %den) {
+; GISEL-LABEL: v_udiv_i32_24bit:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, 0xffffff
+; GISEL-NEXT:    v_and_b32_e32 v0, s4, v0
+; GISEL-NEXT:    v_and_b32_e32 v1, s4, v1
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v2, v1
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v2, v2
+; GISEL-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v2
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GISEL-NEXT:    v_mul_lo_u32 v3, v2, v1
+; GISEL-NEXT:    v_mul_hi_u32 v4, v2, v1
+; GISEL-NEXT:    v_sub_i32_e32 v5, vcc, 0, v3
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; GISEL-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; GISEL-NEXT:    v_mul_hi_u32 v3, v3, v2
+; GISEL-NEXT:    v_add_i32_e64 v4, s[4:5], v2, v3
+; GISEL-NEXT:    v_sub_i32_e64 v2, s[4:5], v2, v3
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GISEL-NEXT:    v_mul_hi_u32 v2, v2, v0
+; GISEL-NEXT:    v_mul_lo_u32 v3, v2, v1
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, 1, v2
+; GISEL-NEXT:    v_subrev_i32_e32 v5, vcc, 1, v2
+; GISEL-NEXT:    v_sub_i32_e32 v6, vcc, v0, v3
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v3
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v6, v1
+; GISEL-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, v4, v2, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; CGP-LABEL: v_udiv_i32_24bit:
+; CGP:       ; %bb.0:
+; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CGP-NEXT:    s_mov_b32 s4, 0xffffff
+; CGP-NEXT:    v_and_b32_e32 v0, s4, v0
+; CGP-NEXT:    v_and_b32_e32 v1, s4, v1
+; CGP-NEXT:    v_cvt_f32_u32_e32 v2, v1
+; CGP-NEXT:    v_mul_lo_u32 v3, 0, v1
+; CGP-NEXT:    v_mul_lo_u32 v4, 0, v0
+; CGP-NEXT:    v_rcp_f32_e32 v2, v2
+; CGP-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v2
+; CGP-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; CGP-NEXT:    v_mul_lo_u32 v5, v2, v1
+; CGP-NEXT:    v_mul_lo_u32 v6, v2, 0
+; CGP-NEXT:    v_mul_hi_u32 v7, v2, v1
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
+; CGP-NEXT:    v_sub_i32_e32 v8, vcc, 0, v5
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
+; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
+; CGP-NEXT:    v_cndmask_b32_e32 v3, v5, v8, vcc
+; CGP-NEXT:    v_mul_lo_u32 v5, v3, 0
+; CGP-NEXT:    v_mul_hi_u32 v3, v3, v2
+; CGP-NEXT:    v_add_i32_e64 v5, s[4:5], v6, v5
+; CGP-NEXT:    v_add_i32_e64 v3, s[4:5], v5, v3
+; CGP-NEXT:    v_add_i32_e64 v5, s[4:5], v2, v3
+; CGP-NEXT:    v_sub_i32_e64 v2, s[4:5], v2, v3
+; CGP-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
+; CGP-NEXT:    v_mul_lo_u32 v3, v2, 0
+; CGP-NEXT:    v_mul_hi_u32 v2, v2, v0
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; CGP-NEXT:    v_mul_lo_u32 v3, v2, v1
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, 1, v2
+; CGP-NEXT:    v_subrev_i32_e32 v5, vcc, 1, v2
+; CGP-NEXT:    v_sub_i32_e32 v6, vcc, v0, v3
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v3
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v6, v1
+; CGP-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
+; CGP-NEXT:    v_cndmask_b32_e64 v0, v2, v4, s[4:5]
+; CGP-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
+; CGP-NEXT:    s_setpc_b64 s[30:31]
+  %num.mask = and i32 %num, 16777215
+  %den.mask = and i32 %den, 16777215
+  %result = udiv i32 %num.mask, %den.mask
+  ret i32 %result
+}
+
+define <2 x i32> @v_udiv_v2i32_24bit(<2 x i32> %num, <2 x i32> %den) {
+; GISEL-LABEL: v_udiv_v2i32_24bit:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, 0xffffff
+; GISEL-NEXT:    v_and_b32_e32 v0, s4, v0
+; GISEL-NEXT:    v_and_b32_e32 v1, s4, v1
+; GISEL-NEXT:    v_and_b32_e32 v2, s4, v2
+; GISEL-NEXT:    v_and_b32_e32 v3, s4, v3
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, v2
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v5, v3
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v4, v4
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v5, v5
+; GISEL-NEXT:    v_mul_f32_e32 v4, 0x4f800000, v4
+; GISEL-NEXT:    v_mul_f32_e32 v5, 0x4f800000, v5
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v4, v4
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v5, v5
+; GISEL-NEXT:    v_mul_lo_u32 v6, v4, v2
+; GISEL-NEXT:    v_mul_hi_u32 v7, v4, v2
+; GISEL-NEXT:    v_mul_lo_u32 v8, v5, v3
+; GISEL-NEXT:    v_mul_hi_u32 v9, v5, v3
+; GISEL-NEXT:    v_sub_i32_e32 v10, vcc, 0, v6
+; GISEL-NEXT:    v_sub_i32_e32 v11, vcc, 0, v8
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
+; GISEL-NEXT:    v_cndmask_b32_e32 v6, v6, v10, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v9
+; GISEL-NEXT:    v_cndmask_b32_e64 v7, v8, v11, s[4:5]
+; GISEL-NEXT:    v_mul_hi_u32 v6, v6, v4
+; GISEL-NEXT:    v_mul_hi_u32 v7, v7, v5
+; GISEL-NEXT:    v_add_i32_e64 v8, s[6:7], v4, v6
+; GISEL-NEXT:    v_sub_i32_e64 v4, s[6:7], v4, v6
+; GISEL-NEXT:    v_add_i32_e64 v6, s[6:7], v5, v7
+; GISEL-NEXT:    v_sub_i32_e64 v5, s[6:7], v5, v7
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, v4, v8, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v5, v5, v6, s[4:5]
+; GISEL-NEXT:    v_mul_hi_u32 v4, v4, v0
+; GISEL-NEXT:    v_mul_hi_u32 v5, v5, v1
+; GISEL-NEXT:    v_mul_lo_u32 v6, v4, v2
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, 1, v4
+; GISEL-NEXT:    v_subrev_i32_e32 v8, vcc, 1, v4
+; GISEL-NEXT:    v_mul_lo_u32 v9, v5, v3
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, 1, v5
+; GISEL-NEXT:    v_subrev_i32_e32 v11, vcc, 1, v5
+; GISEL-NEXT:    v_sub_i32_e32 v12, vcc, v0, v6
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v6
+; GISEL-NEXT:    v_sub_i32_e64 v0, s[4:5], v1, v9
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v1, v9
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[6:7], v12, v2
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[8:9], v0, v3
+; GISEL-NEXT:    s_and_b64 s[6:7], s[6:7], vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, v7, v4, s[6:7]
+; GISEL-NEXT:    s_and_b64 s[6:7], s[8:9], s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v1, v10, v5, s[6:7]
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v1, v11, v1, s[4:5]
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; CGP-LABEL: v_udiv_v2i32_24bit:
+; CGP:       ; %bb.0:
+; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CGP-NEXT:    s_mov_b32 s4, 0xffffff
+; CGP-NEXT:    v_and_b32_e32 v0, s4, v0
+; CGP-NEXT:    v_and_b32_e32 v1, s4, v1
+; CGP-NEXT:    v_and_b32_e32 v2, s4, v2
+; CGP-NEXT:    v_and_b32_e32 v3, s4, v3
+; CGP-NEXT:    v_cvt_f32_u32_e32 v4, v2
+; CGP-NEXT:    v_mul_lo_u32 v5, 0, v2
+; CGP-NEXT:    v_mul_lo_u32 v6, 0, v0
+; CGP-NEXT:    v_cvt_f32_u32_e32 v7, v3
+; CGP-NEXT:    v_mul_lo_u32 v8, 0, v3
+; CGP-NEXT:    v_mul_lo_u32 v9, 0, v1
+; CGP-NEXT:    v_rcp_f32_e32 v4, v4
+; CGP-NEXT:    v_rcp_f32_e32 v7, v7
+; CGP-NEXT:    v_mul_f32_e32 v4, 0x4f800000, v4
+; CGP-NEXT:    v_mul_f32_e32 v7, 0x4f800000, v7
+; CGP-NEXT:    v_cvt_u32_f32_e32 v4, v4
+; CGP-NEXT:    v_cvt_u32_f32_e32 v7, v7
+; CGP-NEXT:    v_mul_lo_u32 v10, v4, v2
+; CGP-NEXT:    v_mul_lo_u32 v11, v4, 0
+; CGP-NEXT:    v_mul_hi_u32 v12, v4, v2
+; CGP-NEXT:    v_mul_lo_u32 v13, v7, v3
+; CGP-NEXT:    v_mul_lo_u32 v14, v7, 0
+; CGP-NEXT:    v_mul_hi_u32 v15, v7, v3
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v11
+; CGP-NEXT:    v_sub_i32_e32 v16, vcc, 0, v10
+; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v14
+; CGP-NEXT:    v_sub_i32_e32 v17, vcc, 0, v13
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v12
+; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v15
+; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
+; CGP-NEXT:    v_cndmask_b32_e32 v5, v10, v16, vcc
+; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v8
+; CGP-NEXT:    v_cndmask_b32_e64 v8, v13, v17, s[4:5]
+; CGP-NEXT:    v_mul_lo_u32 v10, v5, 0
+; CGP-NEXT:    v_mul_hi_u32 v5, v5, v4
+; CGP-NEXT:    v_mul_lo_u32 v12, v8, 0
+; CGP-NEXT:    v_mul_hi_u32 v8, v8, v7
+; CGP-NEXT:    v_add_i32_e64 v10, s[6:7], v11, v10
+; CGP-NEXT:    v_add_i32_e64 v11, s[6:7], v14, v12
+; CGP-NEXT:    v_add_i32_e64 v5, s[6:7], v10, v5
+; CGP-NEXT:    v_add_i32_e64 v8, s[6:7], v11, v8
+; CGP-NEXT:    v_add_i32_e64 v10, s[6:7], v4, v5
+; CGP-NEXT:    v_sub_i32_e64 v4, s[6:7], v4, v5
+; CGP-NEXT:    v_add_i32_e64 v5, s[6:7], v7, v8
+; CGP-NEXT:    v_sub_i32_e64 v7, s[6:7], v7, v8
+; CGP-NEXT:    v_cndmask_b32_e32 v4, v4, v10, vcc
+; CGP-NEXT:    v_cndmask_b32_e64 v5, v7, v5, s[4:5]
+; CGP-NEXT:    v_mul_lo_u32 v7, v4, 0
+; CGP-NEXT:    v_mul_hi_u32 v4, v4, v0
+; CGP-NEXT:    v_mul_lo_u32 v8, v5, 0
+; CGP-NEXT:    v_mul_hi_u32 v5, v5, v1
+; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
+; CGP-NEXT:    v_add_i32_e32 v7, vcc, v9, v8
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
+; CGP-NEXT:    v_mul_lo_u32 v6, v4, v2
+; CGP-NEXT:    v_add_i32_e32 v7, vcc, 1, v4
+; CGP-NEXT:    v_subrev_i32_e32 v8, vcc, 1, v4
+; CGP-NEXT:    v_mul_lo_u32 v9, v5, v3
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, 1, v5
+; CGP-NEXT:    v_subrev_i32_e32 v11, vcc, 1, v5
+; CGP-NEXT:    v_sub_i32_e32 v12, vcc, v0, v6
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v6
+; CGP-NEXT:    v_sub_i32_e64 v0, s[4:5], v1, v9
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v1, v9
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[6:7], v12, v2
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[8:9], v0, v3
+; CGP-NEXT:    s_and_b64 s[6:7], s[6:7], vcc
+; CGP-NEXT:    v_cndmask_b32_e64 v0, v4, v7, s[6:7]
+; CGP-NEXT:    s_and_b64 s[6:7], s[8:9], s[4:5]
+; CGP-NEXT:    v_cndmask_b32_e64 v1, v5, v10, s[6:7]
+; CGP-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
+; CGP-NEXT:    v_cndmask_b32_e64 v1, v11, v1, s[4:5]
+; CGP-NEXT:    s_setpc_b64 s[30:31]
+  %num.mask = and <2 x i32> %num, <i32 16777215, i32 16777215>
+  %den.mask = and <2 x i32> %den, <i32 16777215, i32 16777215>
+  %result = udiv <2 x i32> %num.mask, %den.mask
+  ret <2 x i32> %result
+}

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll
new file mode 100644
index 000000000000..74ba97b6c95d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll
@@ -0,0 +1,853 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -amdgpu-codegenprepare-disable-idiv-expansion=1 -mtriple=amdgcn-amd-amdhsa < %s | FileCheck -check-prefixes=CHECK,GISEL %s
+; RUN: llc -global-isel -amdgpu-codegenprepare-disable-idiv-expansion=0 -mtriple=amdgcn-amd-amdhsa < %s | FileCheck -check-prefixes=CHECK,CGP %s
+
+; The same 32-bit expansion is implemented in the legalizer and in AMDGPUCodeGenPrepare.
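+; In outline (matching the checks below): the remainder expansion computes
+; the quotient the same way as udiv (f32 reciprocal scaled by 0x4f800000
+; plus one correction step), multiplies it back by the denominator, and
+; subtracts from the numerator; the compare/select fixups then add or
+; subtract the denominator once if the quotient estimate was off by one.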
+
+define i32 @v_urem_i32(i32 %num, i32 %den) {
+; GISEL-LABEL: v_urem_i32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v2, v1
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v2, v2
+; GISEL-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v2
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GISEL-NEXT:    v_mul_lo_u32 v3, v2, v1
+; GISEL-NEXT:    v_mul_hi_u32 v4, v2, v1
+; GISEL-NEXT:    v_sub_i32_e32 v5, vcc, 0, v3
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; GISEL-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; GISEL-NEXT:    v_mul_hi_u32 v3, v3, v2
+; GISEL-NEXT:    v_add_i32_e64 v4, s[4:5], v2, v3
+; GISEL-NEXT:    v_sub_i32_e64 v2, s[4:5], v2, v3
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GISEL-NEXT:    v_mul_hi_u32 v2, v2, v0
+; GISEL-NEXT:    v_mul_lo_u32 v2, v2, v1
+; GISEL-NEXT:    v_sub_i32_e32 v3, vcc, v0, v2
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v1
+; GISEL-NEXT:    v_add_i32_e64 v4, s[4:5], v3, v1
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v2
+; GISEL-NEXT:    v_sub_i32_e64 v0, s[6:7], v3, v1
+; GISEL-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, v4, v0, s[4:5]
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; CGP-LABEL: v_urem_i32:
+; CGP:       ; %bb.0:
+; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CGP-NEXT:    v_cvt_f32_u32_e32 v2, v1
+; CGP-NEXT:    v_mul_lo_u32 v3, 0, v1
+; CGP-NEXT:    v_mul_lo_u32 v4, 0, v0
+; CGP-NEXT:    v_rcp_f32_e32 v2, v2
+; CGP-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v2
+; CGP-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; CGP-NEXT:    v_mul_lo_u32 v5, v2, v1
+; CGP-NEXT:    v_mul_lo_u32 v6, v2, 0
+; CGP-NEXT:    v_mul_hi_u32 v7, v2, v1
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
+; CGP-NEXT:    v_sub_i32_e32 v8, vcc, 0, v5
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
+; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
+; CGP-NEXT:    v_cndmask_b32_e32 v3, v5, v8, vcc
+; CGP-NEXT:    v_mul_lo_u32 v5, v3, 0
+; CGP-NEXT:    v_mul_hi_u32 v3, v3, v2
+; CGP-NEXT:    v_add_i32_e64 v5, s[4:5], v6, v5
+; CGP-NEXT:    v_add_i32_e64 v3, s[4:5], v5, v3
+; CGP-NEXT:    v_add_i32_e64 v5, s[4:5], v2, v3
+; CGP-NEXT:    v_sub_i32_e64 v2, s[4:5], v2, v3
+; CGP-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
+; CGP-NEXT:    v_mul_lo_u32 v3, v2, 0
+; CGP-NEXT:    v_mul_hi_u32 v2, v2, v0
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; CGP-NEXT:    v_mul_lo_u32 v2, v2, v1
+; CGP-NEXT:    v_sub_i32_e32 v3, vcc, v0, v2
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v1
+; CGP-NEXT:    v_add_i32_e64 v4, s[4:5], v3, v1
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v2
+; CGP-NEXT:    v_sub_i32_e64 v0, s[6:7], v3, v1
+; CGP-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; CGP-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; CGP-NEXT:    v_cndmask_b32_e64 v0, v4, v0, s[4:5]
+; CGP-NEXT:    s_setpc_b64 s[30:31]
+  %result = urem i32 %num, %den
+  ret i32 %result
+}
+
+; FIXME: This is a workaround for not handling the uniform VGPR case.
+declare i32 @llvm.amdgcn.readfirstlane(i32)
+
+define amdgpu_ps i32 @s_urem_i32(i32 inreg %num, i32 inreg %den) {
+; GISEL-LABEL: s_urem_i32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v0, s1
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GISEL-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v0
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GISEL-NEXT:    v_mul_lo_u32 v1, v0, s1
+; GISEL-NEXT:    v_mul_hi_u32 v2, v0, s1
+; GISEL-NEXT:    v_sub_i32_e32 v3, vcc, 0, v1
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GISEL-NEXT:    v_mul_hi_u32 v1, v1, v0
+; GISEL-NEXT:    v_add_i32_e64 v2, s[2:3], v0, v1
+; GISEL-NEXT:    v_sub_i32_e64 v0, s[2:3], v0, v1
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GISEL-NEXT:    v_mul_hi_u32 v0, v0, s0
+; GISEL-NEXT:    v_mul_lo_u32 v0, v0, s1
+; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, s0, v0
+; GISEL-NEXT:    v_cmp_le_u32_e32 vcc, s1, v1
+; GISEL-NEXT:    v_add_i32_e64 v2, s[2:3], s1, v1
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], s0, v0
+; GISEL-NEXT:    v_subrev_i32_e64 v0, s[2:3], s1, v1
+; GISEL-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[4:5]
+; GISEL-NEXT:    v_readfirstlane_b32 s0, v0
+; GISEL-NEXT:    ; return to shader part epilog
+;
+; CGP-LABEL: s_urem_i32:
+; CGP:       ; %bb.0:
+; CGP-NEXT:    s_mov_b32 s4, s1
+; CGP-NEXT:    v_cvt_f32_u32_e32 v0, s4
+; CGP-NEXT:    s_bfe_u64 s[2:3], s[4:5], 0x200000
+; CGP-NEXT:    s_bfe_u64 s[6:7], s[0:1], 0x200000
+; CGP-NEXT:    v_rcp_f32_e32 v0, v0
+; CGP-NEXT:    v_mul_lo_u32 v1, 0, s2
+; CGP-NEXT:    v_mul_lo_u32 v2, 0, s6
+; CGP-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v0
+; CGP-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; CGP-NEXT:    v_mul_lo_u32 v3, v0, s2
+; CGP-NEXT:    v_mul_lo_u32 v4, v0, s3
+; CGP-NEXT:    v_mul_hi_u32 v5, v0, s2
+; CGP-NEXT:    v_mul_lo_u32 v6, 0, v0
+; CGP-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
+; CGP-NEXT:    v_sub_i32_e32 v4, vcc, 0, v3
+; CGP-NEXT:    v_add_i32_e32 v1, vcc, v1, v5
+; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; CGP-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; CGP-NEXT:    v_mul_lo_u32 v3, v1, 0
+; CGP-NEXT:    v_mul_hi_u32 v1, v1, v0
+; CGP-NEXT:    v_add_i32_e64 v3, s[2:3], v6, v3
+; CGP-NEXT:    v_add_i32_e64 v1, s[2:3], v3, v1
+; CGP-NEXT:    v_add_i32_e64 v3, s[2:3], v0, v1
+; CGP-NEXT:    v_sub_i32_e64 v0, s[2:3], v0, v1
+; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; CGP-NEXT:    v_mul_lo_u32 v1, v0, s7
+; CGP-NEXT:    v_mul_hi_u32 v0, v0, s6
+; CGP-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
+; CGP-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
+; CGP-NEXT:    v_mul_lo_u32 v0, v0, s4
+; CGP-NEXT:    v_sub_i32_e32 v1, vcc, s0, v0
+; CGP-NEXT:    v_cmp_le_u32_e32 vcc, s4, v1
+; CGP-NEXT:    v_add_i32_e64 v2, s[2:3], s4, v1
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[0:1], s0, v0
+; CGP-NEXT:    v_subrev_i32_e64 v0, s[2:3], s4, v1
+; CGP-NEXT:    s_and_b64 vcc, vcc, s[0:1]
+; CGP-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; CGP-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[0:1]
+; CGP-NEXT:    v_readfirstlane_b32 s0, v0
+; CGP-NEXT:    ; return to shader part epilog
+  %result = urem i32 %num, %den
+  %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %result)
+  ret i32 %readlane
+}
+
+define <2 x i32> @v_urem_v2i32(<2 x i32> %num, <2 x i32> %den) {
+; GISEL-LABEL: v_urem_v2i32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, v2
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v5, v3
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v4, v4
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v5, v5
+; GISEL-NEXT:    v_mul_f32_e32 v4, 0x4f800000, v4
+; GISEL-NEXT:    v_mul_f32_e32 v5, 0x4f800000, v5
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v4, v4
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v5, v5
+; GISEL-NEXT:    v_mul_lo_u32 v6, v4, v2
+; GISEL-NEXT:    v_mul_hi_u32 v7, v4, v2
+; GISEL-NEXT:    v_mul_lo_u32 v8, v5, v3
+; GISEL-NEXT:    v_mul_hi_u32 v9, v5, v3
+; GISEL-NEXT:    v_sub_i32_e32 v10, vcc, 0, v6
+; GISEL-NEXT:    v_sub_i32_e32 v11, vcc, 0, v8
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
+; GISEL-NEXT:    v_cndmask_b32_e32 v6, v6, v10, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v9
+; GISEL-NEXT:    v_cndmask_b32_e64 v7, v8, v11, s[4:5]
+; GISEL-NEXT:    v_mul_hi_u32 v6, v6, v4
+; GISEL-NEXT:    v_mul_hi_u32 v7, v7, v5
+; GISEL-NEXT:    v_add_i32_e64 v8, s[6:7], v4, v6
+; GISEL-NEXT:    v_sub_i32_e64 v4, s[6:7], v4, v6
+; GISEL-NEXT:    v_add_i32_e64 v6, s[6:7], v5, v7
+; GISEL-NEXT:    v_sub_i32_e64 v5, s[6:7], v5, v7
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, v4, v8, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v5, v5, v6, s[4:5]
+; GISEL-NEXT:    v_mul_hi_u32 v4, v4, v0
+; GISEL-NEXT:    v_mul_hi_u32 v5, v5, v1
+; GISEL-NEXT:    v_mul_lo_u32 v4, v4, v2
+; GISEL-NEXT:    v_mul_lo_u32 v5, v5, v3
+; GISEL-NEXT:    v_sub_i32_e32 v6, vcc, v0, v4
+; GISEL-NEXT:    v_sub_i32_e32 v7, vcc, v1, v5
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v6, v2
+; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v6, v2
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v4
+; GISEL-NEXT:    v_sub_i32_e64 v0, s[6:7], v6, v2
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[6:7], v7, v3
+; GISEL-NEXT:    v_add_i32_e64 v2, s[8:9], v7, v3
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[8:9], v1, v5
+; GISEL-NEXT:    v_sub_i32_e64 v1, s[10:11], v7, v3
+; GISEL-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
+; GISEL-NEXT:    s_and_b64 vcc, s[6:7], s[8:9]
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, v8, v0, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v1, v2, v1, s[8:9]
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; CGP-LABEL: v_urem_v2i32:
+; CGP:       ; %bb.0:
+; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CGP-NEXT:    v_cvt_f32_u32_e32 v4, v2
+; CGP-NEXT:    v_mul_lo_u32 v5, 0, v2
+; CGP-NEXT:    v_mul_lo_u32 v6, 0, v0
+; CGP-NEXT:    v_cvt_f32_u32_e32 v7, v3
+; CGP-NEXT:    v_mul_lo_u32 v8, 0, v3
+; CGP-NEXT:    v_mul_lo_u32 v9, 0, v1
+; CGP-NEXT:    v_rcp_f32_e32 v4, v4
+; CGP-NEXT:    v_rcp_f32_e32 v7, v7
+; CGP-NEXT:    v_mul_f32_e32 v4, 0x4f800000, v4
+; CGP-NEXT:    v_mul_f32_e32 v7, 0x4f800000, v7
+; CGP-NEXT:    v_cvt_u32_f32_e32 v4, v4
+; CGP-NEXT:    v_cvt_u32_f32_e32 v7, v7
+; CGP-NEXT:    v_mul_lo_u32 v10, v4, v2
+; CGP-NEXT:    v_mul_lo_u32 v11, v4, 0
+; CGP-NEXT:    v_mul_hi_u32 v12, v4, v2
+; CGP-NEXT:    v_mul_lo_u32 v13, v7, v3
+; CGP-NEXT:    v_mul_lo_u32 v14, v7, 0
+; CGP-NEXT:    v_mul_hi_u32 v15, v7, v3
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v11
+; CGP-NEXT:    v_sub_i32_e32 v16, vcc, 0, v10
+; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v14
+; CGP-NEXT:    v_sub_i32_e32 v17, vcc, 0, v13
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v12
+; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v15
+; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
+; CGP-NEXT:    v_cndmask_b32_e32 v5, v10, v16, vcc
+; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v8
+; CGP-NEXT:    v_cndmask_b32_e64 v8, v13, v17, s[4:5]
+; CGP-NEXT:    v_mul_lo_u32 v10, v5, 0
+; CGP-NEXT:    v_mul_hi_u32 v5, v5, v4
+; CGP-NEXT:    v_mul_lo_u32 v12, v8, 0
+; CGP-NEXT:    v_mul_hi_u32 v8, v8, v7
+; CGP-NEXT:    v_add_i32_e64 v10, s[6:7], v11, v10
+; CGP-NEXT:    v_add_i32_e64 v11, s[6:7], v14, v12
+; CGP-NEXT:    v_add_i32_e64 v5, s[6:7], v10, v5
+; CGP-NEXT:    v_add_i32_e64 v8, s[6:7], v11, v8
+; CGP-NEXT:    v_add_i32_e64 v10, s[6:7], v4, v5
+; CGP-NEXT:    v_sub_i32_e64 v4, s[6:7], v4, v5
+; CGP-NEXT:    v_add_i32_e64 v5, s[6:7], v7, v8
+; CGP-NEXT:    v_sub_i32_e64 v7, s[6:7], v7, v8
+; CGP-NEXT:    v_cndmask_b32_e32 v4, v4, v10, vcc
+; CGP-NEXT:    v_cndmask_b32_e64 v5, v7, v5, s[4:5]
+; CGP-NEXT:    v_mul_lo_u32 v7, v4, 0
+; CGP-NEXT:    v_mul_hi_u32 v4, v4, v0
+; CGP-NEXT:    v_mul_lo_u32 v8, v5, 0
+; CGP-NEXT:    v_mul_hi_u32 v5, v5, v1
+; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
+; CGP-NEXT:    v_add_i32_e32 v7, vcc, v9, v8
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
+; CGP-NEXT:    v_mul_lo_u32 v4, v4, v2
+; CGP-NEXT:    v_mul_lo_u32 v5, v5, v3
+; CGP-NEXT:    v_sub_i32_e32 v6, vcc, v0, v4
+; CGP-NEXT:    v_sub_i32_e32 v7, vcc, v1, v5
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v6, v2
+; CGP-NEXT:    v_add_i32_e64 v8, s[4:5], v6, v2
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v4
+; CGP-NEXT:    v_sub_i32_e64 v0, s[6:7], v6, v2
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[6:7], v7, v3
+; CGP-NEXT:    v_add_i32_e64 v2, s[8:9], v7, v3
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[8:9], v1, v5
+; CGP-NEXT:    v_sub_i32_e64 v1, s[10:11], v7, v3
+; CGP-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; CGP-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
+; CGP-NEXT:    s_and_b64 vcc, s[6:7], s[8:9]
+; CGP-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc
+; CGP-NEXT:    v_cndmask_b32_e64 v0, v8, v0, s[4:5]
+; CGP-NEXT:    v_cndmask_b32_e64 v1, v2, v1, s[8:9]
+; CGP-NEXT:    s_setpc_b64 s[30:31]
+  %result = urem <2 x i32> %num, %den
+  ret <2 x i32> %result
+}
+
+define i32 @v_urem_i32_pow2k_denom(i32 %num) {
+; CHECK-LABEL: v_urem_i32_pow2k_denom:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_movk_i32 s6, 0x1000
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0x1000
+; CHECK-NEXT:    v_cvt_f32_u32_e32 v2, s6
+; CHECK-NEXT:    v_rcp_iflag_f32_e32 v2, v2
+; CHECK-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v2
+; CHECK-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; CHECK-NEXT:    v_mul_lo_u32 v3, v2, s6
+; CHECK-NEXT:    v_mul_hi_u32 v4, v2, s6
+; CHECK-NEXT:    v_sub_i32_e32 v5, vcc, 0, v3
+; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; CHECK-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; CHECK-NEXT:    v_mul_hi_u32 v3, v3, v2
+; CHECK-NEXT:    v_add_i32_e64 v4, s[4:5], v2, v3
+; CHECK-NEXT:    v_sub_i32_e64 v2, s[4:5], v2, v3
+; CHECK-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; CHECK-NEXT:    v_mul_hi_u32 v2, v2, v0
+; CHECK-NEXT:    v_mul_lo_u32 v2, v2, s6
+; CHECK-NEXT:    v_sub_i32_e32 v3, vcc, v0, v2
+; CHECK-NEXT:    v_cmp_le_u32_e32 vcc, s6, v3
+; CHECK-NEXT:    v_add_i32_e64 v4, s[4:5], v3, v1
+; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v2
+; CHECK-NEXT:    v_sub_i32_e64 v0, s[6:7], v3, v1
+; CHECK-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; CHECK-NEXT:    v_cndmask_b32_e64 v0, v4, v0, s[4:5]
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %result = urem i32 %num, 4096
+  ret i32 %result
+}
+
+define <2 x i32> @v_urem_v2i32_pow2k_denom(<2 x i32> %num) {
+; CHECK-LABEL: v_urem_v2i32_pow2k_denom:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_movk_i32 s8, 0x1000
+; CHECK-NEXT:    v_mov_b32_e32 v2, 0x1000
+; CHECK-NEXT:    v_cvt_f32_u32_e32 v3, s8
+; CHECK-NEXT:    v_cvt_f32_u32_e32 v4, v2
+; CHECK-NEXT:    v_rcp_iflag_f32_e32 v3, v3
+; CHECK-NEXT:    v_rcp_iflag_f32_e32 v4, v4
+; CHECK-NEXT:    v_mul_f32_e32 v3, 0x4f800000, v3
+; CHECK-NEXT:    v_mul_f32_e32 v4, 0x4f800000, v4
+; CHECK-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; CHECK-NEXT:    v_cvt_u32_f32_e32 v4, v4
+; CHECK-NEXT:    v_mul_lo_u32 v5, v3, s8
+; CHECK-NEXT:    v_mul_hi_u32 v6, v3, s8
+; CHECK-NEXT:    v_mul_lo_u32 v7, v4, v2
+; CHECK-NEXT:    v_mul_hi_u32 v8, v4, v2
+; CHECK-NEXT:    v_sub_i32_e32 v9, vcc, 0, v5
+; CHECK-NEXT:    v_sub_i32_e32 v10, vcc, 0, v7
+; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v6
+; CHECK-NEXT:    v_cndmask_b32_e32 v5, v5, v9, vcc
+; CHECK-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v8
+; CHECK-NEXT:    v_cndmask_b32_e64 v6, v7, v10, s[4:5]
+; CHECK-NEXT:    v_mul_hi_u32 v5, v5, v3
+; CHECK-NEXT:    v_mul_hi_u32 v6, v6, v4
+; CHECK-NEXT:    v_add_i32_e64 v7, s[6:7], v3, v5
+; CHECK-NEXT:    v_sub_i32_e64 v3, s[6:7], v3, v5
+; CHECK-NEXT:    v_add_i32_e64 v5, s[6:7], v4, v6
+; CHECK-NEXT:    v_sub_i32_e64 v4, s[6:7], v4, v6
+; CHECK-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
+; CHECK-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[4:5]
+; CHECK-NEXT:    v_mul_hi_u32 v3, v3, v0
+; CHECK-NEXT:    v_mul_hi_u32 v4, v4, v1
+; CHECK-NEXT:    v_mul_lo_u32 v3, v3, s8
+; CHECK-NEXT:    v_mul_lo_u32 v4, v4, v2
+; CHECK-NEXT:    v_sub_i32_e32 v5, vcc, v0, v3
+; CHECK-NEXT:    v_sub_i32_e32 v6, vcc, v1, v4
+; CHECK-NEXT:    v_cmp_le_u32_e32 vcc, s8, v5
+; CHECK-NEXT:    v_add_i32_e64 v7, s[4:5], v5, v2
+; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v3
+; CHECK-NEXT:    v_sub_i32_e64 v0, s[6:7], v5, v2
+; CHECK-NEXT:    v_cmp_ge_u32_e64 s[6:7], v6, v2
+; CHECK-NEXT:    v_add_i32_e64 v3, s[8:9], v6, v2
+; CHECK-NEXT:    v_cmp_ge_u32_e64 s[8:9], v1, v4
+; CHECK-NEXT:    v_sub_i32_e64 v1, s[10:11], v6, v2
+; CHECK-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
+; CHECK-NEXT:    s_and_b64 vcc, s[6:7], s[8:9]
+; CHECK-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
+; CHECK-NEXT:    v_cndmask_b32_e64 v0, v7, v0, s[4:5]
+; CHECK-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[8:9]
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %result = urem <2 x i32> %num, <i32 4096, i32 4096>
+  ret <2 x i32> %result
+}
+
+define i32 @v_urem_i32_oddk_denom(i32 %num) {
+; CHECK-LABEL: v_urem_i32_oddk_denom:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_mov_b32 s6, 0x12d8fb
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0x12d8fb
+; CHECK-NEXT:    v_cvt_f32_u32_e32 v2, s6
+; CHECK-NEXT:    v_rcp_iflag_f32_e32 v2, v2
+; CHECK-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v2
+; CHECK-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; CHECK-NEXT:    v_mul_lo_u32 v3, v2, s6
+; CHECK-NEXT:    v_mul_hi_u32 v4, v2, s6
+; CHECK-NEXT:    v_sub_i32_e32 v5, vcc, 0, v3
+; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; CHECK-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; CHECK-NEXT:    v_mul_hi_u32 v3, v3, v2
+; CHECK-NEXT:    v_add_i32_e64 v4, s[4:5], v2, v3
+; CHECK-NEXT:    v_sub_i32_e64 v2, s[4:5], v2, v3
+; CHECK-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; CHECK-NEXT:    v_mul_hi_u32 v2, v2, v0
+; CHECK-NEXT:    v_mul_lo_u32 v2, v2, s6
+; CHECK-NEXT:    v_sub_i32_e32 v3, vcc, v0, v2
+; CHECK-NEXT:    v_cmp_le_u32_e32 vcc, s6, v3
+; CHECK-NEXT:    v_add_i32_e64 v4, s[4:5], v3, v1
+; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v2
+; CHECK-NEXT:    v_sub_i32_e64 v0, s[6:7], v3, v1
+; CHECK-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; CHECK-NEXT:    v_cndmask_b32_e64 v0, v4, v0, s[4:5]
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %result = urem i32 %num, 1235195
+  ret i32 %result
+}
+
+define <2 x i32> @v_urem_v2i32_oddk_denom(<2 x i32> %num) {
+; CHECK-LABEL: v_urem_v2i32_oddk_denom:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_mov_b32 s8, 0x12d8fb
+; CHECK-NEXT:    v_mov_b32_e32 v2, 0x12d8fb
+; CHECK-NEXT:    v_cvt_f32_u32_e32 v3, s8
+; CHECK-NEXT:    v_cvt_f32_u32_e32 v4, v2
+; CHECK-NEXT:    v_rcp_iflag_f32_e32 v3, v3
+; CHECK-NEXT:    v_rcp_iflag_f32_e32 v4, v4
+; CHECK-NEXT:    v_mul_f32_e32 v3, 0x4f800000, v3
+; CHECK-NEXT:    v_mul_f32_e32 v4, 0x4f800000, v4
+; CHECK-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; CHECK-NEXT:    v_cvt_u32_f32_e32 v4, v4
+; CHECK-NEXT:    v_mul_lo_u32 v5, v3, s8
+; CHECK-NEXT:    v_mul_hi_u32 v6, v3, s8
+; CHECK-NEXT:    v_mul_lo_u32 v7, v4, v2
+; CHECK-NEXT:    v_mul_hi_u32 v8, v4, v2
+; CHECK-NEXT:    v_sub_i32_e32 v9, vcc, 0, v5
+; CHECK-NEXT:    v_sub_i32_e32 v10, vcc, 0, v7
+; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v6
+; CHECK-NEXT:    v_cndmask_b32_e32 v5, v5, v9, vcc
+; CHECK-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v8
+; CHECK-NEXT:    v_cndmask_b32_e64 v6, v7, v10, s[4:5]
+; CHECK-NEXT:    v_mul_hi_u32 v5, v5, v3
+; CHECK-NEXT:    v_mul_hi_u32 v6, v6, v4
+; CHECK-NEXT:    v_add_i32_e64 v7, s[6:7], v3, v5
+; CHECK-NEXT:    v_sub_i32_e64 v3, s[6:7], v3, v5
+; CHECK-NEXT:    v_add_i32_e64 v5, s[6:7], v4, v6
+; CHECK-NEXT:    v_sub_i32_e64 v4, s[6:7], v4, v6
+; CHECK-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
+; CHECK-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[4:5]
+; CHECK-NEXT:    v_mul_hi_u32 v3, v3, v0
+; CHECK-NEXT:    v_mul_hi_u32 v4, v4, v1
+; CHECK-NEXT:    v_mul_lo_u32 v3, v3, s8
+; CHECK-NEXT:    v_mul_lo_u32 v4, v4, v2
+; CHECK-NEXT:    v_sub_i32_e32 v5, vcc, v0, v3
+; CHECK-NEXT:    v_sub_i32_e32 v6, vcc, v1, v4
+; CHECK-NEXT:    v_cmp_le_u32_e32 vcc, s8, v5
+; CHECK-NEXT:    v_add_i32_e64 v7, s[4:5], v5, v2
+; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v3
+; CHECK-NEXT:    v_sub_i32_e64 v0, s[6:7], v5, v2
+; CHECK-NEXT:    v_cmp_ge_u32_e64 s[6:7], v6, v2
+; CHECK-NEXT:    v_add_i32_e64 v3, s[8:9], v6, v2
+; CHECK-NEXT:    v_cmp_ge_u32_e64 s[8:9], v1, v4
+; CHECK-NEXT:    v_sub_i32_e64 v1, s[10:11], v6, v2
+; CHECK-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
+; CHECK-NEXT:    s_and_b64 vcc, s[6:7], s[8:9]
+; CHECK-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
+; CHECK-NEXT:    v_cndmask_b32_e64 v0, v7, v0, s[4:5]
+; CHECK-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[8:9]
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %result = urem <2 x i32> %num, <i32 1235195, i32 1235195>
+  ret <2 x i32> %result
+}
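
The two-element case above is the same expansion applied per lane, unrolled so the lanes' compare results are interleaved across vcc and s[4:5] through s[10:11]. In terms of the hypothetical urem32_expansion sketch above:

  #include <cstdint>

  uint32_t urem32_expansion(uint32_t num, uint32_t den);  // sketch above

  void urem_v2i32(uint32_t r[2], const uint32_t n[2], uint32_t den) {
    for (int i = 0; i < 2; ++i)
      r[i] = urem32_expansion(n[i], den);
  }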
+
+define i32 @v_urem_i32_pow2_shl_denom(i32 %x, i32 %y) {
+; CHECK-LABEL: v_urem_i32_pow2_shl_denom:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_lshl_b32_e32 v1, 0x1000, v1
+; CHECK-NEXT:    v_cvt_f32_u32_e32 v2, v1
+; CHECK-NEXT:    v_rcp_iflag_f32_e32 v2, v2
+; CHECK-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v2
+; CHECK-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; CHECK-NEXT:    v_mul_lo_u32 v3, v2, v1
+; CHECK-NEXT:    v_mul_hi_u32 v4, v2, v1
+; CHECK-NEXT:    v_sub_i32_e32 v5, vcc, 0, v3
+; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; CHECK-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; CHECK-NEXT:    v_mul_hi_u32 v3, v3, v2
+; CHECK-NEXT:    v_add_i32_e64 v4, s[4:5], v2, v3
+; CHECK-NEXT:    v_sub_i32_e64 v2, s[4:5], v2, v3
+; CHECK-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; CHECK-NEXT:    v_mul_hi_u32 v2, v2, v0
+; CHECK-NEXT:    v_mul_lo_u32 v2, v2, v1
+; CHECK-NEXT:    v_sub_i32_e32 v3, vcc, v0, v2
+; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v1
+; CHECK-NEXT:    v_add_i32_e64 v4, s[4:5], v3, v1
+; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v2
+; CHECK-NEXT:    v_sub_i32_e64 v0, s[6:7], v3, v1
+; CHECK-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; CHECK-NEXT:    v_cndmask_b32_e64 v0, v4, v0, s[4:5]
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %shl.y = shl i32 4096, %y
+  %r = urem i32 %x, %shl.y
+  ret i32 %r
+}
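
Note that 4096 << %y is a power of two for any in-range shift amount, so the remainder is also expressible as a mask; the checks above show that this lowering does not exploit that and emits the full reciprocal expansion. A hypothetical strength-reduced form, for comparison only (my own function; it also inherits the usual caveat once y shifts the constant out of range):

  #include <cstdint>

  uint32_t urem_pow2_shl(uint32_t x, uint32_t y) {
    uint32_t den = 4096u << y;   // a power of two while it stays in range
    return x & (den - 1);        // x % den for a nonzero power-of-two den
  }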
+
+define <2 x i32> @v_urem_v2i32_pow2_shl_denom(<2 x i32> %x, <2 x i32> %y) {
+; GISEL-LABEL: v_urem_v2i32_pow2_shl_denom:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_movk_i32 s4, 0x1000
+; GISEL-NEXT:    v_lshl_b32_e32 v2, s4, v2
+; GISEL-NEXT:    v_lshl_b32_e32 v3, s4, v3
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, v2
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v5, v3
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v4, v4
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v5, v5
+; GISEL-NEXT:    v_mul_f32_e32 v4, 0x4f800000, v4
+; GISEL-NEXT:    v_mul_f32_e32 v5, 0x4f800000, v5
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v4, v4
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v5, v5
+; GISEL-NEXT:    v_mul_lo_u32 v6, v4, v2
+; GISEL-NEXT:    v_mul_hi_u32 v7, v4, v2
+; GISEL-NEXT:    v_mul_lo_u32 v8, v5, v3
+; GISEL-NEXT:    v_mul_hi_u32 v9, v5, v3
+; GISEL-NEXT:    v_sub_i32_e32 v10, vcc, 0, v6
+; GISEL-NEXT:    v_sub_i32_e32 v11, vcc, 0, v8
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
+; GISEL-NEXT:    v_cndmask_b32_e32 v6, v6, v10, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v9
+; GISEL-NEXT:    v_cndmask_b32_e64 v7, v8, v11, s[4:5]
+; GISEL-NEXT:    v_mul_hi_u32 v6, v6, v4
+; GISEL-NEXT:    v_mul_hi_u32 v7, v7, v5
+; GISEL-NEXT:    v_add_i32_e64 v8, s[6:7], v4, v6
+; GISEL-NEXT:    v_sub_i32_e64 v4, s[6:7], v4, v6
+; GISEL-NEXT:    v_add_i32_e64 v6, s[6:7], v5, v7
+; GISEL-NEXT:    v_sub_i32_e64 v5, s[6:7], v5, v7
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, v4, v8, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v5, v5, v6, s[4:5]
+; GISEL-NEXT:    v_mul_hi_u32 v4, v4, v0
+; GISEL-NEXT:    v_mul_hi_u32 v5, v5, v1
+; GISEL-NEXT:    v_mul_lo_u32 v4, v4, v2
+; GISEL-NEXT:    v_mul_lo_u32 v5, v5, v3
+; GISEL-NEXT:    v_sub_i32_e32 v6, vcc, v0, v4
+; GISEL-NEXT:    v_sub_i32_e32 v7, vcc, v1, v5
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v6, v2
+; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v6, v2
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v4
+; GISEL-NEXT:    v_sub_i32_e64 v0, s[6:7], v6, v2
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[6:7], v7, v3
+; GISEL-NEXT:    v_add_i32_e64 v2, s[8:9], v7, v3
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[8:9], v1, v5
+; GISEL-NEXT:    v_sub_i32_e64 v1, s[10:11], v7, v3
+; GISEL-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
+; GISEL-NEXT:    s_and_b64 vcc, s[6:7], s[8:9]
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, v8, v0, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v1, v2, v1, s[8:9]
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; CGP-LABEL: v_urem_v2i32_pow2_shl_denom:
+; CGP:       ; %bb.0:
+; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CGP-NEXT:    s_movk_i32 s4, 0x1000
+; CGP-NEXT:    v_mul_lo_u32 v4, 0, v0
+; CGP-NEXT:    v_mul_lo_u32 v5, 0, v1
+; CGP-NEXT:    v_lshl_b32_e32 v2, s4, v2
+; CGP-NEXT:    v_lshl_b32_e32 v3, s4, v3
+; CGP-NEXT:    v_cvt_f32_u32_e32 v6, v2
+; CGP-NEXT:    v_mul_lo_u32 v7, 0, v2
+; CGP-NEXT:    v_cvt_f32_u32_e32 v8, v3
+; CGP-NEXT:    v_mul_lo_u32 v9, 0, v3
+; CGP-NEXT:    v_rcp_f32_e32 v6, v6
+; CGP-NEXT:    v_rcp_f32_e32 v8, v8
+; CGP-NEXT:    v_mul_f32_e32 v6, 0x4f800000, v6
+; CGP-NEXT:    v_mul_f32_e32 v8, 0x4f800000, v8
+; CGP-NEXT:    v_cvt_u32_f32_e32 v6, v6
+; CGP-NEXT:    v_cvt_u32_f32_e32 v8, v8
+; CGP-NEXT:    v_mul_lo_u32 v10, v6, v2
+; CGP-NEXT:    v_mul_lo_u32 v11, v6, 0
+; CGP-NEXT:    v_mul_hi_u32 v12, v6, v2
+; CGP-NEXT:    v_mul_lo_u32 v13, v8, v3
+; CGP-NEXT:    v_mul_lo_u32 v14, v8, 0
+; CGP-NEXT:    v_mul_hi_u32 v15, v8, v3
+; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v11
+; CGP-NEXT:    v_sub_i32_e32 v16, vcc, 0, v10
+; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v14
+; CGP-NEXT:    v_sub_i32_e32 v17, vcc, 0, v13
+; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v12
+; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v15
+; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
+; CGP-NEXT:    v_cndmask_b32_e32 v7, v10, v16, vcc
+; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v9
+; CGP-NEXT:    v_cndmask_b32_e64 v9, v13, v17, s[4:5]
+; CGP-NEXT:    v_mul_lo_u32 v10, v7, 0
+; CGP-NEXT:    v_mul_hi_u32 v7, v7, v6
+; CGP-NEXT:    v_mul_lo_u32 v12, v9, 0
+; CGP-NEXT:    v_mul_hi_u32 v9, v9, v8
+; CGP-NEXT:    v_add_i32_e64 v10, s[6:7], v11, v10
+; CGP-NEXT:    v_add_i32_e64 v11, s[6:7], v14, v12
+; CGP-NEXT:    v_add_i32_e64 v7, s[6:7], v10, v7
+; CGP-NEXT:    v_add_i32_e64 v9, s[6:7], v11, v9
+; CGP-NEXT:    v_add_i32_e64 v10, s[6:7], v6, v7
+; CGP-NEXT:    v_sub_i32_e64 v6, s[6:7], v6, v7
+; CGP-NEXT:    v_add_i32_e64 v7, s[6:7], v8, v9
+; CGP-NEXT:    v_sub_i32_e64 v8, s[6:7], v8, v9
+; CGP-NEXT:    v_cndmask_b32_e32 v6, v6, v10, vcc
+; CGP-NEXT:    v_cndmask_b32_e64 v7, v8, v7, s[4:5]
+; CGP-NEXT:    v_mul_lo_u32 v8, v6, 0
+; CGP-NEXT:    v_mul_hi_u32 v6, v6, v0
+; CGP-NEXT:    v_mul_lo_u32 v9, v7, 0
+; CGP-NEXT:    v_mul_hi_u32 v7, v7, v1
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
+; CGP-NEXT:    v_mul_lo_u32 v4, v4, v2
+; CGP-NEXT:    v_mul_lo_u32 v5, v5, v3
+; CGP-NEXT:    v_sub_i32_e32 v6, vcc, v0, v4
+; CGP-NEXT:    v_sub_i32_e32 v7, vcc, v1, v5
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v6, v2
+; CGP-NEXT:    v_add_i32_e64 v8, s[4:5], v6, v2
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v4
+; CGP-NEXT:    v_sub_i32_e64 v0, s[6:7], v6, v2
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[6:7], v7, v3
+; CGP-NEXT:    v_add_i32_e64 v2, s[8:9], v7, v3
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[8:9], v1, v5
+; CGP-NEXT:    v_sub_i32_e64 v1, s[10:11], v7, v3
+; CGP-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; CGP-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
+; CGP-NEXT:    s_and_b64 vcc, s[6:7], s[8:9]
+; CGP-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc
+; CGP-NEXT:    v_cndmask_b32_e64 v0, v8, v0, s[4:5]
+; CGP-NEXT:    v_cndmask_b32_e64 v1, v2, v1, s[8:9]
+; CGP-NEXT:    s_setpc_b64 s[30:31]
+  %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
+  %r = urem <2 x i32> %x, %shl.y
+  ret <2 x i32> %r
+}
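
Comparing the two runs above: the CGP output carries v_mul_lo_u32 instructions with a constant-zero operand that the GISEL output does not. They look like the cross terms of a widening multiply whose high halves are known zero. A hedged reconstruction of that pattern (my own function name, not the in-tree code):

  #include <cstdint>

  // High word of the low 64 bits of a product built from 32-bit halves:
  //   hi32(a * b) = mulhi(a_lo, b_lo) + a_lo*b_hi + a_hi*b_lo (mod 2^32)
  // With a_hi and b_hi zero after zero-extension, the two cross terms
  // are the multiplies by 0 visible in the CGP run; they contribute
  // nothing, but here they have not been combined away.
  uint32_t hi32_of_product(uint32_t a_lo, uint32_t a_hi,
                           uint32_t b_lo, uint32_t b_hi) {
    uint32_t hi = static_cast<uint32_t>(
        (static_cast<uint64_t>(a_lo) * b_lo) >> 32);
    return hi + a_lo * b_hi + a_hi * b_lo;
  }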
+
+define i32 @v_urem_i32_24bit(i32 %num, i32 %den) {
+; GISEL-LABEL: v_urem_i32_24bit:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, 0xffffff
+; GISEL-NEXT:    v_and_b32_e32 v0, s4, v0
+; GISEL-NEXT:    v_and_b32_e32 v1, s4, v1
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v2, v1
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v2, v2
+; GISEL-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v2
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GISEL-NEXT:    v_mul_lo_u32 v3, v2, v1
+; GISEL-NEXT:    v_mul_hi_u32 v4, v2, v1
+; GISEL-NEXT:    v_sub_i32_e32 v5, vcc, 0, v3
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; GISEL-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; GISEL-NEXT:    v_mul_hi_u32 v3, v3, v2
+; GISEL-NEXT:    v_add_i32_e64 v4, s[4:5], v2, v3
+; GISEL-NEXT:    v_sub_i32_e64 v2, s[4:5], v2, v3
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GISEL-NEXT:    v_mul_hi_u32 v2, v2, v0
+; GISEL-NEXT:    v_mul_lo_u32 v2, v2, v1
+; GISEL-NEXT:    v_sub_i32_e32 v3, vcc, v0, v2
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v1
+; GISEL-NEXT:    v_add_i32_e64 v4, s[4:5], v3, v1
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v2
+; GISEL-NEXT:    v_sub_i32_e64 v0, s[6:7], v3, v1
+; GISEL-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, v4, v0, s[4:5]
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; CGP-LABEL: v_urem_i32_24bit:
+; CGP:       ; %bb.0:
+; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CGP-NEXT:    s_mov_b32 s4, 0xffffff
+; CGP-NEXT:    v_and_b32_e32 v0, s4, v0
+; CGP-NEXT:    v_and_b32_e32 v1, s4, v1
+; CGP-NEXT:    v_cvt_f32_u32_e32 v2, v1
+; CGP-NEXT:    v_mul_lo_u32 v3, 0, v1
+; CGP-NEXT:    v_mul_lo_u32 v4, 0, v0
+; CGP-NEXT:    v_rcp_f32_e32 v2, v2
+; CGP-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v2
+; CGP-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; CGP-NEXT:    v_mul_lo_u32 v5, v2, v1
+; CGP-NEXT:    v_mul_lo_u32 v6, v2, 0
+; CGP-NEXT:    v_mul_hi_u32 v7, v2, v1
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
+; CGP-NEXT:    v_sub_i32_e32 v8, vcc, 0, v5
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
+; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
+; CGP-NEXT:    v_cndmask_b32_e32 v3, v5, v8, vcc
+; CGP-NEXT:    v_mul_lo_u32 v5, v3, 0
+; CGP-NEXT:    v_mul_hi_u32 v3, v3, v2
+; CGP-NEXT:    v_add_i32_e64 v5, s[4:5], v6, v5
+; CGP-NEXT:    v_add_i32_e64 v3, s[4:5], v5, v3
+; CGP-NEXT:    v_add_i32_e64 v5, s[4:5], v2, v3
+; CGP-NEXT:    v_sub_i32_e64 v2, s[4:5], v2, v3
+; CGP-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
+; CGP-NEXT:    v_mul_lo_u32 v3, v2, 0
+; CGP-NEXT:    v_mul_hi_u32 v2, v2, v0
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; CGP-NEXT:    v_mul_lo_u32 v2, v2, v1
+; CGP-NEXT:    v_sub_i32_e32 v3, vcc, v0, v2
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v1
+; CGP-NEXT:    v_add_i32_e64 v4, s[4:5], v3, v1
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v2
+; CGP-NEXT:    v_sub_i32_e64 v0, s[6:7], v3, v1
+; CGP-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; CGP-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; CGP-NEXT:    v_cndmask_b32_e64 v0, v4, v0, s[4:5]
+; CGP-NEXT:    s_setpc_b64 s[30:31]
+  %num.mask = and i32 %num, 16777215
+  %den.mask = and i32 %den, 16777215
+  %result = urem i32 %num.mask, %den.mask
+  ret i32 %result
+}
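
Both runs mask the operands to 24 bits and still go through the full 32-bit expansion. Since 24-bit integers are exact in the f32 mantissa, an f32 divide plus a one-step fixup would suffice for inputs this small; a hypothetical alternative sketch, not what either pass emits:

  #include <cstdint>

  uint32_t urem24(uint32_t num, uint32_t den) {
    num &= 0xffffff;
    den &= 0xffffff;   // den == 0 is undefined here, as for urem itself
    // Correctly rounded f32 division of exact 24-bit inputs keeps the
    // truncated quotient within 1 of the true quotient.
    uint32_t q = static_cast<uint32_t>(static_cast<float>(num) /
                                       static_cast<float>(den));
    int64_t r = static_cast<int64_t>(num) -
                static_cast<int64_t>(q) * static_cast<int64_t>(den);
    if (r < 0)
      r += den;        // estimate was one too high
    else if (r >= den)
      r -= den;        // estimate was one too low
    return static_cast<uint32_t>(r);
  }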
+
+define <2 x i32> @v_urem_v2i32_24bit(<2 x i32> %num, <2 x i32> %den) {
+; GISEL-LABEL: v_urem_v2i32_24bit:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, 0xffffff
+; GISEL-NEXT:    v_and_b32_e32 v0, s4, v0
+; GISEL-NEXT:    v_and_b32_e32 v1, s4, v1
+; GISEL-NEXT:    v_and_b32_e32 v2, s4, v2
+; GISEL-NEXT:    v_and_b32_e32 v3, s4, v3
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, v2
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v5, v3
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v4, v4
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v5, v5
+; GISEL-NEXT:    v_mul_f32_e32 v4, 0x4f800000, v4
+; GISEL-NEXT:    v_mul_f32_e32 v5, 0x4f800000, v5
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v4, v4
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v5, v5
+; GISEL-NEXT:    v_mul_lo_u32 v6, v4, v2
+; GISEL-NEXT:    v_mul_hi_u32 v7, v4, v2
+; GISEL-NEXT:    v_mul_lo_u32 v8, v5, v3
+; GISEL-NEXT:    v_mul_hi_u32 v9, v5, v3
+; GISEL-NEXT:    v_sub_i32_e32 v10, vcc, 0, v6
+; GISEL-NEXT:    v_sub_i32_e32 v11, vcc, 0, v8
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
+; GISEL-NEXT:    v_cndmask_b32_e32 v6, v6, v10, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v9
+; GISEL-NEXT:    v_cndmask_b32_e64 v7, v8, v11, s[4:5]
+; GISEL-NEXT:    v_mul_hi_u32 v6, v6, v4
+; GISEL-NEXT:    v_mul_hi_u32 v7, v7, v5
+; GISEL-NEXT:    v_add_i32_e64 v8, s[6:7], v4, v6
+; GISEL-NEXT:    v_sub_i32_e64 v4, s[6:7], v4, v6
+; GISEL-NEXT:    v_add_i32_e64 v6, s[6:7], v5, v7
+; GISEL-NEXT:    v_sub_i32_e64 v5, s[6:7], v5, v7
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, v4, v8, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v5, v5, v6, s[4:5]
+; GISEL-NEXT:    v_mul_hi_u32 v4, v4, v0
+; GISEL-NEXT:    v_mul_hi_u32 v5, v5, v1
+; GISEL-NEXT:    v_mul_lo_u32 v4, v4, v2
+; GISEL-NEXT:    v_mul_lo_u32 v5, v5, v3
+; GISEL-NEXT:    v_sub_i32_e32 v6, vcc, v0, v4
+; GISEL-NEXT:    v_sub_i32_e32 v7, vcc, v1, v5
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v6, v2
+; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v6, v2
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v4
+; GISEL-NEXT:    v_sub_i32_e64 v0, s[6:7], v6, v2
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[6:7], v7, v3
+; GISEL-NEXT:    v_add_i32_e64 v2, s[8:9], v7, v3
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[8:9], v1, v5
+; GISEL-NEXT:    v_sub_i32_e64 v1, s[10:11], v7, v3
+; GISEL-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
+; GISEL-NEXT:    s_and_b64 vcc, s[6:7], s[8:9]
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, v8, v0, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v1, v2, v1, s[8:9]
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; CGP-LABEL: v_urem_v2i32_24bit:
+; CGP:       ; %bb.0:
+; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CGP-NEXT:    s_mov_b32 s4, 0xffffff
+; CGP-NEXT:    v_and_b32_e32 v0, s4, v0
+; CGP-NEXT:    v_and_b32_e32 v1, s4, v1
+; CGP-NEXT:    v_and_b32_e32 v2, s4, v2
+; CGP-NEXT:    v_and_b32_e32 v3, s4, v3
+; CGP-NEXT:    v_cvt_f32_u32_e32 v4, v2
+; CGP-NEXT:    v_mul_lo_u32 v5, 0, v2
+; CGP-NEXT:    v_mul_lo_u32 v6, 0, v0
+; CGP-NEXT:    v_cvt_f32_u32_e32 v7, v3
+; CGP-NEXT:    v_mul_lo_u32 v8, 0, v3
+; CGP-NEXT:    v_mul_lo_u32 v9, 0, v1
+; CGP-NEXT:    v_rcp_f32_e32 v4, v4
+; CGP-NEXT:    v_rcp_f32_e32 v7, v7
+; CGP-NEXT:    v_mul_f32_e32 v4, 0x4f800000, v4
+; CGP-NEXT:    v_mul_f32_e32 v7, 0x4f800000, v7
+; CGP-NEXT:    v_cvt_u32_f32_e32 v4, v4
+; CGP-NEXT:    v_cvt_u32_f32_e32 v7, v7
+; CGP-NEXT:    v_mul_lo_u32 v10, v4, v2
+; CGP-NEXT:    v_mul_lo_u32 v11, v4, 0
+; CGP-NEXT:    v_mul_hi_u32 v12, v4, v2
+; CGP-NEXT:    v_mul_lo_u32 v13, v7, v3
+; CGP-NEXT:    v_mul_lo_u32 v14, v7, 0
+; CGP-NEXT:    v_mul_hi_u32 v15, v7, v3
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v11
+; CGP-NEXT:    v_sub_i32_e32 v16, vcc, 0, v10
+; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v14
+; CGP-NEXT:    v_sub_i32_e32 v17, vcc, 0, v13
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v12
+; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v15
+; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
+; CGP-NEXT:    v_cndmask_b32_e32 v5, v10, v16, vcc
+; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v8
+; CGP-NEXT:    v_cndmask_b32_e64 v8, v13, v17, s[4:5]
+; CGP-NEXT:    v_mul_lo_u32 v10, v5, 0
+; CGP-NEXT:    v_mul_hi_u32 v5, v5, v4
+; CGP-NEXT:    v_mul_lo_u32 v12, v8, 0
+; CGP-NEXT:    v_mul_hi_u32 v8, v8, v7
+; CGP-NEXT:    v_add_i32_e64 v10, s[6:7], v11, v10
+; CGP-NEXT:    v_add_i32_e64 v11, s[6:7], v14, v12
+; CGP-NEXT:    v_add_i32_e64 v5, s[6:7], v10, v5
+; CGP-NEXT:    v_add_i32_e64 v8, s[6:7], v11, v8
+; CGP-NEXT:    v_add_i32_e64 v10, s[6:7], v4, v5
+; CGP-NEXT:    v_sub_i32_e64 v4, s[6:7], v4, v5
+; CGP-NEXT:    v_add_i32_e64 v5, s[6:7], v7, v8
+; CGP-NEXT:    v_sub_i32_e64 v7, s[6:7], v7, v8
+; CGP-NEXT:    v_cndmask_b32_e32 v4, v4, v10, vcc
+; CGP-NEXT:    v_cndmask_b32_e64 v5, v7, v5, s[4:5]
+; CGP-NEXT:    v_mul_lo_u32 v7, v4, 0
+; CGP-NEXT:    v_mul_hi_u32 v4, v4, v0
+; CGP-NEXT:    v_mul_lo_u32 v8, v5, 0
+; CGP-NEXT:    v_mul_hi_u32 v5, v5, v1
+; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
+; CGP-NEXT:    v_add_i32_e32 v7, vcc, v9, v8
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
+; CGP-NEXT:    v_mul_lo_u32 v4, v4, v2
+; CGP-NEXT:    v_mul_lo_u32 v5, v5, v3
+; CGP-NEXT:    v_sub_i32_e32 v6, vcc, v0, v4
+; CGP-NEXT:    v_sub_i32_e32 v7, vcc, v1, v5
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v6, v2
+; CGP-NEXT:    v_add_i32_e64 v8, s[4:5], v6, v2
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v4
+; CGP-NEXT:    v_sub_i32_e64 v0, s[6:7], v6, v2
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[6:7], v7, v3
+; CGP-NEXT:    v_add_i32_e64 v2, s[8:9], v7, v3
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[8:9], v1, v5
+; CGP-NEXT:    v_sub_i32_e64 v1, s[10:11], v7, v3
+; CGP-NEXT:    s_and_b64 vcc, vcc, s[4:5]
+; CGP-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
+; CGP-NEXT:    s_and_b64 vcc, s[6:7], s[8:9]
+; CGP-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc
+; CGP-NEXT:    v_cndmask_b32_e64 v0, v8, v0, s[4:5]
+; CGP-NEXT:    v_cndmask_b32_e64 v1, v2, v1, s[8:9]
+; CGP-NEXT:    s_setpc_b64 s[30:31]
+  %num.mask = and <2 x i32> %num, <i32 16777215, i32 16777215>
+  %den.mask = and <2 x i32> %den, <i32 16777215, i32 16777215>
+  %result = urem <2 x i32> %num.mask, %den.mask
+  ret <2 x i32> %result
+}
