[llvm] 1f9b6ef - GlobalISel: Add combine for G_UREM by power of 2

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Thu Jan 7 13:36:42 PST 2021


Author: Matt Arsenault
Date: 2021-01-07T16:36:35-05:00
New Revision: 1f9b6ef91ffd8ea487aa083d146c7568e7243457

URL: https://github.com/llvm/llvm-project/commit/1f9b6ef91ffd8ea487aa083d146c7568e7243457
DIFF: https://github.com/llvm/llvm-project/commit/1f9b6ef91ffd8ea487aa083d146c7568e7243457.diff

LOG: GlobalISel: Add combine for G_UREM by power of 2

Really I want this in the legalizer, but this is a start.
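
The rewrite exploits the identity that for any power-of-two divisor d,
x urem d == x & (d - 1). In generic MIR terms the patch performs the
following rewrite (register names here are illustrative):

  %rem:_(s32) = G_UREM %x, %pow2
    ==>
  %neg_one:_(s32) = G_CONSTANT i32 -1
  %mask:_(s32) = G_ADD %pow2, %neg_one
  %rem:_(s32) = G_AND %x, %mask

The mask is materialized with an add of -1 rather than folded into a
constant because the divisor need not be a literal; the known-bits
analysis can also prove a dynamic value such as (1 << n) is a power of
two, as the shl1 tests below exercise.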

Added: 
    llvm/test/CodeGen/AMDGPU/GlobalISel/combine-urem-pow-2.mir

Modified: 
    llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
    llvm/include/llvm/Target/GlobalISel/Combine.td
    llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
    llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll

Removed: 
    


################################################################################
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index 432587ea46c4..0d240e90820f 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -400,6 +400,9 @@ class CombinerHelper {
   /// Check if operand \p OpIdx is undef.
   bool matchOperandIsUndef(MachineInstr &MI, unsigned OpIdx);
 
+  /// Check if operand \p OpIdx is known to be a power of 2.
+  bool matchOperandIsKnownToBeAPowerOfTwo(MachineInstr &MI, unsigned OpIdx);
+
   /// Erase \p MI
   bool eraseInst(MachineInstr &MI);
 
@@ -459,6 +462,9 @@ class CombinerHelper {
   bool matchPtrAddZero(MachineInstr &MI);
   bool applyPtrAddZero(MachineInstr &MI);
 
+  /// Combine G_UREM x, (known power of 2) to an add and bitmasking.
+  bool applySimplifyURemByPow2(MachineInstr &MI);
+
   bool matchCombineInsertVecElts(MachineInstr &MI,
                                  SmallVectorImpl<Register> &MatchInfo);
 

diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 32aec75af1fa..e352e499d47c 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -296,6 +296,13 @@ def binop_left_to_zero: GICombineRule<
   (apply [{ return Helper.replaceSingleDefInstWithOperand(*${root}, 1); }])
 >;
 
+def urem_pow2_to_mask : GICombineRule<
+  (defs root:$root),
+  (match (wip_match_opcode G_UREM):$root,
+    [{ return Helper.matchOperandIsKnownToBeAPowerOfTwo(*${root}, 2); }]),
+  (apply [{ return Helper.applySimplifyURemByPow2(*${root}); }])
+>;
+
 // Fold (x op 0) -> 0
 def binop_right_to_zero: GICombineRule<
   (defs root:$root),
@@ -560,7 +567,7 @@ def identity_combines : GICombineGroup<[select_same_val, right_identity_zero,
 def const_combines : GICombineGroup<[constant_fp_op, const_ptradd_to_i2p]>;
 
 def known_bits_simplifications : GICombineGroup<[
-  redundant_and, redundant_sext_inreg, redundant_or]>;
+  redundant_and, redundant_sext_inreg, redundant_or, urem_pow2_to_mask]>;
 
 def width_reduction_combines : GICombineGroup<[reduce_shl_of_extend]>;
 

diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index abc23da3d418..bbcf32a73fe0 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -2580,6 +2580,12 @@ bool CombinerHelper::matchOperandIsUndef(MachineInstr &MI, unsigned OpIdx) {
          getOpcodeDef(TargetOpcode::G_IMPLICIT_DEF, MO.getReg(), MRI);
 }
 
+bool CombinerHelper::matchOperandIsKnownToBeAPowerOfTwo(MachineInstr &MI,
+                                                        unsigned OpIdx) {
+  MachineOperand &MO = MI.getOperand(OpIdx);
+  return isKnownToBeAPowerOfTwo(MO.getReg(), MRI, KB);
+}
+
 bool CombinerHelper::replaceInstWithFConstant(MachineInstr &MI, double C) {
   assert(MI.getNumDefs() == 1 && "Expected only one def?");
   Builder.setInstr(MI);
@@ -3130,6 +3136,22 @@ bool CombinerHelper::applyPtrAddZero(MachineInstr &MI) {
   return true;
 }
 
+/// The second source operand is known to be a power of 2.
+bool CombinerHelper::applySimplifyURemByPow2(MachineInstr &MI) {
+  Register DstReg = MI.getOperand(0).getReg();
+  Register Src0 = MI.getOperand(1).getReg();
+  Register Pow2Src1 = MI.getOperand(2).getReg();
+  LLT Ty = MRI.getType(DstReg);
+  Builder.setInstrAndDebugLoc(MI);
+
+  // Fold (urem x, pow2) -> (and x, pow2-1)
+  auto NegOne = Builder.buildConstant(Ty, -1);
+  auto Add = Builder.buildAdd(Ty, Pow2Src1, NegOne);
+  Builder.buildAnd(DstReg, Src0, Add);
+  MI.eraseFromParent();
+  return true;
+}
+
 bool CombinerHelper::tryCombine(MachineInstr &MI) {
   if (tryCombineCopy(MI))
     return true;

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-urem-pow-2.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-urem-pow-2.mir
new file mode 100644
index 000000000000..f92e32dab08f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-urem-pow-2.mir
@@ -0,0 +1,158 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=amdgpu-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
+
+---
+name: urem_s32_var_const0
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0
+
+    ; GCN-LABEL: name: urem_s32_var_const0
+    ; GCN: liveins: $vgpr0
+    ; GCN: %var:_(s32) = COPY $vgpr0
+    ; GCN: %const:_(s32) = G_CONSTANT i32 0
+    ; GCN: %rem:_(s32) = G_UREM %var, %const
+    ; GCN: $vgpr0 = COPY %rem(s32)
+    %var:_(s32) = COPY $vgpr0
+    %const:_(s32) = G_CONSTANT i32 0
+    %rem:_(s32) = G_UREM %var, %const
+    $vgpr0 = COPY %rem
+...
+
+---
+name: urem_s32_var_const1
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0
+
+    ; GCN-LABEL: name: urem_s32_var_const1
+    ; GCN: liveins: $vgpr0
+    ; GCN: %const:_(s32) = G_CONSTANT i32 1
+    ; GCN: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+    ; GCN: [[ADD:%[0-9]+]]:_(s32) = G_ADD %const, [[C]]
+    ; GCN: $vgpr0 = COPY [[ADD]](s32)
+    %var:_(s32) = COPY $vgpr0
+    %const:_(s32) = G_CONSTANT i32 1
+    %rem:_(s32) = G_UREM %var, %const
+    $vgpr0 = COPY %rem
+...
+
+---
+name: urem_s32_var_const2
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0
+
+    ; GCN-LABEL: name: urem_s32_var_const2
+    ; GCN: liveins: $vgpr0
+    ; GCN: %var:_(s32) = COPY $vgpr0
+    ; GCN: %const:_(s32) = G_CONSTANT i32 2
+    ; GCN: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+    ; GCN: [[ADD:%[0-9]+]]:_(s32) = G_ADD %const, [[C]]
+    ; GCN: %rem:_(s32) = G_AND %var, [[ADD]]
+    ; GCN: $vgpr0 = COPY %rem(s32)
+    %var:_(s32) = COPY $vgpr0
+    %const:_(s32) = G_CONSTANT i32 2
+    %rem:_(s32) = G_UREM %var, %const
+    $vgpr0 = COPY %rem
+...
+
+---
+name: urem_s32_var_shl1
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+
+    ; GCN-LABEL: name: urem_s32_var_shl1
+    ; GCN: liveins: $vgpr0, $vgpr1
+    ; GCN: %var:_(s32) = COPY $vgpr0
+    ; GCN: %shift_amt:_(s32) = COPY $vgpr1
+    ; GCN: %one:_(s32) = G_CONSTANT i32 1
+    ; GCN: %one_bit:_(s32) = G_SHL %one, %shift_amt(s32)
+    ; GCN: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+    ; GCN: [[ADD:%[0-9]+]]:_(s32) = G_ADD %one_bit, [[C]]
+    ; GCN: %rem:_(s32) = G_AND %var, [[ADD]]
+    ; GCN: $vgpr0 = COPY %rem(s32)
+    %var:_(s32) = COPY $vgpr0
+    %shift_amt:_(s32) = COPY $vgpr1
+    %one:_(s32) = G_CONSTANT i32 1
+    %one_bit:_(s32) = G_SHL %one, %shift_amt
+    %rem:_(s32) = G_UREM %var, %one_bit
+    $vgpr0 = COPY %rem
+...
+
+---
+name: urem_s64_var_shl1
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2
+
+    ; GCN-LABEL: name: urem_s64_var_shl1
+    ; GCN: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GCN: %var:_(s64) = COPY $vgpr0_vgpr1
+    ; GCN: %shiftamt:_(s32) = COPY $vgpr2
+    ; GCN: %one:_(s64) = G_CONSTANT i64 1
+    ; GCN: %one_bit:_(s64) = G_SHL %one, %shiftamt(s32)
+    ; GCN: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
+    ; GCN: [[ADD:%[0-9]+]]:_(s64) = G_ADD %one_bit, [[C]]
+    ; GCN: %rem:_(s64) = G_AND %var, [[ADD]]
+    ; GCN: $vgpr0_vgpr1 = COPY %rem(s64)
+    %var:_(s64) = COPY $vgpr0_vgpr1
+    %shiftamt:_(s32) = COPY $vgpr2
+    %one:_(s64) = G_CONSTANT i64 1
+    %one_bit:_(s64) = G_SHL %one, %shiftamt
+    %rem:_(s64) = G_UREM %var, %one_bit
+    $vgpr0_vgpr1 = COPY %rem
+...
+
+---
+name: urem_v2s32_var_shl1
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+
+    ; GCN-LABEL: name: urem_v2s32_var_shl1
+    ; GCN: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+    ; GCN: %var:_(<2 x s32>) = COPY $vgpr0_vgpr1
+    ; GCN: %shift_amt:_(<2 x s32>) = COPY $vgpr2_vgpr3
+    ; GCN: %one:_(s32) = G_CONSTANT i32 1
+    ; GCN: %one_vec:_(<2 x s32>) = G_BUILD_VECTOR %one(s32), %one(s32)
+    ; GCN: %one_bit:_(<2 x s32>) = G_SHL %one_vec, %shift_amt(<2 x s32>)
+    ; GCN: %rem:_(<2 x s32>) = G_UREM %var, %one_bit
+    ; GCN: $vgpr0_vgpr1 = COPY %rem(<2 x s32>)
+    %var:_(<2 x s32>) = COPY $vgpr0_vgpr1
+    %shift_amt:_(<2 x s32>) = COPY $vgpr2_vgpr3
+    %one:_(s32) = G_CONSTANT i32 1
+    %one_vec:_(<2 x s32>) = G_BUILD_VECTOR %one, %one
+    %one_bit:_(<2 x s32>) = G_SHL %one_vec, %shift_amt
+    %rem:_(<2 x s32>) = G_UREM %var, %one_bit
+    $vgpr0_vgpr1 = COPY %rem
+...
+
+---
+name: urem_v2s16_var_const4_build_vector_trunc
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+
+    ; GCN-LABEL: name: urem_v2s16_var_const4_build_vector_trunc
+    ; GCN: liveins: $vgpr0, $vgpr1
+    ; GCN: %var:_(<2 x s16>) = COPY $vgpr0
+    ; GCN: %four:_(s32) = G_CONSTANT i32 4
+    ; GCN: %four_vec:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC %four(s32), %four(s32)
+    ; GCN: %rem:_(<2 x s16>) = G_UREM %var, %four_vec
+    ; GCN: $vgpr0 = COPY %rem(<2 x s16>)
+    %var:_(<2 x s16>) = COPY $vgpr0
+    %shift_amt:_(<2 x s16>) = COPY $vgpr1
+    %four:_(s32) = G_CONSTANT i32 4
+    %four_vec:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC %four, %four
+    %rem:_(<2 x s16>) = G_UREM %var, %four_vec
+    $vgpr0 = COPY %rem
+...

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll
index 7850d42f2166..e6bee5ee92f0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll
@@ -207,24 +207,8 @@ define i32 @v_urem_i32_pow2k_denom(i32 %num) {
 ; CHECK-LABEL: v_urem_i32_pow2k_denom:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_movk_i32 s4, 0x1000
-; CHECK-NEXT:    v_mov_b32_e32 v1, 0xfffff000
-; CHECK-NEXT:    v_cvt_f32_u32_e32 v2, s4
-; CHECK-NEXT:    v_rcp_iflag_f32_e32 v2, v2
-; CHECK-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
-; CHECK-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; CHECK-NEXT:    v_mul_lo_u32 v1, v1, v2
-; CHECK-NEXT:    v_mul_hi_u32 v1, v2, v1
-; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
-; CHECK-NEXT:    v_mul_hi_u32 v1, v0, v1
-; CHECK-NEXT:    v_lshlrev_b32_e32 v1, 12, v1
-; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
-; CHECK-NEXT:    v_subrev_i32_e32 v1, vcc, s4, v0
-; CHECK-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
-; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; CHECK-NEXT:    v_subrev_i32_e32 v1, vcc, s4, v0
-; CHECK-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
-; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; CHECK-NEXT:    s_add_i32 s4, 0x1000, -1
+; CHECK-NEXT:    v_and_b32_e32 v0, s4, v0
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %result = urem i32 %num, 4096
   ret i32 %result
@@ -266,42 +250,9 @@ define <2 x i32> @v_urem_v2i32_pow2k_denom(<2 x i32> %num) {
 ; CGP-LABEL: v_urem_v2i32_pow2k_denom:
 ; CGP:       ; %bb.0:
 ; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT:    s_movk_i32 s4, 0x1000
-; CGP-NEXT:    v_mov_b32_e32 v2, 0x1000
-; CGP-NEXT:    s_mov_b32 s5, 0x4f7ffffe
-; CGP-NEXT:    s_movk_i32 s6, 0xf000
-; CGP-NEXT:    v_cvt_f32_u32_e32 v3, s4
-; CGP-NEXT:    v_cvt_f32_u32_e32 v4, v2
-; CGP-NEXT:    v_rcp_iflag_f32_e32 v3, v3
-; CGP-NEXT:    v_rcp_iflag_f32_e32 v4, v4
-; CGP-NEXT:    v_mul_f32_e32 v3, s5, v3
-; CGP-NEXT:    v_mul_f32_e32 v4, s5, v4
-; CGP-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; CGP-NEXT:    v_cvt_u32_f32_e32 v4, v4
-; CGP-NEXT:    v_mul_lo_u32 v5, s6, v3
-; CGP-NEXT:    v_mul_lo_u32 v6, s6, v4
-; CGP-NEXT:    v_mul_hi_u32 v5, v3, v5
-; CGP-NEXT:    v_mul_hi_u32 v6, v4, v6
-; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
-; CGP-NEXT:    v_mul_hi_u32 v3, v0, v3
-; CGP-NEXT:    v_mul_hi_u32 v4, v1, v4
-; CGP-NEXT:    v_lshlrev_b32_e32 v3, 12, v3
-; CGP-NEXT:    v_lshlrev_b32_e32 v4, 12, v4
-; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v3
-; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v1, v4
-; CGP-NEXT:    v_subrev_i32_e32 v3, vcc, s4, v0
-; CGP-NEXT:    v_sub_i32_e32 v4, vcc, v1, v2
-; CGP-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
-; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v2
-; CGP-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; CGP-NEXT:    v_subrev_i32_e32 v3, vcc, s4, v0
-; CGP-NEXT:    v_sub_i32_e32 v4, vcc, v1, v2
-; CGP-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
-; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v2
-; CGP-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; CGP-NEXT:    s_add_i32 s4, 0x1000, -1
+; CGP-NEXT:    v_and_b32_e32 v0, s4, v0
+; CGP-NEXT:    v_and_b32_e32 v1, s4, v1
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %result = urem <2 x i32> %num, <i32 4096, i32 4096>
   ret <2 x i32> %result

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
index 2e1292d9dc65..60084a08a1fb 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
@@ -949,131 +949,13 @@ define i64 @v_urem_i64_pow2k_denom(i64 %num) {
 ; CHECK-LABEL: v_urem_i64_pow2k_denom:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_cvt_f32_u32_e32 v2, 0x1000
-; CHECK-NEXT:    v_cvt_f32_ubyte0_e32 v3, 0
-; CHECK-NEXT:    s_movk_i32 s6, 0xf000
-; CHECK-NEXT:    s_movk_i32 s7, 0x1000
-; CHECK-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v3
-; CHECK-NEXT:    v_rcp_iflag_f32_e32 v2, v2
-; CHECK-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
-; CHECK-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
-; CHECK-NEXT:    v_trunc_f32_e32 v3, v3
-; CHECK-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v3
-; CHECK-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; CHECK-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; CHECK-NEXT:    v_mul_lo_u32 v4, s6, v3
-; CHECK-NEXT:    v_mul_lo_u32 v5, s6, v2
-; CHECK-NEXT:    v_mul_lo_u32 v6, -1, v2
-; CHECK-NEXT:    v_mul_hi_u32 v7, s6, v2
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
-; CHECK-NEXT:    v_mul_lo_u32 v6, v3, v5
-; CHECK-NEXT:    v_mul_hi_u32 v8, v2, v5
-; CHECK-NEXT:    v_mul_hi_u32 v5, v3, v5
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
-; CHECK-NEXT:    v_mul_lo_u32 v7, v2, v4
-; CHECK-NEXT:    v_mul_lo_u32 v9, v3, v4
-; CHECK-NEXT:    v_mul_hi_u32 v10, v2, v4
-; CHECK-NEXT:    v_mul_hi_u32 v4, v3, v4
-; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
-; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v9, v5
-; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
-; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v10
-; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
-; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v9, v8
-; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
-; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
-; CHECK-NEXT:    v_addc_u32_e64 v5, s[4:5], v3, v4, vcc
-; CHECK-NEXT:    v_add_i32_e64 v3, s[4:5], v3, v4
-; CHECK-NEXT:    v_mul_lo_u32 v4, s6, v2
-; CHECK-NEXT:    v_mul_lo_u32 v6, -1, v2
-; CHECK-NEXT:    v_mul_hi_u32 v7, s6, v2
-; CHECK-NEXT:    v_mul_lo_u32 v8, s6, v5
-; CHECK-NEXT:    v_mul_lo_u32 v9, v5, v4
-; CHECK-NEXT:    v_mul_hi_u32 v10, v2, v4
-; CHECK-NEXT:    v_mul_hi_u32 v4, v5, v4
-; CHECK-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v8
-; CHECK-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v7
-; CHECK-NEXT:    v_mul_lo_u32 v7, v2, v6
-; CHECK-NEXT:    v_mul_lo_u32 v8, v5, v6
-; CHECK-NEXT:    v_mul_hi_u32 v11, v2, v6
-; CHECK-NEXT:    v_mul_hi_u32 v5, v5, v6
-; CHECK-NEXT:    v_add_i32_e64 v6, s[4:5], v9, v7
-; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v4, s[4:5], v8, v4
-; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v10
-; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v4, s[4:5], v4, v11
-; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v6, s[4:5], v7, v6
-; CHECK-NEXT:    v_add_i32_e64 v7, s[4:5], v8, v9
-; CHECK-NEXT:    v_add_i32_e64 v4, s[4:5], v4, v6
-; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v6, s[4:5], v7, v6
-; CHECK-NEXT:    v_add_i32_e64 v5, s[4:5], v5, v6
-; CHECK-NEXT:    v_addc_u32_e32 v3, vcc, v3, v5, vcc
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
-; CHECK-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v4, v1, v2
-; CHECK-NEXT:    v_mul_hi_u32 v5, v0, v2
-; CHECK-NEXT:    v_mul_hi_u32 v2, v1, v2
-; CHECK-NEXT:    v_mul_lo_u32 v6, v0, v3
-; CHECK-NEXT:    v_mul_lo_u32 v7, v1, v3
-; CHECK-NEXT:    v_mul_hi_u32 v8, v0, v3
-; CHECK-NEXT:    v_mul_hi_u32 v3, v1, v3
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
-; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
-; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
-; CHECK-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v8
-; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
-; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
-; CHECK-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
-; CHECK-NEXT:    v_mul_lo_u32 v5, s7, v2
-; CHECK-NEXT:    v_mul_lo_u32 v6, 0, v2
-; CHECK-NEXT:    v_mul_hi_u32 v2, s7, v2
-; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
-; CHECK-NEXT:    v_mul_lo_u32 v3, s7, v3
-; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v5
-; CHECK-NEXT:    v_subb_u32_e64 v3, s[4:5], v1, v2, vcc
-; CHECK-NEXT:    v_sub_i32_e64 v1, s[4:5], v1, v2
-; CHECK-NEXT:    v_cmp_le_u32_e64 s[4:5], s7, v0
-; CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, -1, s[4:5]
-; CHECK-NEXT:    v_cmp_le_u32_e64 s[4:5], 0, v3
-; CHECK-NEXT:    v_cndmask_b32_e64 v4, 0, -1, s[4:5]
-; CHECK-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
-; CHECK-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
-; CHECK-NEXT:    v_subrev_i32_e32 v4, vcc, s7, v0
-; CHECK-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CHECK-NEXT:    v_cmp_le_u32_e32 vcc, s7, v4
-; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
-; CHECK-NEXT:    v_cmp_le_u32_e32 vcc, 0, v1
-; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
-; CHECK-NEXT:    v_subrev_i32_e32 v7, vcc, s7, v4
-; CHECK-NEXT:    v_subbrev_u32_e32 v8, vcc, 0, v1, vcc
-; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; CHECK-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
-; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
-; CHECK-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc
-; CHECK-NEXT:    v_cndmask_b32_e32 v1, v1, v8, vcc
-; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; CHECK-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; CHECK-NEXT:    s_add_u32 s4, 0x1000, -1
+; CHECK-NEXT:    s_cselect_b32 s5, 1, 0
+; CHECK-NEXT:    s_and_b32 s5, s5, 1
+; CHECK-NEXT:    s_cmp_lg_u32 s5, 0
+; CHECK-NEXT:    s_addc_u32 s5, 0, -1
+; CHECK-NEXT:    v_and_b32_e32 v0, s4, v0
+; CHECK-NEXT:    v_and_b32_e32 v1, s5, v1
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %result = urem i64 %num, 4096
   ret i64 %result
@@ -1344,253 +1226,21 @@ define <2 x i64> @v_urem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; CGP-LABEL: v_urem_v2i64_pow2k_denom:
 ; CGP:       ; %bb.0:
 ; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT:    v_cvt_f32_u32_e32 v4, 0x1000
-; CGP-NEXT:    v_cvt_f32_ubyte0_e32 v5, 0
-; CGP-NEXT:    s_movk_i32 s8, 0xf000
-; CGP-NEXT:    s_movk_i32 s10, 0x1000
-; CGP-NEXT:    v_mov_b32_e32 v6, v4
-; CGP-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
-; CGP-NEXT:    v_mac_f32_e32 v6, 0x4f800000, v5
-; CGP-NEXT:    v_rcp_iflag_f32_e32 v4, v4
-; CGP-NEXT:    v_rcp_iflag_f32_e32 v5, v6
-; CGP-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
-; CGP-NEXT:    v_mul_f32_e32 v5, 0x5f7ffffc, v5
-; CGP-NEXT:    v_mul_f32_e32 v6, 0x2f800000, v4
-; CGP-NEXT:    v_mul_f32_e32 v7, 0x2f800000, v5
-; CGP-NEXT:    v_trunc_f32_e32 v6, v6
-; CGP-NEXT:    v_trunc_f32_e32 v7, v7
-; CGP-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v6
-; CGP-NEXT:    v_cvt_u32_f32_e32 v6, v6
-; CGP-NEXT:    v_mac_f32_e32 v5, 0xcf800000, v7
-; CGP-NEXT:    v_cvt_u32_f32_e32 v7, v7
-; CGP-NEXT:    v_cvt_u32_f32_e32 v4, v4
-; CGP-NEXT:    v_mul_lo_u32 v8, s8, v6
-; CGP-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; CGP-NEXT:    v_mul_lo_u32 v9, s8, v7
-; CGP-NEXT:    v_mul_lo_u32 v10, s8, v4
-; CGP-NEXT:    v_mul_lo_u32 v11, -1, v4
-; CGP-NEXT:    v_mul_hi_u32 v12, s8, v4
-; CGP-NEXT:    v_mul_lo_u32 v13, s8, v5
-; CGP-NEXT:    v_mul_lo_u32 v14, -1, v5
-; CGP-NEXT:    v_mul_hi_u32 v15, s8, v5
-; CGP-NEXT:    v_add_i32_e32 v8, vcc, v11, v8
-; CGP-NEXT:    v_mul_lo_u32 v11, v6, v10
-; CGP-NEXT:    v_mul_hi_u32 v16, v4, v10
-; CGP-NEXT:    v_mul_hi_u32 v10, v6, v10
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v14, v9
-; CGP-NEXT:    v_mul_lo_u32 v14, v7, v13
-; CGP-NEXT:    v_mul_hi_u32 v17, v5, v13
-; CGP-NEXT:    v_mul_hi_u32 v13, v7, v13
-; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v12
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v15
-; CGP-NEXT:    v_mul_lo_u32 v12, v4, v8
-; CGP-NEXT:    v_mul_lo_u32 v15, v6, v8
-; CGP-NEXT:    v_mul_hi_u32 v18, v4, v8
-; CGP-NEXT:    v_mul_hi_u32 v8, v6, v8
-; CGP-NEXT:    v_mul_lo_u32 v19, v5, v9
-; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v19
-; CGP-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v17
-; CGP-NEXT:    v_mul_lo_u32 v14, v7, v9
-; CGP-NEXT:    v_mul_hi_u32 v17, v5, v9
-; CGP-NEXT:    v_mul_hi_u32 v9, v7, v9
-; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v12
-; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v15, v10
-; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v14, v13
-; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v16
-; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v18
-; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[4:5]
-; CGP-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v17
-; CGP-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
-; CGP-NEXT:    v_add_i32_e32 v12, vcc, v15, v16
-; CGP-NEXT:    v_add_i32_e32 v15, vcc, v19, v18
-; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v17
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
-; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v15
-; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
-; CGP-NEXT:    v_add_i32_e32 v12, vcc, v14, v15
-; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v10
-; CGP-NEXT:    v_addc_u32_e64 v10, s[4:5], v6, v8, vcc
-; CGP-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v8
-; CGP-NEXT:    v_mul_lo_u32 v8, s8, v4
-; CGP-NEXT:    v_mul_lo_u32 v11, -1, v4
-; CGP-NEXT:    v_mul_hi_u32 v12, s8, v4
-; CGP-NEXT:    v_add_i32_e64 v5, s[4:5], v5, v13
-; CGP-NEXT:    v_addc_u32_e64 v13, s[6:7], v7, v9, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v7, s[6:7], v7, v9
-; CGP-NEXT:    v_mul_lo_u32 v9, s8, v5
-; CGP-NEXT:    v_mul_lo_u32 v14, -1, v5
-; CGP-NEXT:    v_mul_hi_u32 v15, s8, v5
-; CGP-NEXT:    v_mul_lo_u32 v16, s8, v10
-; CGP-NEXT:    v_mul_lo_u32 v17, v10, v8
-; CGP-NEXT:    v_mul_hi_u32 v18, v4, v8
-; CGP-NEXT:    v_mul_hi_u32 v8, v10, v8
-; CGP-NEXT:    v_mul_lo_u32 v19, s8, v13
-; CGP-NEXT:    v_add_i32_e64 v11, s[6:7], v11, v16
-; CGP-NEXT:    v_mul_lo_u32 v16, v13, v9
-; CGP-NEXT:    v_add_i32_e64 v14, s[6:7], v14, v19
-; CGP-NEXT:    v_mul_hi_u32 v19, v5, v9
-; CGP-NEXT:    v_mul_hi_u32 v9, v13, v9
-; CGP-NEXT:    v_add_i32_e64 v11, s[6:7], v11, v12
-; CGP-NEXT:    v_add_i32_e64 v12, s[6:7], v14, v15
-; CGP-NEXT:    v_mul_lo_u32 v14, v4, v11
-; CGP-NEXT:    v_mul_lo_u32 v15, v5, v12
-; CGP-NEXT:    v_add_i32_e64 v15, s[6:7], v16, v15
-; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[6:7]
-; CGP-NEXT:    v_add_i32_e64 v15, s[6:7], v15, v19
-; CGP-NEXT:    v_mul_lo_u32 v15, v10, v11
-; CGP-NEXT:    v_mul_hi_u32 v19, v4, v11
-; CGP-NEXT:    v_mul_hi_u32 v10, v10, v11
-; CGP-NEXT:    v_mul_lo_u32 v11, v13, v12
-; CGP-NEXT:    v_mul_hi_u32 v13, v13, v12
-; CGP-NEXT:    v_mul_hi_u32 v12, v5, v12
-; CGP-NEXT:    v_add_i32_e64 v14, s[8:9], v17, v14
-; CGP-NEXT:    v_cndmask_b32_e64 v17, 0, 1, s[8:9]
-; CGP-NEXT:    v_add_i32_e64 v8, s[8:9], v15, v8
-; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[8:9]
-; CGP-NEXT:    v_add_i32_e64 v9, s[8:9], v11, v9
-; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[8:9]
-; CGP-NEXT:    v_add_i32_e64 v14, s[8:9], v14, v18
-; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[8:9]
-; CGP-NEXT:    v_add_i32_e64 v8, s[8:9], v8, v19
-; CGP-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[8:9]
-; CGP-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[6:7]
-; CGP-NEXT:    v_add_i32_e64 v9, s[6:7], v9, v12
-; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[6:7]
-; CGP-NEXT:    v_add_i32_e64 v14, s[6:7], v17, v14
-; CGP-NEXT:    v_add_i32_e64 v15, s[6:7], v15, v18
-; CGP-NEXT:    v_add_i32_e64 v16, s[6:7], v16, v19
-; CGP-NEXT:    v_add_i32_e64 v11, s[6:7], v11, v12
-; CGP-NEXT:    v_add_i32_e64 v8, s[6:7], v8, v14
-; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[6:7]
-; CGP-NEXT:    v_add_i32_e64 v9, s[6:7], v9, v16
-; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[6:7]
-; CGP-NEXT:    v_add_i32_e64 v12, s[6:7], v15, v12
-; CGP-NEXT:    v_add_i32_e64 v11, s[6:7], v11, v14
-; CGP-NEXT:    v_add_i32_e64 v10, s[6:7], v10, v12
-; CGP-NEXT:    v_add_i32_e64 v11, s[6:7], v13, v11
-; CGP-NEXT:    v_addc_u32_e32 v6, vcc, v6, v10, vcc
-; CGP-NEXT:    v_addc_u32_e64 v7, vcc, v7, v11, s[4:5]
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
-; CGP-NEXT:    v_addc_u32_e32 v6, vcc, 0, v6, vcc
-; CGP-NEXT:    v_mul_lo_u32 v8, v3, v4
-; CGP-NEXT:    v_mul_hi_u32 v10, v2, v4
-; CGP-NEXT:    v_mul_hi_u32 v4, v3, v4
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
-; CGP-NEXT:    v_addc_u32_e32 v7, vcc, 0, v7, vcc
-; CGP-NEXT:    v_mul_lo_u32 v9, v1, v5
-; CGP-NEXT:    v_mul_hi_u32 v11, v0, v5
-; CGP-NEXT:    v_mul_hi_u32 v5, v1, v5
-; CGP-NEXT:    v_mul_lo_u32 v12, v2, v6
-; CGP-NEXT:    v_mul_lo_u32 v13, v3, v6
-; CGP-NEXT:    v_mul_hi_u32 v14, v2, v6
-; CGP-NEXT:    v_mul_hi_u32 v6, v3, v6
-; CGP-NEXT:    v_mul_lo_u32 v15, v0, v7
-; CGP-NEXT:    v_mul_lo_u32 v16, v1, v7
-; CGP-NEXT:    v_mul_hi_u32 v17, v0, v7
-; CGP-NEXT:    v_mul_hi_u32 v7, v1, v7
-; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v12
-; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v13, v4
-; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v15
-; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v16, v5
-; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
-; CGP-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v14
-; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
-; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v17
-; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v8, vcc, v12, v8
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, v13, v10
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v15, v9
-; CGP-NEXT:    v_add_i32_e32 v11, vcc, v16, v11
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
-; CGP-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
-; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
-; CGP-NEXT:    v_mul_lo_u32 v10, s10, v4
-; CGP-NEXT:    v_mul_lo_u32 v12, 0, v4
-; CGP-NEXT:    v_mul_hi_u32 v4, s10, v4
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v11, v9
-; CGP-NEXT:    v_mul_lo_u32 v11, s10, v5
-; CGP-NEXT:    v_mul_lo_u32 v13, 0, v5
-; CGP-NEXT:    v_mul_hi_u32 v5, s10, v5
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
-; CGP-NEXT:    v_mul_lo_u32 v6, s10, v6
-; CGP-NEXT:    v_mul_lo_u32 v7, s10, v7
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v12, v6
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v13, v7
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
-; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v2, v10
-; CGP-NEXT:    v_subb_u32_e64 v6, s[4:5], v3, v4, vcc
-; CGP-NEXT:    v_sub_i32_e64 v3, s[4:5], v3, v4
-; CGP-NEXT:    v_cmp_le_u32_e64 s[4:5], s10, v2
-; CGP-NEXT:    v_cndmask_b32_e64 v4, 0, -1, s[4:5]
-; CGP-NEXT:    v_sub_i32_e64 v0, s[4:5], v0, v11
-; CGP-NEXT:    v_subb_u32_e64 v7, s[6:7], v1, v5, s[4:5]
-; CGP-NEXT:    v_sub_i32_e64 v1, s[6:7], v1, v5
-; CGP-NEXT:    v_cmp_le_u32_e64 s[6:7], s10, v0
-; CGP-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[6:7]
-; CGP-NEXT:    v_cmp_le_u32_e64 s[6:7], 0, v6
-; CGP-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[6:7]
-; CGP-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
-; CGP-NEXT:    v_cmp_le_u32_e32 vcc, 0, v7
-; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, -1, vcc
-; CGP-NEXT:    v_subbrev_u32_e64 v1, vcc, 0, v1, s[4:5]
-; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v6
-; CGP-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
-; CGP-NEXT:    v_subrev_i32_e32 v8, vcc, s10, v2
-; CGP-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
-; CGP-NEXT:    v_cmp_le_u32_e32 vcc, s10, v8
-; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, -1, vcc
-; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
-; CGP-NEXT:    v_cndmask_b32_e32 v5, v9, v5, vcc
-; CGP-NEXT:    v_subrev_i32_e32 v9, vcc, s10, v0
-; CGP-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CGP-NEXT:    v_cmp_le_u32_e32 vcc, s10, v9
-; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, -1, vcc
-; CGP-NEXT:    v_cmp_le_u32_e32 vcc, 0, v3
-; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, -1, vcc
-; CGP-NEXT:    v_subrev_i32_e32 v13, vcc, s10, v8
-; CGP-NEXT:    v_subbrev_u32_e32 v14, vcc, 0, v3, vcc
-; CGP-NEXT:    v_cmp_le_u32_e32 vcc, 0, v1
-; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, -1, vcc
-; CGP-NEXT:    v_subrev_i32_e32 v16, vcc, s10, v9
-; CGP-NEXT:    v_subbrev_u32_e32 v17, vcc, 0, v1, vcc
-; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
-; CGP-NEXT:    v_cndmask_b32_e32 v10, v12, v10, vcc
-; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; CGP-NEXT:    v_cndmask_b32_e32 v11, v15, v11, vcc
-; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
-; CGP-NEXT:    v_cndmask_b32_e32 v8, v8, v13, vcc
-; CGP-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v11
-; CGP-NEXT:    v_cndmask_b32_e64 v9, v9, v16, s[4:5]
-; CGP-NEXT:    v_cndmask_b32_e32 v3, v3, v14, vcc
-; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
-; CGP-NEXT:    v_cndmask_b32_e32 v2, v2, v8, vcc
-; CGP-NEXT:    v_cndmask_b32_e64 v1, v1, v17, s[4:5]
-; CGP-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v5
-; CGP-NEXT:    v_cndmask_b32_e64 v0, v0, v9, s[4:5]
-; CGP-NEXT:    v_cndmask_b32_e64 v1, v7, v1, s[4:5]
-; CGP-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc
+; CGP-NEXT:    s_movk_i32 s4, 0x1000
+; CGP-NEXT:    s_add_u32 s5, s4, -1
+; CGP-NEXT:    s_cselect_b32 s6, 1, 0
+; CGP-NEXT:    s_and_b32 s6, s6, 1
+; CGP-NEXT:    s_cmp_lg_u32 s6, 0
+; CGP-NEXT:    s_addc_u32 s6, 0, -1
+; CGP-NEXT:    s_add_u32 s4, s4, -1
+; CGP-NEXT:    s_cselect_b32 s7, 1, 0
+; CGP-NEXT:    v_and_b32_e32 v0, s5, v0
+; CGP-NEXT:    s_and_b32 s5, s7, 1
+; CGP-NEXT:    v_and_b32_e32 v1, s6, v1
+; CGP-NEXT:    s_cmp_lg_u32 s5, 0
+; CGP-NEXT:    s_addc_u32 s5, 0, -1
+; CGP-NEXT:    v_and_b32_e32 v2, s4, v2
+; CGP-NEXT:    v_and_b32_e32 v3, s5, v3
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %result = urem <2 x i64> %num, <i64 4096, i64 4096>
   ret <2 x i64> %result


        

