[llvm-branch-commits] [llvm] 1f9b6ef - GlobalISel: Add combine for G_UREM by power of 2
Matt Arsenault via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Thu Jan 7 13:41:00 PST 2021
Author: Matt Arsenault
Date: 2021-01-07T16:36:35-05:00
New Revision: 1f9b6ef91ffd8ea487aa083d146c7568e7243457
URL: https://github.com/llvm/llvm-project/commit/1f9b6ef91ffd8ea487aa083d146c7568e7243457
DIFF: https://github.com/llvm/llvm-project/commit/1f9b6ef91ffd8ea487aa083d146c7568e7243457.diff
LOG: GlobalISel: Add combine for G_UREM by power of 2
Really I want this in the legalizer, but this is a start.
Added:
llvm/test/CodeGen/AMDGPU/GlobalISel/combine-urem-pow-2.mir
Modified:
llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
llvm/include/llvm/Target/GlobalISel/Combine.td
llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
Removed:
################################################################################
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index 432587ea46c4..0d240e90820f 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -400,6 +400,9 @@ class CombinerHelper {
/// Check if operand \p OpIdx is undef.
bool matchOperandIsUndef(MachineInstr &MI, unsigned OpIdx);
+ /// Check if operand \p OpIdx is known to be a power of 2.
+ bool matchOperandIsKnownToBeAPowerOfTwo(MachineInstr &MI, unsigned OpIdx);
+
/// Erase \p MI
bool eraseInst(MachineInstr &MI);
@@ -459,6 +462,9 @@ class CombinerHelper {
bool matchPtrAddZero(MachineInstr &MI);
bool applyPtrAddZero(MachineInstr &MI);
+ /// Combine G_UREM x, (known power of 2) into G_AND x, (pow2 + -1).
+ bool applySimplifyURemByPow2(MachineInstr &MI);
+
bool matchCombineInsertVecElts(MachineInstr &MI,
SmallVectorImpl<Register> &MatchInfo);
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 32aec75af1fa..e352e499d47c 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -296,6 +296,13 @@ def binop_left_to_zero: GICombineRule<
(apply [{ return Helper.replaceSingleDefInstWithOperand(*${root}, 1); }])
>;
+def urem_pow2_to_mask : GICombineRule< // Fold (urem x, pow2) -> (and x, pow2-1)
+ (defs root:$root),
+ (match (wip_match_opcode G_UREM):$root,
+ [{ return Helper.matchOperandIsKnownToBeAPowerOfTwo(*${root}, 2); }]),
+ (apply [{ return Helper.applySimplifyURemByPow2(*${root}); }])
+>;
+
// Fold (x op 0) - > 0
def binop_right_to_zero: GICombineRule<
(defs root:$root),
@@ -560,7 +567,7 @@ def identity_combines : GICombineGroup<[select_same_val, right_identity_zero,
def const_combines : GICombineGroup<[constant_fp_op, const_ptradd_to_i2p]>;
def known_bits_simplifications : GICombineGroup<[
- redundant_and, redundant_sext_inreg, redundant_or]>;
+ redundant_and, redundant_sext_inreg, redundant_or, urem_pow2_to_mask]>;
def width_reduction_combines : GICombineGroup<[reduce_shl_of_extend]>;
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index abc23da3d418..bbcf32a73fe0 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -2580,6 +2580,12 @@ bool CombinerHelper::matchOperandIsUndef(MachineInstr &MI, unsigned OpIdx) {
getOpcodeDef(TargetOpcode::G_IMPLICIT_DEF, MO.getReg(), MRI);
}
+bool CombinerHelper::matchOperandIsKnownToBeAPowerOfTwo(MachineInstr &MI,
+ unsigned OpIdx) { // Forwards to isKnownToBeAPowerOfTwo for operand OpIdx of MI.
+ MachineOperand &MO = MI.getOperand(OpIdx); // getReg() below expects a register operand.
+ return isKnownToBeAPowerOfTwo(MO.getReg(), MRI, KB);
+}
+
bool CombinerHelper::replaceInstWithFConstant(MachineInstr &MI, double C) {
assert(MI.getNumDefs() == 1 && "Expected only one def?");
Builder.setInstr(MI);
@@ -3130,6 +3136,22 @@ bool CombinerHelper::applyPtrAddZero(MachineInstr &MI) {
return true;
}
+/// Rewrite G_UREM x, y (y assumed a power of 2) as G_AND x, (y + -1); erases MI.
+bool CombinerHelper::applySimplifyURemByPow2(MachineInstr &MI) {
+ Register DstReg = MI.getOperand(0).getReg();
+ Register Src0 = MI.getOperand(1).getReg();
+ Register Pow2Src1 = MI.getOperand(2).getReg();
+ LLT Ty = MRI.getType(DstReg);
+ Builder.setInstrAndDebugLoc(MI);
+
+ // Fold (urem x, pow2) -> (and x, pow2-1); pow2-1 is emitted as pow2 + (-1).
+ auto NegOne = Builder.buildConstant(Ty, -1);
+ auto Add = Builder.buildAdd(Ty, Pow2Src1, NegOne);
+ Builder.buildAnd(DstReg, Src0, Add);
+ MI.eraseFromParent(); // The G_AND built above now defines DstReg.
+ return true;
+}
+
bool CombinerHelper::tryCombine(MachineInstr &MI) {
if (tryCombineCopy(MI))
return true;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-urem-pow-2.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-urem-pow-2.mir
new file mode 100644
index 000000000000..f92e32dab08f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-urem-pow-2.mir
@@ -0,0 +1,156 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=amdgpu-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
+
+---
+name: urem_s32_var_const0
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; GCN-LABEL: name: urem_s32_var_const0
+ ; GCN: liveins: $vgpr0
+ ; GCN: %var:_(s32) = COPY $vgpr0
+ ; GCN: %const:_(s32) = G_CONSTANT i32 0
+ ; GCN: %rem:_(s32) = G_UREM %var, %const
+ ; GCN: $vgpr0 = COPY %rem(s32)
+ %var:_(s32) = COPY $vgpr0
+ %const:_(s32) = G_CONSTANT i32 0
+ %rem:_(s32) = G_UREM %var, %const
+ $vgpr0 = COPY %rem
+...
+
+---
+name: urem_s32_var_const1
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; GCN-LABEL: name: urem_s32_var_const1
+ ; GCN: liveins: $vgpr0
+ ; GCN: %const:_(s32) = G_CONSTANT i32 1
+ ; GCN: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; GCN: [[ADD:%[0-9]+]]:_(s32) = G_ADD %const, [[C]]
+ ; GCN: $vgpr0 = COPY [[ADD]](s32)
+ %var:_(s32) = COPY $vgpr0
+ %const:_(s32) = G_CONSTANT i32 1
+ %rem:_(s32) = G_UREM %var, %const
+ $vgpr0 = COPY %rem
+...
+
+---
+name: urem_s32_var_const2
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; GCN-LABEL: name: urem_s32_var_const2
+ ; GCN: liveins: $vgpr0
+ ; GCN: %const:_(s32) = G_CONSTANT i32 1
+ ; GCN: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; GCN: [[ADD:%[0-9]+]]:_(s32) = G_ADD %const, [[C]]
+ ; GCN: $vgpr0 = COPY [[ADD]](s32)
+ %var:_(s32) = COPY $vgpr0
+ %const:_(s32) = G_CONSTANT i32 1
+ %rem:_(s32) = G_UREM %var, %const
+ $vgpr0 = COPY %rem
+...
+
+---
+name: urem_s32_var_shl1
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+
+ ; GCN-LABEL: name: urem_s32_var_shl1
+ ; GCN: liveins: $vgpr0, $vgpr1
+ ; GCN: %var:_(s32) = COPY $vgpr0
+ ; GCN: %shift_amt:_(s32) = COPY $vgpr1
+ ; GCN: %one:_(s32) = G_CONSTANT i32 1
+ ; GCN: %one_bit:_(s32) = G_SHL %one, %shift_amt(s32)
+ ; GCN: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; GCN: [[ADD:%[0-9]+]]:_(s32) = G_ADD %one_bit, [[C]]
+ ; GCN: %rem:_(s32) = G_AND %var, [[ADD]]
+ ; GCN: $vgpr0 = COPY %rem(s32)
+ %var:_(s32) = COPY $vgpr0
+ %shift_amt:_(s32) = COPY $vgpr1
+ %one:_(s32) = G_CONSTANT i32 1
+ %one_bit:_(s32) = G_SHL %one, %shift_amt
+ %rem:_(s32) = G_UREM %var, %one_bit
+ $vgpr0 = COPY %rem
+...
+
+---
+name: urem_s64_var_shl1
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $vgpr2
+
+ ; GCN-LABEL: name: urem_s64_var_shl1
+ ; GCN: liveins: $vgpr0_vgpr1, $vgpr2
+ ; GCN: %var:_(s64) = COPY $vgpr0_vgpr1
+ ; GCN: %shiftamt:_(s32) = COPY $vgpr2
+ ; GCN: %one:_(s64) = G_CONSTANT i64 1
+ ; GCN: %one_bit:_(s64) = G_SHL %one, %shiftamt(s32)
+ ; GCN: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
+ ; GCN: [[ADD:%[0-9]+]]:_(s64) = G_ADD %one_bit, [[C]]
+ ; GCN: %rem:_(s64) = G_AND %var, [[ADD]]
+ ; GCN: $vgpr0_vgpr1 = COPY %rem(s64)
+ %var:_(s64) = COPY $vgpr0_vgpr1
+ %shiftamt:_(s32) = COPY $vgpr2
+ %one:_(s64) = G_CONSTANT i64 1
+ %one_bit:_(s64) = G_SHL %one, %shiftamt
+ %rem:_(s64) = G_UREM %var, %one_bit
+ $vgpr0_vgpr1 = COPY %rem
+...
+
+---
+name: urem_v2s32_var_shl1
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+
+ ; GCN-LABEL: name: urem_v2s32_var_shl1
+ ; GCN: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; GCN: %var:_(<2 x s32>) = COPY $vgpr0_vgpr1
+ ; GCN: %shift_amt:_(<2 x s32>) = COPY $vgpr2_vgpr3
+ ; GCN: %one:_(s32) = G_CONSTANT i32 1
+ ; GCN: %one_vec:_(<2 x s32>) = G_BUILD_VECTOR %one(s32), %one(s32)
+ ; GCN: %one_bit:_(<2 x s32>) = G_SHL %one_vec, %shift_amt(<2 x s32>)
+ ; GCN: %rem:_(<2 x s32>) = G_UREM %var, %one_bit
+ ; GCN: $vgpr0_vgpr1 = COPY %rem(<2 x s32>)
+ %var:_(<2 x s32>) = COPY $vgpr0_vgpr1
+ %shift_amt:_(<2 x s32>) = COPY $vgpr2_vgpr3
+ %one:_(s32) = G_CONSTANT i32 1
+ %one_vec:_(<2 x s32>) = G_BUILD_VECTOR %one, %one
+ %one_bit:_(<2 x s32>) = G_SHL %one_vec, %shift_amt
+ %rem:_(<2 x s32>) = G_UREM %var, %one_bit
+ $vgpr0_vgpr1 = COPY %rem
+...
+
+---
+name: urem_v2s16_var_const4_build_vector_trunc
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+
+ ; GCN-LABEL: name: urem_v2s16_var_const4_build_vector_trunc
+ ; GCN: liveins: $vgpr0, $vgpr1
+ ; GCN: %var:_(<2 x s16>) = COPY $vgpr0
+ ; GCN: %four:_(s32) = G_CONSTANT i32 4
+ ; GCN: %four_vec:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC %four(s32), %four(s32)
+ ; GCN: %rem:_(<2 x s16>) = G_UREM %var, %four_vec
+ ; GCN: $vgpr0 = COPY %rem(<2 x s16>)
+ %var:_(<2 x s16>) = COPY $vgpr0
+ %shift_amt:_(<2 x s16>) = COPY $vgpr1
+ %four:_(s32) = G_CONSTANT i32 4
+ %four_vec:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC %four, %four
+ %rem:_(<2 x s16>) = G_UREM %var, %four_vec
+ $vgpr0 = COPY %rem
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll
index 7850d42f2166..e6bee5ee92f0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll
@@ -207,24 +207,8 @@ define i32 @v_urem_i32_pow2k_denom(i32 %num) {
; CHECK-LABEL: v_urem_i32_pow2k_denom:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_movk_i32 s4, 0x1000
-; CHECK-NEXT: v_mov_b32_e32 v1, 0xfffff000
-; CHECK-NEXT: v_cvt_f32_u32_e32 v2, s4
-; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2
-; CHECK-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
-; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2
-; CHECK-NEXT: v_mul_lo_u32 v1, v1, v2
-; CHECK-NEXT: v_mul_hi_u32 v1, v2, v1
-; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1
-; CHECK-NEXT: v_mul_hi_u32 v1, v0, v1
-; CHECK-NEXT: v_lshlrev_b32_e32 v1, 12, v1
-; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
-; CHECK-NEXT: v_subrev_i32_e32 v1, vcc, s4, v0
-; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s4, v0
-; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; CHECK-NEXT: v_subrev_i32_e32 v1, vcc, s4, v0
-; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s4, v0
-; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; CHECK-NEXT: s_add_i32 s4, 0x1000, -1
+; CHECK-NEXT: v_and_b32_e32 v0, s4, v0
; CHECK-NEXT: s_setpc_b64 s[30:31]
%result = urem i32 %num, 4096
ret i32 %result
@@ -266,42 +250,9 @@ define <2 x i32> @v_urem_v2i32_pow2k_denom(<2 x i32> %num) {
; CGP-LABEL: v_urem_v2i32_pow2k_denom:
; CGP: ; %bb.0:
; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT: s_movk_i32 s4, 0x1000
-; CGP-NEXT: v_mov_b32_e32 v2, 0x1000
-; CGP-NEXT: s_mov_b32 s5, 0x4f7ffffe
-; CGP-NEXT: s_movk_i32 s6, 0xf000
-; CGP-NEXT: v_cvt_f32_u32_e32 v3, s4
-; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2
-; CGP-NEXT: v_rcp_iflag_f32_e32 v3, v3
-; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4
-; CGP-NEXT: v_mul_f32_e32 v3, s5, v3
-; CGP-NEXT: v_mul_f32_e32 v4, s5, v4
-; CGP-NEXT: v_cvt_u32_f32_e32 v3, v3
-; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4
-; CGP-NEXT: v_mul_lo_u32 v5, s6, v3
-; CGP-NEXT: v_mul_lo_u32 v6, s6, v4
-; CGP-NEXT: v_mul_hi_u32 v5, v3, v5
-; CGP-NEXT: v_mul_hi_u32 v6, v4, v6
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v6
-; CGP-NEXT: v_mul_hi_u32 v3, v0, v3
-; CGP-NEXT: v_mul_hi_u32 v4, v1, v4
-; CGP-NEXT: v_lshlrev_b32_e32 v3, 12, v3
-; CGP-NEXT: v_lshlrev_b32_e32 v4, 12, v4
-; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v3
-; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v4
-; CGP-NEXT: v_subrev_i32_e32 v3, vcc, s4, v0
-; CGP-NEXT: v_sub_i32_e32 v4, vcc, v1, v2
-; CGP-NEXT: v_cmp_le_u32_e32 vcc, s4, v0
-; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2
-; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
-; CGP-NEXT: v_subrev_i32_e32 v3, vcc, s4, v0
-; CGP-NEXT: v_sub_i32_e32 v4, vcc, v1, v2
-; CGP-NEXT: v_cmp_le_u32_e32 vcc, s4, v0
-; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2
-; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
+; CGP-NEXT: s_add_i32 s4, 0x1000, -1
+; CGP-NEXT: v_and_b32_e32 v0, s4, v0
+; CGP-NEXT: v_and_b32_e32 v1, s4, v1
; CGP-NEXT: s_setpc_b64 s[30:31]
%result = urem <2 x i32> %num, <i32 4096, i32 4096>
ret <2 x i32> %result
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
index 2e1292d9dc65..60084a08a1fb 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
@@ -949,131 +949,13 @@ define i64 @v_urem_i64_pow2k_denom(i64 %num) {
; CHECK-LABEL: v_urem_i64_pow2k_denom:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_cvt_f32_u32_e32 v2, 0x1000
-; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v3, 0
-; CHECK-NEXT: s_movk_i32 s6, 0xf000
-; CHECK-NEXT: s_movk_i32 s7, 0x1000
-; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3
-; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2
-; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2
-; CHECK-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2
-; CHECK-NEXT: v_trunc_f32_e32 v3, v3
-; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v3
-; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v3
-; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2
-; CHECK-NEXT: v_mul_lo_u32 v4, s6, v3
-; CHECK-NEXT: v_mul_lo_u32 v5, s6, v2
-; CHECK-NEXT: v_mul_lo_u32 v6, -1, v2
-; CHECK-NEXT: v_mul_hi_u32 v7, s6, v2
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4
-; CHECK-NEXT: v_mul_lo_u32 v6, v3, v5
-; CHECK-NEXT: v_mul_hi_u32 v8, v2, v5
-; CHECK-NEXT: v_mul_hi_u32 v5, v3, v5
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v7
-; CHECK-NEXT: v_mul_lo_u32 v7, v2, v4
-; CHECK-NEXT: v_mul_lo_u32 v9, v3, v4
-; CHECK-NEXT: v_mul_hi_u32 v10, v2, v4
-; CHECK-NEXT: v_mul_hi_u32 v4, v3, v4
-; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7
-; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v9, v5
-; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8
-; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v10
-; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6
-; CHECK-NEXT: v_add_i32_e32 v7, vcc, v9, v8
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6
-; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5
-; CHECK-NEXT: v_addc_u32_e64 v5, s[4:5], v3, v4, vcc
-; CHECK-NEXT: v_add_i32_e64 v3, s[4:5], v3, v4
-; CHECK-NEXT: v_mul_lo_u32 v4, s6, v2
-; CHECK-NEXT: v_mul_lo_u32 v6, -1, v2
-; CHECK-NEXT: v_mul_hi_u32 v7, s6, v2
-; CHECK-NEXT: v_mul_lo_u32 v8, s6, v5
-; CHECK-NEXT: v_mul_lo_u32 v9, v5, v4
-; CHECK-NEXT: v_mul_hi_u32 v10, v2, v4
-; CHECK-NEXT: v_mul_hi_u32 v4, v5, v4
-; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v6, v8
-; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v6, v7
-; CHECK-NEXT: v_mul_lo_u32 v7, v2, v6
-; CHECK-NEXT: v_mul_lo_u32 v8, v5, v6
-; CHECK-NEXT: v_mul_hi_u32 v11, v2, v6
-; CHECK-NEXT: v_mul_hi_u32 v5, v5, v6
-; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v9, v7
-; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5]
-; CHECK-NEXT: v_add_i32_e64 v4, s[4:5], v8, v4
-; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5]
-; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v6, v10
-; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5]
-; CHECK-NEXT: v_add_i32_e64 v4, s[4:5], v4, v11
-; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5]
-; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v7, v6
-; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v8, v9
-; CHECK-NEXT: v_add_i32_e64 v4, s[4:5], v4, v6
-; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5]
-; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v7, v6
-; CHECK-NEXT: v_add_i32_e64 v5, s[4:5], v5, v6
-; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4
-; CHECK-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; CHECK-NEXT: v_mul_lo_u32 v4, v1, v2
-; CHECK-NEXT: v_mul_hi_u32 v5, v0, v2
-; CHECK-NEXT: v_mul_hi_u32 v2, v1, v2
-; CHECK-NEXT: v_mul_lo_u32 v6, v0, v3
-; CHECK-NEXT: v_mul_lo_u32 v7, v1, v3
-; CHECK-NEXT: v_mul_hi_u32 v8, v0, v3
-; CHECK-NEXT: v_mul_hi_u32 v3, v1, v3
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6
-; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2
-; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5
-; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v8
-; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v7, v5
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4
-; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4
-; CHECK-NEXT: v_mul_lo_u32 v5, s7, v2
-; CHECK-NEXT: v_mul_lo_u32 v6, 0, v2
-; CHECK-NEXT: v_mul_hi_u32 v2, s7, v2
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4
-; CHECK-NEXT: v_mul_lo_u32 v3, s7, v3
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v6, v3
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v5
-; CHECK-NEXT: v_subb_u32_e64 v3, s[4:5], v1, v2, vcc
-; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v2
-; CHECK-NEXT: v_cmp_le_u32_e64 s[4:5], s7, v0
-; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[4:5]
-; CHECK-NEXT: v_cmp_le_u32_e64 s[4:5], 0, v3
-; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5]
-; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
-; CHECK-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
-; CHECK-NEXT: v_subrev_i32_e32 v4, vcc, s7, v0
-; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s7, v4
-; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc
-; CHECK-NEXT: v_cmp_le_u32_e32 vcc, 0, v1
-; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc
-; CHECK-NEXT: v_subrev_i32_e32 v7, vcc, s7, v4
-; CHECK-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v1, vcc
-; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; CHECK-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
-; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; CHECK-NEXT: s_add_u32 s4, 0x1000, -1
+; CHECK-NEXT: s_cselect_b32 s5, 1, 0
+; CHECK-NEXT: s_and_b32 s5, s5, 1
+; CHECK-NEXT: s_cmp_lg_u32 s5, 0
+; CHECK-NEXT: s_addc_u32 s5, 0, -1
+; CHECK-NEXT: v_and_b32_e32 v0, s4, v0
+; CHECK-NEXT: v_and_b32_e32 v1, s5, v1
; CHECK-NEXT: s_setpc_b64 s[30:31]
%result = urem i64 %num, 4096
ret i64 %result
@@ -1344,253 +1226,21 @@ define <2 x i64> @v_urem_v2i64_pow2k_denom(<2 x i64> %num) {
; CGP-LABEL: v_urem_v2i64_pow2k_denom:
; CGP: ; %bb.0:
; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT: v_cvt_f32_u32_e32 v4, 0x1000
-; CGP-NEXT: v_cvt_f32_ubyte0_e32 v5, 0
-; CGP-NEXT: s_movk_i32 s8, 0xf000
-; CGP-NEXT: s_movk_i32 s10, 0x1000
-; CGP-NEXT: v_mov_b32_e32 v6, v4
-; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5
-; CGP-NEXT: v_mac_f32_e32 v6, 0x4f800000, v5
-; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4
-; CGP-NEXT: v_rcp_iflag_f32_e32 v5, v6
-; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
-; CGP-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5
-; CGP-NEXT: v_mul_f32_e32 v6, 0x2f800000, v4
-; CGP-NEXT: v_mul_f32_e32 v7, 0x2f800000, v5
-; CGP-NEXT: v_trunc_f32_e32 v6, v6
-; CGP-NEXT: v_trunc_f32_e32 v7, v7
-; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v6
-; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6
-; CGP-NEXT: v_mac_f32_e32 v5, 0xcf800000, v7
-; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7
-; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4
-; CGP-NEXT: v_mul_lo_u32 v8, s8, v6
-; CGP-NEXT: v_cvt_u32_f32_e32 v5, v5
-; CGP-NEXT: v_mul_lo_u32 v9, s8, v7
-; CGP-NEXT: v_mul_lo_u32 v10, s8, v4
-; CGP-NEXT: v_mul_lo_u32 v11, -1, v4
-; CGP-NEXT: v_mul_hi_u32 v12, s8, v4
-; CGP-NEXT: v_mul_lo_u32 v13, s8, v5
-; CGP-NEXT: v_mul_lo_u32 v14, -1, v5
-; CGP-NEXT: v_mul_hi_u32 v15, s8, v5
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v11, v8
-; CGP-NEXT: v_mul_lo_u32 v11, v6, v10
-; CGP-NEXT: v_mul_hi_u32 v16, v4, v10
-; CGP-NEXT: v_mul_hi_u32 v10, v6, v10
-; CGP-NEXT: v_add_i32_e32 v9, vcc, v14, v9
-; CGP-NEXT: v_mul_lo_u32 v14, v7, v13
-; CGP-NEXT: v_mul_hi_u32 v17, v5, v13
-; CGP-NEXT: v_mul_hi_u32 v13, v7, v13
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v12
-; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v15
-; CGP-NEXT: v_mul_lo_u32 v12, v4, v8
-; CGP-NEXT: v_mul_lo_u32 v15, v6, v8
-; CGP-NEXT: v_mul_hi_u32 v18, v4, v8
-; CGP-NEXT: v_mul_hi_u32 v8, v6, v8
-; CGP-NEXT: v_mul_lo_u32 v19, v5, v9
-; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v19
-; CGP-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v17
-; CGP-NEXT: v_mul_lo_u32 v14, v7, v9
-; CGP-NEXT: v_mul_hi_u32 v17, v5, v9
-; CGP-NEXT: v_mul_hi_u32 v9, v7, v9
-; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12
-; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v15, v10
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5]
-; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v14, v13
-; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5]
-; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v16
-; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5]
-; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v18
-; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5]
-; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v17
-; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11
-; CGP-NEXT: v_add_i32_e32 v12, vcc, v15, v16
-; CGP-NEXT: v_add_i32_e32 v15, vcc, v19, v18
-; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v17
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11
-; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11
-; CGP-NEXT: v_add_i32_e32 v12, vcc, v14, v15
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11
-; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10
-; CGP-NEXT: v_addc_u32_e64 v10, s[4:5], v6, v8, vcc
-; CGP-NEXT: v_add_i32_e64 v6, s[4:5], v6, v8
-; CGP-NEXT: v_mul_lo_u32 v8, s8, v4
-; CGP-NEXT: v_mul_lo_u32 v11, -1, v4
-; CGP-NEXT: v_mul_hi_u32 v12, s8, v4
-; CGP-NEXT: v_add_i32_e64 v5, s[4:5], v5, v13
-; CGP-NEXT: v_addc_u32_e64 v13, s[6:7], v7, v9, s[4:5]
-; CGP-NEXT: v_add_i32_e64 v7, s[6:7], v7, v9
-; CGP-NEXT: v_mul_lo_u32 v9, s8, v5
-; CGP-NEXT: v_mul_lo_u32 v14, -1, v5
-; CGP-NEXT: v_mul_hi_u32 v15, s8, v5
-; CGP-NEXT: v_mul_lo_u32 v16, s8, v10
-; CGP-NEXT: v_mul_lo_u32 v17, v10, v8
-; CGP-NEXT: v_mul_hi_u32 v18, v4, v8
-; CGP-NEXT: v_mul_hi_u32 v8, v10, v8
-; CGP-NEXT: v_mul_lo_u32 v19, s8, v13
-; CGP-NEXT: v_add_i32_e64 v11, s[6:7], v11, v16
-; CGP-NEXT: v_mul_lo_u32 v16, v13, v9
-; CGP-NEXT: v_add_i32_e64 v14, s[6:7], v14, v19
-; CGP-NEXT: v_mul_hi_u32 v19, v5, v9
-; CGP-NEXT: v_mul_hi_u32 v9, v13, v9
-; CGP-NEXT: v_add_i32_e64 v11, s[6:7], v11, v12
-; CGP-NEXT: v_add_i32_e64 v12, s[6:7], v14, v15
-; CGP-NEXT: v_mul_lo_u32 v14, v4, v11
-; CGP-NEXT: v_mul_lo_u32 v15, v5, v12
-; CGP-NEXT: v_add_i32_e64 v15, s[6:7], v16, v15
-; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[6:7]
-; CGP-NEXT: v_add_i32_e64 v15, s[6:7], v15, v19
-; CGP-NEXT: v_mul_lo_u32 v15, v10, v11
-; CGP-NEXT: v_mul_hi_u32 v19, v4, v11
-; CGP-NEXT: v_mul_hi_u32 v10, v10, v11
-; CGP-NEXT: v_mul_lo_u32 v11, v13, v12
-; CGP-NEXT: v_mul_hi_u32 v13, v13, v12
-; CGP-NEXT: v_mul_hi_u32 v12, v5, v12
-; CGP-NEXT: v_add_i32_e64 v14, s[8:9], v17, v14
-; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[8:9]
-; CGP-NEXT: v_add_i32_e64 v8, s[8:9], v15, v8
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[8:9]
-; CGP-NEXT: v_add_i32_e64 v9, s[8:9], v11, v9
-; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[8:9]
-; CGP-NEXT: v_add_i32_e64 v14, s[8:9], v14, v18
-; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[8:9]
-; CGP-NEXT: v_add_i32_e64 v8, s[8:9], v8, v19
-; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[8:9]
-; CGP-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[6:7]
-; CGP-NEXT: v_add_i32_e64 v9, s[6:7], v9, v12
-; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[6:7]
-; CGP-NEXT: v_add_i32_e64 v14, s[6:7], v17, v14
-; CGP-NEXT: v_add_i32_e64 v15, s[6:7], v15, v18
-; CGP-NEXT: v_add_i32_e64 v16, s[6:7], v16, v19
-; CGP-NEXT: v_add_i32_e64 v11, s[6:7], v11, v12
-; CGP-NEXT: v_add_i32_e64 v8, s[6:7], v8, v14
-; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[6:7]
-; CGP-NEXT: v_add_i32_e64 v9, s[6:7], v9, v16
-; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[6:7]
-; CGP-NEXT: v_add_i32_e64 v12, s[6:7], v15, v12
-; CGP-NEXT: v_add_i32_e64 v11, s[6:7], v11, v14
-; CGP-NEXT: v_add_i32_e64 v10, s[6:7], v10, v12
-; CGP-NEXT: v_add_i32_e64 v11, s[6:7], v13, v11
-; CGP-NEXT: v_addc_u32_e32 v6, vcc, v6, v10, vcc
-; CGP-NEXT: v_addc_u32_e64 v7, vcc, v7, v11, s[4:5]
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8
-; CGP-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc
-; CGP-NEXT: v_mul_lo_u32 v8, v3, v4
-; CGP-NEXT: v_mul_hi_u32 v10, v2, v4
-; CGP-NEXT: v_mul_hi_u32 v4, v3, v4
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9
-; CGP-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
-; CGP-NEXT: v_mul_lo_u32 v9, v1, v5
-; CGP-NEXT: v_mul_hi_u32 v11, v0, v5
-; CGP-NEXT: v_mul_hi_u32 v5, v1, v5
-; CGP-NEXT: v_mul_lo_u32 v12, v2, v6
-; CGP-NEXT: v_mul_lo_u32 v13, v3, v6
-; CGP-NEXT: v_mul_hi_u32 v14, v2, v6
-; CGP-NEXT: v_mul_hi_u32 v6, v3, v6
-; CGP-NEXT: v_mul_lo_u32 v15, v0, v7
-; CGP-NEXT: v_mul_lo_u32 v16, v1, v7
-; CGP-NEXT: v_mul_hi_u32 v17, v0, v7
-; CGP-NEXT: v_mul_hi_u32 v7, v1, v7
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v12
-; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v15
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v16, v5
-; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10
-; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v14
-; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11
-; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v17
-; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v12, v8
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v13, v10
-; CGP-NEXT: v_add_i32_e32 v9, vcc, v15, v9
-; CGP-NEXT: v_add_i32_e32 v11, vcc, v16, v11
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8
-; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9
-; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v10, v8
-; CGP-NEXT: v_mul_lo_u32 v10, s10, v4
-; CGP-NEXT: v_mul_lo_u32 v12, 0, v4
-; CGP-NEXT: v_mul_hi_u32 v4, s10, v4
-; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9
-; CGP-NEXT: v_mul_lo_u32 v11, s10, v5
-; CGP-NEXT: v_mul_lo_u32 v13, 0, v5
-; CGP-NEXT: v_mul_hi_u32 v5, s10, v5
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v8
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9
-; CGP-NEXT: v_mul_lo_u32 v6, s10, v6
-; CGP-NEXT: v_mul_lo_u32 v7, s10, v7
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v12, v6
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v13, v7
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v6, v4
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5
-; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v10
-; CGP-NEXT: v_subb_u32_e64 v6, s[4:5], v3, v4, vcc
-; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v4
-; CGP-NEXT: v_cmp_le_u32_e64 s[4:5], s10, v2
-; CGP-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5]
-; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v11
-; CGP-NEXT: v_subb_u32_e64 v7, s[6:7], v1, v5, s[4:5]
-; CGP-NEXT: v_sub_i32_e64 v1, s[6:7], v1, v5
-; CGP-NEXT: v_cmp_le_u32_e64 s[6:7], s10, v0
-; CGP-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[6:7]
-; CGP-NEXT: v_cmp_le_u32_e64 s[6:7], 0, v6
-; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[6:7]
-; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
-; CGP-NEXT: v_cmp_le_u32_e32 vcc, 0, v7
-; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc
-; CGP-NEXT: v_subbrev_u32_e64 v1, vcc, 0, v1, s[4:5]
-; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6
-; CGP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
-; CGP-NEXT: v_subrev_i32_e32 v8, vcc, s10, v2
-; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
-; CGP-NEXT: v_cmp_le_u32_e32 vcc, s10, v8
-; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc
-; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
-; CGP-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc
-; CGP-NEXT: v_subrev_i32_e32 v9, vcc, s10, v0
-; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CGP-NEXT: v_cmp_le_u32_e32 vcc, s10, v9
-; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc
-; CGP-NEXT: v_cmp_le_u32_e32 vcc, 0, v3
-; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc
-; CGP-NEXT: v_subrev_i32_e32 v13, vcc, s10, v8
-; CGP-NEXT: v_subbrev_u32_e32 v14, vcc, 0, v3, vcc
-; CGP-NEXT: v_cmp_le_u32_e32 vcc, 0, v1
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, -1, vcc
-; CGP-NEXT: v_subrev_i32_e32 v16, vcc, s10, v9
-; CGP-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v1, vcc
-; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
-; CGP-NEXT: v_cndmask_b32_e32 v10, v12, v10, vcc
-; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; CGP-NEXT: v_cndmask_b32_e32 v11, v15, v11, vcc
-; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
-; CGP-NEXT: v_cndmask_b32_e32 v8, v8, v13, vcc
-; CGP-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v11
-; CGP-NEXT: v_cndmask_b32_e64 v9, v9, v16, s[4:5]
-; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v14, vcc
-; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
-; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc
-; CGP-NEXT: v_cndmask_b32_e64 v1, v1, v17, s[4:5]
-; CGP-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v5
-; CGP-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[4:5]
-; CGP-NEXT: v_cndmask_b32_e64 v1, v7, v1, s[4:5]
-; CGP-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc
+; CGP-NEXT: s_movk_i32 s4, 0x1000
+; CGP-NEXT: s_add_u32 s5, s4, -1
+; CGP-NEXT: s_cselect_b32 s6, 1, 0
+; CGP-NEXT: s_and_b32 s6, s6, 1
+; CGP-NEXT: s_cmp_lg_u32 s6, 0
+; CGP-NEXT: s_addc_u32 s6, 0, -1
+; CGP-NEXT: s_add_u32 s4, s4, -1
+; CGP-NEXT: s_cselect_b32 s7, 1, 0
+; CGP-NEXT: v_and_b32_e32 v0, s5, v0
+; CGP-NEXT: s_and_b32 s5, s7, 1
+; CGP-NEXT: v_and_b32_e32 v1, s6, v1
+; CGP-NEXT: s_cmp_lg_u32 s5, 0
+; CGP-NEXT: s_addc_u32 s5, 0, -1
+; CGP-NEXT: v_and_b32_e32 v2, s4, v2
+; CGP-NEXT: v_and_b32_e32 v3, s5, v3
; CGP-NEXT: s_setpc_b64 s[30:31]
%result = urem <2 x i64> %num, <i64 4096, i64 4096>
ret <2 x i64> %result
More information about the llvm-branch-commits
mailing list