[llvm] [GlobalISel] Combine [S,U]SUBO (PR #116489)
via llvm-commits
llvm-commits at lists.llvm.org
Sat Nov 16 07:58:39 PST 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Thorsten Schütt (tschuett)
<details>
<summary>Changes</summary>
We import the llvm.ssub.with.overflow.* intrinsics, but the legalizer also builds the corresponding overflow opcodes while legalizing other opcodes (see narrowScalarAddSub).
---
The patch is 90.57 KiB and is truncated to 20.00 KiB below; the full version is available at https://github.com/llvm/llvm-project/pull/116489.diff
10 Files Affected:
- (modified) llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h (+3)
- (modified) llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h (+17)
- (modified) llvm/include/llvm/Target/GlobalISel/Combine.td (+15-2)
- (modified) llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp (+72)
- (modified) llvm/lib/Target/AArch64/AArch64Combine.td (+2-2)
- (modified) llvm/test/CodeGen/AArch64/GlobalISel/combine-overflow.mir (+46)
- (modified) llvm/test/CodeGen/AArch64/popcount.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll (+179-179)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll (+394-394)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll (+144-147)
``````````diff
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index b1232a368a3657..55c3b72c8e027f 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -939,6 +939,9 @@ class CombinerHelper {
// merge_values(_, zero) -> zext
bool matchMergeXAndZero(const MachineInstr &MI, BuildFnTy &MatchInfo);
+ // overflow sub
+ bool matchSuboCarryOut(const MachineInstr &MI, BuildFnTy &MatchInfo);
+
private:
/// Checks for legality of an indexed variant of \p LdSt.
bool isIndexedLoadStoreLegal(GLoadStore &LdSt) const;
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h b/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h
index 4de14dee190fb3..9e5d4d34f24d2b 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h
@@ -486,6 +486,23 @@ class GAddCarryOut : public GBinOpCarryOut {
}
};
+/// Represents overflowing sub operations.
+/// G_USUBO, G_SSUBO
+class GSubCarryOut : public GBinOpCarryOut {
+public:
+ bool isSigned() const { return getOpcode() == TargetOpcode::G_SSUBO; }
+
+ static bool classof(const MachineInstr *MI) {
+ switch (MI->getOpcode()) {
+ case TargetOpcode::G_USUBO:
+ case TargetOpcode::G_SSUBO:
+ return true;
+ default:
+ return false;
+ }
+ }
+};
+
/// Represents overflowing add/sub operations that also consume a carry-in.
/// G_UADDE, G_SADDE, G_USUBE, G_SSUBE
class GAddSubCarryInOut : public GAddSubCarryOut {
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index f8379609bf1d98..b4f1551e965b14 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -1385,6 +1385,12 @@ def match_addos : GICombineRule<
[{ return Helper.matchAddOverflow(*${root}, ${matchinfo}); }]),
(apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>;
+def match_subos : GICombineRule<
+ (defs root:$root, build_fn_matchinfo:$matchinfo),
+ (match (wip_match_opcode G_SSUBO, G_USUBO):$root,
+ [{ return Helper.matchSuboCarryOut(*${root}, ${matchinfo}); }]),
+ (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>;
+
def match_extract_of_element_undef_vector: GICombineRule <
(defs root:$root),
(match (G_IMPLICIT_DEF $vector),
@@ -1901,6 +1907,13 @@ def cmp_combines: GICombineGroup<[
redundant_binop_in_equality
]>;
+
+def artifact_combines: GICombineGroup<[
+ merge_combines,
+ match_addos,
+ match_subos
+]>;
+
// FIXME: These should use the custom predicate feature once it lands.
def undef_combines : GICombineGroup<[undef_to_fp_zero, undef_to_int_zero,
undef_to_negative_one,
@@ -1984,9 +1997,9 @@ def all_combines : GICombineGroup<[integer_reassoc_combines, trivial_combines,
and_or_disjoint_mask, fma_combines, fold_binop_into_select,
sub_add_reg, select_to_minmax,
fsub_to_fneg, commute_constant_to_rhs, match_ands, match_ors,
- combine_concat_vector, match_addos,
+ combine_concat_vector,
sext_trunc, zext_trunc, prefer_sign_combines, shuffle_combines,
- combine_use_vector_truncate, merge_combines]>;
+ combine_use_vector_truncate, artifact_combines]>;
// A combine group used to for prelegalizer combiners at -O0. The combines in
// this group have been selected based on experiments to balance code size and
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp
index 797a1e84e21e35..85c56ee6863a95 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp
@@ -84,3 +84,75 @@ bool CombinerHelper::matchMergeXAndZero(const MachineInstr &MI,
};
return true;
}
+
+bool CombinerHelper::matchSuboCarryOut(const MachineInstr &MI,
+ BuildFnTy &MatchInfo) {
+ const GSubCarryOut *Subo = cast<GSubCarryOut>(&MI);
+
+ Register Dst = Subo->getReg(0);
+ Register LHS = Subo->getLHSReg();
+ Register RHS = Subo->getRHSReg();
+ Register Carry = Subo->getCarryOutReg();
+ LLT DstTy = MRI.getType(Dst);
+ LLT CarryTy = MRI.getType(Carry);
+
+ // Check legality before known bits.
+ if (!isLegalOrBeforeLegalizer({TargetOpcode::G_SUB, {DstTy}}) ||
+ !isConstantLegalOrBeforeLegalizer(CarryTy))
+ return false;
+
+ if (Subo->isSigned()) {
+ // G_SSUBO
+ ConstantRange KBLHS = ConstantRange::fromKnownBits(KB->getKnownBits(LHS),
+ /* IsSigned= */ true);
+ ConstantRange KBRHS = ConstantRange::fromKnownBits(KB->getKnownBits(RHS),
+ /* IsSigned= */ true);
+ switch (KBLHS.signedSubMayOverflow(KBRHS)) {
+ case ConstantRange::OverflowResult::MayOverflow:
+ return false;
+ case ConstantRange::OverflowResult::NeverOverflows: {
+ MatchInfo = [=](MachineIRBuilder &B) {
+ B.buildSub(Dst, LHS, RHS, MachineInstr::MIFlag::NoSWrap);
+ B.buildConstant(Carry, 0);
+ };
+ return true;
+ }
+ case ConstantRange::OverflowResult::AlwaysOverflowsLow:
+ case ConstantRange::OverflowResult::AlwaysOverflowsHigh: {
+ MatchInfo = [=](MachineIRBuilder &B) {
+ B.buildSub(Dst, LHS, RHS);
+ B.buildConstant(Carry, 1);
+ };
+ return true;
+ }
+ }
+ return false;
+ }
+
+ // G_USUBO
+ ConstantRange KBLHS = ConstantRange::fromKnownBits(KB->getKnownBits(LHS),
+ /* IsSigned= */ false);
+ ConstantRange KBRHS = ConstantRange::fromKnownBits(KB->getKnownBits(RHS),
+ /* IsSigned= */ false);
+ switch (KBLHS.unsignedSubMayOverflow(KBRHS)) {
+ case ConstantRange::OverflowResult::MayOverflow:
+ return false;
+ case ConstantRange::OverflowResult::NeverOverflows: {
+ MatchInfo = [=](MachineIRBuilder &B) {
+ B.buildSub(Dst, LHS, RHS, MachineInstr::MIFlag::NoUWrap);
+ B.buildConstant(Carry, 0);
+ };
+ return true;
+ }
+ case ConstantRange::OverflowResult::AlwaysOverflowsLow:
+ case ConstantRange::OverflowResult::AlwaysOverflowsHigh: {
+ MatchInfo = [=](MachineIRBuilder &B) {
+ B.buildSub(Dst, LHS, RHS);
+ B.buildConstant(Carry, 1);
+ };
+ return true;
+ }
+ }
+
+ return false;
+}
diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td
index 8af8cdfeba6ac4..23563bf9b7881f 100644
--- a/llvm/lib/Target/AArch64/AArch64Combine.td
+++ b/llvm/lib/Target/AArch64/AArch64Combine.td
@@ -320,7 +320,7 @@ def AArch64PostLegalizerCombiner
hoist_logic_op_with_same_opcode_hands,
redundant_and, xor_of_and_with_same_reg,
extractvecelt_pairwise_add, redundant_or,
- mul_const, redundant_sext_inreg,
+ mul_const, redundant_sext_inreg, artifact_combines,
form_bitfield_extract, rotate_out_of_range,
icmp_to_true_false_known_bits,
select_combines, fold_merge_to_zext,
@@ -328,7 +328,7 @@ def AArch64PostLegalizerCombiner
ptr_add_immed_chain, overlapping_and,
split_store_zero_128, undef_combines,
select_to_minmax, or_to_bsp, combine_concat_vector,
- commute_constant_to_rhs, merge_combines,
+ commute_constant_to_rhs,
push_freeze_to_prevent_poison_from_propagating,
combine_mul_cmlt, combine_use_vector_truncate]> {
}
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-overflow.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-overflow.mir
index bc4b5ae7c066a6..87b30f558539c8 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-overflow.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-overflow.mir
@@ -176,3 +176,49 @@ body: |
$q1 = COPY %o_wide
RET_ReallyLR implicit $w0
...
+---
+name: sub_may
+body: |
+ bb.0:
+ liveins: $w0, $w1
+ ; CHECK-LABEL: name: sub_may
+ ; CHECK: liveins: $w0, $w1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+ ; CHECK-NEXT: %const:_(s32) = G_CONSTANT i32 512
+ ; CHECK-NEXT: %sub:_(s32), %o:_(s1) = G_SSUBO [[COPY]], %const
+ ; CHECK-NEXT: %o_wide:_(s32) = G_ZEXT %o(s1)
+ ; CHECK-NEXT: $w0 = COPY %sub(s32)
+ ; CHECK-NEXT: $w1 = COPY %o_wide(s32)
+ ; CHECK-NEXT: RET_ReallyLR implicit $w0
+ %0:_(s32) = COPY $w0
+ %const:_(s32) = G_CONSTANT i32 512
+ %sub:_(s32), %o:_(s1) = G_SSUBO %0, %const
+ %o_wide:_(s32) = G_ZEXT %o(s1)
+ $w0 = COPY %sub(s32)
+ $w1 = COPY %o_wide
+ RET_ReallyLR implicit $w0
+...
+---
+name: usub_may
+body: |
+ bb.0:
+ liveins: $w0, $w1
+ ; CHECK-LABEL: name: usub_may
+ ; CHECK: liveins: $w0, $w1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+ ; CHECK-NEXT: %const:_(s32) = G_CONSTANT i32 512
+ ; CHECK-NEXT: %sub:_(s32), %o:_(s1) = G_USUBO [[COPY]], %const
+ ; CHECK-NEXT: %o_wide:_(s32) = G_ZEXT %o(s1)
+ ; CHECK-NEXT: $w0 = COPY %sub(s32)
+ ; CHECK-NEXT: $w1 = COPY %o_wide(s32)
+ ; CHECK-NEXT: RET_ReallyLR implicit $w0
+ %0:_(s32) = COPY $w0
+ %const:_(s32) = G_CONSTANT i32 512
+ %sub:_(s32), %o:_(s1) = G_USUBO %0, %const
+ %o_wide:_(s32) = G_ZEXT %o(s1)
+ $w0 = COPY %sub(s32)
+ $w1 = COPY %o_wide
+ RET_ReallyLR implicit $w0
+...
diff --git a/llvm/test/CodeGen/AArch64/popcount.ll b/llvm/test/CodeGen/AArch64/popcount.ll
index 1fc4de1c48b7dd..f9f1cd4b1fcf76 100644
--- a/llvm/test/CodeGen/AArch64/popcount.ll
+++ b/llvm/test/CodeGen/AArch64/popcount.ll
@@ -113,9 +113,9 @@ define i16 @popcount256(ptr nocapture nonnull readonly %0) {
;
; GISEL-LABEL: popcount256:
; GISEL: // %bb.0: // %Entry
-; GISEL-NEXT: ldp x8, x9, [x0, #16]
+; GISEL-NEXT: ldp x8, x9, [x0]
; GISEL-NEXT: mov v0.d[0], x8
-; GISEL-NEXT: ldp x8, x10, [x0]
+; GISEL-NEXT: ldp x8, x10, [x0, #16]
; GISEL-NEXT: mov v1.d[0], x8
; GISEL-NEXT: mov v0.d[1], x9
; GISEL-NEXT: mov v1.d[1], x10
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
index 613c73f7b9368b..14b30e0d79946c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
@@ -1178,212 +1178,212 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_cvt_f32_u32_e32 v4, 0x12d8fb
; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v5, 0
-; GISEL-NEXT: s_sub_u32 s6, 0, 0x12d8fb
-; GISEL-NEXT: s_subb_u32 s7, 0, 0
+; GISEL-NEXT: v_mov_b32_e32 v6, 0xffed2705
+; GISEL-NEXT: s_mov_b32 s6, 1
; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5
; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4
+; GISEL-NEXT: s_cmp_lg_u32 s6, 0
+; GISEL-NEXT: s_subb_u32 s6, 0, 0
; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4
-; GISEL-NEXT: v_trunc_f32_e32 v6, v5
-; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v6
-; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v4
-; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v6
-; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], s6, v5, 0
-; GISEL-NEXT: v_mov_b32_e32 v4, v9
-; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], s6, v7, v[4:5]
-; GISEL-NEXT: v_mul_lo_u32 v4, v7, v8
-; GISEL-NEXT: v_mul_hi_u32 v6, v5, v8
-; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], s7, v5, v[9:10]
-; GISEL-NEXT: v_mul_hi_u32 v8, v7, v8
-; GISEL-NEXT: v_mul_lo_u32 v10, v5, v9
-; GISEL-NEXT: v_mul_lo_u32 v11, v7, v9
-; GISEL-NEXT: v_mul_hi_u32 v12, v5, v9
-; GISEL-NEXT: v_mul_hi_u32 v9, v7, v9
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v10
-; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v6
-; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v10, v4
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v11, v8
+; GISEL-NEXT: v_trunc_f32_e32 v8, v5
+; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v8
+; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v4
+; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v8
+; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0
+; GISEL-NEXT: v_mov_b32_e32 v8, v5
+; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v9, v[8:9]
+; GISEL-NEXT: v_mul_hi_u32 v12, v9, v4
+; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v7, v[10:11]
+; GISEL-NEXT: v_mul_lo_u32 v10, v9, v4
+; GISEL-NEXT: v_mul_hi_u32 v11, v7, v4
+; GISEL-NEXT: v_mul_lo_u32 v8, v7, v13
+; GISEL-NEXT: v_mul_lo_u32 v4, v9, v13
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v10, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11
; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v12
-; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v6, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v8, v6
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v5, v4
-; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], s6, v11, 0
-; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v7, v6, vcc
-; GISEL-NEXT: v_mov_b32_e32 v4, v9
-; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], s6, v12, v[4:5]
-; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v1
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6
-; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], s7, v11, v[9:10]
-; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc
-; GISEL-NEXT: v_xor_b32_e32 v10, v0, v6
-; GISEL-NEXT: v_mul_lo_u32 v0, v12, v8
-; GISEL-NEXT: v_mul_lo_u32 v4, v11, v9
-; GISEL-NEXT: v_xor_b32_e32 v13, v1, v6
-; GISEL-NEXT: v_mul_hi_u32 v1, v11, v8
-; GISEL-NEXT: v_mul_hi_u32 v8, v12, v8
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v14, v8
+; GISEL-NEXT: v_mul_hi_u32 v14, v7, v13
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v14
+; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14
+; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v14, v8
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v8
+; GISEL-NEXT: v_add_i32_e32 v16, vcc, v7, v4
+; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0
+; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v9, v8, vcc
+; GISEL-NEXT: v_mov_b32_e32 v4, v14
+; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5]
+; GISEL-NEXT: v_ashrrev_i32_e32 v8, 31, v1
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8
+; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], s6, v16, v[14:15]
+; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v8, vcc
+; GISEL-NEXT: v_xor_b32_e32 v15, v0, v8
+; GISEL-NEXT: v_mul_lo_u32 v0, v17, v13
+; GISEL-NEXT: v_mul_lo_u32 v4, v16, v14
+; GISEL-NEXT: v_xor_b32_e32 v18, v1, v8
+; GISEL-NEXT: v_mul_hi_u32 v1, v16, v13
+; GISEL-NEXT: v_mul_hi_u32 v13, v17, v13
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4
; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v1, v12, v9
+; GISEL-NEXT: v_mul_lo_u32 v1, v17, v14
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0
-; GISEL-NEXT: v_mul_hi_u32 v4, v11, v9
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v8
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT: v_mul_hi_u32 v4, v16, v14
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v13
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v4
; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v8, v4
-; GISEL-NEXT: v_mul_hi_u32 v8, v12, v9
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v13, v4
+; GISEL-NEXT: v_mul_hi_u32 v13, v17, v14
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v8, v1
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0
-; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v12, v1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v8, v13, v0
-; GISEL-NEXT: v_mul_lo_u32 v9, v10, v1
-; GISEL-NEXT: v_mul_hi_u32 v11, v10, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0
+; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v13, v18, v0
+; GISEL-NEXT: v_mul_lo_u32 v14, v15, v1
+; GISEL-NEXT: v_mul_hi_u32 v16, v15, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0
; GISEL-NEXT: v_mov_b32_e32 v4, 0x12d8fb
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v11, v13, v1
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8
-; GISEL-NEXT: v_mul_hi_u32 v9, v10, v1
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0
-; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v8
-; GISEL-NEXT: v_mul_hi_u32 v12, v13, v1
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v11, 0
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v8
-; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v12, v[1:2]
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v10, v0
-; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v13, v8, vcc
-; GISEL-NEXT: v_sub_i32_e64 v8, s[4:5], v13, v8
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14
+; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v16
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v16, v18, v1
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13
+; GISEL-NEXT: v_mul_hi_u32 v14, v15, v1
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0
+; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14
+; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v16, v14
+; GISEL-NEXT: v_add_i32_e32 v16, vcc, v0, v13
+; GISEL-NEXT: v_mul_hi_u32 v17, v18, v1
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v16, 0
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13
+; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v13
+; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v17, v[1:2]
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v15, v0
+; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v18, v13, vcc
+; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], v18, v13
; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5]
; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1
-; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v8, vcc
-; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v0, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v10, -1, v9, s[4:5]
-; GISEL-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v1, vcc
-; GISEL-NEXT: s_sub_u32 s6, 0, 0x12d8fb
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, 1, v11
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v5, 0
-; GISEL-NEXT: v_addc_u32_e32 v14, vcc, 0, v12, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v8, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc
-; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9
-; GISEL-NEXT: v_cndmask_b32_e32 v15, -1, v8,...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/116489
More information about the llvm-commits mailing list