[llvm] [GlobalISel] Allow expansion of sdiv -> mul by constant (PR #146504)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Jul 1 03:35:41 PDT 2025
llvmbot wrote:
@llvm/pr-subscribers-backend-aarch64
Author: None (jyli0116)
Allows expansion of the sdiv->mul by constant combine in the general case. Previously this occurred only when the division was exact. This is part of the resolution to issue #118090
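For readers unfamiliar with the transform: in the general case the division is replaced by a multiply-high against a precomputed magic constant, an optional add/subtract of the numerator, an arithmetic shift, and a sign-bit fixup. A minimal C++ sketch of that sequence for a fixed divisor of 7 follows; the helper name and the choice of divisor are illustrative, not taken from the patch, while Magic = 0x92492493 and Shift = 2 are the standard 32-bit magic values for 7 from Hacker's Delight:

```cpp
#include <cassert>
#include <cstdint>

// Hand-expanded signed division by 7, mirroring the G_SMULH / G_ASHR
// sequence the combine emits.
int32_t sdiv7(int32_t N) {
  const int32_t Magic = int32_t(0x92492493u);
  int32_t Q = int32_t((int64_t(N) * Magic) >> 32); // G_SMULH: high half
  Q += N;      // divisor > 0 and magic < 0, so add the numerator
  Q >>= 2;     // G_ASHR by the magic shift amount (arithmetic shift)
  Q += int32_t(uint32_t(Q) >> 31); // add the sign bit to round toward zero
  return Q;
}

int main() {
  for (int32_t N : {-100, -8, -7, -1, 0, 1, 6, 7, 100})
    assert(sdiv7(N) == N / 7);
}
```

The ShiftMask constant in the patch generalizes the final sign-bit add: it is all-ones for ordinary divisors and zero for +1/-1, so divisors that need no fixup can share the same (vector) code path.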
---
Patch is 115.38 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/146504.diff
12 Files Affected:
- (modified) llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h (+4)
- (modified) llvm/include/llvm/Target/GlobalISel/Combine.td (+2-2)
- (modified) llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp (+114-20)
- (added) llvm/test/CodeGen/AArch64/GlobalISel/combine-sdiv.ll (+1663)
- (modified) llvm/test/CodeGen/AArch64/GlobalISel/combine-sdiv.mir (+8-3)
- (modified) llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll (+6-3)
- (modified) llvm/test/CodeGen/AArch64/arm64-neon-mul-div-cte.ll (+24-90)
- (modified) llvm/test/CodeGen/AArch64/select_const.ll (+3-8)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fold-binop-into-select.mir (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll (+8-8)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll (+48-77)
- (modified) llvm/test/CodeGen/RISCV/GlobalISel/div-by-constant.ll (+407-104)
``````````diff
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index c15263e0b06f8..5568139af98d2 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -143,6 +143,10 @@ class CombinerHelper {
/// Query is legal on the target.
bool isLegalOrBeforeLegalizer(const LegalityQuery &Query) const;
+ /// \return true if \p Query is legal on the target, or if \p Query will
+ /// perform WidenScalar action on the target.
+ bool isLegalorHasWidenScalar(const LegalityQuery &Query) const;
+
/// \return true if the combine is running prior to legalization, or if \p Ty
/// is a legal integer constant type on the target.
bool isConstantLegalOrBeforeLegalizer(const LLT Ty) const;
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 4a92dc16c1bf4..347a5f85affb1 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -2046,9 +2046,9 @@ def all_combines : GICombineGroup<[integer_reassoc_combines, trivial_combines,
div_rem_to_divrem, funnel_shift_combines, bitreverse_shift, commute_shift,
form_bitfield_extract, constant_fold_binops, constant_fold_fma,
constant_fold_cast_op, fabs_fneg_fold,
- intdiv_combines, mulh_combines, redundant_neg_operands,
+ mulh_combines, redundant_neg_operands,
and_or_disjoint_mask, fma_combines, fold_binop_into_select,
- sub_add_reg, select_to_minmax,
+ intdiv_combines, sub_add_reg, select_to_minmax,
fsub_to_fneg, commute_constant_to_rhs, match_ands, match_ors,
simplify_neg_minmax, combine_concat_vector,
sext_trunc, zext_trunc, prefer_sign_combines, shuffle_combines,
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 05dd269d48921..b866982faa1d2 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -162,6 +162,11 @@ bool CombinerHelper::isLegalOrBeforeLegalizer(
return isPreLegalize() || isLegal(Query);
}
+bool CombinerHelper::isLegalorHasWidenScalar(const LegalityQuery &Query) const {
+ return isLegal(Query) ||
+ LI->getAction(Query).Action == LegalizeActions::WidenScalar;
+}
+
bool CombinerHelper::isConstantLegalOrBeforeLegalizer(const LLT Ty) const {
if (!Ty.isVector())
return isLegalOrBeforeLegalizer({TargetOpcode::G_CONSTANT, {Ty}});
@@ -5510,6 +5515,8 @@ bool CombinerHelper::matchSDivByConst(MachineInstr &MI) const {
Register Dst = MI.getOperand(0).getReg();
Register RHS = MI.getOperand(2).getReg();
LLT DstTy = MRI.getType(Dst);
+ auto SizeInBits = DstTy.getScalarSizeInBits();
+ LLT WideTy = DstTy.changeElementSize(SizeInBits * 2);
auto &MF = *MI.getMF();
AttributeList Attr = MF.getFunction().getAttributes();
@@ -5529,8 +5536,21 @@ bool CombinerHelper::matchSDivByConst(MachineInstr &MI) const {
MRI, RHS, [](const Constant *C) { return C && !C->isNullValue(); });
}
- // Don't support the general case for now.
- return false;
+ auto *RHSDef = MRI.getVRegDef(RHS);
+ if (!isConstantOrConstantVector(*RHSDef, MRI))
+ return false;
+
+ // Don't do this if the types are not going to be legal.
+ if (LI) {
+ if (!isLegalOrBeforeLegalizer({TargetOpcode::G_MUL, {DstTy, DstTy}}))
+ return false;
+ if (!isLegal({TargetOpcode::G_SMULH, {DstTy}}) &&
+ !isLegalorHasWidenScalar({TargetOpcode::G_MUL, {WideTy, WideTy}}))
+ return false;
+ }
+
+ return matchUnaryPredicate(
+ MRI, RHS, [](const Constant *C) { return C && !C->isNullValue(); });
}
void CombinerHelper::applySDivByConst(MachineInstr &MI) const {
@@ -5546,21 +5566,22 @@ MachineInstr *CombinerHelper::buildSDivUsingMul(MachineInstr &MI) const {
Register RHS = SDiv.getReg(2);
LLT Ty = MRI.getType(Dst);
LLT ScalarTy = Ty.getScalarType();
+ const unsigned EltBits = ScalarTy.getScalarSizeInBits();
LLT ShiftAmtTy = getTargetLowering().getPreferredShiftAmountTy(Ty);
LLT ScalarShiftAmtTy = ShiftAmtTy.getScalarType();
auto &MIB = Builder;
bool UseSRA = false;
- SmallVector<Register, 16> Shifts, Factors;
+ SmallVector<Register, 16> ExactShifts, ExactFactors;
- auto *RHSDef = cast<GenericMachineInstr>(getDefIgnoringCopies(RHS, MRI));
- bool IsSplat = getIConstantSplatVal(*RHSDef, MRI).has_value();
+ auto *RHSDefInstr = cast<GenericMachineInstr>(getDefIgnoringCopies(RHS, MRI));
+ bool IsSplat = getIConstantSplatVal(*RHSDefInstr, MRI).has_value();
- auto BuildSDIVPattern = [&](const Constant *C) {
+ auto BuildExactSDIVPattern = [&](const Constant *C) {
// Don't recompute inverses for each splat element.
- if (IsSplat && !Factors.empty()) {
- Shifts.push_back(Shifts[0]);
- Factors.push_back(Factors[0]);
+ if (IsSplat && !ExactFactors.empty()) {
+ ExactShifts.push_back(ExactShifts[0]);
+ ExactFactors.push_back(ExactFactors[0]);
return true;
}
@@ -5575,31 +5596,104 @@ MachineInstr *CombinerHelper::buildSDivUsingMul(MachineInstr &MI) const {
// Calculate the multiplicative inverse modulo BW.
// 2^W requires W + 1 bits, so we have to extend and then truncate.
APInt Factor = Divisor.multiplicativeInverse();
- Shifts.push_back(MIB.buildConstant(ScalarShiftAmtTy, Shift).getReg(0));
- Factors.push_back(MIB.buildConstant(ScalarTy, Factor).getReg(0));
+ ExactShifts.push_back(MIB.buildConstant(ScalarShiftAmtTy, Shift).getReg(0));
+ ExactFactors.push_back(MIB.buildConstant(ScalarTy, Factor).getReg(0));
return true;
};
- // Collect all magic values from the build vector.
+ if (MI.getFlag(MachineInstr::MIFlag::IsExact)) {
+ // Collect all magic values from the build vector.
+ bool Matched = matchUnaryPredicate(MRI, RHS, BuildExactSDIVPattern);
+ (void)Matched;
+ assert(Matched && "Expected unary predicate match to succeed");
+
+ Register Shift, Factor;
+ if (Ty.isVector()) {
+ Shift = MIB.buildBuildVector(ShiftAmtTy, ExactShifts).getReg(0);
+ Factor = MIB.buildBuildVector(Ty, ExactFactors).getReg(0);
+ } else {
+ Shift = ExactShifts[0];
+ Factor = ExactFactors[0];
+ }
+
+ Register Res = LHS;
+
+ if (UseSRA)
+ Res = MIB.buildAShr(Ty, Res, Shift, MachineInstr::IsExact).getReg(0);
+
+ return MIB.buildMul(Ty, Res, Factor);
+ }
+
+ SmallVector<Register, 16> MagicFactors, Factors, Shifts, ShiftMasks;
+
+ auto BuildSDIVPattern = [&](const Constant *C) {
+ auto *CI = cast<ConstantInt>(C);
+ const APInt &Divisor = CI->getValue();
+
+ SignedDivisionByConstantInfo magics =
+ SignedDivisionByConstantInfo::get(Divisor);
+ int NumeratorFactor = 0;
+ int ShiftMask = -1;
+
+ if (Divisor.isOne() || Divisor.isAllOnes()) {
+ // If d is +1/-1, we just multiply the numerator by +1/-1.
+ NumeratorFactor = Divisor.getSExtValue();
+ magics.Magic = 0;
+ magics.ShiftAmount = 0;
+ ShiftMask = 0;
+ } else if (Divisor.isStrictlyPositive() && magics.Magic.isNegative()) {
+ // If d > 0 and m < 0, add the numerator.
+ NumeratorFactor = 1;
+ } else if (Divisor.isNegative() && magics.Magic.isStrictlyPositive()) {
+ // If d < 0 and m > 0, subtract the numerator.
+ NumeratorFactor = -1;
+ }
+
+ MagicFactors.push_back(MIB.buildConstant(ScalarTy, magics.Magic).getReg(0));
+ Factors.push_back(MIB.buildConstant(ScalarTy, NumeratorFactor).getReg(0));
+ Shifts.push_back(
+ MIB.buildConstant(ScalarShiftAmtTy, magics.ShiftAmount).getReg(0));
+ ShiftMasks.push_back(MIB.buildConstant(ScalarTy, ShiftMask).getReg(0));
+
+ return true;
+ };
+
+ // Collect the shifts/magic values from each element.
bool Matched = matchUnaryPredicate(MRI, RHS, BuildSDIVPattern);
(void)Matched;
assert(Matched && "Expected unary predicate match to succeed");
- Register Shift, Factor;
- if (Ty.isVector()) {
- Shift = MIB.buildBuildVector(ShiftAmtTy, Shifts).getReg(0);
+ Register MagicFactor, Factor, Shift, ShiftMask;
+ auto *RHSDef = getOpcodeDef<GBuildVector>(RHS, MRI);
+ if (RHSDef) {
+ MagicFactor = MIB.buildBuildVector(Ty, MagicFactors).getReg(0);
Factor = MIB.buildBuildVector(Ty, Factors).getReg(0);
+ Shift = MIB.buildBuildVector(ShiftAmtTy, Shifts).getReg(0);
+ ShiftMask = MIB.buildBuildVector(Ty, ShiftMasks).getReg(0);
} else {
- Shift = Shifts[0];
+ assert(MRI.getType(RHS).isScalar() &&
+ "Non-build_vector operation should have been a scalar");
+ MagicFactor = MagicFactors[0];
Factor = Factors[0];
+ Shift = Shifts[0];
+ ShiftMask = ShiftMasks[0];
}
- Register Res = LHS;
+ Register Q = LHS;
+ Q = MIB.buildSMulH(Ty, LHS, MagicFactor).getReg(0);
+
+ // (Optionally) Add/subtract the numerator using Factor.
+ Factor = MIB.buildMul(Ty, LHS, Factor).getReg(0);
+ Q = MIB.buildAdd(Ty, Q, Factor).getReg(0);
- if (UseSRA)
- Res = MIB.buildAShr(Ty, Res, Shift, MachineInstr::IsExact).getReg(0);
+ // Shift right algebraic by shift value.
+ Q = MIB.buildAShr(Ty, Q, Shift).getReg(0);
- return MIB.buildMul(Ty, Res, Factor);
+ // Extract the sign bit, mask it and add it to the quotient.
+ auto SignShift = MIB.buildConstant(ShiftAmtTy, EltBits - 1);
+ auto T = MIB.buildLShr(Ty, Q, SignShift);
+ T = MIB.buildAnd(Ty, T, ShiftMask);
+ return MIB.buildAdd(Ty, Q, T);
}
bool CombinerHelper::matchDivByPow2(MachineInstr &MI, bool IsSigned) const {
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-sdiv.ll b/llvm/test/CodeGen/AArch64/GlobalISel/combine-sdiv.ll
new file mode 100644
index 0000000000000..b7dadf711fce1
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-sdiv.ll
@@ -0,0 +1,1663 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=aarch64-unknown-unknown -global-isel=0 | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc < %s -mtriple=aarch64-unknown-unknown -global-isel=1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+
+; These tests are taken from the combine-udiv.ll in X86.
+define i32 @combine_sdiv_by_one(i32 %x) {
+; CHECK-LABEL: combine_sdiv_by_one:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ret
+ %1 = sdiv i32 %x, 1
+ ret i32 %1
+}
+
+define <4 x i32> @combine_vec_sdiv_by_one(<4 x i32> %x) {
+; CHECK-SD-LABEL: combine_vec_sdiv_by_one:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: combine_vec_sdiv_by_one:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi v1.2d, #0000000000000000
+; CHECK-GI-NEXT: smull2 v2.2d, v0.4s, v1.4s
+; CHECK-GI-NEXT: smull v1.2d, v0.2s, v1.2s
+; CHECK-GI-NEXT: uzp2 v1.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT: add v0.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT: ret
+ %1 = sdiv <4 x i32> %x, <i32 1, i32 1, i32 1, i32 1>
+ ret <4 x i32> %1
+}
+
+define i32 @combine_sdiv_by_negone(i32 %x) {
+; CHECK-LABEL: combine_sdiv_by_negone:
+; CHECK: // %bb.0:
+; CHECK-NEXT: neg w0, w0
+; CHECK-NEXT: ret
+ %1 = sdiv i32 %x, -1
+ ret i32 %1
+}
+
+define <4 x i32> @combine_vec_sdiv_by_negone(<4 x i32> %x) {
+; CHECK-SD-LABEL: combine_vec_sdiv_by_negone:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: neg v0.4s, v0.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: combine_vec_sdiv_by_negone:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi v1.2d, #0000000000000000
+; CHECK-GI-NEXT: smull2 v2.2d, v0.4s, v1.4s
+; CHECK-GI-NEXT: smull v1.2d, v0.2s, v1.2s
+; CHECK-GI-NEXT: uzp2 v1.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT: sub v0.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT: ret
+ %1 = sdiv <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
+ ret <4 x i32> %1
+}
+
+define i32 @combine_sdiv_by_minsigned(i32 %x) {
+; CHECK-SD-LABEL: combine_sdiv_by_minsigned:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: mov w8, #-2147483648 // =0x80000000
+; CHECK-SD-NEXT: cmp w0, w8
+; CHECK-SD-NEXT: cset w0, eq
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: combine_sdiv_by_minsigned:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-GI-NEXT: sxtw x8, w0
+; CHECK-GI-NEXT: lsl x9, x8, #31
+; CHECK-GI-NEXT: sub x8, x9, x8
+; CHECK-GI-NEXT: asr x8, x8, #32
+; CHECK-GI-NEXT: sub w8, w8, w0
+; CHECK-GI-NEXT: asr w8, w8, #30
+; CHECK-GI-NEXT: add w0, w8, w8, lsr #31
+; CHECK-GI-NEXT: ret
+ %1 = sdiv i32 %x, -2147483648
+ ret i32 %1
+}
+
+define <4 x i32> @combine_vec_sdiv_by_minsigned(<4 x i32> %x) {
+; CHECK-SD-LABEL: combine_vec_sdiv_by_minsigned:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: movi v1.4s, #128, lsl #24
+; CHECK-SD-NEXT: movi v2.4s, #1
+; CHECK-SD-NEXT: cmeq v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: and v0.16b, v0.16b, v2.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: combine_vec_sdiv_by_minsigned:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: mvni v1.4s, #128, lsl #24
+; CHECK-GI-NEXT: smull2 v2.2d, v0.4s, v1.4s
+; CHECK-GI-NEXT: smull v1.2d, v0.2s, v1.2s
+; CHECK-GI-NEXT: uzp2 v1.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT: sub v1.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT: sshr v0.4s, v1.4s, #30
+; CHECK-GI-NEXT: ushr v0.4s, v0.4s, #31
+; CHECK-GI-NEXT: ssra v0.4s, v1.4s, #30
+; CHECK-GI-NEXT: ret
+ %1 = sdiv <4 x i32> %x, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
+ ret <4 x i32> %1
+}
+
+define i32 @combine_sdiv_zero(i32 %x) {
+; CHECK-LABEL: combine_sdiv_zero:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w0, wzr
+; CHECK-NEXT: ret
+ %1 = sdiv i32 0, %x
+ ret i32 %1
+}
+
+define <4 x i32> @combine_vec_sdiv_zero(<4 x i32> %x) {
+; CHECK-LABEL: combine_vec_sdiv_zero:
+; CHECK: // %bb.0:
+; CHECK-NEXT: movi v0.2d, #0000000000000000
+; CHECK-NEXT: ret
+ %1 = sdiv <4 x i32> zeroinitializer, %x
+ ret <4 x i32> %1
+}
+
+define i32 @combine_sdiv_dupe(i32 %x) {
+; CHECK-SD-LABEL: combine_sdiv_dupe:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: mov w0, #1 // =0x1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: combine_sdiv_dupe:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: sdiv w0, w0, w0
+; CHECK-GI-NEXT: ret
+ %1 = sdiv i32 %x, %x
+ ret i32 %1
+}
+
+define <4 x i32> @combine_vec_sdiv_dupe(<4 x i32> %x) {
+; CHECK-SD-LABEL: combine_vec_sdiv_dupe:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: movi v0.4s, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: combine_vec_sdiv_dupe:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: fmov w8, s0
+; CHECK-GI-NEXT: mov w9, v0.s[1]
+; CHECK-GI-NEXT: mov w10, v0.s[2]
+; CHECK-GI-NEXT: mov w11, v0.s[3]
+; CHECK-GI-NEXT: sdiv w8, w8, w8
+; CHECK-GI-NEXT: sdiv w9, w9, w9
+; CHECK-GI-NEXT: fmov s0, w8
+; CHECK-GI-NEXT: sdiv w10, w10, w10
+; CHECK-GI-NEXT: mov v0.s[1], w9
+; CHECK-GI-NEXT: sdiv w8, w11, w11
+; CHECK-GI-NEXT: mov v0.s[2], w10
+; CHECK-GI-NEXT: mov v0.s[3], w8
+; CHECK-GI-NEXT: ret
+ %1 = sdiv <4 x i32> %x, %x
+ ret <4 x i32> %1
+}
+
+define <4 x i32> @combine_vec_sdiv_by_pos0(<4 x i32> %x) {
+; CHECK-SD-LABEL: combine_vec_sdiv_by_pos0:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: movi v1.2d, #0x0000ff000000ff
+; CHECK-SD-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-SD-NEXT: ushr v0.4s, v0.4s, #2
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: combine_vec_sdiv_by_pos0:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi v1.4s, #1
+; CHECK-GI-NEXT: movi v2.2d, #0x0000ff000000ff
+; CHECK-GI-NEXT: fneg v1.4s, v1.4s
+; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b
+; CHECK-GI-NEXT: smull2 v2.2d, v0.4s, v1.4s
+; CHECK-GI-NEXT: smull v1.2d, v0.2s, v1.2s
+; CHECK-GI-NEXT: uzp2 v1.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT: add v1.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT: sshr v0.4s, v1.4s, #1
+; CHECK-GI-NEXT: ushr v0.4s, v0.4s, #31
+; CHECK-GI-NEXT: ssra v0.4s, v1.4s, #1
+; CHECK-GI-NEXT: ret
+ %1 = and <4 x i32> %x, <i32 255, i32 255, i32 255, i32 255>
+ %2 = sdiv <4 x i32> %1, <i32 4, i32 4, i32 4, i32 4>
+ ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_vec_sdiv_by_pos1(<4 x i32> %x) {
+; CHECK-SD-LABEL: combine_vec_sdiv_by_pos1:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: movi v1.2d, #0x0000ff000000ff
+; CHECK-SD-NEXT: adrp x8, .LCPI11_0
+; CHECK-SD-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-SD-NEXT: ldr q1, [x8, :lo12:.LCPI11_0]
+; CHECK-SD-NEXT: ushl v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: combine_vec_sdiv_by_pos1:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi v1.2d, #0x0000ff000000ff
+; CHECK-GI-NEXT: adrp x8, .LCPI11_2
+; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI11_2]
+; CHECK-GI-NEXT: adrp x8, .LCPI11_1
+; CHECK-GI-NEXT: smull2 v2.2d, v0.4s, v1.4s
+; CHECK-GI-NEXT: smull v1.2d, v0.2s, v1.2s
+; CHECK-GI-NEXT: uzp2 v1.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI11_1]
+; CHECK-GI-NEXT: adrp x8, .LCPI11_0
+; CHECK-GI-NEXT: add v0.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT: neg v1.4s, v2.4s
+; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI11_0]
+; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: ushr v1.4s, v0.4s, #31
+; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b
+; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: ret
+ %1 = and <4 x i32> %x, <i32 255, i32 255, i32 255, i32 255>
+ %2 = sdiv <4 x i32> %1, <i32 1, i32 4, i32 8, i32 16>
+ ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_vec_sdiv_by_pow2a(<4 x i32> %x) {
+; CHECK-SD-LABEL: combine_vec_sdiv_by_pow2a:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: cmlt v1.4s, v0.4s, #0
+; CHECK-SD-NEXT: usra v0.4s, v1.4s, #30
+; CHECK-SD-NEXT: sshr v0.4s, v0.4s, #2
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: combine_vec_sdiv_by_pow2a:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi v1.4s, #1
+; CHECK-GI-NEXT: fneg v1.4s, v1.4s
+; CHECK-GI-NEXT: smull2 v2.2d, v0.4s, v1.4s
+; CHECK-GI-NEXT: smull v1.2d, v0.2s, v1.2s
+; CHECK-GI-NEXT: uzp2 v1.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT: add v1.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT: sshr v0.4s, v1.4s, #1
+; CHECK-GI-NEXT: ushr v0.4s, v0.4s, #31
+; CHECK-GI-NEXT: ssra v0.4s, v1.4s, #1
+; CHECK-GI-NEXT: ret
+ %1 = sdiv <4 x i32> %x, <i32 4, i32 4, i32 4, i32 4>
+ ret <4 x i32> %1
+}
+
+define <4 x i32> @combine_vec_sdiv_by_pow2a_neg(<4 x i32> %x) {
+; CHECK-SD-LABEL: combine_vec_sdiv_by_pow2a_neg:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: cmlt v1.4s, v0.4s, #0
+; CHECK-SD-NEXT: usra v0.4s, v1.4s, #30
+; CHECK-SD-NEXT: sshr v0.4s, v0.4s, #2
+; CHECK-SD-NEXT: neg v0.4s, v0.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: combine_vec_sdiv_by_pow2a_neg:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: mvni v1.4s, #128, lsl #24
+; CHECK-GI-NEXT: smull2 v2.2d, v0.4s, v1.4s
+; CHECK-GI-NEXT: smull v1.2d, v0.2s, v1.2s
+; CHECK-GI-NEXT: uzp2 v1.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT: sub v1.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT: sshr v0.4s, v1.4s, #1
+; CHECK-GI-NEXT: ushr v0.4s, v0.4s, #31
+; CHECK-GI-NEXT: ssra v0.4s, v1.4s, #1
+; CHECK-GI-NEXT: ret
+ %1 = sdiv <4 x i32> %x, <i32 -4, i32 -4, i32 -4, i32 -4>
+ ret <4 x i32> %1
+}
+
+define <16 x i8> @combine_vec_sdiv_by_pow2b_v16i8(<16 x i8> %x) {
+; CHECK-SD-LABEL: combine_vec_sdiv_by_pow2b_v16i8:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: adrp x8, .LCPI14_0
+; CHECK-SD-NEXT: cmlt v1.16b, v0.16b, #0
+; CHECK-SD-NEXT: movi v3.2d, #0x000000000000ff
+; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI14_0]
+; CHECK-SD-NEXT: adrp x8, .LCPI14_1
+; CHECK-SD-NEXT: movi v4.2d, #0xffffffffffffff00
+; CHECK-SD-NEXT: ushl v1.16b, v1.16b, v2.16b
+; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI14_1]
+; CHECK-SD-NEXT: add v1.16b, v0.16b, v1.16b
+; CHECK-SD-NEXT: and v0.16b, v0.16b, v3.16b
+; CHECK-SD-NEXT: sshl v1.16b, v1.16b, v2.16b
+; CHECK-SD-NEXT: and v1.16b, v1.16b, v4.16b
+; CHECK-SD-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: combine_vec_sdiv_by_pow2b_v16i8:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: adrp x8, .LCPI14_2
+; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI14_2]
+; CHECK-GI-NEXT: adrp x8, .LCPI14_1
+; CHECK-GI-NEXT: smull2 v2.8h, v0.16b, v1.16b
+; CHECK-GI-NEXT: smull v1.8h, v0.8b, v1.8b
+; CHECK-GI-NEXT: uzp2 v1.16b, v1.16b, v2.16b
+; CHECK-GI-NEXT: l...
[truncated]
``````````
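For contrast with the exact path retained above (BuildExactSDIVPattern), an `exact` sdiv needs no fixup at all: the divisor's power-of-two part becomes an exact arithmetic shift and its odd part is folded into a multiply by the multiplicative inverse modulo 2^32. A hedged sketch for divisor 6 (the helper name is hypothetical; 0xAAAAAAAB is the inverse of 3 mod 2^32, since 3 * 0xAAAAAAAB == 2^33 + 1):

```cpp
#include <cassert>
#include <cstdint>

// Exact-case expansion: the `exact` flag guarantees the input is evenly
// divisible, so shift out the power-of-two part and multiply by the
// modular inverse of the odd part. Divisor 6 = 2 * 3.
int32_t sdiv_exact6(int32_t N) {
  int32_t Q = N >> 1;                        // exact G_ASHR by countr_zero(6)
  return int32_t(uint32_t(Q) * 0xAAAAAAABu); // wrap-around multiply by 3^-1
}

int main() {
  for (int32_t N : {-600, -6, 0, 6, 600})    // inputs exactly divisible by 6
    assert(sdiv_exact6(N) == N / 6);
}
```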
https://github.com/llvm/llvm-project/pull/146504