[llvm] [GlobalISel] Allow expanding of sdiv -> mul by constant (PR #146504)

via llvm-commits llvm-commits at lists.llvm.org
Tue Jul 1 03:35:41 PDT 2025


llvmbot wrote:



@llvm/pr-subscribers-backend-aarch64

Author: None (jyli0116)


Allows expansion of the sdiv -> mul by constant combine for the general case. Previously this only occurred in the exact case (when the sdiv carried the `exact` flag). This is part of the resolution to issue #118090.
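
For context, the general (non-exact) case follows the classic signed magic-number technique (mirroring SelectionDAG's BuildSDIV): multiply by a magic constant taking the high half with G_SMULH, optionally add or subtract the numerator, shift right arithmetically, then add the extracted sign bit. A minimal C++ sketch of the arithmetic for a hypothetical 32-bit divide by 7 (0x92492493 and shift 2 are the standard Hacker's Delight values for divisor 7):

```cpp
#include <cstdint>

// Sketch of the expanded sequence for x / 7 on i32; not the actual
// combine, just the arithmetic the emitted gMIR performs.
int32_t sdivBy7(int32_t x) {
  const int32_t Magic = (int32_t)0x92492493; // negative as i32
  // G_SMULH: high half of the 64-bit product.
  int32_t Q = (int32_t)(((int64_t)x * Magic) >> 32);
  Q += x;                 // d > 0 and magic < 0: add the numerator
  Q >>= 2;                // G_ASHR by the shift amount
  Q += (uint32_t)Q >> 31; // add the sign bit (G_LSHR + G_ADD)
  return Q;
}
```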

---

Patch is 115.38 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/146504.diff


12 Files Affected:

- (modified) llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h (+4) 
- (modified) llvm/include/llvm/Target/GlobalISel/Combine.td (+2-2) 
- (modified) llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp (+114-20) 
- (added) llvm/test/CodeGen/AArch64/GlobalISel/combine-sdiv.ll (+1663) 
- (modified) llvm/test/CodeGen/AArch64/GlobalISel/combine-sdiv.mir (+8-3) 
- (modified) llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll (+6-3) 
- (modified) llvm/test/CodeGen/AArch64/arm64-neon-mul-div-cte.ll (+24-90) 
- (modified) llvm/test/CodeGen/AArch64/select_const.ll (+3-8) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fold-binop-into-select.mir (+2-2) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll (+8-8) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll (+48-77) 
- (modified) llvm/test/CodeGen/RISCV/GlobalISel/div-by-constant.ll (+407-104) 


``````````diff
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index c15263e0b06f8..5568139af98d2 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -143,6 +143,10 @@ class CombinerHelper {
   /// Query is legal on the target.
   bool isLegalOrBeforeLegalizer(const LegalityQuery &Query) const;
 
+  /// \return true if \p Query is legal on the target, or if the target's
+  /// legalization action for \p Query is WidenScalar.
+  bool isLegalorHasWidenScalar(const LegalityQuery &Query) const;
+
   /// \return true if the combine is running prior to legalization, or if \p Ty
   /// is a legal integer constant type on the target.
   bool isConstantLegalOrBeforeLegalizer(const LLT Ty) const;
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 4a92dc16c1bf4..347a5f85affb1 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -2046,9 +2046,9 @@ def all_combines : GICombineGroup<[integer_reassoc_combines, trivial_combines,
     div_rem_to_divrem, funnel_shift_combines, bitreverse_shift, commute_shift,
     form_bitfield_extract, constant_fold_binops, constant_fold_fma,
     constant_fold_cast_op, fabs_fneg_fold,
-    intdiv_combines, mulh_combines, redundant_neg_operands,
+    mulh_combines, redundant_neg_operands,
     and_or_disjoint_mask, fma_combines, fold_binop_into_select,
-    sub_add_reg, select_to_minmax,
+    intdiv_combines, sub_add_reg, select_to_minmax,
     fsub_to_fneg, commute_constant_to_rhs, match_ands, match_ors,
     simplify_neg_minmax, combine_concat_vector,
     sext_trunc, zext_trunc, prefer_sign_combines, shuffle_combines,
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 05dd269d48921..b866982faa1d2 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -162,6 +162,11 @@ bool CombinerHelper::isLegalOrBeforeLegalizer(
   return isPreLegalize() || isLegal(Query);
 }
 
+bool CombinerHelper::isLegalorHasWidenScalar(const LegalityQuery &Query) const {
+  return isLegal(Query) ||
+         LI->getAction(Query).Action == LegalizeActions::WidenScalar;
+}
+
 bool CombinerHelper::isConstantLegalOrBeforeLegalizer(const LLT Ty) const {
   if (!Ty.isVector())
     return isLegalOrBeforeLegalizer({TargetOpcode::G_CONSTANT, {Ty}});
@@ -5510,6 +5515,8 @@ bool CombinerHelper::matchSDivByConst(MachineInstr &MI) const {
   Register Dst = MI.getOperand(0).getReg();
   Register RHS = MI.getOperand(2).getReg();
   LLT DstTy = MRI.getType(Dst);
+  auto SizeInBits = DstTy.getScalarSizeInBits();
+  LLT WideTy = DstTy.changeElementSize(SizeInBits * 2);
 
   auto &MF = *MI.getMF();
   AttributeList Attr = MF.getFunction().getAttributes();
@@ -5529,8 +5536,21 @@ bool CombinerHelper::matchSDivByConst(MachineInstr &MI) const {
         MRI, RHS, [](const Constant *C) { return C && !C->isNullValue(); });
   }
 
-  // Don't support the general case for now.
-  return false;
+  auto *RHSDef = MRI.getVRegDef(RHS);
+  if (!isConstantOrConstantVector(*RHSDef, MRI))
+    return false;
+
+  // Don't do this if the types are not going to be legal.
+  if (LI) {
+    if (!isLegalOrBeforeLegalizer({TargetOpcode::G_MUL, {DstTy, DstTy}}))
+      return false;
+    if (!isLegal({TargetOpcode::G_SMULH, {DstTy}}) &&
+        !isLegalorHasWidenScalar({TargetOpcode::G_MUL, {WideTy, WideTy}}))
+      return false;
+  }
+
+  return matchUnaryPredicate(
+      MRI, RHS, [](const Constant *C) { return C && !C->isNullValue(); });
 }
 
 void CombinerHelper::applySDivByConst(MachineInstr &MI) const {
@@ -5546,21 +5566,22 @@ MachineInstr *CombinerHelper::buildSDivUsingMul(MachineInstr &MI) const {
   Register RHS = SDiv.getReg(2);
   LLT Ty = MRI.getType(Dst);
   LLT ScalarTy = Ty.getScalarType();
+  const unsigned EltBits = ScalarTy.getScalarSizeInBits();
   LLT ShiftAmtTy = getTargetLowering().getPreferredShiftAmountTy(Ty);
   LLT ScalarShiftAmtTy = ShiftAmtTy.getScalarType();
   auto &MIB = Builder;
 
   bool UseSRA = false;
-  SmallVector<Register, 16> Shifts, Factors;
+  SmallVector<Register, 16> ExactShifts, ExactFactors;
 
-  auto *RHSDef = cast<GenericMachineInstr>(getDefIgnoringCopies(RHS, MRI));
-  bool IsSplat = getIConstantSplatVal(*RHSDef, MRI).has_value();
+  auto *RHSDefInstr = cast<GenericMachineInstr>(getDefIgnoringCopies(RHS, MRI));
+  bool IsSplat = getIConstantSplatVal(*RHSDefInstr, MRI).has_value();
 
-  auto BuildSDIVPattern = [&](const Constant *C) {
+  auto BuildExactSDIVPattern = [&](const Constant *C) {
     // Don't recompute inverses for each splat element.
-    if (IsSplat && !Factors.empty()) {
-      Shifts.push_back(Shifts[0]);
-      Factors.push_back(Factors[0]);
+    if (IsSplat && !ExactFactors.empty()) {
+      ExactShifts.push_back(ExactShifts[0]);
+      ExactFactors.push_back(ExactFactors[0]);
       return true;
     }
 
@@ -5575,31 +5596,104 @@ MachineInstr *CombinerHelper::buildSDivUsingMul(MachineInstr &MI) const {
     // Calculate the multiplicative inverse modulo BW.
     // 2^W requires W + 1 bits, so we have to extend and then truncate.
     APInt Factor = Divisor.multiplicativeInverse();
-    Shifts.push_back(MIB.buildConstant(ScalarShiftAmtTy, Shift).getReg(0));
-    Factors.push_back(MIB.buildConstant(ScalarTy, Factor).getReg(0));
+    ExactShifts.push_back(MIB.buildConstant(ScalarShiftAmtTy, Shift).getReg(0));
+    ExactFactors.push_back(MIB.buildConstant(ScalarTy, Factor).getReg(0));
     return true;
   };
 
-  // Collect all magic values from the build vector.
+  if (MI.getFlag(MachineInstr::MIFlag::IsExact)) {
+    // Collect all magic values from the build vector.
+    bool Matched = matchUnaryPredicate(MRI, RHS, BuildExactSDIVPattern);
+    (void)Matched;
+    assert(Matched && "Expected unary predicate match to succeed");
+
+    Register Shift, Factor;
+    if (Ty.isVector()) {
+      Shift = MIB.buildBuildVector(ShiftAmtTy, ExactShifts).getReg(0);
+      Factor = MIB.buildBuildVector(Ty, ExactFactors).getReg(0);
+    } else {
+      Shift = ExactShifts[0];
+      Factor = ExactFactors[0];
+    }
+
+    Register Res = LHS;
+
+    if (UseSRA)
+      Res = MIB.buildAShr(Ty, Res, Shift, MachineInstr::IsExact).getReg(0);
+
+    return MIB.buildMul(Ty, Res, Factor);
+  }
+
+  SmallVector<Register, 16> MagicFactors, Factors, Shifts, ShiftMasks;
+
+  auto BuildSDIVPattern = [&](const Constant *C) {
+    auto *CI = cast<ConstantInt>(C);
+    const APInt &Divisor = CI->getValue();
+
+    SignedDivisionByConstantInfo magics =
+        SignedDivisionByConstantInfo::get(Divisor);
+    int NumeratorFactor = 0;
+    int ShiftMask = -1;
+
+    if (Divisor.isOne() || Divisor.isAllOnes()) {
+      // If d is +1/-1, we just multiply the numerator by +1/-1.
+      NumeratorFactor = Divisor.getSExtValue();
+      magics.Magic = 0;
+      magics.ShiftAmount = 0;
+      ShiftMask = 0;
+    } else if (Divisor.isStrictlyPositive() && magics.Magic.isNegative()) {
+      // If d > 0 and m < 0, add the numerator.
+      NumeratorFactor = 1;
+    } else if (Divisor.isNegative() && magics.Magic.isStrictlyPositive()) {
+      // If d < 0 and m > 0, subtract the numerator.
+      NumeratorFactor = -1;
+    }
+
+    MagicFactors.push_back(MIB.buildConstant(ScalarTy, magics.Magic).getReg(0));
+    Factors.push_back(MIB.buildConstant(ScalarTy, NumeratorFactor).getReg(0));
+    Shifts.push_back(
+        MIB.buildConstant(ScalarShiftAmtTy, magics.ShiftAmount).getReg(0));
+    ShiftMasks.push_back(MIB.buildConstant(ScalarTy, ShiftMask).getReg(0));
+
+    return true;
+  };
+
+  // Collect the shifts/magic values from each element.
   bool Matched = matchUnaryPredicate(MRI, RHS, BuildSDIVPattern);
   (void)Matched;
   assert(Matched && "Expected unary predicate match to succeed");
 
-  Register Shift, Factor;
-  if (Ty.isVector()) {
-    Shift = MIB.buildBuildVector(ShiftAmtTy, Shifts).getReg(0);
+  Register MagicFactor, Factor, Shift, ShiftMask;
+  auto *RHSDef = getOpcodeDef<GBuildVector>(RHS, MRI);
+  if (RHSDef) {
+    MagicFactor = MIB.buildBuildVector(Ty, MagicFactors).getReg(0);
     Factor = MIB.buildBuildVector(Ty, Factors).getReg(0);
+    Shift = MIB.buildBuildVector(ShiftAmtTy, Shifts).getReg(0);
+    ShiftMask = MIB.buildBuildVector(Ty, ShiftMasks).getReg(0);
   } else {
-    Shift = Shifts[0];
+    assert(MRI.getType(RHS).isScalar() &&
+           "Non-build_vector operation should have been a scalar");
+    MagicFactor = MagicFactors[0];
     Factor = Factors[0];
+    Shift = Shifts[0];
+    ShiftMask = ShiftMasks[0];
   }
 
-  Register Res = LHS;
+  // Multiply by the magic value, taking only the high half of the result.
+  Register Q = MIB.buildSMulH(Ty, LHS, MagicFactor).getReg(0);
+
+  // (Optionally) Add/subtract the numerator using Factor.
+  Factor = MIB.buildMul(Ty, LHS, Factor).getReg(0);
+  Q = MIB.buildAdd(Ty, Q, Factor).getReg(0);
 
-  if (UseSRA)
-    Res = MIB.buildAShr(Ty, Res, Shift, MachineInstr::IsExact).getReg(0);
+  // Shift right algebraic by shift value.
+  Q = MIB.buildAShr(Ty, Q, Shift).getReg(0);
 
-  return MIB.buildMul(Ty, Res, Factor);
+  // Extract the sign bit, mask it and add it to the quotient.
+  auto SignShift = MIB.buildConstant(ShiftAmtTy, EltBits - 1);
+  auto T = MIB.buildLShr(Ty, Q, SignShift);
+  T = MIB.buildAnd(Ty, T, ShiftMask);
+  return MIB.buildAdd(Ty, Q, T);
 }
 
 bool CombinerHelper::matchDivByPow2(MachineInstr &MI, bool IsSigned) const {
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-sdiv.ll b/llvm/test/CodeGen/AArch64/GlobalISel/combine-sdiv.ll
new file mode 100644
index 0000000000000..b7dadf711fce1
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-sdiv.ll
@@ -0,0 +1,1663 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=aarch64-unknown-unknown -global-isel=0 | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc < %s -mtriple=aarch64-unknown-unknown -global-isel=1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+
+; These tests are taken from the combine-udiv.ll in X86.
+define i32 @combine_sdiv_by_one(i32 %x) {
+; CHECK-LABEL: combine_sdiv_by_one:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ret
+  %1 = sdiv i32 %x, 1
+  ret i32 %1
+}
+
+define <4 x i32> @combine_vec_sdiv_by_one(<4 x i32> %x) {
+; CHECK-SD-LABEL: combine_vec_sdiv_by_one:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: combine_vec_sdiv_by_one:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-NEXT:    smull2 v2.2d, v0.4s, v1.4s
+; CHECK-GI-NEXT:    smull v1.2d, v0.2s, v1.2s
+; CHECK-GI-NEXT:    uzp2 v1.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT:    add v0.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT:    ret
+  %1 = sdiv <4 x i32> %x, <i32 1, i32 1, i32 1, i32 1>
+  ret <4 x i32> %1
+}
+
+define i32 @combine_sdiv_by_negone(i32 %x) {
+; CHECK-LABEL: combine_sdiv_by_negone:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    neg w0, w0
+; CHECK-NEXT:    ret
+  %1 = sdiv i32 %x, -1
+  ret i32 %1
+}
+
+define <4 x i32> @combine_vec_sdiv_by_negone(<4 x i32> %x) {
+; CHECK-SD-LABEL: combine_vec_sdiv_by_negone:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    neg v0.4s, v0.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: combine_vec_sdiv_by_negone:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-NEXT:    smull2 v2.2d, v0.4s, v1.4s
+; CHECK-GI-NEXT:    smull v1.2d, v0.2s, v1.2s
+; CHECK-GI-NEXT:    uzp2 v1.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT:    sub v0.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT:    ret
+  %1 = sdiv <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
+  ret <4 x i32> %1
+}
+
+define i32 @combine_sdiv_by_minsigned(i32 %x) {
+; CHECK-SD-LABEL: combine_sdiv_by_minsigned:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    mov w8, #-2147483648 // =0x80000000
+; CHECK-SD-NEXT:    cmp w0, w8
+; CHECK-SD-NEXT:    cset w0, eq
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: combine_sdiv_by_minsigned:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-GI-NEXT:    sxtw x8, w0
+; CHECK-GI-NEXT:    lsl x9, x8, #31
+; CHECK-GI-NEXT:    sub x8, x9, x8
+; CHECK-GI-NEXT:    asr x8, x8, #32
+; CHECK-GI-NEXT:    sub w8, w8, w0
+; CHECK-GI-NEXT:    asr w8, w8, #30
+; CHECK-GI-NEXT:    add w0, w8, w8, lsr #31
+; CHECK-GI-NEXT:    ret
+  %1 = sdiv i32 %x, -2147483648
+  ret i32 %1
+}
+
+define <4 x i32> @combine_vec_sdiv_by_minsigned(<4 x i32> %x) {
+; CHECK-SD-LABEL: combine_vec_sdiv_by_minsigned:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    movi v1.4s, #128, lsl #24
+; CHECK-SD-NEXT:    movi v2.4s, #1
+; CHECK-SD-NEXT:    cmeq v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    and v0.16b, v0.16b, v2.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: combine_vec_sdiv_by_minsigned:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mvni v1.4s, #128, lsl #24
+; CHECK-GI-NEXT:    smull2 v2.2d, v0.4s, v1.4s
+; CHECK-GI-NEXT:    smull v1.2d, v0.2s, v1.2s
+; CHECK-GI-NEXT:    uzp2 v1.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT:    sub v1.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT:    sshr v0.4s, v1.4s, #30
+; CHECK-GI-NEXT:    ushr v0.4s, v0.4s, #31
+; CHECK-GI-NEXT:    ssra v0.4s, v1.4s, #30
+; CHECK-GI-NEXT:    ret
+  %1 = sdiv <4 x i32> %x, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
+  ret <4 x i32> %1
+}
+
+define i32 @combine_sdiv_zero(i32 %x) {
+; CHECK-LABEL: combine_sdiv_zero:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w0, wzr
+; CHECK-NEXT:    ret
+  %1 = sdiv i32 0, %x
+  ret i32 %1
+}
+
+define <4 x i32> @combine_vec_sdiv_zero(<4 x i32> %x) {
+; CHECK-LABEL: combine_vec_sdiv_zero:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-NEXT:    ret
+  %1 = sdiv <4 x i32> zeroinitializer, %x
+  ret <4 x i32> %1
+}
+
+define i32 @combine_sdiv_dupe(i32 %x) {
+; CHECK-SD-LABEL: combine_sdiv_dupe:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    mov w0, #1 // =0x1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: combine_sdiv_dupe:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    sdiv w0, w0, w0
+; CHECK-GI-NEXT:    ret
+  %1 = sdiv i32 %x, %x
+  ret i32 %1
+}
+
+define <4 x i32> @combine_vec_sdiv_dupe(<4 x i32> %x) {
+; CHECK-SD-LABEL: combine_vec_sdiv_dupe:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    movi v0.4s, #1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: combine_vec_sdiv_dupe:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    mov w9, v0.s[1]
+; CHECK-GI-NEXT:    mov w10, v0.s[2]
+; CHECK-GI-NEXT:    mov w11, v0.s[3]
+; CHECK-GI-NEXT:    sdiv w8, w8, w8
+; CHECK-GI-NEXT:    sdiv w9, w9, w9
+; CHECK-GI-NEXT:    fmov s0, w8
+; CHECK-GI-NEXT:    sdiv w10, w10, w10
+; CHECK-GI-NEXT:    mov v0.s[1], w9
+; CHECK-GI-NEXT:    sdiv w8, w11, w11
+; CHECK-GI-NEXT:    mov v0.s[2], w10
+; CHECK-GI-NEXT:    mov v0.s[3], w8
+; CHECK-GI-NEXT:    ret
+  %1 = sdiv <4 x i32> %x, %x
+  ret <4 x i32> %1
+}
+
+define <4 x i32> @combine_vec_sdiv_by_pos0(<4 x i32> %x) {
+; CHECK-SD-LABEL: combine_vec_sdiv_by_pos0:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    movi v1.2d, #0x0000ff000000ff
+; CHECK-SD-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-SD-NEXT:    ushr v0.4s, v0.4s, #2
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: combine_vec_sdiv_by_pos0:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v1.4s, #1
+; CHECK-GI-NEXT:    movi v2.2d, #0x0000ff000000ff
+; CHECK-GI-NEXT:    fneg v1.4s, v1.4s
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v2.16b
+; CHECK-GI-NEXT:    smull2 v2.2d, v0.4s, v1.4s
+; CHECK-GI-NEXT:    smull v1.2d, v0.2s, v1.2s
+; CHECK-GI-NEXT:    uzp2 v1.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT:    add v1.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT:    sshr v0.4s, v1.4s, #1
+; CHECK-GI-NEXT:    ushr v0.4s, v0.4s, #31
+; CHECK-GI-NEXT:    ssra v0.4s, v1.4s, #1
+; CHECK-GI-NEXT:    ret
+  %1 = and <4 x i32> %x, <i32 255, i32 255, i32 255, i32 255>
+  %2 = sdiv <4 x i32> %1, <i32 4, i32 4, i32 4, i32 4>
+  ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_vec_sdiv_by_pos1(<4 x i32> %x) {
+; CHECK-SD-LABEL: combine_vec_sdiv_by_pos1:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    movi v1.2d, #0x0000ff000000ff
+; CHECK-SD-NEXT:    adrp x8, .LCPI11_0
+; CHECK-SD-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-SD-NEXT:    ldr q1, [x8, :lo12:.LCPI11_0]
+; CHECK-SD-NEXT:    ushl v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: combine_vec_sdiv_by_pos1:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v1.2d, #0x0000ff000000ff
+; CHECK-GI-NEXT:    adrp x8, .LCPI11_2
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI11_2]
+; CHECK-GI-NEXT:    adrp x8, .LCPI11_1
+; CHECK-GI-NEXT:    smull2 v2.2d, v0.4s, v1.4s
+; CHECK-GI-NEXT:    smull v1.2d, v0.2s, v1.2s
+; CHECK-GI-NEXT:    uzp2 v1.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI11_1]
+; CHECK-GI-NEXT:    adrp x8, .LCPI11_0
+; CHECK-GI-NEXT:    add v0.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT:    neg v1.4s, v2.4s
+; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI11_0]
+; CHECK-GI-NEXT:    sshl v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    ushr v1.4s, v0.4s, #31
+; CHECK-GI-NEXT:    and v1.16b, v1.16b, v2.16b
+; CHECK-GI-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    ret
+  %1 = and <4 x i32> %x, <i32 255, i32 255, i32 255, i32 255>
+  %2 = sdiv <4 x i32> %1, <i32 1, i32 4, i32 8, i32 16>
+  ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_vec_sdiv_by_pow2a(<4 x i32> %x) {
+; CHECK-SD-LABEL: combine_vec_sdiv_by_pow2a:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmlt v1.4s, v0.4s, #0
+; CHECK-SD-NEXT:    usra v0.4s, v1.4s, #30
+; CHECK-SD-NEXT:    sshr v0.4s, v0.4s, #2
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: combine_vec_sdiv_by_pow2a:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v1.4s, #1
+; CHECK-GI-NEXT:    fneg v1.4s, v1.4s
+; CHECK-GI-NEXT:    smull2 v2.2d, v0.4s, v1.4s
+; CHECK-GI-NEXT:    smull v1.2d, v0.2s, v1.2s
+; CHECK-GI-NEXT:    uzp2 v1.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT:    add v1.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT:    sshr v0.4s, v1.4s, #1
+; CHECK-GI-NEXT:    ushr v0.4s, v0.4s, #31
+; CHECK-GI-NEXT:    ssra v0.4s, v1.4s, #1
+; CHECK-GI-NEXT:    ret
+  %1 = sdiv <4 x i32> %x, <i32 4, i32 4, i32 4, i32 4>
+  ret <4 x i32> %1
+}
+
+define <4 x i32> @combine_vec_sdiv_by_pow2a_neg(<4 x i32> %x) {
+; CHECK-SD-LABEL: combine_vec_sdiv_by_pow2a_neg:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmlt v1.4s, v0.4s, #0
+; CHECK-SD-NEXT:    usra v0.4s, v1.4s, #30
+; CHECK-SD-NEXT:    sshr v0.4s, v0.4s, #2
+; CHECK-SD-NEXT:    neg v0.4s, v0.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: combine_vec_sdiv_by_pow2a_neg:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mvni v1.4s, #128, lsl #24
+; CHECK-GI-NEXT:    smull2 v2.2d, v0.4s, v1.4s
+; CHECK-GI-NEXT:    smull v1.2d, v0.2s, v1.2s
+; CHECK-GI-NEXT:    uzp2 v1.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT:    sub v1.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT:    sshr v0.4s, v1.4s, #1
+; CHECK-GI-NEXT:    ushr v0.4s, v0.4s, #31
+; CHECK-GI-NEXT:    ssra v0.4s, v1.4s, #1
+; CHECK-GI-NEXT:    ret
+  %1 = sdiv <4 x i32> %x, <i32 -4, i32 -4, i32 -4, i32 -4>
+  ret <4 x i32> %1
+}
+
+define <16 x i8> @combine_vec_sdiv_by_pow2b_v16i8(<16 x i8> %x) {
+; CHECK-SD-LABEL: combine_vec_sdiv_by_pow2b_v16i8:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    adrp x8, .LCPI14_0
+; CHECK-SD-NEXT:    cmlt v1.16b, v0.16b, #0
+; CHECK-SD-NEXT:    movi v3.2d, #0x000000000000ff
+; CHECK-SD-NEXT:    ldr q2, [x8, :lo12:.LCPI14_0]
+; CHECK-SD-NEXT:    adrp x8, .LCPI14_1
+; CHECK-SD-NEXT:    movi v4.2d, #0xffffffffffffff00
+; CHECK-SD-NEXT:    ushl v1.16b, v1.16b, v2.16b
+; CHECK-SD-NEXT:    ldr q2, [x8, :lo12:.LCPI14_1]
+; CHECK-SD-NEXT:    add v1.16b, v0.16b, v1.16b
+; CHECK-SD-NEXT:    and v0.16b, v0.16b, v3.16b
+; CHECK-SD-NEXT:    sshl v1.16b, v1.16b, v2.16b
+; CHECK-SD-NEXT:    and v1.16b, v1.16b, v4.16b
+; CHECK-SD-NEXT:    orr v0.16b, v0.16b, v1.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: combine_vec_sdiv_by_pow2b_v16i8:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    adrp x8, .LCPI14_2
+; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI14_2]
+; CHECK-GI-NEXT:    adrp x8, .LCPI14_1
+; CHECK-GI-NEXT:    smull2 v2.8h, v0.16b, v1.16b
+; CHECK-GI-NEXT:    smull v1.8h, v0.8b, v1.8b
+; CHECK-GI-NEXT:    uzp2 v1.16b, v1.16b, v2.16b
+; CHECK-GI-NEXT:    l...
[truncated]

``````````
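
For reference, the magic constant and shift amount used in the expansion come from the same helper the new BuildSDIVPattern lambda in the diff calls. A minimal usage sketch (the commented values are the expected ones for divisor 7, per Hacker's Delight):

```cpp
#include "llvm/ADT/APInt.h"
#include "llvm/Support/DivisionByConstantInfo.h"

using namespace llvm;

// Reproduce the per-element constants BuildSDIVPattern feeds the expansion.
static void sdivMagicsFor7() {
  APInt Divisor(/*numBits=*/32, /*val=*/7, /*isSigned=*/true);
  SignedDivisionByConstantInfo Magics =
      SignedDivisionByConstantInfo::get(Divisor);
  // Expected: Magics.Magic == 0x92492493 (negative as i32) and
  // Magics.ShiftAmount == 2, so the combine picks NumeratorFactor = 1
  // (d > 0, magic < 0) and ShiftMask = -1.
  (void)Magics;
}
```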



https://github.com/llvm/llvm-project/pull/146504


More information about the llvm-commits mailing list