[llvm] [LLVM][SelectionDAG] Don't legalise splat constants until required. (PR #143571)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Jun 10 10:03:19 PDT 2025
llvmbot wrote:
@llvm/pr-subscribers-backend-arm
Author: Paul Walker (paulwalker-arm)
This PR is another step toward enabling ConstantInt for vector types. The results look mostly positive to my untrained eye (NOTE: I'd like to ignore `test_compress_v1i32_with_sve` as unrealistic, given that a single-element compress should presumably be canonicalised to a select?)
The exception is X86, where I'm in need of help. The change to `SelectionDAG::getConstant()` causes several X86 unit tests to hang. Upon inspection I traced this to combineSelect in X86ISelLowering.cpp, which inverts a `select` condition that DAGCombiner then undoes, so we continually bounce between the two states. I'm guessing we're just lucky this is not biting us already, and my `DCI.isBeforeLegalize()` addition only continues to hide the problem rather than fix it. Even with this change the results show a couple of cases where the restriction leads to worse code. Are there any recommendations?
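To illustrate the kind of ping-pong involved, here is a minimal toy model (plain C++, not the actual LLVM code paths or data structures): two combines whose canonicalisations are exact inverses of each other, so a worklist that re-queues changed nodes never quiesces.

```cpp
// Toy model of the oscillation described above -- illustrative only, not the
// actual LLVM code. Two "combines" each try to canonicalise the same vselect,
// but each one exactly inverts the other's result, so a worklist that
// re-queues changed nodes never reaches a fixed point.
#include <cstdio>

struct VSelect {
  bool CondInverted; // stands in for wrapping the condition in xor(cond, allones)
  bool ZerosOnTrue;  // true  => vselect cond, zeros, rhs
                     // false => vselect cond, rhs, zeros
};

// Stand-in for X86's combineSelect: wants the zero vector on the false side,
// so it swaps the operands and inverts the condition.
static bool x86Combine(VSelect &S) {
  if (S.ZerosOnTrue) {
    S.CondInverted = !S.CondInverted;
    S.ZerosOnTrue = false;
    return true; // node changed, re-queued
  }
  return false;
}

// Stand-in for the generic DAGCombiner fold: strips the inverted condition by
// swapping the operands back.
static bool dagCombine(VSelect &S) {
  if (S.CondInverted) {
    S.CondInverted = false;
    S.ZerosOnTrue = true;
    return true; // node changed, re-queued
  }
  return false;
}

int main() {
  VSelect S{false, true};
  // Bounded here for demonstration; the real combiner loop would not stop.
  for (int I = 0; I < 5; ++I) {
    bool Changed = false;
    Changed |= x86Combine(S);
    Changed |= dagCombine(S);
    std::printf("iteration %d: changed=%d\n", I, Changed);
    if (!Changed)
      break;
  }
  return 0;
}
```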
---
Patch is 67.40 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/143571.diff
26 Files Affected:
- (modified) llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp (+68-68)
- (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+3-2)
- (modified) llvm/test/CodeGen/AArch64/arm64-neon-mul-div-cte.ll (+6-6)
- (modified) llvm/test/CodeGen/AArch64/srem-vector-lkk.ll (+33-33)
- (modified) llvm/test/CodeGen/AArch64/ssub_sat_vec.ll (+1-3)
- (modified) llvm/test/CodeGen/AArch64/sve-expand-div.ll (+1)
- (modified) llvm/test/CodeGen/AArch64/sve-sdiv-pow2.ll (+1)
- (modified) llvm/test/CodeGen/AArch64/sve-vector-compress.ll (+3-2)
- (modified) llvm/test/CodeGen/AArch64/urem-vector-lkk.ll (+2-2)
- (modified) llvm/test/CodeGen/AArch64/usub_sat_vec.ll (+1-3)
- (modified) llvm/test/CodeGen/ARM/bool-ext-inc.ll (+2-3)
- (modified) llvm/test/CodeGen/RISCV/rvv/combine-sats.ll (+2-2)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll (+1-10)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll (+17-30)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-select-addsub.ll (+6-6)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd-vp.ll (+1-1)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub-vp.ll (+1-1)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-zvqdotq.ll (+2-2)
- (modified) llvm/test/CodeGen/RISCV/rvv/vdiv-sdnode.ll (+68-128)
- (modified) llvm/test/CodeGen/RISCV/rvv/vmulhu-sdnode.ll (+23-48)
- (modified) llvm/test/CodeGen/RISCV/rvv/vrem-sdnode.ll (+78-148)
- (modified) llvm/test/CodeGen/RISCV/rvv/vsadd-vp.ll (+1-1)
- (modified) llvm/test/CodeGen/RISCV/rvv/vssub-vp.ll (+1-1)
- (modified) llvm/test/CodeGen/X86/avx512-select.ll (+8-4)
- (modified) llvm/test/CodeGen/X86/vselect-zero.ll (+2-1)
- (modified) llvm/test/CodeGen/X86/x86-interleaved-access.ll (+9-4)
``````````diff
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 4fc026ca562ba..1b9c28002b210 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -1679,81 +1679,81 @@ SDValue SelectionDAG::getConstant(const ConstantInt &Val, const SDLoc &DL,
Elt = ConstantInt::get(*getContext(), Elt->getValue());
// In some cases the vector type is legal but the element type is illegal and
- // needs to be promoted, for example v8i8 on ARM. In this case, promote the
- // inserted value (the type does not need to match the vector element type).
- // Any extra bits introduced will be truncated away.
- if (VT.isVector() && TLI->getTypeAction(*getContext(), EltVT) ==
- TargetLowering::TypePromoteInteger) {
- EltVT = TLI->getTypeToTransformTo(*getContext(), EltVT);
- APInt NewVal;
- if (TLI->isSExtCheaperThanZExt(VT.getScalarType(), EltVT))
- NewVal = Elt->getValue().sextOrTrunc(EltVT.getSizeInBits());
- else
- NewVal = Elt->getValue().zextOrTrunc(EltVT.getSizeInBits());
- Elt = ConstantInt::get(*getContext(), NewVal);
- }
- // In other cases the element type is illegal and needs to be expanded, for
- // example v2i64 on MIPS32. In this case, find the nearest legal type, split
- // the value into n parts and use a vector type with n-times the elements.
- // Then bitcast to the type requested.
- // Legalizing constants too early makes the DAGCombiner's job harder so we
- // only legalize if the DAG tells us we must produce legal types.
- else if (NewNodesMustHaveLegalTypes && VT.isVector() &&
- TLI->getTypeAction(*getContext(), EltVT) ==
- TargetLowering::TypeExpandInteger) {
- const APInt &NewVal = Elt->getValue();
- EVT ViaEltVT = TLI->getTypeToTransformTo(*getContext(), EltVT);
- unsigned ViaEltSizeInBits = ViaEltVT.getSizeInBits();
-
- // For scalable vectors, try to use a SPLAT_VECTOR_PARTS node.
- if (VT.isScalableVector() ||
- TLI->isOperationLegal(ISD::SPLAT_VECTOR, VT)) {
- assert(EltVT.getSizeInBits() % ViaEltSizeInBits == 0 &&
- "Can only handle an even split!");
- unsigned Parts = EltVT.getSizeInBits() / ViaEltSizeInBits;
-
- SmallVector<SDValue, 2> ScalarParts;
- for (unsigned i = 0; i != Parts; ++i)
- ScalarParts.push_back(getConstant(
- NewVal.extractBits(ViaEltSizeInBits, i * ViaEltSizeInBits), DL,
- ViaEltVT, isT, isO));
-
- return getNode(ISD::SPLAT_VECTOR_PARTS, DL, VT, ScalarParts);
- }
+ // thus when necessary we "legalise" the constant here so as to simplify the
+ // job of calling this function. NOTE: Only legalize when necessary so that
+ // we don't make DAGCombiner's job harder.
+ if (NewNodesMustHaveLegalTypes && VT.isVector()) {
+ // Promote the inserted value (the type does not need to match the vector
+ // element type). Any extra bits introduced will be truncated away.
+ if (TLI->getTypeAction(*getContext(), EltVT) ==
+ TargetLowering::TypePromoteInteger) {
+ EltVT = TLI->getTypeToTransformTo(*getContext(), EltVT);
+ APInt NewVal;
+ if (TLI->isSExtCheaperThanZExt(VT.getScalarType(), EltVT))
+ NewVal = Elt->getValue().sextOrTrunc(EltVT.getSizeInBits());
+ else
+ NewVal = Elt->getValue().zextOrTrunc(EltVT.getSizeInBits());
+ Elt = ConstantInt::get(*getContext(), NewVal);
+ }
+ // For expansion we find the nearest legal type, split the value into n
+ // parts and use a vector type with n-times the elements. Then bitcast to
+ // the type requested.
+ else if (TLI->getTypeAction(*getContext(), EltVT) ==
+ TargetLowering::TypeExpandInteger) {
+ const APInt &NewVal = Elt->getValue();
+ EVT ViaEltVT = TLI->getTypeToTransformTo(*getContext(), EltVT);
+ unsigned ViaEltSizeInBits = ViaEltVT.getSizeInBits();
+
+ // For scalable vectors, try to use a SPLAT_VECTOR_PARTS node.
+ if (VT.isScalableVector() ||
+ TLI->isOperationLegal(ISD::SPLAT_VECTOR, VT)) {
+ assert(EltVT.getSizeInBits() % ViaEltSizeInBits == 0 &&
+ "Can only handle an even split!");
+ unsigned Parts = EltVT.getSizeInBits() / ViaEltSizeInBits;
+
+ SmallVector<SDValue, 2> ScalarParts;
+ for (unsigned i = 0; i != Parts; ++i)
+ ScalarParts.push_back(getConstant(
+ NewVal.extractBits(ViaEltSizeInBits, i * ViaEltSizeInBits), DL,
+ ViaEltVT, isT, isO));
+
+ return getNode(ISD::SPLAT_VECTOR_PARTS, DL, VT, ScalarParts);
+ }
- unsigned ViaVecNumElts = VT.getSizeInBits() / ViaEltSizeInBits;
- EVT ViaVecVT = EVT::getVectorVT(*getContext(), ViaEltVT, ViaVecNumElts);
+ unsigned ViaVecNumElts = VT.getSizeInBits() / ViaEltSizeInBits;
+ EVT ViaVecVT = EVT::getVectorVT(*getContext(), ViaEltVT, ViaVecNumElts);
- // Check the temporary vector is the correct size. If this fails then
- // getTypeToTransformTo() probably returned a type whose size (in bits)
- // isn't a power-of-2 factor of the requested type size.
- assert(ViaVecVT.getSizeInBits() == VT.getSizeInBits());
+ // Check the temporary vector is the correct size. If this fails then
+ // getTypeToTransformTo() probably returned a type whose size (in bits)
+ // isn't a power-of-2 factor of the requested type size.
+ assert(ViaVecVT.getSizeInBits() == VT.getSizeInBits());
- SmallVector<SDValue, 2> EltParts;
- for (unsigned i = 0; i < ViaVecNumElts / VT.getVectorNumElements(); ++i)
- EltParts.push_back(getConstant(
- NewVal.extractBits(ViaEltSizeInBits, i * ViaEltSizeInBits), DL,
- ViaEltVT, isT, isO));
+ SmallVector<SDValue, 2> EltParts;
+ for (unsigned i = 0; i < ViaVecNumElts / VT.getVectorNumElements(); ++i)
+ EltParts.push_back(getConstant(
+ NewVal.extractBits(ViaEltSizeInBits, i * ViaEltSizeInBits), DL,
+ ViaEltVT, isT, isO));
- // EltParts is currently in little endian order. If we actually want
- // big-endian order then reverse it now.
- if (getDataLayout().isBigEndian())
- std::reverse(EltParts.begin(), EltParts.end());
+ // EltParts is currently in little endian order. If we actually want
+ // big-endian order then reverse it now.
+ if (getDataLayout().isBigEndian())
+ std::reverse(EltParts.begin(), EltParts.end());
- // The elements must be reversed when the element order is different
- // to the endianness of the elements (because the BITCAST is itself a
- // vector shuffle in this situation). However, we do not need any code to
- // perform this reversal because getConstant() is producing a vector
- // splat.
- // This situation occurs in MIPS MSA.
+ // The elements must be reversed when the element order is different
+ // to the endianness of the elements (because the BITCAST is itself a
+ // vector shuffle in this situation). However, we do not need any code to
+ // perform this reversal because getConstant() is producing a vector
+ // splat.
+ // This situation occurs in MIPS MSA.
- SmallVector<SDValue, 8> Ops;
- for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
- llvm::append_range(Ops, EltParts);
+ SmallVector<SDValue, 8> Ops;
+ for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
+ llvm::append_range(Ops, EltParts);
- SDValue V =
- getNode(ISD::BITCAST, DL, VT, getBuildVector(ViaVecVT, DL, Ops));
- return V;
+ SDValue V =
+ getNode(ISD::BITCAST, DL, VT, getBuildVector(ViaVecVT, DL, Ops));
+ return V;
+ }
}
assert(Elt->getBitWidth() == EltVT.getSizeInBits() &&
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index bf5ba25cd3104..3ac53b63b64e5 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -48121,8 +48121,9 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
// Check if the first operand is all zeros and Cond type is vXi1.
// If this an avx512 target we can improve the use of zero masking by
// swapping the operands and inverting the condition.
- if (N->getOpcode() == ISD::VSELECT && Cond.hasOneUse() &&
- Subtarget.hasAVX512() && CondVT.getVectorElementType() == MVT::i1 &&
+ if (!DCI.isBeforeLegalize() && N->getOpcode() == ISD::VSELECT &&
+ Cond.hasOneUse() && Subtarget.hasAVX512() &&
+ CondVT.getVectorElementType() == MVT::i1 &&
ISD::isBuildVectorAllZeros(LHS.getNode()) &&
!ISD::isBuildVectorAllZeros(RHS.getNode())) {
// Invert the cond to not(cond) : xor(op,allones)=not(op)
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-mul-div-cte.ll b/llvm/test/CodeGen/AArch64/arm64-neon-mul-div-cte.ll
index bdbebd8726fde..1be02ae602a3c 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-mul-div-cte.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-mul-div-cte.ll
@@ -8,9 +8,9 @@ define <16 x i8> @div16xi8(<16 x i8> %x) {
; CHECK-SD-NEXT: movi v1.16b, #41
; CHECK-SD-NEXT: smull2 v2.8h, v0.16b, v1.16b
; CHECK-SD-NEXT: smull v0.8h, v0.8b, v1.8b
-; CHECK-SD-NEXT: uzp2 v0.16b, v0.16b, v2.16b
-; CHECK-SD-NEXT: sshr v0.16b, v0.16b, #2
-; CHECK-SD-NEXT: usra v0.16b, v0.16b, #7
+; CHECK-SD-NEXT: uzp2 v1.16b, v0.16b, v2.16b
+; CHECK-SD-NEXT: sshr v0.16b, v1.16b, #2
+; CHECK-SD-NEXT: usra v0.16b, v1.16b, #7
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: div16xi8:
@@ -78,9 +78,9 @@ define <8 x i16> @div8xi16(<8 x i16> %x) {
; CHECK-SD-NEXT: smull2 v2.4s, v0.8h, v1.8h
; CHECK-SD-NEXT: smull v1.4s, v0.4h, v1.4h
; CHECK-SD-NEXT: uzp2 v1.8h, v1.8h, v2.8h
-; CHECK-SD-NEXT: add v0.8h, v1.8h, v0.8h
-; CHECK-SD-NEXT: sshr v0.8h, v0.8h, #12
-; CHECK-SD-NEXT: usra v0.8h, v0.8h, #15
+; CHECK-SD-NEXT: add v1.8h, v1.8h, v0.8h
+; CHECK-SD-NEXT: sshr v0.8h, v1.8h, #12
+; CHECK-SD-NEXT: usra v0.8h, v1.8h, #15
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: div8xi16:
diff --git a/llvm/test/CodeGen/AArch64/srem-vector-lkk.ll b/llvm/test/CodeGen/AArch64/srem-vector-lkk.ll
index b165ac0d56d20..6c8ebc65a327c 100644
--- a/llvm/test/CodeGen/AArch64/srem-vector-lkk.ll
+++ b/llvm/test/CodeGen/AArch64/srem-vector-lkk.ll
@@ -14,10 +14,10 @@ define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) {
; CHECK-NEXT: mla v1.4h, v0.4h, v2.4h
; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI0_2]
; CHECK-NEXT: adrp x8, .LCPI0_3
-; CHECK-NEXT: sshl v1.4h, v1.4h, v2.4h
-; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI0_3]
-; CHECK-NEXT: usra v1.4h, v1.4h, #15
-; CHECK-NEXT: mls v0.4h, v1.4h, v2.4h
+; CHECK-NEXT: sshl v2.4h, v1.4h, v2.4h
+; CHECK-NEXT: usra v2.4h, v1.4h, #15
+; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI0_3]
+; CHECK-NEXT: mls v0.4h, v2.4h, v1.4h
; CHECK-NEXT: ret
%1 = srem <4 x i16> %x, <i16 95, i16 -124, i16 98, i16 -1003>
ret <4 x i16> %1
@@ -27,14 +27,14 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) {
; CHECK-LABEL: fold_srem_vec_2:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #44151 // =0xac77
-; CHECK-NEXT: movi v2.4h, #95
+; CHECK-NEXT: movi v3.4h, #95
; CHECK-NEXT: dup v1.4h, w8
; CHECK-NEXT: smull v1.4s, v0.4h, v1.4h
; CHECK-NEXT: shrn v1.4h, v1.4s, #16
; CHECK-NEXT: add v1.4h, v1.4h, v0.4h
-; CHECK-NEXT: sshr v1.4h, v1.4h, #6
-; CHECK-NEXT: usra v1.4h, v1.4h, #15
-; CHECK-NEXT: mls v0.4h, v1.4h, v2.4h
+; CHECK-NEXT: sshr v2.4h, v1.4h, #6
+; CHECK-NEXT: usra v2.4h, v1.4h, #15
+; CHECK-NEXT: mls v0.4h, v2.4h, v3.4h
; CHECK-NEXT: ret
%1 = srem <4 x i16> %x, <i16 95, i16 95, i16 95, i16 95>
ret <4 x i16> %1
@@ -46,15 +46,15 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) {
; CHECK-LABEL: combine_srem_sdiv:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #44151 // =0xac77
-; CHECK-NEXT: movi v2.4h, #95
+; CHECK-NEXT: movi v3.4h, #95
; CHECK-NEXT: dup v1.4h, w8
; CHECK-NEXT: smull v1.4s, v0.4h, v1.4h
; CHECK-NEXT: shrn v1.4h, v1.4s, #16
; CHECK-NEXT: add v1.4h, v1.4h, v0.4h
-; CHECK-NEXT: sshr v1.4h, v1.4h, #6
-; CHECK-NEXT: usra v1.4h, v1.4h, #15
-; CHECK-NEXT: mls v0.4h, v1.4h, v2.4h
-; CHECK-NEXT: add v0.4h, v0.4h, v1.4h
+; CHECK-NEXT: sshr v2.4h, v1.4h, #6
+; CHECK-NEXT: usra v2.4h, v1.4h, #15
+; CHECK-NEXT: mls v0.4h, v2.4h, v3.4h
+; CHECK-NEXT: add v0.4h, v0.4h, v2.4h
; CHECK-NEXT: ret
%1 = srem <4 x i16> %x, <i16 95, i16 95, i16 95, i16 95>
%2 = sdiv <4 x i16> %x, <i16 95, i16 95, i16 95, i16 95>
@@ -74,10 +74,10 @@ define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) {
; CHECK-NEXT: smull v1.4s, v0.4h, v1.4h
; CHECK-NEXT: shrn v1.4h, v1.4s, #16
; CHECK-NEXT: add v1.4h, v1.4h, v0.4h
-; CHECK-NEXT: sshl v1.4h, v1.4h, v2.4h
-; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI3_2]
-; CHECK-NEXT: usra v1.4h, v1.4h, #15
-; CHECK-NEXT: mls v0.4h, v1.4h, v2.4h
+; CHECK-NEXT: sshl v2.4h, v1.4h, v2.4h
+; CHECK-NEXT: usra v2.4h, v1.4h, #15
+; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI3_2]
+; CHECK-NEXT: mls v0.4h, v2.4h, v1.4h
; CHECK-NEXT: ret
%1 = srem <4 x i16> %x, <i16 64, i16 32, i16 8, i16 95>
ret <4 x i16> %1
@@ -91,14 +91,14 @@ define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) {
; CHECK-NEXT: movi d2, #0x00ffff0000ffff
; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI4_0]
; CHECK-NEXT: adrp x8, .LCPI4_1
+; CHECK-NEXT: ldr d3, [x8, :lo12:.LCPI4_1]
+; CHECK-NEXT: adrp x8, .LCPI4_2
; CHECK-NEXT: smull v1.4s, v0.4h, v1.4h
; CHECK-NEXT: and v2.8b, v0.8b, v2.8b
; CHECK-NEXT: shrn v1.4h, v1.4s, #16
; CHECK-NEXT: add v1.4h, v1.4h, v2.4h
-; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI4_1]
-; CHECK-NEXT: adrp x8, .LCPI4_2
-; CHECK-NEXT: sshl v1.4h, v1.4h, v2.4h
; CHECK-NEXT: ushr v2.4h, v1.4h, #15
+; CHECK-NEXT: sshl v1.4h, v1.4h, v3.4h
; CHECK-NEXT: mov v2.h[0], wzr
; CHECK-NEXT: add v1.4h, v1.4h, v2.4h
; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI4_2]
@@ -118,12 +118,12 @@ define <4 x i16> @dont_fold_srem_i16_smax(<4 x i16> %x) {
; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI5_0]
; CHECK-NEXT: adrp x8, .LCPI5_2
; CHECK-NEXT: smull v1.4s, v0.4h, v1.4h
+; CHECK-NEXT: ldr d3, [x8, :lo12:.LCPI5_2]
+; CHECK-NEXT: adrp x8, .LCPI5_3
; CHECK-NEXT: shrn v1.4h, v1.4s, #16
; CHECK-NEXT: mla v1.4h, v0.4h, v2.4h
-; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI5_2]
-; CHECK-NEXT: adrp x8, .LCPI5_3
-; CHECK-NEXT: sshl v1.4h, v1.4h, v2.4h
; CHECK-NEXT: ushr v2.4h, v1.4h, #15
+; CHECK-NEXT: sshl v1.4h, v1.4h, v3.4h
; CHECK-NEXT: mov v2.h[0], wzr
; CHECK-NEXT: add v1.4h, v1.4h, v2.4h
; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI5_3]
@@ -181,13 +181,13 @@ define <16 x i8> @fold_srem_v16i8(<16 x i8> %x) {
; CHECK-LABEL: fold_srem_v16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: movi v1.16b, #103
+; CHECK-NEXT: movi v3.16b, #10
; CHECK-NEXT: smull2 v2.8h, v0.16b, v1.16b
; CHECK-NEXT: smull v1.8h, v0.8b, v1.8b
; CHECK-NEXT: uzp2 v1.16b, v1.16b, v2.16b
-; CHECK-NEXT: movi v2.16b, #10
-; CHECK-NEXT: sshr v1.16b, v1.16b, #2
-; CHECK-NEXT: usra v1.16b, v1.16b, #7
-; CHECK-NEXT: mls v0.16b, v1.16b, v2.16b
+; CHECK-NEXT: sshr v2.16b, v1.16b, #2
+; CHECK-NEXT: usra v2.16b, v1.16b, #7
+; CHECK-NEXT: mls v0.16b, v2.16b, v3.16b
; CHECK-NEXT: ret
%1 = srem <16 x i8> %x, <i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10>
ret <16 x i8> %1
@@ -199,8 +199,8 @@ define <8 x i8> @fold_srem_v8i8(<8 x i8> %x) {
; CHECK-NEXT: movi v1.8b, #103
; CHECK-NEXT: movi v2.8b, #10
; CHECK-NEXT: smull v1.8h, v0.8b, v1.8b
-; CHECK-NEXT: shrn v1.8b, v1.8h, #8
-; CHECK-NEXT: sshr v1.8b, v1.8b, #2
+; CHECK-NEXT: sshr v1.8h, v1.8h, #10
+; CHECK-NEXT: xtn v1.8b, v1.8h
; CHECK-NEXT: usra v1.8b, v1.8b, #7
; CHECK-NEXT: mls v0.8b, v1.8b, v2.8b
; CHECK-NEXT: ret
@@ -212,14 +212,14 @@ define <8 x i16> @fold_srem_v8i16(<8 x i16> %x) {
; CHECK-LABEL: fold_srem_v8i16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #26215 // =0x6667
+; CHECK-NEXT: movi v3.8h, #10
; CHECK-NEXT: dup v1.8h, w8
; CHECK-NEXT: smull2 v2.4s, v0.8h, v1.8h
; CHECK-NEXT: smull v1.4s, v0.4h, v1.4h
; CHECK-NEXT: uzp2 v1.8h, v1.8h, v2.8h
-; CHECK-NEXT: movi v2.8h, #10
-; CHECK-NEXT: sshr v1.8h, v1.8h, #2
-; CHECK-NEXT: usra v1.8h, v1.8h, #15
-; CHECK-NEXT: mls v0.8h, v1.8h, v2.8h
+; CHECK-NEXT: sshr v2.8h, v1.8h, #2
+; CHECK-NEXT: usra v2.8h, v1.8h, #15
+; CHECK-NEXT: mls v0.8h, v2.8h, v3.8h
; CHECK-NEXT: ret
%1 = srem <8 x i16> %x, <i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10>
ret <8 x i16> %1
diff --git a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll
index 3af858713525b..7e95f61604620 100644
--- a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll
@@ -356,9 +356,7 @@ define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind {
define <16 x i1> @v16i1(<16 x i1> %x, <16 x i1> %y) nounwind {
; CHECK-LABEL: v16i1:
; CHECK: // %bb.0:
-; CHECK-NEXT: movi v2.16b, #1
-; CHECK-NEXT: eor v1.16b, v1.16b, v2.16b
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: bic v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
%z = call <16 x i1> @llvm.ssub.sat.v16i1(<16 x i1> %x, <16 x i1> %y)
ret <16 x i1> %z
diff --git a/llvm/test/CodeGen/AArch64/sve-expand-div.ll b/llvm/test/CodeGen/AArch64/sve-expand-div.ll
index 180c64e0a7de1..bd6c72a3946c1 100644
--- a/llvm/test/CodeGen/AArch64/sve-expand-div.ll
+++ b/llvm/test/CodeGen/AArch64/sve-expand-div.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-linux-gnu < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -use-constant-int-for-scalable-splat < %s | FileCheck %s
; Check that expensive divides are expanded into a more performant sequence
diff --git a/llvm/test/CodeGen/AArch64/sve-sdiv-pow2.ll b/llvm/test/CodeGen/AArch64/sve-sdiv-pow2.ll
index 4607f225f81ea..a799b51f15cb1 100644
--- a/llvm/test/CodeGen/AArch64/sve-sdiv-pow2.ll
+++ b/llvm/test/CodeGen/AArch64/sve-sdiv-pow2.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s | FileCheck %s
+; RUN: llc -use-constant-int-for-scalable-splat < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-vector-compress.ll b/llvm/test/CodeGen/AArch64/sve-vector-compress.ll
index 8a504cd739211..944071b9d2161 100644
--- a/llvm/test/CodeGen/AArch64/sve-vector-compress.ll
+++ b/llvm/test/CodeGen/AArch64/sve-vector-compress.ll
@@ -176,10 +176,11 @@ define <1 x i32> @test_compress_v1i32_with_sve(<1 x i32> %vec, <1 x i1> %mask) {
; CHECK-LABEL: test_compress_v1i32_with_sve:
; CHECK: // %bb.0:
; CHECK-NEXT: movi v1.2d, #0000000000000000
-; CHECK-NEXT: sbfx w8, w0, #0, #1
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: ushll v0.2d, v0.2s, #0
-; CHECK-NEXT: mov v1.s[0], w8
+; CHECK-NEXT: mov v1.s[0], w0
+; CHECK-NEXT: shl v1.2s, v1.2s, #31
+; CHECK-NEXT: cmlt v1.2s, v1.2s, #0
; CHECK-NEXT: ushll v1.2d, v1.2s, #0
; CHECK-NEXT: and z1.d, z1.d, #0x1
; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0
diff --git a/llvm/test/CodeGen/AArch64/urem-vector-lkk.ll b/llvm/test/CodeGen/AArch64/urem-vector-lkk.ll
index 468a33ce5bfcf..bd7952a7992c6 100644
--- a/llvm/test/CodeGen/AArch64/urem-vector-lkk.ll
+++ b/llvm/test/CodeGen/AArch64/urem-vector-lkk.ll
@@ -186,8 +186,8 @@ define <8 x i8> @fold_urem_v8i8(<8 x i8> %x) {
; CHECK-NEXT: movi v1.8b, #205
; CHECK-NEXT: movi v2.8b, #10
; CHECK-NEXT: umull v1.8h, v0.8b, v1.8b
-; CHECK-NEXT: shrn v1.8b, v1.8h, #8
-; CHECK-NEXT: ushr v1.8b, v1.8b, #3
+; CHECK-NEXT: ushr v1.8h, v1.8h, #11
+; CHECK-NEXT: xtn v1.8b, v1.8h
; CHECK-NEXT: mls v0.8b, v1.8b, v2.8b
; CHECK-NEXT: ret
%1 = urem <8 x i8> %x, <i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10>
diff --git a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll
index a71cf95a728db..34d9294ac7f3c 100644
--- a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll
@@...
[truncated]
``````````
https://github.com/llvm/llvm-project/pull/143571
More information about the llvm-commits mailing list