[llvm] 1997073 - [LLVM][InstCombine][SVE] Refactor sve.mul/fmul combines. (#134116)

via llvm-commits llvm-commits at lists.llvm.org
Tue Apr 8 03:38:30 PDT 2025


Author: Paul Walker
Date: 2025-04-08T11:38:27+01:00
New Revision: 1997073a547528f94b11a111a2eae3676a216127

URL: https://github.com/llvm/llvm-project/commit/1997073a547528f94b11a111a2eae3676a216127
DIFF: https://github.com/llvm/llvm-project/commit/1997073a547528f94b11a111a2eae3676a216127.diff

LOG: [LLVM][InstCombine][SVE] Refactor sve.mul/fmul combines. (#134116)

After https://github.com/llvm/llvm-project/issues/126928 it is now
possible to rewrite the existing combines, which mostly handle only
cases where an operand is an identity value, to use the existing
simplify code and unlock general constant folding.
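
For example (a minimal sketch mirroring the new tests added below),
multiplying two constant splats now folds outright for the
undef-inactive-lanes form, and folds to a select that preserves the
inactive lanes for the merging form:

    ; sve.mul.u: inactive lanes are undefined, so the whole call folds away.
    %r = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.u.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> splat (i32 7), <vscale x 4 x i32> splat (i32 3))
    ; ==> ret <vscale x 4 x i32> splat (i32 21)

    ; sve.mul: inactive lanes take their value from the first data operand, so
    ; the constant fold is wrapped in a select on the governing predicate.
    %r = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> splat (i32 7), <vscale x 4 x i32> splat (i32 3))
    ; ==> %r = select <vscale x 4 x i1> %pg, <vscale x 4 x i32> splat (i32 21), <vscale x 4 x i32> splat (i32 7)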

Added: 
    llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-simplify-binop.ll

Modified: 
    llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
    llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fmul-idempotency.ll
    llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fmul_u-idempotency.ll
    llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-mul-idempotency.ll
    llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-mul_u-idempotency.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 417af74f712e7..234dfa07bdc1a 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1111,6 +1111,19 @@ struct SVEIntrinsicInfo {
     return *this;
   }
 
+  bool hasMatchingIROpcode() const { return IROpcode != 0; }
+
+  unsigned getMatchingIROpcode() const {
+    assert(hasMatchingIROpcode() && "Property not set!");
+    return IROpcode;
+  }
+
+  SVEIntrinsicInfo &setMatchingIROpcode(unsigned Opcode) {
+    assert(!hasMatchingIROpcode() && "Cannot set property twice!");
+    IROpcode = Opcode;
+    return *this;
+  }
+
   //
   // Properties relating to the result of inactive lanes.
   //
@@ -1186,6 +1199,7 @@ struct SVEIntrinsicInfo {
   unsigned GoverningPredicateIdx = std::numeric_limits<unsigned>::max();
 
   Intrinsic::ID UndefIntrinsic = Intrinsic::not_intrinsic;
+  unsigned IROpcode = 0;
 
   enum PredicationStyle {
     Uninitialized,
@@ -1269,7 +1283,8 @@ static SVEIntrinsicInfo constructSVEIntrinsicInfo(IntrinsicInst &II) {
   case Intrinsic::aarch64_sve_fmls:
     return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmls_u);
   case Intrinsic::aarch64_sve_fmul:
-    return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmul_u);
+    return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmul_u)
+        .setMatchingIROpcode(Instruction::FMul);
   case Intrinsic::aarch64_sve_fmulx:
     return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmulx_u);
   case Intrinsic::aarch64_sve_fnmla:
@@ -1285,7 +1300,8 @@ static SVEIntrinsicInfo constructSVEIntrinsicInfo(IntrinsicInst &II) {
   case Intrinsic::aarch64_sve_mls:
     return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mls_u);
   case Intrinsic::aarch64_sve_mul:
-    return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mul_u);
+    return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mul_u)
+        .setMatchingIROpcode(Instruction::Mul);
   case Intrinsic::aarch64_sve_sabd:
     return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sabd_u);
   case Intrinsic::aarch64_sve_smax:
@@ -1323,6 +1339,13 @@ static SVEIntrinsicInfo constructSVEIntrinsicInfo(IntrinsicInst &II) {
   case Intrinsic::aarch64_sve_uqsub:
     return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uqsub_u);
 
+  case Intrinsic::aarch64_sve_fmul_u:
+    return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
+        Instruction::FMul);
+  case Intrinsic::aarch64_sve_mul_u:
+    return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
+        Instruction::Mul);
+
   case Intrinsic::aarch64_sve_addqv:
   case Intrinsic::aarch64_sve_and_z:
   case Intrinsic::aarch64_sve_bic_z:
@@ -2205,45 +2228,63 @@ static std::optional<Instruction *> instCombineSVEVectorSub(InstCombiner &IC,
   return std::nullopt;
 }
 
-static std::optional<Instruction *> instCombineSVEVectorMul(InstCombiner &IC,
-                                                            IntrinsicInst &II) {
-  auto *OpPredicate = II.getOperand(0);
-  auto *OpMultiplicand = II.getOperand(1);
-  auto *OpMultiplier = II.getOperand(2);
+// Simplify `V` by only considering the operations that affect active lanes.
+// This function should only return existing Values or newly created Constants.
+static Value *stripInactiveLanes(Value *V, const Value *Pg) {
+  auto *Dup = dyn_cast<IntrinsicInst>(V);
+  if (Dup && Dup->getIntrinsicID() == Intrinsic::aarch64_sve_dup &&
+      Dup->getOperand(1) == Pg && isa<Constant>(Dup->getOperand(2)))
+    return ConstantVector::getSplat(
+        cast<VectorType>(V->getType())->getElementCount(),
+        cast<Constant>(Dup->getOperand(2)));
+
+  return V;
+}
 
-  // Return true if a given instruction is a unit splat value, false otherwise.
-  auto IsUnitSplat = [](auto *I) {
-    auto *SplatValue = getSplatValue(I);
-    if (!SplatValue)
-      return false;
-    return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
-  };
+static std::optional<Instruction *>
+instCombineSVEVectorMul(InstCombiner &IC, IntrinsicInst &II,
+                        const SVEIntrinsicInfo &IInfo) {
+  const unsigned Opc = IInfo.getMatchingIROpcode();
+  if (!Instruction::isBinaryOp(Opc))
+    return std::nullopt;
 
-  // Return true if a given instruction is an aarch64_sve_dup intrinsic call
-  // with a unit splat value, false otherwise.
-  auto IsUnitDup = [](auto *I) {
-    auto *IntrI = dyn_cast<IntrinsicInst>(I);
-    if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup)
-      return false;
+  Value *Pg = II.getOperand(0);
+  Value *Op1 = II.getOperand(1);
+  Value *Op2 = II.getOperand(2);
+  const DataLayout &DL = II.getDataLayout();
 
-    auto *SplatValue = IntrI->getOperand(2);
-    return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
-  };
+  // Canonicalise constants to the RHS.
+  if (Instruction::isCommutative(Opc) && IInfo.inactiveLanesAreNotDefined() &&
+      isa<Constant>(Op1) && !isa<Constant>(Op2)) {
+    IC.replaceOperand(II, 1, Op2);
+    IC.replaceOperand(II, 2, Op1);
+    return &II;
+  }
 
-  if (IsUnitSplat(OpMultiplier)) {
-    // [f]mul pg %n, (dupx 1) => %n
-    OpMultiplicand->takeName(&II);
-    return IC.replaceInstUsesWith(II, OpMultiplicand);
-  } else if (IsUnitDup(OpMultiplier)) {
-    // [f]mul pg %n, (dup pg 1) => %n
-    auto *DupInst = cast<IntrinsicInst>(OpMultiplier);
-    auto *DupPg = DupInst->getOperand(1);
-    // TODO: this is naive. The optimization is still valid if DupPg
-    // 'encompasses' OpPredicate, not only if they're the same predicate.
-    if (OpPredicate == DupPg) {
-      OpMultiplicand->takeName(&II);
-      return IC.replaceInstUsesWith(II, OpMultiplicand);
-    }
+  // Only active lanes matter when simplifying the operation.
+  Op1 = stripInactiveLanes(Op1, Pg);
+  Op2 = stripInactiveLanes(Op2, Pg);
+
+  Value *SimpleII;
+  if (auto FII = dyn_cast<FPMathOperator>(&II))
+    SimpleII = simplifyBinOp(Opc, Op1, Op2, FII->getFastMathFlags(), DL);
+  else
+    SimpleII = simplifyBinOp(Opc, Op1, Op2, DL);
+
+  if (SimpleII) {
+    if (IInfo.inactiveLanesAreNotDefined())
+      return IC.replaceInstUsesWith(II, SimpleII);
+
+    Value *Inactive =
+        II.getOperand(IInfo.getOperandIdxInactiveLanesTakenFrom());
+
+    // The intrinsic does nothing (e.g. sve.mul(pg, A, 1.0)).
+    if (SimpleII == Inactive)
+      return IC.replaceInstUsesWith(II, SimpleII);
+
+    // Inactive lanes must be preserved.
+    SimpleII = IC.Builder.CreateSelect(Pg, SimpleII, Inactive);
+    return IC.replaceInstUsesWith(II, SimpleII);
   }
 
   return instCombineSVEVectorBinOp(IC, II);
@@ -2650,9 +2691,9 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
   case Intrinsic::aarch64_sve_fadd_u:
     return instCombineSVEVectorFAddU(IC, II);
   case Intrinsic::aarch64_sve_fmul:
-    return instCombineSVEVectorMul(IC, II);
+    return instCombineSVEVectorMul(IC, II, IInfo);
   case Intrinsic::aarch64_sve_fmul_u:
-    return instCombineSVEVectorMul(IC, II);
+    return instCombineSVEVectorMul(IC, II, IInfo);
   case Intrinsic::aarch64_sve_fsub:
     return instCombineSVEVectorFSub(IC, II);
   case Intrinsic::aarch64_sve_fsub_u:
@@ -2664,9 +2705,9 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
                                              Intrinsic::aarch64_sve_mla_u>(
         IC, II, true);
   case Intrinsic::aarch64_sve_mul:
-    return instCombineSVEVectorMul(IC, II);
+    return instCombineSVEVectorMul(IC, II, IInfo);
   case Intrinsic::aarch64_sve_mul_u:
-    return instCombineSVEVectorMul(IC, II);
+    return instCombineSVEVectorMul(IC, II, IInfo);
   case Intrinsic::aarch64_sve_sub:
     return instCombineSVEVectorSub(IC, II);
   case Intrinsic::aarch64_sve_sub_u:

diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fmul-idempotency.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fmul-idempotency.ll
index f612e5a44ebba..3b37e2c1fddef 100644
--- a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fmul-idempotency.ll
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fmul-idempotency.ll
@@ -6,8 +6,8 @@ target triple = "aarch64-unknown-linux-gnu"
 ; Idempotent fmuls -- should compile to just a ret.
 define <vscale x 8 x half> @idempotent_fmul_f16(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a) #0 {
 ; CHECK-LABEL: define <vscale x 8 x half> @idempotent_fmul_f16(
-; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x half> [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT:    ret <vscale x 8 x half> [[TMP0]]
+; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x half> [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    ret <vscale x 8 x half> [[A]]
 ;
   %1 = call <vscale x 8 x half> @llvm.aarch64.sve.dup.x.nxv8f16(half 1.0)
   %2 = call <vscale x 8 x half> @llvm.aarch64.sve.fmul.nxv8f16(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a, <vscale x 8 x half> %1)
@@ -16,8 +16,8 @@ define <vscale x 8 x half> @idempotent_fmul_f16(<vscale x 8 x i1> %pg, <vscale x
 
 define <vscale x 4 x float> @idempotent_fmul_f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a) #0 {
 ; CHECK-LABEL: define <vscale x 4 x float> @idempotent_fmul_f32(
-; CHECK-SAME: <vscale x 4 x i1> [[PG:%.*]], <vscale x 4 x float> [[TMP0:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP0]]
+; CHECK-SAME: <vscale x 4 x i1> [[PG:%.*]], <vscale x 4 x float> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    ret <vscale x 4 x float> [[A]]
 ;
   %1 = call <vscale x 4 x float> @llvm.aarch64.sve.dup.x.nxv4f32(float 1.0)
   %2 = call <vscale x 4 x float> @llvm.aarch64.sve.fmul.nxv4f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a, <vscale x 4 x float> %1)
@@ -26,8 +26,8 @@ define <vscale x 4 x float> @idempotent_fmul_f32(<vscale x 4 x i1> %pg, <vscale
 
 define <vscale x 2 x double> @idempotent_fmul_f64(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a) #0 {
 ; CHECK-LABEL: define <vscale x 2 x double> @idempotent_fmul_f64(
-; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x double> [[TMP0:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP0]]
+; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x double> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    ret <vscale x 2 x double> [[A]]
 ;
   %1 = call <vscale x 2 x double> @llvm.aarch64.sve.dup.x.nxv2f64(double 1.0)
   %2 = call <vscale x 2 x double> @llvm.aarch64.sve.fmul.nxv2f64(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a, <vscale x 2 x double> %1)
@@ -37,7 +37,7 @@ define <vscale x 2 x double> @idempotent_fmul_f64(<vscale x 2 x i1> %pg, <vscale
 define <vscale x 2 x double> @idempotent_fmul_different_argument_order(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a) #0 {
 ; CHECK-LABEL: define <vscale x 2 x double> @idempotent_fmul_different_argument_order(
 ; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x double> [[A:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.fmul.nxv2f64(<vscale x 2 x i1> [[PG]], <vscale x 2 x double> splat (double 1.000000e+00), <vscale x 2 x double> [[A]])
+; CHECK-NEXT:    [[TMP1:%.*]] = select <vscale x 2 x i1> [[PG]], <vscale x 2 x double> [[A]], <vscale x 2 x double> splat (double 1.000000e+00)
 ; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
 ;
   %1 = call <vscale x 2 x double> @llvm.aarch64.sve.dup.x.nxv2f64(double 1.0)
@@ -48,8 +48,8 @@ define <vscale x 2 x double> @idempotent_fmul_different_argument_order(<vscale x
 
 define <vscale x 8 x half> @idempotent_fmul_with_predicated_dup(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a) #0 {
 ; CHECK-LABEL: define <vscale x 8 x half> @idempotent_fmul_with_predicated_dup(
-; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x half> [[TMP0:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    ret <vscale x 8 x half> [[TMP0]]
+; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x half> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    ret <vscale x 8 x half> [[A]]
 ;
   %1 = call <vscale x 8 x half> @llvm.aarch64.sve.dup.nxv8f16(<vscale x 8 x half> poison, <vscale x 8 x i1> %pg, half 1.0)
   %2 = call <vscale x 8 x half> @llvm.aarch64.sve.fmul.nxv8f16(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a, <vscale x 8 x half> %1)

diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fmul_u-idempotency.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fmul_u-idempotency.ll
index bd3d7be0a1b80..38ed4272d826c 100644
--- a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fmul_u-idempotency.ll
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fmul_u-idempotency.ll
@@ -36,9 +36,8 @@ define <vscale x 2 x double> @idempotent_fmul_u_f64(<vscale x 2 x i1> %pg, <vsca
 
 define <vscale x 2 x double> @idempotent_fmul_u_different_argument_order(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a) #0 {
 ; CHECK-LABEL: define <vscale x 2 x double> @idempotent_fmul_u_different_argument_order(
-; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x double> [[A:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.fmul.u.nxv2f64(<vscale x 2 x i1> [[PG]], <vscale x 2 x double> splat (double 1.000000e+00), <vscale x 2 x double> [[A]])
-; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
+; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x double> [[TMP0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP0]]
 ;
   %1 = call <vscale x 2 x double> @llvm.aarch64.sve.dup.x.nxv2f64(double 1.0)
   ; Different argument order to the above tests.

diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-mul-idempotency.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-mul-idempotency.ll
index cbdcfc6b110b3..602db4eb1d429 100644
--- a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-mul-idempotency.ll
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-mul-idempotency.ll
@@ -6,8 +6,8 @@ target triple = "aarch64-unknown-linux-gnu"
 ; Idempotent muls -- should compile to just a ret.
 define <vscale x 8 x i16> @idempotent_mul_i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) #0 {
 ; CHECK-LABEL: define <vscale x 8 x i16> @idempotent_mul_i16(
-; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x i16> [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
+; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x i16> [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    ret <vscale x 8 x i16> [[A]]
 ;
   %1 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 1)
   %2 = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> %1)
@@ -16,8 +16,8 @@ define <vscale x 8 x i16> @idempotent_mul_i16(<vscale x 8 x i1> %pg, <vscale x 8
 
 define <vscale x 4 x i32> @idempotent_mul_i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a) #0 {
 ; CHECK-LABEL: define <vscale x 4 x i32> @idempotent_mul_i32(
-; CHECK-SAME: <vscale x 4 x i1> [[PG:%.*]], <vscale x 4 x i32> [[TMP0:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP0]]
+; CHECK-SAME: <vscale x 4 x i1> [[PG:%.*]], <vscale x 4 x i32> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[A]]
 ;
   %1 = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
   %2 = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %1)
@@ -26,8 +26,8 @@ define <vscale x 4 x i32> @idempotent_mul_i32(<vscale x 4 x i1> %pg, <vscale x 4
 
 define <vscale x 2 x i64> @idempotent_mul_i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) #0 {
 ; CHECK-LABEL: define <vscale x 2 x i64> @idempotent_mul_i64(
-; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x i64> [[TMP0:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP0]]
+; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x i64> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    ret <vscale x 2 x i64> [[A]]
 ;
   %1 = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 1)
   %2 = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> %1)
@@ -37,7 +37,7 @@ define <vscale x 2 x i64> @idempotent_mul_i64(<vscale x 2 x i1> %pg, <vscale x 2
 define <vscale x 2 x i64> @idempotent_mul_different_argument_order(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) #0 {
 ; CHECK-LABEL: define <vscale x 2 x i64> @idempotent_mul_different_argument_order(
 ; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x i64> [[A:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.nxv2i64(<vscale x 2 x i1> [[PG]], <vscale x 2 x i64> splat (i64 1), <vscale x 2 x i64> [[A]])
+; CHECK-NEXT:    [[TMP1:%.*]] = select <vscale x 2 x i1> [[PG]], <vscale x 2 x i64> [[A]], <vscale x 2 x i64> splat (i64 1)
 ; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 ;
   %1 = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 1)
@@ -48,8 +48,8 @@ define <vscale x 2 x i64> @idempotent_mul_different_argument_order(<vscale x 2 x
 
 define <vscale x 8 x i16> @idempotent_mul_with_predicated_dup(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) #0 {
 ; CHECK-LABEL: define <vscale x 8 x i16> @idempotent_mul_with_predicated_dup(
-; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x i16> [[TMP0:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
+; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x i16> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    ret <vscale x 8 x i16> [[A]]
 ;
   %1 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.nxv8i16(<vscale x 8 x i16> poison, <vscale x 8 x i1> %pg, i16 1)
   %2 = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> %1)

diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-mul_u-idempotency.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-mul_u-idempotency.ll
index 8144e56b979f0..e899c787aa555 100644
--- a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-mul_u-idempotency.ll
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-mul_u-idempotency.ll
@@ -36,9 +36,8 @@ define <vscale x 2 x i64> @idempotent_mul_u_i64(<vscale x 2 x i1> %pg, <vscale x
 
 define <vscale x 2 x i64> @idempotent_mul_u_different_argument_order(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) #0 {
 ; CHECK-LABEL: define <vscale x 2 x i64> @idempotent_mul_u_different_argument_order(
-; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x i64> [[A:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.u.nxv2i64(<vscale x 2 x i1> [[PG]], <vscale x 2 x i64> splat (i64 1), <vscale x 2 x i64> [[A]])
-; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
+; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x i64> [[TMP0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP0]]
 ;
   %1 = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 1)
   ; Different argument order to the above tests.

diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-simplify-binop.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-simplify-binop.ll
new file mode 100644
index 0000000000000..7da55a199df28
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-simplify-binop.ll
@@ -0,0 +1,146 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=instcombine < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; The following tests verify the mechanics of simplification. The operation is not
+; important beyond being commutative with a known identity value.
+
+define <vscale x 4 x i32> @commute_constant_to_rhs(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @commute_constant_to_rhs(
+; CHECK-SAME: <vscale x 4 x i1> [[PG:%.*]], <vscale x 4 x i32> [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[R:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.u.nxv4i32(<vscale x 4 x i1> [[PG]], <vscale x 4 x i32> [[A]], <vscale x 4 x i32> splat (i32 303))
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[R]]
+;
+  %r = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.u.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> splat (i32 303), <vscale x 4 x i32> %a)
+  ret <vscale x 4 x i32> %r
+}
+
+; Inactive lanes are important, which makes the operation non-commutative.
+define <vscale x 4 x i32> @cannot_commute_constant_to_rhs(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @cannot_commute_constant_to_rhs(
+; CHECK-SAME: <vscale x 4 x i1> [[PG:%.*]], <vscale x 4 x i32> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[R:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1> [[PG]], <vscale x 4 x i32> splat (i32 303), <vscale x 4 x i32> [[A]])
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[R]]
+;
+  %r = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> splat (i32 303), <vscale x 4 x i32> %a)
+  ret <vscale x 4 x i32> %r
+}
+
+define <vscale x 4 x i32> @idempotent_mul(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @idempotent_mul(
+; CHECK-SAME: <vscale x 4 x i1> [[PG:%.*]], <vscale x 4 x i32> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[A]]
+;
+  %r = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> splat (i32 1))
+  ret <vscale x 4 x i32> %r
+}
+
+define <vscale x 4 x i32> @idempotent_mul_ops_reverse(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @idempotent_mul_ops_reverse(
+; CHECK-SAME: <vscale x 4 x i1> [[PG:%.*]], <vscale x 4 x i32> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[R:%.*]] = select <vscale x 4 x i1> [[PG]], <vscale x 4 x i32> [[A]], <vscale x 4 x i32> splat (i32 1)
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[R]]
+;
+  %r = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> splat (i32 1), <vscale x 4 x i32> %a)
+  ret <vscale x 4 x i32> %r
+}
+
+define <vscale x 4 x i32> @idempotent_mul_u(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @idempotent_mul_u(
+; CHECK-SAME: <vscale x 4 x i1> [[PG:%.*]], <vscale x 4 x i32> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[A]]
+;
+  %r = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.u.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> splat (i32 1))
+  ret <vscale x 4 x i32> %r
+}
+
+define <vscale x 4 x i32> @idempotent_mul_u_ops_reverse(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @idempotent_mul_u_ops_reverse(
+; CHECK-SAME: <vscale x 4 x i1> [[PG:%.*]], <vscale x 4 x i32> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[A]]
+;
+  %r = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.u.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> splat (i32 1), <vscale x 4 x i32> %a)
+  ret <vscale x 4 x i32> %r
+}
+
+; Show that we only need to know the active lanes are constant.
+; TODO: We can do better here: %a could be used directly in the select
+; because we know only its inactive lanes will be used.
+define <vscale x 4 x i32> @constant_mul_after_striping_inactive_lanes(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @constant_mul_after_striping_inactive_lanes(
+; CHECK-SAME: <vscale x 4 x i1> [[PG:%.*]], <vscale x 4 x i32> [[A:%.*]], <vscale x 4 x i32> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[A_DUP:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.nxv4i32(<vscale x 4 x i32> [[A]], <vscale x 4 x i1> [[PG]], i32 3)
+; CHECK-NEXT:    [[R:%.*]] = select <vscale x 4 x i1> [[PG]], <vscale x 4 x i32> splat (i32 6), <vscale x 4 x i32> [[A_DUP]]
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[R]]
+;
+  %a.dup = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i1> %pg, i32 3)
+  %b.dup = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.nxv4i32(<vscale x 4 x i32> %b, <vscale x 4 x i1> %pg, i32 2)
+  %r = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a.dup, <vscale x 4 x i32> %b.dup)
+  ret <vscale x 4 x i32> %r
+}
+
+; Show that we only need to know the active lanes are constant.
+define <vscale x 4 x i32> @constant_mul_u_after_striping_inactive_lanes(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @constant_mul_u_after_striping_inactive_lanes(
+; CHECK-SAME: <vscale x 4 x i1> [[PG:%.*]], <vscale x 4 x i32> [[A:%.*]], <vscale x 4 x i32> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    ret <vscale x 4 x i32> splat (i32 6)
+;
+  %a.dup = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i1> %pg, i32 3)
+  %b.dup = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.nxv4i32(<vscale x 4 x i32> %b, <vscale x 4 x i1> %pg, i32 2)
+  %3 = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.u.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a.dup, <vscale x 4 x i32> %b.dup)
+  ret <vscale x 4 x i32> %3
+}
+
+; The following tests demonstrate the operations for which hooks are in place to
+; enable simplification. Given the simplifications themselves are common code, it
+; is assumed they are already well tested elsewhere.
+
+define <vscale x 4 x float> @constant_fmul(<vscale x 4 x i1> %pg) #0 {
+; CHECK-LABEL: define <vscale x 4 x float> @constant_fmul(
+; CHECK-SAME: <vscale x 4 x i1> [[PG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[R:%.*]] = select <vscale x 4 x i1> [[PG]], <vscale x 4 x float> splat (float 4.200000e+01), <vscale x 4 x float> splat (float 7.000000e+00)
+; CHECK-NEXT:    ret <vscale x 4 x float> [[R]]
+;
+  %r = call <vscale x 4 x float> @llvm.aarch64.sve.fmul.nxv4f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> splat (float 7.0), <vscale x 4 x float> splat (float 6.0))
+  ret <vscale x 4 x float> %r
+}
+
+define <vscale x 4 x float> @constant_fmul_u(<vscale x 4 x i1> %pg) #0 {
+; CHECK-LABEL: define <vscale x 4 x float> @constant_fmul_u(
+; CHECK-SAME: <vscale x 4 x i1> [[PG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    ret <vscale x 4 x float> splat (float 4.200000e+01)
+;
+  %r = call <vscale x 4 x float> @llvm.aarch64.sve.fmul.u.nxv4f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> splat (float 7.0), <vscale x 4 x float> splat (float 6.0))
+  ret <vscale x 4 x float> %r
+}
+
+define <vscale x 4 x i32> @constant_mul(<vscale x 4 x i1> %pg) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @constant_mul(
+; CHECK-SAME: <vscale x 4 x i1> [[PG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[R:%.*]] = select <vscale x 4 x i1> [[PG]], <vscale x 4 x i32> splat (i32 21), <vscale x 4 x i32> splat (i32 7)
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[R]]
+;
+  %r = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> splat (i32 7), <vscale x 4 x i32> splat (i32 3))
+  ret <vscale x 4 x i32> %r
+}
+
+define <vscale x 4 x i32> @constant_mul_u(<vscale x 4 x i1> %pg) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @constant_mul_u(
+; CHECK-SAME: <vscale x 4 x i1> [[PG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    ret <vscale x 4 x i32> splat (i32 21)
+;
+  %r = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.u.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> splat (i32 7), <vscale x 4 x i32> splat (i32 3))
+  ret <vscale x 4 x i32> %r
+}
+
+
+; Repeat only the constant-fold tests for fmul(.u).
+
+declare <vscale x 4 x i32> @llvm.aarch64.sve.dup.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i1>, i32)
+
+declare <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 4 x i32>)
+
+declare <vscale x 4 x i32> @llvm.aarch64.sve.mul.u.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 4 x i32>)
+
+attributes #0 = { "target-features"="+sve" }


        

