[llvm] 1997073 - [LLVM][InstCombine][SVE] Refactor sve.mul/fmul combines. (#134116)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Apr 8 03:38:30 PDT 2025
Author: Paul Walker
Date: 2025-04-08T11:38:27+01:00
New Revision: 1997073a547528f94b11a111a2eae3676a216127
URL: https://github.com/llvm/llvm-project/commit/1997073a547528f94b11a111a2eae3676a216127
DIFF: https://github.com/llvm/llvm-project/commit/1997073a547528f94b11a111a2eae3676a216127.diff
LOG: [LLVM][InstCombine][SVE] Refactor sve.mul/fmul combines. (#134116)
After https://github.com/llvm/llvm-project/issues/126928 it's now
possible to rewrite the existing combines, which mostly only handle
cases where an operand is an identity value, to use the existing
simplify code to unlock general constant folding.
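To illustrate, a minimal sketch drawn from the new tests added below: a
multiply whose active lanes are all constant is now folded by the common
simplify code, instead of only the identity-operand cases being recognised.
  ; Input: a predicated multiply of two constant splats.
  %r = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.u.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> splat (i32 7), <vscale x 4 x i32> splat (i32 3))
  ; After instcombine the call folds to a constant.
  ret <vscale x 4 x i32> splat (i32 21)
  ; For the merging form (sve.mul), the fold preserves inactive lanes with a
  ; select: select <vscale x 4 x i1> %pg, <vscale x 4 x i32> splat (i32 21), <vscale x 4 x i32> splat (i32 7)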
Added:
llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-simplify-binop.ll
Modified:
llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fmul-idempotency.ll
llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fmul_u-idempotency.ll
llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-mul-idempotency.ll
llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-mul_u-idempotency.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 417af74f712e7..234dfa07bdc1a 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1111,6 +1111,19 @@ struct SVEIntrinsicInfo {
return *this;
}
+ bool hasMatchingIROpcode() const { return IROpcode != 0; }
+
+ unsigned getMatchingIROpcode() const {
+ assert(hasMatchingIROpcode() && "Property not set!");
+ return IROpcode;
+ }
+
+ SVEIntrinsicInfo &setMatchingIROpcode(unsigned Opcode) {
+ assert(!hasMatchingIROpcode() && "Cannot set property twice!");
+ IROpcode = Opcode;
+ return *this;
+ }
+
//
// Properties relating to the result of inactive lanes.
//
@@ -1186,6 +1199,7 @@ struct SVEIntrinsicInfo {
unsigned GoverningPredicateIdx = std::numeric_limits<unsigned>::max();
Intrinsic::ID UndefIntrinsic = Intrinsic::not_intrinsic;
+ unsigned IROpcode = 0;
enum PredicationStyle {
Uninitialized,
@@ -1269,7 +1283,8 @@ static SVEIntrinsicInfo constructSVEIntrinsicInfo(IntrinsicInst &II) {
case Intrinsic::aarch64_sve_fmls:
return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmls_u);
case Intrinsic::aarch64_sve_fmul:
- return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmul_u);
+ return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmul_u)
+ .setMatchingIROpcode(Instruction::FMul);
case Intrinsic::aarch64_sve_fmulx:
return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmulx_u);
case Intrinsic::aarch64_sve_fnmla:
@@ -1285,7 +1300,8 @@ static SVEIntrinsicInfo constructSVEIntrinsicInfo(IntrinsicInst &II) {
case Intrinsic::aarch64_sve_mls:
return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mls_u);
case Intrinsic::aarch64_sve_mul:
- return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mul_u);
+ return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mul_u)
+ .setMatchingIROpcode(Instruction::Mul);
case Intrinsic::aarch64_sve_sabd:
return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sabd_u);
case Intrinsic::aarch64_sve_smax:
@@ -1323,6 +1339,13 @@ static SVEIntrinsicInfo constructSVEIntrinsicInfo(IntrinsicInst &II) {
case Intrinsic::aarch64_sve_uqsub:
return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uqsub_u);
+ case Intrinsic::aarch64_sve_fmul_u:
+ return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
+ Instruction::FMul);
+ case Intrinsic::aarch64_sve_mul_u:
+ return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
+ Instruction::Mul);
+
case Intrinsic::aarch64_sve_addqv:
case Intrinsic::aarch64_sve_and_z:
case Intrinsic::aarch64_sve_bic_z:
@@ -2205,45 +2228,63 @@ static std::optional<Instruction *> instCombineSVEVectorSub(InstCombiner &IC,
return std::nullopt;
}
-static std::optional<Instruction *> instCombineSVEVectorMul(InstCombiner &IC,
- IntrinsicInst &II) {
- auto *OpPredicate = II.getOperand(0);
- auto *OpMultiplicand = II.getOperand(1);
- auto *OpMultiplier = II.getOperand(2);
+// Simplify `V` by only considering the operations that affect active lanes.
+// This function should only return existing Values or newly created Constants.
+static Value *stripInactiveLanes(Value *V, const Value *Pg) {
+ auto *Dup = dyn_cast<IntrinsicInst>(V);
+ if (Dup && Dup->getIntrinsicID() == Intrinsic::aarch64_sve_dup &&
+ Dup->getOperand(1) == Pg && isa<Constant>(Dup->getOperand(2)))
+ return ConstantVector::getSplat(
+ cast<VectorType>(V->getType())->getElementCount(),
+ cast<Constant>(Dup->getOperand(2)));
+
+ return V;
+}
- // Return true if a given instruction is a unit splat value, false otherwise.
- auto IsUnitSplat = [](auto *I) {
- auto *SplatValue = getSplatValue(I);
- if (!SplatValue)
- return false;
- return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
- };
+static std::optional<Instruction *>
+instCombineSVEVectorMul(InstCombiner &IC, IntrinsicInst &II,
+ const SVEIntrinsicInfo &IInfo) {
+ const unsigned Opc = IInfo.getMatchingIROpcode();
+ if (!Instruction::isBinaryOp(Opc))
+ return std::nullopt;
- // Return true if a given instruction is an aarch64_sve_dup intrinsic call
- // with a unit splat value, false otherwise.
- auto IsUnitDup = [](auto *I) {
- auto *IntrI = dyn_cast<IntrinsicInst>(I);
- if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup)
- return false;
+ Value *Pg = II.getOperand(0);
+ Value *Op1 = II.getOperand(1);
+ Value *Op2 = II.getOperand(2);
+ const DataLayout &DL = II.getDataLayout();
- auto *SplatValue = IntrI->getOperand(2);
- return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
- };
+ // Canonicalise constants to the RHS.
+ if (Instruction::isCommutative(Opc) && IInfo.inactiveLanesAreNotDefined() &&
+ isa<Constant>(Op1) && !isa<Constant>(Op2)) {
+ IC.replaceOperand(II, 1, Op2);
+ IC.replaceOperand(II, 2, Op1);
+ return &II;
+ }
- if (IsUnitSplat(OpMultiplier)) {
- // [f]mul pg %n, (dupx 1) => %n
- OpMultiplicand->takeName(&II);
- return IC.replaceInstUsesWith(II, OpMultiplicand);
- } else if (IsUnitDup(OpMultiplier)) {
- // [f]mul pg %n, (dup pg 1) => %n
- auto *DupInst = cast<IntrinsicInst>(OpMultiplier);
- auto *DupPg = DupInst->getOperand(1);
- // TODO: this is naive. The optimization is still valid if DupPg
- // 'encompasses' OpPredicate, not only if they're the same predicate.
- if (OpPredicate == DupPg) {
- OpMultiplicand->takeName(&II);
- return IC.replaceInstUsesWith(II, OpMultiplicand);
- }
+ // Only active lanes matter when simplifying the operation.
+ Op1 = stripInactiveLanes(Op1, Pg);
+ Op2 = stripInactiveLanes(Op2, Pg);
+
+ Value *SimpleII;
+ if (auto FII = dyn_cast<FPMathOperator>(&II))
+ SimpleII = simplifyBinOp(Opc, Op1, Op2, FII->getFastMathFlags(), DL);
+ else
+ SimpleII = simplifyBinOp(Opc, Op1, Op2, DL);
+
+ if (SimpleII) {
+ if (IInfo.inactiveLanesAreNotDefined())
+ return IC.replaceInstUsesWith(II, SimpleII);
+
+ Value *Inactive =
+ II.getOperand(IInfo.getOperandIdxInactiveLanesTakenFrom());
+
+ // The intrinsic does nothing (e.g. sve.mul(pg, A, 1.0)).
+ if (SimpleII == Inactive)
+ return IC.replaceInstUsesWith(II, SimpleII);
+
+ // Inactive lanes must be preserved.
+ SimpleII = IC.Builder.CreateSelect(Pg, SimpleII, Inactive);
+ return IC.replaceInstUsesWith(II, SimpleII);
}
return instCombineSVEVectorBinOp(IC, II);
@@ -2650,9 +2691,9 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
case Intrinsic::aarch64_sve_fadd_u:
return instCombineSVEVectorFAddU(IC, II);
case Intrinsic::aarch64_sve_fmul:
- return instCombineSVEVectorMul(IC, II);
+ return instCombineSVEVectorMul(IC, II, IInfo);
case Intrinsic::aarch64_sve_fmul_u:
- return instCombineSVEVectorMul(IC, II);
+ return instCombineSVEVectorMul(IC, II, IInfo);
case Intrinsic::aarch64_sve_fsub:
return instCombineSVEVectorFSub(IC, II);
case Intrinsic::aarch64_sve_fsub_u:
@@ -2664,9 +2705,9 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
Intrinsic::aarch64_sve_mla_u>(
IC, II, true);
case Intrinsic::aarch64_sve_mul:
- return instCombineSVEVectorMul(IC, II);
+ return instCombineSVEVectorMul(IC, II, IInfo);
case Intrinsic::aarch64_sve_mul_u:
- return instCombineSVEVectorMul(IC, II);
+ return instCombineSVEVectorMul(IC, II, IInfo);
case Intrinsic::aarch64_sve_sub:
return instCombineSVEVectorSub(IC, II);
case Intrinsic::aarch64_sve_sub_u:
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fmul-idempotency.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fmul-idempotency.ll
index f612e5a44ebba..3b37e2c1fddef 100644
--- a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fmul-idempotency.ll
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fmul-idempotency.ll
@@ -6,8 +6,8 @@ target triple = "aarch64-unknown-linux-gnu"
; Idempotent fmuls -- should compile to just a ret.
define <vscale x 8 x half> @idempotent_fmul_f16(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a) #0 {
; CHECK-LABEL: define <vscale x 8 x half> @idempotent_fmul_f16(
-; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x half> [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: ret <vscale x 8 x half> [[TMP0]]
+; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x half> [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: ret <vscale x 8 x half> [[A]]
;
%1 = call <vscale x 8 x half> @llvm.aarch64.sve.dup.x.nxv8f16(half 1.0)
%2 = call <vscale x 8 x half> @llvm.aarch64.sve.fmul.nxv8f16(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a, <vscale x 8 x half> %1)
@@ -16,8 +16,8 @@ define <vscale x 8 x half> @idempotent_fmul_f16(<vscale x 8 x i1> %pg, <vscale x
define <vscale x 4 x float> @idempotent_fmul_f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a) #0 {
; CHECK-LABEL: define <vscale x 4 x float> @idempotent_fmul_f32(
-; CHECK-SAME: <vscale x 4 x i1> [[PG:%.*]], <vscale x 4 x float> [[TMP0:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: ret <vscale x 4 x float> [[TMP0]]
+; CHECK-SAME: <vscale x 4 x i1> [[PG:%.*]], <vscale x 4 x float> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: ret <vscale x 4 x float> [[A]]
;
%1 = call <vscale x 4 x float> @llvm.aarch64.sve.dup.x.nxv4f32(float 1.0)
%2 = call <vscale x 4 x float> @llvm.aarch64.sve.fmul.nxv4f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a, <vscale x 4 x float> %1)
@@ -26,8 +26,8 @@ define <vscale x 4 x float> @idempotent_fmul_f32(<vscale x 4 x i1> %pg, <vscale
define <vscale x 2 x double> @idempotent_fmul_f64(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a) #0 {
; CHECK-LABEL: define <vscale x 2 x double> @idempotent_fmul_f64(
-; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x double> [[TMP0:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: ret <vscale x 2 x double> [[TMP0]]
+; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x double> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: ret <vscale x 2 x double> [[A]]
;
%1 = call <vscale x 2 x double> @llvm.aarch64.sve.dup.x.nxv2f64(double 1.0)
%2 = call <vscale x 2 x double> @llvm.aarch64.sve.fmul.nxv2f64(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a, <vscale x 2 x double> %1)
@@ -37,7 +37,7 @@ define <vscale x 2 x double> @idempotent_fmul_f64(<vscale x 2 x i1> %pg, <vscale
define <vscale x 2 x double> @idempotent_fmul_different_argument_order(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a) #0 {
; CHECK-LABEL: define <vscale x 2 x double> @idempotent_fmul_different_argument_order(
; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x double> [[A:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.fmul.nxv2f64(<vscale x 2 x i1> [[PG]], <vscale x 2 x double> splat (double 1.000000e+00), <vscale x 2 x double> [[A]])
+; CHECK-NEXT: [[TMP1:%.*]] = select <vscale x 2 x i1> [[PG]], <vscale x 2 x double> [[A]], <vscale x 2 x double> splat (double 1.000000e+00)
; CHECK-NEXT: ret <vscale x 2 x double> [[TMP1]]
;
%1 = call <vscale x 2 x double> @llvm.aarch64.sve.dup.x.nxv2f64(double 1.0)
@@ -48,8 +48,8 @@ define <vscale x 2 x double> @idempotent_fmul_different_argument_order(<vscale x
define <vscale x 8 x half> @idempotent_fmul_with_predicated_dup(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a) #0 {
; CHECK-LABEL: define <vscale x 8 x half> @idempotent_fmul_with_predicated_dup(
-; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x half> [[TMP0:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: ret <vscale x 8 x half> [[TMP0]]
+; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x half> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: ret <vscale x 8 x half> [[A]]
;
%1 = call <vscale x 8 x half> @llvm.aarch64.sve.dup.nxv8f16(<vscale x 8 x half> poison, <vscale x 8 x i1> %pg, half 1.0)
%2 = call <vscale x 8 x half> @llvm.aarch64.sve.fmul.nxv8f16(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a, <vscale x 8 x half> %1)
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fmul_u-idempotency.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fmul_u-idempotency.ll
index bd3d7be0a1b80..38ed4272d826c 100644
--- a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fmul_u-idempotency.ll
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fmul_u-idempotency.ll
@@ -36,9 +36,8 @@ define <vscale x 2 x double> @idempotent_fmul_u_f64(<vscale x 2 x i1> %pg, <vsca
define <vscale x 2 x double> @idempotent_fmul_u_different_argument_order(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a) #0 {
; CHECK-LABEL: define <vscale x 2 x double> @idempotent_fmul_u_different_argument_order(
-; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x double> [[A:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.fmul.u.nxv2f64(<vscale x 2 x i1> [[PG]], <vscale x 2 x double> splat (double 1.000000e+00), <vscale x 2 x double> [[A]])
-; CHECK-NEXT: ret <vscale x 2 x double> [[TMP1]]
+; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x double> [[TMP0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: ret <vscale x 2 x double> [[TMP0]]
;
%1 = call <vscale x 2 x double> @llvm.aarch64.sve.dup.x.nxv2f64(double 1.0)
; Different argument order to the above tests.
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-mul-idempotency.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-mul-idempotency.ll
index cbdcfc6b110b3..602db4eb1d429 100644
--- a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-mul-idempotency.ll
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-mul-idempotency.ll
@@ -6,8 +6,8 @@ target triple = "aarch64-unknown-linux-gnu"
; Idempotent muls -- should compile to just a ret.
define <vscale x 8 x i16> @idempotent_mul_i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) #0 {
; CHECK-LABEL: define <vscale x 8 x i16> @idempotent_mul_i16(
-; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x i16> [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
+; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x i16> [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: ret <vscale x 8 x i16> [[A]]
;
%1 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 1)
%2 = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> %1)
@@ -16,8 +16,8 @@ define <vscale x 8 x i16> @idempotent_mul_i16(<vscale x 8 x i1> %pg, <vscale x 8
define <vscale x 4 x i32> @idempotent_mul_i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a) #0 {
; CHECK-LABEL: define <vscale x 4 x i32> @idempotent_mul_i32(
-; CHECK-SAME: <vscale x 4 x i1> [[PG:%.*]], <vscale x 4 x i32> [[TMP0:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: ret <vscale x 4 x i32> [[TMP0]]
+; CHECK-SAME: <vscale x 4 x i1> [[PG:%.*]], <vscale x 4 x i32> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: ret <vscale x 4 x i32> [[A]]
;
%1 = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
%2 = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %1)
@@ -26,8 +26,8 @@ define <vscale x 4 x i32> @idempotent_mul_i32(<vscale x 4 x i1> %pg, <vscale x 4
define <vscale x 2 x i64> @idempotent_mul_i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) #0 {
; CHECK-LABEL: define <vscale x 2 x i64> @idempotent_mul_i64(
-; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x i64> [[TMP0:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: ret <vscale x 2 x i64> [[TMP0]]
+; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x i64> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: ret <vscale x 2 x i64> [[A]]
;
%1 = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 1)
%2 = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> %1)
@@ -37,7 +37,7 @@ define <vscale x 2 x i64> @idempotent_mul_i64(<vscale x 2 x i1> %pg, <vscale x 2
define <vscale x 2 x i64> @idempotent_mul_different_argument_order(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) #0 {
; CHECK-LABEL: define <vscale x 2 x i64> @idempotent_mul_different_argument_order(
; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x i64> [[A:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.nxv2i64(<vscale x 2 x i1> [[PG]], <vscale x 2 x i64> splat (i64 1), <vscale x 2 x i64> [[A]])
+; CHECK-NEXT: [[TMP1:%.*]] = select <vscale x 2 x i1> [[PG]], <vscale x 2 x i64> [[A]], <vscale x 2 x i64> splat (i64 1)
; CHECK-NEXT: ret <vscale x 2 x i64> [[TMP1]]
;
%1 = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 1)
@@ -48,8 +48,8 @@ define <vscale x 2 x i64> @idempotent_mul_different_argument_order(<vscale x 2 x
define <vscale x 8 x i16> @idempotent_mul_with_predicated_dup(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) #0 {
; CHECK-LABEL: define <vscale x 8 x i16> @idempotent_mul_with_predicated_dup(
-; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x i16> [[TMP0:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
+; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x i16> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: ret <vscale x 8 x i16> [[A]]
;
%1 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.nxv8i16(<vscale x 8 x i16> poison, <vscale x 8 x i1> %pg, i16 1)
%2 = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> %1)
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-mul_u-idempotency.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-mul_u-idempotency.ll
index 8144e56b979f0..e899c787aa555 100644
--- a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-mul_u-idempotency.ll
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-mul_u-idempotency.ll
@@ -36,9 +36,8 @@ define <vscale x 2 x i64> @idempotent_mul_u_i64(<vscale x 2 x i1> %pg, <vscale x
define <vscale x 2 x i64> @idempotent_mul_u_different_argument_order(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) #0 {
; CHECK-LABEL: define <vscale x 2 x i64> @idempotent_mul_u_different_argument_order(
-; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x i64> [[A:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.u.nxv2i64(<vscale x 2 x i1> [[PG]], <vscale x 2 x i64> splat (i64 1), <vscale x 2 x i64> [[A]])
-; CHECK-NEXT: ret <vscale x 2 x i64> [[TMP1]]
+; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x i64> [[TMP0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: ret <vscale x 2 x i64> [[TMP0]]
;
%1 = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 1)
; Different argument order to the above tests.
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-simplify-binop.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-simplify-binop.ll
new file mode 100644
index 0000000000000..7da55a199df28
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-simplify-binop.ll
@@ -0,0 +1,146 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=instcombine < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; The following tests verify the mechanics of simplification. The operation is not
+; important beyond being commutative with a known identity value.
+
+define <vscale x 4 x i32> @commute_constant_to_rhs(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @commute_constant_to_rhs(
+; CHECK-SAME: <vscale x 4 x i1> [[PG:%.*]], <vscale x 4 x i32> [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[R:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.u.nxv4i32(<vscale x 4 x i1> [[PG]], <vscale x 4 x i32> [[A]], <vscale x 4 x i32> splat (i32 303))
+; CHECK-NEXT: ret <vscale x 4 x i32> [[R]]
+;
+ %r = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.u.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> splat (i32 303), <vscale x 4 x i32> %a)
+ ret <vscale x 4 x i32> %r
+}
+
+; Inactive lanes are important, which make the operation non-commutative.
+define <vscale x 4 x i32> @cannot_commute_constant_to_rhs(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @cannot_commute_constant_to_rhs(
+; CHECK-SAME: <vscale x 4 x i1> [[PG:%.*]], <vscale x 4 x i32> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[R:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1> [[PG]], <vscale x 4 x i32> splat (i32 303), <vscale x 4 x i32> [[A]])
+; CHECK-NEXT: ret <vscale x 4 x i32> [[R]]
+;
+ %r = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> splat (i32 303), <vscale x 4 x i32> %a)
+ ret <vscale x 4 x i32> %r
+}
+
+define <vscale x 4 x i32> @idempotent_mul(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @idempotent_mul(
+; CHECK-SAME: <vscale x 4 x i1> [[PG:%.*]], <vscale x 4 x i32> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: ret <vscale x 4 x i32> [[A]]
+;
+ %r = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> splat (i32 1))
+ ret <vscale x 4 x i32> %r
+}
+
+define <vscale x 4 x i32> @idempotent_mul_ops_reverse(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @idempotent_mul_ops_reverse(
+; CHECK-SAME: <vscale x 4 x i1> [[PG:%.*]], <vscale x 4 x i32> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[R:%.*]] = select <vscale x 4 x i1> [[PG]], <vscale x 4 x i32> [[A]], <vscale x 4 x i32> splat (i32 1)
+; CHECK-NEXT: ret <vscale x 4 x i32> [[R]]
+;
+ %r = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> splat (i32 1), <vscale x 4 x i32> %a)
+ ret <vscale x 4 x i32> %r
+}
+
+define <vscale x 4 x i32> @idempotent_mul_u(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @idempotent_mul_u(
+; CHECK-SAME: <vscale x 4 x i1> [[PG:%.*]], <vscale x 4 x i32> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: ret <vscale x 4 x i32> [[A]]
+;
+ %r = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.u.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> splat (i32 1))
+ ret <vscale x 4 x i32> %r
+}
+
+define <vscale x 4 x i32> @idempotent_mul_u_ops_reverse(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @idempotent_mul_u_ops_reverse(
+; CHECK-SAME: <vscale x 4 x i1> [[PG:%.*]], <vscale x 4 x i32> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: ret <vscale x 4 x i32> [[A]]
+;
+ %r = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.u.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> splat (i32 1), <vscale x 4 x i32> %a)
+ ret <vscale x 4 x i32> %r
+}
+
+; Show that we only need to know the active lanes are constant.
+; TODO: We can do better here: %a can be used directly as part of the
+; select, since we know only its inactive lanes will be used.
+define <vscale x 4 x i32> @constant_mul_after_stripping_inactive_lanes(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @constant_mul_after_stripping_inactive_lanes(
+; CHECK-SAME: <vscale x 4 x i1> [[PG:%.*]], <vscale x 4 x i32> [[A:%.*]], <vscale x 4 x i32> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[A_DUP:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.nxv4i32(<vscale x 4 x i32> [[A]], <vscale x 4 x i1> [[PG]], i32 3)
+; CHECK-NEXT: [[R:%.*]] = select <vscale x 4 x i1> [[PG]], <vscale x 4 x i32> splat (i32 6), <vscale x 4 x i32> [[A_DUP]]
+; CHECK-NEXT: ret <vscale x 4 x i32> [[R]]
+;
+ %a.dup = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i1> %pg, i32 3)
+ %b.dup = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.nxv4i32(<vscale x 4 x i32> %b, <vscale x 4 x i1> %pg, i32 2)
+ %r = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a.dup, <vscale x 4 x i32> %b.dup)
+ ret <vscale x 4 x i32> %r
+}
+
+; Show that we only need to know the active lanes are constant.
+define <vscale x 4 x i32> @constant_mul_u_after_stripping_inactive_lanes(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @constant_mul_u_after_stripping_inactive_lanes(
+; CHECK-SAME: <vscale x 4 x i1> [[PG:%.*]], <vscale x 4 x i32> [[A:%.*]], <vscale x 4 x i32> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: ret <vscale x 4 x i32> splat (i32 6)
+;
+ %a.dup = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i1> %pg, i32 3)
+ %b.dup = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.nxv4i32(<vscale x 4 x i32> %b, <vscale x 4 x i1> %pg, i32 2)
+ %3 = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.u.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a.dup, <vscale x 4 x i32> %b.dup)
+ ret <vscale x 4 x i32> %3
+}
+
+; The following tests demonstrate the operations for which hooks are in place to
+; enable simplification. Given the simplifications themselves are common code, it
+; is assumed they are already well tested elsewhere.
+
+define <vscale x 4 x float> @constant_fmul(<vscale x 4 x i1> %pg) #0 {
+; CHECK-LABEL: define <vscale x 4 x float> @constant_fmul(
+; CHECK-SAME: <vscale x 4 x i1> [[PG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[R:%.*]] = select <vscale x 4 x i1> [[PG]], <vscale x 4 x float> splat (float 4.200000e+01), <vscale x 4 x float> splat (float 7.000000e+00)
+; CHECK-NEXT: ret <vscale x 4 x float> [[R]]
+;
+ %r = call <vscale x 4 x float> @llvm.aarch64.sve.fmul.nxv4f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> splat (float 7.0), <vscale x 4 x float> splat (float 6.0))
+ ret <vscale x 4 x float> %r
+}
+
+define <vscale x 4 x float> @constant_fmul_u(<vscale x 4 x i1> %pg) #0 {
+; CHECK-LABEL: define <vscale x 4 x float> @constant_fmul_u(
+; CHECK-SAME: <vscale x 4 x i1> [[PG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: ret <vscale x 4 x float> splat (float 4.200000e+01)
+;
+ %r = call <vscale x 4 x float> @llvm.aarch64.sve.fmul.u.nxv4f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> splat (float 7.0), <vscale x 4 x float> splat (float 6.0))
+ ret <vscale x 4 x float> %r
+}
+
+define <vscale x 4 x i32> @constant_mul(<vscale x 4 x i1> %pg) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @constant_mul(
+; CHECK-SAME: <vscale x 4 x i1> [[PG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[R:%.*]] = select <vscale x 4 x i1> [[PG]], <vscale x 4 x i32> splat (i32 21), <vscale x 4 x i32> splat (i32 7)
+; CHECK-NEXT: ret <vscale x 4 x i32> [[R]]
+;
+ %r = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> splat (i32 7), <vscale x 4 x i32> splat (i32 3))
+ ret <vscale x 4 x i32> %r
+}
+
+define <vscale x 4 x i32> @constant_mul_u(<vscale x 4 x i1> %pg) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @constant_mul_u(
+; CHECK-SAME: <vscale x 4 x i1> [[PG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: ret <vscale x 4 x i32> splat (i32 21)
+;
+ %r = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.u.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> splat (i32 7), <vscale x 4 x i32> splat (i32 3))
+ ret <vscale x 4 x i32> %r
+}
+
+
+; repeat only the constant fold tests for fmul(.u)
+
+declare <vscale x 4 x i32> @llvm.aarch64.sve.dup.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i1>, i32)
+
+declare <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 4 x i32>)
+
+declare <vscale x 4 x i32> @llvm.aarch64.sve.mul.u.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 4 x i32>)
+
+attributes #0 = { "target-features"="+sve" }