[llvm] 14bd44e - [AArch64][SVEIntrinsicOpts] Factor out redundant SVE mul/fmul intrinsics
Joe Ellis via llvm-commits
llvm-commits at lists.llvm.org
Tue Mar 16 07:50:45 PDT 2021
Author: Joe Ellis
Date: 2021-03-16T14:50:17Z
New Revision: 14bd44edc6afbb2bf7c823750b3d0f4e15fb02c8
URL: https://github.com/llvm/llvm-project/commit/14bd44edc6afbb2bf7c823750b3d0f4e15fb02c8
DIFF: https://github.com/llvm/llvm-project/commit/14bd44edc6afbb2bf7c823750b3d0f4e15fb02c8.diff
LOG: [AArch64][SVEIntrinsicOpts] Factor out redundant SVE mul/fmul intrinsics
This commit implements an IR-level optimization to eliminate idempotent
SVE mul/fmul intrinsic calls. Currently, the following patterns are
captured:
fmul pg (dup_x 1.0) V => V
mul pg (dup_x 1) V => V
fmul pg V (dup_x 1.0) => V
mul pg V (dup_x 1) => V
fmul pg V (dup v pg 1.0) => V
mul pg V (dup v pg 1) => V
The result of this commit is that code such as:
  #include <arm_sve.h>

  svfloat64_t foo(svfloat64_t a) {
    svbool_t t = svptrue_b64();
    svfloat64_t b = svdup_f64(1.0);
    return svmul_m(t, a, b);
  }
will lower to a nop.
This commit does not capture all possibilities; only the simple cases
described above. There is still room for further optimisation.
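For illustration only (not part of the patch), a minimal IR-level sketch of the shape the pass recognizes; the function and value names are made up, and the svbool_t predicate conversion clang would normally emit is elided for brevity:

  ; Illustrative example only -- names are invented and the svbool_t <-> nxv2i1
  ; conversion emitted by clang is omitted.
  define <vscale x 2 x double> @example(<vscale x 2 x double> %a) {
    %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
    %one = call <vscale x 2 x double> @llvm.aarch64.sve.dup.x.nxv2f64(double 1.0)
    %mul = call <vscale x 2 x double> @llvm.aarch64.sve.fmul.nxv2f64(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a, <vscale x 2 x double> %one)
    ret <vscale x 2 x double> %mul
  }

  declare <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32)
  declare <vscale x 2 x double> @llvm.aarch64.sve.dup.x.nxv2f64(double)
  declare <vscale x 2 x double> @llvm.aarch64.sve.fmul.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>)

Running opt -S -aarch64-sve-intrinsic-opts over such IR reduces the body to a single ret of %a; the now-unused dup.x and ptrue calls are then erased by the dangling-instruction cleanup added below.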
Differential Revision: https://reviews.llvm.org/D98033
Added:
llvm/test/CodeGen/AArch64/sve-fmul-idempotency.ll
llvm/test/CodeGen/AArch64/sve-mul-idempotency.ll
Modified:
llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp b/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp
index 3d9080f7997d..6b8cb786bb6c 100644
--- a/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp
+++ b/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp
@@ -77,6 +77,7 @@ struct SVEIntrinsicOpts : public ModulePass {
static bool optimizeConvertFromSVBool(IntrinsicInst *I);
static bool optimizePTest(IntrinsicInst *I);
+ static bool optimizeVectorMul(IntrinsicInst *I);
static bool processPhiNode(IntrinsicInst *I);
};
@@ -366,6 +367,76 @@ bool SVEIntrinsicOpts::optimizePTest(IntrinsicInst *I) {
return false;
}
+bool SVEIntrinsicOpts::optimizeVectorMul(IntrinsicInst *I) {
+ assert((I->getIntrinsicID() == Intrinsic::aarch64_sve_mul ||
+ I->getIntrinsicID() == Intrinsic::aarch64_sve_fmul) &&
+ "Unexpected opcode");
+
+ auto *OpPredicate = I->getOperand(0);
+ auto *OpMultiplicand = I->getOperand(1);
+ auto *OpMultiplier = I->getOperand(2);
+
+ // Return true if a given instruction is an aarch64_sve_dup_x intrinsic call
+ // with a unit splat value, false otherwise.
+ auto IsUnitDupX = [](auto *I) {
+ auto *IntrI = dyn_cast<IntrinsicInst>(I);
+ if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup_x)
+ return false;
+
+ auto *SplatValue = IntrI->getOperand(0);
+ return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
+ };
+
+ // Return true if a given instruction is an aarch64_sve_dup intrinsic call
+ // with a unit splat value, false otherwise.
+ auto IsUnitDup = [](auto *I) {
+ auto *IntrI = dyn_cast<IntrinsicInst>(I);
+ if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup)
+ return false;
+
+ auto *SplatValue = IntrI->getOperand(2);
+ return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
+ };
+
+  bool Changed = false;
+
+ // The OpMultiplier variable should always point to the dup (if any), so
+ // swap if necessary.
+ if (IsUnitDup(OpMultiplicand) || IsUnitDupX(OpMultiplicand))
+ std::swap(OpMultiplier, OpMultiplicand);
+
+ if (IsUnitDupX(OpMultiplier)) {
+ // [f]mul pg (dupx 1) %n => %n
+ I->replaceAllUsesWith(OpMultiplicand);
+ I->eraseFromParent();
+ Changed = true;
+ } else if (IsUnitDup(OpMultiplier)) {
+ // [f]mul pg (dup pg 1) %n => %n
+ auto *DupInst = cast<IntrinsicInst>(OpMultiplier);
+ auto *DupPg = DupInst->getOperand(1);
+ // TODO: this is naive. The optimization is still valid if DupPg
+ // 'encompasses' OpPredicate, not only if they're the same predicate.
+ if (OpPredicate == DupPg) {
+ I->replaceAllUsesWith(OpMultiplicand);
+ I->eraseFromParent();
+ Changed = true;
+ }
+ }
+
+  // If an instruction was optimized out then it is possible that some dangling
+  // instructions remain.
+ if (Changed) {
+ auto *OpPredicateInst = dyn_cast<Instruction>(OpPredicate);
+ auto *OpMultiplierInst = dyn_cast<Instruction>(OpMultiplier);
+ if (OpMultiplierInst && OpMultiplierInst->use_empty())
+ OpMultiplierInst->eraseFromParent();
+ if (OpPredicateInst && OpPredicateInst->use_empty())
+ OpPredicateInst->eraseFromParent();
+ }
+
+ return Changed;
+}
+
bool SVEIntrinsicOpts::optimizeConvertFromSVBool(IntrinsicInst *I) {
assert(I->getIntrinsicID() == Intrinsic::aarch64_sve_convert_from_svbool &&
"Unexpected opcode");
@@ -429,6 +500,9 @@ bool SVEIntrinsicOpts::optimizeIntrinsic(Instruction *I) {
switch (IntrI->getIntrinsicID()) {
case Intrinsic::aarch64_sve_convert_from_svbool:
return optimizeConvertFromSVBool(IntrI);
+ case Intrinsic::aarch64_sve_fmul:
+ case Intrinsic::aarch64_sve_mul:
+ return optimizeVectorMul(IntrI);
case Intrinsic::aarch64_sve_ptest_any:
case Intrinsic::aarch64_sve_ptest_first:
case Intrinsic::aarch64_sve_ptest_last:
@@ -484,6 +558,8 @@ bool SVEIntrinsicOpts::runOnModule(Module &M) {
case Intrinsic::aarch64_sve_ptest_first:
case Intrinsic::aarch64_sve_ptest_last:
case Intrinsic::aarch64_sve_ptrue:
+ case Intrinsic::aarch64_sve_mul:
+ case Intrinsic::aarch64_sve_fmul:
for (User *U : F.users())
Functions.insert(cast<Instruction>(U)->getFunction());
break;
diff --git a/llvm/test/CodeGen/AArch64/sve-fmul-idempotency.ll b/llvm/test/CodeGen/AArch64/sve-fmul-idempotency.ll
new file mode 100644
index 000000000000..e716aa091c61
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-fmul-idempotency.ll
@@ -0,0 +1,123 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -aarch64-sve-intrinsic-opts < %s 2>%t | FileCheck %s
+; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
+
+; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
+; WARN-NOT: warning
+
+; Idempotent fmuls -- should compile to just a ret.
+define <vscale x 8 x half> @idempotent_fmul_f16(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a) {
+; CHECK-LABEL: @idempotent_fmul_f16(
+; CHECK-NEXT: ret <vscale x 8 x half> [[A:%.*]]
+;
+ %1 = call <vscale x 8 x half> @llvm.aarch64.sve.dup.x.nxv8f16(half 1.0)
+ %2 = call <vscale x 8 x half> @llvm.aarch64.sve.fmul.nxv8f16(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a, <vscale x 8 x half> %1)
+ ret <vscale x 8 x half> %2
+}
+
+define <vscale x 4 x float> @idempotent_fmul_f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a) {
+; CHECK-LABEL: @idempotent_fmul_f32(
+; CHECK-NEXT: ret <vscale x 4 x float> [[A:%.*]]
+;
+ %1 = call <vscale x 4 x float> @llvm.aarch64.sve.dup.x.nxv4f32(float 1.0)
+ %2 = call <vscale x 4 x float> @llvm.aarch64.sve.fmul.nxv4f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a, <vscale x 4 x float> %1)
+ ret <vscale x 4 x float> %2
+}
+
+define <vscale x 2 x double> @idempotent_fmul_f64(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a) {
+; CHECK-LABEL: @idempotent_fmul_f64(
+; CHECK-NEXT: ret <vscale x 2 x double> [[A:%.*]]
+;
+ %1 = call <vscale x 2 x double> @llvm.aarch64.sve.dup.x.nxv2f64(double 1.0)
+ %2 = call <vscale x 2 x double> @llvm.aarch64.sve.fmul.nxv2f64(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a, <vscale x 2 x double> %1)
+ ret <vscale x 2 x double> %2
+}
+
+define <vscale x 2 x double> @idempotent_fmul_different_argument_order(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a) {
+; CHECK-LABEL: @idempotent_fmul_different_argument_order(
+; CHECK-NEXT: ret <vscale x 2 x double> [[A:%.*]]
+;
+ %1 = call <vscale x 2 x double> @llvm.aarch64.sve.dup.x.nxv2f64(double 1.0)
+ ; Different argument order to the above tests.
+ %2 = call <vscale x 2 x double> @llvm.aarch64.sve.fmul.nxv2f64(<vscale x 2 x i1> %pg, <vscale x 2 x double> %1, <vscale x 2 x double> %a)
+ ret <vscale x 2 x double> %2
+}
+
+define <vscale x 8 x half> @idempotent_fmul_with_predicated_dup(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a) {
+; CHECK-LABEL: @idempotent_fmul_with_predicated_dup(
+; CHECK-NEXT: ret <vscale x 8 x half> [[A:%.*]]
+;
+ %1 = call <vscale x 8 x half> @llvm.aarch64.sve.dup.nxv8f16(<vscale x 8 x half> undef, <vscale x 8 x i1> %pg, half 1.0)
+ %2 = call <vscale x 8 x half> @llvm.aarch64.sve.fmul.nxv8f16(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a, <vscale x 8 x half> %1)
+ ret <vscale x 8 x half> %2
+}
+
+define <vscale x 8 x half> @idempotent_fmul_two_dups(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a) {
+ ; Edge case -- make sure that the case where we're fmultiplying two dups
+ ; together is sane.
+; CHECK-LABEL: @idempotent_fmul_two_dups(
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 8 x half> @llvm.aarch64.sve.dup.x.nxv8f16(half 0xH3C00)
+; CHECK-NEXT: ret <vscale x 8 x half> [[TMP1]]
+;
+ %1 = call <vscale x 8 x half> @llvm.aarch64.sve.dup.x.nxv8f16(half 1.0)
+ %2 = call <vscale x 8 x half> @llvm.aarch64.sve.dup.x.nxv8f16(half 1.0)
+ %3 = call <vscale x 8 x half> @llvm.aarch64.sve.fmul.nxv8f16(<vscale x 8 x i1> %pg, <vscale x 8 x half> %1, <vscale x 8 x half> %2)
+ ret <vscale x 8 x half> %3
+}
+
+; Non-idempotent fmuls -- we don't expect these to be optimised out.
+define <vscale x 8 x half> @non_idempotent_fmul_f16(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a) {
+; CHECK-LABEL: @non_idempotent_fmul_f16(
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 8 x half> @llvm.aarch64.sve.dup.x.nxv8f16(half 0xH4000)
+; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 8 x half> @llvm.aarch64.sve.fmul.nxv8f16(<vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x half> [[A:%.*]], <vscale x 8 x half> [[TMP1]])
+; CHECK-NEXT: ret <vscale x 8 x half> [[TMP2]]
+;
+ %1 = call <vscale x 8 x half> @llvm.aarch64.sve.dup.x.nxv8f16(half 2.0)
+ %2 = call <vscale x 8 x half> @llvm.aarch64.sve.fmul.nxv8f16(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a, <vscale x 8 x half> %1)
+ ret <vscale x 8 x half> %2
+}
+
+define <vscale x 4 x float> @non_idempotent_fmul_f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a) {
+; CHECK-LABEL: @non_idempotent_fmul_f32(
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.dup.x.nxv4f32(float 2.000000e+00)
+; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.fmul.nxv4f32(<vscale x 4 x i1> [[PG:%.*]], <vscale x 4 x float> [[A:%.*]], <vscale x 4 x float> [[TMP1]])
+; CHECK-NEXT: ret <vscale x 4 x float> [[TMP2]]
+;
+ %1 = call <vscale x 4 x float> @llvm.aarch64.sve.dup.x.nxv4f32(float 2.0)
+ %2 = call <vscale x 4 x float> @llvm.aarch64.sve.fmul.nxv4f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a, <vscale x 4 x float> %1)
+ ret <vscale x 4 x float> %2
+}
+
+define <vscale x 2 x double> @non_idempotent_fmul_f64(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a) {
+; CHECK-LABEL: @non_idempotent_fmul_f64(
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.dup.x.nxv2f64(double 2.000000e+00)
+; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.fmul.nxv2f64(<vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x double> [[A:%.*]], <vscale x 2 x double> [[TMP1]])
+; CHECK-NEXT: ret <vscale x 2 x double> [[TMP2]]
+;
+ %1 = call <vscale x 2 x double> @llvm.aarch64.sve.dup.x.nxv2f64(double 2.0)
+ %2 = call <vscale x 2 x double> @llvm.aarch64.sve.fmul.nxv2f64(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a, <vscale x 2 x double> %1)
+ ret <vscale x 2 x double> %2
+}
+
+define <vscale x 2 x double> @non_idempotent_fmul_with_predicated_dup(<vscale x 2 x i1> %pg1, <vscale x 2 x i1> %pg2, <vscale x 2 x double> %a) {
+ ; Different predicates
+; CHECK-LABEL: @non_idempotent_fmul_with_predicated_dup(
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.dup.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> [[PG1:%.*]], double 1.000000e+00)
+; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.fmul.nxv2f64(<vscale x 2 x i1> [[PG2:%.*]], <vscale x 2 x double> [[A:%.*]], <vscale x 2 x double> [[TMP1]])
+; CHECK-NEXT: ret <vscale x 2 x double> [[TMP2]]
+;
+ %1 = call <vscale x 2 x double> @llvm.aarch64.sve.dup.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> %pg1, double 1.0)
+ %2 = call <vscale x 2 x double> @llvm.aarch64.sve.fmul.nxv2f64(<vscale x 2 x i1> %pg2, <vscale x 2 x double> %a, <vscale x 2 x double> %1)
+ ret <vscale x 2 x double> %2
+}
+
+declare <vscale x 8 x half> @llvm.aarch64.sve.dup.x.nxv8f16(half)
+declare <vscale x 4 x float> @llvm.aarch64.sve.dup.x.nxv4f32(float)
+declare <vscale x 2 x double> @llvm.aarch64.sve.dup.x.nxv2f64(double)
+
+declare <vscale x 2 x double> @llvm.aarch64.sve.dup.nxv2f64(<vscale x 2 x double>, <vscale x 2 x i1>, double)
+declare <vscale x 8 x half> @llvm.aarch64.sve.dup.nxv8f16(<vscale x 8 x half>, <vscale x 8 x i1>, half)
+
+declare <vscale x 8 x half> @llvm.aarch64.sve.fmul.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>)
+declare <vscale x 4 x float> @llvm.aarch64.sve.fmul.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>, <vscale x 4 x float>)
+declare <vscale x 2 x double> @llvm.aarch64.sve.fmul.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>)
diff --git a/llvm/test/CodeGen/AArch64/sve-mul-idempotency.ll b/llvm/test/CodeGen/AArch64/sve-mul-idempotency.ll
new file mode 100644
index 000000000000..d07e100f9d57
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-mul-idempotency.ll
@@ -0,0 +1,123 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -aarch64-sve-intrinsic-opts < %s 2>%t | FileCheck %s
+; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
+
+; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
+; WARN-NOT: warning
+
+; Idempotent muls -- should compile to just a ret.
+define <vscale x 8 x i16> @idempotent_mul_i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) {
+; CHECK-LABEL: @idempotent_mul_i16(
+; CHECK-NEXT: ret <vscale x 8 x i16> [[A:%.*]]
+;
+ %1 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 1)
+ %2 = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> %1)
+ ret <vscale x 8 x i16> %2
+}
+
+define <vscale x 4 x i32> @idempotent_mul_i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a) {
+; CHECK-LABEL: @idempotent_mul_i32(
+; CHECK-NEXT: ret <vscale x 4 x i32> [[A:%.*]]
+;
+ %1 = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
+ %2 = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %1)
+ ret <vscale x 4 x i32> %2
+}
+
+define <vscale x 2 x i64> @idempotent_mul_i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) {
+; CHECK-LABEL: @idempotent_mul_i64(
+; CHECK-NEXT: ret <vscale x 2 x i64> [[A:%.*]]
+;
+ %1 = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 1)
+ %2 = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> %1)
+ ret <vscale x 2 x i64> %2
+}
+
+define <vscale x 2 x i64> @idempotent_mul_different_argument_order(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) {
+; CHECK-LABEL: @idempotent_mul_different_argument_order(
+; CHECK-NEXT: ret <vscale x 2 x i64> [[A:%.*]]
+;
+ %1 = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 1)
+ ; Different argument order to the above tests.
+ %2 = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %1, <vscale x 2 x i64> %a)
+ ret <vscale x 2 x i64> %2
+}
+
+define <vscale x 8 x i16> @idempotent_mul_with_predicated_dup(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) {
+; CHECK-LABEL: @idempotent_mul_with_predicated_dup(
+; CHECK-NEXT: ret <vscale x 8 x i16> [[A:%.*]]
+;
+ %1 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> %pg, i16 1)
+ %2 = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> %1)
+ ret <vscale x 8 x i16> %2
+}
+
+define <vscale x 8 x i16> @idempotent_mul_two_dups(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) {
+ ; Edge case -- make sure that the case where we're multiplying two dups
+ ; together is sane.
+; CHECK-LABEL: @idempotent_mul_two_dups(
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 1)
+; CHECK-NEXT: ret <vscale x 8 x i16> [[TMP1]]
+;
+ %1 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 1)
+ %2 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 1)
+ %3 = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2)
+ ret <vscale x 8 x i16> %3
+}
+
+; Non-idempotent muls -- we don't expect these to be optimised out.
+define <vscale x 8 x i16> @non_idempotent_mul_i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) {
+; CHECK-LABEL: @non_idempotent_mul_i16(
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 2)
+; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.nxv8i16(<vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x i16> [[A:%.*]], <vscale x 8 x i16> [[TMP1]])
+; CHECK-NEXT: ret <vscale x 8 x i16> [[TMP2]]
+;
+ %1 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 2)
+ %2 = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> %1)
+ ret <vscale x 8 x i16> %2
+}
+
+define <vscale x 4 x i32> @non_idempotent_mul_i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a) {
+; CHECK-LABEL: @non_idempotent_mul_i32(
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 2)
+; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1> [[PG:%.*]], <vscale x 4 x i32> [[A:%.*]], <vscale x 4 x i32> [[TMP1]])
+; CHECK-NEXT: ret <vscale x 4 x i32> [[TMP2]]
+;
+ %1 = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 2)
+ %2 = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %1)
+ ret <vscale x 4 x i32> %2
+}
+
+define <vscale x 2 x i64> @non_idempotent_mul_i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) {
+; CHECK-LABEL: @non_idempotent_mul_i64(
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 2)
+; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.nxv2i64(<vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x i64> [[A:%.*]], <vscale x 2 x i64> [[TMP1]])
+; CHECK-NEXT: ret <vscale x 2 x i64> [[TMP2]]
+;
+ %1 = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 2)
+ %2 = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> %1)
+ ret <vscale x 2 x i64> %2
+}
+
+define <vscale x 2 x i64> @non_idempotent_mul_with_predicated_dup(<vscale x 2 x i1> %pg1, <vscale x 2 x i1> %pg2, <vscale x 2 x i64> %a) {
+ ; Different predicates
+; CHECK-LABEL: @non_idempotent_mul_with_predicated_dup(
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> [[PG1:%.*]], i64 1)
+; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.nxv2i64(<vscale x 2 x i1> [[PG2:%.*]], <vscale x 2 x i64> [[A:%.*]], <vscale x 2 x i64> [[TMP1]])
+; CHECK-NEXT: ret <vscale x 2 x i64> [[TMP2]]
+;
+ %1 = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg1, i64 1)
+ %2 = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.nxv2i64(<vscale x 2 x i1> %pg2, <vscale x 2 x i64> %a, <vscale x 2 x i64> %1)
+ ret <vscale x 2 x i64> %2
+}
+
+declare <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64)
+
+declare <vscale x 2 x i64> @llvm.aarch64.sve.dup.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, i64)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.dup.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i1>, i16)
+
+declare <vscale x 8 x i16> @llvm.aarch64.sve.mul.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.mul.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, <vscale x 2 x i64>)