[llvm] [AARCH64] Fold sve mul intrinsics using -1 to neg (PR #156906)
Martin Wehking via llvm-commits
llvm-commits at lists.llvm.org
Thu Sep 4 08:03:58 PDT 2025
https://github.com/MartinWehking created https://github.com/llvm/llvm-project/pull/156906
Fold SVE mul and mul_u intrinsics to SVE neg intrinsics when one of their operands is a splat of -1.

Do not perform this optimization for SVE mul intrinsics operating on floating-point values, similar to GCC. I copied this particular behaviour from GCC, and it might need follow-up.

This change inserts a new instruction, so it cannot be integrated into the changes introduced by c192737 and 1997073.
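As a source-level illustration (not part of the patch), the fold roughly corresponds to the following ACLE sketch; the function names are made up for this example:

  // Compile for an SVE-enabled target, e.g. with -march=armv8-a+sve.
  #include <arm_sve.h>

  // Before the fold: a merging multiply by a splat of -1. This lowers to an
  // @llvm.aarch64.sve.mul call whose second operand is a splat of -1.
  svint32_t mul_by_minus_one(svbool_t pg, svint32_t a) {
    return svmul_s32_m(pg, a, svdup_n_s32(-1));
  }

  // What the folded form should be equivalent to: a merging negate where the
  // inactive lanes keep the value of 'a', matching the _m semantics above.
  svint32_t negate(svbool_t pg, svint32_t a) {
    return svneg_s32_m(a, pg, a);
  }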
>From 3f270aad8d6194f9f9a987d265df0f283f217d28 Mon Sep 17 00:00:00 2001
From: Martin Wehking <martin.wehking at arm.com>
Date: Thu, 4 Sep 2025 14:08:01 +0000
Subject: [PATCH] [AARCH64] Fold sve mul intrinsics using -1 to neg
Fold SVE mul and mul_u intrinsics to SVE neg intrinsics when one of
their operands is a splat of -1.

Do not perform this optimization for SVE mul intrinsics operating on
floating-point values, similar to GCC. I copied this particular
behaviour from GCC, and it might need follow-up.

This change inserts a new instruction, so it cannot be integrated into
the changes introduced by c192737 and 1997073.
---
.../AArch64/AArch64TargetTransformInfo.cpp | 36 ++++++
.../AArch64/sve-intrinsic-mul-to-neg-fold.ll | 108 ++++++++++++++++++
.../sve-intrinsic-mul_u-to-neg-fold.ll | 107 +++++++++++++++++
3 files changed, 251 insertions(+)
create mode 100644 llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-mul-to-neg-fold.ll
create mode 100644 llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-mul_u-to-neg-fold.ll
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 66272988889a2..532247c8f3b40 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -2277,6 +2277,39 @@ static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic) {
}
}
+static std::optional<Instruction *> instCombineSVEVectorMul(InstCombiner &IC,
+ IntrinsicInst &II) {
+ Value *PG = II.getOperand(0);
+ Value *Op1 = II.getOperand(1);
+ Value *Op2 = II.getOperand(2);
+
+ // Return true if the given value is a splat of the constant -1, false
+ // otherwise.
+ auto IsNegUnitSplat = [](auto *I) {
+ auto *SplatValue = getSplatValue(I);
+ ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(SplatValue);
+ if (!SplatConstantInt)
+ return false;
+ APInt SCIV = SplatConstantInt->getValue();
+ const int64_t IntValue = SCIV.getSExtValue();
+ return IntValue == -1;
+ };
+
+ if (IsNegUnitSplat(Op1)) {
+ auto *NEG = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_neg,
+ {II.getType()}, {Op2, PG, Op2});
+ return IC.replaceInstUsesWith(II, NEG);
+ }
+
+ if (IsNegUnitSplat(Op2)) {
+ auto *NEG = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_neg,
+ {II.getType()}, {Op1, PG, Op1});
+ return IC.replaceInstUsesWith(II, NEG);
+ }
+
+ return std::nullopt;
+}
+
static std::optional<Instruction *>
instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II) {
// Bail due to missing support for ISD::STRICT_ scalable vector operations.
@@ -2852,6 +2885,9 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
Intrinsic::aarch64_sve_mla_u>(
IC, II, true);
+ case Intrinsic::aarch64_sve_mul:
+ case Intrinsic::aarch64_sve_mul_u:
+ return instCombineSVEVectorMul(IC, II);
case Intrinsic::aarch64_sve_sub:
return instCombineSVEVectorSub(IC, II);
case Intrinsic::aarch64_sve_sub_u:
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-mul-to-neg-fold.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-mul-to-neg-fold.ll
new file mode 100644
index 0000000000000..a620fee7222ab
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-mul-to-neg-fold.ll
@@ -0,0 +1,108 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=instcombine < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; Muls with (-1) as operand should fold to neg.
+define <vscale x 8 x i16> @mul_neg_fold_i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) #0 {
+; CHECK-LABEL: define <vscale x 8 x i16> @mul_neg_fold_i16(
+; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x i16> [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i16> @llvm.aarch64.sve.neg.nxv8i16(<vscale x 8 x i16> [[A]], <vscale x 8 x i1> [[PG]], <vscale x 8 x i16> [[A]])
+; CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
+;
+ %1 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 -1)
+ %2 = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> %1)
+ ret <vscale x 8 x i16> %2
+}
+
+define <vscale x 4 x i32> @mul_neg_fold_i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @mul_neg_fold_i32(
+; CHECK-SAME: <vscale x 4 x i1> [[PG:%.*]], <vscale x 4 x i32> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.neg.nxv4i32(<vscale x 4 x i32> [[A]], <vscale x 4 x i1> [[PG]], <vscale x 4 x i32> [[A]])
+; CHECK-NEXT: ret <vscale x 4 x i32> [[TMP0]]
+;
+ %1 = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 -1)
+ %2 = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %1)
+ ret <vscale x 4 x i32> %2
+}
+
+define <vscale x 2 x i64> @mul_neg_fold_i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) #0 {
+; CHECK-LABEL: define <vscale x 2 x i64> @mul_neg_fold_i64(
+; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x i64> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.neg.nxv2i64(<vscale x 2 x i64> [[A]], <vscale x 2 x i1> [[PG]], <vscale x 2 x i64> [[A]])
+; CHECK-NEXT: ret <vscale x 2 x i64> [[TMP0]]
+;
+ %1 = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 -1)
+ %2 = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> %1)
+ ret <vscale x 2 x i64> %2
+}
+
+define <vscale x 8 x i16> @mul_neg_fold_two_dups(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) #0 {
+ ; Edge case -- make sure that the case where we're multiplying two dups
+ ; together is sane.
+; CHECK-LABEL: define <vscale x 8 x i16> @mul_neg_fold_two_dups(
+; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x i16> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = select <vscale x 8 x i1> [[PG]], <vscale x 8 x i16> splat (i16 1), <vscale x 8 x i16> splat (i16 -1)
+; CHECK-NEXT: ret <vscale x 8 x i16> [[TMP1]]
+;
+ %1 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 -1)
+ %2 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 -1)
+ %3 = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2)
+ ret <vscale x 8 x i16> %3
+}
+
+define <vscale x 2 x i64> @mul_neg_fold_different_argument_order(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) #0 {
+; CHECK-LABEL: define <vscale x 2 x i64> @mul_neg_fold_different_argument_order(
+; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x i64> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.neg.nxv2i64(<vscale x 2 x i64> [[A]], <vscale x 2 x i1> [[PG]], <vscale x 2 x i64> [[A]])
+; CHECK-NEXT: ret <vscale x 2 x i64> [[TMP1]]
+;
+ %1 = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 -1)
+ ; Different argument order to the above tests.
+ %2 = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %1, <vscale x 2 x i64> %a)
+ ret <vscale x 2 x i64> %2
+}
+
+; Non-foldable muls -- we don't expect these to be optimised out.
+define <vscale x 8 x i16> @no_mul_neg_fold_i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) #0 {
+; CHECK-LABEL: define <vscale x 8 x i16> @no_mul_neg_fold_i16(
+; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x i16> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.nxv8i16(<vscale x 8 x i1> [[PG]], <vscale x 8 x i16> [[A]], <vscale x 8 x i16> splat (i16 -2))
+; CHECK-NEXT: ret <vscale x 8 x i16> [[TMP1]]
+;
+ %1 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 -2)
+ %2 = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> %1)
+ ret <vscale x 8 x i16> %2
+}
+
+define <vscale x 4 x i32> @no_mul_neg_fold_i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @no_mul_neg_fold_i32(
+; CHECK-SAME: <vscale x 4 x i1> [[PG:%.*]], <vscale x 4 x i32> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1> [[PG]], <vscale x 4 x i32> [[A]], <vscale x 4 x i32> splat (i32 -2))
+; CHECK-NEXT: ret <vscale x 4 x i32> [[TMP1]]
+;
+ %1 = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 -2)
+ %2 = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %1)
+ ret <vscale x 4 x i32> %2
+}
+
+define <vscale x 2 x i64> @no_mul_neg_fold_i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) #0 {
+; CHECK-LABEL: define <vscale x 2 x i64> @no_mul_neg_fold_i64(
+; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x i64> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.nxv2i64(<vscale x 2 x i1> [[PG]], <vscale x 2 x i64> [[A]], <vscale x 2 x i64> splat (i64 -2))
+; CHECK-NEXT: ret <vscale x 2 x i64> [[TMP1]]
+;
+ %1 = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 -2)
+ %2 = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> %1)
+ ret <vscale x 2 x i64> %2
+}
+
+declare <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64)
+
+declare <vscale x 8 x i16> @llvm.aarch64.sve.mul.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.mul.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, <vscale x 2 x i64>)
+
+attributes #0 = { "target-features"="+sve" }
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-mul_u-to-neg-fold.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-mul_u-to-neg-fold.ll
new file mode 100644
index 0000000000000..ee179a57a0cae
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-mul_u-to-neg-fold.ll
@@ -0,0 +1,107 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=instcombine < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; Muls with (-1) as operand should fold to neg.
+define <vscale x 8 x i16> @mul_neg_fold_u_i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) #0 {
+; CHECK-LABEL: define <vscale x 8 x i16> @mul_neg_fold_u_i16(
+; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x i16> [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i16> @llvm.aarch64.sve.neg.nxv8i16(<vscale x 8 x i16> [[A]], <vscale x 8 x i1> [[PG]], <vscale x 8 x i16> [[A]])
+; CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
+;
+ %1 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 -1)
+ %2 = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.u.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> %1)
+ ret <vscale x 8 x i16> %2
+}
+
+define <vscale x 4 x i32> @mul_neg_fold_u_i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @mul_neg_fold_u_i32(
+; CHECK-SAME: <vscale x 4 x i1> [[PG:%.*]], <vscale x 4 x i32> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.neg.nxv4i32(<vscale x 4 x i32> [[A]], <vscale x 4 x i1> [[PG]], <vscale x 4 x i32> [[A]])
+; CHECK-NEXT: ret <vscale x 4 x i32> [[TMP0]]
+;
+ %1 = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 -1)
+ %2 = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.u.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %1)
+ ret <vscale x 4 x i32> %2
+}
+
+define <vscale x 2 x i64> @mul_neg_fold_u_i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) #0 {
+; CHECK-LABEL: define <vscale x 2 x i64> @mul_neg_fold_u_i64(
+; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x i64> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.neg.nxv2i64(<vscale x 2 x i64> [[A]], <vscale x 2 x i1> [[PG]], <vscale x 2 x i64> [[A]])
+; CHECK-NEXT: ret <vscale x 2 x i64> [[TMP0]]
+;
+ %1 = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 -1)
+ %2 = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.u.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> %1)
+ ret <vscale x 2 x i64> %2
+}
+
+define <vscale x 8 x i16> @mul_neg_fold_u_two_dups(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) #0 {
+ ; Edge case -- make sure that the case where we're multiplying two dups
+ ; together is sane.
+; CHECK-LABEL: define <vscale x 8 x i16> @mul_neg_fold_u_two_dups(
+; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x i16> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: ret <vscale x 8 x i16> splat (i16 1)
+;
+ %1 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 -1)
+ %2 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 -1)
+ %3 = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.u.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2)
+ ret <vscale x 8 x i16> %3
+}
+
+define <vscale x 2 x i64> @mul_neg_fold_u_different_argument_order(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) #0 {
+; CHECK-LABEL: define <vscale x 2 x i64> @mul_neg_fold_u_different_argument_order(
+; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x i64> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.neg.nxv2i64(<vscale x 2 x i64> [[A]], <vscale x 2 x i1> [[PG]], <vscale x 2 x i64> [[A]])
+; CHECK-NEXT: ret <vscale x 2 x i64> [[TMP1]]
+;
+ %1 = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 -1)
+ ; Different argument order to the above tests.
+ %2 = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.u.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %1, <vscale x 2 x i64> %a)
+ ret <vscale x 2 x i64> %2
+}
+
+; Non-foldable muls -- we don't expect these to be optimised out.
+define <vscale x 8 x i16> @no_mul_neg_fold_u_i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) #0 {
+; CHECK-LABEL: define <vscale x 8 x i16> @no_mul_neg_fold_u_i16(
+; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x i16> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.u.nxv8i16(<vscale x 8 x i1> [[PG]], <vscale x 8 x i16> [[A]], <vscale x 8 x i16> splat (i16 -2))
+; CHECK-NEXT: ret <vscale x 8 x i16> [[TMP1]]
+;
+ %1 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 -2)
+ %2 = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.u.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> %1)
+ ret <vscale x 8 x i16> %2
+}
+
+define <vscale x 4 x i32> @no_mul_neg_fold_u_i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @no_mul_neg_fold_u_i32(
+; CHECK-SAME: <vscale x 4 x i1> [[PG:%.*]], <vscale x 4 x i32> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.u.nxv4i32(<vscale x 4 x i1> [[PG]], <vscale x 4 x i32> [[A]], <vscale x 4 x i32> splat (i32 -2))
+; CHECK-NEXT: ret <vscale x 4 x i32> [[TMP1]]
+;
+ %1 = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 -2)
+ %2 = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.u.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %1)
+ ret <vscale x 4 x i32> %2
+}
+
+define <vscale x 2 x i64> @no_mul_neg_fold_u_i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) #0 {
+; CHECK-LABEL: define <vscale x 2 x i64> @no_mul_neg_fold_u_i64(
+; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x i64> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.u.nxv2i64(<vscale x 2 x i1> [[PG]], <vscale x 2 x i64> [[A]], <vscale x 2 x i64> splat (i64 -2))
+; CHECK-NEXT: ret <vscale x 2 x i64> [[TMP1]]
+;
+ %1 = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 -2)
+ %2 = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.u.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> %1)
+ ret <vscale x 2 x i64> %2
+}
+
+declare <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64)
+
+declare <vscale x 8 x i16> @llvm.aarch64.sve.mul.u.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.mul.u.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.mul.u.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, <vscale x 2 x i64>)
+
+attributes #0 = { "target-features"="+sve" }