[llvm] [AARCH64] Fold sve mul intrinsics using -1 to neg (PR #156906)
Martin Wehking via llvm-commits
llvm-commits at lists.llvm.org
Thu Sep 4 08:03:58 PDT 2025
https://github.com/MartinWehking created https://github.com/llvm/llvm-project/pull/156906
Fold SVE mul and mul_u intrinsics to SVE neg intrinsics when one of their operands is a splat of -1.

Do not perform this optimization for SVE mul intrinsics operating on floating-point values, similar to GCC. I copied this particular behaviour from GCC, and it might need follow-up.

This change inserts a new instruction, so it cannot be integrated into the changes introduced by c192737 and 1997073.
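As a source-level illustration (not part of the patch), the fold roughly corresponds to the following ACLE sketch; the function names are made up for this example:

  // Compile for an SVE-enabled target, e.g. with -march=armv8-a+sve.
  #include <arm_sve.h>

  // Before the fold: a merging multiply by a splat of -1. This lowers to an
  // @llvm.aarch64.sve.mul call whose second operand is a splat of -1.
  svint32_t mul_by_minus_one(svbool_t pg, svint32_t a) {
    return svmul_s32_m(pg, a, svdup_n_s32(-1));
  }

  // What the folded form should be equivalent to: a merging negate where the
  // inactive lanes keep the value of 'a', matching the _m semantics above.
  svint32_t negate(svbool_t pg, svint32_t a) {
    return svneg_s32_m(a, pg, a);
  }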
>From 3f270aad8d6194f9f9a987d265df0f283f217d28 Mon Sep 17 00:00:00 2001
From: Martin Wehking <martin.wehking at arm.com>
Date: Thu, 4 Sep 2025 14:08:01 +0000
Subject: [PATCH] [AARCH64] Fold sve mul intrinsics using -1 to neg
Fold SVE mul and mul_u intrinsics to SVE neg intrinsics when one of
their operands is a splat of -1.

Do not perform this optimization for SVE mul intrinsics operating on
floating-point values, similar to GCC. I copied this particular
behaviour from GCC, and it might need follow-up.

This change inserts a new instruction, so it cannot be integrated into
the changes introduced by c192737 and 1997073.
---
.../AArch64/AArch64TargetTransformInfo.cpp | 36 ++++++
.../AArch64/sve-intrinsic-mul-to-neg-fold.ll | 108 ++++++++++++++++++
.../sve-intrinsic-mul_u-to-neg-fold.ll | 107 +++++++++++++++++
3 files changed, 251 insertions(+)
create mode 100644 llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-mul-to-neg-fold.ll
create mode 100644 llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-mul_u-to-neg-fold.ll
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 66272988889a2..532247c8f3b40 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -2277,6 +2277,39 @@ static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic) {
}
}
+static std::optional<Instruction *> instCombineSVEVectorMul(InstCombiner &IC,
+ IntrinsicInst &II) {
+ Value *PG = II.getOperand(0);
+ Value *Op1 = II.getOperand(1);
+ Value *Op2 = II.getOperand(2);
+
+ // Return true if the given value is a splat of the constant -1, false
+ // otherwise.
+ auto IsNegUnitSplat = [](auto *I) {
+ auto *SplatValue = getSplatValue(I);
+ ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(SplatValue);
+ if (!SplatConstantInt)
+ return false;
+ APInt SCIV = SplatConstantInt->getValue();
+ const int64_t IntValue = SCIV.getSExtValue();
+ return IntValue == -1;
+ };
+
+ if (IsNegUnitSplat(Op1)) {
+ auto *NEG = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_neg,
+ {II.getType()}, {Op2, PG, Op2});
+ return IC.replaceInstUsesWith(II, NEG);
+ }
+
+ if (IsNegUnitSplat(Op2)) {
+ auto *NEG = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_neg,
+ {II.getType()}, {Op1, PG, Op1});
+ return IC.replaceInstUsesWith(II, NEG);
+ }
+
+ return std::nullopt;
+}
+
static std::optional<Instruction *>
instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II) {
// Bail due to missing support for ISD::STRICT_ scalable vector operations.
@@ -2852,6 +2885,9 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
Intrinsic::aarch64_sve_mla_u>(
IC, II, true);
+ case Intrinsic::aarch64_sve_mul:
+ case Intrinsic::aarch64_sve_mul_u:
+ return instCombineSVEVectorMul(IC, II);
case Intrinsic::aarch64_sve_sub:
return instCombineSVEVectorSub(IC, II);
case Intrinsic::aarch64_sve_sub_u:
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-mul-to-neg-fold.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-mul-to-neg-fold.ll
new file mode 100644
index 0000000000000..a620fee7222ab
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-mul-to-neg-fold.ll
@@ -0,0 +1,108 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=instcombine < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; Muls with (-1) as operand should fold to neg.
+define <vscale x 8 x i16> @mul_neg_fold_i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) #0 {
+; CHECK-LABEL: define <vscale x 8 x i16> @mul_neg_fold_i16(
+; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x i16> [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i16> @llvm.aarch64.sve.neg.nxv8i16(<vscale x 8 x i16> [[A]], <vscale x 8 x i1> [[PG]], <vscale x 8 x i16> [[A]])
+; CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
+;
+ %1 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 -1)
+ %2 = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> %1)
+ ret <vscale x 8 x i16> %2
+}
+
+define <vscale x 4 x i32> @mul_neg_fold_i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @mul_neg_fold_i32(
+; CHECK-SAME: <vscale x 4 x i1> [[PG:%.*]], <vscale x 4 x i32> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.neg.nxv4i32(<vscale x 4 x i32> [[A]], <vscale x 4 x i1> [[PG]], <vscale x 4 x i32> [[A]])
+; CHECK-NEXT: ret <vscale x 4 x i32> [[TMP0]]
+;
+ %1 = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 -1)
+ %2 = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %1)
+ ret <vscale x 4 x i32> %2
+}
+
+define <vscale x 2 x i64> @mul_neg_fold_i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) #0 {
+; CHECK-LABEL: define <vscale x 2 x i64> @mul_neg_fold_i64(
+; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x i64> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.neg.nxv2i64(<vscale x 2 x i64> [[A]], <vscale x 2 x i1> [[PG]], <vscale x 2 x i64> [[A]])
+; CHECK-NEXT: ret <vscale x 2 x i64> [[TMP0]]
+;
+ %1 = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 -1)
+ %2 = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> %1)
+ ret <vscale x 2 x i64> %2
+}
+
+define <vscale x 8 x i16> @mul_neg_fold_two_dups(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) #0 {
+ ; Edge case -- make sure that the case where we're multiplying two dups
+ ; together is sane.
+; CHECK-LABEL: define <vscale x 8 x i16> @mul_neg_fold_two_dups(
+; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x i16> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = select <vscale x 8 x i1> [[PG]], <vscale x 8 x i16> splat (i16 1), <vscale x 8 x i16> splat (i16 -1)
+; CHECK-NEXT: ret <vscale x 8 x i16> [[TMP1]]
+;
+ %1 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 -1)
+ %2 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 -1)
+ %3 = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2)
+ ret <vscale x 8 x i16> %3
+}
+
+define <vscale x 2 x i64> @mul_neg_fold_different_argument_order(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) #0 {
+; CHECK-LABEL: define <vscale x 2 x i64> @mul_neg_fold_different_argument_order(
+; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x i64> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.neg.nxv2i64(<vscale x 2 x i64> [[A]], <vscale x 2 x i1> [[PG]], <vscale x 2 x i64> [[A]])
+; CHECK-NEXT: ret <vscale x 2 x i64> [[TMP1]]
+;
+ %1 = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 -1)
+ ; Different argument order to the above tests.
+ %2 = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %1, <vscale x 2 x i64> %a)
+ ret <vscale x 2 x i64> %2
+}
+
+; Non-foldable muls -- we don't expect these to be optimised out.
+define <vscale x 8 x i16> @no_mul_neg_fold_i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) #0 {
+; CHECK-LABEL: define <vscale x 8 x i16> @no_mul_neg_fold_i16(
+; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x i16> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.nxv8i16(<vscale x 8 x i1> [[PG]], <vscale x 8 x i16> [[A]], <vscale x 8 x i16> splat (i16 -2))
+; CHECK-NEXT: ret <vscale x 8 x i16> [[TMP1]]
+;
+ %1 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 -2)
+ %2 = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> %1)
+ ret <vscale x 8 x i16> %2
+}
+
+define <vscale x 4 x i32> @no_mul_neg_fold_i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @no_mul_neg_fold_i32(
+; CHECK-SAME: <vscale x 4 x i1> [[PG:%.*]], <vscale x 4 x i32> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1> [[PG]], <vscale x 4 x i32> [[A]], <vscale x 4 x i32> splat (i32 -2))
+; CHECK-NEXT: ret <vscale x 4 x i32> [[TMP1]]
+;
+ %1 = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 -2)
+ %2 = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %1)
+ ret <vscale x 4 x i32> %2
+}
+
+define <vscale x 2 x i64> @no_mul_neg_fold_i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) #0 {
+; CHECK-LABEL: define <vscale x 2 x i64> @no_mul_neg_fold_i64(
+; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x i64> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.nxv2i64(<vscale x 2 x i1> [[PG]], <vscale x 2 x i64> [[A]], <vscale x 2 x i64> splat (i64 -2))
+; CHECK-NEXT: ret <vscale x 2 x i64> [[TMP1]]
+;
+ %1 = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 -2)
+ %2 = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> %1)
+ ret <vscale x 2 x i64> %2
+}
+
+declare <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64)
+
+declare <vscale x 8 x i16> @llvm.aarch64.sve.mul.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.mul.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, <vscale x 2 x i64>)
+
+attributes #0 = { "target-features"="+sve" }
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-mul_u-to-neg-fold.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-mul_u-to-neg-fold.ll
new file mode 100644
index 0000000000000..ee179a57a0cae
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-mul_u-to-neg-fold.ll
@@ -0,0 +1,107 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=instcombine < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; Muls with (-1) as operand should fold to neg.
+define <vscale x 8 x i16> @mul_neg_fold_u_i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) #0 {
+; CHECK-LABEL: define <vscale x 8 x i16> @mul_neg_fold_u_i16(
+; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x i16> [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i16> @llvm.aarch64.sve.neg.nxv8i16(<vscale x 8 x i16> [[A]], <vscale x 8 x i1> [[PG]], <vscale x 8 x i16> [[A]])
+; CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
+;
+ %1 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 -1)
+ %2 = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.u.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> %1)
+ ret <vscale x 8 x i16> %2
+}
+
+define <vscale x 4 x i32> @mul_neg_fold_u_i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @mul_neg_fold_u_i32(
+; CHECK-SAME: <vscale x 4 x i1> [[PG:%.*]], <vscale x 4 x i32> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.neg.nxv4i32(<vscale x 4 x i32> [[A]], <vscale x 4 x i1> [[PG]], <vscale x 4 x i32> [[A]])
+; CHECK-NEXT: ret <vscale x 4 x i32> [[TMP0]]
+;
+ %1 = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 -1)
+ %2 = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.u.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %1)
+ ret <vscale x 4 x i32> %2
+}
+
+define <vscale x 2 x i64> @mul_neg_fold_u_i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) #0 {
+; CHECK-LABEL: define <vscale x 2 x i64> @mul_neg_fold_u_i64(
+; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x i64> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.neg.nxv2i64(<vscale x 2 x i64> [[A]], <vscale x 2 x i1> [[PG]], <vscale x 2 x i64> [[A]])
+; CHECK-NEXT: ret <vscale x 2 x i64> [[TMP0]]
+;
+ %1 = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 -1)
+ %2 = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.u.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> %1)
+ ret <vscale x 2 x i64> %2
+}
+
+define <vscale x 8 x i16> @mul_neg_fold_u_two_dups(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) #0 {
+ ; Edge case -- make sure that the case where we're multiplying two dups
+ ; together is sane.
+; CHECK-LABEL: define <vscale x 8 x i16> @mul_neg_fold_u_two_dups(
+; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x i16> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: ret <vscale x 8 x i16> splat (i16 1)
+;
+ %1 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 -1)
+ %2 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 -1)
+ %3 = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.u.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2)
+ ret <vscale x 8 x i16> %3
+}
+
+define <vscale x 2 x i64> @mul_neg_fold_u_different_argument_order(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) #0 {
+; CHECK-LABEL: define <vscale x 2 x i64> @mul_neg_fold_u_different_argument_order(
+; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x i64> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.neg.nxv2i64(<vscale x 2 x i64> [[A]], <vscale x 2 x i1> [[PG]], <vscale x 2 x i64> [[A]])
+; CHECK-NEXT: ret <vscale x 2 x i64> [[TMP1]]
+;
+ %1 = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 -1)
+ ; Different argument order to the above tests.
+ %2 = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.u.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %1, <vscale x 2 x i64> %a)
+ ret <vscale x 2 x i64> %2
+}
+
+; Non-foldable muls -- we don't expect these to be optimised out.
+define <vscale x 8 x i16> @no_mul_neg_fold_u_i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) #0 {
+; CHECK-LABEL: define <vscale x 8 x i16> @no_mul_neg_fold_u_i16(
+; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x i16> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.u.nxv8i16(<vscale x 8 x i1> [[PG]], <vscale x 8 x i16> [[A]], <vscale x 8 x i16> splat (i16 -2))
+; CHECK-NEXT: ret <vscale x 8 x i16> [[TMP1]]
+;
+ %1 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 -2)
+ %2 = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.u.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> %1)
+ ret <vscale x 8 x i16> %2
+}
+
+define <vscale x 4 x i32> @no_mul_neg_fold_u_i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @no_mul_neg_fold_u_i32(
+; CHECK-SAME: <vscale x 4 x i1> [[PG:%.*]], <vscale x 4 x i32> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.u.nxv4i32(<vscale x 4 x i1> [[PG]], <vscale x 4 x i32> [[A]], <vscale x 4 x i32> splat (i32 -2))
+; CHECK-NEXT: ret <vscale x 4 x i32> [[TMP1]]
+;
+ %1 = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 -2)
+ %2 = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.u.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %1)
+ ret <vscale x 4 x i32> %2
+}
+
+define <vscale x 2 x i64> @no_mul_neg_fold_u_i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) #0 {
+; CHECK-LABEL: define <vscale x 2 x i64> @no_mul_neg_fold_u_i64(
+; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x i64> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.u.nxv2i64(<vscale x 2 x i1> [[PG]], <vscale x 2 x i64> [[A]], <vscale x 2 x i64> splat (i64 -2))
+; CHECK-NEXT: ret <vscale x 2 x i64> [[TMP1]]
+;
+ %1 = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 -2)
+ %2 = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.u.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> %1)
+ ret <vscale x 2 x i64> %2
+}
+
+declare <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64)
+
+declare <vscale x 8 x i16> @llvm.aarch64.sve.mul.u.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.mul.u.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.mul.u.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, <vscale x 2 x i64>)
+
+attributes #0 = { "target-features"="+sve" }