[llvm] Fold SVE mul and mul_u to neg during isel (PR #160828)

Martin Wehking via llvm-commits llvm-commits at lists.llvm.org
Fri Oct 3 05:32:49 PDT 2025


https://github.com/MartinWehking updated https://github.com/llvm/llvm-project/pull/160828

>From 35b5002796394a246422f5cf8f82503fcc8c7451 Mon Sep 17 00:00:00 2001
From: Martin Wehking <martin.wehking at arm.com>
Date: Thu, 25 Sep 2025 14:02:55 +0000
Subject: [PATCH 1/5] Fold SVE mul and mul_u to neg during isel

Replace mul and mul_u ops with a neg operation if their second operand
is a splat value of -1.

Also apply the optimization to mul_u ops whose first operand is a splat
value of -1, since mul_u is commutative.
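
For illustration, a minimal IR sketch of the kind of input this fold
targets (it mirrors the tests added below; the function name is
illustrative and intrinsic declarations are assumed as in the test file):

  define <vscale x 4 x i32> @example(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a) {
    ; multiply the active lanes of %a by a splat of -1
    %ones = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 -1)
    %r = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %ones)
    ; with this patch, isel selects "neg z0.s, p0/m, z0.s" instead of a mov + mul
    ret <vscale x 4 x i32> %r
  }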
---
 .../lib/Target/AArch64/AArch64SVEInstrInfo.td |  15 ++
 llvm/lib/Target/AArch64/SVEInstrFormats.td    |  13 ++
 llvm/test/CodeGen/AArch64/sve-int-mul-neg.ll  | 213 ++++++++++++++++++
 3 files changed, 241 insertions(+)
 create mode 100644 llvm/test/CodeGen/AArch64/sve-int-mul-neg.ll

diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 1e30735b7a56a..27f4e8125f067 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -730,6 +730,21 @@ let Predicates = [HasSVE_or_SME] in {
   defm ABS_ZPmZ  : sve_int_un_pred_arit<  0b110, "abs",  AArch64abs_mt>;
   defm NEG_ZPmZ  : sve_int_un_pred_arit<  0b111, "neg",  AArch64neg_mt>;
 
+  // mul x (splat -1) -> neg x
+  let Predicates = [HasSVE_or_SME] in {
+    def : SVE_2_Op_Neg_One_Passthru_Pat<nxv16i8, AArch64mul_m1, nxv16i1, NEG_ZPmZ_B , i32>;
+    def : SVE_2_Op_Neg_One_Passthru_Pat<nxv8i16, AArch64mul_m1, nxv8i1, NEG_ZPmZ_H , i32>;
+    def : SVE_2_Op_Neg_One_Passthru_Pat<nxv4i32, AArch64mul_m1, nxv4i1, NEG_ZPmZ_S , i32>;
+    def : SVE_2_Op_Neg_One_Passthru_Pat<nxv2i64, AArch64mul_m1, nxv2i1, NEG_ZPmZ_D , i64>;
+
+    let AddedComplexity = 5 in {
+    defm : SVE_2_Op_Neg_One_Passthru_Pat_Comm<nxv16i8, AArch64mul_p, nxv16i1, NEG_ZPmZ_B , i32>;
+    defm : SVE_2_Op_Neg_One_Passthru_Pat_Comm<nxv8i16, AArch64mul_p, nxv8i1, NEG_ZPmZ_H , i32>;
+    defm : SVE_2_Op_Neg_One_Passthru_Pat_Comm<nxv4i32, AArch64mul_p, nxv4i1, NEG_ZPmZ_S , i32>;
+    defm : SVE_2_Op_Neg_One_Passthru_Pat_Comm<nxv2i64, AArch64mul_p, nxv2i1, NEG_ZPmZ_D , i64>;
+    }
+  }
+
   defm CLS_ZPmZ  : sve_int_un_pred_arit_bitwise<   0b000, "cls",  AArch64cls_mt>;
   defm CLZ_ZPmZ  : sve_int_un_pred_arit_bitwise<   0b001, "clz",  AArch64clz_mt>;
   defm CNT_ZPmZ  : sve_int_un_pred_arit_bitwise<   0b010, "cnt",  AArch64cnt_mt>;
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index 9a23c35766cac..4fdc80c2d749b 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -723,6 +723,19 @@ class SVE2p1_Cvt_VG2_Pat<string name, SDPatternOperator intrinsic, ValueType out
     : Pat<(out_vt (intrinsic in_vt:$Zn1, in_vt:$Zn2)),
                   (!cast<Instruction>(name) (REG_SEQUENCE ZPR2Mul2, in_vt:$Zn1, zsub0, in_vt:$Zn2, zsub1))>;
 
+class SVE_2_Op_Neg_One_Passthru_Pat<ValueType vt, SDPatternOperator op, ValueType pt,
+                             Instruction inst, ValueType immT>
+: Pat<(vt (op pt:$Op1, vt:$Op2, (vt (splat_vector (immT -1))))),
+      (inst $Op2, $Op1, $Op2)>;
+
+// Same as above, but commutative
+multiclass SVE_2_Op_Neg_One_Passthru_Pat_Comm<ValueType vt, SDPatternOperator op, ValueType pt,
+                                         Instruction inst, ValueType immT> {
+def : Pat<(vt (op pt:$Op1, vt:$Op2, (vt (splat_vector (immT -1))))),
+      (inst $Op2, $Op1, $Op2)>;
+def : Pat<(vt (op pt:$Op1, (vt (splat_vector (immT -1))), vt:$Op2)),
+      (inst $Op2, $Op1, $Op2)>;
+}
 //===----------------------------------------------------------------------===//
 // SVE pattern match helpers.
 //===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/AArch64/sve-int-mul-neg.ll b/llvm/test/CodeGen/AArch64/sve-int-mul-neg.ll
new file mode 100644
index 0000000000000..8db9eac027506
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-int-mul-neg.ll
@@ -0,0 +1,213 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=aarch64-linux-gnu -verify-machineinstrs -mattr=+sve < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; Muls with (-1) as operand should fold to neg.
+define <vscale x 16 x i8> @mul_neg_fold_i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a) {
+; CHECK-LABEL: mul_neg_fold_i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    neg z0.b, p0/m, z0.b
+; CHECK-NEXT:    ret
+  %1 = call <vscale x 16 x i8> @llvm.aarch64.sve.dup.x.nxv16i8(i8 -1)
+  %2 = call <vscale x 16 x i8> @llvm.aarch64.sve.mul.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <vscale x 16 x i8> %1)
+  ret <vscale x 16 x i8> %2
+}
+
+define <vscale x 8 x i16> @mul_neg_fold_i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) {
+; CHECK-LABEL: mul_neg_fold_i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    neg z0.h, p0/m, z0.h
+; CHECK-NEXT:    ret
+  %1 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 -1)
+  %2 = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> %1)
+  ret <vscale x 8 x i16> %2
+}
+
+define <vscale x 4 x i32> @mul_neg_fold_i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a) {
+; CHECK-LABEL: mul_neg_fold_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    neg z0.s, p0/m, z0.s
+; CHECK-NEXT:    ret
+  %1 = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 -1)
+  %2 = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %1)
+  ret <vscale x 4 x i32> %2
+}
+
+define <vscale x 2 x i64> @mul_neg_fold_i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) {
+; CHECK-LABEL: mul_neg_fold_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    neg z0.d, p0/m, z0.d
+; CHECK-NEXT:    ret
+  %1 = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 -1)
+  %2 = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> %1)
+  ret <vscale x 2 x i64> %2
+}
+
+define <vscale x 8 x i16> @mul_neg_fold_two_dups(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) {
+  ; Edge case -- make sure that the case where we're multiplying two dups
+  ; together is sane.
+; CHECK-LABEL: mul_neg_fold_two_dups:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.h, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    neg z0.h, p0/m, z0.h
+; CHECK-NEXT:    ret
+  %1 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 -1)
+  %2 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 -1)
+  %3 = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2)
+  ret <vscale x 8 x i16> %3
+}
+
+define <vscale x 16 x i8> @mul_neg_fold_u_i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a) {
+; CHECK-LABEL: mul_neg_fold_u_i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    neg z0.b, p0/m, z0.b
+; CHECK-NEXT:    ret
+  %1 = call <vscale x 16 x i8> @llvm.aarch64.sve.dup.x.nxv16i8(i8 -1)
+  %2 = call <vscale x 16 x i8> @llvm.aarch64.sve.mul.u.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <vscale x 16 x i8> %1)
+  ret <vscale x 16 x i8> %2
+}
+
+define <vscale x 8 x i16> @mul_neg_fold_u_i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) {
+; CHECK-LABEL: mul_neg_fold_u_i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    neg z0.h, p0/m, z0.h
+; CHECK-NEXT:    ret
+  %1 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 -1)
+  %2 = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.u.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> %1)
+  ret <vscale x 8 x i16> %2
+}
+
+define <vscale x 4 x i32> @mul_neg_fold_u_i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a) {
+; CHECK-LABEL: mul_neg_fold_u_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    neg z0.s, p0/m, z0.s
+; CHECK-NEXT:    ret
+  %1 = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 -1)
+  %2 = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.u.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %1)
+  ret <vscale x 4 x i32> %2
+}
+
+define <vscale x 2 x i64> @mul_neg_fold_u_i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) {
+; CHECK-LABEL: mul_neg_fold_u_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    neg z0.d, p0/m, z0.d
+; CHECK-NEXT:    ret
+  %1 = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 -1)
+  %2 = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.u.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> %1)
+  ret <vscale x 2 x i64> %2
+}
+
+define <vscale x 8 x i16> @mul_neg_fold_u_two_dups(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) {
+; CHECK-LABEL: mul_neg_fold_u_two_dups:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.h, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    neg z0.h, p0/m, z0.h
+; CHECK-NEXT:    ret
+  %1 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 -1)
+  %2 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 -1)
+  %3 = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.u.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2)
+  ret <vscale x 8 x i16> %3
+}
+
+; Undefined mul is commutative
+define <vscale x 2 x i64> @mul_neg_fold_u_different_argument_order(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) {
+; CHECK-LABEL: mul_neg_fold_u_different_argument_order:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    neg z0.d, p0/m, z0.d
+; CHECK-NEXT:    ret
+  %1 = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 -1)
+  %2 = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.u.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %1, <vscale x 2 x i64> %a)
+  ret <vscale x 2 x i64> %2
+}
+; Non foldable muls -- we don't expect these to be optimised out.
+define <vscale x 8 x i16> @no_mul_neg_fold_i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) {
+; CHECK-LABEL: no_mul_neg_fold_i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z1.h, #-2 // =0xfffffffffffffffe
+; CHECK-NEXT:    mul z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    ret
+  %1 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 -2)
+  %2 = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> %1)
+  ret <vscale x 8 x i16> %2
+}
+
+define <vscale x 4 x i32> @no_mul_neg_fold_i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a) {
+; CHECK-LABEL: no_mul_neg_fold_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z1.s, #-2 // =0xfffffffffffffffe
+; CHECK-NEXT:    mul z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    ret
+  %1 = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 -2)
+  %2 = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %1)
+  ret <vscale x 4 x i32> %2
+}
+
+define <vscale x 2 x i64> @no_mul_neg_fold_i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) {
+; CHECK-LABEL: no_mul_neg_fold_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z1.d, #-2 // =0xfffffffffffffffe
+; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    ret
+  %1 = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 -2)
+  %2 = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> %1)
+  ret <vscale x 2 x i64> %2
+}
+
+; Merge mul is non commutative
+define <vscale x 2 x i64> @no_mul_neg_fold_different_argument_order(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) {
+; CHECK-LABEL: no_mul_neg_fold_different_argument_order:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z1.d, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    mul z1.d, p0/m, z1.d, z0.d
+; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ret
+  %1 = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 -1)
+  %2 = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %1, <vscale x 2 x i64> %a)
+  ret <vscale x 2 x i64> %2
+}
+
+define <vscale x 8 x i16> @no_mul_neg_fold_u_i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) {
+; CHECK-LABEL: no_mul_neg_fold_u_i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mul z0.h, z0.h, #-2
+; CHECK-NEXT:    ret
+  %1 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 -2)
+  %2 = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.u.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> %1)
+  ret <vscale x 8 x i16> %2
+}
+
+define <vscale x 4 x i32> @no_mul_neg_fold_u_i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a) {
+; CHECK-LABEL: no_mul_neg_fold_u_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mul z0.s, z0.s, #-2
+; CHECK-NEXT:    ret
+  %1 = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 -2)
+  %2 = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.u.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %1)
+  ret <vscale x 4 x i32> %2
+}
+
+define <vscale x 2 x i64> @no_mul_neg_fold_u_i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) {
+; CHECK-LABEL: no_mul_neg_fold_u_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mul z0.d, z0.d, #-2
+; CHECK-NEXT:    ret
+  %1 = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 -2)
+  %2 = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.u.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> %1)
+  ret <vscale x 2 x i64> %2
+}
+
+declare <vscale x 16 x i8> @llvm.aarch64.sve.dup.x.nxv16i8(i8)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64)
+
+declare <vscale x 16 x i8> @llvm.aarch64.sve.mul.u.nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>, <vscale x 16 x i8>)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.mul.u.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.mul.u.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.mul.u.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, <vscale x 2 x i64>)
+
+declare <vscale x 16 x i8> @llvm.aarch64.sve.mul.nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>, <vscale x 16 x i8>)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.mul.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.mul.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, <vscale x 2 x i64>)

>From 725e1973052d65c59f601145648b385aaa13675e Mon Sep 17 00:00:00 2001
From: Martin Wehking <martin.wehking at arm.com>
Date: Tue, 30 Sep 2025 09:15:58 +0000
Subject: [PATCH 2/5] Lower to undef variants of neg for _u variants

Select the undef variants of neg for the mul variants whose inactive
lanes do not matter (the mul.u intrinsics).

Also create test cases for all data types when checking the
commutativity of mul.u.

Remove the target guards since only optimization patterns are added.
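
For illustration, a sketch of the commuted mul.u case that now folds
(mirroring the tests added in this patch; the function name is
illustrative and declarations are assumed as in the test file):

  define <vscale x 4 x i32> @example_u(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a) {
    %ones = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 -1)
    ; splat -1 as the first multiplicand; mul.u is commutative, so this still folds
    %r = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.u.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %ones, <vscale x 4 x i32> %a)
    ; expected lowering: "neg z0.s, p0/m, z0.s", now selected via the _UNDEF neg pseudo
    ret <vscale x 4 x i32> %r
  }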
---
 .../lib/Target/AArch64/AArch64SVEInstrInfo.td | 22 ++++++------
 llvm/test/CodeGen/AArch64/sve-int-mul-neg.ll  | 35 +++++++++++++++++--
 2 files changed, 43 insertions(+), 14 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 27f4e8125f067..37bcd3a0899ce 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -731,18 +731,16 @@ let Predicates = [HasSVE_or_SME] in {
   defm NEG_ZPmZ  : sve_int_un_pred_arit<  0b111, "neg",  AArch64neg_mt>;
 
   // mul x (splat -1) -> neg x
-  let Predicates = [HasSVE_or_SME] in {
-    def : SVE_2_Op_Neg_One_Passthru_Pat<nxv16i8, AArch64mul_m1, nxv16i1, NEG_ZPmZ_B , i32>;
-    def : SVE_2_Op_Neg_One_Passthru_Pat<nxv8i16, AArch64mul_m1, nxv8i1, NEG_ZPmZ_H , i32>;
-    def : SVE_2_Op_Neg_One_Passthru_Pat<nxv4i32, AArch64mul_m1, nxv4i1, NEG_ZPmZ_S , i32>;
-    def : SVE_2_Op_Neg_One_Passthru_Pat<nxv2i64, AArch64mul_m1, nxv2i1, NEG_ZPmZ_D , i64>;
-
-    let AddedComplexity = 5 in {
-    defm : SVE_2_Op_Neg_One_Passthru_Pat_Comm<nxv16i8, AArch64mul_p, nxv16i1, NEG_ZPmZ_B , i32>;
-    defm : SVE_2_Op_Neg_One_Passthru_Pat_Comm<nxv8i16, AArch64mul_p, nxv8i1, NEG_ZPmZ_H , i32>;
-    defm : SVE_2_Op_Neg_One_Passthru_Pat_Comm<nxv4i32, AArch64mul_p, nxv4i1, NEG_ZPmZ_S , i32>;
-    defm : SVE_2_Op_Neg_One_Passthru_Pat_Comm<nxv2i64, AArch64mul_p, nxv2i1, NEG_ZPmZ_D , i64>;
-    }
+  def : SVE_2_Op_Neg_One_Passthru_Pat<nxv16i8, AArch64mul_m1, nxv16i1, NEG_ZPmZ_B , i32>;
+  def : SVE_2_Op_Neg_One_Passthru_Pat<nxv8i16, AArch64mul_m1, nxv8i1, NEG_ZPmZ_H , i32>;
+  def : SVE_2_Op_Neg_One_Passthru_Pat<nxv4i32, AArch64mul_m1, nxv4i1, NEG_ZPmZ_S , i32>;
+  def : SVE_2_Op_Neg_One_Passthru_Pat<nxv2i64, AArch64mul_m1, nxv2i1, NEG_ZPmZ_D , i64>;
+
+  let AddedComplexity = 5 in {
+    defm : SVE_2_Op_Neg_One_Passthru_Pat_Comm<nxv16i8, AArch64mul_p, nxv16i1, NEG_ZPmZ_B_UNDEF, i32>;
+    defm : SVE_2_Op_Neg_One_Passthru_Pat_Comm<nxv8i16, AArch64mul_p, nxv8i1, NEG_ZPmZ_H_UNDEF, i32>;
+    defm : SVE_2_Op_Neg_One_Passthru_Pat_Comm<nxv4i32, AArch64mul_p, nxv4i1, NEG_ZPmZ_S_UNDEF, i32>;
+    defm : SVE_2_Op_Neg_One_Passthru_Pat_Comm<nxv2i64, AArch64mul_p, nxv2i1, NEG_ZPmZ_D_UNDEF, i64>;
   }
 
   defm CLS_ZPmZ  : sve_int_un_pred_arit_bitwise<   0b000, "cls",  AArch64cls_mt>;
diff --git a/llvm/test/CodeGen/AArch64/sve-int-mul-neg.ll b/llvm/test/CodeGen/AArch64/sve-int-mul-neg.ll
index 8db9eac027506..cf134744b0ecc 100644
--- a/llvm/test/CodeGen/AArch64/sve-int-mul-neg.ll
+++ b/llvm/test/CodeGen/AArch64/sve-int-mul-neg.ll
@@ -111,8 +111,38 @@ define <vscale x 8 x i16> @mul_neg_fold_u_two_dups(<vscale x 8 x i1> %pg, <vscal
 }
 
 ; Undefined mul is commutative
-define <vscale x 2 x i64> @mul_neg_fold_u_different_argument_order(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) {
-; CHECK-LABEL: mul_neg_fold_u_different_argument_order:
+define <vscale x 16 x i8> @mul_neg_fold_u_different_argument_order_i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a) {
+; CHECK-LABEL: mul_neg_fold_u_different_argument_order_i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    neg z0.b, p0/m, z0.b
+; CHECK-NEXT:    ret
+  %1 = call <vscale x 16 x i8> @llvm.aarch64.sve.dup.x.nxv16i8(i8 -1)
+  %2 = call <vscale x 16 x i8> @llvm.aarch64.sve.mul.u.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %1, <vscale x 16 x i8> %a)
+  ret <vscale x 16 x i8> %2
+}
+
+define <vscale x 8 x i16> @mul_neg_fold_u_different_argument_order_i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) {
+; CHECK-LABEL: mul_neg_fold_u_different_argument_order_i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    neg z0.h, p0/m, z0.h
+; CHECK-NEXT:    ret
+  %1 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 -1)
+  %2 = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.u.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %1, <vscale x 8 x i16> %a)
+  ret <vscale x 8 x i16> %2
+}
+
+define <vscale x 4 x i32> @mul_neg_fold_u_different_argument_order_i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a) {
+; CHECK-LABEL: mul_neg_fold_u_different_argument_order_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    neg z0.s, p0/m, z0.s
+; CHECK-NEXT:    ret
+  %1 = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 -1)
+  %2 = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.u.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %1, <vscale x 4 x i32> %a)
+  ret <vscale x 4 x i32> %2
+}
+
+define <vscale x 2 x i64> @mul_neg_fold_u_different_argument_order_i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) {
+; CHECK-LABEL: mul_neg_fold_u_different_argument_order_i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    neg z0.d, p0/m, z0.d
 ; CHECK-NEXT:    ret
@@ -120,6 +150,7 @@ define <vscale x 2 x i64> @mul_neg_fold_u_different_argument_order(<vscale x 2 x
   %2 = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.u.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %1, <vscale x 2 x i64> %a)
   ret <vscale x 2 x i64> %2
 }
+
 ; Non foldable muls -- we don't expect these to be optimised out.
 define <vscale x 8 x i16> @no_mul_neg_fold_i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) {
 ; CHECK-LABEL: no_mul_neg_fold_i16:

>From 1adef4d5bb6c815c635d9e70aaa03fc1d932a738 Mon Sep 17 00:00:00 2001
From: Martin Wehking <martin.wehking at arm.com>
Date: Wed, 1 Oct 2025 15:37:50 +0000
Subject: [PATCH 3/5] Add different argument order pattern for mul

Optimize the merge variants of mul where the splat value is the first
operand.
This optimization is equivalent to a neg that takes a (mov -1) as the
merge operand and op2 as the source.

Remove the redundant commutativity pattern for the undef variants of mul,
since the splat value should only appear as the second operand after
instcombine.

Adapt the tests to cover the new patterns and remove the redundant ones
for non-optimization cases.
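
For reference, a sketch of how the updated tests expect this case to
lower (the function name is illustrative; the actual tests are in
sve-int-mul-neg.ll):

  define <vscale x 2 x i64> @example_m(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) {
    %ones = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 -1)
    ; merging mul with the splat as the first operand: the inactive lanes must
    ; keep the -1 value, so the expected code (per the updated CHECK lines) is
    ;   mov z1.d, #-1
    ;   neg z1.d, p0/m, z0.d
    ;   mov z0.d, z1.d
    %r = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %ones, <vscale x 2 x i64> %a)
    ret <vscale x 2 x i64> %r
  }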
---
 .../lib/Target/AArch64/AArch64SVEInstrInfo.td |  13 +-
 llvm/lib/Target/AArch64/SVEInstrFormats.td    |  15 +-
 llvm/test/CodeGen/AArch64/sve-int-mul-neg.ll  | 138 +++---------------
 3 files changed, 36 insertions(+), 130 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 37bcd3a0899ce..dcabcaef6afb1 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -737,10 +737,10 @@ let Predicates = [HasSVE_or_SME] in {
   def : SVE_2_Op_Neg_One_Passthru_Pat<nxv2i64, AArch64mul_m1, nxv2i1, NEG_ZPmZ_D , i64>;
 
   let AddedComplexity = 5 in {
-    defm : SVE_2_Op_Neg_One_Passthru_Pat_Comm<nxv16i8, AArch64mul_p, nxv16i1, NEG_ZPmZ_B_UNDEF, i32>;
-    defm : SVE_2_Op_Neg_One_Passthru_Pat_Comm<nxv8i16, AArch64mul_p, nxv8i1, NEG_ZPmZ_H_UNDEF, i32>;
-    defm : SVE_2_Op_Neg_One_Passthru_Pat_Comm<nxv4i32, AArch64mul_p, nxv4i1, NEG_ZPmZ_S_UNDEF, i32>;
-    defm : SVE_2_Op_Neg_One_Passthru_Pat_Comm<nxv2i64, AArch64mul_p, nxv2i1, NEG_ZPmZ_D_UNDEF, i64>;
+    def : SVE_2_Op_Neg_One_Passthru_Pat<nxv16i8, AArch64mul_p, nxv16i1, NEG_ZPmZ_B_UNDEF, i32>;
+    def : SVE_2_Op_Neg_One_Passthru_Pat<nxv8i16, AArch64mul_p, nxv8i1, NEG_ZPmZ_H_UNDEF, i32>;
+    def : SVE_2_Op_Neg_One_Passthru_Pat<nxv4i32, AArch64mul_p, nxv4i1, NEG_ZPmZ_S_UNDEF, i32>;
+    def : SVE_2_Op_Neg_One_Passthru_Pat<nxv2i64, AArch64mul_p, nxv2i1, NEG_ZPmZ_D_UNDEF, i64>;
   }
 
   defm CLS_ZPmZ  : sve_int_un_pred_arit_bitwise<   0b000, "cls",  AArch64cls_mt>;
@@ -1025,6 +1025,11 @@ let Predicates = [HasSVE_or_SME] in {
   defm SEL_ZPZZ   : sve_int_sel_vvv<"sel", vselect>;
 
   defm SPLICE_ZPZ : sve_int_perm_splice<"splice", AArch64splice>;
+
+  def : SVE_2_Op_Neg_One_Replace_Pat<nxv16i8, AArch64mul_m1, nxv16i1, NEG_ZPmZ_B , DUP_ZI_B, i32>;
+  def : SVE_2_Op_Neg_One_Replace_Pat<nxv8i16, AArch64mul_m1, nxv8i1, NEG_ZPmZ_H , DUP_ZI_H, i32>;
+  def : SVE_2_Op_Neg_One_Replace_Pat<nxv4i32, AArch64mul_m1, nxv4i1, NEG_ZPmZ_S , DUP_ZI_S, i32>;
+  def : SVE_2_Op_Neg_One_Replace_Pat<nxv2i64, AArch64mul_m1, nxv2i1, NEG_ZPmZ_D , DUP_ZI_D, i64>;
 } // End HasSVE_or_SME
 
 // COMPACT - word and doubleword
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index 4fdc80c2d749b..c24062772fa6d 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -723,19 +723,16 @@ class SVE2p1_Cvt_VG2_Pat<string name, SDPatternOperator intrinsic, ValueType out
     : Pat<(out_vt (intrinsic in_vt:$Zn1, in_vt:$Zn2)),
                   (!cast<Instruction>(name) (REG_SEQUENCE ZPR2Mul2, in_vt:$Zn1, zsub0, in_vt:$Zn2, zsub1))>;
 
-class SVE_2_Op_Neg_One_Passthru_Pat<ValueType vt, SDPatternOperator op, ValueType pt,
+class SVE_2_Op_Neg_One_Replace_Pat<ValueType vt, SDPatternOperator op,  ValueType pt,
+                             Instruction inst, Instruction dup_inst, ValueType immT>
+: Pat<(vt (op pt:$Op1, (vt (splat_vector (immT -1))), vt:$Op2)),
+      (inst (dup_inst -1, 0) , $Op1, $Op2)>;
+
+class SVE_2_Op_Neg_One_Passthru_Pat<ValueType vt, SDPatternOperator op,  ValueType pt,
                              Instruction inst, ValueType immT>
 : Pat<(vt (op pt:$Op1, vt:$Op2, (vt (splat_vector (immT -1))))),
       (inst $Op2, $Op1, $Op2)>;
 
-// Same as above, but commutative
-multiclass SVE_2_Op_Neg_One_Passthru_Pat_Comm<ValueType vt, SDPatternOperator op, ValueType pt,
-                                         Instruction inst, ValueType immT> {
-def : Pat<(vt (op pt:$Op1, vt:$Op2, (vt (splat_vector (immT -1))))),
-      (inst $Op2, $Op1, $Op2)>;
-def : Pat<(vt (op pt:$Op1, (vt (splat_vector (immT -1))), vt:$Op2)),
-      (inst $Op2, $Op1, $Op2)>;
-}
 //===----------------------------------------------------------------------===//
 // SVE pattern match helpers.
 //===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/AArch64/sve-int-mul-neg.ll b/llvm/test/CodeGen/AArch64/sve-int-mul-neg.ll
index cf134744b0ecc..43cd032764d17 100644
--- a/llvm/test/CodeGen/AArch64/sve-int-mul-neg.ll
+++ b/llvm/test/CodeGen/AArch64/sve-int-mul-neg.ll
@@ -44,20 +44,6 @@ define <vscale x 2 x i64> @mul_neg_fold_i64(<vscale x 2 x i1> %pg, <vscale x 2 x
   ret <vscale x 2 x i64> %2
 }
 
-define <vscale x 8 x i16> @mul_neg_fold_two_dups(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) {
-  ; Edge case -- make sure that the case where we're multiplying two dups
-  ; together is sane.
-; CHECK-LABEL: mul_neg_fold_two_dups:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z0.h, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    neg z0.h, p0/m, z0.h
-; CHECK-NEXT:    ret
-  %1 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 -1)
-  %2 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 -1)
-  %3 = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2)
-  ret <vscale x 8 x i16> %3
-}
-
 define <vscale x 16 x i8> @mul_neg_fold_u_i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a) {
 ; CHECK-LABEL: mul_neg_fold_u_i8:
 ; CHECK:       // %bb.0:
@@ -98,99 +84,47 @@ define <vscale x 2 x i64> @mul_neg_fold_u_i64(<vscale x 2 x i1> %pg, <vscale x 2
   ret <vscale x 2 x i64> %2
 }
 
-define <vscale x 8 x i16> @mul_neg_fold_u_two_dups(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) {
-; CHECK-LABEL: mul_neg_fold_u_two_dups:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z0.h, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    neg z0.h, p0/m, z0.h
-; CHECK-NEXT:    ret
-  %1 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 -1)
-  %2 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 -1)
-  %3 = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.u.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2)
-  ret <vscale x 8 x i16> %3
-}
-
-; Undefined mul is commutative
-define <vscale x 16 x i8> @mul_neg_fold_u_different_argument_order_i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a) {
-; CHECK-LABEL: mul_neg_fold_u_different_argument_order_i8:
+define <vscale x 16 x i8> @mul_neg_fold_different_argument_order_i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a) {
+; CHECK-LABEL: mul_neg_fold_different_argument_order_i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    neg z0.b, p0/m, z0.b
+; CHECK-NEXT:    mov z1.b, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    neg z1.b, p0/m, z0.b
+; CHECK-NEXT:    mov z0.d, z1.d
 ; CHECK-NEXT:    ret
   %1 = call <vscale x 16 x i8> @llvm.aarch64.sve.dup.x.nxv16i8(i8 -1)
-  %2 = call <vscale x 16 x i8> @llvm.aarch64.sve.mul.u.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %1, <vscale x 16 x i8> %a)
+  %2 = call <vscale x 16 x i8> @llvm.aarch64.sve.mul.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %1, <vscale x 16 x i8> %a)
   ret <vscale x 16 x i8> %2
 }
 
-define <vscale x 8 x i16> @mul_neg_fold_u_different_argument_order_i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) {
-; CHECK-LABEL: mul_neg_fold_u_different_argument_order_i16:
+define <vscale x 8 x i16> @mul_neg_fold_different_argument_order_i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) {
+; CHECK-LABEL: mul_neg_fold_different_argument_order_i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    neg z0.h, p0/m, z0.h
+; CHECK-NEXT:    mov z1.h, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    neg z1.h, p0/m, z0.h
+; CHECK-NEXT:    mov z0.d, z1.d
 ; CHECK-NEXT:    ret
   %1 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 -1)
-  %2 = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.u.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %1, <vscale x 8 x i16> %a)
+  %2 = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %1, <vscale x 8 x i16> %a)
   ret <vscale x 8 x i16> %2
 }
 
-define <vscale x 4 x i32> @mul_neg_fold_u_different_argument_order_i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a) {
-; CHECK-LABEL: mul_neg_fold_u_different_argument_order_i32:
+define <vscale x 4 x i32> @mul_neg_fold_different_argument_order_i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a) {
+; CHECK-LABEL: mul_neg_fold_different_argument_order_i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    neg z0.s, p0/m, z0.s
+; CHECK-NEXT:    mov z1.s, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    neg z1.s, p0/m, z0.s
+; CHECK-NEXT:    mov z0.d, z1.d
 ; CHECK-NEXT:    ret
   %1 = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 -1)
-  %2 = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.u.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %1, <vscale x 4 x i32> %a)
+  %2 = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %1, <vscale x 4 x i32> %a)
   ret <vscale x 4 x i32> %2
 }
 
-define <vscale x 2 x i64> @mul_neg_fold_u_different_argument_order_i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) {
-; CHECK-LABEL: mul_neg_fold_u_different_argument_order_i64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    neg z0.d, p0/m, z0.d
-; CHECK-NEXT:    ret
-  %1 = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 -1)
-  %2 = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.u.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %1, <vscale x 2 x i64> %a)
-  ret <vscale x 2 x i64> %2
-}
-
-; Non foldable muls -- we don't expect these to be optimised out.
-define <vscale x 8 x i16> @no_mul_neg_fold_i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) {
-; CHECK-LABEL: no_mul_neg_fold_i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z1.h, #-2 // =0xfffffffffffffffe
-; CHECK-NEXT:    mul z0.h, p0/m, z0.h, z1.h
-; CHECK-NEXT:    ret
-  %1 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 -2)
-  %2 = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> %1)
-  ret <vscale x 8 x i16> %2
-}
-
-define <vscale x 4 x i32> @no_mul_neg_fold_i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a) {
-; CHECK-LABEL: no_mul_neg_fold_i32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z1.s, #-2 // =0xfffffffffffffffe
-; CHECK-NEXT:    mul z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT:    ret
-  %1 = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 -2)
-  %2 = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %1)
-  ret <vscale x 4 x i32> %2
-}
-
-define <vscale x 2 x i64> @no_mul_neg_fold_i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) {
-; CHECK-LABEL: no_mul_neg_fold_i64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z1.d, #-2 // =0xfffffffffffffffe
-; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
-; CHECK-NEXT:    ret
-  %1 = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 -2)
-  %2 = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> %1)
-  ret <vscale x 2 x i64> %2
-}
-
-; Merge mul is non commutative
-define <vscale x 2 x i64> @no_mul_neg_fold_different_argument_order(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) {
-; CHECK-LABEL: no_mul_neg_fold_different_argument_order:
+define <vscale x 2 x i64> @mul_neg_fold_different_argument_order_i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) {
+; CHECK-LABEL: mul_neg_fold_different_argument_order_i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov z1.d, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    mul z1.d, p0/m, z1.d, z0.d
+; CHECK-NEXT:    neg z1.d, p0/m, z0.d
 ; CHECK-NEXT:    mov z0.d, z1.d
 ; CHECK-NEXT:    ret
   %1 = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 -1)
@@ -198,36 +132,6 @@ define <vscale x 2 x i64> @no_mul_neg_fold_different_argument_order(<vscale x 2
   ret <vscale x 2 x i64> %2
 }
 
-define <vscale x 8 x i16> @no_mul_neg_fold_u_i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) {
-; CHECK-LABEL: no_mul_neg_fold_u_i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mul z0.h, z0.h, #-2
-; CHECK-NEXT:    ret
-  %1 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 -2)
-  %2 = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.u.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> %1)
-  ret <vscale x 8 x i16> %2
-}
-
-define <vscale x 4 x i32> @no_mul_neg_fold_u_i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a) {
-; CHECK-LABEL: no_mul_neg_fold_u_i32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mul z0.s, z0.s, #-2
-; CHECK-NEXT:    ret
-  %1 = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 -2)
-  %2 = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.u.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %1)
-  ret <vscale x 4 x i32> %2
-}
-
-define <vscale x 2 x i64> @no_mul_neg_fold_u_i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) {
-; CHECK-LABEL: no_mul_neg_fold_u_i64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mul z0.d, z0.d, #-2
-; CHECK-NEXT:    ret
-  %1 = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 -2)
-  %2 = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.u.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> %1)
-  ret <vscale x 2 x i64> %2
-}
-
 declare <vscale x 16 x i8> @llvm.aarch64.sve.dup.x.nxv16i8(i8)
 declare <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16)
 declare <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32)

>From 17571be13143d58c32d69a779c20ee0891227671 Mon Sep 17 00:00:00 2001
From: Martin Wehking <martin.wehking at arm.com>
Date: Thu, 2 Oct 2025 14:50:01 +0000
Subject: [PATCH 4/5] Define Patterns directly

Do not use helper classes; create the patterns directly.

Move the pattern definitions to a similar spot.
---
 .../lib/Target/AArch64/AArch64SVEInstrInfo.td | 46 ++++++++++++-------
 llvm/lib/Target/AArch64/SVEInstrFormats.td    | 10 ----
 2 files changed, 29 insertions(+), 27 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index dcabcaef6afb1..2cb29b94129bb 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -730,19 +730,6 @@ let Predicates = [HasSVE_or_SME] in {
   defm ABS_ZPmZ  : sve_int_un_pred_arit<  0b110, "abs",  AArch64abs_mt>;
   defm NEG_ZPmZ  : sve_int_un_pred_arit<  0b111, "neg",  AArch64neg_mt>;
 
-  // mul x (splat -1) -> neg x
-  def : SVE_2_Op_Neg_One_Passthru_Pat<nxv16i8, AArch64mul_m1, nxv16i1, NEG_ZPmZ_B , i32>;
-  def : SVE_2_Op_Neg_One_Passthru_Pat<nxv8i16, AArch64mul_m1, nxv8i1, NEG_ZPmZ_H , i32>;
-  def : SVE_2_Op_Neg_One_Passthru_Pat<nxv4i32, AArch64mul_m1, nxv4i1, NEG_ZPmZ_S , i32>;
-  def : SVE_2_Op_Neg_One_Passthru_Pat<nxv2i64, AArch64mul_m1, nxv2i1, NEG_ZPmZ_D , i64>;
-
-  let AddedComplexity = 5 in {
-    def : SVE_2_Op_Neg_One_Passthru_Pat<nxv16i8, AArch64mul_p, nxv16i1, NEG_ZPmZ_B_UNDEF, i32>;
-    def : SVE_2_Op_Neg_One_Passthru_Pat<nxv8i16, AArch64mul_p, nxv8i1, NEG_ZPmZ_H_UNDEF, i32>;
-    def : SVE_2_Op_Neg_One_Passthru_Pat<nxv4i32, AArch64mul_p, nxv4i1, NEG_ZPmZ_S_UNDEF, i32>;
-    def : SVE_2_Op_Neg_One_Passthru_Pat<nxv2i64, AArch64mul_p, nxv2i1, NEG_ZPmZ_D_UNDEF, i64>;
-  }
-
   defm CLS_ZPmZ  : sve_int_un_pred_arit_bitwise<   0b000, "cls",  AArch64cls_mt>;
   defm CLZ_ZPmZ  : sve_int_un_pred_arit_bitwise<   0b001, "clz",  AArch64clz_mt>;
   defm CNT_ZPmZ  : sve_int_un_pred_arit_bitwise<   0b010, "cnt",  AArch64cnt_mt>;
@@ -1026,10 +1013,35 @@ let Predicates = [HasSVE_or_SME] in {
 
   defm SPLICE_ZPZ : sve_int_perm_splice<"splice", AArch64splice>;
 
-  def : SVE_2_Op_Neg_One_Replace_Pat<nxv16i8, AArch64mul_m1, nxv16i1, NEG_ZPmZ_B , DUP_ZI_B, i32>;
-  def : SVE_2_Op_Neg_One_Replace_Pat<nxv8i16, AArch64mul_m1, nxv8i1, NEG_ZPmZ_H , DUP_ZI_H, i32>;
-  def : SVE_2_Op_Neg_One_Replace_Pat<nxv4i32, AArch64mul_m1, nxv4i1, NEG_ZPmZ_S , DUP_ZI_S, i32>;
-  def : SVE_2_Op_Neg_One_Replace_Pat<nxv2i64, AArch64mul_m1, nxv2i1, NEG_ZPmZ_D , DUP_ZI_D, i64>;
+  // mul x (splat -1) -> neg x
+  def : Pat<(nxv16i8 (AArch64mul_m1 nxv16i1:$Op1, nxv16i8:$Op2, (nxv16i8 (splat_vector (i32 -1))))),
+      (NEG_ZPmZ_B $Op2, $Op1, $Op2)>;
+  def : Pat<(nxv8i16 (AArch64mul_m1 nxv8i1:$Op1, nxv8i16:$Op2, (nxv8i16 (splat_vector (i32 -1))))),
+      (NEG_ZPmZ_H $Op2, $Op1, $Op2)>;
+  def : Pat<(nxv4i32 (AArch64mul_m1 nxv4i1:$Op1, nxv4i32:$Op2, (nxv4i32 (splat_vector (i32 -1))))),
+      (NEG_ZPmZ_S $Op2, $Op1, $Op2)>;
+  def : Pat<(nxv2i64 (AArch64mul_m1 nxv2i1:$Op1, nxv2i64:$Op2, (nxv2i64 (splat_vector (i64 -1))))),
+      (NEG_ZPmZ_D $Op2, $Op1, $Op2)>;
+
+  let AddedComplexity = 5 in {
+    def : Pat<(nxv16i8 (AArch64mul_p nxv16i1:$Op1, nxv16i8:$Op2, (nxv16i8 (splat_vector (i32 -1))))),
+        (NEG_ZPmZ_B_UNDEF $Op2, $Op1, $Op2)>;
+    def : Pat<(nxv8i16 (AArch64mul_p nxv8i1:$Op1, nxv8i16:$Op2, (nxv8i16 (splat_vector (i32 -1))))),
+        (NEG_ZPmZ_H_UNDEF $Op2, $Op1, $Op2)>;
+    def : Pat<(nxv4i32 (AArch64mul_p nxv4i1:$Op1, nxv4i32:$Op2, (nxv4i32 (splat_vector (i32 -1))))),
+        (NEG_ZPmZ_S_UNDEF $Op2, $Op1, $Op2)>;
+    def : Pat<(nxv2i64 (AArch64mul_p nxv2i1:$Op1, nxv2i64:$Op2, (nxv2i64 (splat_vector (i64 -1))))),
+        (NEG_ZPmZ_D_UNDEF $Op2, $Op1, $Op2)>;
+  }
+
+  def : Pat<(nxv16i8 (AArch64mul_m1 nxv16i1:$Op1, (nxv16i8 (splat_vector (i32 -1))), nxv16i8:$Op2)),
+        (NEG_ZPmZ_B (DUP_ZI_B -1, 0) , $Op1, $Op2)>;
+  def : Pat<(nxv8i16 (AArch64mul_m1 nxv8i1:$Op1, (nxv8i16 (splat_vector (i32 -1))), nxv8i16:$Op2)),
+        (NEG_ZPmZ_H (DUP_ZI_H -1, 0) , $Op1, $Op2)>;
+  def : Pat<(nxv4i32 (AArch64mul_m1 nxv4i1:$Op1, (nxv4i32 (splat_vector (i32 -1))), nxv4i32:$Op2)),
+        (NEG_ZPmZ_S (DUP_ZI_S -1, 0) , $Op1, $Op2)>;
+  def : Pat<(nxv2i64 (AArch64mul_m1 nxv2i1:$Op1, (nxv2i64 (splat_vector (i64 -1))), nxv2i64:$Op2)),
+        (NEG_ZPmZ_D (DUP_ZI_D -1, 0) , $Op1, $Op2)>;
 } // End HasSVE_or_SME
 
 // COMPACT - word and doubleword
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index c24062772fa6d..9a23c35766cac 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -723,16 +723,6 @@ class SVE2p1_Cvt_VG2_Pat<string name, SDPatternOperator intrinsic, ValueType out
     : Pat<(out_vt (intrinsic in_vt:$Zn1, in_vt:$Zn2)),
                   (!cast<Instruction>(name) (REG_SEQUENCE ZPR2Mul2, in_vt:$Zn1, zsub0, in_vt:$Zn2, zsub1))>;
 
-class SVE_2_Op_Neg_One_Replace_Pat<ValueType vt, SDPatternOperator op,  ValueType pt,
-                             Instruction inst, Instruction dup_inst, ValueType immT>
-: Pat<(vt (op pt:$Op1, (vt (splat_vector (immT -1))), vt:$Op2)),
-      (inst (dup_inst -1, 0) , $Op1, $Op2)>;
-
-class SVE_2_Op_Neg_One_Passthru_Pat<ValueType vt, SDPatternOperator op,  ValueType pt,
-                             Instruction inst, ValueType immT>
-: Pat<(vt (op pt:$Op1, vt:$Op2, (vt (splat_vector (immT -1))))),
-      (inst $Op2, $Op1, $Op2)>;
-
 //===----------------------------------------------------------------------===//
 // SVE pattern match helpers.
 //===----------------------------------------------------------------------===//

>From 316700d5b4bc2b09f1e61b177cd3cc0e16d0b870 Mon Sep 17 00:00:00 2001
From: Martin Wehking <martin.wehking at arm.com>
Date: Fri, 3 Oct 2025 11:26:36 +0000
Subject: [PATCH 5/5] Simplify test cases and reformat patch

Replace some dups with splat_vectors and remove stray whitespace.
Remove the redundant -mtriple argument from the RUN line.
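
For reference, the operand cleanup in the tests follows roughly this
pattern (a sketch; the function name is illustrative and the real
changes are in sve-int-mul-neg.ll):

  define <vscale x 8 x i16> @example(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) {
    ; previously: %ones = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 -1)
    ; now the splat constant is written inline as the second multiplicand:
    %r = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> splat(i16 -1))
    ret <vscale x 8 x i16> %r
  }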
---
 .../lib/Target/AArch64/AArch64SVEInstrInfo.td |  8 +--
 llvm/test/CodeGen/AArch64/sve-int-mul-neg.ll  | 67 +++++++------------
 2 files changed, 29 insertions(+), 46 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 2cb29b94129bb..98239f3cfe90b 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -1035,13 +1035,13 @@ let Predicates = [HasSVE_or_SME] in {
   }
 
   def : Pat<(nxv16i8 (AArch64mul_m1 nxv16i1:$Op1, (nxv16i8 (splat_vector (i32 -1))), nxv16i8:$Op2)),
-        (NEG_ZPmZ_B (DUP_ZI_B -1, 0) , $Op1, $Op2)>;
+        (NEG_ZPmZ_B (DUP_ZI_B -1, 0), $Op1, $Op2)>;
   def : Pat<(nxv8i16 (AArch64mul_m1 nxv8i1:$Op1, (nxv8i16 (splat_vector (i32 -1))), nxv8i16:$Op2)),
-        (NEG_ZPmZ_H (DUP_ZI_H -1, 0) , $Op1, $Op2)>;
+        (NEG_ZPmZ_H (DUP_ZI_H -1, 0), $Op1, $Op2)>;
   def : Pat<(nxv4i32 (AArch64mul_m1 nxv4i1:$Op1, (nxv4i32 (splat_vector (i32 -1))), nxv4i32:$Op2)),
-        (NEG_ZPmZ_S (DUP_ZI_S -1, 0) , $Op1, $Op2)>;
+        (NEG_ZPmZ_S (DUP_ZI_S -1, 0), $Op1, $Op2)>;
   def : Pat<(nxv2i64 (AArch64mul_m1 nxv2i1:$Op1, (nxv2i64 (splat_vector (i64 -1))), nxv2i64:$Op2)),
-        (NEG_ZPmZ_D (DUP_ZI_D -1, 0) , $Op1, $Op2)>;
+        (NEG_ZPmZ_D (DUP_ZI_D -1, 0), $Op1, $Op2)>;
 } // End HasSVE_or_SME
 
 // COMPACT - word and doubleword
diff --git a/llvm/test/CodeGen/AArch64/sve-int-mul-neg.ll b/llvm/test/CodeGen/AArch64/sve-int-mul-neg.ll
index 43cd032764d17..a1065bcf7d7da 100644
--- a/llvm/test/CodeGen/AArch64/sve-int-mul-neg.ll
+++ b/llvm/test/CodeGen/AArch64/sve-int-mul-neg.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=aarch64-linux-gnu -verify-machineinstrs -mattr=+sve < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -mattr=+sve < %s | FileCheck %s
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -9,9 +9,8 @@ define <vscale x 16 x i8> @mul_neg_fold_i8(<vscale x 16 x i1> %pg, <vscale x 16
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    neg z0.b, p0/m, z0.b
 ; CHECK-NEXT:    ret
-  %1 = call <vscale x 16 x i8> @llvm.aarch64.sve.dup.x.nxv16i8(i8 -1)
-  %2 = call <vscale x 16 x i8> @llvm.aarch64.sve.mul.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <vscale x 16 x i8> %1)
-  ret <vscale x 16 x i8> %2
+  %1 = call <vscale x 16 x i8> @llvm.aarch64.sve.mul.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <vscale x 16 x i8> splat(i8 -1))
+  ret <vscale x 16 x i8> %1
 }
 
 define <vscale x 8 x i16> @mul_neg_fold_i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) {
@@ -19,9 +18,8 @@ define <vscale x 8 x i16> @mul_neg_fold_i16(<vscale x 8 x i1> %pg, <vscale x 8 x
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    neg z0.h, p0/m, z0.h
 ; CHECK-NEXT:    ret
-  %1 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 -1)
-  %2 = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> %1)
-  ret <vscale x 8 x i16> %2
+  %1 = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> splat(i16 -1))
+  ret <vscale x 8 x i16> %1
 }
 
 define <vscale x 4 x i32> @mul_neg_fold_i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a) {
@@ -29,9 +27,8 @@ define <vscale x 4 x i32> @mul_neg_fold_i32(<vscale x 4 x i1> %pg, <vscale x 4 x
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    neg z0.s, p0/m, z0.s
 ; CHECK-NEXT:    ret
-  %1 = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 -1)
-  %2 = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %1)
-  ret <vscale x 4 x i32> %2
+  %1 = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> splat(i32 -1))
+  ret <vscale x 4 x i32> %1
 }
 
 define <vscale x 2 x i64> @mul_neg_fold_i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) {
@@ -39,9 +36,8 @@ define <vscale x 2 x i64> @mul_neg_fold_i64(<vscale x 2 x i1> %pg, <vscale x 2 x
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    neg z0.d, p0/m, z0.d
 ; CHECK-NEXT:    ret
-  %1 = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 -1)
-  %2 = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> %1)
-  ret <vscale x 2 x i64> %2
+  %1 = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> splat(i64 -1))
+  ret <vscale x 2 x i64> %1
 }
 
 define <vscale x 16 x i8> @mul_neg_fold_u_i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a) {
@@ -49,9 +45,8 @@ define <vscale x 16 x i8> @mul_neg_fold_u_i8(<vscale x 16 x i1> %pg, <vscale x 1
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    neg z0.b, p0/m, z0.b
 ; CHECK-NEXT:    ret
-  %1 = call <vscale x 16 x i8> @llvm.aarch64.sve.dup.x.nxv16i8(i8 -1)
-  %2 = call <vscale x 16 x i8> @llvm.aarch64.sve.mul.u.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <vscale x 16 x i8> %1)
-  ret <vscale x 16 x i8> %2
+  %1 = call <vscale x 16 x i8> @llvm.aarch64.sve.mul.u.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <vscale x 16 x i8> splat(i8 -1))
+  ret <vscale x 16 x i8> %1
 }
 
 define <vscale x 8 x i16> @mul_neg_fold_u_i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) {
@@ -59,9 +54,8 @@ define <vscale x 8 x i16> @mul_neg_fold_u_i16(<vscale x 8 x i1> %pg, <vscale x 8
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    neg z0.h, p0/m, z0.h
 ; CHECK-NEXT:    ret
-  %1 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 -1)
-  %2 = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.u.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> %1)
-  ret <vscale x 8 x i16> %2
+  %1 = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.u.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> splat(i16 -1))
+  ret <vscale x 8 x i16> %1
 }
 
 define <vscale x 4 x i32> @mul_neg_fold_u_i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a) {
@@ -69,9 +63,8 @@ define <vscale x 4 x i32> @mul_neg_fold_u_i32(<vscale x 4 x i1> %pg, <vscale x 4
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    neg z0.s, p0/m, z0.s
 ; CHECK-NEXT:    ret
-  %1 = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 -1)
-  %2 = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.u.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %1)
-  ret <vscale x 4 x i32> %2
+  %1 = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.u.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> splat(i32 -1))
+  ret <vscale x 4 x i32> %1
 }
 
 define <vscale x 2 x i64> @mul_neg_fold_u_i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) {
@@ -79,9 +72,8 @@ define <vscale x 2 x i64> @mul_neg_fold_u_i64(<vscale x 2 x i1> %pg, <vscale x 2
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    neg z0.d, p0/m, z0.d
 ; CHECK-NEXT:    ret
-  %1 = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 -1)
-  %2 = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.u.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> %1)
-  ret <vscale x 2 x i64> %2
+  %1 = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.u.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> splat(i64 -1))
+  ret <vscale x 2 x i64> %1
 }
 
 define <vscale x 16 x i8> @mul_neg_fold_different_argument_order_i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a) {
@@ -91,9 +83,8 @@ define <vscale x 16 x i8> @mul_neg_fold_different_argument_order_i8(<vscale x 16
 ; CHECK-NEXT:    neg z1.b, p0/m, z0.b
 ; CHECK-NEXT:    mov z0.d, z1.d
 ; CHECK-NEXT:    ret
-  %1 = call <vscale x 16 x i8> @llvm.aarch64.sve.dup.x.nxv16i8(i8 -1)
-  %2 = call <vscale x 16 x i8> @llvm.aarch64.sve.mul.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %1, <vscale x 16 x i8> %a)
-  ret <vscale x 16 x i8> %2
+  %1 = call <vscale x 16 x i8> @llvm.aarch64.sve.mul.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> splat(i8 -1), <vscale x 16 x i8> %a)
+  ret <vscale x 16 x i8> %1
 }
 
 define <vscale x 8 x i16> @mul_neg_fold_different_argument_order_i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) {
@@ -103,9 +94,8 @@ define <vscale x 8 x i16> @mul_neg_fold_different_argument_order_i16(<vscale x 8
 ; CHECK-NEXT:    neg z1.h, p0/m, z0.h
 ; CHECK-NEXT:    mov z0.d, z1.d
 ; CHECK-NEXT:    ret
-  %1 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 -1)
-  %2 = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %1, <vscale x 8 x i16> %a)
-  ret <vscale x 8 x i16> %2
+  %1 = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> splat(i16 -1), <vscale x 8 x i16> %a)
+  ret <vscale x 8 x i16> %1
 }
 
 define <vscale x 4 x i32> @mul_neg_fold_different_argument_order_i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a) {
@@ -115,9 +105,8 @@ define <vscale x 4 x i32> @mul_neg_fold_different_argument_order_i32(<vscale x 4
 ; CHECK-NEXT:    neg z1.s, p0/m, z0.s
 ; CHECK-NEXT:    mov z0.d, z1.d
 ; CHECK-NEXT:    ret
-  %1 = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 -1)
-  %2 = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %1, <vscale x 4 x i32> %a)
-  ret <vscale x 4 x i32> %2
+  %1 = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> splat(i32 -1), <vscale x 4 x i32> %a)
+  ret <vscale x 4 x i32> %1
 }
 
 define <vscale x 2 x i64> @mul_neg_fold_different_argument_order_i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) {
@@ -127,16 +116,10 @@ define <vscale x 2 x i64> @mul_neg_fold_different_argument_order_i64(<vscale x 2
 ; CHECK-NEXT:    neg z1.d, p0/m, z0.d
 ; CHECK-NEXT:    mov z0.d, z1.d
 ; CHECK-NEXT:    ret
-  %1 = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 -1)
-  %2 = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %1, <vscale x 2 x i64> %a)
-  ret <vscale x 2 x i64> %2
+  %1 = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> splat(i64 -1), <vscale x 2 x i64> %a)
+  ret <vscale x 2 x i64> %1
 }
 
-declare <vscale x 16 x i8> @llvm.aarch64.sve.dup.x.nxv16i8(i8)
-declare <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16)
-declare <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32)
-declare <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64)
-
 declare <vscale x 16 x i8> @llvm.aarch64.sve.mul.u.nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>, <vscale x 16 x i8>)
 declare <vscale x 8 x i16> @llvm.aarch64.sve.mul.u.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>)
 declare <vscale x 4 x i32> @llvm.aarch64.sve.mul.u.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 4 x i32>)


