[llvm] dc63b35 - [SVE ACLE] Extend IR combines for fmul, fsub, fadd to cover _u variants

Jolanta Jensen via llvm-commits llvm-commits at lists.llvm.org
Fri Jun 2 04:07:52 PDT 2023


Author: Jolanta Jensen
Date: 2023-06-02T11:06:57Z
New Revision: dc63b35b02231a75d131fb6376d2e58a7ad9b7e4

URL: https://github.com/llvm/llvm-project/commit/dc63b35b02231a75d131fb6376d2e58a7ad9b7e4
DIFF: https://github.com/llvm/llvm-project/commit/dc63b35b02231a75d131fb6376d2e58a7ad9b7e4.diff

LOG: [SVE ACLE] Extend IR combines for fmul, fsub, fadd to cover _u variants

This patch extends the existing IR combines for fmul, fsub and fadd
that rely on an all-active predicate so that they also apply to the
equivalent undef (_u) intrinsics.
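
For illustration (a sketch taken from the new fmul_u test added below;
the fast-math flags and element type are just those used in that test),
a _u call with an all-active predicate now folds to the plain IR
instruction:

  ; Before instcombine:
  %pg = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
  %r  = tail call fast <vscale x 4 x float> @llvm.aarch64.sve.fmul.u.nxv4f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a, <vscale x 4 x float> %b)

  ; After instcombine:
  %r = fmul fast <vscale x 4 x float> %a, %b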

Differential Revision: https://reviews.llvm.org/D150768

Added: 
    llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fmul_u-idempotency.ll

Modified: 
    llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
    llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fma-binops.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index efd96628d4b86..84e62a888a875 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1243,10 +1243,13 @@ instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
 static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic) {
   switch (Intrinsic) {
   case Intrinsic::aarch64_sve_fmul:
+  case Intrinsic::aarch64_sve_fmul_u:
     return Instruction::BinaryOps::FMul;
   case Intrinsic::aarch64_sve_fadd:
+  case Intrinsic::aarch64_sve_fadd_u:
     return Instruction::BinaryOps::FAdd;
   case Intrinsic::aarch64_sve_fsub:
+  case Intrinsic::aarch64_sve_fsub_u:
     return Instruction::BinaryOps::FSub;
   default:
     return Instruction::BinaryOpsEnd;
@@ -1292,6 +1295,11 @@ static std::optional<Instruction *> instCombineSVEVectorAdd(InstCombiner &IC,
                                                    Intrinsic::aarch64_sve_mad>(
           IC, II, false))
     return MAD;
+  if (auto FMLA_U =
+          instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
+                                            Intrinsic::aarch64_sve_fmla_u>(
+              IC, II, true))
+    return FMLA_U;
   return instCombineSVEVectorBinOp(IC, II);
 }
 
@@ -1311,6 +1319,11 @@ static std::optional<Instruction *> instCombineSVEVectorSub(InstCombiner &IC,
                                             Intrinsic::aarch64_sve_fnmsb>(
               IC, II, false))
     return FMSB;
+  if (auto FMLS_U =
+          instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
+                                            Intrinsic::aarch64_sve_fmls_u>(
+              IC, II, true))
+    return FMLS_U;
   return instCombineSVEVectorBinOp(IC, II);
 }
 
@@ -1684,25 +1697,20 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
     return instCombineSVEPTest(IC, II);
   case Intrinsic::aarch64_sve_mul:
   case Intrinsic::aarch64_sve_fmul:
+  case Intrinsic::aarch64_sve_fmul_u:
     return instCombineSVEVectorMul(IC, II);
   case Intrinsic::aarch64_sve_fadd:
+  case Intrinsic::aarch64_sve_fadd_u:
   case Intrinsic::aarch64_sve_add:
     return instCombineSVEVectorAdd(IC, II);
-  case Intrinsic::aarch64_sve_fadd_u:
-    return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
-                                             Intrinsic::aarch64_sve_fmla_u>(
-        IC, II, true);
   case Intrinsic::aarch64_sve_add_u:
     return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
                                              Intrinsic::aarch64_sve_mla_u>(
         IC, II, true);
   case Intrinsic::aarch64_sve_fsub:
+  case Intrinsic::aarch64_sve_fsub_u:
   case Intrinsic::aarch64_sve_sub:
     return instCombineSVEVectorSub(IC, II);
-  case Intrinsic::aarch64_sve_fsub_u:
-    return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
-                                             Intrinsic::aarch64_sve_fmls_u>(
-        IC, II, true);
   case Intrinsic::aarch64_sve_sub_u:
     return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
                                              Intrinsic::aarch64_sve_mls_u>(

diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fma-binops.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fma-binops.ll
index b2d5668f0b7bf..55810620a3b10 100644
--- a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fma-binops.ll
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fma-binops.ll
@@ -6,7 +6,7 @@ declare <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32)
 declare <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32)
 declare <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32)
 
-; SVE intrinsics fmul and fadd should be replaced with regular fmul and fadd
+; SVE intrinsics fmul, fmul_u, fadd, fadd_u, fsub and fsub_u should be replaced with regular fmul, fadd and fsub.
 declare <vscale x 8 x half> @llvm.aarch64.sve.fmul.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>)
 define <vscale x 8 x half> @replace_fmul_intrinsic_half(<vscale x 8 x half> %a, <vscale x 8 x half> %b) #0 {
 ; CHECK-LABEL: @replace_fmul_intrinsic_half
@@ -37,6 +37,36 @@ define <vscale x 2 x double> @replace_fmul_intrinsic_double(<vscale x 2 x double
   ret <vscale x 2 x double> %2
 }
 
+declare <vscale x 8 x half> @llvm.aarch64.sve.fmul.u.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>)
+define <vscale x 8 x half> @replace_fmul_u_intrinsic_half(<vscale x 8 x half> %a, <vscale x 8 x half> %b) #0 {
+; CHECK-LABEL: @replace_fmul_u_intrinsic_half
+; CHECK-NEXT:  %1 = fmul fast <vscale x 8 x half> %a, %b
+; CHECK-NEXT:  ret <vscale x 8 x half> %1
+  %1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %2 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmul.u.nxv8f16(<vscale x 8 x i1> %1, <vscale x 8 x half> %a, <vscale x 8 x half> %b)
+  ret <vscale x 8 x half> %2
+}
+
+declare <vscale x 4 x float> @llvm.aarch64.sve.fmul.u.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>, <vscale x 4 x float>)
+define <vscale x 4 x float> @replace_fmul_u_intrinsic_float(<vscale x 4 x float> %a, <vscale x 4 x float> %b) #0 {
+; CHECK-LABEL: @replace_fmul_u_intrinsic_float
+; CHECK-NEXT:  %1 = fmul fast <vscale x 4 x float> %a, %b
+; CHECK-NEXT:  ret <vscale x 4 x float> %1
+  %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %2 = tail call fast <vscale x 4 x float> @llvm.aarch64.sve.fmul.u.nxv4f32(<vscale x 4 x i1> %1, <vscale x 4 x float> %a, <vscale x 4 x float> %b)
+  ret <vscale x 4 x float> %2
+}
+
+declare <vscale x 2 x double> @llvm.aarch64.sve.fmul.u.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>)
+define <vscale x 2 x double> @replace_fmul_u_intrinsic_double(<vscale x 2 x double> %a, <vscale x 2 x double> %b) #0 {
+; CHECK-LABEL: @replace_fmul_u_intrinsic_double
+; CHECK-NEXT:  %1 = fmul fast <vscale x 2 x double> %a, %b
+; CHECK-NEXT:  ret <vscale x 2 x double> %1
+  %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %2 = tail call fast <vscale x 2 x double> @llvm.aarch64.sve.fmul.u.nxv2f64(<vscale x 2 x i1> %1, <vscale x 2 x double> %a, <vscale x 2 x double> %b)
+  ret <vscale x 2 x double> %2
+}
+
 declare <vscale x 8 x half> @llvm.aarch64.sve.fadd.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>)
 define <vscale x 8 x half> @replace_fadd_intrinsic_half(<vscale x 8 x half> %a, <vscale x 8 x half> %b) #0 {
 ; CHECK-LABEL: @replace_fadd_intrinsic_half
@@ -67,6 +97,36 @@ define <vscale x 2 x double> @replace_fadd_intrinsic_double(<vscale x 2 x double
   ret <vscale x 2 x double> %2
 }
 
+declare <vscale x 8 x half> @llvm.aarch64.sve.fadd.u.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>)
+define <vscale x 8 x half> @replace_fadd_u_intrinsic_half(<vscale x 8 x half> %a, <vscale x 8 x half> %b) #0 {
+; CHECK-LABEL: @replace_fadd_u_intrinsic_half
+; CHECK-NEXT:  %1 = fadd fast <vscale x 8 x half> %a, %b
+; CHECK-NEXT:  ret <vscale x 8 x half> %1
+  %1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %2 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fadd.u.nxv8f16(<vscale x 8 x i1> %1, <vscale x 8 x half> %a, <vscale x 8 x half> %b)
+  ret <vscale x 8 x half> %2
+}
+
+declare <vscale x 4 x float> @llvm.aarch64.sve.fadd.u.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>, <vscale x 4 x float>)
+define <vscale x 4 x float> @replace_fadd_u_intrinsic_float(<vscale x 4 x float> %a, <vscale x 4 x float> %b) #0 {
+; CHECK-LABEL: @replace_fadd_u_intrinsic_float
+; CHECK-NEXT:  %1 = fadd fast <vscale x 4 x float> %a, %b
+; CHECK-NEXT:  ret <vscale x 4 x float> %1
+  %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %2 = tail call fast <vscale x 4 x float> @llvm.aarch64.sve.fadd.u.nxv4f32(<vscale x 4 x i1> %1, <vscale x 4 x float> %a, <vscale x 4 x float> %b)
+  ret <vscale x 4 x float> %2
+}
+
+declare <vscale x 2 x double> @llvm.aarch64.sve.fadd.u.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>)
+define <vscale x 2 x double> @replace_fadd_u_intrinsic_double(<vscale x 2 x double> %a, <vscale x 2 x double> %b) #0 {
+; CHECK-LABEL: @replace_fadd_u_intrinsic_double
+; CHECK-NEXT:  %1 = fadd fast <vscale x 2 x double> %a, %b
+; CHECK-NEXT:  ret <vscale x 2 x double> %1
+  %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %2 = tail call fast <vscale x 2 x double> @llvm.aarch64.sve.fadd.u.nxv2f64(<vscale x 2 x i1> %1, <vscale x 2 x double> %a, <vscale x 2 x double> %b)
+  ret <vscale x 2 x double> %2
+}
+
 declare <vscale x 8 x half> @llvm.aarch64.sve.fsub.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>)
 define <vscale x 8 x half> @replace_fsub_intrinsic_half(<vscale x 8 x half> %a, <vscale x 8 x half> %b) #0 {
 ; CHECK-LABEL: @replace_fsub_intrinsic_half
@@ -87,7 +147,6 @@ define <vscale x 4 x float> @replace_fsub_intrinsic_float(<vscale x 4 x float> %
   ret <vscale x 4 x float> %2
 }
 
-
 declare <vscale x 2 x double> @llvm.aarch64.sve.fsub.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>)
 define <vscale x 2 x double> @replace_fsub_intrinsic_double(<vscale x 2 x double> %a, <vscale x 2 x double> %b) #0 {
 ; CHECK-LABEL: @replace_fsub_intrinsic_double
@@ -117,4 +176,44 @@ define <vscale x 2 x double> @replace_fsub_intrinsic_no_fast_flag(<vscale x 2 x
   ret <vscale x 2 x double> %2
 }
 
+declare <vscale x 8 x half> @llvm.aarch64.sve.fsub.u.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>)
+define <vscale x 8 x half> @replace_fsub_u_intrinsic_half(<vscale x 8 x half> %a, <vscale x 8 x half> %b) #0 {
+; CHECK-LABEL: @replace_fsub_u_intrinsic_half
+; CHECK-NEXT:  %1 = fsub fast <vscale x 8 x half> %a, %b
+; CHECK-NEXT:  ret <vscale x 8 x half> %1
+  %1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %2 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fsub.u.nxv8f16(<vscale x 8 x i1> %1, <vscale x 8 x half> %a, <vscale x 8 x half> %b)
+  ret <vscale x 8 x half> %2
+}
+
+declare <vscale x 4 x float> @llvm.aarch64.sve.fsub.u.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>, <vscale x 4 x float>)
+define <vscale x 4 x float> @replace_fsub_u_intrinsic_float(<vscale x 4 x float> %a, <vscale x 4 x float> %b) #0 {
+; CHECK-LABEL: @replace_fsub_u_intrinsic_float
+; CHECK-NEXT:  %1 = fsub fast <vscale x 4 x float> %a, %b
+; CHECK-NEXT:  ret <vscale x 4 x float> %1
+  %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %2 = tail call fast <vscale x 4 x float> @llvm.aarch64.sve.fsub.u.nxv4f32(<vscale x 4 x i1> %1, <vscale x 4 x float> %a, <vscale x 4 x float> %b)
+  ret <vscale x 4 x float> %2
+}
+
+declare <vscale x 2 x double> @llvm.aarch64.sve.fsub.u.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>)
+define <vscale x 2 x double> @replace_fsub_u_intrinsic_no_fast_flag(<vscale x 2 x double> %a, <vscale x 2 x double> %b) #0 {
+; CHECK-LABEL: @replace_fsub_u_intrinsic_no_fast_flag
+; CHECK-NEXT:  %1 = fsub <vscale x 2 x double> %a, %b
+; CHECK-NEXT:  ret <vscale x 2 x double> %1
+  %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %2 = tail call <vscale x 2 x double> @llvm.aarch64.sve.fsub.u.nxv2f64(<vscale x 2 x i1> %1, <vscale x 2 x double> %a, <vscale x 2 x double> %b)
+  ret <vscale x 2 x double> %2
+}
+
+define <vscale x 2 x double> @no_replace_on_non_ptrue_all_u(<vscale x 2 x double> %a, <vscale x 2 x double> %b) #0 {
+; CHECK-LABEL: @no_replace_on_non_ptrue_all_u
+; CHECK-NEXT:  %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 5)
+; CHECK-NEXT:  %2 = tail call fast <vscale x 2 x double> @llvm.aarch64.sve.fsub.u.nxv2f64(<vscale x 2 x i1> %1, <vscale x 2 x double> %a, <vscale x 2 x double> %b)
+; CHECK-NEXT:  ret <vscale x 2 x double> %2
+  %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 5)
+  %2 = tail call fast <vscale x 2 x double> @llvm.aarch64.sve.fsub.u.nxv2f64(<vscale x 2 x i1> %1, <vscale x 2 x double> %a, <vscale x 2 x double> %b)
+  ret <vscale x 2 x double> %2
+}
+
 attributes #0 = { "target-features"="+sve" }

diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fmul_u-idempotency.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fmul_u-idempotency.ll
new file mode 100644
index 0000000000000..4ab1f954b33e9
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fmul_u-idempotency.ll
@@ -0,0 +1,119 @@
+; RUN: opt -S -passes=instcombine < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; Idempotent fmuls_u -- should compile to just a ret.
+define <vscale x 8 x half> @idempotent_fmul_u_f16(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a) #0 {
+; CHECK-LABEL: @idempotent_fmul_u_f16(
+; CHECK-NEXT:    ret <vscale x 8 x half> [[A:%.*]]
+;
+  %1 = call <vscale x 8 x half> @llvm.aarch64.sve.dup.x.nxv8f16(half 1.0)
+  %2 = call <vscale x 8 x half> @llvm.aarch64.sve.fmul.u.nxv8f16(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a, <vscale x 8 x half> %1)
+  ret <vscale x 8 x half> %2
+}
+
+define <vscale x 4 x float> @idempotent_fmul_u_f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a) #0 {
+; CHECK-LABEL: @idempotent_fmul_u_f32(
+; CHECK-NEXT:    ret <vscale x 4 x float> [[A:%.*]]
+;
+  %1 = call <vscale x 4 x float> @llvm.aarch64.sve.dup.x.nxv4f32(float 1.0)
+  %2 = call <vscale x 4 x float> @llvm.aarch64.sve.fmul.u.nxv4f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a, <vscale x 4 x float> %1)
+  ret <vscale x 4 x float> %2
+}
+
+define <vscale x 2 x double> @idempotent_fmul_u_f64(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a) #0 {
+; CHECK-LABEL: @idempotent_fmul_u_f64(
+; CHECK-NEXT:    ret <vscale x 2 x double> [[A:%.*]]
+;
+  %1 = call <vscale x 2 x double> @llvm.aarch64.sve.dup.x.nxv2f64(double 1.0)
+  %2 = call <vscale x 2 x double> @llvm.aarch64.sve.fmul.u.nxv2f64(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a, <vscale x 2 x double> %1)
+  ret <vscale x 2 x double> %2
+}
+
+define <vscale x 2 x double> @idempotent_fmul_u_different_argument_order(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a) #0 {
+; CHECK-LABEL: @idempotent_fmul_u_different_argument_order(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.fmul.u.nxv2f64(<vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x double> shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double 1.000000e+00, i64 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x double> [[A:%.*]])
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
+;
+  %1 = call <vscale x 2 x double> @llvm.aarch64.sve.dup.x.nxv2f64(double 1.0)
+  ; Different argument order to the above tests.
+  %2 = call <vscale x 2 x double> @llvm.aarch64.sve.fmul.u.nxv2f64(<vscale x 2 x i1> %pg, <vscale x 2 x double> %1, <vscale x 2 x double> %a)
+  ret <vscale x 2 x double> %2
+}
+
+define <vscale x 8 x half> @idempotent_fmul_u_with_predicated_dup(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a) #0 {
+; CHECK-LABEL: @idempotent_fmul_u_with_predicated_dup(
+; CHECK-NEXT:    ret <vscale x 8 x half> [[A:%.*]]
+;
+  %1 = call <vscale x 8 x half> @llvm.aarch64.sve.dup.nxv8f16(<vscale x 8 x half> undef, <vscale x 8 x i1> %pg, half 1.0)
+  %2 = call <vscale x 8 x half> @llvm.aarch64.sve.fmul.u.nxv8f16(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a, <vscale x 8 x half> %1)
+  ret <vscale x 8 x half> %2
+}
+
+define <vscale x 8 x half> @idempotent_fmul_u_two_dups(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a) #0 {
+  ; Edge case -- make sure that the case where we're fmultiplying two dups
+  ; together is sane.
+; CHECK-LABEL: @idempotent_fmul_u_two_dups(
+; CHECK-NEXT:    ret <vscale x 8 x half> shufflevector (<vscale x 8 x half> insertelement (<vscale x 8 x half> poison, half 0xH3C00, i64 0), <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer)
+;
+  %1 = call <vscale x 8 x half> @llvm.aarch64.sve.dup.x.nxv8f16(half 1.0)
+  %2 = call <vscale x 8 x half> @llvm.aarch64.sve.dup.x.nxv8f16(half 1.0)
+  %3 = call <vscale x 8 x half> @llvm.aarch64.sve.fmul.u.nxv8f16(<vscale x 8 x i1> %pg, <vscale x 8 x half> %1, <vscale x 8 x half> %2)
+  ret <vscale x 8 x half> %3
+}
+
+; Non-idempotent fmuls_u -- we don't expect these to be optimised out.
+define <vscale x 8 x half> @non_idempotent_fmul_u_f16(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a) #0 {
+; CHECK-LABEL: @non_idempotent_fmul_u_f16(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 8 x half> @llvm.aarch64.sve.fmul.u.nxv8f16(<vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x half> [[A:%.*]], <vscale x 8 x half> shufflevector (<vscale x 8 x half> insertelement (<vscale x 8 x half> poison, half 0xH4000, i64 0), <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer))
+; CHECK-NEXT:    ret <vscale x 8 x half> [[TMP1]]
+;
+  %1 = call <vscale x 8 x half> @llvm.aarch64.sve.dup.x.nxv8f16(half 2.0)
+  %2 = call <vscale x 8 x half> @llvm.aarch64.sve.fmul.u.nxv8f16(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a, <vscale x 8 x half> %1)
+  ret <vscale x 8 x half> %2
+}
+
+define <vscale x 4 x float> @non_idempotent_fmul_u_f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a) #0 {
+; CHECK-LABEL: @non_idempotent_fmul_u_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.fmul.u.nxv4f32(<vscale x 4 x i1> [[PG:%.*]], <vscale x 4 x float> [[A:%.*]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 2.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer))
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
+;
+  %1 = call <vscale x 4 x float> @llvm.aarch64.sve.dup.x.nxv4f32(float 2.0)
+  %2 = call <vscale x 4 x float> @llvm.aarch64.sve.fmul.u.nxv4f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a, <vscale x 4 x float> %1)
+  ret <vscale x 4 x float> %2
+}
+
+define <vscale x 2 x double> @non_idempotent_fmul_u_f64(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a) #0 {
+; CHECK-LABEL: @non_idempotent_fmul_u_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.fmul.u.nxv2f64(<vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x double> [[A:%.*]], <vscale x 2 x double> shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double 2.000000e+00, i64 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer))
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
+;
+  %1 = call <vscale x 2 x double> @llvm.aarch64.sve.dup.x.nxv2f64(double 2.0)
+  %2 = call <vscale x 2 x double> @llvm.aarch64.sve.fmul.u.nxv2f64(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a, <vscale x 2 x double> %1)
+  ret <vscale x 2 x double> %2
+}
+
+define <vscale x 2 x double> @non_idempotent_fmul_u_with_predicated_dup(<vscale x 2 x i1> %pg1, <vscale x 2 x i1> %pg2, <vscale x 2 x double> %a) #0 {
+  ; Different predicates
+; CHECK-LABEL: @non_idempotent_fmul_u_with_predicated_dup(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.dup.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> [[PG1:%.*]], double 1.000000e+00)
+; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.fmul.u.nxv2f64(<vscale x 2 x i1> [[PG2:%.*]], <vscale x 2 x double> [[A:%.*]], <vscale x 2 x double> [[TMP1]])
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP2]]
+;
+  %1 = call <vscale x 2 x double> @llvm.aarch64.sve.dup.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> %pg1, double 1.0)
+  %2 = call <vscale x 2 x double> @llvm.aarch64.sve.fmul.u.nxv2f64(<vscale x 2 x i1> %pg2, <vscale x 2 x double> %a, <vscale x 2 x double> %1)
+  ret <vscale x 2 x double> %2
+}
+
+declare <vscale x 8 x half> @llvm.aarch64.sve.dup.x.nxv8f16(half)
+declare <vscale x 4 x float> @llvm.aarch64.sve.dup.x.nxv4f32(float)
+declare <vscale x 2 x double> @llvm.aarch64.sve.dup.x.nxv2f64(double)
+
+declare <vscale x 2 x double> @llvm.aarch64.sve.dup.nxv2f64(<vscale x 2 x double>, <vscale x 2 x i1>, double)
+declare <vscale x 8 x half> @llvm.aarch64.sve.dup.nxv8f16(<vscale x 8 x half>, <vscale x 8 x i1>, half)
+
+declare <vscale x 8 x half> @llvm.aarch64.sve.fmul.u.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>)
+declare <vscale x 4 x float> @llvm.aarch64.sve.fmul.u.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>, <vscale x 4 x float>)
+declare <vscale x 2 x double> @llvm.aarch64.sve.fmul.u.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>)
+
+attributes #0 = { "target-features"="+sve" }


        


More information about the llvm-commits mailing list