[llvm] dc63b35 - [SVE ACLE] Extend IR combines for fmul, fsub, fadd to cover _u variants
Jolanta Jensen via llvm-commits
llvm-commits at lists.llvm.org
Fri Jun 2 04:07:52 PDT 2023
Author: Jolanta Jensen
Date: 2023-06-02T11:06:57Z
New Revision: dc63b35b02231a75d131fb6376d2e58a7ad9b7e4
URL: https://github.com/llvm/llvm-project/commit/dc63b35b02231a75d131fb6376d2e58a7ad9b7e4
DIFF: https://github.com/llvm/llvm-project/commit/dc63b35b02231a75d131fb6376d2e58a7ad9b7e4.diff
LOG: [SVE ACLE] Extend IR combines for fmul, fsub, fadd to cover _u variants
This patch extends the existing IR combines for fmul, fsub and fadd, which
rely on an all-active predicate, so that they also apply to the equivalent
undef (_u) intrinsics.
Differential Revision: https://reviews.llvm.org/D150768
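
For context, a minimal ACLE-level sketch (not part of the patch) of the kind
of source pattern this combine targets: a floating-point binop whose governing
predicate is all-active. Whether the _x form is lowered to the corresponding
_u intrinsic depends on the compiler version; when it is, the combine in this
patch folds the call into a plain IR fmul.

    #include <arm_sve.h>

    svfloat32_t mul_all_active(svfloat32_t a, svfloat32_t b) {
      // All-active predicate, so every lane of the result is defined by a*b.
      svbool_t pg = svptrue_b32();
      // Expected to become a plain 'fmul' after instcombine, assuming the
      // call is lowered to @llvm.aarch64.sve.fmul.u.
      return svmul_x(pg, a, b);
    }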
Added:
llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fmul_u-idempotency.ll
Modified:
llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fma-binops.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index efd96628d4b86..84e62a888a875 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1243,10 +1243,13 @@ instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic) {
switch (Intrinsic) {
case Intrinsic::aarch64_sve_fmul:
+ case Intrinsic::aarch64_sve_fmul_u:
return Instruction::BinaryOps::FMul;
case Intrinsic::aarch64_sve_fadd:
+ case Intrinsic::aarch64_sve_fadd_u:
return Instruction::BinaryOps::FAdd;
case Intrinsic::aarch64_sve_fsub:
+ case Intrinsic::aarch64_sve_fsub_u:
return Instruction::BinaryOps::FSub;
default:
return Instruction::BinaryOpsEnd;
@@ -1292,6 +1295,11 @@ static std::optional<Instruction *> instCombineSVEVectorAdd(InstCombiner &IC,
Intrinsic::aarch64_sve_mad>(
IC, II, false))
return MAD;
+ if (auto FMLA_U =
+ instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
+ Intrinsic::aarch64_sve_fmla_u>(
+ IC, II, true))
+ return FMLA_U;
return instCombineSVEVectorBinOp(IC, II);
}
@@ -1311,6 +1319,11 @@ static std::optional<Instruction *> instCombineSVEVectorSub(InstCombiner &IC,
Intrinsic::aarch64_sve_fnmsb>(
IC, II, false))
return FMSB;
+ if (auto FMLS_U =
+ instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
+ Intrinsic::aarch64_sve_fmls_u>(
+ IC, II, true))
+ return FMLS_U;
return instCombineSVEVectorBinOp(IC, II);
}
@@ -1684,25 +1697,20 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
return instCombineSVEPTest(IC, II);
case Intrinsic::aarch64_sve_mul:
case Intrinsic::aarch64_sve_fmul:
+ case Intrinsic::aarch64_sve_fmul_u:
return instCombineSVEVectorMul(IC, II);
case Intrinsic::aarch64_sve_fadd:
+ case Intrinsic::aarch64_sve_fadd_u:
case Intrinsic::aarch64_sve_add:
return instCombineSVEVectorAdd(IC, II);
- case Intrinsic::aarch64_sve_fadd_u:
- return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
- Intrinsic::aarch64_sve_fmla_u>(
- IC, II, true);
case Intrinsic::aarch64_sve_add_u:
return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
Intrinsic::aarch64_sve_mla_u>(
IC, II, true);
case Intrinsic::aarch64_sve_fsub:
+ case Intrinsic::aarch64_sve_fsub_u:
case Intrinsic::aarch64_sve_sub:
return instCombineSVEVectorSub(IC, II);
- case Intrinsic::aarch64_sve_fsub_u:
- return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
- Intrinsic::aarch64_sve_fmls_u>(
- IC, II, true);
case Intrinsic::aarch64_sve_sub_u:
return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
Intrinsic::aarch64_sve_mls_u>(
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fma-binops.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fma-binops.ll
index b2d5668f0b7bf..55810620a3b10 100644
--- a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fma-binops.ll
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fma-binops.ll
@@ -6,7 +6,7 @@ declare <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32)
declare <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32)
declare <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32)
-; SVE intrinsics fmul and fadd should be replaced with regular fmul and fadd
+; SVE intrinsics fmul, fmul_u, fadd, fadd_u, fsub and fsub_u should be replaced with regular fmul, fadd and fsub.
declare <vscale x 8 x half> @llvm.aarch64.sve.fmul.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>)
define <vscale x 8 x half> @replace_fmul_intrinsic_half(<vscale x 8 x half> %a, <vscale x 8 x half> %b) #0 {
; CHECK-LABEL: @replace_fmul_intrinsic_half
@@ -37,6 +37,36 @@ define <vscale x 2 x double> @replace_fmul_intrinsic_double(<vscale x 2 x double
ret <vscale x 2 x double> %2
}
+declare <vscale x 8 x half> @llvm.aarch64.sve.fmul.u.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>)
+define <vscale x 8 x half> @replace_fmul_u_intrinsic_half(<vscale x 8 x half> %a, <vscale x 8 x half> %b) #0 {
+; CHECK-LABEL: @replace_fmul_u_intrinsic_half
+; CHECK-NEXT: %1 = fmul fast <vscale x 8 x half> %a, %b
+; CHECK-NEXT: ret <vscale x 8 x half> %1
+ %1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+ %2 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmul.u.nxv8f16(<vscale x 8 x i1> %1, <vscale x 8 x half> %a, <vscale x 8 x half> %b)
+ ret <vscale x 8 x half> %2
+}
+
+declare <vscale x 4 x float> @llvm.aarch64.sve.fmul.u.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>, <vscale x 4 x float>)
+define <vscale x 4 x float> @replace_fmul_u_intrinsic_float(<vscale x 4 x float> %a, <vscale x 4 x float> %b) #0 {
+; CHECK-LABEL: @replace_fmul_u_intrinsic_float
+; CHECK-NEXT: %1 = fmul fast <vscale x 4 x float> %a, %b
+; CHECK-NEXT: ret <vscale x 4 x float> %1
+ %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+ %2 = tail call fast <vscale x 4 x float> @llvm.aarch64.sve.fmul.u.nxv4f32(<vscale x 4 x i1> %1, <vscale x 4 x float> %a, <vscale x 4 x float> %b)
+ ret <vscale x 4 x float> %2
+}
+
+declare <vscale x 2 x double> @llvm.aarch64.sve.fmul.u.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>)
+define <vscale x 2 x double> @replace_fmul_u_intrinsic_double(<vscale x 2 x double> %a, <vscale x 2 x double> %b) #0 {
+; CHECK-LABEL: @replace_fmul_u_intrinsic_double
+; CHECK-NEXT: %1 = fmul fast <vscale x 2 x double> %a, %b
+; CHECK-NEXT: ret <vscale x 2 x double> %1
+ %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+ %2 = tail call fast <vscale x 2 x double> @llvm.aarch64.sve.fmul.u.nxv2f64(<vscale x 2 x i1> %1, <vscale x 2 x double> %a, <vscale x 2 x double> %b)
+ ret <vscale x 2 x double> %2
+}
+
declare <vscale x 8 x half> @llvm.aarch64.sve.fadd.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>)
define <vscale x 8 x half> @replace_fadd_intrinsic_half(<vscale x 8 x half> %a, <vscale x 8 x half> %b) #0 {
; CHECK-LABEL: @replace_fadd_intrinsic_half
@@ -67,6 +97,36 @@ define <vscale x 2 x double> @replace_fadd_intrinsic_double(<vscale x 2 x double
ret <vscale x 2 x double> %2
}
+declare <vscale x 8 x half> @llvm.aarch64.sve.fadd.u.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>)
+define <vscale x 8 x half> @replace_fadd_u_intrinsic_half(<vscale x 8 x half> %a, <vscale x 8 x half> %b) #0 {
+; CHECK-LABEL: @replace_fadd_u_intrinsic_half
+; CHECK-NEXT: %1 = fadd fast <vscale x 8 x half> %a, %b
+; CHECK-NEXT: ret <vscale x 8 x half> %1
+ %1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+ %2 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fadd.u.nxv8f16(<vscale x 8 x i1> %1, <vscale x 8 x half> %a, <vscale x 8 x half> %b)
+ ret <vscale x 8 x half> %2
+}
+
+declare <vscale x 4 x float> @llvm.aarch64.sve.fadd.u.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>, <vscale x 4 x float>)
+define <vscale x 4 x float> @replace_fadd_u_intrinsic_float(<vscale x 4 x float> %a, <vscale x 4 x float> %b) #0 {
+; CHECK-LABEL: @replace_fadd_u_intrinsic_float
+; CHECK-NEXT: %1 = fadd fast <vscale x 4 x float> %a, %b
+; CHECK-NEXT: ret <vscale x 4 x float> %1
+ %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+ %2 = tail call fast <vscale x 4 x float> @llvm.aarch64.sve.fadd.u.nxv4f32(<vscale x 4 x i1> %1, <vscale x 4 x float> %a, <vscale x 4 x float> %b)
+ ret <vscale x 4 x float> %2
+}
+
+declare <vscale x 2 x double> @llvm.aarch64.sve.fadd.u.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>)
+define <vscale x 2 x double> @replace_fadd_u_intrinsic_double(<vscale x 2 x double> %a, <vscale x 2 x double> %b) #0 {
+; CHECK-LABEL: @replace_fadd_u_intrinsic_double
+; CHECK-NEXT: %1 = fadd fast <vscale x 2 x double> %a, %b
+; CHECK-NEXT: ret <vscale x 2 x double> %1
+ %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+ %2 = tail call fast <vscale x 2 x double> @llvm.aarch64.sve.fadd.u.nxv2f64(<vscale x 2 x i1> %1, <vscale x 2 x double> %a, <vscale x 2 x double> %b)
+ ret <vscale x 2 x double> %2
+}
+
declare <vscale x 8 x half> @llvm.aarch64.sve.fsub.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>)
define <vscale x 8 x half> @replace_fsub_intrinsic_half(<vscale x 8 x half> %a, <vscale x 8 x half> %b) #0 {
; CHECK-LABEL: @replace_fsub_intrinsic_half
@@ -87,7 +147,6 @@ define <vscale x 4 x float> @replace_fsub_intrinsic_float(<vscale x 4 x float> %
ret <vscale x 4 x float> %2
}
-
declare <vscale x 2 x double> @llvm.aarch64.sve.fsub.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>)
define <vscale x 2 x double> @replace_fsub_intrinsic_double(<vscale x 2 x double> %a, <vscale x 2 x double> %b) #0 {
; CHECK-LABEL: @replace_fsub_intrinsic_double
@@ -117,4 +176,44 @@ define <vscale x 2 x double> @replace_fsub_intrinsic_no_fast_flag(<vscale x 2 x
ret <vscale x 2 x double> %2
}
+declare <vscale x 8 x half> @llvm.aarch64.sve.fsub.u.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>)
+define <vscale x 8 x half> @replace_fsub_u_intrinsic_half(<vscale x 8 x half> %a, <vscale x 8 x half> %b) #0 {
+; CHECK-LABEL: @replace_fsub_u_intrinsic_half
+; CHECK-NEXT: %1 = fsub fast <vscale x 8 x half> %a, %b
+; CHECK-NEXT: ret <vscale x 8 x half> %1
+ %1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+ %2 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fsub.u.nxv8f16(<vscale x 8 x i1> %1, <vscale x 8 x half> %a, <vscale x 8 x half> %b)
+ ret <vscale x 8 x half> %2
+}
+
+declare <vscale x 4 x float> @llvm.aarch64.sve.fsub.u.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>, <vscale x 4 x float>)
+define <vscale x 4 x float> @replace_fsub_u_intrinsic_float(<vscale x 4 x float> %a, <vscale x 4 x float> %b) #0 {
+; CHECK-LABEL: @replace_fsub_u_intrinsic_float
+; CHECK-NEXT: %1 = fsub fast <vscale x 4 x float> %a, %b
+; CHECK-NEXT: ret <vscale x 4 x float> %1
+ %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+ %2 = tail call fast <vscale x 4 x float> @llvm.aarch64.sve.fsub.u.nxv4f32(<vscale x 4 x i1> %1, <vscale x 4 x float> %a, <vscale x 4 x float> %b)
+ ret <vscale x 4 x float> %2
+}
+
+declare <vscale x 2 x double> @llvm.aarch64.sve.fsub.u.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>)
+define <vscale x 2 x double> @replace_fsub_u_intrinsic_no_fast_flag(<vscale x 2 x double> %a, <vscale x 2 x double> %b) #0 {
+; CHECK-LABEL: @replace_fsub_u_intrinsic_no_fast_flag
+; CHECK-NEXT: %1 = fsub <vscale x 2 x double> %a, %b
+; CHECK-NEXT: ret <vscale x 2 x double> %1
+ %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+ %2 = tail call <vscale x 2 x double> @llvm.aarch64.sve.fsub.u.nxv2f64(<vscale x 2 x i1> %1, <vscale x 2 x double> %a, <vscale x 2 x double> %b)
+ ret <vscale x 2 x double> %2
+}
+
+define <vscale x 2 x double> @no_replace_on_non_ptrue_all_u(<vscale x 2 x double> %a, <vscale x 2 x double> %b) #0 {
+; CHECK-LABEL: @no_replace_on_non_ptrue_all_u
+; CHECK-NEXT: %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 5)
+; CHECK-NEXT: %2 = tail call fast <vscale x 2 x double> @llvm.aarch64.sve.fsub.u.nxv2f64(<vscale x 2 x i1> %1, <vscale x 2 x double> %a, <vscale x 2 x double> %b)
+; CHECK-NEXT: ret <vscale x 2 x double> %2
+ %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 5)
+ %2 = tail call fast <vscale x 2 x double> @llvm.aarch64.sve.fsub.u.nxv2f64(<vscale x 2 x i1> %1, <vscale x 2 x double> %a, <vscale x 2 x double> %b)
+ ret <vscale x 2 x double> %2
+}
+
attributes #0 = { "target-features"="+sve" }
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fmul_u-idempotency.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fmul_u-idempotency.ll
new file mode 100644
index 0000000000000..4ab1f954b33e9
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fmul_u-idempotency.ll
@@ -0,0 +1,119 @@
+; RUN: opt -S -passes=instcombine < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; Idempotent fmuls_u -- should compile to just a ret.
+define <vscale x 8 x half> @idempotent_fmul_u_f16(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a) #0 {
+; CHECK-LABEL: @idempotent_fmul_u_f16(
+; CHECK-NEXT: ret <vscale x 8 x half> [[A:%.*]]
+;
+ %1 = call <vscale x 8 x half> @llvm.aarch64.sve.dup.x.nxv8f16(half 1.0)
+ %2 = call <vscale x 8 x half> @llvm.aarch64.sve.fmul.u.nxv8f16(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a, <vscale x 8 x half> %1)
+ ret <vscale x 8 x half> %2
+}
+
+define <vscale x 4 x float> @idempotent_fmul_u_f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a) #0 {
+; CHECK-LABEL: @idempotent_fmul_u_f32(
+; CHECK-NEXT: ret <vscale x 4 x float> [[A:%.*]]
+;
+ %1 = call <vscale x 4 x float> @llvm.aarch64.sve.dup.x.nxv4f32(float 1.0)
+ %2 = call <vscale x 4 x float> @llvm.aarch64.sve.fmul.u.nxv4f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a, <vscale x 4 x float> %1)
+ ret <vscale x 4 x float> %2
+}
+
+define <vscale x 2 x double> @idempotent_fmul_u_f64(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a) #0 {
+; CHECK-LABEL: @idempotent_fmul_u_f64(
+; CHECK-NEXT: ret <vscale x 2 x double> [[A:%.*]]
+;
+ %1 = call <vscale x 2 x double> @llvm.aarch64.sve.dup.x.nxv2f64(double 1.0)
+ %2 = call <vscale x 2 x double> @llvm.aarch64.sve.fmul.u.nxv2f64(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a, <vscale x 2 x double> %1)
+ ret <vscale x 2 x double> %2
+}
+
+define <vscale x 2 x double> @idempotent_fmul_u_different_argument_order(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a) #0 {
+; CHECK-LABEL: @idempotent_fmul_u_different_argument_order(
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.fmul.u.nxv2f64(<vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x double> shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double 1.000000e+00, i64 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x double> [[A:%.*]])
+; CHECK-NEXT: ret <vscale x 2 x double> [[TMP1]]
+;
+ %1 = call <vscale x 2 x double> @llvm.aarch64.sve.dup.x.nxv2f64(double 1.0)
+ ; Different argument order to the above tests.
+ %2 = call <vscale x 2 x double> @llvm.aarch64.sve.fmul.u.nxv2f64(<vscale x 2 x i1> %pg, <vscale x 2 x double> %1, <vscale x 2 x double> %a)
+ ret <vscale x 2 x double> %2
+}
+
+define <vscale x 8 x half> @idempotent_fmul_u_with_predicated_dup(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a) #0 {
+; CHECK-LABEL: @idempotent_fmul_u_with_predicated_dup(
+; CHECK-NEXT: ret <vscale x 8 x half> [[A:%.*]]
+;
+ %1 = call <vscale x 8 x half> @llvm.aarch64.sve.dup.nxv8f16(<vscale x 8 x half> undef, <vscale x 8 x i1> %pg, half 1.0)
+ %2 = call <vscale x 8 x half> @llvm.aarch64.sve.fmul.u.nxv8f16(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a, <vscale x 8 x half> %1)
+ ret <vscale x 8 x half> %2
+}
+
+define <vscale x 8 x half> @idempotent_fmul_u_two_dups(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a) #0 {
+ ; Edge case -- make sure that the case where we're fmultiplying two dups
+ ; together is sane.
+; CHECK-LABEL: @idempotent_fmul_u_two_dups(
+; CHECK-NEXT: ret <vscale x 8 x half> shufflevector (<vscale x 8 x half> insertelement (<vscale x 8 x half> poison, half 0xH3C00, i64 0), <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer)
+;
+ %1 = call <vscale x 8 x half> @llvm.aarch64.sve.dup.x.nxv8f16(half 1.0)
+ %2 = call <vscale x 8 x half> @llvm.aarch64.sve.dup.x.nxv8f16(half 1.0)
+ %3 = call <vscale x 8 x half> @llvm.aarch64.sve.fmul.u.nxv8f16(<vscale x 8 x i1> %pg, <vscale x 8 x half> %1, <vscale x 8 x half> %2)
+ ret <vscale x 8 x half> %3
+}
+
+; Non-idempotent fmuls_u -- we don't expect these to be optimised out.
+define <vscale x 8 x half> @non_idempotent_fmul_u_f16(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a) #0 {
+; CHECK-LABEL: @non_idempotent_fmul_u_f16(
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 8 x half> @llvm.aarch64.sve.fmul.u.nxv8f16(<vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x half> [[A:%.*]], <vscale x 8 x half> shufflevector (<vscale x 8 x half> insertelement (<vscale x 8 x half> poison, half 0xH4000, i64 0), <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer))
+; CHECK-NEXT: ret <vscale x 8 x half> [[TMP1]]
+;
+ %1 = call <vscale x 8 x half> @llvm.aarch64.sve.dup.x.nxv8f16(half 2.0)
+ %2 = call <vscale x 8 x half> @llvm.aarch64.sve.fmul.u.nxv8f16(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a, <vscale x 8 x half> %1)
+ ret <vscale x 8 x half> %2
+}
+
+define <vscale x 4 x float> @non_idempotent_fmul_u_f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a) #0 {
+; CHECK-LABEL: @non_idempotent_fmul_u_f32(
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.fmul.u.nxv4f32(<vscale x 4 x i1> [[PG:%.*]], <vscale x 4 x float> [[A:%.*]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 2.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer))
+; CHECK-NEXT: ret <vscale x 4 x float> [[TMP1]]
+;
+ %1 = call <vscale x 4 x float> @llvm.aarch64.sve.dup.x.nxv4f32(float 2.0)
+ %2 = call <vscale x 4 x float> @llvm.aarch64.sve.fmul.u.nxv4f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a, <vscale x 4 x float> %1)
+ ret <vscale x 4 x float> %2
+}
+
+define <vscale x 2 x double> @non_idempotent_fmul_u_f64(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a) #0 {
+; CHECK-LABEL: @non_idempotent_fmul_u_f64(
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.fmul.u.nxv2f64(<vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x double> [[A:%.*]], <vscale x 2 x double> shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double 2.000000e+00, i64 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer))
+; CHECK-NEXT: ret <vscale x 2 x double> [[TMP1]]
+;
+ %1 = call <vscale x 2 x double> @llvm.aarch64.sve.dup.x.nxv2f64(double 2.0)
+ %2 = call <vscale x 2 x double> @llvm.aarch64.sve.fmul.u.nxv2f64(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a, <vscale x 2 x double> %1)
+ ret <vscale x 2 x double> %2
+}
+
+define <vscale x 2 x double> @non_idempotent_fmul_u_with_predicated_dup(<vscale x 2 x i1> %pg1, <vscale x 2 x i1> %pg2, <vscale x 2 x double> %a) #0 {
+ ; Different predicates
+; CHECK-LABEL: @non_idempotent_fmul_u_with_predicated_dup(
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.dup.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> [[PG1:%.*]], double 1.000000e+00)
+; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.fmul.u.nxv2f64(<vscale x 2 x i1> [[PG2:%.*]], <vscale x 2 x double> [[A:%.*]], <vscale x 2 x double> [[TMP1]])
+; CHECK-NEXT: ret <vscale x 2 x double> [[TMP2]]
+;
+ %1 = call <vscale x 2 x double> @llvm.aarch64.sve.dup.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> %pg1, double 1.0)
+ %2 = call <vscale x 2 x double> @llvm.aarch64.sve.fmul.u.nxv2f64(<vscale x 2 x i1> %pg2, <vscale x 2 x double> %a, <vscale x 2 x double> %1)
+ ret <vscale x 2 x double> %2
+}
+
+declare <vscale x 8 x half> @llvm.aarch64.sve.dup.x.nxv8f16(half)
+declare <vscale x 4 x float> @llvm.aarch64.sve.dup.x.nxv4f32(float)
+declare <vscale x 2 x double> @llvm.aarch64.sve.dup.x.nxv2f64(double)
+
+declare <vscale x 2 x double> @llvm.aarch64.sve.dup.nxv2f64(<vscale x 2 x double>, <vscale x 2 x i1>, double)
+declare <vscale x 8 x half> @llvm.aarch64.sve.dup.nxv8f16(<vscale x 8 x half>, <vscale x 8 x i1>, half)
+
+declare <vscale x 8 x half> @llvm.aarch64.sve.fmul.u.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>)
+declare <vscale x 4 x float> @llvm.aarch64.sve.fmul.u.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>, <vscale x 4 x float>)
+declare <vscale x 2 x double> @llvm.aarch64.sve.fmul.u.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>)
+
+attributes #0 = { "target-features"="+sve" }