[llvm] 8bad8a4 - [AArch64][SVE] Add patterns to generate FMLA/FMLS/FNMLA/FNMLS/FMAD
Bradley Smith via llvm-commits
llvm-commits at lists.llvm.org
Thu Feb 18 08:55:36 PST 2021
Author: Bradley Smith
Date: 2021-02-18T16:55:16Z
New Revision: 8bad8a43c339729bf722d519c3a25708a54bc205
URL: https://github.com/llvm/llvm-project/commit/8bad8a43c339729bf722d519c3a25708a54bc205
DIFF: https://github.com/llvm/llvm-project/commit/8bad8a43c339729bf722d519c3a25708a54bc205.diff
LOG: [AArch64][SVE] Add patterns to generate FMLA/FMLS/FNMLA/FNMLS/FMAD
Adjust generateFMAsInMachineCombiner to return false for scalable vector
types so that, when SVE is present, the DAG combiner rather than the machine
combiner fuses fmul+fadd into fma. Also add new pseudo instructions so that
the most appropriate of FMLA/FMAD can be selected depending on register
allocation.
Depends on D96599
Differential Revision: https://reviews.llvm.org/D96424
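
For illustration, a minimal IR sketch (mirroring the new sve-fp-combine.ll
tests added below) of the pattern this change enables; assuming
llc -mtriple=aarch64--linux-gnu -mattr=+sve, an unpredicated fmul+fadd on a
scalable vector is now selected to a single fused instruction:

  ; With the accumulator arriving in z0, this selects
  ;   fmla z0.h, p0/m, z1.h, z2.h
  define <vscale x 8 x half> @example_fmla(<vscale x 8 x half> %acc, <vscale x 8 x half> %m1, <vscale x 8 x half> %m2) {
    %mul = fmul fast <vscale x 8 x half> %m1, %m2
    %res = fadd fast <vscale x 8 x half> %acc, %mul
    ret <vscale x 8 x half> %res
  }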
Added:
llvm/test/CodeGen/AArch64/sve-fp-combine.ll
Modified:
llvm/include/llvm/CodeGen/SelectionDAGTargetInfo.h
llvm/include/llvm/CodeGen/TargetLowering.h
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/lib/Target/AArch64/AArch64ISelLowering.h
llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h
llvm/lib/Target/AArch64/SVEInstrFormats.td
llvm/test/CodeGen/AArch64/sve-fixed-length-fp-arith.ll
llvm/test/CodeGen/AArch64/sve-fp.ll
Removed:
################################################################################
diff --git a/llvm/include/llvm/CodeGen/SelectionDAGTargetInfo.h b/llvm/include/llvm/CodeGen/SelectionDAGTargetInfo.h
index 78f6fc6656fa..722c3275fd06 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAGTargetInfo.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAGTargetInfo.h
@@ -155,12 +155,6 @@ class SelectionDAGTargetInfo {
return SDValue();
}
- // Return true when the decision to generate FMA's (or FMS, FMLA etc) rather
- // than FMUL and ADD is delegated to the machine combiner.
- virtual bool generateFMAsInMachineCombiner(CodeGenOpt::Level OptLevel) const {
- return false;
- }
-
// Return true if the DAG Combiner should disable generic combines.
virtual bool disableGenericCombines(CodeGenOpt::Level OptLevel) const {
return false;
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 002b52629208..5d090f232113 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -2711,6 +2711,13 @@ class TargetLoweringBase {
return isOperationLegal(ISD::FMAD, N->getValueType(0));
}
+ // Return true when the decision to generate FMA's (or FMS, FMLA etc) rather
+ // than FMUL and ADD is delegated to the machine combiner.
+ virtual bool generateFMAsInMachineCombiner(EVT VT,
+ CodeGenOpt::Level OptLevel) const {
+ return false;
+ }
+
/// Return true if it's profitable to narrow operations of type VT1 to
/// VT2. e.g. on x86, it's profitable to narrow from i32 to i8 but not from
/// i32 to i16.
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 6a04ba7b380b..737997a3eae6 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -12618,7 +12618,7 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
if (!AllowFusionGlobally && !isContractable(N))
return SDValue();
- if (STI && STI->generateFMAsInMachineCombiner(OptLevel))
+ if (TLI.generateFMAsInMachineCombiner(VT, OptLevel))
return SDValue();
// Always prefer FMAD to FMA for precision.
@@ -12827,7 +12827,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
if (!AllowFusionGlobally && !isContractable(N))
return SDValue();
- if (STI && STI->generateFMAsInMachineCombiner(OptLevel))
+ if (TLI.generateFMAsInMachineCombiner(VT, OptLevel))
return SDValue();
// Always prefer FMAD to FMA for precision.
diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index f72d3cfcc6a4..c96777e57beb 100644
--- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -406,7 +406,7 @@ bool AArch64ExpandPseudo::expand_DestructiveOp(
assert(DstReg != MI.getOperand(3).getReg());
bool UseRev = false;
- unsigned PredIdx, DOPIdx, SrcIdx;
+ unsigned PredIdx, DOPIdx, SrcIdx, Src2Idx;
switch (DType) {
case AArch64::DestructiveBinaryComm:
case AArch64::DestructiveBinaryCommWithRev:
@@ -420,7 +420,19 @@ bool AArch64ExpandPseudo::expand_DestructiveOp(
case AArch64::DestructiveBinary:
case AArch64::DestructiveBinaryImm:
std::tie(PredIdx, DOPIdx, SrcIdx) = std::make_tuple(1, 2, 3);
- break;
+ break;
+ case AArch64::DestructiveTernaryCommWithRev:
+ std::tie(PredIdx, DOPIdx, SrcIdx, Src2Idx) = std::make_tuple(1, 2, 3, 4);
+ if (DstReg == MI.getOperand(3).getReg()) {
+ // FMLA Zd, Pg, Za, Zd, Zm ==> FMAD Zdn, Pg, Zm, Za
+ std::tie(PredIdx, DOPIdx, SrcIdx, Src2Idx) = std::make_tuple(1, 3, 4, 2);
+ UseRev = true;
+ } else if (DstReg == MI.getOperand(4).getReg()) {
+ // FMLA Zd, Pg, Za, Zm, Zd ==> FMAD Zdn, Pg, Zm, Za
+ std::tie(PredIdx, DOPIdx, SrcIdx, Src2Idx) = std::make_tuple(1, 4, 3, 2);
+ UseRev = true;
+ }
+ break;
default:
llvm_unreachable("Unsupported Destructive Operand type");
}
@@ -440,6 +452,12 @@ bool AArch64ExpandPseudo::expand_DestructiveOp(
case AArch64::DestructiveBinaryImm:
DOPRegIsUnique = true;
break;
+ case AArch64::DestructiveTernaryCommWithRev:
+ DOPRegIsUnique =
+ DstReg != MI.getOperand(DOPIdx).getReg() ||
+ (MI.getOperand(DOPIdx).getReg() != MI.getOperand(SrcIdx).getReg() &&
+ MI.getOperand(DOPIdx).getReg() != MI.getOperand(Src2Idx).getReg());
+ break;
}
#endif
@@ -522,6 +540,12 @@ bool AArch64ExpandPseudo::expand_DestructiveOp(
.addReg(MI.getOperand(DOPIdx).getReg(), RegState::Kill)
.add(MI.getOperand(SrcIdx));
break;
+ case AArch64::DestructiveTernaryCommWithRev:
+ DOP.add(MI.getOperand(PredIdx))
+ .addReg(MI.getOperand(DOPIdx).getReg(), RegState::Kill)
+ .add(MI.getOperand(SrcIdx))
+ .add(MI.getOperand(Src2Idx));
+ break;
}
if (PRFX) {
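
To sketch the effect of this expansion (an illustration based on the fmad_h
test in the new sve-fp-combine.ll, not part of the patch): the ternary
pseudos are destructive, so when register allocation places the result in the
same register as one of the multiplicands rather than the accumulator, the
expander switches to the reverse form (FMAD) instead of inserting an extra
copy:

  ; The accumulator arrives in z2 and the result must be produced in z0,
  ; so the pseudo is expanded to
  ;   fmad z0.h, p0/m, z1.h, z2.h
  define <vscale x 8 x half> @example_fmad(<vscale x 8 x half> %m1, <vscale x 8 x half> %m2, <vscale x 8 x half> %acc) {
    %mul = fmul fast <vscale x 8 x half> %m1, %m2
    %res = fadd fast <vscale x 8 x half> %acc, %mul
    ret <vscale x 8 x half> %res
  }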
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 710c9d12a78a..624f89b780e9 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -11592,6 +11592,11 @@ bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(const Function &F,
}
}
+bool AArch64TargetLowering::generateFMAsInMachineCombiner(
+ EVT VT, CodeGenOpt::Level OptLevel) const {
+ return (OptLevel >= CodeGenOpt::Aggressive) && !VT.isScalableVector();
+}
+
const MCPhysReg *
AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const {
// LR is a callee-save register, but we must treat it as clobbered by any call
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index c9c7b6fbe8fd..8db2c232f360 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -595,6 +595,9 @@ class AArch64TargetLowering : public TargetLowering {
EVT VT) const override;
bool isFMAFasterThanFMulAndFAdd(const Function &F, Type *Ty) const override;
+ bool generateFMAsInMachineCombiner(EVT VT,
+ CodeGenOpt::Level OptLevel) const override;
+
const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override;
/// Returns false if N is a bit extraction pattern of (X >> C) & Mask.
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 95ea9297e7ac..1f11d82ff78e 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -458,31 +458,57 @@ let Predicates = [HasSVE] in {
defm FCADD_ZPmZ : sve_fp_fcadd<"fcadd", int_aarch64_sve_fcadd>;
defm FCMLA_ZPmZZ : sve_fp_fcmla<"fcmla", int_aarch64_sve_fcmla>;
- defm FMLA_ZPmZZ : sve_fp_3op_p_zds_a<0b00, "fmla", int_aarch64_sve_fmla>;
- defm FMLS_ZPmZZ : sve_fp_3op_p_zds_a<0b01, "fmls", int_aarch64_sve_fmls>;
- defm FNMLA_ZPmZZ : sve_fp_3op_p_zds_a<0b10, "fnmla", int_aarch64_sve_fnmla>;
- defm FNMLS_ZPmZZ : sve_fp_3op_p_zds_a<0b11, "fnmls", int_aarch64_sve_fnmls>;
-
- defm FMAD_ZPmZZ : sve_fp_3op_p_zds_b<0b00, "fmad", int_aarch64_sve_fmad>;
- defm FMSB_ZPmZZ : sve_fp_3op_p_zds_b<0b01, "fmsb", int_aarch64_sve_fmsb>;
- defm FNMAD_ZPmZZ : sve_fp_3op_p_zds_b<0b10, "fnmad", int_aarch64_sve_fnmad>;
- defm FNMSB_ZPmZZ : sve_fp_3op_p_zds_b<0b11, "fnmsb", int_aarch64_sve_fnmsb>;
-
- // Add patterns for FMA where disabled lanes are undef.
- // FIXME: Implement a pseudo so we can choose a better instruction after
- // regalloc.
- def : Pat<(nxv8f16 (AArch64fma_p nxv8i1:$P, nxv8f16:$Op1, nxv8f16:$Op2, nxv8f16:$Op3)),
- (FMLA_ZPmZZ_H $P, $Op3, $Op1, $Op2)>;
- def : Pat<(nxv4f16 (AArch64fma_p nxv4i1:$P, nxv4f16:$Op1, nxv4f16:$Op2, nxv4f16:$Op3)),
- (FMLA_ZPmZZ_H $P, $Op3, $Op1, $Op2)>;
- def : Pat<(nxv2f16 (AArch64fma_p nxv2i1:$P, nxv2f16:$Op1, nxv2f16:$Op2, nxv2f16:$Op3)),
- (FMLA_ZPmZZ_H $P, $Op3, $Op1, $Op2)>;
- def : Pat<(nxv4f32 (AArch64fma_p nxv4i1:$P, nxv4f32:$Op1, nxv4f32:$Op2, nxv4f32:$Op3)),
- (FMLA_ZPmZZ_S $P, $Op3, $Op1, $Op2)>;
- def : Pat<(nxv2f32 (AArch64fma_p nxv2i1:$P, nxv2f32:$Op1, nxv2f32:$Op2, nxv2f32:$Op3)),
- (FMLA_ZPmZZ_S $P, $Op3, $Op1, $Op2)>;
- def : Pat<(nxv2f64 (AArch64fma_p nxv2i1:$P, nxv2f64:$Op1, nxv2f64:$Op2, nxv2f64:$Op3)),
- (FMLA_ZPmZZ_D $P, $Op3, $Op1, $Op2)>;
+ defm FMLA_ZPmZZ : sve_fp_3op_p_zds_a<0b00, "fmla", "FMLA_ZPZZZ", int_aarch64_sve_fmla, "FMAD_ZPmZZ">;
+ defm FMLS_ZPmZZ : sve_fp_3op_p_zds_a<0b01, "fmls", "FMLS_ZPZZZ", int_aarch64_sve_fmls, "FMSB_ZPmZZ">;
+ defm FNMLA_ZPmZZ : sve_fp_3op_p_zds_a<0b10, "fnmla", "FNMLA_ZPZZZ", int_aarch64_sve_fnmla, "FNMAD_ZPmZZ">;
+ defm FNMLS_ZPmZZ : sve_fp_3op_p_zds_a<0b11, "fnmls", "FNMLS_ZPZZZ", int_aarch64_sve_fnmls, "FNMSB_ZPmZZ">;
+
+ defm FMAD_ZPmZZ : sve_fp_3op_p_zds_b<0b00, "fmad", int_aarch64_sve_fmad, "FMLA_ZPmZZ", /*isReverseInstr*/ 1>;
+ defm FMSB_ZPmZZ : sve_fp_3op_p_zds_b<0b01, "fmsb", int_aarch64_sve_fmsb, "FMLS_ZPmZZ", /*isReverseInstr*/ 1>;
+ defm FNMAD_ZPmZZ : sve_fp_3op_p_zds_b<0b10, "fnmad", int_aarch64_sve_fnmad, "FNMLA_ZPmZZ", /*isReverseInstr*/ 1>;
+ defm FNMSB_ZPmZZ : sve_fp_3op_p_zds_b<0b11, "fnmsb", int_aarch64_sve_fnmsb, "FNMLS_ZPmZZ", /*isReverseInstr*/ 1>;
+
+ defm FMLA_ZPZZZ : sve_fp_3op_p_zds_zx<int_aarch64_sve_fmla, int_aarch64_sve_fmad>;
+ defm FMLS_ZPZZZ : sve_fp_3op_p_zds_zx<int_aarch64_sve_fmls, int_aarch64_sve_fmsb>;
+ defm FNMLA_ZPZZZ : sve_fp_3op_p_zds_zx<int_aarch64_sve_fnmla, int_aarch64_sve_fnmad>;
+ defm FNMLS_ZPZZZ : sve_fp_3op_p_zds_zx<int_aarch64_sve_fnmls, int_aarch64_sve_fnmsb>;
+
+ multiclass fma<ValueType Ty, ValueType PredTy, string Suffix> {
+ // Zd = Za + Zn * Zm
+ def : Pat<(Ty (AArch64fma_p PredTy:$P, Ty:$Zn, Ty:$Zm, Ty:$Za)),
+ (!cast<Instruction>("FMLA_ZPZZZ_UNDEF_"#Suffix) $P, ZPR:$Za, ZPR:$Zn, ZPR:$Zm)>;
+
+ // Zd = Za + -Zn * Zm
+ def : Pat<(Ty (AArch64fma_p PredTy:$P, (AArch64fneg_mt PredTy:$P, Ty:$Zn, (Ty (undef))), Ty:$Zm, Ty:$Za)),
+ (!cast<Instruction>("FMLS_ZPZZZ_UNDEF_"#Suffix) $P, ZPR:$Za, ZPR:$Zn, ZPR:$Zm)>;
+
+ // Zd = -Za + Zn * Zm
+ def : Pat<(Ty (AArch64fma_p PredTy:$P, Ty:$Zn, Ty:$Zm, (AArch64fneg_mt PredTy:$P, Ty:$Za, (Ty (undef))))),
+ (!cast<Instruction>("FNMLS_ZPZZZ_UNDEF_"#Suffix) $P, ZPR:$Za, ZPR:$Zn, ZPR:$Zm)>;
+
+ // Zd = -Za + -Zn * Zm
+ def : Pat<(Ty (AArch64fma_p PredTy:$P, (AArch64fneg_mt PredTy:$P, Ty:$Zn, (Ty (undef))), Ty:$Zm, (AArch64fneg_mt PredTy:$P, Ty:$Za, (Ty (undef))))),
+ (!cast<Instruction>("FNMLA_ZPZZZ_UNDEF_"#Suffix) $P, ZPR:$Za, ZPR:$Zn, ZPR:$Zm)>;
+
+ // Zd = -(Za + Zn * Zm)
+ def : Pat<(AArch64fneg_mt PredTy:$P, (AArch64fma_p PredTy:$P, Ty:$Zn, Ty:$Zm, Ty:$Za), (Ty (undef))),
+ (!cast<Instruction>("FNMLA_ZPZZZ_UNDEF_"#Suffix) $P, ZPR:$Za, ZPR:$Zn, ZPR:$Zm)>;
+
+ // Zda = Zda + Zn * Zm
+ def : Pat<(vselect (PredTy PPR:$Pg), (Ty (AArch64fma_p (PredTy (AArch64ptrue 31)), ZPR:$Zn, ZPR:$Zm, ZPR:$Za)), ZPR:$Za),
+ (!cast<Instruction>("FMLA_ZPmZZ_"#Suffix) PPR:$Pg, ZPR:$Za, ZPR:$Zn, ZPR:$Zm)>;
+
+ // Zda = Zda + -Zn * Zm
+ def : Pat<(vselect (PredTy PPR:$Pg), (Ty (AArch64fma_p (PredTy (AArch64ptrue 31)), (AArch64fneg_mt (PredTy (AArch64ptrue 31)), Ty:$Zn, (Ty (undef))), ZPR:$Zm, ZPR:$Za)), ZPR:$Za),
+ (!cast<Instruction>("FMLS_ZPmZZ_"#Suffix) PPR:$Pg, ZPR:$Za, ZPR:$Zn, ZPR:$Zm)>;
+ }
+
+ defm : fma<nxv8f16, nxv8i1, "H">;
+ defm : fma<nxv4f16, nxv4i1, "H">;
+ defm : fma<nxv2f16, nxv2i1, "H">;
+ defm : fma<nxv4f32, nxv4i1, "S">;
+ defm : fma<nxv2f32, nxv2i1, "S">;
+ defm : fma<nxv2f64, nxv2i1, "D">;
defm FTMAD_ZZI : sve_fp_ftmad<"ftmad", int_aarch64_sve_ftmad_x>;
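
As an aside (a sketch mirroring the fmla_h_sel test in the new
sve-fp-combine.ll), the vselect patterns above let a fused multiply-add whose
inactive lanes fall back to the accumulator map directly onto the merging
FMLA form, without a separate select:

  ; Expected selection:
  ;   fmla z0.h, p0/m, z1.h, z2.h
  define <vscale x 8 x half> @example_fmla_sel(<vscale x 8 x i1> %pred, <vscale x 8 x half> %acc, <vscale x 8 x half> %m1, <vscale x 8 x half> %m2) {
    %mul = fmul fast <vscale x 8 x half> %m1, %m2
    %add = fadd fast <vscale x 8 x half> %acc, %mul
    %res = select <vscale x 8 x i1> %pred, <vscale x 8 x half> %add, <vscale x 8 x half> %acc
    ret <vscale x 8 x half> %res
  }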
diff --git a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
index a5bc3668ed54..3eb4c04570de 100644
--- a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
@@ -52,10 +52,6 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset(
}
return SDValue();
}
-bool AArch64SelectionDAGInfo::generateFMAsInMachineCombiner(
- CodeGenOpt::Level OptLevel) const {
- return OptLevel >= CodeGenOpt::Aggressive;
-}
static const int kSetTagLoopThreshold = 176;
diff --git a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h
index d94fd8471b7b..7d53bd456975 100644
--- a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h
@@ -28,7 +28,6 @@ class AArch64SelectionDAGInfo : public SelectionDAGTargetInfo {
SDValue Chain, SDValue Op1, SDValue Op2,
MachinePointerInfo DstPtrInfo,
bool ZeroData) const override;
- bool generateFMAsInMachineCombiner(CodeGenOpt::Level OptLevel) const override;
};
}
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index 9e7ff1cde356..178c83b98599 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -491,6 +491,13 @@ let hasNoSchedulingInfo = 1 in {
Pseudo<(outs zprty:$Zd), (ins PPR3bAny:$Pg, zprty:$Zs1, immty:$imm), []> {
let FalseLanes = flags;
}
+
+ class PredThreeOpPseudo<string name, ZPRRegOp zprty,
+ FalseLanesEnum flags = FalseLanesNone>
+ : SVEPseudo2Instr<name, 0>,
+ Pseudo<(outs zprty:$Zd), (ins PPR3bAny:$Pg, zprty:$Zs1, zprty:$Zs2, zprty:$Zs3), []> {
+ let FalseLanes = flags;
+ }
}
//===----------------------------------------------------------------------===//
@@ -1762,14 +1769,20 @@ class sve_fp_3op_p_zds_a<bits<2> sz, bits<2> opc, string asm, ZPRRegOp zprty>
let Inst{4-0} = Zda;
let Constraints = "$Zda = $_Zda";
- let DestructiveInstType = DestructiveOther;
let ElementSize = zprty.ElementSize;
}
-multiclass sve_fp_3op_p_zds_a<bits<2> opc, string asm, SDPatternOperator op> {
- def _H : sve_fp_3op_p_zds_a<0b01, opc, asm, ZPR16>;
- def _S : sve_fp_3op_p_zds_a<0b10, opc, asm, ZPR32>;
- def _D : sve_fp_3op_p_zds_a<0b11, opc, asm, ZPR64>;
+multiclass sve_fp_3op_p_zds_a<bits<2> opc, string asm, string Ps,
+ SDPatternOperator op, string revname,
+ bit isReverseInstr=0> {
+ let DestructiveInstType = DestructiveTernaryCommWithRev in {
+ def _H : sve_fp_3op_p_zds_a<0b01, opc, asm, ZPR16>,
+ SVEPseudo2Instr<Ps # _H, 1>, SVEInstr2Rev<NAME # _H, revname # _H, isReverseInstr>;
+ def _S : sve_fp_3op_p_zds_a<0b10, opc, asm, ZPR32>,
+ SVEPseudo2Instr<Ps # _S, 1>, SVEInstr2Rev<NAME # _S, revname # _S, isReverseInstr>;
+ def _D : sve_fp_3op_p_zds_a<0b11, opc, asm, ZPR64>,
+ SVEPseudo2Instr<Ps # _D, 1>, SVEInstr2Rev<NAME # _D, revname # _D, isReverseInstr>;
+ }
def : SVE_4_Op_Pat<nxv8f16, op, nxv8i1, nxv8f16, nxv8f16, nxv8f16, !cast<Instruction>(NAME # _H)>;
def : SVE_4_Op_Pat<nxv4f32, op, nxv4i1, nxv4f32, nxv4f32, nxv4f32, !cast<Instruction>(NAME # _S)>;
@@ -1801,16 +1814,26 @@ class sve_fp_3op_p_zds_b<bits<2> sz, bits<2> opc, string asm,
let ElementSize = zprty.ElementSize;
}
-multiclass sve_fp_3op_p_zds_b<bits<2> opc, string asm, SDPatternOperator op> {
- def _H : sve_fp_3op_p_zds_b<0b01, opc, asm, ZPR16>;
- def _S : sve_fp_3op_p_zds_b<0b10, opc, asm, ZPR32>;
- def _D : sve_fp_3op_p_zds_b<0b11, opc, asm, ZPR64>;
+multiclass sve_fp_3op_p_zds_b<bits<2> opc, string asm, SDPatternOperator op,
+ string revname, bit isReverseInstr> {
+ def _H : sve_fp_3op_p_zds_b<0b01, opc, asm, ZPR16>,
+ SVEInstr2Rev<NAME # _H, revname # _H, isReverseInstr>;
+ def _S : sve_fp_3op_p_zds_b<0b10, opc, asm, ZPR32>,
+ SVEInstr2Rev<NAME # _S, revname # _S, isReverseInstr>;
+ def _D : sve_fp_3op_p_zds_b<0b11, opc, asm, ZPR64>,
+ SVEInstr2Rev<NAME # _D, revname # _D, isReverseInstr>;
def : SVE_4_Op_Pat<nxv8f16, op, nxv8i1, nxv8f16, nxv8f16, nxv8f16, !cast<Instruction>(NAME # _H)>;
def : SVE_4_Op_Pat<nxv4f32, op, nxv4i1, nxv4f32, nxv4f32, nxv4f32, !cast<Instruction>(NAME # _S)>;
def : SVE_4_Op_Pat<nxv2f64, op, nxv2i1, nxv2f64, nxv2f64, nxv2f64, !cast<Instruction>(NAME # _D)>;
}
+multiclass sve_fp_3op_p_zds_zx<SDPatternOperator op, SDPatternOperator rev_op> {
+ def _UNDEF_H : PredThreeOpPseudo<NAME # _H, ZPR16, FalseLanesUndef>;
+ def _UNDEF_S : PredThreeOpPseudo<NAME # _S, ZPR32, FalseLanesUndef>;
+ def _UNDEF_D : PredThreeOpPseudo<NAME # _D, ZPR64, FalseLanesUndef>;
+}
+
//===----------------------------------------------------------------------===//
// SVE Floating Point Multiply-Add - Indexed Group
//===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-arith.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-arith.ll
index 407b52714e64..fdd0acd97024 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-arith.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-arith.ll
@@ -572,8 +572,8 @@ define void @fma_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x half>* %c) #0 {
; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; CHECK-DAG: ld1h { [[OP3:z[0-9]+]].h }, [[PG]]/z, [x2]
-; CHECK: fmla [[OP3]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; CHECK: st1h { [[OP3]].h }, [[PG]], [x0]
+; CHECK: fmad [[OP1]].h, [[PG]]/m, [[OP2]].h, [[OP3]].h
+; CHECK: st1h { [[OP1]].h }, [[PG]], [x0]
; CHECK: ret
%op1 = load <16 x half>, <16 x half>* %a
%op2 = load <16 x half>, <16 x half>* %b
@@ -589,8 +589,8 @@ define void @fma_v32f16(<32 x half>* %a, <32 x half>* %b, <32 x half>* %c) #0 {
; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; CHECK-DAG: ld1h { [[OP3:z[0-9]+]].h }, [[PG]]/z, [x2]
-; CHECK: fmla [[OP3]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; CHECK: st1h { [[OP3]].h }, [[PG]], [x0]
+; CHECK: fmad [[OP1]].h, [[PG]]/m, [[OP2]].h, [[OP3]].h
+; CHECK: st1h { [[OP1]].h }, [[PG]], [x0]
; CHECK: ret
%op1 = load <32 x half>, <32 x half>* %a
%op2 = load <32 x half>, <32 x half>* %b
@@ -606,8 +606,8 @@ define void @fma_v64f16(<64 x half>* %a, <64 x half>* %b, <64 x half>* %c) #0 {
; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; CHECK-DAG: ld1h { [[OP3:z[0-9]+]].h }, [[PG]]/z, [x2]
-; CHECK: fmla [[OP3]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; CHECK: st1h { [[OP3]].h }, [[PG]], [x0]
+; CHECK: fmad [[OP1]].h, [[PG]]/m, [[OP2]].h, [[OP3]].h
+; CHECK: st1h { [[OP1]].h }, [[PG]], [x0]
; CHECK: ret
%op1 = load <64 x half>, <64 x half>* %a
%op2 = load <64 x half>, <64 x half>* %b
@@ -623,8 +623,8 @@ define void @fma_v128f16(<128 x half>* %a, <128 x half>* %b, <128 x half>* %c) #
; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; CHECK-DAG: ld1h { [[OP3:z[0-9]+]].h }, [[PG]]/z, [x2]
-; CHECK: fmla [[OP3]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
-; CHECK: st1h { [[OP3]].h }, [[PG]], [x0]
+; CHECK: fmad [[OP1]].h, [[PG]]/m, [[OP2]].h, [[OP3]].h
+; CHECK: st1h { [[OP1]].h }, [[PG]], [x0]
; CHECK: ret
%op1 = load <128 x half>, <128 x half>* %a
%op2 = load <128 x half>, <128 x half>* %b
@@ -658,8 +658,8 @@ define void @fma_v8f32(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) #0 {
; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; CHECK-DAG: ld1w { [[OP3:z[0-9]+]].s }, [[PG]]/z, [x2]
-; CHECK: fmla [[OP3]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; CHECK: st1w { [[OP3]].s }, [[PG]], [x0]
+; CHECK: fmad [[OP1]].s, [[PG]]/m, [[OP2]].s, [[OP3]].s
+; CHECK: st1w { [[OP1]].s }, [[PG]], [x0]
; CHECK: ret
%op1 = load <8 x float>, <8 x float>* %a
%op2 = load <8 x float>, <8 x float>* %b
@@ -675,8 +675,8 @@ define void @fma_v16f32(<16 x float>* %a, <16 x float>* %b, <16 x float>* %c) #0
; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; CHECK-DAG: ld1w { [[OP3:z[0-9]+]].s }, [[PG]]/z, [x2]
-; CHECK: fmla [[OP3]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; CHECK: st1w { [[OP3]].s }, [[PG]], [x0]
+; CHECK: fmad [[OP1]].s, [[PG]]/m, [[OP2]].s, [[OP3]].s
+; CHECK: st1w { [[OP1]].s }, [[PG]], [x0]
; CHECK: ret
%op1 = load <16 x float>, <16 x float>* %a
%op2 = load <16 x float>, <16 x float>* %b
@@ -692,8 +692,8 @@ define void @fma_v32f32(<32 x float>* %a, <32 x float>* %b, <32 x float>* %c) #0
; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; CHECK-DAG: ld1w { [[OP3:z[0-9]+]].s }, [[PG]]/z, [x2]
-; CHECK: fmla [[OP3]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; CHECK: st1w { [[OP3]].s }, [[PG]], [x0]
+; CHECK: fmad [[OP1]].s, [[PG]]/m, [[OP2]].s, [[OP3]].s
+; CHECK: st1w { [[OP1]].s }, [[PG]], [x0]
; CHECK: ret
%op1 = load <32 x float>, <32 x float>* %a
%op2 = load <32 x float>, <32 x float>* %b
@@ -709,8 +709,8 @@ define void @fma_v64f32(<64 x float>* %a, <64 x float>* %b, <64 x float>* %c) #0
; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; CHECK-DAG: ld1w { [[OP3:z[0-9]+]].s }, [[PG]]/z, [x2]
-; CHECK: fmla [[OP3]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
-; CHECK: st1w { [[OP3]].s }, [[PG]], [x0]
+; CHECK: fmad [[OP1]].s, [[PG]]/m, [[OP2]].s, [[OP3]].s
+; CHECK: st1w { [[OP1]].s }, [[PG]], [x0]
; CHECK: ret
%op1 = load <64 x float>, <64 x float>* %a
%op2 = load <64 x float>, <64 x float>* %b
@@ -744,8 +744,8 @@ define void @fma_v4f64(<4 x double>* %a, <4 x double>* %b, <4 x double>* %c) #0
; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; CHECK-DAG: ld1d { [[OP3:z[0-9]+]].d }, [[PG]]/z, [x2]
-; CHECK: fmla [[OP3]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[OP3]].d }, [[PG]], [x0]
+; CHECK: fmad [[OP1]].d, [[PG]]/m, [[OP2]].d, [[OP3]].d
+; CHECK: st1d { [[OP1]].d }, [[PG]], [x0]
; CHECK: ret
%op1 = load <4 x double>, <4 x double>* %a
%op2 = load <4 x double>, <4 x double>* %b
@@ -761,8 +761,8 @@ define void @fma_v8f64(<8 x double>* %a, <8 x double>* %b, <8 x double>* %c) #0
; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; CHECK-DAG: ld1d { [[OP3:z[0-9]+]].d }, [[PG]]/z, [x2]
-; CHECK: fmla [[OP3]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[OP3]].d }, [[PG]], [x0]
+; CHECK: fmad [[OP1]].d, [[PG]]/m, [[OP2]].d, [[OP3]].d
+; CHECK: st1d { [[OP1]].d }, [[PG]], [x0]
; CHECK: ret
%op1 = load <8 x double>, <8 x double>* %a
%op2 = load <8 x double>, <8 x double>* %b
@@ -778,8 +778,8 @@ define void @fma_v16f64(<16 x double>* %a, <16 x double>* %b, <16 x double>* %c)
; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; CHECK-DAG: ld1d { [[OP3:z[0-9]+]].d }, [[PG]]/z, [x2]
-; CHECK: fmla [[OP3]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[OP3]].d }, [[PG]], [x0]
+; CHECK: fmad [[OP1]].d, [[PG]]/m, [[OP2]].d, [[OP3]].d
+; CHECK: st1d { [[OP1]].d }, [[PG]], [x0]
; CHECK: ret
%op1 = load <16 x double>, <16 x double>* %a
%op2 = load <16 x double>, <16 x double>* %b
@@ -795,8 +795,8 @@ define void @fma_v32f64(<32 x double>* %a, <32 x double>* %b, <32 x double>* %c)
; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; CHECK-DAG: ld1d { [[OP3:z[0-9]+]].d }, [[PG]]/z, [x2]
-; CHECK: fmla [[OP3]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
-; CHECK: st1d { [[OP3]].d }, [[PG]], [x0]
+; CHECK: fmad [[OP1]].d, [[PG]]/m, [[OP2]].d, [[OP3]].d
+; CHECK: st1d { [[OP1]].d }, [[PG]], [x0]
; CHECK: ret
%op1 = load <32 x double>, <32 x double>* %a
%op2 = load <32 x double>, <32 x double>* %b
diff --git a/llvm/test/CodeGen/AArch64/sve-fp-combine.ll b/llvm/test/CodeGen/AArch64/sve-fp-combine.ll
new file mode 100644
index 000000000000..6fcf45d9286c
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-fp-combine.ll
@@ -0,0 +1,746 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve < %s | FileCheck %s
+
+define <vscale x 8 x half> @fmla_h_sel(<vscale x 8 x i1> %pred, <vscale x 8 x half> %acc, <vscale x 8 x half> %m1, <vscale x 8 x half> %m2) {
+; CHECK-LABEL: fmla_h_sel:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmla z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 8 x half> %m1, %m2
+ %add = fadd fast <vscale x 8 x half> %acc, %mul
+ %res = select <vscale x 8 x i1> %pred, <vscale x 8 x half> %add, <vscale x 8 x half> %acc
+ ret <vscale x 8 x half> %res
+}
+
+define <vscale x 4 x half> @fmla_hx4_sel(<vscale x 4 x i1> %pred, <vscale x 4 x half> %acc, <vscale x 4 x half> %m1, <vscale x 4 x half> %m2) {
+; CHECK-LABEL: fmla_hx4_sel:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmla z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 4 x half> %m1, %m2
+ %add = fadd fast <vscale x 4 x half> %acc, %mul
+ %res = select <vscale x 4 x i1> %pred, <vscale x 4 x half> %add, <vscale x 4 x half> %acc
+ ret <vscale x 4 x half> %res
+}
+
+define <vscale x 2 x half> @fmla_hx2_sel(<vscale x 2 x i1> %pred, <vscale x 2 x half> %acc, <vscale x 2 x half> %m1, <vscale x 2 x half> %m2) {
+; CHECK-LABEL: fmla_hx2_sel:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmla z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 2 x half> %m1, %m2
+ %add = fadd fast <vscale x 2 x half> %acc, %mul
+ %res = select <vscale x 2 x i1> %pred, <vscale x 2 x half> %add, <vscale x 2 x half> %acc
+ ret <vscale x 2 x half> %res
+}
+
+define <vscale x 4 x float> @fmla_s_sel(<vscale x 4 x i1> %pred, <vscale x 4 x float> %acc, <vscale x 4 x float> %m1, <vscale x 4 x float> %m2) {
+; CHECK-LABEL: fmla_s_sel:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmla z0.s, p0/m, z1.s, z2.s
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 4 x float> %m1, %m2
+ %add = fadd fast <vscale x 4 x float> %acc, %mul
+ %res = select <vscale x 4 x i1> %pred, <vscale x 4 x float> %add, <vscale x 4 x float> %acc
+ ret <vscale x 4 x float> %res
+}
+
+define <vscale x 2 x float> @fmla_sx2_sel(<vscale x 2 x i1> %pred, <vscale x 2 x float> %acc, <vscale x 2 x float> %m1, <vscale x 2 x float> %m2) {
+; CHECK-LABEL: fmla_sx2_sel:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmla z0.s, p0/m, z1.s, z2.s
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 2 x float> %m1, %m2
+ %add = fadd fast <vscale x 2 x float> %acc, %mul
+ %res = select <vscale x 2 x i1> %pred, <vscale x 2 x float> %add, <vscale x 2 x float> %acc
+ ret <vscale x 2 x float> %res
+}
+
+define <vscale x 2 x double> @fmla_d_sel(<vscale x 2 x i1> %pred, <vscale x 2 x double> %acc, <vscale x 2 x double> %m1, <vscale x 2 x double> %m2) {
+; CHECK-LABEL: fmla_d_sel:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmla z0.d, p0/m, z1.d, z2.d
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 2 x double> %m1, %m2
+ %add = fadd fast <vscale x 2 x double> %acc, %mul
+ %res = select <vscale x 2 x i1> %pred, <vscale x 2 x double> %add, <vscale x 2 x double> %acc
+ ret <vscale x 2 x double> %res
+}
+
+define <vscale x 8 x half> @fmls_h_sel(<vscale x 8 x i1> %pred, <vscale x 8 x half> %acc, <vscale x 8 x half> %m1, <vscale x 8 x half> %m2) {
+; CHECK-LABEL: fmls_h_sel:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmls z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 8 x half> %m1, %m2
+ %sub = fsub fast <vscale x 8 x half> %acc, %mul
+ %res = select <vscale x 8 x i1> %pred, <vscale x 8 x half> %sub, <vscale x 8 x half> %acc
+ ret <vscale x 8 x half> %res
+}
+
+define <vscale x 4 x half> @fmls_hx4_sel(<vscale x 4 x i1> %pred, <vscale x 4 x half> %acc, <vscale x 4 x half> %m1, <vscale x 4 x half> %m2) {
+; CHECK-LABEL: fmls_hx4_sel:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmls z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 4 x half> %m1, %m2
+ %sub = fsub fast <vscale x 4 x half> %acc, %mul
+ %res = select <vscale x 4 x i1> %pred, <vscale x 4 x half> %sub, <vscale x 4 x half> %acc
+ ret <vscale x 4 x half> %res
+}
+
+define <vscale x 2 x half> @fmls_hx2_sel(<vscale x 2 x i1> %pred, <vscale x 2 x half> %acc, <vscale x 2 x half> %m1, <vscale x 2 x half> %m2) {
+; CHECK-LABEL: fmls_hx2_sel:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmls z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 2 x half> %m1, %m2
+ %sub = fsub fast <vscale x 2 x half> %acc, %mul
+ %res = select <vscale x 2 x i1> %pred, <vscale x 2 x half> %sub, <vscale x 2 x half> %acc
+ ret <vscale x 2 x half> %res
+}
+
+define <vscale x 4 x float> @fmls_s_sel(<vscale x 4 x i1> %pred, <vscale x 4 x float> %acc, <vscale x 4 x float> %m1, <vscale x 4 x float> %m2) {
+; CHECK-LABEL: fmls_s_sel:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmls z0.s, p0/m, z1.s, z2.s
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 4 x float> %m1, %m2
+ %sub = fsub fast <vscale x 4 x float> %acc, %mul
+ %res = select <vscale x 4 x i1> %pred, <vscale x 4 x float> %sub, <vscale x 4 x float> %acc
+ ret <vscale x 4 x float> %res
+}
+
+define <vscale x 2 x float> @fmls_sx2_sel(<vscale x 2 x i1> %pred, <vscale x 2 x float> %acc, <vscale x 2 x float> %m1, <vscale x 2 x float> %m2) {
+; CHECK-LABEL: fmls_sx2_sel:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmls z0.s, p0/m, z1.s, z2.s
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 2 x float> %m1, %m2
+ %sub = fsub fast <vscale x 2 x float> %acc, %mul
+ %res = select <vscale x 2 x i1> %pred, <vscale x 2 x float> %sub, <vscale x 2 x float> %acc
+ ret <vscale x 2 x float> %res
+}
+
+define <vscale x 2 x double> @fmls_d_sel(<vscale x 2 x i1> %pred, <vscale x 2 x double> %acc, <vscale x 2 x double> %m1, <vscale x 2 x double> %m2) {
+; CHECK-LABEL: fmls_d_sel:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmls z0.d, p0/m, z1.d, z2.d
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 2 x double> %m1, %m2
+ %sub = fsub fast <vscale x 2 x double> %acc, %mul
+ %res = select <vscale x 2 x i1> %pred, <vscale x 2 x double> %sub, <vscale x 2 x double> %acc
+ ret <vscale x 2 x double> %res
+}
+
+define <vscale x 8 x half> @fmad_h(<vscale x 8 x half> %m1, <vscale x 8 x half> %m2, <vscale x 8 x half> %acc) {
+; CHECK-LABEL: fmad_h:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: fmad z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 8 x half> %m1, %m2
+ %res = fadd fast <vscale x 8 x half> %acc, %mul
+ ret <vscale x 8 x half> %res
+}
+
+define <vscale x 4 x half> @fmad_hx4(<vscale x 4 x half> %m1, <vscale x 4 x half> %m2, <vscale x 4 x half> %acc) {
+; CHECK-LABEL: fmad_hx4:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fmad z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 4 x half> %m1, %m2
+ %res = fadd fast <vscale x 4 x half> %acc, %mul
+ ret <vscale x 4 x half> %res
+}
+
+define <vscale x 2 x half> @fmad_hx2(<vscale x 2 x half> %m1, <vscale x 2 x half> %m2, <vscale x 2 x half> %acc) {
+; CHECK-LABEL: fmad_hx2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fmad z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 2 x half> %m1, %m2
+ %res = fadd fast <vscale x 2 x half> %acc, %mul
+ ret <vscale x 2 x half> %res
+}
+
+define <vscale x 4 x float> @fmad_s(<vscale x 4 x float> %m1, <vscale x 4 x float> %m2, <vscale x 4 x float> %acc) {
+; CHECK-LABEL: fmad_s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fmad z0.s, p0/m, z1.s, z2.s
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 4 x float> %m1, %m2
+ %res = fadd fast <vscale x 4 x float> %acc, %mul
+ ret <vscale x 4 x float> %res
+}
+
+define <vscale x 2 x float> @fmad_sx2(<vscale x 2 x float> %m1, <vscale x 2 x float> %m2, <vscale x 2 x float> %acc) {
+; CHECK-LABEL: fmad_sx2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fmad z0.s, p0/m, z1.s, z2.s
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 2 x float> %m1, %m2
+ %res = fadd fast <vscale x 2 x float> %acc, %mul
+ ret <vscale x 2 x float> %res
+}
+
+define <vscale x 2 x double> @fmad_d(<vscale x 2 x double> %m1, <vscale x 2 x double> %m2, <vscale x 2 x double> %acc) {
+; CHECK-LABEL: fmad_d:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fmad z0.d, p0/m, z1.d, z2.d
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 2 x double> %m1, %m2
+ %res = fadd fast <vscale x 2 x double> %acc, %mul
+ ret <vscale x 2 x double> %res
+}
+
+define <vscale x 8 x half> @fmla_h(<vscale x 8 x half> %acc, <vscale x 8 x half> %m1, <vscale x 8 x half> %m2) {
+; CHECK-LABEL: fmla_h:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: fmla z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 8 x half> %m1, %m2
+ %res = fadd fast <vscale x 8 x half> %acc, %mul
+ ret <vscale x 8 x half> %res
+}
+
+define <vscale x 4 x half> @fmla_hx4(<vscale x 4 x half> %acc, <vscale x 4 x half> %m1, <vscale x 4 x half> %m2) {
+; CHECK-LABEL: fmla_hx4:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fmla z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 4 x half> %m1, %m2
+ %res = fadd fast <vscale x 4 x half> %acc, %mul
+ ret <vscale x 4 x half> %res
+}
+
+define <vscale x 2 x half> @fmla_hx2(<vscale x 2 x half> %acc, <vscale x 2 x half> %m1, <vscale x 2 x half> %m2) {
+; CHECK-LABEL: fmla_hx2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fmla z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 2 x half> %m1, %m2
+ %res = fadd fast <vscale x 2 x half> %acc, %mul
+ ret <vscale x 2 x half> %res
+}
+
+define <vscale x 4 x float> @fmla_s(<vscale x 4 x float> %acc, <vscale x 4 x float> %m1, <vscale x 4 x float> %m2) {
+; CHECK-LABEL: fmla_s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fmla z0.s, p0/m, z1.s, z2.s
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 4 x float> %m1, %m2
+ %res = fadd fast <vscale x 4 x float> %acc, %mul
+ ret <vscale x 4 x float> %res
+}
+
+define <vscale x 2 x float> @fmla_sx2(<vscale x 2 x float> %acc, <vscale x 2 x float> %m1, <vscale x 2 x float> %m2) {
+; CHECK-LABEL: fmla_sx2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fmla z0.s, p0/m, z1.s, z2.s
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 2 x float> %m1, %m2
+ %res = fadd fast <vscale x 2 x float> %acc, %mul
+ ret <vscale x 2 x float> %res
+}
+
+define <vscale x 2 x double> @fmla_d(<vscale x 2 x double> %acc, <vscale x 2 x double> %m1, <vscale x 2 x double> %m2) {
+; CHECK-LABEL: fmla_d:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fmla z0.d, p0/m, z1.d, z2.d
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 2 x double> %m1, %m2
+ %res = fadd fast <vscale x 2 x double> %acc, %mul
+ ret <vscale x 2 x double> %res
+}
+
+define <vscale x 8 x half> @fmls_h(<vscale x 8 x half> %acc, <vscale x 8 x half> %m1, <vscale x 8 x half> %m2) {
+; CHECK-LABEL: fmls_h:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: fmls z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 8 x half> %m1, %m2
+ %res = fsub fast <vscale x 8 x half> %acc, %mul
+ ret <vscale x 8 x half> %res
+}
+
+define <vscale x 4 x half> @fmls_hx4(<vscale x 4 x half> %acc, <vscale x 4 x half> %m1, <vscale x 4 x half> %m2) {
+; CHECK-LABEL: fmls_hx4:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fmls z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 4 x half> %m1, %m2
+ %res = fsub fast <vscale x 4 x half> %acc, %mul
+ ret <vscale x 4 x half> %res
+}
+
+define <vscale x 2 x half> @fmls_hx2(<vscale x 2 x half> %acc, <vscale x 2 x half> %m1, <vscale x 2 x half> %m2) {
+; CHECK-LABEL: fmls_hx2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fmls z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 2 x half> %m1, %m2
+ %res = fsub fast <vscale x 2 x half> %acc, %mul
+ ret <vscale x 2 x half> %res
+}
+
+define <vscale x 4 x float> @fmls_s(<vscale x 4 x float> %acc, <vscale x 4 x float> %m1, <vscale x 4 x float> %m2) {
+; CHECK-LABEL: fmls_s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fmls z0.s, p0/m, z1.s, z2.s
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 4 x float> %m1, %m2
+ %res = fsub fast <vscale x 4 x float> %acc, %mul
+ ret <vscale x 4 x float> %res
+}
+
+define <vscale x 2 x float> @fmls_sx2(<vscale x 2 x float> %acc, <vscale x 2 x float> %m1, <vscale x 2 x float> %m2) {
+; CHECK-LABEL: fmls_sx2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fmls z0.s, p0/m, z1.s, z2.s
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 2 x float> %m1, %m2
+ %res = fsub fast <vscale x 2 x float> %acc, %mul
+ ret <vscale x 2 x float> %res
+}
+
+define <vscale x 2 x double> @fmls_d(<vscale x 2 x double> %acc, <vscale x 2 x double> %m1, <vscale x 2 x double> %m2) {
+; CHECK-LABEL: fmls_d:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fmls z0.d, p0/m, z1.d, z2.d
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 2 x double> %m1, %m2
+ %res = fsub fast <vscale x 2 x double> %acc, %mul
+ ret <vscale x 2 x double> %res
+}
+
+define <vscale x 8 x half> @fmsb_h(<vscale x 8 x half> %m1, <vscale x 8 x half> %m2, <vscale x 8 x half> %acc) {
+; CHECK-LABEL: fmsb_h:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: fmsb z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 8 x half> %m1, %m2
+ %res = fsub fast <vscale x 8 x half> %acc, %mul
+ ret <vscale x 8 x half> %res
+}
+
+define <vscale x 4 x half> @fmsb_hx4(<vscale x 4 x half> %m1, <vscale x 4 x half> %m2, <vscale x 4 x half> %acc) {
+; CHECK-LABEL: fmsb_hx4:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fmsb z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 4 x half> %m1, %m2
+ %res = fsub fast <vscale x 4 x half> %acc, %mul
+ ret <vscale x 4 x half> %res
+}
+
+define <vscale x 2 x half> @fmsb_hx2(<vscale x 2 x half> %m1, <vscale x 2 x half> %m2, <vscale x 2 x half> %acc) {
+; CHECK-LABEL: fmsb_hx2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fmsb z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 2 x half> %m1, %m2
+ %res = fsub fast <vscale x 2 x half> %acc, %mul
+ ret <vscale x 2 x half> %res
+}
+
+define <vscale x 4 x float> @fmsb_s(<vscale x 4 x float> %m1, <vscale x 4 x float> %m2, <vscale x 4 x float> %acc) {
+; CHECK-LABEL: fmsb_s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fmsb z0.s, p0/m, z1.s, z2.s
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 4 x float> %m1, %m2
+ %res = fsub fast <vscale x 4 x float> %acc, %mul
+ ret <vscale x 4 x float> %res
+}
+
+define <vscale x 2 x float> @fmsb_sx2(<vscale x 2 x float> %m1, <vscale x 2 x float> %m2, <vscale x 2 x float> %acc) {
+; CHECK-LABEL: fmsb_sx2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fmsb z0.s, p0/m, z1.s, z2.s
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 2 x float> %m1, %m2
+ %res = fsub fast <vscale x 2 x float> %acc, %mul
+ ret <vscale x 2 x float> %res
+}
+
+define <vscale x 2 x double> @fmsb_d(<vscale x 2 x double> %m1, <vscale x 2 x double> %m2, <vscale x 2 x double> %acc) {
+; CHECK-LABEL: fmsb_d:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fmsb z0.d, p0/m, z1.d, z2.d
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 2 x double> %m1, %m2
+ %res = fsub fast <vscale x 2 x double> %acc, %mul
+ ret <vscale x 2 x double> %res
+}
+
+define <vscale x 8 x half> @fnmad_h(<vscale x 8 x half> %m1, <vscale x 8 x half> %m2, <vscale x 8 x half> %acc) {
+; CHECK-LABEL: fnmad_h:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: fnmad z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT: ret
+ %neg_m1 = fneg fast <vscale x 8 x half> %m1
+ %mul = fmul fast <vscale x 8 x half> %neg_m1, %m2
+ %res = fsub fast <vscale x 8 x half> %mul, %acc
+ ret <vscale x 8 x half> %res
+}
+
+define <vscale x 4 x half> @fnmad_hx4(<vscale x 4 x half> %m1, <vscale x 4 x half> %m2, <vscale x 4 x half> %acc) {
+; CHECK-LABEL: fnmad_hx4:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fnmad z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT: ret
+ %neg_m1 = fneg fast <vscale x 4 x half> %m1
+ %mul = fmul fast <vscale x 4 x half> %neg_m1, %m2
+ %res = fsub fast <vscale x 4 x half> %mul, %acc
+ ret <vscale x 4 x half> %res
+}
+
+define <vscale x 2 x half> @fnmad_hx2(<vscale x 2 x half> %m1, <vscale x 2 x half> %m2, <vscale x 2 x half> %acc) {
+; CHECK-LABEL: fnmad_hx2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fnmad z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT: ret
+ %neg_m1 = fneg fast <vscale x 2 x half> %m1
+ %mul = fmul fast <vscale x 2 x half> %neg_m1, %m2
+ %res = fsub fast <vscale x 2 x half> %mul, %acc
+ ret <vscale x 2 x half> %res
+}
+
+define <vscale x 4 x float> @fnmad_s(<vscale x 4 x float> %m1, <vscale x 4 x float> %m2, <vscale x 4 x float> %acc) {
+; CHECK-LABEL: fnmad_s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fnmad z0.s, p0/m, z1.s, z2.s
+; CHECK-NEXT: ret
+ %neg_m1 = fneg fast <vscale x 4 x float> %m1
+ %mul = fmul fast <vscale x 4 x float> %neg_m1, %m2
+ %res = fsub fast <vscale x 4 x float> %mul, %acc
+ ret <vscale x 4 x float> %res
+}
+
+define <vscale x 2 x float> @fnmad_sx2(<vscale x 2 x float> %m1, <vscale x 2 x float> %m2, <vscale x 2 x float> %acc) {
+; CHECK-LABEL: fnmad_sx2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fnmad z0.s, p0/m, z1.s, z2.s
+; CHECK-NEXT: ret
+ %neg_m1 = fneg fast <vscale x 2 x float> %m1
+ %mul = fmul fast <vscale x 2 x float> %neg_m1, %m2
+ %res = fsub fast <vscale x 2 x float> %mul, %acc
+ ret <vscale x 2 x float> %res
+}
+
+define <vscale x 2 x double> @fnmad_d(<vscale x 2 x double> %m1, <vscale x 2 x double> %m2, <vscale x 2 x double> %acc) {
+; CHECK-LABEL: fnmad_d:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fnmad z0.d, p0/m, z1.d, z2.d
+; CHECK-NEXT: ret
+ %neg_m1 = fneg fast <vscale x 2 x double> %m1
+ %mul = fmul fast <vscale x 2 x double> %neg_m1, %m2
+ %res = fsub fast <vscale x 2 x double> %mul, %acc
+ ret <vscale x 2 x double> %res
+}
+
+define <vscale x 8 x half> @fnmla_h(<vscale x 8 x half> %acc, <vscale x 8 x half> %m1, <vscale x 8 x half> %m2) {
+; CHECK-LABEL: fnmla_h:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: fnmla z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT: ret
+ %neg_m1 = fneg fast <vscale x 8 x half> %m1
+ %mul = fmul fast <vscale x 8 x half> %neg_m1, %m2
+ %res = fsub fast <vscale x 8 x half> %mul, %acc
+ ret <vscale x 8 x half> %res
+}
+
+define <vscale x 4 x half> @fnmla_hx4(<vscale x 4 x half> %acc, <vscale x 4 x half> %m1, <vscale x 4 x half> %m2) {
+; CHECK-LABEL: fnmla_hx4:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fnmla z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT: ret
+ %neg_m1 = fneg fast <vscale x 4 x half> %m1
+ %mul = fmul fast <vscale x 4 x half> %neg_m1, %m2
+ %res = fsub fast <vscale x 4 x half> %mul, %acc
+ ret <vscale x 4 x half> %res
+}
+
+define <vscale x 2 x half> @fnmla_hx2(<vscale x 2 x half> %acc, <vscale x 2 x half> %m1, <vscale x 2 x half> %m2) {
+; CHECK-LABEL: fnmla_hx2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fnmla z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT: ret
+ %neg_m1 = fneg fast <vscale x 2 x half> %m1
+ %mul = fmul fast <vscale x 2 x half> %neg_m1, %m2
+ %res = fsub fast <vscale x 2 x half> %mul, %acc
+ ret <vscale x 2 x half> %res
+}
+
+define <vscale x 4 x float> @fnmla_s(<vscale x 4 x float> %acc, <vscale x 4 x float> %m1, <vscale x 4 x float> %m2) {
+; CHECK-LABEL: fnmla_s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fnmla z0.s, p0/m, z1.s, z2.s
+; CHECK-NEXT: ret
+ %neg_m1 = fneg fast <vscale x 4 x float> %m1
+ %mul = fmul fast <vscale x 4 x float> %neg_m1, %m2
+ %res = fsub fast <vscale x 4 x float> %mul, %acc
+ ret <vscale x 4 x float> %res
+}
+
+define <vscale x 2 x float> @fnmla_sx2(<vscale x 2 x float> %acc, <vscale x 2 x float> %m1, <vscale x 2 x float> %m2) {
+; CHECK-LABEL: fnmla_sx2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fnmla z0.s, p0/m, z1.s, z2.s
+; CHECK-NEXT: ret
+ %neg_m1 = fneg fast <vscale x 2 x float> %m1
+ %mul = fmul fast <vscale x 2 x float> %neg_m1, %m2
+ %res = fsub fast <vscale x 2 x float> %mul, %acc
+ ret <vscale x 2 x float> %res
+}
+
+define <vscale x 2 x double> @fnmla_d(<vscale x 2 x double> %acc, <vscale x 2 x double> %m1, <vscale x 2 x double> %m2) {
+; CHECK-LABEL: fnmla_d:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fnmla z0.d, p0/m, z1.d, z2.d
+; CHECK-NEXT: ret
+ %neg_m1 = fneg fast <vscale x 2 x double> %m1
+ %mul = fmul fast <vscale x 2 x double> %neg_m1, %m2
+ %res = fsub fast <vscale x 2 x double> %mul, %acc
+ ret <vscale x 2 x double> %res
+}
+
+define <vscale x 8 x half> @fnmla_h_reversed(<vscale x 8 x half> %acc, <vscale x 8 x half> %m1, <vscale x 8 x half> %m2) {
+; CHECK-LABEL: fnmla_h_reversed:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: fnmla z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 8 x half> %m1, %m2
+ %add = fadd fast <vscale x 8 x half> %mul, %acc
+ %res = fneg fast <vscale x 8 x half> %add
+ ret <vscale x 8 x half> %res
+}
+
+define <vscale x 4 x half> @fnmla_hx4_reversed(<vscale x 4 x half> %acc, <vscale x 4 x half> %m1, <vscale x 4 x half> %m2) {
+; CHECK-LABEL: fnmla_hx4_reversed:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fnmla z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 4 x half> %m1, %m2
+ %add = fadd fast <vscale x 4 x half> %mul, %acc
+ %res = fneg fast <vscale x 4 x half> %add
+ ret <vscale x 4 x half> %res
+}
+
+define <vscale x 2 x half> @fnmla_hx2_reversed(<vscale x 2 x half> %acc, <vscale x 2 x half> %m1, <vscale x 2 x half> %m2) {
+; CHECK-LABEL: fnmla_hx2_reversed:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fnmla z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 2 x half> %m1, %m2
+ %add = fadd fast <vscale x 2 x half> %mul, %acc
+ %res = fneg fast <vscale x 2 x half> %add
+ ret <vscale x 2 x half> %res
+}
+
+define <vscale x 4 x float> @fnmla_s_reversed(<vscale x 4 x float> %acc, <vscale x 4 x float> %m1, <vscale x 4 x float> %m2) {
+; CHECK-LABEL: fnmla_s_reversed:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fnmla z0.s, p0/m, z1.s, z2.s
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 4 x float> %m1, %m2
+ %add = fadd fast <vscale x 4 x float> %mul, %acc
+ %res = fneg fast <vscale x 4 x float> %add
+ ret <vscale x 4 x float> %res
+}
+
+define <vscale x 2 x float> @fnmla_sx2_reversed(<vscale x 2 x float> %acc, <vscale x 2 x float> %m1, <vscale x 2 x float> %m2) {
+; CHECK-LABEL: fnmla_sx2_reversed:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fnmla z0.s, p0/m, z1.s, z2.s
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 2 x float> %m1, %m2
+ %add = fadd fast <vscale x 2 x float> %mul, %acc
+ %res = fneg fast <vscale x 2 x float> %add
+ ret <vscale x 2 x float> %res
+}
+
+define <vscale x 2 x double> @fnmla_d_reversed(<vscale x 2 x double> %acc, <vscale x 2 x double> %m1, <vscale x 2 x double> %m2) {
+; CHECK-LABEL: fnmla_d_reversed:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fnmla z0.d, p0/m, z1.d, z2.d
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 2 x double> %m1, %m2
+ %add = fadd fast <vscale x 2 x double> %mul, %acc
+ %res = fneg fast <vscale x 2 x double> %add
+ ret <vscale x 2 x double> %res
+}
+
+define <vscale x 8 x half> @fnmls_h(<vscale x 8 x half> %acc, <vscale x 8 x half> %m1, <vscale x 8 x half> %m2) {
+; CHECK-LABEL: fnmls_h:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: fnmls z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 8 x half> %m1, %m2
+ %res = fsub fast <vscale x 8 x half> %mul, %acc
+ ret <vscale x 8 x half> %res
+}
+
+define <vscale x 4 x half> @fnmls_hx4(<vscale x 4 x half> %acc, <vscale x 4 x half> %m1, <vscale x 4 x half> %m2) {
+; CHECK-LABEL: fnmls_hx4:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fnmls z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 4 x half> %m1, %m2
+ %res = fsub fast <vscale x 4 x half> %mul, %acc
+ ret <vscale x 4 x half> %res
+}
+
+define <vscale x 2 x half> @fnmls_hx2(<vscale x 2 x half> %acc, <vscale x 2 x half> %m1, <vscale x 2 x half> %m2) {
+; CHECK-LABEL: fnmls_hx2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fnmls z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 2 x half> %m1, %m2
+ %res = fsub fast <vscale x 2 x half> %mul, %acc
+ ret <vscale x 2 x half> %res
+}
+
+define <vscale x 4 x float> @fnmls_s(<vscale x 4 x float> %acc, <vscale x 4 x float> %m1, <vscale x 4 x float> %m2) {
+; CHECK-LABEL: fnmls_s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fnmls z0.s, p0/m, z1.s, z2.s
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 4 x float> %m1, %m2
+ %res = fsub fast <vscale x 4 x float> %mul, %acc
+ ret <vscale x 4 x float> %res
+}
+
+define <vscale x 2 x float> @fnmls_sx2(<vscale x 2 x float> %acc, <vscale x 2 x float> %m1, <vscale x 2 x float> %m2) {
+; CHECK-LABEL: fnmls_sx2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fnmls z0.s, p0/m, z1.s, z2.s
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 2 x float> %m1, %m2
+ %res = fsub fast <vscale x 2 x float> %mul, %acc
+ ret <vscale x 2 x float> %res
+}
+
+define <vscale x 2 x double> @fnmls_d(<vscale x 2 x double> %acc, <vscale x 2 x double> %m1, <vscale x 2 x double> %m2) {
+; CHECK-LABEL: fnmls_d:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fnmls z0.d, p0/m, z1.d, z2.d
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 2 x double> %m1, %m2
+ %res = fsub fast <vscale x 2 x double> %mul, %acc
+ ret <vscale x 2 x double> %res
+}
+
+define <vscale x 8 x half> @fnmsb_h(<vscale x 8 x half> %m1, <vscale x 8 x half> %m2, <vscale x 8 x half> %acc) {
+; CHECK-LABEL: fnmsb_h:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: fnmsb z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 8 x half> %m1, %m2
+ %res = fsub fast <vscale x 8 x half> %mul, %acc
+ ret <vscale x 8 x half> %res
+}
+
+define <vscale x 4 x half> @fnmsb_hx4(<vscale x 4 x half> %m1, <vscale x 4 x half> %m2, <vscale x 4 x half> %acc) {
+; CHECK-LABEL: fnmsb_hx4:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fnmsb z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 4 x half> %m1, %m2
+ %res = fsub fast <vscale x 4 x half> %mul, %acc
+ ret <vscale x 4 x half> %res
+}
+
+define <vscale x 2 x half> @fnmsb_hx2(<vscale x 2 x half> %m1, <vscale x 2 x half> %m2, <vscale x 2 x half> %acc) {
+; CHECK-LABEL: fnmsb_hx2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fnmsb z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 2 x half> %m1, %m2
+ %res = fsub fast <vscale x 2 x half> %mul, %acc
+ ret <vscale x 2 x half> %res
+}
+
+define <vscale x 4 x float> @fnmsb_s(<vscale x 4 x float> %m1, <vscale x 4 x float> %m2, <vscale x 4 x float> %acc) {
+; CHECK-LABEL: fnmsb_s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fnmsb z0.s, p0/m, z1.s, z2.s
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 4 x float> %m1, %m2
+ %res = fsub fast <vscale x 4 x float> %mul, %acc
+ ret <vscale x 4 x float> %res
+}
+
+define <vscale x 2 x float> @fnmsb_sx2(<vscale x 2 x float> %m1, <vscale x 2 x float> %m2, <vscale x 2 x float> %acc) {
+; CHECK-LABEL: fnmsb_sx2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fnmsb z0.s, p0/m, z1.s, z2.s
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 2 x float> %m1, %m2
+ %res = fsub fast <vscale x 2 x float> %mul, %acc
+ ret <vscale x 2 x float> %res
+}
+
+define <vscale x 2 x double> @fnmsb_d(<vscale x 2 x double> %m1, <vscale x 2 x double> %m2, <vscale x 2 x double> %acc) {
+; CHECK-LABEL: fnmsb_d:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fnmsb z0.d, p0/m, z1.d, z2.d
+; CHECK-NEXT: ret
+ %mul = fmul fast <vscale x 2 x double> %m1, %m2
+ %res = fsub fast <vscale x 2 x double> %mul, %acc
+ ret <vscale x 2 x double> %res
+}
diff --git a/llvm/test/CodeGen/AArch64/sve-fp.ll b/llvm/test/CodeGen/AArch64/sve-fp.ll
index 2c1f8df086b6..b441cb7ee0f8 100644
--- a/llvm/test/CodeGen/AArch64/sve-fp.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fp.ll
@@ -240,8 +240,7 @@ define <vscale x 8 x half> @fma_nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x ha
; CHECK-LABEL: fma_nxv8f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h
-; CHECK-NEXT: fmla z2.h, p0/m, z0.h, z1.h
-; CHECK-NEXT: mov z0.d, z2.d
+; CHECK-NEXT: fmad z0.h, p0/m, z1.h, z2.h
; CHECK-NEXT: ret
%r = call <vscale x 8 x half> @llvm.fma.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, <vscale x 8 x half> %c)
ret <vscale x 8 x half> %r
@@ -251,8 +250,7 @@ define <vscale x 4 x half> @fma_nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x ha
; CHECK-LABEL: fma_nxv4f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: fmla z2.h, p0/m, z0.h, z1.h
-; CHECK-NEXT: mov z0.d, z2.d
+; CHECK-NEXT: fmad z0.h, p0/m, z1.h, z2.h
; CHECK-NEXT: ret
%r = call <vscale x 4 x half> @llvm.fma.nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b, <vscale x 4 x half> %c)
ret <vscale x 4 x half> %r
@@ -262,8 +260,7 @@ define <vscale x 2 x half> @fma_nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x ha
; CHECK-LABEL: fma_nxv2f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: fmla z2.h, p0/m, z0.h, z1.h
-; CHECK-NEXT: mov z0.d, z2.d
+; CHECK-NEXT: fmad z0.h, p0/m, z1.h, z2.h
; CHECK-NEXT: ret
%r = call <vscale x 2 x half> @llvm.fma.nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b, <vscale x 2 x half> %c)
ret <vscale x 2 x half> %r
@@ -273,8 +270,7 @@ define <vscale x 4 x float> @fma_nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x
; CHECK-LABEL: fma_nxv4f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: fmla z2.s, p0/m, z0.s, z1.s
-; CHECK-NEXT: mov z0.d, z2.d
+; CHECK-NEXT: fmad z0.s, p0/m, z1.s, z2.s
; CHECK-NEXT: ret
%r = call <vscale x 4 x float> @llvm.fma.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, <vscale x 4 x float> %c)
ret <vscale x 4 x float> %r
@@ -284,8 +280,7 @@ define <vscale x 2 x float> @fma_nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x
; CHECK-LABEL: fma_nxv2f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: fmla z2.s, p0/m, z0.s, z1.s
-; CHECK-NEXT: mov z0.d, z2.d
+; CHECK-NEXT: fmad z0.s, p0/m, z1.s, z2.s
; CHECK-NEXT: ret
%r = call <vscale x 2 x float> @llvm.fma.nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b, <vscale x 2 x float> %c)
ret <vscale x 2 x float> %r
@@ -295,8 +290,7 @@ define <vscale x 2 x double> @fma_nxv2f64_1(<vscale x 2 x double> %a, <vscale x
; CHECK-LABEL: fma_nxv2f64_1:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: fmla z2.d, p0/m, z0.d, z1.d
-; CHECK-NEXT: mov z0.d, z2.d
+; CHECK-NEXT: fmad z0.d, p0/m, z1.d, z2.d
; CHECK-NEXT: ret
%r = call <vscale x 2 x double> @llvm.fma.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, <vscale x 2 x double> %c)
ret <vscale x 2 x double> %r
@@ -306,8 +300,7 @@ define <vscale x 2 x double> @fma_nxv2f64_2(<vscale x 2 x double> %a, <vscale x
; CHECK-LABEL: fma_nxv2f64_2:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: fmla z2.d, p0/m, z1.d, z0.d
-; CHECK-NEXT: mov z0.d, z2.d
+; CHECK-NEXT: fmad z0.d, p0/m, z1.d, z2.d
; CHECK-NEXT: ret
%r = call <vscale x 2 x double> @llvm.fma.nxv2f64(<vscale x 2 x double> %b, <vscale x 2 x double> %a, <vscale x 2 x double> %c)
ret <vscale x 2 x double> %r