[llvm] r283663 - [AArch64] Avoid generating indexed vector instructions for Exynos
Sebastian Pop via llvm-commits
llvm-commits at lists.llvm.org
Sat Oct 8 05:30:09 PDT 2016
Author: spop
Date: Sat Oct 8 07:30:07 2016
New Revision: 283663
URL: http://llvm.org/viewvc/llvm-project?rev=283663&view=rev
Log:
[AArch64] Avoid generating indexed vector instructions for Exynos
Avoid generating indexed vector instructions for Exynos. This is needed for
fmla/fmls/fmul/fmulx. For example, the instruction
fmla v0.4s, v1.4s, v2.s[1]
is less efficient than the instructions
dup v2.4s, v2.s[1]
fmla v0.4s, v1.4s, v2.4s
Patch written by Abderrazek Zaafrani.
Differential Revision: https://reviews.llvm.org/D21571
Added:
llvm/trunk/lib/Target/AArch64/AArch64VectorByElementOpt.cpp
Modified:
llvm/trunk/lib/Target/AArch64/AArch64.h
llvm/trunk/lib/Target/AArch64/AArch64TargetMachine.cpp
llvm/trunk/lib/Target/AArch64/CMakeLists.txt
llvm/trunk/test/CodeGen/AArch64/arm64-neon-2velem.ll
Modified: llvm/trunk/lib/Target/AArch64/AArch64.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AArch64/AArch64.h?rev=283663&r1=283662&r2=283663&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64.h (original)
+++ llvm/trunk/lib/Target/AArch64/AArch64.h Sat Oct 8 07:30:07 2016
@@ -35,6 +35,7 @@ FunctionPass *createAArch64ISelDag(AArch
FunctionPass *createAArch64StorePairSuppressPass();
FunctionPass *createAArch64ExpandPseudoPass();
FunctionPass *createAArch64LoadStoreOptimizationPass();
+FunctionPass *createAArch64VectorByElementOptPass();
ModulePass *createAArch64PromoteConstantPass();
FunctionPass *createAArch64ConditionOptimizerPass();
FunctionPass *createAArch64AddressTypePromotionPass();
@@ -55,6 +56,7 @@ void initializeAArch64ConditionOptimizer
void initializeAArch64DeadRegisterDefinitionsPass(PassRegistry&);
void initializeAArch64ExpandPseudoPass(PassRegistry&);
void initializeAArch64LoadStoreOptPass(PassRegistry&);
+void initializeAArch64VectorByElementOptPass(PassRegistry&);
void initializeAArch64PromoteConstantPass(PassRegistry&);
void initializeAArch64RedundantCopyEliminationPass(PassRegistry&);
void initializeAArch64StorePairSuppressPass(PassRegistry&);
Modified: llvm/trunk/lib/Target/AArch64/AArch64TargetMachine.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AArch64/AArch64TargetMachine.cpp?rev=283663&r1=283662&r2=283663&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64TargetMachine.cpp (original)
+++ llvm/trunk/lib/Target/AArch64/AArch64TargetMachine.cpp Sat Oct 8 07:30:07 2016
@@ -141,6 +141,7 @@ extern "C" void LLVMInitializeAArch64Tar
initializeAArch64DeadRegisterDefinitionsPass(*PR);
initializeAArch64ExpandPseudoPass(*PR);
initializeAArch64LoadStoreOptPass(*PR);
+ initializeAArch64VectorByElementOptPass(*PR);
initializeAArch64PromoteConstantPass(*PR);
initializeAArch64RedundantCopyEliminationPass(*PR);
initializeAArch64StorePairSuppressPass(*PR);
@@ -422,6 +423,7 @@ bool AArch64PassConfig::addILPOpts() {
addPass(&EarlyIfConverterID);
if (EnableStPairSuppress)
addPass(createAArch64StorePairSuppressPass());
+ addPass(createAArch64VectorByElementOptPass());
return true;
}
Added: llvm/trunk/lib/Target/AArch64/AArch64VectorByElementOpt.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AArch64/AArch64VectorByElementOpt.cpp?rev=283663&view=auto
==============================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64VectorByElementOpt.cpp (added)
+++ llvm/trunk/lib/Target/AArch64/AArch64VectorByElementOpt.cpp Sat Oct 8 07:30:07 2016
@@ -0,0 +1,371 @@
+//=- AArch64VectorByElementOpt.cpp - AArch64 vector by element inst opt pass =//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that performs optimization for vector by element
+// SIMD instructions.
+//
+// Certain SIMD instructions with vector element operand are not efficient.
+// Rewrite them into SIMD instructions with vector operands. This rewrite
+// is driven by the latency of the instructions.
+//
+// Example:
+// fmla v0.4s, v1.4s, v2.s[1]
+// is rewritten into
+// dup v3.4s, v2.s[1]
+// fmla v0.4s, v1.4s, v3.4s
+//===----------------------------------------------------------------------===//
+
+#include "AArch64InstrInfo.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetSchedule.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-vectorbyelement-opt"
+
+STATISTIC(NumModifiedInstr,
+ "Number of vector by element instructions modified");
+
+#define AARCH64_VECTOR_BY_ELEMENT_OPT_NAME \
+ "AArch64 vector by element instruction optimization pass"
+
+namespace {
+/// Rewrites indexed fmla/fmls/fmul/fmulx into DUP + vector operand forms.
+struct AArch64VectorByElementOpt : public MachineFunctionPass {
+ static char ID;
+ AArch64VectorByElementOpt() : MachineFunctionPass(ID) {
+ initializeAArch64VectorByElementOptPass(*PassRegistry::getPassRegistry());
+ }
+ // Per-function state; assigned at the top of runOnMachineFunction.
+ const TargetInstrInfo *TII;
+ MachineRegisterInfo *MRI;
+ TargetSchedModel SchedModel;
+
+ /// Decide, from scheduling-model latencies alone, whether the instruction
+ /// InstDesc should be replaced by the pair InstDescRep1 + InstDescRep2.
+ /// The verdict is cached in VecInstElemTable, keyed by InstDesc's opcode.
+ /// Return true if replacement is recommended.
+ bool
+ shouldReplaceInstruction(MachineFunction *MF, const MCInstrDesc *InstDesc,
+ const MCInstrDesc *InstDescRep1,
+ const MCInstrDesc *InstDescRep2,
+ std::map<unsigned, bool> &VecInstElemTable) const;
+
+ /// Determine if we need to exit the vector by element instruction
+ /// optimization pass early. This makes sure that targets with no need
+ /// for this optimization do not spend any compile time on this pass.
+ /// This check is done by comparing the latency of an indexed FMLA
+ /// instruction to the latency of a DUP plus the latency of a vector
+ /// operand instruction. We do not check on other related instructions
+ /// such as FMLS as we assume that if the situation shows up for one
+ /// instruction, then it is likely to show up for the related ones.
+ /// Return true if early exit of the pass is recommended.
+ bool earlyExitVectElement(MachineFunction *MF);
+
+ /// Search backwards from MI, within its basic block, for a DupOpcode
+ /// instruction that already broadcasts lane LaneNumber of SrcReg.
+ /// Return true when such a DUP exists; only in that case is *DestReg
+ /// written, with the destination register of the DUP that was found.
+ bool reuseDUP(MachineInstr &MI, unsigned DupOpcode, unsigned SrcReg,
+ unsigned LaneNumber, unsigned *DestReg) const;
+
+ /// Certain SIMD instructions with vector element operand are not efficient.
+ /// Rewrite MI into a DUP (reused when one already exists) plus the vector
+ /// operand form when the latencies favor it; MI itself is not erased here.
+ /// Return true if the replacement instructions were inserted.
+ bool optimizeVectElement(MachineInstr &MI,
+ std::map<unsigned, bool> *VecInstElemTable) const;
+ /// Run the pass over Fn; returns true if any instruction was rewritten.
+ bool runOnMachineFunction(MachineFunction &Fn) override;
+
+ StringRef getPassName() const override {
+ return AARCH64_VECTOR_BY_ELEMENT_OPT_NAME;
+ }
+};
+char AArch64VectorByElementOpt::ID = 0;
+} // namespace
+// Register the pass under the name "aarch64-vectorbyelement-opt".
+INITIALIZE_PASS(AArch64VectorByElementOpt, "aarch64-vectorbyelement-opt",
+ AARCH64_VECTOR_BY_ELEMENT_OPT_NAME, false, false)
+
+/// Latency-only cost model: decide whether the instruction described by
+/// InstDesc is slower than issuing InstDescRep1 followed by InstDescRep2.
+/// The verdict is memoized in VecInstElemTable under InstDesc's opcode, so
+/// this function assumes a given InstDesc is always paired with the same two
+/// replacement instructions.
+/// MF is not consulted.
+/// Return true if replacement is recommended.
+bool AArch64VectorByElementOpt::shouldReplaceInstruction(
+    MachineFunction *MF, const MCInstrDesc *InstDesc,
+    const MCInstrDesc *InstDescRep1, const MCInstrDesc *InstDescRep2,
+    std::map<unsigned, bool> &VecInstElemTable) const {
+  const unsigned Opcode = InstDesc->getOpcode();
+
+  // Reuse the decision if this opcode has been evaluated before.
+  auto Cached = VecInstElemTable.find(Opcode);
+  if (Cached != VecInstElemTable.end())
+    return Cached->second;
+
+  // A descriptor is only comparable when the subtarget defines a valid,
+  // non-variant scheduling class for it.
+  const auto *MCModel = SchedModel.getMCSchedModel();
+  auto HasPlainSchedClass = [MCModel](const MCInstrDesc *Desc) {
+    const MCSchedClassDesc *SCDesc =
+        MCModel->getSchedClassDesc(Desc->getSchedClass());
+    return SCDesc->isValid() && !SCDesc->isVariant();
+  };
+
+  // Default to "keep the indexed form"; this also covers subtargets that do
+  // not define resources for any one of the three instructions.
+  bool ShouldReplace = false;
+  if (HasPlainSchedClass(InstDesc) && HasPlainSchedClass(InstDescRep1) &&
+      HasPlainSchedClass(InstDescRep2)) {
+    const unsigned IndexedLatency = SchedModel.computeInstrLatency(Opcode);
+    const unsigned ReplacementLatency =
+        SchedModel.computeInstrLatency(InstDescRep1->getOpcode()) +
+        SchedModel.computeInstrLatency(InstDescRep2->getOpcode());
+    // Replace only on a strict latency win; a tie keeps the original.
+    ShouldReplace = IndexedLatency > ReplacementLatency;
+  }
+
+  // Record the verdict so later queries for this opcode are a map lookup.
+  VecInstElemTable[Opcode] = ShouldReplace;
+  return ShouldReplace;
+}
+
+/// Determine if we need to exit the vector by element instruction
+/// optimization pass early. This makes sure that targets with no need
+/// for this optimization do not spend any compile time on this pass.
+/// This check is done by comparing the latency of an indexed FMLA
+/// instruction to the latency of the DUP + the latency of a vector
+/// FMLA instruction. We do not check on other related instructions such
+/// as FMLS as we assume that if the situation shows up for one
+/// instruction, then it is likely to show up for the related ones.
+/// Return true if early exit of the pass is recommended.
+bool AArch64VectorByElementOpt::earlyExitVectElement(MachineFunction *MF) {
+  // Local scratch cache; the outcome of this probe is not kept afterwards.
+  std::map<unsigned, bool> VecInstElemTable;
+  const MCInstrDesc *IndexMulMCID = &TII->get(AArch64::FMLAv4i32_indexed);
+  const MCInstrDesc *DupMCID = &TII->get(AArch64::DUPv4i32lane);
+  // Probe with FMLAv4f32, the vector form that optimizeVectElement actually
+  // substitutes for FMLAv4i32_indexed, rather than the unrelated FMULv4f32.
+  const MCInstrDesc *MulMCID = &TII->get(AArch64::FMLAv4f32);
+  return !shouldReplaceInstruction(MF, IndexMulMCID, DupMCID, MulMCID,
+                                   VecInstElemTable);
+}
+
+/// Look for a DUP that already produces the value this rewrite needs.
+/// Instructions preceding MI in its basic block are visited nearest first;
+/// a match has opcode DupOpcode, three operands, source SrcReg and lane
+/// LaneNumber. Return true on a match, storing its destination in *DestReg.
+bool AArch64VectorByElementOpt::reuseDUP(MachineInstr &MI, unsigned DupOpcode,
+                                         unsigned SrcReg, unsigned LaneNumber,
+                                         unsigned *DestReg) const {
+  const MachineBasicBlock::iterator BlockStart = MI.getParent()->begin();
+  MachineBasicBlock::iterator Cursor = MI;
+  while (Cursor != BlockStart) {
+    // Step back first so that MI itself is never examined.
+    MachineInstr &Candidate = *--Cursor;
+    if (Candidate.getOpcode() != DupOpcode || Candidate.getNumOperands() != 3)
+      continue;
+    if (Candidate.getOperand(1).getReg() != SrcReg ||
+        Candidate.getOperand(2).getImm() != LaneNumber)
+      continue;
+    *DestReg = Candidate.getOperand(0).getReg();
+    return true;
+  }
+
+  return false;
+}
+
+/// Certain SIMD instructions with vector element operand are not efficient.
+/// Rewrite them into SIMD instructions with vector operands. This rewrite
+/// is driven by the latency of the instructions, with the per-opcode decision
+/// cached in VecInstElemTable. The instructions of concern are, for the time
+/// being, fmla, fmls, fmul, and fmulx, and hence they are hardcoded.
+/// The replacement is inserted before MI; MI itself is not erased here.
+/// Example:
+/// fmla v0.4s, v1.4s, v2.s[1]
+/// is rewritten into
+/// dup v3.4s, v2.s[1] // dup not necessary if redundant
+/// fmla v0.4s, v1.4s, v3.4s
+/// Return true if the replacement instructions were inserted.
+bool AArch64VectorByElementOpt::optimizeVectElement(
+ MachineInstr &MI, std::map<unsigned, bool> *VecInstElemTable) const {
+ const MCInstrDesc *MulMCID, *DupMCID;
+ const TargetRegisterClass *RC = &AArch64::FPR128RegClass;
+ // Pick the DUP and vector opcodes; RC is the register class of DUP's result.
+ switch (MI.getOpcode()) {
+ default:
+ return false;
+
+ // 4X32 instructions (RC keeps its FPR128 default)
+ case AArch64::FMLAv4i32_indexed:
+ DupMCID = &TII->get(AArch64::DUPv4i32lane);
+ MulMCID = &TII->get(AArch64::FMLAv4f32);
+ break;
+ case AArch64::FMLSv4i32_indexed:
+ DupMCID = &TII->get(AArch64::DUPv4i32lane);
+ MulMCID = &TII->get(AArch64::FMLSv4f32);
+ break;
+ case AArch64::FMULXv4i32_indexed:
+ DupMCID = &TII->get(AArch64::DUPv4i32lane);
+ MulMCID = &TII->get(AArch64::FMULXv4f32);
+ break;
+ case AArch64::FMULv4i32_indexed:
+ DupMCID = &TII->get(AArch64::DUPv4i32lane);
+ MulMCID = &TII->get(AArch64::FMULv4f32);
+ break;
+
+ // 2X64 instructions (RC keeps its FPR128 default)
+ case AArch64::FMLAv2i64_indexed:
+ DupMCID = &TII->get(AArch64::DUPv2i64lane);
+ MulMCID = &TII->get(AArch64::FMLAv2f64);
+ break;
+ case AArch64::FMLSv2i64_indexed:
+ DupMCID = &TII->get(AArch64::DUPv2i64lane);
+ MulMCID = &TII->get(AArch64::FMLSv2f64);
+ break;
+ case AArch64::FMULXv2i64_indexed:
+ DupMCID = &TII->get(AArch64::DUPv2i64lane);
+ MulMCID = &TII->get(AArch64::FMULXv2f64);
+ break;
+ case AArch64::FMULv2i64_indexed:
+ DupMCID = &TII->get(AArch64::DUPv2i64lane);
+ MulMCID = &TII->get(AArch64::FMULv2f64);
+ break;
+
+ // 2X32 instructions (RC is overridden to FPR64 in every case)
+ case AArch64::FMLAv2i32_indexed:
+ RC = &AArch64::FPR64RegClass;
+ DupMCID = &TII->get(AArch64::DUPv2i32lane);
+ MulMCID = &TII->get(AArch64::FMLAv2f32);
+ break;
+ case AArch64::FMLSv2i32_indexed:
+ RC = &AArch64::FPR64RegClass;
+ DupMCID = &TII->get(AArch64::DUPv2i32lane);
+ MulMCID = &TII->get(AArch64::FMLSv2f32);
+ break;
+ case AArch64::FMULXv2i32_indexed:
+ RC = &AArch64::FPR64RegClass;
+ DupMCID = &TII->get(AArch64::DUPv2i32lane);
+ MulMCID = &TII->get(AArch64::FMULXv2f32);
+ break;
+ case AArch64::FMULv2i32_indexed:
+ RC = &AArch64::FPR64RegClass;
+ DupMCID = &TII->get(AArch64::DUPv2i32lane);
+ MulMCID = &TII->get(AArch64::FMULv2f32);
+ break;
+ }
+ // Bail out unless the latency model (cached per opcode) favors the rewrite.
+ if (!shouldReplaceInstruction(MI.getParent()->getParent(),
+ &TII->get(MI.getOpcode()), DupMCID, MulMCID,
+ *VecInstElemTable))
+ return false;
+ // NOTE(review): the local MRI below shadows the member of the same name.
+ const DebugLoc &DL = MI.getDebugLoc();
+ MachineBasicBlock &MBB = *MI.getParent();
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+
+ // Get the operands of the current SIMD arithmetic instruction.
+ unsigned MulDest = MI.getOperand(0).getReg();
+ unsigned SrcReg0 = MI.getOperand(1).getReg();
+ unsigned Src0IsKill = getKillRegState(MI.getOperand(1).isKill());
+ unsigned SrcReg1 = MI.getOperand(2).getReg();
+ unsigned Src1IsKill = getKillRegState(MI.getOperand(2).isKill());
+ unsigned DupDest;
+
+ // Instructions of interest have either 4 or 5 operands.
+ if (MI.getNumOperands() == 5) {
+ unsigned SrcReg2 = MI.getOperand(3).getReg();
+ unsigned Src2IsKill = getKillRegState(MI.getOperand(3).isKill());
+ unsigned LaneNumber = MI.getOperand(4).getImm();
+
+ // Create a new DUP instruction unless reuseDUP finds an equivalent one
+ // earlier in the block. NOTE(review): DupDest gets Src2IsKill even when
+ // the DUP is reused; confirm the reused register has no later uses.
+ if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg2, LaneNumber, &DupDest)) {
+ DupDest = MRI.createVirtualRegister(RC);
+ BuildMI(MBB, MI, DL, *DupMCID, DupDest)
+ .addReg(SrcReg2, Src2IsKill)
+ .addImm(LaneNumber);
+ }
+ BuildMI(MBB, MI, DL, *MulMCID, MulDest)
+ .addReg(SrcReg0, Src0IsKill)
+ .addReg(SrcReg1, Src1IsKill)
+ .addReg(DupDest, Src2IsKill);
+ } else if (MI.getNumOperands() == 4) {
+ unsigned LaneNumber = MI.getOperand(3).getImm();
+ if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg1, LaneNumber, &DupDest)) {
+ DupDest = MRI.createVirtualRegister(RC);
+ BuildMI(MBB, MI, DL, *DupMCID, DupDest)
+ .addReg(SrcReg1, Src1IsKill)
+ .addImm(LaneNumber);
+ }
+ BuildMI(MBB, MI, DL, *MulMCID, MulDest)
+ .addReg(SrcReg0, Src0IsKill)
+ .addReg(DupDest, Src1IsKill);
+ } else {
+ return false;
+ }
+
+ ++NumModifiedInstr;
+ return true;
+}
+
+// Pass entry point. Sets up the per-function state used by the helpers,
+// leaves early when the function is skipped or the scheduling model gives
+// no reason to rewrite, and otherwise replaces every indexed instruction
+// that optimizeVectElement accepts. Returns true if MF was modified.
+bool AArch64VectorByElementOpt::runOnMachineFunction(MachineFunction &MF) {
+  if (skipFunction(*MF.getFunction()))
+    return false;
+
+  // Cache the per-function objects the helper methods rely on.
+  const TargetSubtargetInfo &Subtarget = MF.getSubtarget();
+  TII = Subtarget.getInstrInfo();
+  MRI = &MF.getRegInfo();
+
+  const auto *AArch64TII =
+      static_cast<const AArch64InstrInfo *>(Subtarget.getInstrInfo());
+  if (!AArch64TII)
+    return false;
+
+  // Latencies are only available with a per-instruction scheduling model;
+  // the second test is a cheap probe for targets that do not need the pass.
+  SchedModel.init(Subtarget.getSchedModel(), &Subtarget, AArch64TII);
+  if (!SchedModel.hasInstrSchedModel() || earlyExitVectElement(&MF))
+    return false;
+
+  std::map<unsigned, bool> DecisionCache;
+  SmallVector<MachineInstr *, 8> Replaced;
+  for (MachineBasicBlock &MBB : MF) {
+    for (MachineInstr &MI : MBB) {
+      // Replacements are inserted in front of MI; MI is only queued here so
+      // that nothing is erased while the block is being walked.
+      if (optimizeVectElement(MI, &DecisionCache))
+        Replaced.push_back(&MI);
+    }
+  }
+
+  // Now that iteration is over, drop the instructions that were replaced.
+  for (MachineInstr *Dead : Replaced)
+    Dead->eraseFromParent();
+
+  // The function changed exactly when at least one instruction was queued.
+  return !Replaced.empty();
+}
+
+/// createAArch64VectorByElementOptPass - allocates and returns a new instance
+/// of the vector by element optimization pass defined in this file.
+FunctionPass *llvm::createAArch64VectorByElementOptPass() {
+ return new AArch64VectorByElementOpt();
+}
Modified: llvm/trunk/lib/Target/AArch64/CMakeLists.txt
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AArch64/CMakeLists.txt?rev=283663&r1=283662&r2=283663&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AArch64/CMakeLists.txt (original)
+++ llvm/trunk/lib/Target/AArch64/CMakeLists.txt Sat Oct 8 07:30:07 2016
@@ -62,6 +62,7 @@ add_llvm_target(AArch64CodeGen
AArch64TargetMachine.cpp
AArch64TargetObjectFile.cpp
AArch64TargetTransformInfo.cpp
+ AArch64VectorByElementOpt.cpp
${GLOBAL_ISEL_BUILD_FILES}
)
Modified: llvm/trunk/test/CodeGen/AArch64/arm64-neon-2velem.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AArch64/arm64-neon-2velem.ll?rev=283663&r1=283662&r2=283663&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AArch64/arm64-neon-2velem.ll (original)
+++ llvm/trunk/test/CodeGen/AArch64/arm64-neon-2velem.ll Sat Oct 8 07:30:07 2016
@@ -1,4 +1,6 @@
; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast -mcpu=exynos-m1 | FileCheck --check-prefix=EXYNOS %s
+; The instruction latencies of Exynos-M1 trigger the transform we see under the Exynos check.
declare <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double>, <2 x double>)
@@ -382,6 +384,10 @@ define <2 x float> @test_vfma_lane_f32(<
; CHECK-LABEL: test_vfma_lane_f32:
; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfma_lane_f32:
+; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[1]
+; EXYNOS: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s
+; EXYNOS-NEXT: ret
entry:
%lane = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 1, i32 1>
%0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
@@ -394,6 +400,10 @@ define <4 x float> @test_vfmaq_lane_f32(
; CHECK-LABEL: test_vfmaq_lane_f32:
; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfmaq_lane_f32:
+; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[1]
+; EXYNOS: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
+; EXYNOS-NEXT: ret
entry:
%lane = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
@@ -406,6 +416,10 @@ define <2 x float> @test_vfma_laneq_f32(
; CHECK-LABEL: test_vfma_laneq_f32:
; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfma_laneq_f32:
+; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[3]
+; EXYNOS: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s
+; EXYNOS-NEXT: ret
entry:
%lane = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> <i32 3, i32 3>
%0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
@@ -416,6 +430,10 @@ define <4 x float> @test_vfmaq_laneq_f32
; CHECK-LABEL: test_vfmaq_laneq_f32:
; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfmaq_laneq_f32:
+; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[3]
+; EXYNOS: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
+; EXYNOS-NEXT: ret
entry:
%lane = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
@@ -426,6 +444,10 @@ define <2 x float> @test_vfms_lane_f32(<
; CHECK-LABEL: test_vfms_lane_f32:
; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfms_lane_f32:
+; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[1]
+; EXYNOS: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s
+; EXYNOS-NEXT: ret
entry:
%sub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v
%lane = shufflevector <2 x float> %sub, <2 x float> undef, <2 x i32> <i32 1, i32 1>
@@ -437,6 +459,10 @@ define <4 x float> @test_vfmsq_lane_f32(
; CHECK-LABEL: test_vfmsq_lane_f32:
; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfmsq_lane_f32:
+; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[1]
+; EXYNOS: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
+; EXYNOS-NEXT: ret
entry:
%sub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v
%lane = shufflevector <2 x float> %sub, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
@@ -448,6 +474,10 @@ define <2 x float> @test_vfms_laneq_f32(
; CHECK-LABEL: test_vfms_laneq_f32:
; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfms_laneq_f32:
+; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[3]
+; EXYNOS: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s
+; EXYNOS-NEXT: ret
entry:
%sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v
%lane = shufflevector <4 x float> %sub, <4 x float> undef, <2 x i32> <i32 3, i32 3>
@@ -459,6 +489,10 @@ define <4 x float> @test_vfmsq_laneq_f32
; CHECK-LABEL: test_vfmsq_laneq_f32:
; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfmsq_laneq_f32:
+; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[3]
+; EXYNOS: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
+; EXYNOS-NEXT: ret
entry:
%sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v
%lane = shufflevector <4 x float> %sub, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
@@ -470,6 +504,10 @@ define <2 x double> @test_vfmaq_lane_f64
; CHECK-LABEL: test_vfmaq_lane_f64:
; CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfmaq_lane_f64:
+; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[0]
+; EXYNOS: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d
+; EXYNOS-NEXT: ret
entry:
%lane = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer
%0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a)
@@ -482,6 +520,10 @@ define <2 x double> @test_vfmaq_laneq_f6
; CHECK-LABEL: test_vfmaq_laneq_f64:
; CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfmaq_laneq_f64:
+; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[1]
+; EXYNOS: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d
+; EXYNOS-NEXT: ret
entry:
%lane = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 1, i32 1>
%0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a)
@@ -492,6 +534,10 @@ define <2 x double> @test_vfmsq_lane_f64
; CHECK-LABEL: test_vfmsq_lane_f64:
; CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfmsq_lane_f64:
+; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[0]
+; EXYNOS: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d
+; EXYNOS-NEXT: ret
entry:
%sub = fsub <1 x double> <double -0.000000e+00>, %v
%lane = shufflevector <1 x double> %sub, <1 x double> undef, <2 x i32> zeroinitializer
@@ -503,6 +549,10 @@ define <2 x double> @test_vfmsq_laneq_f6
; CHECK-LABEL: test_vfmsq_laneq_f64:
; CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfmsq_laneq_f64:
+; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[1]
+; EXYNOS: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d
+; EXYNOS-NEXT: ret
entry:
%sub = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %v
%lane = shufflevector <2 x double> %sub, <2 x double> undef, <2 x i32> <i32 1, i32 1>
@@ -514,6 +564,9 @@ define float @test_vfmas_laneq_f32(float
; CHECK-LABEL: test_vfmas_laneq_f32
; CHECK: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfmas_laneq_f32
+; EXYNOS: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
+; EXYNOS-NEXT: ret
entry:
%extract = extractelement <4 x float> %v, i32 3
%0 = tail call float @llvm.fma.f32(float %b, float %extract, float %a)
@@ -539,6 +592,9 @@ define float @test_vfmss_lane_f32(float
; CHECK-LABEL: test_vfmss_lane_f32
; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfmss_lane_f32
+; EXYNOS: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
+; EXYNOS-NEXT: ret
entry:
%extract.rhs = extractelement <2 x float> %v, i32 1
%extract = fsub float -0.000000e+00, %extract.rhs
@@ -561,6 +617,9 @@ define double @test_vfmsd_laneq_f64(doub
; CHECK-LABEL: test_vfmsd_laneq_f64
; CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfmsd_laneq_f64
+; EXYNOS: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
+; EXYNOS-NEXT: ret
entry:
%extract.rhs = extractelement <2 x double> %v, i32 1
%extract = fsub double -0.000000e+00, %extract.rhs
@@ -583,6 +642,9 @@ define float @test_vfmss_lane_f32_0(floa
; CHECK-LABEL: test_vfmss_lane_f32_0
; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfmss_lane_f32_0
+; EXYNOS: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
+; EXYNOS-NEXT: ret
entry:
%tmp0 = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v
%tmp1 = extractelement <2 x float> %tmp0, i32 1
@@ -1408,6 +1470,10 @@ define <2 x float> @test_vmul_lane_f32(<
; CHECK-LABEL: test_vmul_lane_f32:
; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmul_lane_f32:
+; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[1]
+; EXYNOS: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s
+; EXYNOS-NEXT: ret
entry:
%shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 1, i32 1>
%mul = fmul <2 x float> %shuffle, %a
@@ -1418,6 +1484,9 @@ define <1 x double> @test_vmul_lane_f64(
; CHECK-LABEL: test_vmul_lane_f64:
; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmul_lane_f64:
+; EXYNOS: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+; EXYNOS-NEXT: ret
entry:
%0 = bitcast <1 x double> %a to <8 x i8>
%1 = bitcast <8 x i8> %0 to double
@@ -1431,6 +1500,10 @@ define <4 x float> @test_vmulq_lane_f32(
; CHECK-LABEL: test_vmulq_lane_f32:
; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmulq_lane_f32:
+; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[1]
+; EXYNOS: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
+; EXYNOS-NEXT: ret
entry:
%shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%mul = fmul <4 x float> %shuffle, %a
@@ -1441,6 +1514,10 @@ define <2 x double> @test_vmulq_lane_f64
; CHECK-LABEL: test_vmulq_lane_f64:
; CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmulq_lane_f64:
+; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[0]
+; EXYNOS: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+; EXYNOS-NEXT: ret
entry:
%shuffle = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer
%mul = fmul <2 x double> %shuffle, %a
@@ -1451,6 +1528,10 @@ define <2 x float> @test_vmul_laneq_f32(
; CHECK-LABEL: test_vmul_laneq_f32:
; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmul_laneq_f32:
+; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[3]
+; EXYNOS: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s
+; EXYNOS-NEXT: ret
entry:
%shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> <i32 3, i32 3>
%mul = fmul <2 x float> %shuffle, %a
@@ -1461,6 +1542,9 @@ define <1 x double> @test_vmul_laneq_f64
; CHECK-LABEL: test_vmul_laneq_f64:
; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmul_laneq_f64:
+; EXYNOS: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
+; EXYNOS-NEXT: ret
entry:
%0 = bitcast <1 x double> %a to <8 x i8>
%1 = bitcast <8 x i8> %0 to double
@@ -1474,6 +1558,10 @@ define <4 x float> @test_vmulq_laneq_f32
; CHECK-LABEL: test_vmulq_laneq_f32:
; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmulq_laneq_f32:
+; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[3]
+; EXYNOS: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
+; EXYNOS-NEXT: ret
entry:
%shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%mul = fmul <4 x float> %shuffle, %a
@@ -1484,6 +1572,10 @@ define <2 x double> @test_vmulq_laneq_f6
; CHECK-LABEL: test_vmulq_laneq_f64:
; CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmulq_laneq_f64:
+; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[1]
+; EXYNOS: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d
+; EXYNOS-NEXT: ret
entry:
%shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 1, i32 1>
%mul = fmul <2 x double> %shuffle, %a
@@ -1494,6 +1586,10 @@ define <2 x float> @test_vmulx_lane_f32(
; CHECK-LABEL: test_vmulx_lane_f32:
; CHECK: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmulx_lane_f32:
+; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[1]
+; EXYNOS: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s
+; EXYNOS-NEXT: ret
entry:
%shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 1, i32 1>
%vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
@@ -1504,6 +1600,10 @@ define <4 x float> @test_vmulxq_lane_f32
; CHECK-LABEL: test_vmulxq_lane_f32:
; CHECK: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmulxq_lane_f32:
+; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[1]
+; EXYNOS: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
+; EXYNOS-NEXT: ret
entry:
%shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
@@ -1514,6 +1614,10 @@ define <2 x double> @test_vmulxq_lane_f6
; CHECK-LABEL: test_vmulxq_lane_f64:
; CHECK: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmulxq_lane_f64:
+; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[0]
+; EXYNOS: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d
+; EXYNOS-NEXT: ret
entry:
%shuffle = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer
%vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)
@@ -1524,6 +1628,10 @@ define <2 x float> @test_vmulx_laneq_f32
; CHECK-LABEL: test_vmulx_laneq_f32:
; CHECK: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmulx_laneq_f32:
+; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[3]
+; EXYNOS: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s
+; EXYNOS-NEXT: ret
entry:
%shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> <i32 3, i32 3>
%vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
@@ -1534,6 +1642,10 @@ define <4 x float> @test_vmulxq_laneq_f3
; CHECK-LABEL: test_vmulxq_laneq_f32:
; CHECK: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmulxq_laneq_f32:
+; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[3]
+; EXYNOS: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
+; EXYNOS-NEXT: ret
entry:
%shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
@@ -1544,6 +1656,10 @@ define <2 x double> @test_vmulxq_laneq_f
; CHECK-LABEL: test_vmulxq_laneq_f64:
; CHECK: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmulxq_laneq_f64:
+; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[1]
+; EXYNOS: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d
+; EXYNOS-NEXT: ret
entry:
%shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 1, i32 1>
%vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)
@@ -1890,6 +2006,10 @@ define <2 x float> @test_vfma_lane_f32_0
; CHECK-LABEL: test_vfma_lane_f32_0:
; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfma_lane_f32_0:
+; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[0]
+; EXYNOS: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s
+; EXYNOS-NEXT: ret
entry:
%lane = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> zeroinitializer
%0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
@@ -1900,6 +2020,10 @@ define <4 x float> @test_vfmaq_lane_f32_
; CHECK-LABEL: test_vfmaq_lane_f32_0:
; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfmaq_lane_f32_0:
+; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[0]
+; EXYNOS: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
+; EXYNOS-NEXT: ret
entry:
%lane = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> zeroinitializer
%0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
@@ -1910,6 +2034,10 @@ define <2 x float> @test_vfma_laneq_f32_
; CHECK-LABEL: test_vfma_laneq_f32_0:
; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfma_laneq_f32_0:
+; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[0]
+; EXYNOS: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s
+; EXYNOS-NEXT: ret
entry:
%lane = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> zeroinitializer
%0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
@@ -1920,6 +2048,10 @@ define <4 x float> @test_vfmaq_laneq_f32
; CHECK-LABEL: test_vfmaq_laneq_f32_0:
; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfmaq_laneq_f32_0:
+; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[0]
+; EXYNOS: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
+; EXYNOS-NEXT: ret
entry:
%lane = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer
%0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
@@ -1930,6 +2062,10 @@ define <2 x float> @test_vfms_lane_f32_0
; CHECK-LABEL: test_vfms_lane_f32_0:
; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfms_lane_f32_0:
+; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[0]
+; EXYNOS: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s
+; EXYNOS-NEXT: ret
entry:
%sub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v
%lane = shufflevector <2 x float> %sub, <2 x float> undef, <2 x i32> zeroinitializer
@@ -1941,6 +2077,10 @@ define <4 x float> @test_vfmsq_lane_f32_
; CHECK-LABEL: test_vfmsq_lane_f32_0:
; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfmsq_lane_f32_0:
+; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[0]
+; EXYNOS: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
+; EXYNOS-NEXT: ret
entry:
%sub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v
%lane = shufflevector <2 x float> %sub, <2 x float> undef, <4 x i32> zeroinitializer
@@ -1952,6 +2092,10 @@ define <2 x float> @test_vfms_laneq_f32_
; CHECK-LABEL: test_vfms_laneq_f32_0:
; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfms_laneq_f32_0:
+; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[0]
+; EXYNOS: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s
+; EXYNOS-NEXT: ret
entry:
%sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v
%lane = shufflevector <4 x float> %sub, <4 x float> undef, <2 x i32> zeroinitializer
@@ -1963,6 +2107,10 @@ define <4 x float> @test_vfmsq_laneq_f32
; CHECK-LABEL: test_vfmsq_laneq_f32_0:
; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfmsq_laneq_f32_0:
+; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[0]
+; EXYNOS: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
+; EXYNOS-NEXT: ret
entry:
%sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v
%lane = shufflevector <4 x float> %sub, <4 x float> undef, <4 x i32> zeroinitializer
@@ -1974,6 +2122,10 @@ define <2 x double> @test_vfmaq_laneq_f6
; CHECK-LABEL: test_vfmaq_laneq_f64_0:
; CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfmaq_laneq_f64_0:
+; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[0]
+; EXYNOS: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d
+; EXYNOS-NEXT: ret
entry:
%lane = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer
%0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a)
@@ -1984,6 +2136,10 @@ define <2 x double> @test_vfmsq_laneq_f6
; CHECK-LABEL: test_vfmsq_laneq_f64_0:
; CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vfmsq_laneq_f64_0:
+; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[0]
+; EXYNOS: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d
+; EXYNOS-NEXT: ret
entry:
%sub = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %v
%lane = shufflevector <2 x double> %sub, <2 x double> undef, <2 x i32> zeroinitializer
@@ -2787,6 +2943,10 @@ define <2 x float> @test_vmul_lane_f32_0
; CHECK-LABEL: test_vmul_lane_f32_0:
; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmul_lane_f32_0:
+; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[0]
+; EXYNOS: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s
+; EXYNOS-NEXT: ret
entry:
%shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> zeroinitializer
%mul = fmul <2 x float> %shuffle, %a
@@ -2797,6 +2957,10 @@ define <4 x float> @test_vmulq_lane_f32_
; CHECK-LABEL: test_vmulq_lane_f32_0:
; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmulq_lane_f32_0:
+; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[0]
+; EXYNOS: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
+; EXYNOS-NEXT: ret
entry:
%shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> zeroinitializer
%mul = fmul <4 x float> %shuffle, %a
@@ -2807,6 +2971,10 @@ define <2 x float> @test_vmul_laneq_f32_
; CHECK-LABEL: test_vmul_laneq_f32_0:
; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmul_laneq_f32_0:
+; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[0]
+; EXYNOS: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s
+; EXYNOS-NEXT: ret
entry:
%shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> zeroinitializer
%mul = fmul <2 x float> %shuffle, %a
@@ -2817,6 +2985,9 @@ define <1 x double> @test_vmul_laneq_f64
; CHECK-LABEL: test_vmul_laneq_f64_0:
; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0]
; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmul_laneq_f64_0:
+; EXYNOS: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0]
+; EXYNOS-NEXT: ret
entry:
%0 = bitcast <1 x double> %a to <8 x i8>
%1 = bitcast <8 x i8> %0 to double
@@ -2830,6 +3001,10 @@ define <4 x float> @test_vmulq_laneq_f32
; CHECK-LABEL: test_vmulq_laneq_f32_0:
; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmulq_laneq_f32_0:
+; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[0]
+; EXYNOS: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
+; EXYNOS-NEXT: ret
entry:
%shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer
%mul = fmul <4 x float> %shuffle, %a
@@ -2840,6 +3015,10 @@ define <2 x double> @test_vmulq_laneq_f6
; CHECK-LABEL: test_vmulq_laneq_f64_0:
; CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmulq_laneq_f64_0:
+; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[0]
+; EXYNOS: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d
+; EXYNOS-NEXT: ret
entry:
%shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer
%mul = fmul <2 x double> %shuffle, %a
@@ -2850,6 +3029,10 @@ define <2 x float> @test_vmulx_lane_f32_
; CHECK-LABEL: test_vmulx_lane_f32_0:
; CHECK: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmulx_lane_f32_0:
+; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[0]
+; EXYNOS: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s
+; EXYNOS-NEXT: ret
entry:
%shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> zeroinitializer
%vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
@@ -2860,6 +3043,10 @@ define <4 x float> @test_vmulxq_lane_f32
; CHECK-LABEL: test_vmulxq_lane_f32_0:
; CHECK: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmulxq_lane_f32_0:
+; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[0]
+; EXYNOS: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
+; EXYNOS-NEXT: ret
entry:
%shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> zeroinitializer
%vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
@@ -2870,6 +3057,10 @@ define <2 x double> @test_vmulxq_lane_f6
; CHECK-LABEL: test_vmulxq_lane_f64_0:
; CHECK: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmulxq_lane_f64_0:
+; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[0]
+; EXYNOS: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d
+; EXYNOS-NEXT: ret
entry:
%shuffle = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer
%vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)
@@ -2880,6 +3071,10 @@ define <2 x float> @test_vmulx_laneq_f32
; CHECK-LABEL: test_vmulx_laneq_f32_0:
; CHECK: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmulx_laneq_f32_0:
+; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[0]
+; EXYNOS: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s
+; EXYNOS-NEXT: ret
entry:
%shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> zeroinitializer
%vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
@@ -2890,6 +3085,10 @@ define <4 x float> @test_vmulxq_laneq_f3
; CHECK-LABEL: test_vmulxq_laneq_f32_0:
; CHECK: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmulxq_laneq_f32_0:
+; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[0]
+; EXYNOS: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
+; EXYNOS-NEXT: ret
entry:
%shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer
%vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
@@ -2900,9 +3099,51 @@ define <2 x double> @test_vmulxq_laneq_f
; CHECK-LABEL: test_vmulxq_laneq_f64_0:
; CHECK: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
; CHECK-NEXT: ret
+; EXYNOS-LABEL: test_vmulxq_laneq_f64_0:
+; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[0]
+; EXYNOS: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d
+; EXYNOS-NEXT: ret
entry:
%shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer
%vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)
ret <2 x double> %vmulx2.i
}
+define <4 x float> @optimize_dup(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %v) {
+; CHECK-LABEL: optimize_dup:
+; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+; CHECK-NEXT: ret
+; EXYNOS-LABEL: optimize_dup:
+; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[3]
+; EXYNOS: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
+; EXYNOS: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
+; EXYNOS-NEXT: ret
+entry:
+ %lane1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+ %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane1, <4 x float> %b, <4 x float> %a)
+ %lane2 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+ %1 = fmul <4 x float> %lane2, %c
+ %s = fsub <4 x float> %0, %1
+ ret <4 x float> %s
+}
+
+define <4 x float> @no_optimize_dup(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %v) {
+; CHECK-LABEL: no_optimize_dup:
+; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+; CHECK-NEXT: ret
+; EXYNOS-LABEL: no_optimize_dup:
+; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[3]
+; EXYNOS: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
+; EXYNOS: dup [[y:v[0-9]+]].4s, {{v[0-9]+}}.s[1]
+; EXYNOS: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[y]].4s
+; EXYNOS-NEXT: ret
+entry:
+ %lane1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+ %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane1, <4 x float> %b, <4 x float> %a)
+ %lane2 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %1 = fmul <4 x float> %lane2, %c
+ %s = fsub <4 x float> %0, %1
+ ret <4 x float> %s
+}
More information about the llvm-commits
mailing list